net.maizegenetics.pangenome.fastaExtraction.ExtractFastaFromGVCFCBSU Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of phg Show documentation
Show all versions of phg Show documentation
PHG - Practical Haplotype Graph
package net.maizegenetics.pangenome.fastaExtraction;
import net.maizegenetics.dna.map.Chromosome;
import net.maizegenetics.dna.map.GVCFGenomeSequence;
import net.maizegenetics.dna.map.GVCFGenomeSequenceBuilder;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.concurrent.atomic.LongAdder;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
/**
* Command line program to pull the sequence for each taxa where each row is an anchor
* Multithreads the extraction so it can run quickly
* TODO Create a plugin which will do this once we recode GVCF->Fasta
* Created by zrm22 on 5/2/17.
*/
public class ExtractFastaFromGVCFCBSU {
public static void main(String args[]) {
ExtractFastaFromGVCFCBSU app = new ExtractFastaFromGVCFCBSU();
app.extractFasta(args[0],args[1], args[2]);
}
/**
* Method to setup the run by creating a file list
* @param refFileName
* @param intervalFileName
* @param gvcfListFileName
*/
public void extractFasta(String refFileName, String intervalFileName, String gvcfListFileName) {
try {
BufferedReader reader = new BufferedReader(new FileReader(gvcfListFileName));
//loop through the lines of the reader and add to a list
ArrayList gvcfFileNameList = new ArrayList<>();
String currentFileName = "";
while((currentFileName=reader.readLine())!=null) {
gvcfFileNameList.add(currentFileName);
}
for(int i = 0; i < gvcfFileNameList.size(); i++) {
System.out.println("Pulling sequence for taxa: "+gvcfFileNameList.get(i));
pullSequenceMultiThread(refFileName,gvcfFileNameList.get(i)+"_haplotype_caller_output.g.vcf",
intervalFileName,gvcfFileNameList.get(i)+"_MergedAnchorSequence.fa",
gvcfFileNameList.get(i)+"_stats.csv");
}
}catch(Exception e) {
e.printStackTrace();
}
}
/**
* Method which creates a GVCFGenomeSequence then extracts out the sequence in a multithreaded manner
* @param refFileName
* @param gvcfFileName
* @param intervalFileName
* @param outputFileName
* @param statFileName
*/
private void pullSequenceMultiThread(String refFileName, String gvcfFileName, String intervalFileName,
String outputFileName, String statFileName) {
try{
System.out.println("Creating GVCFSequence object");
GVCFGenomeSequence sequence = (GVCFGenomeSequence) GVCFGenomeSequenceBuilder.instance(refFileName, gvcfFileName);
System.out.println("Created Genome Sequence");
HashMap chrMap = new HashMap<>();
Object[] chrs = sequence.chromosomes().toArray();
for(int i = 0; i < chrs.length; i++) {
chrMap.put(""+((Chromosome)chrs[i]).getChromosomeNumber(),(Chromosome)chrs[i]);
}
BufferedWriter writer = new BufferedWriter(new FileWriter(outputFileName));
BufferedWriter statsWriter = new BufferedWriter(new FileWriter(statFileName));
BufferedReader reader = new BufferedReader(new FileReader(intervalFileName));
String currentLine = "";
int lineCounter = 0;
statsWriter.write("ID,RequestedSize, ExportedSize,HetCount,AltCount,Depth,GQ,Min_Depth,ZeroCoverageCount,HomoRefCount,HomoAltLowDepthCount,HomoAltHighDepthCount");
statsWriter.newLine();
ArrayList chromList = new ArrayList<>();
ArrayList startPosList = new ArrayList<>();
ArrayList endPosList = new ArrayList<>();
while((currentLine = reader.readLine())!=null) {
if(currentLine.equals("")) {
continue;
}
if(lineCounter%100==0) {
System.out.println("Counter: "+lineCounter);
}
lineCounter++;
String[] currentLineSplit = currentLine.split(":");
String chr = currentLineSplit[0];
String[] currentLineSplit2 = currentLineSplit[1].split("-");
String startPos = currentLineSplit2[0];
String endPos = currentLineSplit2[1];
chromList.add(chr);
startPosList.add(Integer.parseInt(startPos));
endPosList.add(Integer.parseInt(endPos));
}
LongAdder adder = new LongAdder();
//Set up the parallel stream and get each interval specified by the anchor file. Collects everything into a 2d ArrayList for the sequence and the stats
ArrayList> sequenceAndStatsForExport = (ArrayList>) IntStream.range(0,chromList.size()).parallel().mapToObj(index-> {
String chr = chromList.get(index);
int startPos = startPosList.get(index);
int endPos = endPosList.get(index);
adder.increment();
int intVal = adder.intValue();
if(intVal%1000==0) {
System.out.println("MultithreadCounter: "+intVal);
}
HashMap sequenceAndStatMap = sequence.chromosomeSequenceAndStats(chrMap.get(chr),startPos,endPos);
StringBuilder fastaLineBuilder = new StringBuilder();
fastaLineBuilder.append(">" +chr+ ":" + startPos + ":" + endPos);
fastaLineBuilder.append("\n");
fastaLineBuilder.append(sequenceAndStatMap.get("Sequence"));
String fastaLine = fastaLineBuilder.toString();
StringBuilder statLineBuilder = new StringBuilder();
statLineBuilder.append(chr + ":" + startPos + ":" + endPos+",");
statLineBuilder.append(sequenceAndStatMap.get("RefSize")+",");
statLineBuilder.append(sequenceAndStatMap.get("Size")+",");
statLineBuilder.append(sequenceAndStatMap.get("HetCount")+",");
statLineBuilder.append(sequenceAndStatMap.get("AltCount")+",");
statLineBuilder.append(sequenceAndStatMap.get("Depth")+",");
statLineBuilder.append(sequenceAndStatMap.get("GQ")+",");
statLineBuilder.append(sequenceAndStatMap.get("Min_Depth")+",");
statLineBuilder.append(sequenceAndStatMap.get("ZeroCoverageCount")+",");
statLineBuilder.append(sequenceAndStatMap.get("HomoRefCount")+",");
statLineBuilder.append(sequenceAndStatMap.get("HomoAltLowDepthCount")+",");
statLineBuilder.append(sequenceAndStatMap.get("HomoAltHighDepthCount"));
ArrayList exportStrings = new ArrayList<>();
exportStrings.add(fastaLine);
exportStrings.add(statLineBuilder.toString());
return exportStrings;
}).collect(Collectors.toList());
for(ArrayList currentStringToExport: sequenceAndStatsForExport) {
writer.write(currentStringToExport.get(0));
writer.newLine();
statsWriter.write(currentStringToExport.get(1));
statsWriter.newLine();
}
writer.close();
statsWriter.close();
}
catch(Exception e) {
e.printStackTrace();
}
}
}