package net.maizegenetics.pangenome.fastaExtraction;

import net.maizegenetics.dna.map.Chromosome;
import net.maizegenetics.dna.map.GVCFGenomeSequence;
import net.maizegenetics.dna.map.GVCFGenomeSequenceBuilder;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.concurrent.atomic.LongAdder;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

/**
 * Command line program to pull the sequence for each taxon, where each row of the
 * interval file is an anchor. Multithreads the extraction so it can run quickly.
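 * Example invocation (a sketch; the argument file names here are illustrative, but the
 * per-taxon suffixes match the ones this class appends):
 *   java net.maizegenetics.pangenome.fastaExtraction.ExtractFastaFromGVCFCBSU ref.fa anchors.intervals taxaList.txt
 * For a base name "lineA" listed in taxaList.txt, the run reads lineA_haplotype_caller_output.g.vcf
 * and writes lineA_MergedAnchorSequence.fa and lineA_stats.csv.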
 * TODO Create a plugin which will do this once we recode GVCF->Fasta
 * Created by zrm22 on 5/2/17.
 */
public class ExtractFastaFromGVCFCBSU {
    public static void main(String[] args) {
        if (args.length < 3) {
            System.err.println("Usage: ExtractFastaFromGVCFCBSU <refFasta> <intervalFile> <gvcfListFile>");
            System.exit(1);
        }
        ExtractFastaFromGVCFCBSU app = new ExtractFastaFromGVCFCBSU();
        app.extractFasta(args[0], args[1], args[2]);
    }

    /**
     * Method to set up the run by creating a file list, then pulling sequence for each taxon in it
     * @param refFileName      reference genome FASTA file
     * @param intervalFileName file of anchor intervals, one per line as chr:start-end
     * @param gvcfListFileName file listing one taxon base name per line
     */
    public void extractFasta(String refFileName, String intervalFileName, String gvcfListFileName) {
        try (BufferedReader reader = new BufferedReader(new FileReader(gvcfListFileName))) {

            //loop through the lines of the reader and add to a list
            ArrayList<String> gvcfFileNameList = new ArrayList<>();
            String currentFileName;
            while ((currentFileName = reader.readLine()) != null) {
                gvcfFileNameList.add(currentFileName);
            }

            for (String taxonName : gvcfFileNameList) {
                System.out.println("Pulling sequence for taxon: " + taxonName);
                pullSequenceMultiThread(refFileName, taxonName + "_haplotype_caller_output.g.vcf",
                        intervalFileName, taxonName + "_MergedAnchorSequence.fa",
                        taxonName + "_stats.csv");
            }

        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Method which creates a GVCFGenomeSequence, then extracts the sequence for each interval in a multithreaded manner
     * @param refFileName      reference genome FASTA file
     * @param gvcfFileName     GVCF file holding the current taxon's variant calls
     * @param intervalFileName file of anchor intervals, one per line as chr:start-end
     * @param outputFileName   FASTA file the extracted interval sequences are written to
     * @param statFileName     CSV file the per-interval stats are written to
     */
    private void pullSequenceMultiThread(String refFileName, String gvcfFileName, String intervalFileName,
                                         String outputFileName, String statFileName) {
        try {
            System.out.println("Creating GVCFSequence object");
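            // Build a genome sequence that combines the reference with this taxon's GVCF calls,
            // so the per-interval queries below return the taxon's sequence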
            GVCFGenomeSequence sequence = (GVCFGenomeSequence) GVCFGenomeSequenceBuilder.instance(refFileName, gvcfFileName);
            System.out.println("Created Genome Sequence");
            HashMap<String, Chromosome> chrMap = new HashMap<>();

            // Map each chromosome's number (as it appears in the interval file) to its Chromosome object
            for (Object chrObj : sequence.chromosomes().toArray()) {
                Chromosome chr = (Chromosome) chrObj;
                chrMap.put(String.valueOf(chr.getChromosomeNumber()), chr);
            }

            BufferedWriter writer = new BufferedWriter(new FileWriter(outputFileName));
            BufferedWriter statsWriter = new BufferedWriter(new FileWriter(statFileName));
            BufferedReader reader = new BufferedReader(new FileReader(intervalFileName));
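            // writer receives the FASTA records, statsWriter the CSV rows, and reader supplies the anchor intervals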

            String currentLine = "";
            int lineCounter = 0;
            statsWriter.write("ID,RequestedSize,ExportedSize,HetCount,AltCount,Depth,GQ,Min_Depth,ZeroCoverageCount,HomoRefCount,HomoAltLowDepthCount,HomoAltHighDepthCount");
            statsWriter.newLine();

            ArrayList<String> chromList = new ArrayList<>();
            ArrayList<Integer> startPosList = new ArrayList<>();
            ArrayList<Integer> endPosList = new ArrayList<>();
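            // Parse the anchor interval file; from the split logic below, each non-blank line is
            // expected to look like "chr:start-end" (e.g. "9:1000-2500", a hypothetical interval)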
            while((currentLine = reader.readLine())!=null) {
                if(currentLine.equals("")) {
                    continue;
                }
                if(lineCounter%100==0) {
                    System.out.println("Counter: "+lineCounter);
                }
                lineCounter++;
                String[] currentLineSplit = currentLine.split(":");
                String chr = currentLineSplit[0];
                String[] currentLineSplit2 = currentLineSplit[1].split("-");
                String startPos = currentLineSplit2[0];
                String endPos = currentLineSplit2[1];

                chromList.add(chr);
                startPosList.add(Integer.parseInt(startPos));
                endPosList.add(Integer.parseInt(endPos));
            }
            reader.close();

            LongAdder adder = new LongAdder();
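            // LongAdder is a low-contention, thread-safe counter, letting the parallel workers
            // below report progress without synchronizing on a shared lock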

            //Set up the parallel stream and get each interval specified by the anchor file.  Collects everything into a 2d ArrayList for the sequence and the stats
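            // Because IntStream.range yields an ordered stream, collect preserves encounter order even
            // when parallel, so output rows stay aligned with the interval file's row order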
            ArrayList<ArrayList<String>> sequenceAndStatsForExport = IntStream.range(0, chromList.size()).parallel().mapToObj(index -> {
                String chr = chromList.get(index);
                int startPos = startPosList.get(index);
                int endPos = endPosList.get(index);
                adder.increment();
                int intVal = adder.intValue();
                if(intVal%1000==0) {
                    System.out.println("MultithreadCounter: "+intVal);
                }

                HashMap<String, String> sequenceAndStatMap = sequence.chromosomeSequenceAndStats(chrMap.get(chr), startPos, endPos);

                // Build the FASTA record: a ">chr:startPos:endPos" header line plus the extracted sequence
                StringBuilder fastaLineBuilder = new StringBuilder();
                fastaLineBuilder.append(">" + chr + ":" + startPos + ":" + endPos);
                fastaLineBuilder.append("\n");
                fastaLineBuilder.append(sequenceAndStatMap.get("Sequence"));
                String fastaLine = fastaLineBuilder.toString();

                // Build the matching stats row; column order mirrors the CSV header written above
                StringBuilder statLineBuilder = new StringBuilder();
                statLineBuilder.append(chr + ":" + startPos + ":" + endPos + ",");
                statLineBuilder.append(sequenceAndStatMap.get("RefSize")+",");
                statLineBuilder.append(sequenceAndStatMap.get("Size")+",");
                statLineBuilder.append(sequenceAndStatMap.get("HetCount")+",");
                statLineBuilder.append(sequenceAndStatMap.get("AltCount")+",");
                statLineBuilder.append(sequenceAndStatMap.get("Depth")+",");
                statLineBuilder.append(sequenceAndStatMap.get("GQ")+",");
                statLineBuilder.append(sequenceAndStatMap.get("Min_Depth")+",");
                statLineBuilder.append(sequenceAndStatMap.get("ZeroCoverageCount")+",");
                statLineBuilder.append(sequenceAndStatMap.get("HomoRefCount")+",");
                statLineBuilder.append(sequenceAndStatMap.get("HomoAltLowDepthCount")+",");
                statLineBuilder.append(sequenceAndStatMap.get("HomoAltHighDepthCount"));
                ArrayList<String> exportStrings = new ArrayList<>();
                exportStrings.add(fastaLine);
                exportStrings.add(statLineBuilder.toString());
                return exportStrings;
            }).collect(Collectors.toCollection(ArrayList::new));

            // Each inner list holds [fastaLine, statsLine]; write each to its respective file
            for (ArrayList<String> currentStringToExport : sequenceAndStatsForExport) {
                writer.write(currentStringToExport.get(0));
                writer.newLine();
                statsWriter.write(currentStringToExport.get(1));
                statsWriter.newLine();
            }
            writer.close();
            statsWriter.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}