net.maizegenetics.dna.snp.io.BuilderFromVCF Maven / Gradle / Ivy
TASSEL is a software package to evaluate trait associations, evolutionary patterns, and linkage
disequilibrium.
package net.maizegenetics.dna.snp.io;
import com.google.common.base.Splitter;
import com.google.common.collect.SetMultimap;
import net.maizegenetics.dna.WHICH_ALLELE;
import net.maizegenetics.dna.map.*;
import net.maizegenetics.dna.snp.GenotypeTable;
import net.maizegenetics.dna.snp.GenotypeTableBuilder;
import net.maizegenetics.dna.snp.GenotypeTableUtils;
import net.maizegenetics.dna.snp.NucleotideAlignmentConstants;
import net.maizegenetics.dna.snp.score.AlleleDepthBuilder;
import net.maizegenetics.dna.snp.score.AlleleDepthUtil;
import net.maizegenetics.dna.snp.genotypecall.GenotypeCallTable;
import net.maizegenetics.dna.snp.genotypecall.GenotypeCallTableBuilder;
import net.maizegenetics.taxa.TaxaList;
import net.maizegenetics.taxa.TaxaListBuilder;
import net.maizegenetics.taxa.TaxaListIOUtils;
import net.maizegenetics.taxa.Taxon;
import net.maizegenetics.util.ProgressListener;
import net.maizegenetics.util.Tassel5HDF5Constants;
import net.maizegenetics.util.Utils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.List;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.TreeMap;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.regex.Pattern;
/**
* Create an alignment based on a VCF format file (either .txt or compressed). Alleles are set as the global
* reference.
* e.g.
* {@code
* GenotypeTable a=BuilderFromVCF.getBuilder(infileName).build();
* }
*
* TODO: Add filtering while reading, provide an option to define the alleles as reference and alternate
*
* @author Ed Buckler
*/
public class BuilderFromVCF {
private static final Logger myLogger=LogManager.getLogger(BuilderFromVCF.class);
private static final Pattern WHITESPACE_PATTERN=Pattern.compile("[\\s]+");
private static final Pattern TAB_PATTERN = Pattern.compile("[\\t]+");
private HeaderPositions hp=null;
private final String infile;
private boolean includeDepth=false;
private boolean inMemory=true;
private String hdf5Outfile=null;
private GenotypeTableBuilder hdf5GenoTableBuilder=null;
private final ProgressListener myProgressListener;
private BuilderFromVCF(String infile,ProgressListener listener) {
this.infile=infile;
this.myProgressListener = listener;
}
/**
* Create a builder for loading a VCF file into memory
* @param infile name of the VCF file to be loaded
* @return a builder
*/
public static BuilderFromVCF getBuilder(String infile) {
return new BuilderFromVCF(infile,null);
}
/**
* Create a builder for loading a VCF file into memory
* @param infile name of the VCF file to be loaded
* @param listener Progress listener for GUI
* @return a builder
*/
public static BuilderFromVCF getBuilder(String infile, ProgressListener listener) {
return new BuilderFromVCF(infile,listener);
}
/**
* Create an HDF5 version of the genotypes from the VCF file. This is useful when the VCF file is
* too large to load fully into memory.
* @param hdf5Outfile name of the HDF5 file to write
* @return this builder
*/
public BuilderFromVCF convertToHDF5(String hdf5Outfile) {
inMemory=false;
this.hdf5Outfile=hdf5Outfile;
//int numberOfSites=Utils.getNumberLinesNotHashOrBlank(hdf5Outfile);
//hdf5GenoTableBuilder=new GenotypeTableBuilder(hdf5Outfile,numberOfSites);
return this;
}
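    /* Illustrative usage sketch (file names hypothetical): stream a VCF that is too
     * large for memory straight into a Tassel 5 HDF5 genotype file.  In this mode
     * build() writes through GenotypeTableBuilder.getSiteIncremental() and returns null.
     *
     *   BuilderFromVCF.getBuilder("large.vcf.gz")
     *           .convertToHDF5("large.t5.h5")
     *           .keepDepth()
     *           .build();
     */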
public BuilderFromVCF keepDepth() {
includeDepth=true;
return this;
}
public GenotypeTable buildAndSortInMemory() {
return buildEngine(true);
}
public GenotypeTable build() {
return buildEngine(false);
}
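    /* Illustrative usage sketch (file names hypothetical): build() expects the VCF to be
     * ordered by position and fails validation otherwise; buildAndSortInMemory() sorts
     * the positions first, so it is the safer choice for files of unknown ordering.
     *
     *   GenotypeTable ordered=BuilderFromVCF.getBuilder("ordered.vcf").build();
     *   GenotypeTable sorted=BuilderFromVCF.getBuilder("unordered.vcf").buildAndSortInMemory();
     */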
//TODO provide options on caching to use, read only some sites, etc.
private GenotypeTable buildEngine(boolean fullSort) {
long time=System.nanoTime();
GenotypeTable result=null;
int totalSites=-1;//unknown
GenotypeTableBuilder gtbDiskBuild=null;
ExecutorService pool = null;
try {
int numThreads=Runtime.getRuntime().availableProcessors();
pool=Executors.newFixedThreadPool(numThreads);
BufferedReader r=Utils.getBufferedReader(infile, -1);
//Read the ## annotation rows
String currLine;
            Map<String,String> infoMap=new HashMap<>();
            Map<String,String> formatMap=new HashMap<>();
            Map<String,SetMultimap<String,String>> sampAnnoBuild=new TreeMap<>();
currLine=parseVCFHeadersIntoMaps(infoMap,formatMap,sampAnnoBuild,r);
TaxaList taxaList=processTaxa(currLine,sampAnnoBuild);
if(inMemory==false) {
totalSites=Utils.getNumberLinesNotHashOrBlank(infile);
System.out.println("TotalSite count:"+totalSites);
gtbDiskBuild=GenotypeTableBuilder.getSiteIncremental(taxaList,totalSites,hdf5Outfile);
}
            int linesAtTime=(inMemory)?1<<12:Tassel5HDF5Constants.BLOCK_SIZE; //this is a critical setting, with 20% or more performance swings; needs to be optimized with transposing
            // int linesAtTime=1<<8; //better with lots of taxa
            ArrayList<String> txtLines=new ArrayList<>(linesAtTime);
            ArrayList<ProcessVCFBlock> pbs=new ArrayList<>();
            List<Future<ProcessVCFBlock>> futures = new ArrayList<>();
int sitesRead=0;
while ((currLine=r.readLine())!=null) {
if(currLine.startsWith("#")) continue;
txtLines.add(currLine);
sitesRead++;
if (sitesRead%linesAtTime==0) {
ProcessVCFBlock pb;
if(inMemory) {
pb=ProcessVCFBlock.getInstance(taxaList.numberOfTaxa(), hp, txtLines,includeDepth);}
else{
pb=ProcessVCFBlock.getInstance(taxaList.numberOfTaxa(), hp, txtLines, sitesRead-txtLines.size(),gtbDiskBuild, includeDepth);
}
//pbs.add(pb);
// pb.run(); //used for testing
try {
if(inMemory) {
futures.add(pool.submit(pb));
}
else {
//Put a future on the queue
futures.add(pool.submit(pb));
                        //If we are streaming to HDF5, we need to block temporarily and clean out the queue.
if(futures.size()>=numThreads) {
//if(futures.size()>0) {
                            for(Future<ProcessVCFBlock> future : futures) {
pbs.add(future.get());
}
futures = new ArrayList<>();
}
}
}
catch(Exception e) {
myLogger.debug(e.getMessage(), e);
throw new IllegalStateException(e.getMessage());
}
txtLines=new ArrayList<>(linesAtTime);
}
}
r.close();
//Handle whatever is left over in the file
if (txtLines.size()>0) {
ProcessVCFBlock pb;//=ProcessVCFBlock.getInstance(taxaList.numberOfTaxa(), hp, txtLines);
if(inMemory) {
pb=ProcessVCFBlock.getInstance(taxaList.numberOfTaxa(), hp, txtLines, includeDepth);}
else{
pb=ProcessVCFBlock.getInstance(taxaList.numberOfTaxa(), hp, txtLines, sitesRead-txtLines.size(),gtbDiskBuild, includeDepth);
}
//pbs.add(pb);
// pb.run(); //used for testing
try {
futures.add(pool.submit(pb));
}
catch(Exception e) {
myLogger.debug(e.getMessage(), e);
throw new IllegalStateException(e.getMessage());
}
}
int numFutures = futures.size();
int count = 0;
            for(Future<ProcessVCFBlock> future : futures) {
try {
pbs.add(future.get());
}
catch(Exception e) {
myLogger.debug(e.getMessage(),e);
throw new IllegalStateException(e.getMessage());
}
if(myProgressListener != null) {
count++;
myProgressListener.progress(count * 100 / numFutures,null);
}
}
pool.shutdown();
if(inMemory) {
//result=completeInMemoryBuilding(futures, taxaList, sitesRead, includeDepth, fullSort);
result=completeInMemoryBuilding(pbs, taxaList, sitesRead, includeDepth, fullSort);
} else {
gtbDiskBuild.build();
}
        } catch (Exception e) {
            myLogger.debug(e.getMessage(), e);
            throw new IllegalStateException("Error reading VCF file: " + infile + ": " + e.getMessage());
        } finally {
            if (pool != null) {
                pool.shutdown();
            }
        }
        long totalTime = System.nanoTime() - time;
        System.out.printf("BuilderFromVCF data timing %gs %n", totalTime / 1e9);
        return result;
    }

    private static GenotypeTable completeInMemoryBuilding(List<ProcessVCFBlock> pbs, TaxaList taxaList, int numberOfSites, boolean includeDepth, boolean fullSort) {
int currentSite=0;
PositionListBuilder posBuild=new PositionListBuilder();
GenotypeCallTableBuilder gb=GenotypeCallTableBuilder.getUnphasedNucleotideGenotypeBuilder(taxaList.numberOfTaxa(), numberOfSites);
AlleleDepthBuilder db=null;
if(includeDepth) db=AlleleDepthBuilder.getInstance(taxaList.numberOfTaxa(),numberOfSites,taxaList);
for (ProcessVCFBlock pb: pbs) {
posBuild.addAll(pb.getBlkPosList());
byte[][] bgTS=pb.getGenoTS();
for (int t=0; t infoMap, Map formatMap,
Map> sampAnnoBuild, BufferedReader r) throws IOException {
String currLine;
while (((currLine=r.readLine())!=null)&&(currLine.startsWith("##"))) {
String[] cat=currLine.split("=",2);
if(cat.length<2) continue;
switch (cat[0]) {
// case "##INFO":
// infoMap.put(mapOfAnno.get("ID"), mapOfAnno.get("Description"));
// break;
// case "##FILTER":break;
// case "##FORMAT":
// formatMap.put(mapOfAnno.get("ID"),mapOfAnno.get("Description"));
// break;
case "##SAMPLE":
SetMultimap mapOfAnno=TaxaListIOUtils.parseVCFHeadersIntoMap(cat[1]);
String taxaID=mapOfAnno.get("ID").iterator().next();
if(taxaID==null) break;
sampAnnoBuild.put(taxaID,mapOfAnno);
break;
case "##PEDIGREE":break;
default : break;
}
// System.out.println(currLine);
}
return currLine;
}
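    /* Illustrative example of the one header type parsed above (sample name hypothetical):
     *   ##SAMPLE=<ID=B73,Description="reference inbred">
     * is split once on '=' and handed to TaxaListIOUtils.parseVCFHeadersIntoMap(), so
     * sampAnnoBuild maps "B73" to {ID=B73, Description="reference inbred"}; everything
     * except ID is later attached to the matching Taxon in processTaxa().
     */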
private static String getReplaceCommaWithinQuote(String s) {
StringBuilder sb =new StringBuilder(s);
boolean inQuote=false;
        for (int i=0; i<sb.length(); i++) {
            if (sb.charAt(i)=='\"') inQuote=!inQuote;
            //assumption: in-quote commas are swapped for semicolons so later comma splits are safe
            if (inQuote && sb.charAt(i)==',') sb.setCharAt(i, ';');
        }
        return sb.toString();
    }

    private TaxaList processTaxa(String readLn, Map<String,SetMultimap<String,String>> taxaAnnotation) {
String[] header = TAB_PATTERN.split(readLn);
hp=new HeaderPositions(header);
int numTaxa=header.length-hp.NUM_HAPMAP_NON_TAXA_HEADERS;
TaxaListBuilder tlb=new TaxaListBuilder();
        for (int i=0; i<numTaxa; i++) {
            String taxonID=header[i+hp.NUM_HAPMAP_NON_TAXA_HEADERS];
            Taxon.Builder at=new Taxon.Builder(taxonID);
            SetMultimap<String,String> taMap=taxaAnnotation.get(taxonID);
if(taMap!=null) {
                for (Map.Entry<String,String> en : taMap.entries()) {
if(en.getKey().equals("ID")) continue; //skip the IDs as these became the name
String s=en.getValue().replace("\"","");
at.addAnno(en.getKey(),s);
}
}
tlb.add(at.build());
}
return tlb.build();
}
}
class HeaderPositions {
final int NUM_HAPMAP_NON_TAXA_HEADERS;
final int GENOIDX;
final int SNPID_INDEX;
// final int VARIANT_INDEX;
final int FILTER_INDEX;
final int QUAL_INDEX;
final int CHROMOSOME_INDEX;
final int POSITION_INDEX;
final int REF_INDEX;
final int ALT_INDEX;
final int INFO_INDEX;
final int FORMAT_INDEX;
public HeaderPositions(String[] header){
int chrIdx=firstEqualIndex(header,"#CHROM");
if(chrIdx<0) chrIdx=firstEqualIndex(header,"#CHR");
CHROMOSOME_INDEX=chrIdx;
POSITION_INDEX=firstEqualIndex(header,"POS");
SNPID_INDEX=firstEqualIndex(header,"ID");
REF_INDEX=firstEqualIndex(header,"REF");
ALT_INDEX=firstEqualIndex(header,"ALT");
QUAL_INDEX=firstEqualIndex(header,"QUAL");
FILTER_INDEX=firstEqualIndex(header,"FILTER");
INFO_INDEX=firstEqualIndex(header,"INFO");
FORMAT_INDEX=firstEqualIndex(header,"FORMAT");
NUM_HAPMAP_NON_TAXA_HEADERS=Math.max(INFO_INDEX,FORMAT_INDEX)+1;
GENOIDX=NUM_HAPMAP_NON_TAXA_HEADERS;
}
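    /* Illustrative example: for the standard VCF header line
     *   #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT taxon1 taxon2 ...
     * FORMAT_INDEX is 8, so NUM_HAPMAP_NON_TAXA_HEADERS is 9 and the taxa genotype
     * columns start at GENOIDX 9.  For a sites-only VCF with no FORMAT column, the
     * count falls back to INFO_INDEX+1.
     */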
private static int firstEqualIndex(String[] sa, String match) {
        for (int i=0; i<sa.length; i++) {
            if (sa[i].equals(match)) return i;
        }
        return -1;
    }
}

class ProcessVCFBlock implements Callable<ProcessVCFBlock> {
private static final Pattern WHITESPACE_PATTERN=Pattern.compile("\\s");
private static final Pattern SLASH_PATTERN=Pattern.compile("/");
private final HeaderPositions hp;
private final int taxaN;
private final int siteN;
    private final int startSite; //Integer.MIN_VALUE if unknown
    private final GenotypeTableBuilder hdf5Builder; //null if building in memory
    private ArrayList<String> txtL;
private byte[][] gTS; //genotypes
private byte[][][] dTS; //depth
    private final ArrayList<Position> blkPosList;
private final boolean keepDepth;
    private ProcessVCFBlock(int taxaN, HeaderPositions hp, ArrayList<String> txtL, int startSite,
GenotypeTableBuilder hdf5Builder, boolean keepDepth) {
this.taxaN=taxaN;
this.siteN=txtL.size();
this.txtL=txtL;
this.hp=hp;
blkPosList=new ArrayList<>(siteN);
this.startSite=startSite;
this.hdf5Builder=hdf5Builder;
this.keepDepth=keepDepth;
}
    /*Used to process VCF blocks and return the result for an in-memory GenotypeTable*/
    static ProcessVCFBlock getInstance(int taxaN, HeaderPositions hp, ArrayList<String> txtL, boolean keepDepth) {
return new ProcessVCFBlock(taxaN, hp, txtL, Integer.MIN_VALUE, null,keepDepth);
}
    /*Used to process VCF blocks and return the result for an on-disk HDF5 GenotypeTable*/
    static ProcessVCFBlock getInstance(int taxaN, HeaderPositions hp, ArrayList<String> txtL, int startSite, GenotypeTableBuilder hdf5Builder, boolean keepDepth) {
return new ProcessVCFBlock(taxaN, hp, txtL, startSite, hdf5Builder,keepDepth);
}
@Override
public ProcessVCFBlock call() throws Exception{
        Map<String,Chromosome> chromosomeLookup=new HashMap<>();
gTS=new byte[taxaN][siteN];
if(keepDepth==true) dTS=new byte[taxaN][6][siteN];
        for (int s=0; s<siteN; s++) {
            String input=txtL.get(s);
            try {
                //Find the tab offsets of the fixed VCF columns and the taxa columns
                int[] tabPos=new int[hp.NUM_HAPMAP_NON_TAXA_HEADERS+taxaN];
                int tabIndex=0;
                int len=input.length();
                for (int i=0; (tabIndex<hp.NUM_HAPMAP_NON_TAXA_HEADERS+taxaN)&&(i<len); i++) {
                    if (input.charAt(i)=='\t') tabPos[tabIndex++]=i;
                }
                String chrName=input.substring(0, tabPos[hp.CHROMOSOME_INDEX]);
                Chromosome currChr=chromosomeLookup.get(chrName);
                if (currChr==null) {
                    currChr=new Chromosome(new String(chrName));
                    chromosomeLookup.put(chrName, currChr);
                }
                String snpID=null;
                if(hp.SNPID_INDEX>0) snpID=input.substring(tabPos[hp.SNPID_INDEX-1]+1, tabPos[hp.SNPID_INDEX]);
String refS=input.substring(tabPos[hp.REF_INDEX-1]+1, tabPos[hp.REF_INDEX]);
String alt=input.substring(tabPos[hp.ALT_INDEX-1]+1, tabPos[hp.ALT_INDEX]);
String variants;
if(alt.equals(".")) {variants=refS;}
else {variants=(refS+"/"+alt).replace(',','/')
.replace("", "+").replace('I', '+')
.replace("", "-").replace('D', '-')
.replace("*", "N");}
//GeneralPosition.Builder apb=new GeneralPosition.Builder(currChr, currentPosition)
// .knownVariants(variants); //TODO strand, variants,
//ZRM 8_26
GeneralPosition.Builder apb=new GeneralPosition.Builder(currChr, Integer.parseInt(input.substring(tabPos[hp.POSITION_INDEX-1]+1, tabPos[hp.POSITION_INDEX])))
.knownVariants(variants) //TODO strand, variants,
;
if(snpID!=null && !snpID.equals(".")) {
apb.snpName(snpID);
}
//byte[] alleles=new byte[(variants.length()+1)/2];
byte[] alleles = new byte[variants.split("/").length];
for (int i = 0, varInd=0; i < alleles.length; i++, varInd+=2) {
alleles[i]=NucleotideAlignmentConstants.getNucleotideAlleleByte(variants.charAt(varInd));
}
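                //Illustrative example: variants "A/C/T" fills alleles[0..2] with the byte
                //codes for A, C and T; the allele numbers in the GT field (0,1,2,...)
                //index this array when genotypes are decoded below.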
/***ZRM 8_27 New code ***/
String[] variantList = variants.split("/");
if(variantList[0].length()>1) {
String[] parsedVariantList = new String[variantList.length];
//alt deletion
for(int i = 0; i < variantList.length; i++) {
//Pull off the first character if it exists
if(variantList[i].length()>1) {
parsedVariantList[i] = variantList[i].substring(1);
if(parsedVariantList[i].length()==0) {
parsedVariantList[i] = "-";
}
}
else {
//Mark as deletion
parsedVariantList[i] = "-";
}
}
                    //Recompute the allele byte codes from the anchor-trimmed variants
                    for(int i = 0; i < parsedVariantList.length; i++) {
                        alleles[i] = NucleotideAlignmentConstants.getNucleotideAlleleByte(parsedVariantList[i].charAt(0));
                    }
                }
                else {
                    //If any alt allele is longer than the ref, the site is an insertion-type indel
                    boolean isIndel = false;
                    for(int i = 1; i < variantList.length; i++) {
                        if(variantList[i].length() > variantList[0].length()) {
isIndel = true;
break;
}
}
if(isIndel) {
String[] parsedVariantList = new String[variantList.length];
//ref+alt deletion
for(int i = 0; i < variantList.length; i++) {
//Pull off the first character if it exists
if(variantList[i].length()>1) {
parsedVariantList[i] = variantList[i].substring(1);
if(parsedVariantList[i].length()==0) {
parsedVariantList[i] = "-";
}
}
else {
//Mark as deletion
parsedVariantList[i] = "-";
}
}
                        //Recompute the allele byte codes from the anchor-trimmed variants
                        for(int i = 0; i < parsedVariantList.length; i++) {
                            alleles[i] = NucleotideAlignmentConstants.getNucleotideAlleleByte(parsedVariantList[i].charAt(0));
                        }
                    }
                }
                apb.allele(WHICH_ALLELE.Reference, alleles[0]);
                if(alleles.length > 1) {
apb.allele(WHICH_ALLELE.Alternate, alleles[1]);
}
for(String annoS: Splitter.on(";").split(input.substring(tabPos[hp.INFO_INDEX-1]+1, tabPos[hp.INFO_INDEX]))) {
apb.addAnno(annoS);
}
blkPosList.add(apb.build());
final int iGT=0; //genotype index
int iAD=-1,iDP=-1,iGQ=-1, iPL=-1; //alleleDepth, overall depth, genotypeQuality, phredGenotypeLikelihoods
if(hp.FORMAT_INDEX>=0) {
//Check to see if FORMAT tag is missing. Only applicable for single taxa files
if(tabPos[hp.FORMAT_INDEX]==0) {
throw new IllegalStateException("Error Processing VCF: Missing FORMAT tag.");
}
String unsplitInput = input.substring(tabPos[hp.FORMAT_INDEX-1]+1, tabPos[hp.FORMAT_INDEX]);
if(unsplitInput.length()==0|| !unsplitInput.startsWith("GT")) {
//Check to see it has the GT field
if(unsplitInput.contains("GT")) {
throw new IllegalStateException("Error Processing VCF Block: GT field is not in first position of FORMAT.");
}
                    //If GT isn't present at all, we assume FORMAT itself is missing
else {
throw new IllegalStateException("Error Processing VCF Block: Missing FORMAT tag.");
}
}
String[] formatS = unsplitInput.split(":");
iAD=firstEqualIndex(formatS,"AD");
}
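            //Illustrative example: FORMAT "GT:AD:DP" passes the checks above (GT first)
            //and sets iAD=1, so the second colon-delimited field of each taxon entry is
            //read as allele depths when keepDepth is on.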
int t=0;
for(String taxaAllG: Splitter.on("\t").split(input.substring(tabPos[hp.NUM_HAPMAP_NON_TAXA_HEADERS-1]+1))) {
int f=0;
//if(taxaAllG.equals(".")) {
if(taxaAllG.startsWith(".")) {
gTS[t][s] = GenotypeTable.UNKNOWN_DIPLOID_ALLELE;
                    //still need to advance the taxa counter by one, as this taxon is now handled
t++;
continue;
}
for(String fieldS: Splitter.on(":").split(taxaAllG)) {
if(f==iGT) {
//String "[.0-9]\\/[.0-9]||[.0-9]\\|[.0-9]" will match a valid diploid
if (!fieldS.equals(".")) { //[TAS-509] Check to make sure we are using diploids in the form 0/1 or 0|0
int a1 = 0;
int a2 = 0;
//Should be this, but for speed we just check the character
//if(fieldS.split("|/").length ==1) {
if(fieldS.length() ==1) {
a1 = fieldS.charAt(0) - '0';
a2 = fieldS.charAt(0) - '0';
}
else {
//zrm22 Jul 31, 2017
//int a1 = fieldS.charAt(0) - '0';
//int a2 = fieldS.charAt(2) - '0';
a1 = fieldS.charAt(0) - '0';
a2 = fieldS.charAt(2) - '0';
}
if(a1>alleles.length-1 || a2>alleles.length-1) {
Position pos = blkPosList.get(blkPosList.size()-1);
throw new IllegalStateException("\nError Processing VCF block: Mismatch of alleles.\n At Chromosome "+ pos.getChromosome().getName() + ", Position "+pos.getPosition() +".\nAllele ID larger than number of alleles" );
}
if (a1 < 0 || a2 < 0) {
gTS[t][s] = GenotypeTable.UNKNOWN_DIPLOID_ALLELE;
}
else {
gTS[t][s] =
GenotypeTableUtils.getDiploidValue(alleles[a1], alleles[a2]);
}
}
                        else { //[TAS-509] if it isn't a diploid, error out early
throw new IllegalStateException("Error Processing VCF block: Found haploid information for the element: "
+ taxaAllG + ".\nExpected a diploid entry.");
}
} else if((f==iAD)&&keepDepth) {
// System.out.println("GTS: "+gTS[t][s] + " "+fieldS);
if(gTS[t][s]!=GenotypeTable.UNKNOWN_DIPLOID_ALLELE) {
int i=0;
for(String ad: Splitter.on(",").split(fieldS)){
if(i>=alleles.length) continue;
if(alleles[i]==GenotypeTable.UNKNOWN_ALLELE || ad.equals(".") || alleles[i]==NucleotideAlignmentConstants.UNDEFINED_ALLELE ||
alleles[i]==NucleotideAlignmentConstants.UNDEFINED_DIPLOID_ALLELE) { //no position for depth of unknown alleles or depth is set to missing, so skip
//Uncomment when converted
//dTS[t][alleles[i++]][s] = AlleleDepthUtil.depthIntToByte(AlleleDepthUtil.DEPTH_MISSING);
//Comment next two lines when converted
i++;
continue;
}
int adInt=Integer.parseInt(ad);
dTS[t][alleles[i++]][s]=AlleleDepthUtil.depthIntToByte(adInt);
}
}
}
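                        //Illustrative example: with alleles {A,C} and an AD field of "7,2",
                        //depth 7 is stored for A and 2 for C via AlleleDepthUtil.depthIntToByte();
                        //a "." depth entry is skipped as missing.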
f++;
}
t++;
}
} catch(IllegalStateException e) {
e.printStackTrace();
throw e;
}
catch(Exception e) {
System.err.println("Err Site Number:"+s);
System.err.println("Err Site Number:"+blkPosList.get(blkPosList.size()-1).toString());
System.err.println("Err:"+input);
throw e;
}
}
txtL=null;
if(hdf5Builder!=null) {
addResultsToHDF5Builder();
gTS=null;
dTS=null;
txtL=null;
blkPosList.clear();
}
        //TODO TAS-315 Create memory-efficient VCF to HDF5 insert, writing either to the Builder or directly.
return this;
}
private void addResultsToHDF5Builder() {
hdf5Builder.addSiteBlock(startSite, PositionListBuilder.getInstance(blkPosList), gTS, dTS);
}
int getSiteNumber() {
return siteN;
}
byte[][] getGenoTS() {
return gTS;
}
byte[][][] getDepthTS() {
return dTS;
}
    ArrayList<Position> getBlkPosList() {
return blkPosList;
}
private static int firstEqualIndex(String[] sa, String match) {
        for (int i=0; i<sa.length; i++) {
            if (sa[i].equals(match)) return i;
        }
        return -1;
    }
}