
net.maizegenetics.dna.snp.io.BuilderFromVCF


TASSEL is a software package used to evaluate trait associations, evolutionary patterns, and linkage disequilibrium.

package net.maizegenetics.dna.snp.io;

import com.google.common.base.Splitter;
import com.google.common.collect.SetMultimap;

import net.maizegenetics.dna.WHICH_ALLELE;
import net.maizegenetics.dna.map.*;
import net.maizegenetics.dna.snp.GenotypeTable;
import net.maizegenetics.dna.snp.GenotypeTableBuilder;
import net.maizegenetics.dna.snp.GenotypeTableUtils;
import net.maizegenetics.dna.snp.NucleotideAlignmentConstants;
import net.maizegenetics.dna.snp.score.AlleleDepthBuilder;
import net.maizegenetics.dna.snp.score.AlleleDepthUtil;
import net.maizegenetics.dna.snp.genotypecall.GenotypeCallTable;
import net.maizegenetics.dna.snp.genotypecall.GenotypeCallTableBuilder;
import net.maizegenetics.taxa.TaxaList;
import net.maizegenetics.taxa.TaxaListBuilder;
import net.maizegenetics.taxa.TaxaListIOUtils;
import net.maizegenetics.taxa.Taxon;
import net.maizegenetics.util.ProgressListener;
import net.maizegenetics.util.Tassel5HDF5Constants;
import net.maizegenetics.util.Utils;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.io.BufferedReader;
import java.io.IOException;
import java.util.List;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.TreeMap;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.regex.Pattern;

/**
 * Create an alignment based on a VCF format file (either .txt or compressed).  Alleles are set as global reference.
 * e.g. code
 * <pre>{@code
 * Alignment a=BuilderFromVCF.getBuilder(infileName).build();
 * }</pre>
 *
 * TODO: Add filtering while reading, provide an option to define the alleles as reference and alternate
 *
 * @author Ed Buckler
 */
public class BuilderFromVCF {

    private static final Logger myLogger = LogManager.getLogger(BuilderFromVCF.class);
    private static final Pattern WHITESPACE_PATTERN = Pattern.compile("[\\s]+");
    private static final Pattern TAB_PATTERN = Pattern.compile("[\\t]+");
    private HeaderPositions hp = null;
    private final String infile;
    private boolean includeDepth = false;
    private boolean inMemory = true;
    private String hdf5Outfile = null;
    private GenotypeTableBuilder hdf5GenoTableBuilder = null;
    private final ProgressListener myProgressListener;

    private BuilderFromVCF(String infile, ProgressListener listener) {
        this.infile = infile;
        this.myProgressListener = listener;
    }

    /**
     * Create a builder for loading a VCF file into memory.
     *
     * @param infile name of the VCF file to be loaded
     * @return a builder
     */
    public static BuilderFromVCF getBuilder(String infile) {
        return new BuilderFromVCF(infile, null);
    }

    /**
     * Create a builder for loading a VCF file into memory.
     *
     * @param infile name of the VCF file to be loaded
     * @param listener progress listener for GUI
     * @return a builder
     */
    public static BuilderFromVCF getBuilder(String infile, ProgressListener listener) {
        return new BuilderFromVCF(infile, listener);
    }

    /**
     * Create an HDF5 version of the VCF file.  This is useful in many cases where VCF files are
     * too large to load fully into memory.
     *
     * @param hdf5Outfile name of the HDF5 file to write
     * @return this builder
     */
    public BuilderFromVCF convertToHDF5(String hdf5Outfile) {
        inMemory = false;
        this.hdf5Outfile = hdf5Outfile;
        //int numberOfSites=Utils.getNumberLinesNotHashOrBlank(hdf5Outfile);
        //hdf5GenoTableBuilder=new GenotypeTableBuilder(hdf5Outfile,numberOfSites);
        return this;
    }

    public BuilderFromVCF keepDepth() {
        includeDepth = true;
        return this;
    }

    public GenotypeTable buildAndSortInMemory() {
        return buildEngine(true);
    }

    public GenotypeTable build() {
        return buildEngine(false);
    }

    //TODO provide options on caching to use, read only some sites, etc.
    private GenotypeTable buildEngine(boolean fullSort) {
        long time = System.nanoTime();
        GenotypeTable result = null;
        int totalSites = -1;  //unknown
        GenotypeTableBuilder gtbDiskBuild = null;
        ExecutorService pool = null;
        try {
            int numThreads = Runtime.getRuntime().availableProcessors();
            pool = Executors.newFixedThreadPool(numThreads);
            BufferedReader r = Utils.getBufferedReader(infile, -1);
            //Read the ## annotation rows
            String currLine;
            Map<String, String> infoMap = new HashMap<>();
            Map<String, String> formatMap = new HashMap<>();
            Map<String, SetMultimap<String, String>> sampAnnoBuild = new TreeMap<>();
            currLine = parseVCFHeadersIntoMaps(infoMap, formatMap, sampAnnoBuild, r);
            TaxaList taxaList = processTaxa(currLine, sampAnnoBuild);
            if (inMemory == false) {
                totalSites = Utils.getNumberLinesNotHashOrBlank(infile);
                System.out.println("TotalSite count:" + totalSites);
                gtbDiskBuild = GenotypeTableBuilder.getSiteIncremental(taxaList, totalSites, hdf5Outfile);
            }
            //This is a critical line, with 20% or more performance swings.  Needs to be optimized with transposing.
            int linesAtTime = (inMemory) ? 1 << 12 : Tassel5HDF5Constants.BLOCK_SIZE;
            //int linesAtTime=1<<8;  //better with lots of taxa
            ArrayList<String> txtLines = new ArrayList<>(linesAtTime);
            ArrayList<ProcessVCFBlock> pbs = new ArrayList<>();
            List<Future<ProcessVCFBlock>> futures = new ArrayList<>();
            int sitesRead = 0;
            while ((currLine = r.readLine()) != null) {
                if (currLine.startsWith("#")) continue;
                txtLines.add(currLine);
                sitesRead++;
                if (sitesRead % linesAtTime == 0) {
                    ProcessVCFBlock pb;
                    if (inMemory) {
                        pb = ProcessVCFBlock.getInstance(taxaList.numberOfTaxa(), hp, txtLines, includeDepth);
                    } else {
                        pb = ProcessVCFBlock.getInstance(taxaList.numberOfTaxa(), hp, txtLines, sitesRead - txtLines.size(), gtbDiskBuild, includeDepth);
                    }
                    //pbs.add(pb);
                    //pb.run();  //used for testing
                    try {
                        if (inMemory) {
                            futures.add(pool.submit(pb));
                        } else {
                            //Put a future on the queue
                            futures.add(pool.submit(pb));
                            //If we are streaming to HDF5, we need to block temporarily and clean out the queue.
                            if (futures.size() >= numThreads) {
                            //if(futures.size()>0) {
                                for (Future<ProcessVCFBlock> future : futures) {
                                    pbs.add(future.get());
                                }
                                futures = new ArrayList<>();
                            }
                        }
                    } catch (Exception e) {
                        myLogger.debug(e.getMessage(), e);
                        throw new IllegalStateException(e.getMessage());
                    }
                    txtLines = new ArrayList<>(linesAtTime);
                }
            }
            r.close();
            //Handle whatever is left over in the file
            if (txtLines.size() > 0) {
                ProcessVCFBlock pb;  //=ProcessVCFBlock.getInstance(taxaList.numberOfTaxa(), hp, txtLines);
                if (inMemory) {
                    pb = ProcessVCFBlock.getInstance(taxaList.numberOfTaxa(), hp, txtLines, includeDepth);
                } else {
                    pb = ProcessVCFBlock.getInstance(taxaList.numberOfTaxa(), hp, txtLines, sitesRead - txtLines.size(), gtbDiskBuild, includeDepth);
                }
                //pbs.add(pb);
                //pb.run();  //used for testing
                try {
                    futures.add(pool.submit(pb));
                } catch (Exception e) {
                    myLogger.debug(e.getMessage(), e);
                    throw new IllegalStateException(e.getMessage());
                }
            }
            int numFutures = futures.size();
            int count = 0;
            for (Future<ProcessVCFBlock> future : futures) {
                try {
                    pbs.add(future.get());
                } catch (Exception e) {
                    myLogger.debug(e.getMessage(), e);
                    throw new IllegalStateException(e.getMessage());
                }
                if (myProgressListener != null) {
                    count++;
                    myProgressListener.progress(count * 100 / numFutures, null);
                }
            }
            pool.shutdown();
            if (inMemory) {
                //result=completeInMemoryBuilding(futures, taxaList, sitesRead, includeDepth, fullSort);
                result = completeInMemoryBuilding(pbs, taxaList, sitesRead, includeDepth, fullSort);
            } else {
                gtbDiskBuild.build();
            }
        } catch (IOException e) {
            myLogger.debug(e.getMessage(), e);
            throw new IllegalStateException("Error reading VCF file: " + infile + ": " + e.getMessage());
        } finally {
            if (pool != null) {
                pool.shutdown();
            }
        }
        long totalTime = System.nanoTime() - time;
        System.out.printf("BuilderFromVCF data timing %gs %n", totalTime / 1e9);
        return result;
    }

    private static GenotypeTable completeInMemoryBuilding(ArrayList<ProcessVCFBlock> pbs, TaxaList taxaList, int numberOfSites, boolean includeDepth, boolean fullSort) {
        int currentSite = 0;
        PositionListBuilder posBuild = new PositionListBuilder();
        GenotypeCallTableBuilder gb = GenotypeCallTableBuilder.getUnphasedNucleotideGenotypeBuilder(taxaList.numberOfTaxa(), numberOfSites);
        AlleleDepthBuilder db = null;
        if (includeDepth) db = AlleleDepthBuilder.getInstance(taxaList.numberOfTaxa(), numberOfSites, taxaList);
        for (ProcessVCFBlock pb : pbs) {
            posBuild.addAll(pb.getBlkPosList());
            byte[][] bgTS = pb.getGenoTS();
            for (int t = 0; t < bgTS.length; t++) {
                gb.setBaseRangeForTaxon(t, currentSite, bgTS[t]);
            }
            if (includeDepth) {
                byte[][][] bdTS = pb.getDepthTS();
                for (int t = 0; t < bdTS.length; t++) {
                    db.setDepthRangeForTaxon(t, currentSite, bdTS[t]);
                }
            }
            currentSite += pb.getSiteNumber();
        }
        if (fullSort) {
            posBuild.sortPositions(gb);
        }
        if (posBuild.validateOrdering() == false) {
            throw new IllegalStateException("BuilderFromVCF: Ordering incorrect.  VCF must be ordered by position.  Please first use SortGenotypeFilePlugin to correctly order the file.");
        }
        GenotypeCallTable g = gb.build();
        if (includeDepth) {
            return GenotypeTableBuilder.getInstance(g, posBuild.build(), taxaList, db.build());
        }
        return GenotypeTableBuilder.getInstance(g, posBuild.build(), taxaList);
    }

    private static String parseVCFHeadersIntoMaps(Map<String, String> infoMap, Map<String, String> formatMap, Map<String, SetMultimap<String, String>> sampAnnoBuild, BufferedReader r) throws IOException {
        String currLine;
        while (((currLine = r.readLine()) != null) && (currLine.startsWith("##"))) {
            String[] cat = currLine.split("=", 2);
            if (cat.length < 2) continue;
            switch (cat[0]) {
//                case "##INFO":
//                    infoMap.put(mapOfAnno.get("ID"),
mapOfAnno.get("Description")); // break; // case "##FILTER":break; // case "##FORMAT": // formatMap.put(mapOfAnno.get("ID"),mapOfAnno.get("Description")); // break; case "##SAMPLE": SetMultimap mapOfAnno=TaxaListIOUtils.parseVCFHeadersIntoMap(cat[1]); String taxaID=mapOfAnno.get("ID").iterator().next(); if(taxaID==null) break; sampAnnoBuild.put(taxaID,mapOfAnno); break; case "##PEDIGREE":break; default : break; } // System.out.println(currLine); } return currLine; } private static String getReplaceCommaWithinQuote(String s) { StringBuilder sb =new StringBuilder(s); boolean inQuote=false; for (int i=0; i0) snpID=input.substring(tabPos[hp.SNPID_INDEX-1]+1, tabPos[hp.SNPID_INDEX]); String refS=input.substring(tabPos[hp.REF_INDEX-1]+1, tabPos[hp.REF_INDEX]); String alt=input.substring(tabPos[hp.ALT_INDEX-1]+1, tabPos[hp.ALT_INDEX]); String variants; if(alt.equals(".")) {variants=refS;} else {variants=(refS+"/"+alt).replace(',','/') .replace("", "+").replace('I', '+') .replace("", "-").replace('D', '-') .replace("*", "N");} //GeneralPosition.Builder apb=new GeneralPosition.Builder(currChr, currentPosition) // .knownVariants(variants); //TODO strand, variants, //ZRM 8_26 GeneralPosition.Builder apb=new GeneralPosition.Builder(currChr, Integer.parseInt(input.substring(tabPos[hp.POSITION_INDEX-1]+1, tabPos[hp.POSITION_INDEX]))) .knownVariants(variants) //TODO strand, variants, ; if(snpID!=null && !snpID.equals(".")) { apb.snpName(snpID); } //byte[] alleles=new byte[(variants.length()+1)/2]; byte[] alleles = new byte[variants.split("/").length]; for (int i = 0, varInd=0; i < alleles.length; i++, varInd+=2) { alleles[i]=NucleotideAlignmentConstants.getNucleotideAlleleByte(variants.charAt(varInd)); } /***ZRM 8_27 New code ***/ String[] variantList = variants.split("/"); if(variantList[0].length()>1) { String[] parsedVariantList = new String[variantList.length]; //alt deletion for(int i = 0; i < variantList.length; i++) { //Pull off the first character if it exists if(variantList[i].length()>1) { parsedVariantList[i] = variantList[i].substring(1); if(parsedVariantList[i].length()==0) { parsedVariantList[i] = "-"; } } else { //Mark as deletion parsedVariantList[i] = "-"; } } for(int i = 0; i variantList[0].length()) { isIndel = true; break; } } if(isIndel) { String[] parsedVariantList = new String[variantList.length]; //ref+alt deletion for(int i = 0; i < variantList.length; i++) { //Pull off the first character if it exists if(variantList[i].length()>1) { parsedVariantList[i] = variantList[i].substring(1); if(parsedVariantList[i].length()==0) { parsedVariantList[i] = "-"; } } else { //Mark as deletion parsedVariantList[i] = "-"; } } for(int i = 0; i 1) { apb.allele(WHICH_ALLELE.Alternate, alleles[1]); } for(String annoS: Splitter.on(";").split(input.substring(tabPos[hp.INFO_INDEX-1]+1, tabPos[hp.INFO_INDEX]))) { apb.addAnno(annoS); } blkPosList.add(apb.build()); final int iGT=0; //genotype index int iAD=-1,iDP=-1,iGQ=-1, iPL=-1; //alleleDepth, overall depth, genotypeQuality, phredGenotypeLikelihoods if(hp.FORMAT_INDEX>=0) { //Check to see if FORMAT tag is missing. 
                    //Check to see if FORMAT tag is missing.  Only applicable for single-taxon files.
                    if (tabPos[hp.FORMAT_INDEX] == 0) {
                        throw new IllegalStateException("Error Processing VCF: Missing FORMAT tag.");
                    }
                    String unsplitInput = input.substring(tabPos[hp.FORMAT_INDEX - 1] + 1, tabPos[hp.FORMAT_INDEX]);
                    if (unsplitInput.length() == 0 || !unsplitInput.startsWith("GT")) {
                        //Check to see that it has the GT field
                        if (unsplitInput.contains("GT")) {
                            throw new IllegalStateException("Error Processing VCF Block: GT field is not in first position of FORMAT.");
                        } else {
                            //If GT isn't in it, we assume that FORMAT is missing
                            throw new IllegalStateException("Error Processing VCF Block: Missing FORMAT tag.");
                        }
                    }
                    String[] formatS = unsplitInput.split(":");
                    iAD = firstEqualIndex(formatS, "AD");
                }
                int t = 0;
                for (String taxaAllG : Splitter.on("\t").split(input.substring(tabPos[hp.NUM_HAPMAP_NON_TAXA_HEADERS - 1] + 1))) {
                    int f = 0;
                    //if(taxaAllG.equals(".")) {
                    if (taxaAllG.startsWith(".")) {
                        gTS[t][s] = GenotypeTable.UNKNOWN_DIPLOID_ALLELE;
                        //need to still move up the taxa counter by one as we have covered this taxon now
                        t++;
                        continue;
                    }
                    for (String fieldS : Splitter.on(":").split(taxaAllG)) {
                        if (f == iGT) {
                            //String "[.0-9]\\/[.0-9]||[.0-9]\\|[.0-9]" will match a valid diploid
                            if (!fieldS.equals(".")) {
                                //[TAS-509] Check to make sure we are using diploids in the form 0/1 or 0|0
                                int a1 = 0;
                                int a2 = 0;
                                //Should be this, but for speed we just check the character count
                                //if(fieldS.split("|/").length ==1) {
                                if (fieldS.length() == 1) {
                                    a1 = fieldS.charAt(0) - '0';
                                    a2 = fieldS.charAt(0) - '0';
                                } else {
                                    //zrm22 Jul 31, 2017
                                    //int a1 = fieldS.charAt(0) - '0';
                                    //int a2 = fieldS.charAt(2) - '0';
                                    a1 = fieldS.charAt(0) - '0';
                                    a2 = fieldS.charAt(2) - '0';
                                }
                                if (a1 > alleles.length - 1 || a2 > alleles.length - 1) {
                                    Position pos = blkPosList.get(blkPosList.size() - 1);
                                    throw new IllegalStateException("\nError Processing VCF block: Mismatch of alleles.\n At Chromosome "
                                            + pos.getChromosome().getName() + ", Position " + pos.getPosition()
                                            + ".\nAllele ID larger than number of alleles");
                                }
                                if (a1 < 0 || a2 < 0) {
                                    gTS[t][s] = GenotypeTable.UNKNOWN_DIPLOID_ALLELE;
                                } else {
                                    gTS[t][s] = GenotypeTableUtils.getDiploidValue(alleles[a1], alleles[a2]);
                                }
                            } else {
                                //[TAS-509] if it isn't a diploid, error out early
                                throw new IllegalStateException("Error Processing VCF block: Found haploid information for the element: "
                                        + taxaAllG + ".\nExpected a diploid entry.");
                            }
                        } else if ((f == iAD) && keepDepth) {
                            //System.out.println("GTS: "+gTS[t][s] + " "+fieldS);
                            if (gTS[t][s] != GenotypeTable.UNKNOWN_DIPLOID_ALLELE) {
                                int i = 0;
                                for (String ad : Splitter.on(",").split(fieldS)) {
                                    if (i >= alleles.length) continue;
                                    if (alleles[i] == GenotypeTable.UNKNOWN_ALLELE || ad.equals(".")
                                            || alleles[i] == NucleotideAlignmentConstants.UNDEFINED_ALLELE
                                            || alleles[i] == NucleotideAlignmentConstants.UNDEFINED_DIPLOID_ALLELE) {
                                        //no position for depth of unknown alleles, or depth is set to missing, so skip
                                        //Uncomment when converted
                                        //dTS[t][alleles[i++]][s] = AlleleDepthUtil.depthIntToByte(AlleleDepthUtil.DEPTH_MISSING);
                                        //Comment next two lines when converted
                                        i++;
                                        continue;
                                    }
                                    int adInt = Integer.parseInt(ad);
                                    dTS[t][alleles[i++]][s] = AlleleDepthUtil.depthIntToByte(adInt);
                                }
                            }
                        }
                        f++;
                    }
                    t++;
                }
            } catch (IllegalStateException e) {
                e.printStackTrace();
                throw e;
            } catch (Exception e) {
                System.err.println("Err Site Number:" + s);
                System.err.println("Err Site Number:" + blkPosList.get(blkPosList.size() - 1).toString());
                System.err.println("Err:" + input);
                throw e;
            }
        }
        txtL = null;
        if (hdf5Builder != null) {
            addResultsToHDF5Builder();
            gTS = null;
            dTS = null;
            txtL = null;
            blkPosList.clear();
        }
        //TODO TAS-315 Create memory-efficient VCF to HDF5 insert writing to the Builder directly.
        return this;
    }

    private void addResultsToHDF5Builder() {
        hdf5Builder.addSiteBlock(startSite, PositionListBuilder.getInstance(blkPosList), gTS, dTS);
    }

    int getSiteNumber() {
        return siteN;
    }

    byte[][] getGenoTS() {
        return gTS;
    }

    byte[][][] getDepthTS() {
        return dTS;
    }

    ArrayList<Position> getBlkPosList() {
        return blkPosList;
    }

    private static int firstEqualIndex(String[] sa, String match) {
        for (int i = 0; i < sa.length; i++) {
            if (sa[i].equals(match)) return i;
        }
        return -1;
    }
}
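For reference, a minimal usage sketch of the builder API above.  The file paths and class name here are placeholders, and build() expects the VCF to be ordered by position; buildAndSortInMemory() can be used instead to sort while loading.

import net.maizegenetics.dna.snp.GenotypeTable;
import net.maizegenetics.dna.snp.io.BuilderFromVCF;

public class LoadVCFExample {
    public static void main(String[] args) {
        //In-memory load of a position-sorted VCF, keeping AD allele depths:
        GenotypeTable genos = BuilderFromVCF.getBuilder("mydata.vcf.gz").keepDepth().build();
        System.out.println("Taxa: " + genos.numberOfTaxa() + "  Sites: " + genos.numberOfSites());

        //Streaming conversion for VCF files too large for memory; build() drives the
        //conversion, and the genotypes are written to the HDF5 file rather than returned:
        BuilderFromVCF.getBuilder("mydata.vcf.gz").keepDepth().convertToHDF5("mydata.h5").build();
    }
}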



