net.maizegenetics.dna.snp.io.BuilderFromVCF Maven / Gradle / Ivy
TASSEL is a software package to evaluate trait associations, evolutionary patterns, and linkage
disequilibrium.
package net.maizegenetics.dna.snp.io;
import com.google.common.base.Splitter;
import com.google.common.collect.SetMultimap;
import net.maizegenetics.dna.WHICH_ALLELE;
import net.maizegenetics.dna.map.*;
import net.maizegenetics.dna.snp.GenotypeTable;
import net.maizegenetics.dna.snp.GenotypeTableBuilder;
import net.maizegenetics.dna.snp.GenotypeTableUtils;
import net.maizegenetics.dna.snp.NucleotideAlignmentConstants;
import net.maizegenetics.dna.snp.score.AlleleDepthBuilder;
import net.maizegenetics.dna.snp.score.AlleleDepthUtil;
import net.maizegenetics.dna.snp.genotypecall.GenotypeCallTable;
import net.maizegenetics.dna.snp.genotypecall.GenotypeCallTableBuilder;
import net.maizegenetics.taxa.TaxaList;
import net.maizegenetics.taxa.TaxaListBuilder;
import net.maizegenetics.taxa.TaxaListIOUtils;
import net.maizegenetics.taxa.Taxon;
import net.maizegenetics.util.ProgressListener;
import net.maizegenetics.util.Tassel5HDF5Constants;
import net.maizegenetics.util.Utils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.List;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.TreeMap;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.regex.Pattern;
/**
* Create an alignment based on a VCF format file (either .txt or compressed). Alleles are set as the global
* reference.
* e.g.
* {@code
* GenotypeTable a=BuilderFromVCF.getBuilder(infileName).build();
* }
*
* TODO: Add filtering while reading, provide an option to define the alleles as reference and alternate
*
* @author Ed Buckler
*/
public class BuilderFromVCF {
private static final Logger myLogger=LogManager.getLogger(BuilderFromVCF.class);
private static final Pattern WHITESPACE_PATTERN=Pattern.compile("[\\s]+");
private static final Pattern TAB_PATTERN = Pattern.compile("[\\t]+");
private HeaderPositions hp=null;
private final String infile;
private boolean includeDepth=false;
private boolean inMemory=true;
private String hdf5Outfile=null;
private GenotypeTableBuilder hdf5GenoTableBuilder=null;
private final ProgressListener myProgressListener;
private BuilderFromVCF(String infile,ProgressListener listener) {
this.infile=infile;
this.myProgressListener = listener;
}
/**
* Create a builder for loading a VCF file into memory
* @param infile name of the VCF file to be loaded
* @return a builder
*/
public static BuilderFromVCF getBuilder(String infile) {
return new BuilderFromVCF(infile,null);
}
/**
* Create a builder for loading a VCF file into memory
* @param infile name of the VCF file to be loaded
* @param listener Progress listener for GUI
* @return a builder
*/
public static BuilderFromVCF getBuilder(String infile, ProgressListener listener) {
return new BuilderFromVCF(infile,listener);
}
/**
* Create an HDF5 version of the genotypes from the VCF file. This is useful when the VCF file is
* too large to load fully into memory.
* @param hdf5Outfile name of the HDF5 file to write
* @return this builder
*/
public BuilderFromVCF convertToHDF5(String hdf5Outfile) {
inMemory=false;
this.hdf5Outfile=hdf5Outfile;
//int numberOfSites=Utils.getNumberLinesNotHashOrBlank(hdf5Outfile);
//hdf5GenoTableBuilder=new GenotypeTableBuilder(hdf5Outfile,numberOfSites);
return this;
}
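    /* Illustrative usage sketch (file names hypothetical): stream a VCF that is too
     * large for memory straight into a Tassel 5 HDF5 genotype file.  In this mode
     * build() writes through GenotypeTableBuilder.getSiteIncremental() and returns null.
     *
     *   BuilderFromVCF.getBuilder("large.vcf.gz")
     *           .convertToHDF5("large.t5.h5")
     *           .keepDepth()
     *           .build();
     */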
public BuilderFromVCF keepDepth() {
includeDepth=true;
return this;
}
public GenotypeTable buildAndSortInMemory() {
return buildEngine(true);
}
public GenotypeTable build() {
return buildEngine(false);
}
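    /* Illustrative usage sketch (file names hypothetical): build() expects the VCF to be
     * ordered by position and fails validation otherwise; buildAndSortInMemory() sorts
     * the positions first, so it is the safer choice for files of unknown ordering.
     *
     *   GenotypeTable ordered=BuilderFromVCF.getBuilder("ordered.vcf").build();
     *   GenotypeTable sorted=BuilderFromVCF.getBuilder("unordered.vcf").buildAndSortInMemory();
     */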
//TODO provide options on caching to use, read only some sites, etc.
private GenotypeTable buildEngine(boolean fullSort) {
long time=System.nanoTime();
GenotypeTable result=null;
int totalSites=-1;//unknown
GenotypeTableBuilder gtbDiskBuild=null;
ExecutorService pool = null;
try {
int numThreads=Runtime.getRuntime().availableProcessors();
pool=Executors.newFixedThreadPool(numThreads);
BufferedReader r=Utils.getBufferedReader(infile, -1);
//Read the ## annotation rows
String currLine;
            Map<String,String> infoMap=new HashMap<>();
            Map<String,String> formatMap=new HashMap<>();
            Map<String,SetMultimap<String,String>> sampAnnoBuild=new TreeMap<>();
currLine=parseVCFHeadersIntoMaps(infoMap,formatMap,sampAnnoBuild,r);
TaxaList taxaList=processTaxa(currLine,sampAnnoBuild);
if(inMemory==false) {
totalSites=Utils.getNumberLinesNotHashOrBlank(infile);
System.out.println("TotalSite count:"+totalSites);
gtbDiskBuild=GenotypeTableBuilder.getSiteIncremental(taxaList,totalSites,hdf5Outfile);
}
            int linesAtTime=(inMemory)?1<<12:Tassel5HDF5Constants.BLOCK_SIZE; //this is a critical setting, with 20% or more performance swings; needs to be optimized with transposing
            // int linesAtTime=1<<8; //better with lots of taxa
            ArrayList<String> txtLines=new ArrayList<>(linesAtTime);
            ArrayList<ProcessVCFBlock> pbs=new ArrayList<>();
            List<Future<ProcessVCFBlock>> futures = new ArrayList<>();
int sitesRead=0;
while ((currLine=r.readLine())!=null) {
if(currLine.startsWith("#")) continue;
txtLines.add(currLine);
sitesRead++;
if (sitesRead%linesAtTime==0) {
ProcessVCFBlock pb;
if(inMemory) {
pb=ProcessVCFBlock.getInstance(taxaList.numberOfTaxa(), hp, txtLines,includeDepth);}
else{
pb=ProcessVCFBlock.getInstance(taxaList.numberOfTaxa(), hp, txtLines, sitesRead-txtLines.size(),gtbDiskBuild, includeDepth);
}
//pbs.add(pb);
// pb.run(); //used for testing
try {
if(inMemory) {
futures.add(pool.submit(pb));
}
else {
//Put a future on the queue
futures.add(pool.submit(pb));
                        //If we are streaming to HDF5, we need to block temporarily and clean out the queue.
if(futures.size()>=numThreads) {
//if(futures.size()>0) {
                            for(Future<ProcessVCFBlock> future : futures) {
pbs.add(future.get());
}
futures = new ArrayList<>();
}
}
}
catch(Exception e) {
myLogger.debug(e.getMessage(), e);
throw new IllegalStateException(e.getMessage());
}
txtLines=new ArrayList<>(linesAtTime);
}
}
r.close();
//Handle whatever is left over in the file
if (txtLines.size()>0) {
ProcessVCFBlock pb;//=ProcessVCFBlock.getInstance(taxaList.numberOfTaxa(), hp, txtLines);
if(inMemory) {
pb=ProcessVCFBlock.getInstance(taxaList.numberOfTaxa(), hp, txtLines, includeDepth);}
else{
pb=ProcessVCFBlock.getInstance(taxaList.numberOfTaxa(), hp, txtLines, sitesRead-txtLines.size(),gtbDiskBuild, includeDepth);
}
//pbs.add(pb);
// pb.run(); //used for testing
try {
futures.add(pool.submit(pb));
}
catch(Exception e) {
myLogger.debug(e.getMessage(), e);
throw new IllegalStateException(e.getMessage());
}
}
int numFutures = futures.size();
int count = 0;
            for(Future<ProcessVCFBlock> future : futures) {
try {
pbs.add(future.get());
}
catch(Exception e) {
myLogger.debug(e.getMessage(),e);
throw new IllegalStateException(e.getMessage());
}
if(myProgressListener != null) {
count++;
myProgressListener.progress(count * 100 / numFutures,null);
}
}
pool.shutdown();
if(inMemory) {
//result=completeInMemoryBuilding(futures, taxaList, sitesRead, includeDepth, fullSort);
result=completeInMemoryBuilding(pbs, taxaList, sitesRead, includeDepth, fullSort);
} else {
gtbDiskBuild.build();
}
        } catch (Exception e) {
            myLogger.debug(e.getMessage(), e);
            throw new IllegalStateException("Error reading VCF file: " + infile + ": " + e.getMessage());
        } finally {
            if (pool != null) {
                pool.shutdown();
            }
        }
        long totalTime = System.nanoTime() - time;
        System.out.printf("BuilderFromVCF data timing %gs %n", totalTime / 1e9);
        return result;
    }

    private static GenotypeTable completeInMemoryBuilding(List<ProcessVCFBlock> pbs, TaxaList taxaList, int numberOfSites, boolean includeDepth, boolean fullSort) {
int currentSite=0;
PositionListBuilder posBuild=new PositionListBuilder();
GenotypeCallTableBuilder gb=GenotypeCallTableBuilder.getUnphasedNucleotideGenotypeBuilder(taxaList.numberOfTaxa(), numberOfSites);
AlleleDepthBuilder db=null;
if(includeDepth) db=AlleleDepthBuilder.getInstance(taxaList.numberOfTaxa(),numberOfSites,taxaList);
for (ProcessVCFBlock pb: pbs) {
posBuild.addAll(pb.getBlkPosList());
byte[][] bgTS=pb.getGenoTS();
for (int t=0; t infoMap, Map formatMap,
Map> sampAnnoBuild, BufferedReader r) throws IOException {
String currLine;
while (((currLine=r.readLine())!=null)&&(currLine.startsWith("##"))) {
String[] cat=currLine.split("=",2);
if(cat.length<2) continue;
switch (cat[0]) {
// case "##INFO":
// infoMap.put(mapOfAnno.get("ID"), mapOfAnno.get("Description"));
// break;
// case "##FILTER":break;
// case "##FORMAT":
// formatMap.put(mapOfAnno.get("ID"),mapOfAnno.get("Description"));
// break;
case "##SAMPLE":
SetMultimap mapOfAnno=TaxaListIOUtils.parseVCFHeadersIntoMap(cat[1]);
String taxaID=mapOfAnno.get("ID").iterator().next();
if(taxaID==null) break;
sampAnnoBuild.put(taxaID,mapOfAnno);
break;
case "##PEDIGREE":break;
default : break;
}
// System.out.println(currLine);
}
return currLine;
}
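    /* Illustrative example of the one header type parsed above (sample name hypothetical):
     *   ##SAMPLE=<ID=B73,Description="reference inbred">
     * is split once on '=' and handed to TaxaListIOUtils.parseVCFHeadersIntoMap(), so
     * sampAnnoBuild maps "B73" to {ID=B73, Description="reference inbred"}; everything
     * except ID is later attached to the matching Taxon in processTaxa().
     */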
private static String getReplaceCommaWithinQuote(String s) {
StringBuilder sb =new StringBuilder(s);
boolean inQuote=false;
        for (int i=0; i<sb.length(); i++) {
            if (sb.charAt(i)=='\"') inQuote=!inQuote;
            //assumption: in-quote commas are swapped for semicolons so later comma splits are safe
            if (inQuote && sb.charAt(i)==',') sb.setCharAt(i, ';');
        }
        return sb.toString();
    }

    private TaxaList processTaxa(String readLn, Map<String,SetMultimap<String,String>> taxaAnnotation) {
String[] header = TAB_PATTERN.split(readLn);
hp=new HeaderPositions(header);
int numTaxa=header.length-hp.NUM_HAPMAP_NON_TAXA_HEADERS;
TaxaListBuilder tlb=new TaxaListBuilder();
        for (int i=0; i<numTaxa; i++) {
            String taxonID=header[i+hp.NUM_HAPMAP_NON_TAXA_HEADERS];
            Taxon.Builder at=new Taxon.Builder(taxonID);
            SetMultimap<String,String> taMap=taxaAnnotation.get(taxonID);
if(taMap!=null) {
                for (Map.Entry<String,String> en : taMap.entries()) {
if(en.getKey().equals("ID")) continue; //skip the IDs as these became the name
String s=en.getValue().replace("\"","");
at.addAnno(en.getKey(),s);
}
}
tlb.add(at.build());
}
return tlb.build();
}
}
class HeaderPositions {
final int NUM_HAPMAP_NON_TAXA_HEADERS;
final int GENOIDX;
final int SNPID_INDEX;
// final int VARIANT_INDEX;
final int FILTER_INDEX;
final int QUAL_INDEX;
final int CHROMOSOME_INDEX;
final int POSITION_INDEX;
final int REF_INDEX;
final int ALT_INDEX;
final int INFO_INDEX;
final int FORMAT_INDEX;
public HeaderPositions(String[] header){
int chrIdx=firstEqualIndex(header,"#CHROM");
if(chrIdx<0) chrIdx=firstEqualIndex(header,"#CHR");
CHROMOSOME_INDEX=chrIdx;
POSITION_INDEX=firstEqualIndex(header,"POS");
SNPID_INDEX=firstEqualIndex(header,"ID");
REF_INDEX=firstEqualIndex(header,"REF");
ALT_INDEX=firstEqualIndex(header,"ALT");
QUAL_INDEX=firstEqualIndex(header,"QUAL");
FILTER_INDEX=firstEqualIndex(header,"FILTER");
INFO_INDEX=firstEqualIndex(header,"INFO");
FORMAT_INDEX=firstEqualIndex(header,"FORMAT");
NUM_HAPMAP_NON_TAXA_HEADERS=Math.max(INFO_INDEX,FORMAT_INDEX)+1;
GENOIDX=NUM_HAPMAP_NON_TAXA_HEADERS;
}
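    /* Illustrative example: for the standard VCF header line
     *   #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT taxon1 taxon2 ...
     * FORMAT_INDEX is 8, so NUM_HAPMAP_NON_TAXA_HEADERS is 9 and the taxa genotype
     * columns start at GENOIDX 9.  For a sites-only VCF with no FORMAT column, the
     * count falls back to INFO_INDEX+1.
     */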
private static int firstEqualIndex(String[] sa, String match) {
        for (int i=0; i<sa.length; i++) {
            if (sa[i].equals(match)) return i;
        }
        return -1;
    }
}

class ProcessVCFBlock implements Callable<ProcessVCFBlock> {
private static final Pattern WHITESPACE_PATTERN=Pattern.compile("\\s");
private static final Pattern SLASH_PATTERN=Pattern.compile("/");
private final HeaderPositions hp;
private final int taxaN;
private final int siteN;
    private final int startSite; //Integer.MIN_VALUE if unknown
    private final GenotypeTableBuilder hdf5Builder; //null if building in memory
    private ArrayList<String> txtL;
private byte[][] gTS; //genotypes
private byte[][][] dTS; //depth
    private final ArrayList<Position> blkPosList;
private final boolean keepDepth;
    private ProcessVCFBlock(int taxaN, HeaderPositions hp, ArrayList<String> txtL, int startSite,
GenotypeTableBuilder hdf5Builder, boolean keepDepth) {
this.taxaN=taxaN;
this.siteN=txtL.size();
this.txtL=txtL;
this.hp=hp;
blkPosList=new ArrayList<>(siteN);
this.startSite=startSite;
this.hdf5Builder=hdf5Builder;
this.keepDepth=keepDepth;
}
    /*Used to process VCF blocks and return the result for an in-memory GenotypeTable*/
    static ProcessVCFBlock getInstance(int taxaN, HeaderPositions hp, ArrayList<String> txtL, boolean keepDepth) {
return new ProcessVCFBlock(taxaN, hp, txtL, Integer.MIN_VALUE, null,keepDepth);
}
    /*Used to process VCF blocks and return the result for an on-disk HDF5 GenotypeTable*/
    static ProcessVCFBlock getInstance(int taxaN, HeaderPositions hp, ArrayList<String> txtL, int startSite, GenotypeTableBuilder hdf5Builder, boolean keepDepth) {
return new ProcessVCFBlock(taxaN, hp, txtL, startSite, hdf5Builder,keepDepth);
}
@Override
public ProcessVCFBlock call() throws Exception{
        Map<String,Chromosome> chromosomeLookup=new HashMap<>();
gTS=new byte[taxaN][siteN];
if(keepDepth==true) dTS=new byte[taxaN][6][siteN];
        for (int s=0; s<siteN; s++) {
            String input=txtL.get(s);
            try {
                //Find the tab offsets of the fixed VCF columns and the taxa columns
                int[] tabPos=new int[hp.NUM_HAPMAP_NON_TAXA_HEADERS+taxaN];
                int tabIndex=0;
                int len=input.length();
                for (int i=0; (tabIndex<hp.NUM_HAPMAP_NON_TAXA_HEADERS+taxaN)&&(i<len); i++) {
                    if (input.charAt(i)=='\t') tabPos[tabIndex++]=i;
                }
                String chrName=input.substring(0, tabPos[hp.CHROMOSOME_INDEX]);
                Chromosome currChr=chromosomeLookup.get(chrName);
                if (currChr==null) {
                    currChr=new Chromosome(new String(chrName));
                    chromosomeLookup.put(chrName, currChr);
                }
                String snpID=null;
                if(hp.SNPID_INDEX>0) snpID=input.substring(tabPos[hp.SNPID_INDEX-1]+1, tabPos[hp.SNPID_INDEX]);
String refS=input.substring(tabPos[hp.REF_INDEX-1]+1, tabPos[hp.REF_INDEX]);
String alt=input.substring(tabPos[hp.ALT_INDEX-1]+1, tabPos[hp.ALT_INDEX]);
String variants;
if(alt.equals(".")) {variants=refS;}
else {variants=(refS+"/"+alt).replace(',','/')
.replace("", "+").replace('I', '+')
.replace("", "-").replace('D', '-')
.replace("*", "N");}
//GeneralPosition.Builder apb=new GeneralPosition.Builder(currChr, currentPosition)
// .knownVariants(variants); //TODO strand, variants,
//ZRM 8_26
GeneralPosition.Builder apb=new GeneralPosition.Builder(currChr, Integer.parseInt(input.substring(tabPos[hp.POSITION_INDEX-1]+1, tabPos[hp.POSITION_INDEX])))
.knownVariants(variants) //TODO strand, variants,
;
if(snpID!=null && !snpID.equals(".")) {
apb.snpName(snpID);
}
//byte[] alleles=new byte[(variants.length()+1)/2];
byte[] alleles = new byte[variants.split("/").length];
for (int i = 0, varInd=0; i < alleles.length; i++, varInd+=2) {
alleles[i]=NucleotideAlignmentConstants.getNucleotideAlleleByte(variants.charAt(varInd));
}
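                //Illustrative example: variants "A/C/T" fills alleles[0..2] with the byte
                //codes for A, C and T; the allele numbers in the GT field (0,1,2,...)
                //index this array when genotypes are decoded below.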
/***ZRM 8_27 New code ***/
String[] variantList = variants.split("/");
if(variantList[0].length()>1) {
String[] parsedVariantList = new String[variantList.length];
//alt deletion
for(int i = 0; i < variantList.length; i++) {
//Pull off the first character if it exists
if(variantList[i].length()>1) {
parsedVariantList[i] = variantList[i].substring(1);
if(parsedVariantList[i].length()==0) {
parsedVariantList[i] = "-";
}
}
else {
//Mark as deletion
parsedVariantList[i] = "-";
}
}
                    //Recompute the allele byte codes from the anchor-trimmed variants
                    for(int i = 0; i < parsedVariantList.length; i++) {
                        alleles[i] = NucleotideAlignmentConstants.getNucleotideAlleleByte(parsedVariantList[i].charAt(0));
                    }
                }
                else {
                    //If any alt allele is longer than the ref, the site is an insertion-type indel
                    boolean isIndel = false;
                    for(int i = 1; i < variantList.length; i++) {
                        if(variantList[i].length() > variantList[0].length()) {
isIndel = true;
break;
}
}
if(isIndel) {
String[] parsedVariantList = new String[variantList.length];
//ref+alt deletion
for(int i = 0; i < variantList.length; i++) {
//Pull off the first character if it exists
if(variantList[i].length()>1) {
parsedVariantList[i] = variantList[i].substring(1);
if(parsedVariantList[i].length()==0) {
parsedVariantList[i] = "-";
}
}
else {
//Mark as deletion
parsedVariantList[i] = "-";
}
}
                        //Recompute the allele byte codes from the anchor-trimmed variants
                        for(int i = 0; i < parsedVariantList.length; i++) {
                            alleles[i] = NucleotideAlignmentConstants.getNucleotideAlleleByte(parsedVariantList[i].charAt(0));
                        }
                    }
                }
                apb.allele(WHICH_ALLELE.Reference, alleles[0]);
                if(alleles.length > 1) {
apb.allele(WHICH_ALLELE.Alternate, alleles[1]);
}
for(String annoS: Splitter.on(";").split(input.substring(tabPos[hp.INFO_INDEX-1]+1, tabPos[hp.INFO_INDEX]))) {
apb.addAnno(annoS);
}
blkPosList.add(apb.build());
final int iGT=0; //genotype index
int iAD=-1,iDP=-1,iGQ=-1, iPL=-1; //alleleDepth, overall depth, genotypeQuality, phredGenotypeLikelihoods
if(hp.FORMAT_INDEX>=0) {
//Check to see if FORMAT tag is missing. Only applicable for single taxa files
if(tabPos[hp.FORMAT_INDEX]==0) {
throw new IllegalStateException("Error Processing VCF: Missing FORMAT tag.");
}
String unsplitInput = input.substring(tabPos[hp.FORMAT_INDEX-1]+1, tabPos[hp.FORMAT_INDEX]);
if(unsplitInput.length()==0|| !unsplitInput.startsWith("GT")) {
//Check to see it has the GT field
if(unsplitInput.contains("GT")) {
throw new IllegalStateException("Error Processing VCF Block: GT field is not in first position of FORMAT.");
}
                    //If GT isn't present at all, we assume FORMAT itself is missing
else {
throw new IllegalStateException("Error Processing VCF Block: Missing FORMAT tag.");
}
}
String[] formatS = unsplitInput.split(":");
iAD=firstEqualIndex(formatS,"AD");
}
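            //Illustrative example: FORMAT "GT:AD:DP" passes the checks above (GT first)
            //and sets iAD=1, so the second colon-delimited field of each taxon entry is
            //read as allele depths when keepDepth is on.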
int t=0;
for(String taxaAllG: Splitter.on("\t").split(input.substring(tabPos[hp.NUM_HAPMAP_NON_TAXA_HEADERS-1]+1))) {
int f=0;
//if(taxaAllG.equals(".")) {
if(taxaAllG.startsWith(".")) {
gTS[t][s] = GenotypeTable.UNKNOWN_DIPLOID_ALLELE;
                    //still need to advance the taxa counter by one, as this taxon is now handled
t++;
continue;
}
for(String fieldS: Splitter.on(":").split(taxaAllG)) {
if(f==iGT) {
//String "[.0-9]\\/[.0-9]||[.0-9]\\|[.0-9]" will match a valid diploid
if (!fieldS.equals(".")) { //[TAS-509] Check to make sure we are using diploids in the form 0/1 or 0|0
int a1 = 0;
int a2 = 0;
//Should be this, but for speed we just check the character
//if(fieldS.split("|/").length ==1) {
if(fieldS.length() ==1) {
a1 = fieldS.charAt(0) - '0';
a2 = fieldS.charAt(0) - '0';
}
else {
//zrm22 Jul 31, 2017
//int a1 = fieldS.charAt(0) - '0';
//int a2 = fieldS.charAt(2) - '0';
a1 = fieldS.charAt(0) - '0';
a2 = fieldS.charAt(2) - '0';
}
if(a1>alleles.length-1 || a2>alleles.length-1) {
Position pos = blkPosList.get(blkPosList.size()-1);
throw new IllegalStateException("\nError Processing VCF block: Mismatch of alleles.\n At Chromosome "+ pos.getChromosome().getName() + ", Position "+pos.getPosition() +".\nAllele ID larger than number of alleles" );
}
if (a1 < 0 || a2 < 0) {
gTS[t][s] = GenotypeTable.UNKNOWN_DIPLOID_ALLELE;
}
else {
gTS[t][s] =
GenotypeTableUtils.getDiploidValue(alleles[a1], alleles[a2]);
}
}
                        else { //[TAS-509] if it isn't a diploid, error out early
throw new IllegalStateException("Error Processing VCF block: Found haploid information for the element: "
+ taxaAllG + ".\nExpected a diploid entry.");
}
} else if((f==iAD)&&keepDepth) {
// System.out.println("GTS: "+gTS[t][s] + " "+fieldS);
if(gTS[t][s]!=GenotypeTable.UNKNOWN_DIPLOID_ALLELE) {
int i=0;
for(String ad: Splitter.on(",").split(fieldS)){
if(i>=alleles.length) continue;
if(alleles[i]==GenotypeTable.UNKNOWN_ALLELE || ad.equals(".") || alleles[i]==NucleotideAlignmentConstants.UNDEFINED_ALLELE ||
alleles[i]==NucleotideAlignmentConstants.UNDEFINED_DIPLOID_ALLELE) { //no position for depth of unknown alleles or depth is set to missing, so skip
//Uncomment when converted
//dTS[t][alleles[i++]][s] = AlleleDepthUtil.depthIntToByte(AlleleDepthUtil.DEPTH_MISSING);
//Comment next two lines when converted
i++;
continue;
}
int adInt=Integer.parseInt(ad);
dTS[t][alleles[i++]][s]=AlleleDepthUtil.depthIntToByte(adInt);
}
}
}
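                        //Illustrative example: with alleles {A,C} and an AD field of "7,2",
                        //depth 7 is stored for A and 2 for C via AlleleDepthUtil.depthIntToByte();
                        //a "." depth entry is skipped as missing.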
f++;
}
t++;
}
} catch(IllegalStateException e) {
e.printStackTrace();
throw e;
}
catch(Exception e) {
System.err.println("Err Site Number:"+s);
System.err.println("Err Site Number:"+blkPosList.get(blkPosList.size()-1).toString());
System.err.println("Err:"+input);
throw e;
}
}
txtL=null;
if(hdf5Builder!=null) {
addResultsToHDF5Builder();
gTS=null;
dTS=null;
txtL=null;
blkPosList.clear();
}
        //TODO TAS-315 Create memory-efficient VCF to HDF5 insert, writing either to the Builder or directly.
return this;
}
private void addResultsToHDF5Builder() {
hdf5Builder.addSiteBlock(startSite, PositionListBuilder.getInstance(blkPosList), gTS, dTS);
}
int getSiteNumber() {
return siteN;
}
byte[][] getGenoTS() {
return gTS;
}
byte[][][] getDepthTS() {
return dTS;
}
    ArrayList<Position> getBlkPosList() {
return blkPosList;
}
private static int firstEqualIndex(String[] sa, String match) {
        for (int i=0; i<sa.length; i++) {
            if (sa[i].equals(match)) return i;
        }
        return -1;
    }
}