// Source artifact: net.maizegenetics.dna.snp.io.BuilderFromVCF (retrieved from a Maven repository listing).

package net.maizegenetics.dna.snp.io;

import com.google.common.base.Splitter;
import com.google.common.collect.SetMultimap;
import net.maizegenetics.dna.WHICH_ALLELE;
import net.maizegenetics.dna.map.Chromosome;
import net.maizegenetics.dna.map.GeneralPosition;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.dna.map.PositionListBuilder;
import net.maizegenetics.dna.snp.GenotypeTable;
import net.maizegenetics.dna.snp.GenotypeTableBuilder;
import net.maizegenetics.dna.snp.GenotypeTableUtils;
import net.maizegenetics.dna.snp.NucleotideAlignmentConstants;
import net.maizegenetics.dna.snp.genotypecall.GenotypeCallTable;
import net.maizegenetics.dna.snp.genotypecall.GenotypeCallTableBuilder;
import net.maizegenetics.dna.snp.score.AlleleDepthBuilder;
import net.maizegenetics.dna.snp.score.AlleleDepthUtil;
import net.maizegenetics.taxa.TaxaList;
import net.maizegenetics.taxa.TaxaListBuilder;
import net.maizegenetics.taxa.TaxaListIOUtils;
import net.maizegenetics.taxa.Taxon;
import net.maizegenetics.util.ProgressListener;
import net.maizegenetics.util.Utils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.io.BufferedReader;
import java.io.IOException;
import java.util.*;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.regex.Pattern;

/**
 * Create an alignment based on a VCF format file (either .txt or compressed). Alleles are set as
 * global reference.
 * <p>
 * e.g. {@code GenotypeTable a = BuilderFromVCF.getBuilder(infileName).build();}
 * <p>
 * TODO: Add filtering while reading; provide an option to define the alleles as reference and
 * alternate.
 *
 * @author Ed Buckler
 */
public class BuilderFromVCF {

    private static final Logger myLogger = LogManager.getLogger(BuilderFromVCF.class);
    private static final Pattern WHITESPACE_PATTERN = Pattern.compile("[\\s]+");
    private static final Pattern TAB_PATTERN = Pattern.compile("[\\t]+");
    // Column layout parsed from the #CHROM header row; populated by processTaxa().
    private HeaderPositions hp = null;
    private final String infile;
    // When true, per-allele read depths (FORMAT "AD" field) are retained in the result.
    private boolean includeDepth = false;
    private final ProgressListener myProgressListener;

    private BuilderFromVCF(String infile, ProgressListener listener) {
        this.infile = infile;
        this.myProgressListener = listener;
    }

    /**
     * Create a builder for loading a VCF file into memory.
     *
     * @param infile name of the VCF file to be loaded
     * @return a builder
     */
    public static BuilderFromVCF getBuilder(String infile) {
        return new BuilderFromVCF(infile, null);
    }

    /**
     * Create a builder for loading a VCF file into memory.
     *
     * @param infile   name of the VCF file to be loaded
     * @param listener progress listener for GUI feedback (may be null)
     * @return a builder
     */
    public static BuilderFromVCF getBuilder(String infile, ProgressListener listener) {
        return new BuilderFromVCF(infile, listener);
    }

    /**
     * Request that per-allele depths (FORMAT "AD") be kept in the built table.
     *
     * @return this builder, for chaining
     */
    public BuilderFromVCF keepDepth() {
        includeDepth = true;
        return this;
    }

    /**
     * Build the table, sorting positions in memory if the file is not already position-ordered.
     */
    public GenotypeTable buildAndSortInMemory() {
        return buildEngine(true);
    }

    /**
     * Build the table; throws IllegalStateException if the VCF is not ordered by position.
     */
    public GenotypeTable build() {
        return buildEngine(false);
    }

    //TODO provide options on caching to use, read only some sites, etc.
// ===== BuilderFromVCF (continued): engine, helpers, and worker classes =====

    /**
     * Read the whole VCF, parse blocks of lines in parallel, and assemble the GenotypeTable.
     *
     * @param fullSort if true, sort out-of-order positions in memory; if false, throw when the
     *                 file is not position-ordered
     * @return the assembled GenotypeTable
     * @throws IllegalStateException on parse errors, ordering errors, or worker failure
     */
    private GenotypeTable buildEngine(boolean fullSort) {
        long time = System.nanoTime();
        GenotypeTable result = null;
        ExecutorService pool = null;
        try {
            int numThreads = Runtime.getRuntime().availableProcessors();
            pool = Executors.newFixedThreadPool(numThreads);
            //try-with-resources so the reader is closed even on a parse failure
            try (BufferedReader r = Utils.getBufferedReader(infile, -1)) {
                //Read the ## annotation rows
                String currLine;
                Map<String, String> infoMap = new HashMap<>();
                Map<String, String> formatMap = new HashMap<>();
                Map<String, SetMultimap<String, String>> sampAnnoBuild = new TreeMap<>();
                currLine = parseVCFHeadersIntoMaps(infoMap, formatMap, sampAnnoBuild, r);
                TaxaList taxaList = processTaxa(currLine, sampAnnoBuild);
                int linesAtTime = 1 << 12;
                // int linesAtTime=1<<8; //better with lots of taxa
                ArrayList<String> txtLines = new ArrayList<>(linesAtTime);
                ArrayList<ProcessVCFBlock> pbs = new ArrayList<>();
                List<Future<ProcessVCFBlock>> futures = new ArrayList<>();
                int sitesRead = 0;
                while ((currLine = r.readLine()) != null) {
                    if (currLine.startsWith("#")) {
                        continue;
                    }
                    txtLines.add(currLine);
                    sitesRead++;
                    if (sitesRead % linesAtTime == 0) {
                        //Hand a full block of lines off to a worker thread
                        ProcessVCFBlock pb = ProcessVCFBlock.getInstance(taxaList.numberOfTaxa(), hp, txtLines, includeDepth);
                        try {
                            futures.add(pool.submit(pb));
                        } catch (Exception e) {
                            myLogger.debug(e.getMessage(), e);
                            throw new IllegalStateException(e.getMessage());
                        }
                        txtLines = new ArrayList<>(linesAtTime);
                    }
                }
                //Handle whatever is left over in the file
                if (txtLines.size() > 0) {
                    ProcessVCFBlock pb = ProcessVCFBlock.getInstance(taxaList.numberOfTaxa(), hp, txtLines, includeDepth);
                    try {
                        futures.add(pool.submit(pb));
                    } catch (Exception e) {
                        myLogger.debug(e.getMessage(), e);
                        throw new IllegalStateException(e.getMessage());
                    }
                }
                int numFutures = futures.size();
                int count = 0;
                for (Future<ProcessVCFBlock> future : futures) {
                    try {
                        pbs.add(future.get());
                    } catch (Exception e) {
                        myLogger.debug(e.getMessage(), e);
                        throw new IllegalStateException(e.getMessage());
                    }
                    if (myProgressListener != null) {
                        count++;
                        myProgressListener.progress(count * 100 / numFutures, null);
                    }
                }
                result = completeInMemoryBuilding(pbs, taxaList, sitesRead, includeDepth, fullSort);
            }
        } catch (IOException e) {
            //NOTE(review): this tail was reconstructed — the scraped source had the original
            //catch/return swallowed by HTML stripping; confirm against upstream TASSEL.
            myLogger.debug(e.getMessage(), e);
            throw new IllegalStateException("BuilderFromVCF: Error reading VCF file: " + e.getMessage());
        } finally {
            //Always release worker threads, even when a parse error aborts the build
            if (pool != null) {
                pool.shutdown();
            }
        }
        myLogger.info("BuilderFromVCF: built " + infile + " in " + ((System.nanoTime() - time) / 1e9) + " s");
        return result;
    }

    /**
     * Stitch the per-block parse results into a single in-memory GenotypeTable.
     *
     * @param pbs           completed per-block workers, in file order
     * @param taxaList      taxa parsed from the #CHROM header
     * @param numberOfSites total sites read from the file
     * @param includeDepth  whether allele depths were retained
     * @param fullSort      sort out-of-order positions instead of throwing
     */
    private static GenotypeTable completeInMemoryBuilding(List<ProcessVCFBlock> pbs, TaxaList taxaList,
            int numberOfSites, boolean includeDepth, boolean fullSort) {
        int currentSite = 0;
        PositionListBuilder posBuild = new PositionListBuilder();
        GenotypeCallTableBuilder gb =
                GenotypeCallTableBuilder.getUnphasedNucleotideGenotypeBuilder(taxaList.numberOfTaxa(), numberOfSites);
        AlleleDepthBuilder db = null;
        if (includeDepth) {
            db = AlleleDepthBuilder.getInstance(taxaList.numberOfTaxa(), numberOfSites, taxaList);
        }
        for (ProcessVCFBlock pb : pbs) {
            posBuild.addAll(pb.getBlkPosList());
            byte[][] bgTS = pb.getGenoTS();
            for (int t = 0; t < bgTS.length; t++) {
                gb.setBaseRangeForTaxon(t, currentSite, bgTS[t]);
            }
            if (includeDepth) {
                byte[][][] bdTS = pb.getDepthTS();
                for (int t = 0; t < bgTS.length; t++) {
                    db.setDepthRangeForTaxon(t, currentSite, bdTS[t]);
                }
            }
            currentSite += pb.getSiteNumber();
        }
        //Check that result is in correct order. If not, either try to sort or just throw an error
        //(determined by what was passed to fullSort)
        if (posBuild.validateOrdering() == false) {
            if (fullSort) {
                int[] siteRedirect = posBuild.sort();
                gb.reorderPositions(siteRedirect);
                if (includeDepth) {
                    db.reorderPositions(siteRedirect);
                }
                if (posBuild.validateOrdering() == false) {
                    //Double-check post-sort ordering. Should never happen, but just to be safe
                    throw new IllegalStateException("BuilderFromVCF: Ordering of VCF file held in memory failed.");
                }
            } else {
                throw new IllegalStateException("BuilderFromVCF: Ordering incorrect. VCF file must be ordered by position. Please first use SortGenotypeFilePlugin to correctly order the file.");
            }
        }
        GenotypeCallTable g = gb.build();
        if (includeDepth) {
            return GenotypeTableBuilder.getInstance(g, posBuild.build(), taxaList, db.build());
        } else {
            return GenotypeTableBuilder.getInstance(g, posBuild.build(), taxaList);
        }
    }

    /**
     * Consume the leading "##" metadata rows. Currently only ##SAMPLE annotations are captured
     * (into sampAnnoBuild, keyed by sample ID); INFO/FORMAT/PEDIGREE rows are skipped.
     *
     * @return the first non-"##" line (normally the #CHROM header), or null at EOF
     */
    private static String parseVCFHeadersIntoMaps(Map<String, String> infoMap, Map<String, String> formatMap,
            Map<String, SetMultimap<String, String>> sampAnnoBuild, BufferedReader r) throws IOException {
        String currLine;
        while (((currLine = r.readLine()) != null) && (currLine.startsWith("##"))) {
            String[] cat = currLine.split("=", 2);
            if (cat.length < 2) {
                continue;
            }
            switch (cat[0]) {
                case "##SAMPLE":
                    SetMultimap<String, String> mapOfAnno = TaxaListIOUtils.parseVCFHeadersIntoMap(cat[1]);
                    //Guard with hasNext(): iterator().next() on a sample row without an ID would
                    //throw NoSuchElementException before the old null check could fire
                    Iterator<String> idIt = mapOfAnno.get("ID").iterator();
                    if (!idIt.hasNext()) {
                        break;
                    }
                    sampAnnoBuild.put(idIt.next(), mapOfAnno);
                    break;
                case "##PEDIGREE":
                    break;
                default:
                    break;
            }
        }
        return currLine;
    }

    /**
     * Shift commas and equals signs that fall inside double quotes up by 256 code points so a
     * later naive split on ','/'=' will not break quoted values.
     */
    private static String getReplaceCommaWithinQuote(String s) {
        StringBuilder sb = new StringBuilder(s);
        boolean inQuote = false;
        for (int i = 0; i < sb.length(); i++) {
            if (sb.charAt(i) == '\"') {
                inQuote = (!inQuote);
            }
            if (inQuote && sb.charAt(i) == ',') {
                sb.setCharAt(i, (char) ((int) ',' + 256));
            }
            if (inQuote && sb.charAt(i) == '=') {
                sb.setCharAt(i, (char) ((int) '=' + 256));
            }
        }
        return sb.toString();
    }

    /**
     * Parse the #CHROM header row into a TaxaList, attaching any ##SAMPLE annotations, and record
     * the column layout in {@code hp}.
     *
     * @param readLn         the #CHROM header line
     * @param taxaAnnotation ##SAMPLE annotations keyed by sample ID
     */
    private TaxaList processTaxa(String readLn, Map<String, SetMultimap<String, String>> taxaAnnotation) {
        String[] header = TAB_PATTERN.split(readLn);
        hp = new HeaderPositions(header);
        int numTaxa = header.length - hp.NUM_HAPMAP_NON_TAXA_HEADERS;
        TaxaListBuilder tlb = new TaxaListBuilder();
        for (int i = 0; i < numTaxa; i++) {
            String taxonID = header[i + hp.NUM_HAPMAP_NON_TAXA_HEADERS];
            Taxon.Builder at = new Taxon.Builder(taxonID);
            SetMultimap<String, String> taMap = taxaAnnotation.get(taxonID);
            if (taMap != null) {
                for (Map.Entry<String, String> en : taMap.entries()) {
                    if (en.getKey().equals("ID")) {
                        continue; //skip the IDs as these became the name
                    }
                    String s = en.getValue().replace("\"", "");
                    at.addAnno(en.getKey(), s);
                }
            }
            tlb.add(at.build());
        }
        return tlb.build();
    }
}

/**
 * Column indices of the standard VCF header fields, derived from the #CHROM header row.
 * An index of -1 means the column was absent.
 */
class HeaderPositions {

    final int NUM_HAPMAP_NON_TAXA_HEADERS;
    final int GENOIDX;
    final int SNPID_INDEX;
    // final int VARIANT_INDEX;
    final int FILTER_INDEX;
    final int QUAL_INDEX;
    final int CHROMOSOME_INDEX;
    final int POSITION_INDEX;
    final int REF_INDEX;
    final int ALT_INDEX;
    final int INFO_INDEX;
    final int FORMAT_INDEX;

    public HeaderPositions(String[] header) {
        //Accept either "#CHROM" (standard) or "#CHR"
        int chrIdx = firstEqualIndex(header, "#CHROM");
        if (chrIdx < 0) {
            chrIdx = firstEqualIndex(header, "#CHR");
        }
        CHROMOSOME_INDEX = chrIdx;
        POSITION_INDEX = firstEqualIndex(header, "POS");
        SNPID_INDEX = firstEqualIndex(header, "ID");
        REF_INDEX = firstEqualIndex(header, "REF");
        ALT_INDEX = firstEqualIndex(header, "ALT");
        QUAL_INDEX = firstEqualIndex(header, "QUAL");
        FILTER_INDEX = firstEqualIndex(header, "FILTER");
        INFO_INDEX = firstEqualIndex(header, "INFO");
        FORMAT_INDEX = firstEqualIndex(header, "FORMAT");
        //Taxa columns start right after the last fixed column present
        NUM_HAPMAP_NON_TAXA_HEADERS = Math.max(INFO_INDEX, FORMAT_INDEX) + 1;
        GENOIDX = NUM_HAPMAP_NON_TAXA_HEADERS;
    }

    private static int firstEqualIndex(String[] sa, String match) {
        for (int i = 0; i < sa.length; i++) {
            if (sa[i].equals(match)) {
                return i;
            }
        }
        return -1;
    }
}

/**
 * Worker that parses one block of VCF data lines into genotype (and optionally depth) arrays.
 * Submitted to an ExecutorService; call() returns this so results can be collected from futures.
 */
//class ProcessVCFBlock implements Runnable {
class ProcessVCFBlock implements Callable<ProcessVCFBlock> {

    private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s");
    private static final Pattern SLASH_PATTERN = Pattern.compile("/");
    private final HeaderPositions hp;
    private final int taxaN;
    private final int siteN;
    private final int startSite; //Integer.MIN_VALUE if unknown
    private ArrayList<String> txtL;
    private byte[][] gTS;   //genotypes [taxon][site]
    private byte[][][] dTS; //depth [taxon][allele 0-5][site]
    private final ArrayList<Position> blkPosList;
    private final boolean keepDepth;

    private ProcessVCFBlock(int taxaN, HeaderPositions hp, ArrayList<String> txtL, int startSite, boolean keepDepth) {
        this.taxaN = taxaN;
        this.siteN = txtL.size();
        this.txtL = txtL;
        this.hp = hp;
        blkPosList = new ArrayList<>(siteN);
        this.startSite = startSite;
        this.keepDepth = keepDepth;
    }

    /*Used to process VCF blocks and return the result for an in-memory GenotypeTable*/
    static ProcessVCFBlock getInstance(int taxaN, HeaderPositions hp, ArrayList<String> txtL, boolean keepDepth) {
        return new ProcessVCFBlock(taxaN, hp, txtL, Integer.MIN_VALUE, keepDepth);
    }

    @Override
    public ProcessVCFBlock call() throws Exception {
        Map<String, Chromosome> chromosomeLookup = new HashMap<>();
        gTS = new byte[taxaN][siteN];
        if (keepDepth == true) {
            dTS = new byte[taxaN][6][siteN];
        }
        for (int s = 0; s < siteN; s++) {
            //really needs to use a Splitter iterator to make this cleaner if it is performant
            String input = txtL.get(s);
            try {
                //Locate the tab separators up front so fields can be sliced by substring
                int[] tabPos = new int[hp.NUM_HAPMAP_NON_TAXA_HEADERS + taxaN];
                int tabIndex = 0;
                int len = input.length();
                for (int i = 0; (tabIndex < hp.NUM_HAPMAP_NON_TAXA_HEADERS + taxaN) && (i < len); i++) {
                    if (input.charAt(i) == '\t') {
                        tabPos[tabIndex++] = i;
                    }
                }
                String chrName = input.substring(0, tabPos[hp.CHROMOSOME_INDEX]);
                Chromosome currChr = chromosomeLookup.get(chrName);
                if (currChr == null) {
                    //new String() detaches the name from the full-line backing array
                    currChr = new Chromosome(new String(chrName));
                    chromosomeLookup.put(chrName, currChr);
                }
                String snpID = null;
                if (hp.SNPID_INDEX > 0) {
                    snpID = input.substring(tabPos[hp.SNPID_INDEX - 1] + 1, tabPos[hp.SNPID_INDEX]);
                }
                String refS = input.substring(tabPos[hp.REF_INDEX - 1] + 1, tabPos[hp.REF_INDEX]);
                String alt = input.substring(tabPos[hp.ALT_INDEX - 1] + 1, tabPos[hp.ALT_INDEX]);
                String variants;
                if (alt.equals(".")) {
                    variants = refS;
                } else {
                    //NOTE(review): the "<INS>"/"<DEL>" literals were eaten as HTML tags in the
                    //scraped source and are reconstructed here — confirm against upstream TASSEL.
                    variants = (refS + "/" + alt).replace(',', '/')
                            .replace("<INS>", "+").replace('I', '+')
                            .replace("<DEL>", "-").replace('D', '-')
                            .replace("*", "N");
                }
                GeneralPosition.Builder apb = new GeneralPosition.Builder(currChr,
                        Integer.parseInt(input.substring(tabPos[hp.POSITION_INDEX - 1] + 1, tabPos[hp.POSITION_INDEX])))
                        .knownVariants(variants) //TODO strand, variants,
                        ;
                if (snpID != null && !snpID.equals(".")) {
                    apb.snpName(snpID);
                }
                byte[] alleles = new byte[variants.split("/").length];
                for (int i = 0, varInd = 0; i < alleles.length; i++, varInd += 2) {
                    alleles[i] = NucleotideAlignmentConstants.getNucleotideAlleleByte(variants.charAt(varInd));
                }
                /***ZRM 8_27 New code***/
                //Re-derive alleles with indel handling: multi-character variants are trimmed of
                //their shared leading base and single bases opposite them become deletions ("-")
                String[] variantList = variants.split("/");
                if (variantList[0].length() > 1) {
                    String[] parsedVariantList = new String[variantList.length]; //alt deletion
                    for (int i = 0; i < variantList.length; i++) {
                        //Pull off the first character if it exists
                        if (variantList[i].length() > 1) {
                            parsedVariantList[i] = variantList[i].substring(1);
                            if (parsedVariantList[i].length() == 0) {
                                parsedVariantList[i] = "-";
                            }
                        } else {
                            //Mark as deletion
                            parsedVariantList[i] = "-";
                        }
                    }
                    for (int i = 0; i < parsedVariantList.length; i++) {
                        alleles[i] = NucleotideAlignmentConstants.getNucleotideAlleleByte(parsedVariantList[i].charAt(0));
                    }
                } else {
                    //Check for reference deletion (insertion): is any alt longer than the ref?
                    boolean isIndel = false;
                    for (int i = 1; i < variantList.length; i++) {
                        if (variantList[i].length() > variantList[0].length()) {
                            isIndel = true;
                            break;
                        }
                    }
                    if (isIndel) {
                        String[] parsedVariantList = new String[variantList.length]; //ref+alt deletion
                        for (int i = 0; i < variantList.length; i++) {
                            //Pull off the first character if it exists
                            if (variantList[i].length() > 1) {
                                parsedVariantList[i] = variantList[i].substring(1);
                                if (parsedVariantList[i].length() == 0) {
                                    parsedVariantList[i] = "-";
                                }
                            } else {
                                //Mark as deletion
                                parsedVariantList[i] = "-";
                            }
                        }
                        for (int i = 0; i < parsedVariantList.length; i++) {
                            alleles[i] = NucleotideAlignmentConstants.getNucleotideAlleleByte(parsedVariantList[i].charAt(0));
                        }
                    } else {
                        //if not an indel just put it in the allele array
                        for (int i = 0; i < variantList.length; i++) {
                            alleles[i] = NucleotideAlignmentConstants.getNucleotideAlleleByte(variantList[i].charAt(0));
                        }
                    }
                }
                /***ZRM 8_27 New code end***/
                apb.allele(WHICH_ALLELE.Reference, alleles[0]);
                if (alleles.length > 1) {
                    apb.allele(WHICH_ALLELE.Alternate, alleles[1]);
                }
                for (String annoS : Splitter.on(";").split(input.substring(tabPos[hp.INFO_INDEX - 1] + 1, tabPos[hp.INFO_INDEX]))) {
                    apb.addAnno(annoS);
                }
                blkPosList.add(apb.build());
                final int iGT = 0; //genotype index
                int iAD = -1; //alleleDepth index (DP/GQ/PL are not currently consumed)
                if (hp.FORMAT_INDEX >= 0) {
                    //Check to see if FORMAT tag is missing. Only applicable for single taxa files
                    if (tabPos[hp.FORMAT_INDEX] == 0) {
                        throw new IllegalStateException("Error Processing VCF: Missing FORMAT tag.");
                    }
                    String unsplitInput = input.substring(tabPos[hp.FORMAT_INDEX - 1] + 1, tabPos[hp.FORMAT_INDEX]);
                    if (unsplitInput.length() == 0 || !unsplitInput.startsWith("GT")) {
                        //Check to see it has the GT field
                        if (unsplitInput.contains("GT")) {
                            throw new IllegalStateException("Error Processing VCF Block: GT field is not in first position of FORMAT.");
                        } else {
                            //If GT isnt in, we assume that it is missing FORMAT
                            throw new IllegalStateException("Error Processing VCF Block: Missing FORMAT tag.");
                        }
                    }
                    String[] formatS = unsplitInput.split(":");
                    iAD = firstEqualIndex(formatS, "AD");
                }
                int t = 0;
                for (String taxaAllG : Splitter.on("\t").split(input.substring(tabPos[hp.NUM_HAPMAP_NON_TAXA_HEADERS - 1] + 1))) {
                    int f = 0;
                    //if(taxaAllG.equals(".")) {
                    if (taxaAllG.startsWith(".")) {
                        gTS[t][s] = GenotypeTable.UNKNOWN_GENOTYPE;
                        //need to still move up the taxa counter by one as we have covered this now
                        t++;
                        continue;
                    }
                    for (String fieldS : Splitter.on(":").split(taxaAllG)) {
                        if (f == iGT) {
                            //String "[.0-9]\\/[.0-9]||[.0-9]\\|[.0-9]" will match a valid diploid
                            if (!fieldS.equals(".")) {
                                //[TAS-509] Check to make sure we are using diploids in the form 0/1 or 0|0
                                int a1 = 0;
                                int a2 = 0;
                                //Should be a split on |/ but for speed we just check the length
                                if (fieldS.length() == 1) {
                                    a1 = fieldS.charAt(0) - '0';
                                    a2 = fieldS.charAt(0) - '0';
                                } else {
                                    a1 = fieldS.charAt(0) - '0';
                                    a2 = fieldS.charAt(2) - '0';
                                }
                                if (a1 > alleles.length - 1 || a2 > alleles.length - 1) {
                                    Position pos = blkPosList.get(blkPosList.size() - 1);
                                    throw new IllegalStateException("\nError Processing VCF block: Mismatch of alleles.\n At Chromosome "
                                            + pos.getChromosome().getName() + ", Position " + pos.getPosition()
                                            + ".\nAllele ID larger than number of alleles");
                                }
                                if (a1 < 0 || a2 < 0) {
                                    gTS[t][s] = GenotypeTable.UNKNOWN_GENOTYPE;
                                } else {
                                    gTS[t][s] = GenotypeTableUtils.getDiploidValue(alleles[a1], alleles[a2]);
                                }
                            } else {
                                //[TAS-509] if it isnt a diploid error out early
                                throw new IllegalStateException("Error Processing VCF block: Found haploid information for the element: "
                                        + taxaAllG + ".\nExpected a diploid entry.");
                            }
                        } else if ((f == iAD) && keepDepth) {
                            if (gTS[t][s] != GenotypeTable.UNKNOWN_GENOTYPE) {
                                int i = 0;
                                for (String ad : Splitter.on(",").split(fieldS)) {
                                    if (i >= alleles.length) {
                                        continue;
                                    }
                                    if (alleles[i] == GenotypeTable.UNKNOWN_ALLELE || ad.equals(".")
                                            || alleles[i] == NucleotideAlignmentConstants.UNDEFINED_ALLELE
                                            || alleles[i] == NucleotideAlignmentConstants.UNDEFINED_HOMOZYGOUS) {
                                        //no position for depth of unknown alleles or depth is set to missing, so skip
                                        i++;
                                        continue;
                                    }
                                    int adInt = Integer.parseInt(ad);
                                    dTS[t][alleles[i++]][s] = AlleleDepthUtil.depthIntToByte(adInt);
                                }
                            }
                        }
                        f++;
                    }
                    t++;
                }
            } catch (IllegalStateException e) {
                e.printStackTrace();
                throw e;
            } catch (Exception e) {
                System.err.println("Err Site Number:" + s);
                e.printStackTrace();
                throw e;
            }
        }
        txtL = null; //release the raw text once parsed
        return this;
    }

    int getSiteNumber() {
        return siteN;
    }

    byte[][] getGenoTS() {
        return gTS;
    }

    byte[][][] getDepthTS() {
        return dTS;
    }

    ArrayList<Position> getBlkPosList() {
        return blkPosList;
    }

    private static int firstEqualIndex(String[] sa, String match) {
        for (int i = 0; i < sa.length; i++) {
            if (sa[i].equals(match)) {
                return i;
            }
        }
        return -1;
    }
}




// © 2015 - 2024 Weber Informatics LLC