All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.maizegenetics.pangenome.fastaExtraction.GVCFSequence Maven / Gradle / Ivy

There is a newer version: 1.10
Show newest version
package net.maizegenetics.pangenome.fastaExtraction;

import com.google.common.collect.Range;
import com.google.common.collect.RangeMap;
import com.google.common.collect.TreeRangeMap;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFFileReader;
import net.maizegenetics.analysis.phg.ParseGVCF;
import net.maizegenetics.dna.map.Chromosome;
import net.maizegenetics.dna.map.GenomeSequence;
import net.maizegenetics.dna.map.GenomeSequenceBuilder;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.dna.snp.GenotypeTable;
import net.maizegenetics.dna.snp.NucleotideAlignmentConstants;
import net.maizegenetics.pangenome.api.HaplotypeNode;
import net.maizegenetics.util.Tuple;
import org.apache.log4j.Logger;

import java.io.File;
import java.util.*;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.Future;
import java.util.concurrent.atomic.LongAdder;
import java.util.stream.Collector;
import java.util.stream.Collectors;

/**
 * GVCFSequence.java
 * 

* Simple class to read in a GVCF then get the FASTA encoded sequence taking into account the variants in the GVCF file * Uses a RangeMap to hold the GVCF variant calls for a given position range. * Will coalesce any overlapping regions by concatenating the calls together in order of gvcf. *

* Created by zrm22 on 7/17/17. */ public class GVCFSequence implements GenomeSequence { private static final Logger myLogger = Logger.getLogger(GVCFSequence.class); //Setup instance variables protected GenomeSequence myReferenceSequence; protected RangeMap myGVCFRangeToAlleleCallMap; //Variables to make some of the GenomeSequence methods work easier protected Map chromLengthLookup; protected RangeMap wholeGenomeIndexMap; protected boolean myMissingAsRef; public GVCFSequence() {} /** * Basic Constructor to create a GVCFSequence * * @param refFileName File name of the reference sequence in FASTA format * @param gvcfFileName File name of the GVCF file containing the variants */ private GVCFSequence(String refFileName, String gvcfFileName, boolean missingAsRef) { this(GenomeSequenceBuilder.instance(refFileName), gvcfFileName,missingAsRef); } /** * Basic Constructor to create a GVCFSequence when the GenomeSequence was already created * * @param genomeSequence GenomeSequence object representing the reference sequence * @param gvcfFileName File name of the GVCF file containing the variants */ //Second constructor to the process faster when processing multiple fastas. Create the Reference GenomeSequence once. private GVCFSequence(GenomeSequence genomeSequence, String gvcfFileName, boolean missingAsRef) { myLogger.info("GVCFSequence: Creating GenomeSequnce from file:"); myReferenceSequence = genomeSequence; chromLengthLookup = setupChromLengthLookup(); wholeGenomeIndexMap = setupWholeGenomeIndexMap(); myGVCFRangeToAlleleCallMap = parseGVCFFileIntoRangeMap(gvcfFileName); myMissingAsRef = missingAsRef; } /** * Constructor to build a GVCFSequence from a List of VariantContexts * @param genomeSequence * @param variantContexts * @param missingAsRef * @param taxonName */ private GVCFSequence(GenomeSequence genomeSequence, List variantContexts, boolean missingAsRef, String taxonName) { myReferenceSequence = genomeSequence; chromLengthLookup = setupChromLengthLookup(); wholeGenomeIndexMap = setupWholeGenomeIndexMap(); myGVCFRangeToAlleleCallMap = parseVariantContextsIntoRangeMap(variantContexts, taxonName,0); myMissingAsRef = missingAsRef; } private GVCFSequence(GenomeSequence genomeSequence, List variantInfos, boolean missingAsRef) { myReferenceSequence = genomeSequence; chromLengthLookup = setupChromLengthLookup(); wholeGenomeIndexMap = setupWholeGenomeIndexMap(); myGVCFRangeToAlleleCallMap = parseVariantInfosIntoRangeMap(variantInfos); myMissingAsRef = missingAsRef; } /** * Returns an initialized GVCFSequence given the input Reference and GVCF Files. * * @param refFileName File name of the Reference file used in creating the GVCF. * @param gvcfFileName File name of the GVCF file which defines the variants of the taxon to the reference * @return GenomeSequence which can be used to get sequence out */ public static GenomeSequence instance(String refFileName, String gvcfFileName) { return new GVCFSequence(refFileName, gvcfFileName,false); } /** * Returns an initialized GVCFSequence given the input Reference and GVCF Files. * This should be used if you are running multiple GVCFs through using the same Reference File. * Currently will speed up processing each GVCF by about a minute. * * @param referenceSequence GenomeSequence object already read into memory. * @param gvcfFileName File name of the GVCF file which defines the variants of the taxon to the reference * @return GenomeSequence which can be used to get sequence out */ public static GenomeSequence instance(GenomeSequence referenceSequence, String gvcfFileName) { return new GVCFSequence(referenceSequence, gvcfFileName,false); } /** * Instance method to add in missing as Ref from a file * @param referenceSequence * @param gvcfFileName * @param missingAsRef * @return */ public static GenomeSequence instance(GenomeSequence referenceSequence, String gvcfFileName, boolean missingAsRef) { return new GVCFSequence(referenceSequence,gvcfFileName,missingAsRef); } /** * Instance method to create a GenomeSequenc from a List of VariantContexts * @param referenceSequence * @param variantContexts * @param missingAsRef * @param taxonName * @return */ public static GenomeSequence instance(GenomeSequence referenceSequence, List variantContexts, boolean missingAsRef, String taxonName) { return new GVCFSequence(referenceSequence, variantContexts,missingAsRef,taxonName); } public static GenomeSequence instance(GenomeSequence referenceSequence, List variantContexts, boolean missingAsRef) { return new GVCFSequence(referenceSequence, variantContexts,missingAsRef); } /** * Method to get a map of taxon to its genome sequence for all the taxon in the vcf file * @param referenceSequence * @param vcfFileName * @param missingAsRef * @return */ public static Map allTaxonInstance(GenomeSequence referenceSequence, String vcfFileName, boolean missingAsRef) { try(VCFFileReader reader = new VCFFileReader(new File(vcfFileName), false)) { List variants = reader.iterator().stream().collect(Collectors.toList()); return reader.getFileHeader() .getGenotypeSamples() .stream() .collect(Collectors.toMap(taxon -> taxon, taxon -> new GVCFSequence(referenceSequence,variants,missingAsRef,taxon))); } catch(Exception e) { throw new IllegalStateException(e); } } /** * Simple method to setup the chromosome length lookup map to be used for the GenomeSequence methods * * @return map mapping the chromosome to its length on the ref sequence. */ protected Map setupChromLengthLookup() { return myReferenceSequence.chromosomes().stream().collect(Collectors.toMap(chrom -> chrom, chrom -> myReferenceSequence.chromosomeSize(chrom))); } /** * Method to setup the RangeMap containing the start and End position of each chromosome in order * Uses a new collect method which collects the stream into a RangeMap. * @return RangeMap containing the indices of each chromosome */ protected RangeMap setupWholeGenomeIndexMap() { LongAdder genomeIndex = new LongAdder(); return chromosomes().stream().sorted() .collect(Collector.of(TreeRangeMap::create, (tree, chrom) -> { int length = chromLengthLookup.get(chrom); tree.put(Range.closed(genomeIndex.longValue(), genomeIndex.longValue() + length - 1), chrom); genomeIndex.add(length); }, (result1, result2) -> { result1.putAll(result2); return result1; })); } /** * Method to take the GVCF file and parse it into a RangeMap. * Blocks of Reference sequence have a value of 'REFRANGE' which redirects to the Reference GenomeSequence * Single SNPs are stored as single element ranges * Indels are stored as well. Deletions generally take up multiple reference positions * * @param gvcfFile fileName for the GVCF file * @return RangeMap which holds the variant calls(or reference) for ranges of Positions specified by the gvcfFile */ protected RangeMap parseGVCFFileIntoRangeMap(String gvcfFile) { RangeMap gvcfRangeMap = TreeRangeMap.create(); try { //Walk through the gvcf file using Terry's code //TODO replace with Streaming version when done. Tuple, BlockingQueue>> temp = ParseGVCF.parse(gvcfFile); List headers = temp.getX(); BlockingQueue> queue = temp.getY(); ParseGVCF.ProcessLines next = queue.take().get(); while (!next.isFinal()) { for (ParseGVCF.GVCFLine current : next.processedLines()) { // use current //For each line grab the GT call and see if it should be a REFRANGE or something else(alt call) if (current.isReferenceBlock()) { //If it is a reference block, add REFRANGE to the RangeMap addRange(gvcfRangeMap, Range.closed(Position.of(current.chromosome(), current.startPosition()), Position.of(current.chromosome(), current.endPosition())), "REFRANGE"); } else { //Check the GT call and export whatever string is there if (current.isHomozygous()) { //if it is homozygous we can just grab the first entry in the GT field //not a block put in Range.closed(pos,endpos) //Most of the time for SNPs or insertions endpos = pos. String allele = current.genotypes().get(0); addRange(gvcfRangeMap, Range.closed(Position.of(current.chromosome(), current.startPosition()), Position.of(current.chromosome(), current.endPosition())), allele); } else { //TODO handle the hets correctly } } } next = queue.take().get(); } } catch (Exception e) { myLogger.debug(e.getMessage(), e); throw new IllegalStateException("GVCF To Fasta: Problem parsing: " + gvcfFile); } return gvcfRangeMap; } /** * Method to parse the variantContexts into rangeMap for a single taxon. This will simply call parseVariantContextsIntoRangeMap but with a gameteIndex of 0 * @param variantContexts * @param taxonName * @return */ protected RangeMap parseVariantContextsIntoRangeMap(List variantContexts, String taxonName) { return parseVariantContextsIntoRangeMap(variantContexts,taxonName,0); } /** * Method to parse the List of variantContexts into the rangemap required by this class. Allows you to build the GVCFSequence from memory. * @param variantContexts * @param taxonName * @return */ protected RangeMap parseVariantContextsIntoRangeMap(List variantContexts, String taxonName, int gameteIndex) { RangeMap gvcfRangeMap = TreeRangeMap.create(); try { //Remove any vcf records where we do not have the taxon List unknownVariantsRemovedList = variantContexts.stream() .filter(variantContext -> variantContext.getGenotype(taxonName)!=null) .collect(Collectors.toList()); Chromosome chr = null; String prevContigString = null; for(VariantContext currentVariant : unknownVariantsRemovedList) { if(chr == null || !prevContigString.equals(currentVariant.getContig())) { chr = Chromosome.instance(currentVariant.getContig()); prevContigString = currentVariant.getContig(); } //Check to see if currentVariant is a reference block //This REFRANGE should not be confused with the PHG API ReferenceRange object which represents an anchor. //This REFRANGE is a reference block in a VCF file(with END annotation) if(isRefRange(currentVariant)) { addRange(gvcfRangeMap, Range.closed(Position.of(chr, currentVariant.getStart()), Position.of(chr, currentVariant.getEnd())), "REFRANGE"); } //If not, its either SNP, indel or straight monomorphic call //Get the allele value from the genotype and add to the map. else { Genotype currentGenotype = currentVariant.getGenotype(taxonName); addRange(gvcfRangeMap, Range.closed(Position.of(chr, currentVariant.getStart()), Position.of(chr, currentVariant.getEnd())), currentGenotype.getAllele(gameteIndex).getBaseString()); } } } catch (Exception e) { myLogger.debug(e.getMessage(), e); throw new IllegalStateException("GVCFSequence parseVariantContextsIntoRangeMap error converting List to RangeMap for Taxon:" + taxonName); } return gvcfRangeMap; } /** * Method to parse the List of variantContexts into the rangemap required by this class. Allows you to build the GVCFSequence from memory. * @param variantInfos * @return */ protected RangeMap parseVariantInfosIntoRangeMap(List variantInfos) { RangeMap gvcfRangeMap = TreeRangeMap.create(); try { Chromosome chr = null; String prevContigString = null; for(HaplotypeNode.VariantInfo currentVariant : variantInfos) { if(chr == null || !prevContigString.equals(currentVariant.chromosome())) { chr = Chromosome.instance(currentVariant.chromosome()); prevContigString = currentVariant.chromosome(); } //Check to see if currentVariant is a reference block //This REFRANGE should not be confused with the PHG API ReferenceRange object which represents an anchor. //This REFRANGE is a reference block in a VCF file(with END annotation) if(!currentVariant.isVariant()) { addRange(gvcfRangeMap, Range.closed(Position.of(chr, currentVariant.start()), Position.of(chr, currentVariant.end())), "REFRANGE"); } //If not, its either SNP, indel or straight monomorphic call //Get the allele value from the genotype and add to the map. else { String currentGenotype = currentVariant.genotypeString(); addRange(gvcfRangeMap, Range.closed(Position.of(chr, currentVariant.start()), Position.of(chr, currentVariant.end())), currentGenotype); } } } catch (Exception e) { myLogger.debug(e.getMessage(), e); throw new IllegalStateException("GVCFSequence parseVariantContextsIntoRangeMap error converting List to RangeMap"); } return gvcfRangeMap; } /** * addRange method adapted from Louis Wasserman's(author of RangeMap) answer here: https://stackoverflow.com/questions/41418416/intersecting-ranges-when-using-rangemap * This method will concatenate the string if the range overlaps an existing range in the map. Because VCF files should be ordered(in the spec) the concatenation should be ordered correctly * * @param originalRangeMap rangeMap object which we want to add a new range to * @param newRange Range object which we need to make sure the range does not collide with any ranges in originalRangeMap * @param newCall String value for the new call associated with newRange */ protected void addRange(RangeMap originalRangeMap, Range newRange, String newCall) { //Get the list of overlapping ranges with the new range //This will just return the intersecting range and not the full range that intersects. //ex if [1,10] is in the map and we are adding [8,12], overlaps will contain [8,10] List, String>> overlaps = new ArrayList<>( originalRangeMap.subRangeMap(newRange).asMapOfRanges().entrySet()); //if overlaps has length, we have to merge ranges together if (overlaps.size() != 0) { //because gvcf is ordered, we can just grab the first one and append the call on the end //Because overlap just gives intersecting range, we need to get the original range map so we can union Map.Entry, String> overlappingEntry = originalRangeMap.getEntry(overlaps.get(0).getKey().lowerEndpoint()); //Fixing overlapping bug PHG-65 if(newCall.equals("REFRANGE") && overlappingEntry.getValue().equals("REFRANGE")) { //If both are REFRANGEs we should only store a REFRANGE //This means they are consecutive and should be handled like originally done. //then use the combined range and assign the call String newMergedCall = overlappingEntry.getValue(); //putCoalescing will merge the two records together originalRangeMap.putCoalescing(newRange, newMergedCall); } else if(newCall.equals("REFRANGE") && !overlappingEntry.getValue().equals("REFRANGE")) { //Because GVCF is ordered, shift the newCall's starting position up to the overlappingEntry.key().upperEndPoint //make sure shifting will not remove the new range(in examples where new range is 1 bp) Position endPositionOfOverlap = overlappingEntry.getKey().upperEndpoint(); if(endPositionOfOverlap.getPosition() < newRange.upperEndpoint().getPosition()) { Position newStartPosition = Position.of(endPositionOfOverlap.getChromosome(), endPositionOfOverlap.getPosition() + 1); originalRangeMap.put(Range.closed(newStartPosition, newRange.upperEndpoint()), newCall); } //else do nothing as the 1 bp ref range was covered by a previous record(likely multi bp indel) } else if(!newCall.equals("REFRANGE") && overlappingEntry.getValue().equals("REFRANGE")) { //This means the previous record was a reference range that spanned too much //This likely does not happen much in practice Position endPositionOfOverlap = newRange.lowerEndpoint(); //Make sure we do not have a 1bp ref range already in the map if(endPositionOfOverlap.getPosition() > overlappingEntry.getKey().lowerEndpoint().getPosition()) { Position newEndPosition = Position.of(endPositionOfOverlap.getChromosome(), endPositionOfOverlap.getPosition() - 1); //Remove the current record originalRangeMap.remove(overlappingEntry.getKey()); //Add in the refrange with new bounds and the call originalRangeMap.put(Range.closed(overlappingEntry.getKey().lowerEndpoint(), newEndPosition), overlappingEntry.getValue()); originalRangeMap.put(newRange,newCall); } else { //We need to remove the previous entry and add in the new one as it overrides the REF call originalRangeMap.remove(overlappingEntry.getKey()); originalRangeMap.put(newRange,newCall); } } else { //This means they are consecutive and should be handled like originally done. //then use the combined range and assign the call String newMergedCall = overlappingEntry.getValue() + newCall; //Changing the above to use RangeMap.putCoalescing //First update the existing Range Key with the new value //Second coalesce by the new range //The range map will merge the two overlapping ranges and will use newMergedCall as the value originalRangeMap.put(overlappingEntry.getKey(), newMergedCall); originalRangeMap.putCoalescing(newRange, newMergedCall); } } //else just add the range and value to the map else { originalRangeMap.put(newRange, newCall); } } /** * Simple method to tell if a variantContext is in a reference block in the VCF file. * * This RefRange is not to be confused with the ReferenceRange object in the PHG api. * @param currentVariant * @return */ protected boolean isRefRange(VariantContext currentVariant) { //If the record has a END annotation and the length of the allele ==1 it is ref range return currentVariant.hasAttribute("END") && currentVariant.getReference().getBaseString().length()==1; } /** * Simple method to determine if the position in the rangeMap is a Reference Block * * @param position Position object we want to check * @return true if RangeMap.get(position) is REFRANGE, false if not. */ protected boolean isPositionRefBlock(Position position) { if (myGVCFRangeToAlleleCallMap.get(position).equals("REFRANGE")) { return true; } else { return false; } } /** * Simple method to check to see if the position is currently in the map. * * @param position Position object we want to check * @return true if RangeMap.get(position) != null */ protected boolean isPositionInMap(Position position) { if (myGVCFRangeToAlleleCallMap.get(position) == null) { return false; } else { return true; } } //Implementing the methods from GenomeSequence @Override public Set chromosomes() { return myReferenceSequence.chromosomes(); } @Override public byte[] chromosomeSequence(Chromosome chrom) { return chromosomeSequence(chrom, 1, myReferenceSequence.chromosomeSize(chrom)); } @Override public byte[] chromosomeSequence(Chromosome chrom, int startSite, int endSite) { return NucleotideAlignmentConstants.convertHaplotypeStringToAlleleByteArray(genotypeAsString(chrom, startSite, endSite)); } @Override public byte[] genomeSequence(long startSite, long lastSite) { return NucleotideAlignmentConstants.convertHaplotypeStringToAlleleByteArray(genomeSequenceAsString(startSite, lastSite)); } @Override public String genomeSequenceAsString(long startSite, long lastSite) { StringBuilder sequenceBuilder = new StringBuilder(); long currentSiteToGet = startSite; while (currentSiteToGet < lastSite) { Map.Entry, Chromosome> rangeChromEntry = wholeGenomeIndexMap.getEntry(currentSiteToGet); int chrStart = (int) (currentSiteToGet - rangeChromEntry.getKey().lowerEndpoint()); int chrLast = (int) Math.min(rangeChromEntry.getKey().upperEndpoint() - rangeChromEntry.getKey().lowerEndpoint(), lastSite - rangeChromEntry.getKey().lowerEndpoint()); String chromSeq = genotypeAsString(rangeChromEntry.getValue(), chrStart + 1, chrLast + 1); //+1 for 0 based genome, 1 based chromosomes sequenceBuilder.append(chromSeq); currentSiteToGet += chromSeq.length(); } return sequenceBuilder.toString(); } @Override public Map> fullRefCoordinateToChromCoordinate(ArrayList coordinates) { return myReferenceSequence.fullRefCoordinateToChromCoordinate(coordinates); } @Override public int chromosomeSize(Chromosome chromosome) { return myReferenceSequence.chromosomeSize(chromosome); } @Override public long genomeSize() { return myReferenceSequence.genomeSize(); } @Override public int numberOfChromosomes() { return myReferenceSequence.numberOfChromosomes(); } @Override public byte genotype(Chromosome chrom, int position) { return genotype(chrom, Position.of(chrom.getName(), position)); } @Override public byte genotype(Chromosome chrom, Position positionObject) { return NucleotideAlignmentConstants.getNucleotideAlleleByte(genotypeAsString(chrom, positionObject)); } @Override public String genotypeAsString(Chromosome chrom, int position) { return genotypeAsString(chrom, Position.of(chrom, position)); } @Override public String genotypeAsString(Chromosome chrom, Position positionObject) { String currentValue = myGVCFRangeToAlleleCallMap.get(positionObject); if (currentValue == null) { if(myMissingAsRef) { return myReferenceSequence.genotypeAsString(chrom,positionObject); } else { return GenotypeTable.UNKNOWN_ALLELE_STR; } } if (currentValue.equals("REFRANGE")) { //use the genomeSequence to get the call return myReferenceSequence.genotypeAsString(positionObject.getChromosome(), positionObject.getPosition()); } else { return currentValue; } } @Override public String genotypeAsString(Chromosome chrom, int startSite, int endSite) { StringBuilder sequence = new StringBuilder(); for (int currentPosition = startSite; currentPosition <= endSite; currentPosition++) { Position currentPositionObject = Position.of(chrom, currentPosition); Map.Entry, String> currentEntry = myGVCFRangeToAlleleCallMap.getEntry(currentPositionObject); //Check to make sure we have an entry if(currentEntry!=null) { if (!currentEntry.getValue().equals("REFRANGE")) { //If it is not a refRange, just get the GVCF allele variant sequence.append(currentEntry.getValue()); } else { //Figure out what the start and end point should be if we have a refBlock so we know how much reference we should extract //Need to make sure we stay within the requested boundaries as the RefBlock can go past. int resolvedStartSite = Math.max(currentEntry.getKey().lowerEndpoint().getPosition(),currentPosition); int resolvedEndSite = Math.min(currentEntry.getKey().upperEndpoint().getPosition(), endSite); //Extract the reference for the region sequence.append(myReferenceSequence.genotypeAsString(chrom, resolvedStartSite, resolvedEndSite )); } //Slide up the currentPosition pointer currentPosition = currentEntry.getKey().upperEndpoint().getPosition(); } else { //This means that it is missing for this site. if(myMissingAsRef) { //If this option is set, get the reference allele for this site sequence.append(myReferenceSequence.genotypeAsString(chrom,currentPosition)); } else { //Else, put a single N. sequence.append(GenotypeTable.UNKNOWN_ALLELE_STR); } } } return sequence.toString(); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy