net.maizegenetics.pangenome.fastaExtraction.GVCFSequence Maven / Gradle / Ivy
Show all versions of phg Show documentation
package net.maizegenetics.pangenome.fastaExtraction;
import com.google.common.collect.Range;
import com.google.common.collect.RangeMap;
import com.google.common.collect.TreeRangeMap;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFFileReader;
import net.maizegenetics.analysis.phg.ParseGVCF;
import net.maizegenetics.dna.map.Chromosome;
import net.maizegenetics.dna.map.GenomeSequence;
import net.maizegenetics.dna.map.GenomeSequenceBuilder;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.dna.snp.GenotypeTable;
import net.maizegenetics.dna.snp.NucleotideAlignmentConstants;
import net.maizegenetics.pangenome.api.HaplotypeNode;
import net.maizegenetics.util.Tuple;
import org.apache.log4j.Logger;
import java.io.File;
import java.util.*;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.Future;
import java.util.concurrent.atomic.LongAdder;
import java.util.stream.Collector;
import java.util.stream.Collectors;
/**
* GVCFSequence.java
*
* Simple class to read in a GVCF then get the FASTA encoded sequence taking into account the variants in the GVCF file
* Uses a RangeMap to hold the GVCF variant calls for a given position range.
* Will coalesce any overlapping regions by concatenating the calls together in order of gvcf.
*
* Created by zrm22 on 7/17/17.
*/
public class GVCFSequence implements GenomeSequence {
private static final Logger myLogger = Logger.getLogger(GVCFSequence.class);
//Setup instance variables
protected GenomeSequence myReferenceSequence;
protected RangeMap myGVCFRangeToAlleleCallMap;
//Variables to make some of the GenomeSequence methods work easier
protected Map chromLengthLookup;
protected RangeMap wholeGenomeIndexMap;
protected boolean myMissingAsRef;
public GVCFSequence() {}
/**
* Basic Constructor to create a GVCFSequence
*
* @param refFileName File name of the reference sequence in FASTA format
* @param gvcfFileName File name of the GVCF file containing the variants
*/
private GVCFSequence(String refFileName, String gvcfFileName, boolean missingAsRef) {
this(GenomeSequenceBuilder.instance(refFileName), gvcfFileName,missingAsRef);
}
/**
* Basic Constructor to create a GVCFSequence when the GenomeSequence was already created
*
* @param genomeSequence GenomeSequence object representing the reference sequence
* @param gvcfFileName File name of the GVCF file containing the variants
*/
//Second constructor to the process faster when processing multiple fastas. Create the Reference GenomeSequence once.
private GVCFSequence(GenomeSequence genomeSequence, String gvcfFileName, boolean missingAsRef) {
myLogger.info("GVCFSequence: Creating GenomeSequnce from file:");
myReferenceSequence = genomeSequence;
chromLengthLookup = setupChromLengthLookup();
wholeGenomeIndexMap = setupWholeGenomeIndexMap();
myGVCFRangeToAlleleCallMap = parseGVCFFileIntoRangeMap(gvcfFileName);
myMissingAsRef = missingAsRef;
}
/**
* Constructor to build a GVCFSequence from a List of VariantContexts
* @param genomeSequence
* @param variantContexts
* @param missingAsRef
* @param taxonName
*/
private GVCFSequence(GenomeSequence genomeSequence, List variantContexts, boolean missingAsRef, String taxonName) {
myReferenceSequence = genomeSequence;
chromLengthLookup = setupChromLengthLookup();
wholeGenomeIndexMap = setupWholeGenomeIndexMap();
myGVCFRangeToAlleleCallMap = parseVariantContextsIntoRangeMap(variantContexts, taxonName,0);
myMissingAsRef = missingAsRef;
}
private GVCFSequence(GenomeSequence genomeSequence, List variantInfos, boolean missingAsRef) {
myReferenceSequence = genomeSequence;
chromLengthLookup = setupChromLengthLookup();
wholeGenomeIndexMap = setupWholeGenomeIndexMap();
myGVCFRangeToAlleleCallMap = parseVariantInfosIntoRangeMap(variantInfos);
myMissingAsRef = missingAsRef;
}
/**
* Returns an initialized GVCFSequence given the input Reference and GVCF Files.
*
* @param refFileName File name of the Reference file used in creating the GVCF.
* @param gvcfFileName File name of the GVCF file which defines the variants of the taxon to the reference
* @return GenomeSequence which can be used to get sequence out
*/
public static GenomeSequence instance(String refFileName, String gvcfFileName) {
return new GVCFSequence(refFileName, gvcfFileName,false);
}
/**
* Returns an initialized GVCFSequence given the input Reference and GVCF Files.
* This should be used if you are running multiple GVCFs through using the same Reference File.
* Currently will speed up processing each GVCF by about a minute.
*
* @param referenceSequence GenomeSequence object already read into memory.
* @param gvcfFileName File name of the GVCF file which defines the variants of the taxon to the reference
* @return GenomeSequence which can be used to get sequence out
*/
public static GenomeSequence instance(GenomeSequence referenceSequence, String gvcfFileName) {
return new GVCFSequence(referenceSequence, gvcfFileName,false);
}
/**
* Instance method to add in missing as Ref from a file
* @param referenceSequence
* @param gvcfFileName
* @param missingAsRef
* @return
*/
public static GenomeSequence instance(GenomeSequence referenceSequence, String gvcfFileName, boolean missingAsRef) {
return new GVCFSequence(referenceSequence,gvcfFileName,missingAsRef);
}
/**
* Instance method to create a GenomeSequenc from a List of VariantContexts
* @param referenceSequence
* @param variantContexts
* @param missingAsRef
* @param taxonName
* @return
*/
public static GenomeSequence instance(GenomeSequence referenceSequence, List variantContexts, boolean missingAsRef, String taxonName) {
return new GVCFSequence(referenceSequence, variantContexts,missingAsRef,taxonName);
}
public static GenomeSequence instance(GenomeSequence referenceSequence, List variantContexts, boolean missingAsRef) {
return new GVCFSequence(referenceSequence, variantContexts,missingAsRef);
}
/**
* Method to get a map of taxon to its genome sequence for all the taxon in the vcf file
* @param referenceSequence
* @param vcfFileName
* @param missingAsRef
* @return
*/
public static Map allTaxonInstance(GenomeSequence referenceSequence, String vcfFileName, boolean missingAsRef) {
try(VCFFileReader reader = new VCFFileReader(new File(vcfFileName), false)) {
List variants = reader.iterator().stream().collect(Collectors.toList());
return reader.getFileHeader()
.getGenotypeSamples()
.stream()
.collect(Collectors.toMap(taxon -> taxon, taxon -> new GVCFSequence(referenceSequence,variants,missingAsRef,taxon)));
}
catch(Exception e) {
throw new IllegalStateException(e);
}
}
/**
* Simple method to setup the chromosome length lookup map to be used for the GenomeSequence methods
*
* @return map mapping the chromosome to its length on the ref sequence.
*/
protected Map setupChromLengthLookup() {
return myReferenceSequence.chromosomes().stream().collect(Collectors.toMap(chrom -> chrom, chrom -> myReferenceSequence.chromosomeSize(chrom)));
}
/**
* Method to setup the RangeMap containing the start and End position of each chromosome in order
* Uses a new collect method which collects the stream into a RangeMap.
* @return RangeMap containing the indices of each chromosome
*/
protected RangeMap setupWholeGenomeIndexMap() {
LongAdder genomeIndex = new LongAdder();
return chromosomes().stream().sorted()
.collect(Collector.of(TreeRangeMap::create, (tree, chrom) -> {
int length = chromLengthLookup.get(chrom);
tree.put(Range.closed(genomeIndex.longValue(), genomeIndex.longValue() + length - 1), chrom);
genomeIndex.add(length);
}, (result1, result2) -> {
result1.putAll(result2);
return result1;
}));
}
/**
* Method to take the GVCF file and parse it into a RangeMap.
* Blocks of Reference sequence have a value of 'REFRANGE' which redirects to the Reference GenomeSequence
* Single SNPs are stored as single element ranges
* Indels are stored as well. Deletions generally take up multiple reference positions
*
* @param gvcfFile fileName for the GVCF file
* @return RangeMap which holds the variant calls(or reference) for ranges of Positions specified by the gvcfFile
*/
protected RangeMap parseGVCFFileIntoRangeMap(String gvcfFile) {
RangeMap gvcfRangeMap = TreeRangeMap.create();
try {
//Walk through the gvcf file using Terry's code
//TODO replace with Streaming version when done.
Tuple, BlockingQueue>> temp = ParseGVCF.parse(gvcfFile);
List headers = temp.getX();
BlockingQueue> queue = temp.getY();
ParseGVCF.ProcessLines next = queue.take().get();
while (!next.isFinal()) {
for (ParseGVCF.GVCFLine current : next.processedLines()) {
// use current
//For each line grab the GT call and see if it should be a REFRANGE or something else(alt call)
if (current.isReferenceBlock()) {
//If it is a reference block, add REFRANGE to the RangeMap
addRange(gvcfRangeMap, Range.closed(Position.of(current.chromosome(), current.startPosition()),
Position.of(current.chromosome(), current.endPosition())), "REFRANGE");
} else {
//Check the GT call and export whatever string is there
if (current.isHomozygous()) {
//if it is homozygous we can just grab the first entry in the GT field
//not a block put in Range.closed(pos,endpos)
//Most of the time for SNPs or insertions endpos = pos.
String allele = current.genotypes().get(0);
addRange(gvcfRangeMap, Range.closed(Position.of(current.chromosome(), current.startPosition()),
Position.of(current.chromosome(), current.endPosition())), allele);
} else {
//TODO handle the hets correctly
}
}
}
next = queue.take().get();
}
} catch (Exception e) {
myLogger.debug(e.getMessage(), e);
throw new IllegalStateException("GVCF To Fasta: Problem parsing: " + gvcfFile);
}
return gvcfRangeMap;
}
/**
* Method to parse the variantContexts into rangeMap for a single taxon. This will simply call parseVariantContextsIntoRangeMap but with a gameteIndex of 0
* @param variantContexts
* @param taxonName
* @return
*/
protected RangeMap parseVariantContextsIntoRangeMap(List variantContexts, String taxonName) {
return parseVariantContextsIntoRangeMap(variantContexts,taxonName,0);
}
/**
* Method to parse the List of variantContexts into the rangemap required by this class. Allows you to build the GVCFSequence from memory.
* @param variantContexts
* @param taxonName
* @return
*/
protected RangeMap parseVariantContextsIntoRangeMap(List variantContexts, String taxonName, int gameteIndex) {
RangeMap gvcfRangeMap = TreeRangeMap.create();
try {
//Remove any vcf records where we do not have the taxon
List unknownVariantsRemovedList = variantContexts.stream()
.filter(variantContext -> variantContext.getGenotype(taxonName)!=null)
.collect(Collectors.toList());
Chromosome chr = null;
String prevContigString = null;
for(VariantContext currentVariant : unknownVariantsRemovedList) {
if(chr == null || !prevContigString.equals(currentVariant.getContig())) {
chr = Chromosome.instance(currentVariant.getContig());
prevContigString = currentVariant.getContig();
}
//Check to see if currentVariant is a reference block
//This REFRANGE should not be confused with the PHG API ReferenceRange object which represents an anchor.
//This REFRANGE is a reference block in a VCF file(with END annotation)
if(isRefRange(currentVariant)) {
addRange(gvcfRangeMap, Range.closed(Position.of(chr, currentVariant.getStart()),
Position.of(chr, currentVariant.getEnd())), "REFRANGE");
}
//If not, its either SNP, indel or straight monomorphic call
//Get the allele value from the genotype and add to the map.
else {
Genotype currentGenotype = currentVariant.getGenotype(taxonName);
addRange(gvcfRangeMap, Range.closed(Position.of(chr, currentVariant.getStart()),
Position.of(chr, currentVariant.getEnd())), currentGenotype.getAllele(gameteIndex).getBaseString());
}
}
} catch (Exception e) {
myLogger.debug(e.getMessage(), e);
throw new IllegalStateException("GVCFSequence parseVariantContextsIntoRangeMap error converting List to RangeMap for Taxon:" + taxonName);
}
return gvcfRangeMap;
}
/**
* Method to parse the List of variantContexts into the rangemap required by this class. Allows you to build the GVCFSequence from memory.
* @param variantInfos
* @return
*/
protected RangeMap parseVariantInfosIntoRangeMap(List variantInfos) {
RangeMap gvcfRangeMap = TreeRangeMap.create();
try {
Chromosome chr = null;
String prevContigString = null;
for(HaplotypeNode.VariantInfo currentVariant : variantInfos) {
if(chr == null || !prevContigString.equals(currentVariant.chromosome())) {
chr = Chromosome.instance(currentVariant.chromosome());
prevContigString = currentVariant.chromosome();
}
//Check to see if currentVariant is a reference block
//This REFRANGE should not be confused with the PHG API ReferenceRange object which represents an anchor.
//This REFRANGE is a reference block in a VCF file(with END annotation)
if(!currentVariant.isVariant()) {
addRange(gvcfRangeMap, Range.closed(Position.of(chr, currentVariant.start()),
Position.of(chr, currentVariant.end())), "REFRANGE");
}
//If not, its either SNP, indel or straight monomorphic call
//Get the allele value from the genotype and add to the map.
else {
String currentGenotype = currentVariant.genotypeString();
addRange(gvcfRangeMap, Range.closed(Position.of(chr, currentVariant.start()),
Position.of(chr, currentVariant.end())), currentGenotype);
}
}
} catch (Exception e) {
myLogger.debug(e.getMessage(), e);
throw new IllegalStateException("GVCFSequence parseVariantContextsIntoRangeMap error converting List to RangeMap");
}
return gvcfRangeMap;
}
/**
* addRange method adapted from Louis Wasserman's(author of RangeMap) answer here: https://stackoverflow.com/questions/41418416/intersecting-ranges-when-using-rangemap
* This method will concatenate the string if the range overlaps an existing range in the map. Because VCF files should be ordered(in the spec) the concatenation should be ordered correctly
*
* @param originalRangeMap rangeMap object which we want to add a new range to
* @param newRange Range object which we need to make sure the range does not collide with any ranges in originalRangeMap
* @param newCall String value for the new call associated with newRange
*/
protected void addRange(RangeMap originalRangeMap, Range newRange, String newCall) {
//Get the list of overlapping ranges with the new range
//This will just return the intersecting range and not the full range that intersects.
//ex if [1,10] is in the map and we are adding [8,12], overlaps will contain [8,10]
List, String>> overlaps = new ArrayList<>(
originalRangeMap.subRangeMap(newRange).asMapOfRanges().entrySet());
//if overlaps has length, we have to merge ranges together
if (overlaps.size() != 0) {
//because gvcf is ordered, we can just grab the first one and append the call on the end
//Because overlap just gives intersecting range, we need to get the original range map so we can union
Map.Entry, String> overlappingEntry = originalRangeMap.getEntry(overlaps.get(0).getKey().lowerEndpoint());
//Fixing overlapping bug PHG-65
if(newCall.equals("REFRANGE") && overlappingEntry.getValue().equals("REFRANGE")) {
//If both are REFRANGEs we should only store a REFRANGE
//This means they are consecutive and should be handled like originally done.
//then use the combined range and assign the call
String newMergedCall = overlappingEntry.getValue();
//putCoalescing will merge the two records together
originalRangeMap.putCoalescing(newRange, newMergedCall);
}
else if(newCall.equals("REFRANGE") && !overlappingEntry.getValue().equals("REFRANGE")) {
//Because GVCF is ordered, shift the newCall's starting position up to the overlappingEntry.key().upperEndPoint
//make sure shifting will not remove the new range(in examples where new range is 1 bp)
Position endPositionOfOverlap = overlappingEntry.getKey().upperEndpoint();
if(endPositionOfOverlap.getPosition() < newRange.upperEndpoint().getPosition()) {
Position newStartPosition = Position.of(endPositionOfOverlap.getChromosome(), endPositionOfOverlap.getPosition() + 1);
originalRangeMap.put(Range.closed(newStartPosition, newRange.upperEndpoint()), newCall);
}
//else do nothing as the 1 bp ref range was covered by a previous record(likely multi bp indel)
}
else if(!newCall.equals("REFRANGE") && overlappingEntry.getValue().equals("REFRANGE")) {
//This means the previous record was a reference range that spanned too much
//This likely does not happen much in practice
Position endPositionOfOverlap = newRange.lowerEndpoint();
//Make sure we do not have a 1bp ref range already in the map
if(endPositionOfOverlap.getPosition() > overlappingEntry.getKey().lowerEndpoint().getPosition()) {
Position newEndPosition = Position.of(endPositionOfOverlap.getChromosome(), endPositionOfOverlap.getPosition() - 1);
//Remove the current record
originalRangeMap.remove(overlappingEntry.getKey());
//Add in the refrange with new bounds and the call
originalRangeMap.put(Range.closed(overlappingEntry.getKey().lowerEndpoint(), newEndPosition), overlappingEntry.getValue());
originalRangeMap.put(newRange,newCall);
}
else {
//We need to remove the previous entry and add in the new one as it overrides the REF call
originalRangeMap.remove(overlappingEntry.getKey());
originalRangeMap.put(newRange,newCall);
}
}
else {
//This means they are consecutive and should be handled like originally done.
//then use the combined range and assign the call
String newMergedCall = overlappingEntry.getValue() + newCall;
//Changing the above to use RangeMap.putCoalescing
//First update the existing Range Key with the new value
//Second coalesce by the new range
//The range map will merge the two overlapping ranges and will use newMergedCall as the value
originalRangeMap.put(overlappingEntry.getKey(), newMergedCall);
originalRangeMap.putCoalescing(newRange, newMergedCall);
}
}
//else just add the range and value to the map
else {
originalRangeMap.put(newRange, newCall);
}
}
/**
* Simple method to tell if a variantContext is in a reference block in the VCF file.
*
* This RefRange is not to be confused with the ReferenceRange object in the PHG api.
* @param currentVariant
* @return
*/
protected boolean isRefRange(VariantContext currentVariant) {
//If the record has a END annotation and the length of the allele ==1 it is ref range
return currentVariant.hasAttribute("END") && currentVariant.getReference().getBaseString().length()==1;
}
/**
* Simple method to determine if the position in the rangeMap is a Reference Block
*
* @param position Position object we want to check
* @return true if RangeMap.get(position) is REFRANGE, false if not.
*/
protected boolean isPositionRefBlock(Position position) {
if (myGVCFRangeToAlleleCallMap.get(position).equals("REFRANGE")) {
return true;
} else {
return false;
}
}
/**
* Simple method to check to see if the position is currently in the map.
*
* @param position Position object we want to check
* @return true if RangeMap.get(position) != null
*/
protected boolean isPositionInMap(Position position) {
if (myGVCFRangeToAlleleCallMap.get(position) == null) {
return false;
} else {
return true;
}
}
//Implementing the methods from GenomeSequence
@Override
public Set chromosomes() {
return myReferenceSequence.chromosomes();
}
@Override
public byte[] chromosomeSequence(Chromosome chrom) {
return chromosomeSequence(chrom, 1, myReferenceSequence.chromosomeSize(chrom));
}
@Override
public byte[] chromosomeSequence(Chromosome chrom, int startSite, int endSite) {
return NucleotideAlignmentConstants.convertHaplotypeStringToAlleleByteArray(genotypeAsString(chrom, startSite, endSite));
}
@Override
public byte[] genomeSequence(long startSite, long lastSite) {
return NucleotideAlignmentConstants.convertHaplotypeStringToAlleleByteArray(genomeSequenceAsString(startSite, lastSite));
}
@Override
public String genomeSequenceAsString(long startSite, long lastSite) {
StringBuilder sequenceBuilder = new StringBuilder();
long currentSiteToGet = startSite;
while (currentSiteToGet < lastSite) {
Map.Entry, Chromosome> rangeChromEntry = wholeGenomeIndexMap.getEntry(currentSiteToGet);
int chrStart = (int) (currentSiteToGet - rangeChromEntry.getKey().lowerEndpoint());
int chrLast = (int) Math.min(rangeChromEntry.getKey().upperEndpoint() - rangeChromEntry.getKey().lowerEndpoint(), lastSite - rangeChromEntry.getKey().lowerEndpoint());
String chromSeq = genotypeAsString(rangeChromEntry.getValue(), chrStart + 1, chrLast + 1); //+1 for 0 based genome, 1 based chromosomes
sequenceBuilder.append(chromSeq);
currentSiteToGet += chromSeq.length();
}
return sequenceBuilder.toString();
}
@Override
public Map> fullRefCoordinateToChromCoordinate(ArrayList coordinates) {
return myReferenceSequence.fullRefCoordinateToChromCoordinate(coordinates);
}
@Override
public int chromosomeSize(Chromosome chromosome) {
return myReferenceSequence.chromosomeSize(chromosome);
}
@Override
public long genomeSize() {
return myReferenceSequence.genomeSize();
}
@Override
public int numberOfChromosomes() {
return myReferenceSequence.numberOfChromosomes();
}
@Override
public byte genotype(Chromosome chrom, int position) {
return genotype(chrom, Position.of(chrom.getName(), position));
}
@Override
public byte genotype(Chromosome chrom, Position positionObject) {
return NucleotideAlignmentConstants.getNucleotideAlleleByte(genotypeAsString(chrom, positionObject));
}
@Override
public String genotypeAsString(Chromosome chrom, int position) {
return genotypeAsString(chrom, Position.of(chrom, position));
}
@Override
public String genotypeAsString(Chromosome chrom, Position positionObject) {
String currentValue = myGVCFRangeToAlleleCallMap.get(positionObject);
if (currentValue == null) {
if(myMissingAsRef) {
return myReferenceSequence.genotypeAsString(chrom,positionObject);
}
else {
return GenotypeTable.UNKNOWN_ALLELE_STR;
}
}
if (currentValue.equals("REFRANGE")) {
//use the genomeSequence to get the call
return myReferenceSequence.genotypeAsString(positionObject.getChromosome(), positionObject.getPosition());
} else {
return currentValue;
}
}
@Override
public String genotypeAsString(Chromosome chrom, int startSite, int endSite) {
StringBuilder sequence = new StringBuilder();
for (int currentPosition = startSite; currentPosition <= endSite; currentPosition++) {
Position currentPositionObject = Position.of(chrom, currentPosition);
Map.Entry, String> currentEntry = myGVCFRangeToAlleleCallMap.getEntry(currentPositionObject);
//Check to make sure we have an entry
if(currentEntry!=null) {
if (!currentEntry.getValue().equals("REFRANGE")) {
//If it is not a refRange, just get the GVCF allele variant
sequence.append(currentEntry.getValue());
}
else {
//Figure out what the start and end point should be if we have a refBlock so we know how much reference we should extract
//Need to make sure we stay within the requested boundaries as the RefBlock can go past.
int resolvedStartSite = Math.max(currentEntry.getKey().lowerEndpoint().getPosition(),currentPosition);
int resolvedEndSite = Math.min(currentEntry.getKey().upperEndpoint().getPosition(), endSite);
//Extract the reference for the region
sequence.append(myReferenceSequence.genotypeAsString(chrom, resolvedStartSite, resolvedEndSite ));
}
//Slide up the currentPosition pointer
currentPosition = currentEntry.getKey().upperEndpoint().getPosition();
}
else {
//This means that it is missing for this site.
if(myMissingAsRef) {
//If this option is set, get the reference allele for this site
sequence.append(myReferenceSequence.genotypeAsString(chrom,currentPosition));
}
else {
//Else, put a single N.
sequence.append(GenotypeTable.UNKNOWN_ALLELE_STR);
}
}
}
return sequence.toString();
}
}