net.maizegenetics.pangenome.hapcollapse.FillIndelsIntoConsensus Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of phg Show documentation
Show all versions of phg Show documentation
PHG - Practical Haplotype Graph
package net.maizegenetics.pangenome.hapcollapse;
import com.google.common.collect.*;
import htsjdk.variant.variantcontext.*;
import net.maizegenetics.dna.map.GenomeSequence;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.dna.snp.NucleotideAlignmentConstants;
import net.maizegenetics.pangenome.api.ReferenceRange;
import net.maizegenetics.util.Tuple;
import org.apache.log4j.Logger;
import java.util.*;
import java.util.stream.Collector;
import java.util.stream.Collectors;
/**
* This class is created to fill in the agreeing indels removed from the consensus haplotypes during the merge process. There are various merge rules for when the indels disagree.
*/
public class FillIndelsIntoConsensus {
private static final Logger myLogger = Logger.getLogger(FillIndelsIntoConsensus.class);
public static enum INDEL_MERGE_RULE {setToN, mostCommonHaplotype, mostCommonHaplotypesAsHet, reference, longestIndel}
/**
* Method to add in the indels which were removed for the merging process.
* The indels need to be readded in due to the filtering process in MergeGVCFPlugin.
* There we removed all sites where there was an indel in any of the taxa. Even if it was just one.
* Because of this, we likely have a lot of reference alleles missing from the consensus for the majority of the taxa.
* This method will add in any calls which are in agreement by all the taxon in the cluster or in the case of disagreement it will use the INDEL_MERGE_RULE to determine what variantContext record to put back in.
*
* @param range
* @param refSequence
* @param consensusSequences
* @param rawConsensusWithIndels
* @param minComparableIndels
* @param mergeRule
* @return
*/
public static List> addInIndels(ReferenceRange range, GenomeSequence refSequence, List> consensusSequences, List> rawConsensusWithIndels, int minComparableIndels, INDEL_MERGE_RULE mergeRule) {
List> consensusWithIndels = new ArrayList<>();
Multimap taxonClusterNames = extractTaxonClusters(consensusSequences);
Map combinedNameToIndex = createTaxonNameMap(consensusSequences);
//Get a list of positions which are not indels stored in the consensus sequences
Set nonIndelPositions = extractNonIndelPositions(range,consensusSequences);
//Get a list of all global positions which contain at least one indel
Set globalIndelPositions = extractIndelPositions(range,nonIndelPositions);
List globalIndelPositionsSorted = globalIndelPositions.stream().sorted().collect(Collectors.toList());
//Get a multimap of all indel positions in the raw haplotypes
Multimap indelVariantsMappedByTaxa = rawConsensusWithIndels.stream()
.filter(variantContexts -> variantContexts.size()>0)
.flatMap(variantContexts -> variantContexts.stream())
.filter(variantContext -> (variantContext.isIndel() || variantContext.isVariant()) && !MergeGVCFUtils.isRefBlock(variantContext)) //Here we treat indels and variants the same...
.collect(Collector.of(ArrayListMultimap::create,
(map, variantContext) -> map.put(variantContext.getSampleNamesOrderedByName().get(0), variantContext),
(map1, map2) -> {
map1.putAll(map2);
return map1;
}));
//Loop through each clustered taxon name in the key set
for(String combinedTaxaName : taxonClusterNames.keySet()) {
//Extract the clusters where the Indels are missing
Map nonIndelCluster = computeNonIndelClusterMapping(consensusSequences.get(combinedNameToIndex.get(combinedTaxaName)));
//Create a mapping of all the positions where there is an indel
Map consensusIndels = new HashMap<>();
Multimap indelPositions = taxonClusterNames.get(combinedTaxaName).stream()
.flatMap(taxonName -> indelVariantsMappedByTaxa.get(taxonName).stream())
.collect(Collector.of(ArrayListMultimap::create,
(map, variantContext) -> map.put(Position.of(variantContext.getContig(), variantContext.getStart()),variantContext),
(map1, map2) -> {
map1.putAll(map2);
return map1;
}));
//Now we need to add in any reference base pairs that agree.
Map> refBlockPositions = extractRefBlocks(rawConsensusWithIndels,taxonClusterNames,combinedTaxaName);
RangeSet alreadyCoveredPositions = TreeRangeSet.create();
//Loop through all the known indels and see if we should add any of them back into the tables.
for(Position indelPosition :globalIndelPositionsSorted) {
if(alreadyCoveredPositions.contains(indelPosition)) {
continue;//means we have already covered this base pair in a previously added entry
}
List indels = new ArrayList<>();
if(indelPositions.containsKey(indelPosition)) {
indels = indelPositions.get(indelPosition).stream().collect(Collectors.toList());
}
int numberOfTaxaCoveringRefRange = countRefRanges(refBlockPositions,taxonClusterNames.get(combinedTaxaName),indelPosition);
// Check to make sure we have at least the number of indels is at least bigger than the minimumComparableIndels
if(indels.size() + numberOfTaxaCoveringRefRange >= minComparableIndels) {
Multiset> genotypeStrings = indels.stream()
.map(variantContext -> new Tuple(variantContext.getReference().getBaseString(),variantContext.getGenotype(0).getGenotypeString()))
.collect(Collector.of(HashMultiset::create,
(set,genotypeString)->set.add(genotypeString),
(set1,set2)->{
set1.addAll(set2);
return set1;
}));
long numberOfObservedAlleles = genotypeStrings.elementSet().size();
//We need to add one if we have any taxa which have a reference range
if(numberOfTaxaCoveringRefRange != 0) {
numberOfObservedAlleles++;
}
//If they are all a single indel or all reference calls, we add back the alleles immediately
if(numberOfObservedAlleles == 1) {
//If we only have reference calls make a reference variant call
if(numberOfTaxaCoveringRefRange != 0) {
//Now we need to create a new VariantContext object and add it to the correct consensus sequence.
VariantContext newVariant = createReferenceVariantContext(refSequence,indelPosition,combinedTaxaName);
consensusIndels.put(indelPosition,createReferenceVariantContext(refSequence,indelPosition,combinedTaxaName));
alreadyCoveredPositions.add(Range.closed(Position.of(newVariant.getContig(),newVariant.getStart()),
Position.of(newVariant.getContig(),newVariant.getEnd())));
}
else {
//else if we only have a variant/indel
//Now we need to create a new VariantContext object and add it to the correct consensus sequence.
Allele allele = indels.get(0).getGenotype(0).getAllele(0);
Genotype renamedGenotype = new GenotypeBuilder(indels.get(0).getGenotype(0))
.name(combinedTaxaName).alleles(Arrays.asList(allele,allele)).make();
VariantContextBuilder newVariant = new VariantContextBuilder(indels.get(0)).noGenotypes().genotypes(renamedGenotype);
consensusIndels.put(indelPosition,newVariant.make());
alreadyCoveredPositions.add(Range.closed(Position.of(indels.get(0).getContig(),indels.get(0).getStart()),
Position.of(indels.get(0).getContig(),indels.get(0).getEnd())));
}
}
else {
//Handle different merge cases
if(mergeRule==INDEL_MERGE_RULE.setToN) {
//Set to N
//Do nothing as it will essentially set it to missing
alreadyCoveredPositions.add(Range.closed(Position.of(indels.get(0).getContig(),indels.get(0).getStart()),
Position.of(indels.get(0).getContig(),indels.get(0).getEnd())));
}
else if(mergeRule==INDEL_MERGE_RULE.mostCommonHaplotype) {
//Most # of taxa
int highestCount = 0;
Tuple elementWithHighestCount = new Tuple<>("","");
for(Tuple genotypeString : genotypeStrings.elementSet()) {
if(highestCount < genotypeStrings.count(genotypeString)) {
highestCount = genotypeStrings.count(genotypeString);
elementWithHighestCount = genotypeString;
}
}
if(highestCount < numberOfTaxaCoveringRefRange) {
consensusIndels.put(indelPosition,createReferenceVariantContext(refSequence,indelPosition,combinedTaxaName));
alreadyCoveredPositions.add(Range.closed(indelPosition,indelPosition));
}
else if(!elementWithHighestCount.getX().equals("") && !elementWithHighestCount.getY().equals("")) {
//find a VariantContext holding the genotypeString we are looking for
for(int i = 0; i < indels.size(); i++) {
if( indels.get(i).getReference().getBaseString().equals(elementWithHighestCount.getX()) &&
indels.get(i).getGenotype(0).getGenotypeString().equals(elementWithHighestCount.getY())) {
Allele allele = indels.get(i).getGenotype(0).getAllele(0);
Genotype renamedGenotype = new GenotypeBuilder(indels.get(i).getGenotype(0))
.name(combinedTaxaName).alleles(Arrays.asList(allele,allele)).make();
VariantContextBuilder newVariant = new VariantContextBuilder(indels.get(i)).noGenotypes().genotypes(renamedGenotype);
consensusIndels.put(indelPosition,newVariant.make());
alreadyCoveredPositions.add(Range.closed(Position.of(indels.get(i).getContig(),indels.get(i).getStart()),
Position.of(indels.get(i).getContig(),indels.get(i).getEnd())));
break;
}
}
}
}
else if(mergeRule==INDEL_MERGE_RULE.mostCommonHaplotypesAsHet) { //TODO This is a mess, clean this up
//TODO evaluate if this should be removed as consensus haplotypes should not contain indels
//Top 2 taxa become het
Set> genotypesSortedByCount = Multisets.copyHighestCountFirst(genotypeStrings).elementSet();
List> genotypesSortedByCountList = genotypesSortedByCount.stream().collect(Collectors.toList());
//We know there must be at least 2 possibilities,
//If genotypesSortedByCount.size() == 1 we need to use the ref
if(genotypesSortedByCount.size() == 1) {
boolean refFirst = (numberOfTaxaCoveringRefRange >= genotypeStrings.count(genotypesSortedByCountList.get(0)))?true:false;
for(int i = 0; i < indels.size(); i++) {
if( indels.get(i).getReference().getBaseString().equals(genotypesSortedByCountList.get(0).getX()) &&
indels.get(i).getGenotype(0).getGenotypeString().equals(genotypesSortedByCountList.get(0).getY())) {
Allele refAllele = indels.get(i).getReference();
Allele altAllele = indels.get(i).getGenotype(0).getAllele(0);
GenotypeBuilder renamedGenoBuilder = new GenotypeBuilder(indels.get(i).getGenotype(0)).name(combinedTaxaName);
if(refFirst) {
renamedGenoBuilder = renamedGenoBuilder.alleles(Arrays.asList(refAllele,altAllele));
}
else {
renamedGenoBuilder = renamedGenoBuilder.alleles(Arrays.asList(altAllele,refAllele));
}
Genotype renamedGenotype = renamedGenoBuilder.make();
VariantContextBuilder newVariant = new VariantContextBuilder(indels.get(i)).noGenotypes().genotypes(renamedGenotype);
consensusIndels.put(indelPosition,newVariant.make());
alreadyCoveredPositions.add(Range.closed(Position.of(indels.get(i).getContig(),indels.get(i).getStart()),
Position.of(indels.get(i).getContig(),indels.get(i).getEnd())));
break;
}
}
}
else {
//Otherwise we grab the first 2 and see if the ref is larger than any of them.
if(numberOfTaxaCoveringRefRange >= genotypeStrings.count(genotypesSortedByCountList.get(0))) {
//The reference bp has the highest depth.
VariantContext newReferenceHetVariant = createReferenceHetVariantContext(indelPosition,combinedTaxaName,genotypesSortedByCountList.get(0).getX(),genotypesSortedByCountList.get(0).getY(),true);
consensusIndels.put(indelPosition,newReferenceHetVariant);
alreadyCoveredPositions.add(Range.closed(Position.of(newReferenceHetVariant.getContig(),newReferenceHetVariant.getStart()),
Position.of(newReferenceHetVariant.getContig(),newReferenceHetVariant.getEnd())));
}
else if( numberOfObservedAlleles >= genotypeStrings.count(genotypesSortedByCountList.get(1))) {
VariantContext newReferenceHetVariant =createReferenceHetVariantContext(indelPosition,combinedTaxaName,genotypesSortedByCountList.get(0).getX(),genotypesSortedByCountList.get(0).getY(),false);
consensusIndels.put(indelPosition,newReferenceHetVariant);
alreadyCoveredPositions.add(Range.closed(Position.of(newReferenceHetVariant.getContig(),newReferenceHetVariant.getStart()),
Position.of(newReferenceHetVariant.getContig(),newReferenceHetVariant.getEnd())));
}
else {
//Use both alternate alleles
Allele refAllele = Allele.create(genotypesSortedByCountList.get(0).getX(),true);
Allele altAllele1 = Allele.create(genotypesSortedByCountList.get(0).getY(),false);
Allele altAllele2 = Allele.create(genotypesSortedByCountList.get(1).getY(),false);
for(int i = 0; i < indels.size(); i++) {
if( indels.get(i).getReference().getBaseString().equals(genotypesSortedByCountList.get(0).getX()) &&
indels.get(i).getGenotype(0).getGenotypeString().equals(genotypesSortedByCountList.get(0).getY())) {
GenotypeBuilder renamedGenoBuilder = new GenotypeBuilder(indels.get(i).getGenotype(0)).name(combinedTaxaName)
.alleles(Arrays.asList(altAllele1,altAllele2));
Genotype renamedGenotype = renamedGenoBuilder.make();
VariantContextBuilder newVariantBuilder = new VariantContextBuilder(indels.get(i))
.alleles(Arrays.asList(refAllele,altAllele1,altAllele2))
.noGenotypes().genotypes(renamedGenotype);
VariantContext newVariant = newVariantBuilder.make();
consensusIndels.put(indelPosition,newVariant);
alreadyCoveredPositions.add(Range.closed(Position.of(newVariant.getContig(),newVariant.getStart()),
Position.of(newVariant.getContig(),newVariant.getEnd())));
break;
}
}
}
}
}
else if(mergeRule==INDEL_MERGE_RULE.reference) {
//Reference only
//Now we need to create a new VariantContext object and add it to the correct consensus sequence.
VariantContext newVariant = createReferenceVariantContext(refSequence,indelPosition,combinedTaxaName);
consensusIndels.put(indelPosition,newVariant);
alreadyCoveredPositions.add(Range.closed(Position.of(newVariant.getContig(),newVariant.getStart()),
Position.of(newVariant.getContig(),newVariant.getEnd())));
}
else if(mergeRule==INDEL_MERGE_RULE.longestIndel) {
//TODO Longest indel
//Need to check both insertions and deletions
myLogger.warn("INDEL_MERGE_RULE.longestIndel is not implemented yet.");
throw new IllegalStateException("INDEL_MERGE_RULE.longestIndel is not implemented yet. Please select a merge rule that is implemented");
}
}
}
}
//Add back in the indels to the list
List variantsWithIndels = new ArrayList<>();
for(int i = range.start(); i <= range.end(); i++) {
Position currentPosition = Position.of(range.chromosome(),i);
if(consensusIndels.containsKey(currentPosition)) {
variantsWithIndels.add(consensusIndels.get(currentPosition));
}
else if(nonIndelCluster.containsKey(currentPosition)){
variantsWithIndels.add(nonIndelCluster.get(currentPosition));
}
}
consensusWithIndels.add(variantsWithIndels);
}
return consensusWithIndels;
}
/**
* Simple method to create a mapping of non indel positions
* @param variants
* @return
*/
private static Map computeNonIndelClusterMapping(List variants) {
Map nonIndelMap = new HashMap<>();
for(VariantContext variant : variants) {
nonIndelMap.put(Position.of(variant.getContig(),variant.getStart()),variant);
}
return nonIndelMap;
}
/**
* Method to get a TaxonName stored in the variants to all the raw consensus names
* @param consensusSequences
* @return
*/
private static Multimap extractTaxonClusters(List> consensusSequences) {
Multimap clusterMapping = ArrayListMultimap.create();
for(List currentCluster : consensusSequences) {
String clusterName = currentCluster.get(0).getGenotype(0).getSampleName();
String[] individualTaxonNames = clusterName.split(":");
for(String currentTaxon:individualTaxonNames) {
if(currentTaxon.endsWith("_Haplotype_Caller_0")) {
clusterMapping.put(clusterName,currentTaxon.substring(0,currentTaxon.length()-19));
}
else if(currentTaxon.endsWith("_0")) {
clusterMapping.put(clusterName,currentTaxon.substring(0,currentTaxon.length()-2));
}
else {
clusterMapping.put(clusterName,currentTaxon);
}
}
}
return clusterMapping;
}
/**
* Method to extract out the ranges of positions which contain a reference call for each taxon.
* We originally tried using a Multiset of Positions, but it was not performant
* @param rawHaplotypeSequences
* @param taxonClusterNames
* @param combinedTaxaName
* @return
*/
private static Map> extractRefBlocks(List> rawHaplotypeSequences, Multimap taxonClusterNames, String combinedTaxaName ) {
Map> refBlocks = new HashMap<>();
//Loop through each raw List of Variant Contexts
for(List currentTaxonVariants : rawHaplotypeSequences) {
//Get the current taxon name and use check to see if it is one of the taxon in our current cluster
String currentTaxon = currentTaxonVariants.get(0).getGenotype(0).getSampleName();
if (taxonClusterNames.get(combinedTaxaName).contains(currentTaxon)) {
//Loop through each variant for this taxon and if it is a ref block add it to the RangeSet
for (VariantContext variantContext : currentTaxonVariants) {
if (MergeGVCFUtils.isRefBlock(variantContext)) {
//Check to see if we already have a range set created, if not create a new one.
if(!refBlocks.keySet().contains(currentTaxon)) {
refBlocks.put(currentTaxon,TreeRangeSet.create());
}
refBlocks.get(currentTaxon).add(Range.closed(Position.of(variantContext.getContig(),variantContext.getStart()),Position.of(variantContext.getContig(),variantContext.getEnd())));
}
}
}
}
return refBlocks;
}
/**
* Simple method to count up the reference ranges for a clusters taxon names for a given position.
* @param refBlocks
* @param taxonNames
* @param currentPosition
* @return
*/
private static int countRefRanges(Map> refBlocks, Collection taxonNames, Position currentPosition) {
int count = 0;
//Loop through the taxon names
for(String currentTaxon : taxonNames) {
//If the position is in the RangeSet of refBlocks for this taxa increment counter.
if(refBlocks.get(currentTaxon).contains(currentPosition)) {
count++;
}
}
return count;
}
/**
* Method to setup a Taxon Name Map for easy indexing
* @param consensusSequences
* @return
*/
private static Map createTaxonNameMap(List> consensusSequences) {
Map taxonToIndexMap = new HashMap<>();
for(int i = 0; i < consensusSequences.size(); i++) {
String clusterName = consensusSequences.get(i).get(0).getGenotype(0).getSampleName();
taxonToIndexMap.put(clusterName,i);
}
return taxonToIndexMap;
}
/**
* Extract a set of positions which were not filtered out due to indels
* @param range
* @param consensusSequences
* @return
*/
private static Set extractNonIndelPositions(ReferenceRange range, List> consensusSequences) {
Set nonIndelPositions = new HashSet<>();
for(List currentConsensus : consensusSequences) {
for(VariantContext variantContext : currentConsensus) {
for(int i = variantContext.getStart(); i <= variantContext.getEnd(); i++) {
nonIndelPositions.add(Position.of(variantContext.getContig(),i));
}
}
}
//Loop through the set
//Remove any Positions where the next Position is not in the set
//Basically remove the previous base pair for the indels
Set nonIndelPositionsRemovePrevBP = new HashSet<>();
for(Position currentPosition : nonIndelPositions) {
if(currentPosition.getPosition() extractIndelPositions(ReferenceRange range, Set nonIndelPositions) {
Set indels = new HashSet<>();
for(int i = range.start(); i <= range.end(); i++) {
Position currentPosition = Position.of(range.chromosome(),i);
if(!nonIndelPositions.contains(currentPosition)) {
indels.add(currentPosition);
}
}
return indels;
}
/**
* Method to create a reference VariantContext
* @param refSequence
* @param indelPosition
* @param combinedTaxaName
* @return
*/
private static VariantContext createReferenceVariantContext(GenomeSequence refSequence, Position indelPosition, String combinedTaxaName) {
String alleleString = NucleotideAlignmentConstants.getHaplotypeNucleotide(refSequence.genotype(indelPosition.getChromosome(),indelPosition.getPosition()));
Allele allele = Allele.create(alleleString,true);
Genotype renamedGenotype = new GenotypeBuilder()
.name(combinedTaxaName).alleles(Arrays.asList(allele,allele)).make();
VariantContextBuilder newVariant = new VariantContextBuilder()
.chr(indelPosition.getChromosome().getName())
.start(indelPosition.getPosition())
.stop(indelPosition.getPosition())
.alleles(Arrays.asList(allele))
.noGenotypes()
.genotypes(renamedGenotype);
return newVariant.make();
}
/**
* Method to make a het variant context
* TODO remove this later as we should not have hets in the haplotype
* @param indelPosition
* @param combinedTaxaName
* @param refAlleleString
* @param altAlleleString
* @param refFirst
* @return
*/
private static VariantContext createReferenceHetVariantContext(Position indelPosition, String combinedTaxaName,String refAlleleString, String altAlleleString, boolean refFirst) {
System.out.println(indelPosition);
Allele refAllele = Allele.create(refAlleleString,true);
Allele altAllele = Allele.create(altAlleleString, false);
GenotypeBuilder gtb = new GenotypeBuilder().name(combinedTaxaName);
List alleleList = new ArrayList<>();
if(refAlleleString.equals(altAlleleString)) {
gtb = gtb.alleles(Arrays.asList(refAllele));
alleleList.add(refAllele);
}
else if(refFirst) {
gtb = gtb.alleles(Arrays.asList(refAllele,altAllele));
alleleList.add(refAllele);
alleleList.add(altAllele);
}
else {
gtb = gtb.alleles(Arrays.asList(altAllele,refAllele));
alleleList.add(refAllele);
alleleList.add(altAllele);
}
Genotype renamedGenotype = gtb.make();
VariantContextBuilder newVariant = new VariantContextBuilder()
.chr(indelPosition.getChromosome().getName())
.start(indelPosition.getPosition())
.stop(indelPosition.getPosition()+refAlleleString.length()-1)
.alleles(alleleList)
.noGenotypes()
.genotypes(renamedGenotype);
return newVariant.make();
}
}