net.maizegenetics.pangenome.hapcollapse.FillIndelsIntoConsensus Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of phg Show documentation
PHG - Practical Haplotype Graph
There is a newer version: 1.10
package net.maizegenetics.pangenome.hapcollapse;

import com.google.common.collect.*;
import htsjdk.variant.variantcontext.*;
import net.maizegenetics.dna.map.GenomeSequence;
import net.maizegenetics.dna.map.Position;
import net.maizegenetics.dna.snp.NucleotideAlignmentConstants;
import net.maizegenetics.pangenome.api.ReferenceRange;
import net.maizegenetics.util.Tuple;
import org.apache.log4j.Logger;

import java.util.*;
import java.util.stream.Collector;
import java.util.stream.Collectors;

/**
 * This class is created to fill in the agreeing indels removed from the consensus haplotypes during the merge process.  There are various merge rules for when the indels disagree.
 */
public class FillIndelsIntoConsensus {

    private static final Logger myLogger = Logger.getLogger(FillIndelsIntoConsensus.class);

    public static enum INDEL_MERGE_RULE {setToN, mostCommonHaplotype, mostCommonHaplotypesAsHet, reference, longestIndel}


    /**
     * Method to add in the indels which were removed for the merging process.
     * The indels need to be readded in due to the filtering process in MergeGVCFPlugin.
     * There we removed all sites where there was an indel in any of the taxa.  Even if it was just one.
     * Because of this, we likely have a lot of reference alleles missing from the consensus for the majority of the taxa.
     * This method will add in any calls which are in agreement by all the taxon in the cluster or in the case of disagreement it will use the INDEL_MERGE_RULE to determine what variantContext record to put back in.
     *
     * @param range
     * @param refSequence
     * @param consensusSequences
     * @param rawConsensusWithIndels
     * @param minComparableIndels
     * @param mergeRule
     * @return
     */
    public static List> addInIndels(ReferenceRange range, GenomeSequence refSequence, List> consensusSequences, List> rawConsensusWithIndels, int minComparableIndels, INDEL_MERGE_RULE mergeRule) {
        List> consensusWithIndels = new ArrayList<>();
        Multimap taxonClusterNames = extractTaxonClusters(consensusSequences);

        Map combinedNameToIndex = createTaxonNameMap(consensusSequences);

        //Get a list of positions which are not indels stored in the consensus sequences
        Set nonIndelPositions = extractNonIndelPositions(range,consensusSequences);

        //Get a list of all global positions which contain at least one indel
        Set globalIndelPositions = extractIndelPositions(range,nonIndelPositions);


        List globalIndelPositionsSorted = globalIndelPositions.stream().sorted().collect(Collectors.toList());

        //Get a multimap of all indel positions in the raw haplotypes
        Multimap indelVariantsMappedByTaxa = rawConsensusWithIndels.stream()
                .filter(variantContexts -> variantContexts.size()>0)
                .flatMap(variantContexts -> variantContexts.stream())
                .filter(variantContext -> (variantContext.isIndel() || variantContext.isVariant()) && !MergeGVCFUtils.isRefBlock(variantContext)) //Here we treat indels and variants the same...
                .collect(Collector.of(ArrayListMultimap::create,
                        (map, variantContext) -> map.put(variantContext.getSampleNamesOrderedByName().get(0), variantContext),
                        (map1, map2) -> {
                            map1.putAll(map2);
                            return map1;
                        }));


        //Loop through each clustered taxon name in the key set
        for(String combinedTaxaName : taxonClusterNames.keySet()) {
           //Extract the clusters where the Indels are missing
            Map nonIndelCluster = computeNonIndelClusterMapping(consensusSequences.get(combinedNameToIndex.get(combinedTaxaName)));

            //Create a mapping of all the positions where there is an indel
            Map consensusIndels = new HashMap<>();
            Multimap indelPositions = taxonClusterNames.get(combinedTaxaName).stream()
                    .flatMap(taxonName -> indelVariantsMappedByTaxa.get(taxonName).stream())
                    .collect(Collector.of(ArrayListMultimap::create,
                            (map, variantContext) -> map.put(Position.of(variantContext.getContig(), variantContext.getStart()),variantContext),
                            (map1, map2) -> {
                                map1.putAll(map2);
                                return map1;
                            }));


            //Now we need to add in any reference base pairs that agree.
            Map> refBlockPositions =  extractRefBlocks(rawConsensusWithIndels,taxonClusterNames,combinedTaxaName);

            RangeSet alreadyCoveredPositions = TreeRangeSet.create();
            //Loop through all the known indels and see if we should add any of them back into the tables.
            for(Position indelPosition :globalIndelPositionsSorted) {
                if(alreadyCoveredPositions.contains(indelPosition)) {
                    continue;//means we have already covered this base pair in a previously added entry
                }
                List indels = new ArrayList<>();
                if(indelPositions.containsKey(indelPosition)) {
                    indels = indelPositions.get(indelPosition).stream().collect(Collectors.toList());
                }

                int numberOfTaxaCoveringRefRange = countRefRanges(refBlockPositions,taxonClusterNames.get(combinedTaxaName),indelPosition);

                // Check to make sure we have at least the number of indels is at least bigger than the minimumComparableIndels
                if(indels.size() + numberOfTaxaCoveringRefRange >= minComparableIndels) {
                    Multiset> genotypeStrings = indels.stream()
                            .map(variantContext -> new Tuple(variantContext.getReference().getBaseString(),variantContext.getGenotype(0).getGenotypeString()))
                            .collect(Collector.of(HashMultiset::create,
                                    (set,genotypeString)->set.add(genotypeString),
                                    (set1,set2)->{
                                        set1.addAll(set2);
                                        return set1;
                                    }));

                    long numberOfObservedAlleles = genotypeStrings.elementSet().size();
                    //We need to add one if we have any taxa which have a reference range
                    if(numberOfTaxaCoveringRefRange != 0) {
                        numberOfObservedAlleles++;
                    }

                    //If they are all a single indel or all reference calls, we add back the alleles immediately
                    if(numberOfObservedAlleles == 1) {
                        //If we only have reference calls make a reference variant call
                        if(numberOfTaxaCoveringRefRange != 0) {
                            //Now we need to create a new VariantContext object and add it to the correct consensus sequence.
                            VariantContext newVariant = createReferenceVariantContext(refSequence,indelPosition,combinedTaxaName);
                            consensusIndels.put(indelPosition,createReferenceVariantContext(refSequence,indelPosition,combinedTaxaName));
                            alreadyCoveredPositions.add(Range.closed(Position.of(newVariant.getContig(),newVariant.getStart()),
                                    Position.of(newVariant.getContig(),newVariant.getEnd())));
                        }
                        else {
                            //else if we only have a variant/indel
                            //Now we need to create a new VariantContext object and add it to the correct consensus sequence.
                            Allele allele = indels.get(0).getGenotype(0).getAllele(0);
                            Genotype renamedGenotype = new GenotypeBuilder(indels.get(0).getGenotype(0))
                                    .name(combinedTaxaName).alleles(Arrays.asList(allele,allele)).make();
                            VariantContextBuilder newVariant = new VariantContextBuilder(indels.get(0)).noGenotypes().genotypes(renamedGenotype);
                            consensusIndels.put(indelPosition,newVariant.make());
                            alreadyCoveredPositions.add(Range.closed(Position.of(indels.get(0).getContig(),indels.get(0).getStart()),
                                                                    Position.of(indels.get(0).getContig(),indels.get(0).getEnd())));
                        }
                    }
                    else {
                        //Handle different merge cases

                        if(mergeRule==INDEL_MERGE_RULE.setToN) {
                            //Set to N
                            //Do nothing as it will essentially set it to missing
                            alreadyCoveredPositions.add(Range.closed(Position.of(indels.get(0).getContig(),indels.get(0).getStart()),
                                    Position.of(indels.get(0).getContig(),indels.get(0).getEnd())));

                        }
                        else if(mergeRule==INDEL_MERGE_RULE.mostCommonHaplotype) {
                            //Most # of taxa
                            int highestCount = 0;
                            Tuple elementWithHighestCount = new Tuple<>("","");
                            for(Tuple genotypeString : genotypeStrings.elementSet()) {
                                if(highestCount < genotypeStrings.count(genotypeString)) {
                                    highestCount = genotypeStrings.count(genotypeString);
                                    elementWithHighestCount = genotypeString;
                                }
                            }

                            if(highestCount < numberOfTaxaCoveringRefRange) {
                                consensusIndels.put(indelPosition,createReferenceVariantContext(refSequence,indelPosition,combinedTaxaName));
                                alreadyCoveredPositions.add(Range.closed(indelPosition,indelPosition));
                            }
                            else if(!elementWithHighestCount.getX().equals("") && !elementWithHighestCount.getY().equals("")) {
                                //find a VariantContext holding the genotypeString we are looking for

                                for(int i = 0; i < indels.size(); i++) {
                                    if( indels.get(i).getReference().getBaseString().equals(elementWithHighestCount.getX()) &&
                                        indels.get(i).getGenotype(0).getGenotypeString().equals(elementWithHighestCount.getY())) {
                                        Allele allele = indels.get(i).getGenotype(0).getAllele(0);
                                        Genotype renamedGenotype = new GenotypeBuilder(indels.get(i).getGenotype(0))
                                                .name(combinedTaxaName).alleles(Arrays.asList(allele,allele)).make();
                                        VariantContextBuilder newVariant = new VariantContextBuilder(indels.get(i)).noGenotypes().genotypes(renamedGenotype);
                                        consensusIndels.put(indelPosition,newVariant.make());
                                        alreadyCoveredPositions.add(Range.closed(Position.of(indels.get(i).getContig(),indels.get(i).getStart()),
                                                Position.of(indels.get(i).getContig(),indels.get(i).getEnd())));
                                        break;
                                    }
                                }
                            }
                        }
                        else if(mergeRule==INDEL_MERGE_RULE.mostCommonHaplotypesAsHet) { //TODO This is a mess, clean this up
                            //TODO evaluate if this should be removed as consensus haplotypes should not contain indels
                            //Top 2 taxa become het
                            Set> genotypesSortedByCount = Multisets.copyHighestCountFirst(genotypeStrings).elementSet();
                            List> genotypesSortedByCountList = genotypesSortedByCount.stream().collect(Collectors.toList());
                            //We know there must be at least 2 possibilities,
                            //If genotypesSortedByCount.size() == 1 we need to use the ref
                            if(genotypesSortedByCount.size() == 1) {
                                boolean refFirst = (numberOfTaxaCoveringRefRange >= genotypeStrings.count(genotypesSortedByCountList.get(0)))?true:false;
                                for(int i = 0; i < indels.size(); i++) {
                                    if( indels.get(i).getReference().getBaseString().equals(genotypesSortedByCountList.get(0).getX()) &&
                                            indels.get(i).getGenotype(0).getGenotypeString().equals(genotypesSortedByCountList.get(0).getY())) {
                                        Allele refAllele = indels.get(i).getReference();
                                        Allele altAllele = indels.get(i).getGenotype(0).getAllele(0);
                                        GenotypeBuilder renamedGenoBuilder = new GenotypeBuilder(indels.get(i).getGenotype(0)).name(combinedTaxaName);
                                        if(refFirst) {
                                            renamedGenoBuilder = renamedGenoBuilder.alleles(Arrays.asList(refAllele,altAllele));
                                        }
                                        else {
                                            renamedGenoBuilder = renamedGenoBuilder.alleles(Arrays.asList(altAllele,refAllele));
                                        }
                                        Genotype renamedGenotype = renamedGenoBuilder.make();
                                        VariantContextBuilder newVariant = new VariantContextBuilder(indels.get(i)).noGenotypes().genotypes(renamedGenotype);
                                        consensusIndels.put(indelPosition,newVariant.make());
                                        alreadyCoveredPositions.add(Range.closed(Position.of(indels.get(i).getContig(),indels.get(i).getStart()),
                                                Position.of(indels.get(i).getContig(),indels.get(i).getEnd())));
                                        break;
                                    }
                                }
                            }
                            else {
                                //Otherwise we grab the first 2 and see if the ref is larger than any of them.

                                if(numberOfTaxaCoveringRefRange >= genotypeStrings.count(genotypesSortedByCountList.get(0))) {
                                    //The reference bp has the highest depth.
                                    VariantContext newReferenceHetVariant = createReferenceHetVariantContext(indelPosition,combinedTaxaName,genotypesSortedByCountList.get(0).getX(),genotypesSortedByCountList.get(0).getY(),true);
                                    consensusIndels.put(indelPosition,newReferenceHetVariant);
                                    alreadyCoveredPositions.add(Range.closed(Position.of(newReferenceHetVariant.getContig(),newReferenceHetVariant.getStart()),
                                            Position.of(newReferenceHetVariant.getContig(),newReferenceHetVariant.getEnd())));

                                }
                                else if( numberOfObservedAlleles >= genotypeStrings.count(genotypesSortedByCountList.get(1))) {
                                    VariantContext newReferenceHetVariant =createReferenceHetVariantContext(indelPosition,combinedTaxaName,genotypesSortedByCountList.get(0).getX(),genotypesSortedByCountList.get(0).getY(),false);
                                    consensusIndels.put(indelPosition,newReferenceHetVariant);
                                    alreadyCoveredPositions.add(Range.closed(Position.of(newReferenceHetVariant.getContig(),newReferenceHetVariant.getStart()),
                                            Position.of(newReferenceHetVariant.getContig(),newReferenceHetVariant.getEnd())));
                                }
                                else {
                                    //Use both alternate alleles
                                    Allele refAllele = Allele.create(genotypesSortedByCountList.get(0).getX(),true);
                                    Allele altAllele1 = Allele.create(genotypesSortedByCountList.get(0).getY(),false);
                                    Allele altAllele2 = Allele.create(genotypesSortedByCountList.get(1).getY(),false);

                                    for(int i = 0; i < indels.size(); i++) {
                                        if( indels.get(i).getReference().getBaseString().equals(genotypesSortedByCountList.get(0).getX()) &&
                                                indels.get(i).getGenotype(0).getGenotypeString().equals(genotypesSortedByCountList.get(0).getY())) {

                                            GenotypeBuilder renamedGenoBuilder = new GenotypeBuilder(indels.get(i).getGenotype(0)).name(combinedTaxaName)
                                                    .alleles(Arrays.asList(altAllele1,altAllele2));
                                            Genotype renamedGenotype = renamedGenoBuilder.make();
                                            VariantContextBuilder newVariantBuilder = new VariantContextBuilder(indels.get(i))
                                                    .alleles(Arrays.asList(refAllele,altAllele1,altAllele2))
                                                    .noGenotypes().genotypes(renamedGenotype);
                                            VariantContext newVariant = newVariantBuilder.make();
                                            consensusIndels.put(indelPosition,newVariant);
                                            alreadyCoveredPositions.add(Range.closed(Position.of(newVariant.getContig(),newVariant.getStart()),
                                                    Position.of(newVariant.getContig(),newVariant.getEnd())));
                                            break;
                                        }
                                    }

                                }
                            }

                        }
                        else if(mergeRule==INDEL_MERGE_RULE.reference) {
                            //Reference only
                            //Now we need to create a new VariantContext object and add it to the correct consensus sequence.
                            VariantContext newVariant = createReferenceVariantContext(refSequence,indelPosition,combinedTaxaName);
                            consensusIndels.put(indelPosition,newVariant);
                            alreadyCoveredPositions.add(Range.closed(Position.of(newVariant.getContig(),newVariant.getStart()),
                                    Position.of(newVariant.getContig(),newVariant.getEnd())));
                        }
                        else if(mergeRule==INDEL_MERGE_RULE.longestIndel) {
                            //TODO Longest indel
                            //Need to check both insertions and deletions
                            myLogger.warn("INDEL_MERGE_RULE.longestIndel is not implemented yet.");
                            throw new IllegalStateException("INDEL_MERGE_RULE.longestIndel is not implemented yet. Please select a merge rule that is implemented");
                        }

                    }
                }
            }


            //Add back in the indels to the list
            List variantsWithIndels = new ArrayList<>();
            for(int i = range.start(); i <= range.end(); i++) {
                Position currentPosition = Position.of(range.chromosome(),i);
                if(consensusIndels.containsKey(currentPosition)) {
                    variantsWithIndels.add(consensusIndels.get(currentPosition));
                }
                else if(nonIndelCluster.containsKey(currentPosition)){
                    variantsWithIndels.add(nonIndelCluster.get(currentPosition));
                }
            }

            consensusWithIndels.add(variantsWithIndels);
        }

        return consensusWithIndels;
    }

    /**
     * Simple method to create a mapping of non indel positions
     * @param variants
     * @return
     */
    private static Map computeNonIndelClusterMapping(List variants) {
        Map nonIndelMap = new HashMap<>();
        for(VariantContext variant : variants) {
            nonIndelMap.put(Position.of(variant.getContig(),variant.getStart()),variant);
        }
        return nonIndelMap;
    }

    /**
     * Method to get a TaxonName stored in the variants to all the raw consensus names
     * @param consensusSequences
     * @return
     */
    private static Multimap extractTaxonClusters(List> consensusSequences) {
        Multimap clusterMapping = ArrayListMultimap.create();

        for(List currentCluster : consensusSequences) {
            String clusterName = currentCluster.get(0).getGenotype(0).getSampleName();
            String[] individualTaxonNames = clusterName.split(":");
            for(String currentTaxon:individualTaxonNames) {
                if(currentTaxon.endsWith("_Haplotype_Caller_0")) {
                    clusterMapping.put(clusterName,currentTaxon.substring(0,currentTaxon.length()-19));
                }
                else if(currentTaxon.endsWith("_0")) {
                    clusterMapping.put(clusterName,currentTaxon.substring(0,currentTaxon.length()-2));
                }
                else {
                    clusterMapping.put(clusterName,currentTaxon);
                }
            }
        }

        return clusterMapping;
    }


    /**
     * Method to extract out the ranges of positions which contain a reference call for each taxon.
     * We originally tried using a Multiset of Positions, but it was not performant
     * @param rawHaplotypeSequences
     * @param taxonClusterNames
     * @param combinedTaxaName
     * @return
     */
    private static Map> extractRefBlocks(List> rawHaplotypeSequences, Multimap taxonClusterNames, String combinedTaxaName ) {
        Map> refBlocks = new HashMap<>();
        //Loop through each raw List of Variant Contexts
        for(List currentTaxonVariants : rawHaplotypeSequences) {
            //Get the current taxon name and use check to see if it is one of the taxon in our current cluster
            String currentTaxon = currentTaxonVariants.get(0).getGenotype(0).getSampleName();
            if (taxonClusterNames.get(combinedTaxaName).contains(currentTaxon)) {
                //Loop through each variant for this taxon and if it is a ref block add it to the RangeSet
                for (VariantContext variantContext : currentTaxonVariants) {
                    if (MergeGVCFUtils.isRefBlock(variantContext)) {
                        //Check to see if we already have a range set created, if not create a new one.
                        if(!refBlocks.keySet().contains(currentTaxon)) {
                            refBlocks.put(currentTaxon,TreeRangeSet.create());
                        }
                        refBlocks.get(currentTaxon).add(Range.closed(Position.of(variantContext.getContig(),variantContext.getStart()),Position.of(variantContext.getContig(),variantContext.getEnd())));
                    }
                }
            }
        }

        return refBlocks;
    }

    /**
     * Simple method to count up the reference ranges for a clusters taxon names for a given position.
     * @param refBlocks
     * @param taxonNames
     * @param currentPosition
     * @return
     */
    private static int countRefRanges(Map> refBlocks, Collection taxonNames, Position currentPosition) {
        int count = 0;
        //Loop through the taxon names
        for(String currentTaxon : taxonNames) {
            //If the position is in the RangeSet of refBlocks for this taxa increment counter.
            if(refBlocks.get(currentTaxon).contains(currentPosition)) {
                count++;
            }
        }
        return count;
    }

    /**
     * Method to setup a Taxon Name Map for easy indexing
     * @param consensusSequences
     * @return
     */
    private static Map createTaxonNameMap(List> consensusSequences) {
        Map taxonToIndexMap = new HashMap<>();
        for(int i = 0; i < consensusSequences.size(); i++) {
            String clusterName = consensusSequences.get(i).get(0).getGenotype(0).getSampleName();
            taxonToIndexMap.put(clusterName,i);
        }
        return taxonToIndexMap;
    }

    /**
     * Extract a set of positions which were not filtered out due to indels
     * @param range
     * @param consensusSequences
     * @return
     */
    private static Set extractNonIndelPositions(ReferenceRange range, List> consensusSequences) {
        Set nonIndelPositions = new HashSet<>();

        for(List currentConsensus : consensusSequences) {
            for(VariantContext variantContext : currentConsensus) {
                for(int i = variantContext.getStart(); i <= variantContext.getEnd(); i++) {
                    nonIndelPositions.add(Position.of(variantContext.getContig(),i));
                }
            }
        }

        //Loop through the set
        //Remove any Positions where the next Position is not in the set
        //Basically remove the previous base pair for the indels
        Set nonIndelPositionsRemovePrevBP = new HashSet<>();
        for(Position currentPosition : nonIndelPositions) {
            if(currentPosition.getPosition() extractIndelPositions(ReferenceRange range, Set nonIndelPositions) {
        Set indels = new HashSet<>();
        for(int i = range.start(); i <= range.end(); i++) {
            Position currentPosition = Position.of(range.chromosome(),i);
            if(!nonIndelPositions.contains(currentPosition)) {
                indels.add(currentPosition);
            }
        }

        return indels;
    }

    /**
     * Method to create a reference VariantContext
     * @param refSequence
     * @param indelPosition
     * @param combinedTaxaName
     * @return
     */
    private static VariantContext createReferenceVariantContext(GenomeSequence refSequence, Position indelPosition, String combinedTaxaName) {
        String alleleString = NucleotideAlignmentConstants.getHaplotypeNucleotide(refSequence.genotype(indelPosition.getChromosome(),indelPosition.getPosition()));
        Allele allele = Allele.create(alleleString,true);
        Genotype renamedGenotype = new GenotypeBuilder()
                .name(combinedTaxaName).alleles(Arrays.asList(allele,allele)).make();
        VariantContextBuilder newVariant = new VariantContextBuilder()
                .chr(indelPosition.getChromosome().getName())
                .start(indelPosition.getPosition())
                .stop(indelPosition.getPosition())
                .alleles(Arrays.asList(allele))
                .noGenotypes()
                .genotypes(renamedGenotype);
        return newVariant.make();
    }

    /**
     * Method to make a het variant context
     * TODO remove this later as we should not have hets in the haplotype
     * @param indelPosition
     * @param combinedTaxaName
     * @param refAlleleString
     * @param altAlleleString
     * @param refFirst
     * @return
     */
    private static VariantContext createReferenceHetVariantContext(Position indelPosition, String combinedTaxaName,String refAlleleString, String altAlleleString, boolean refFirst) {
        System.out.println(indelPosition);
        Allele refAllele = Allele.create(refAlleleString,true);
        Allele altAllele = Allele.create(altAlleleString, false);

        GenotypeBuilder gtb = new GenotypeBuilder().name(combinedTaxaName);
        List alleleList = new ArrayList<>();
        if(refAlleleString.equals(altAlleleString)) {
            gtb = gtb.alleles(Arrays.asList(refAllele));
            alleleList.add(refAllele);
        }
        else if(refFirst) {
            gtb = gtb.alleles(Arrays.asList(refAllele,altAllele));
            alleleList.add(refAllele);
            alleleList.add(altAllele);
        }
        else {
            gtb = gtb.alleles(Arrays.asList(altAllele,refAllele));
            alleleList.add(refAllele);
            alleleList.add(altAllele);
        }

        Genotype renamedGenotype = gtb.make();
        VariantContextBuilder newVariant = new VariantContextBuilder()
                .chr(indelPosition.getChromosome().getName())
                .start(indelPosition.getPosition())
                .stop(indelPosition.getPosition()+refAlleleString.length()-1)
                .alleles(alleleList)
                .noGenotypes()
                .genotypes(renamedGenotype);
        return newVariant.make();
    }

}