package org.broadinstitute.hellbender.utils.pileup;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Streams;
import htsjdk.samtools.*;
import htsjdk.samtools.util.SequenceUtil;
import htsjdk.variant.variantcontext.Allele;
import org.apache.commons.lang3.mutable.MutableInt;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.broadinstitute.hellbender.engine.AlignmentAndReferenceContext;
import org.broadinstitute.hellbender.engine.AlignmentContext;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.PileupDetectionArgumentCollection;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.haplotype.Event;
import org.broadinstitute.hellbender.utils.read.GATKRead;

import java.util.*;
import java.util.stream.Collectors;


/**
 * Helper class for pileup-based allele detection that supplements assembly. This code is analogous, but not exactly
 * equivalent, to the DRAGEN ColumnwiseDetection approach.
 */
public final class PileupBasedAlleles {

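    // Per-read attribute holding the read's per-base mismatch rate against the reference, scaled by
    // MISMATCH_BASES_PERCENTAGE_ADJUSMTENT and stored as an integer; see addMismatchPercentageToRead() below.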
    final static String MISMATCH_BASES_PERCENTAGE_TAG = "MZ";
    public static final double MISMATCH_BASES_PERCENTAGE_ADJUSMTENT = 1000.0;

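    // Indices into the per-allele counts array: [total alt support, "bad pileup read" support, "bad assembly read" support]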
    private final static int COUNT_IDX = 0;
    private final static int BAD_COUNT_IDX = 1;
    private final static int ASSEMBLY_BAD_COUNT_IDX = 2;

    /**
     * Searches a list of pileups for potential variants that are visible in the pileups but might be dropped from assembly.
     * Additionally, determines which events found in the pileups are "good" (the supporting reads pass several criteria) or
     * "bad" (the supporting reads fail a different set of conditions). Returns the sets of "good" and "bad" events for later
     * use in augmenting and pruning the set of assembled haplotypes and variants. The basic algorithm works
     * as follows:
     *  - iterate over every pileup and count indel and substitution events, recording which reads are "bad" according to various heuristics
     *  - Assign each detected event as "good", "bad", or nothing based on the number of "good" and "bad" supporting reads
     *
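     * <p>A minimal usage sketch (the surrounding variable names are illustrative, not part of this API):</p>
     * <pre>{@code
     * final Pair<Set<Event>, Set<Event>> goodAndBad =
     *         PileupBasedAlleles.goodAndBadPileupEvents(pileupsFromActiveRegion, args, readsHeader, minBaseQuality);
     * final Set<Event> eventsToAddToAssembly = goodAndBad.getLeft();
     * final Set<Event> eventsToPruneFromAssembly = goodAndBad.getRight();
     * }</pre>
     *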
     * @param alignmentAndReferenceContextList  List of stored pileups and reference context information where every element is a base from the active region.
     *                                          NOTE: the expectation is that the stored pileups are based off of the ORIGINAL (un-clipped) reads from active region determination.
     * @param args                              Configuration arguments to use for filtering/annotations
     * @param headerForReads                    Header for the reads (only necessary for SAM file conversion)
     * @param minBaseQualityScore               Minimum base quality; lower-quality bases are subtracted from the usable pileup depth
     * @return Two sets of Events, those passing and those failing the heuristic filters, respectively.
     */
    public static Pair<Set<Event>, Set<Event>> goodAndBadPileupEvents(final List<AlignmentAndReferenceContext> alignmentAndReferenceContextList, final PileupDetectionArgumentCollection args, final SAMFileHeader headerForReads, final int minBaseQualityScore) {
        if (!args.usePileupDetection) {
            return ImmutablePair.of(Collections.emptySet(), Collections.emptySet());
        }

        final Set<Event> goodEvents = new HashSet<>();
        final Set<Event> badEvents = new HashSet<>();

        // Iterate over every base
        for (int i = 0; i < alignmentAndReferenceContextList.size(); i++) {
            AlignmentAndReferenceContext alignmentAndReferenceContext = alignmentAndReferenceContextList.get(i);
            boolean onlyTrackDeletions = false;
            //Skip all work on sites that aren't active according to our heuristic
            if (args.activeRegionPhredThreshold > 0.0 && alignmentAndReferenceContext.getActivityScore() < args.activeRegionPhredThreshold ) {
                //This solves a discordance with Illumina where they count deletions (and thus construct them and filter on the threshold) not at the anchor base but at the first
                //deleted base on the reference. Consequently we must allow deletions to be detected one base upstream of active regions.
                if (args.detectIndels && i+1 < alignmentAndReferenceContextList.size() && alignmentAndReferenceContextList.get(i+1).getActivityScore() > args.activeRegionPhredThreshold ) {
                    onlyTrackDeletions = true;
                } else {
                    continue;
                }
            }

            final AlignmentContext alignmentContext = alignmentAndReferenceContext.getAlignmentContext();
            final String contig = alignmentContext.getContig();
            final int start = alignmentContext.getStart();
            final int end = alignmentContext.getEnd();

            final ReferenceContext referenceContext = alignmentAndReferenceContext.getReferenceContext();
            final MutableInt pileupDepth = new MutableInt(alignmentContext.size());
            final ReadPileup pileup = alignmentContext.getBasePileup();
            final byte refBase = referenceContext.getBase();

            //Key for counts arrays [support, bad reads, assembly bad reads]
            Map<String, int[]> insertionCounts = new HashMap<>();
            Map<Integer, int[]> deletionCounts = new HashMap<>();
            Map<Byte, int[]> SNPCounts = new HashMap<>();

            for (PileupElement element : pileup) {
                final byte base = element.getBase();

                // Subtract out low quality bases to mimic how active region determination reads the pileup //TODO this might need to also ignore the qual bases
                if (element.getQual() < minBaseQualityScore) {
                    pileupDepth.decrement();
                }

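                // Candidate event detection: PileupElement reports 'D' as the base at deleted positions,
                // so it is excluded from SNP detection; insertions and deletions are flagged at the anchor
                // base immediately preceding the indel.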
                final boolean SNPFound = !onlyTrackDeletions && refBase != base && base != 'D' && element.getQual() > args.qualityForSnpsInPileupDetection;
                final boolean insertionFound = !onlyTrackDeletions && args.detectIndels && element.isBeforeInsertion();
                final boolean deletionFound = args.detectIndels && element.isBeforeDeletionStart();

                if (SNPFound || insertionFound || deletionFound) {
                    final boolean badPileup = isBadPileupRead(element.getRead(), args, headerForReads);
                    final boolean badAssembly = isBadAssemblyRead(element.getRead(), args);

                    if (SNPFound) {
                        incrementCounts(base, SNPCounts, badPileup, badAssembly);
                    }

                    if (insertionFound) {
                        incrementCounts(element.getBasesOfImmediatelyFollowingInsertion(), insertionCounts, badPileup, badAssembly);
                    }

                    if (deletionFound) {
                        incrementCounts(element.getLengthOfImmediatelyFollowingIndel(), deletionCounts, badPileup, badAssembly);
                    }
                }
            }

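            // Turn the raw per-allele counts into Events: SNPs and insertions are anchored at the current
            // reference base, while deletions take the spanned reference bases as the ref allele.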
            final Map<Event, int[]> SNPEventsAndCounts = SNPCounts.entrySet().stream()
                    .collect(Collectors.toMap(entry -> new Event(contig, start, Allele.create(refBase, true), Allele.create(entry.getKey())), entry -> entry.getValue()));
            final Map<Event, int[]> insertionEventsAndCounts = insertionCounts.entrySet().stream()
                    .collect(Collectors.toMap(entry -> new Event(contig, start, Allele.create(refBase, true), Allele.create((char) refBase + entry.getKey())), entry -> entry.getValue()));
            final Map<Event, int[]> deletionEventsAndCounts = deletionCounts.entrySet().stream()
                    .collect(Collectors.toMap(entry -> new Event(contig, start, Allele.create(referenceContext.getBases(new SimpleInterval(contig, start, end + entry.getKey())), true), Allele.create(refBase)), entry -> entry.getValue()));

            Streams.concat(SNPEventsAndCounts.entrySet().stream(), insertionEventsAndCounts.entrySet().stream(), deletionEventsAndCounts.entrySet().stream())
                    .forEach(eventAndCounts -> {
                        final Event event = eventAndCounts.getKey();
                        final int[] counts = eventAndCounts.getValue();

                        if (passesPileupFilters(args, counts[COUNT_IDX], counts[BAD_COUNT_IDX], pileupDepth.intValue(), event.isIndel())) {
                            goodEvents.add(event);
                        }

                        if (failsAssemblyFilters(args, counts[COUNT_IDX], counts[ASSEMBLY_BAD_COUNT_IDX])) {
                            badEvents.add(event);
                        }
                    });
        }

        return ImmutablePair.of(goodEvents, badEvents);
    }

    /**
     * Apply the filters to discovered alleles
     * - Does it have greater than snpThreshold (or indelThreshold, for indels) fraction of base support in the pileup?
     * - Is the pileup depth at the site at least pileupAbsoluteDepth?
     * - Is the fraction of "bad" reads among the reads supporting the alt at most badReadThreshold? //TODO evaluate if this is worth doing on a per-allele basis or otherwise
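     *
     * For example (illustrative values): with snpThreshold = 0.1 and a usable pileup depth of 50, a SNP
     * needs more than 5 supporting reads (6/50 = 0.12 > 0.1) before the bad-read fraction is even checked.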
     */
    private static boolean passesPileupFilters(final PileupDetectionArgumentCollection args, final int pileupSupport, final int pileupBadReads, final int pileupDepth, final boolean isIndel) {
        return ((double) pileupSupport / (double) pileupDepth) > (isIndel ? args.indelThreshold : args.snpThreshold)
                && pileupDepth >= args.pileupAbsoluteDepth
                && ((args.badReadThreshold <= 0.0) || (double) pileupBadReads / (double)pileupSupport <= args.badReadThreshold);
    }

    // TODO this is the most sketchy one... does a variant that fails pileup calling with only one bad read as support count as garbage by this tool...
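    /**
     * An event fails the assembly filters when the fraction of its supporting reads flagged "bad" for
     * assembly purposes (see {@link #isBadAssemblyRead}) is at least assemblyBadReadThreshold.
     */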
    private static boolean failsAssemblyFilters(final PileupDetectionArgumentCollection args, final int pileupSupport, final int assemblyBadReads) {
        return (args.assemblyBadReadThreshold > 0.0) && (double) assemblyBadReads / (double) pileupSupport >= args.assemblyBadReadThreshold;
    }

    /**
     * Based on the Illumina PileupDetection filtering code: we apply a number of configurable heuristics to the reads that
     * support alt alleles that may be added, and for each read evaluate whether it is "bad" according to each of the heuristics. Currently they are:
     * - Secondary/SA tag reads are bad
     * - Improperly paired reads are bad
     * - Reads with > 8% per-base edit distance to the reference are bad
     * - Reads 2 std deviations away from the standard insert size are bad (not implemented)
     *
     * @param read          the read supporting an alt allele at this site
     * @param args          configuration arguments controlling the heuristics
     * @param headerForRead header used for SAM record conversion (TODO get rid of this sam record conversion)
     * @return true if any of the "badness" heuristics suggest we should consider the read suspect, false otherwise.
     */
    @VisibleForTesting
    static boolean isBadPileupRead(final GATKRead read, final PileupDetectionArgumentCollection args, final SAMFileHeader headerForRead) {
        if (args.badReadThreshold <= 0.0) {
            return false;
        }
        if (args.badReadProperPair && !read.isProperlyPaired()) {
            return true;
        }
        if (args.badReadSecondaryOrSupplementary && (read.isSecondaryAlignment() || read.hasAttribute("SA"))) {
            return true;
        }

        //TODO this conversion is really unnecessary. Perhaps we should expose a new SequenceUtil like NM tag calculation?...
        // Assert that the edit distance for the read is in line
        final Integer mismatchPercentage = read.getAttributeAsInteger(MISMATCH_BASES_PERCENTAGE_TAG);
        Utils.nonNull(mismatchPercentage);
        if ((mismatchPercentage / MISMATCH_BASES_PERCENTAGE_ADJUSMTENT) > args.badReadEditDistance) {
            return true;
        }

        //TODO add threshold described by Illumina about insert size compared to the average
        if (args.templateLengthStd > 0 && args.templateLengthMean > 0) {
            SAMRecord samRecordForRead = read.convertToSAMRecord(headerForRead);
            int templateLength = samRecordForRead.getInferredInsertSize();
            // This is an Illumina magic number... It's possible none of this is particularly important for Functional Equivalency.
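            // e.g. (illustrative values) with templateLengthMean = 400 and templateLengthStd = 100, reads whose
            // inferred insert size falls outside [175, 625] are flagged as bad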
            if (templateLength < args.templateLengthMean - 2.25 * args.templateLengthStd
                    || templateLength > args.templateLengthMean + 2.25 * args.templateLengthStd) {
                return true;
            }
        }
        return false;
    }

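    /**
     * A read is "bad" for assembly purposes when its per-base mismatch rate (the MZ tag) exceeds
     * assemblyBadReadEditDistance; the check is disabled when assemblyBadReadThreshold is not positive.
     */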
    @VisibleForTesting
    static boolean isBadAssemblyRead(final GATKRead read, final PileupDetectionArgumentCollection args) {
        if (args.assemblyBadReadThreshold <= 0.0) {
            return false;
        }
        // TODO other checks?
        Utils.nonNull(read.getAttributeAsInteger(MISMATCH_BASES_PERCENTAGE_TAG));
        return (read.getAttributeAsInteger(MISMATCH_BASES_PERCENTAGE_TAG) / MISMATCH_BASES_PERCENTAGE_ADJUSMTENT) > args.assemblyBadReadEditDistance;
    }

    // Helper method to manage the badness counting arrays
    // T is a Byte for a SNP (alt base), String for insertion (inserted bases), Integer for deletion (deletion length)
    private static <T> void incrementCounts(T altAllele, Map<T, int[]> altCounts, boolean pileupBad, boolean assemblyBad){
        final int[] values = altCounts.computeIfAbsent(altAllele, (i) -> new int[3]);
        values[COUNT_IDX] += 1;
        values[BAD_COUNT_IDX] += pileupBad ? 1 : 0;
        values[ASSEMBLY_BAD_COUNT_IDX] += assemblyBad ? 1 : 0;
    }


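    /**
     * Computes and stores the MZ tag on the read: its per-base mismatch rate against the reference
     * (indel bases excluded from the numerator, aligned bases as the denominator), scaled by 1000.
     * No-op if the tag is already present.
     */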
    public static void addMismatchPercentageToRead(final GATKRead read, final SAMFileHeader headerForRead, final ReferenceContext referenceContext) {
        //TODO this conversion is really unnecessary. Perhaps we should expose a new SequenceUtil like NM tag calculation?...
        if (read.hasAttribute(MISMATCH_BASES_PERCENTAGE_TAG)){
            return;
        }

        SAMRecord samRecordForRead = read.convertToSAMRecord(headerForRead);
        final int nmScore;
        if (! read.hasAttribute("NM")) {
            nmScore = SequenceUtil.calculateSamNmTag(samRecordForRead, referenceContext.getBases(new SimpleInterval(read)), read.getStart() - 1);
        } else {
            nmScore = read.getAttributeAsInteger("NM");
        }
        // We adjust the NM score by any indels in the read
        int adjustedNMScore = nmScore - read.getCigarElements().stream().filter(element -> element.getOperator().isIndel()).mapToInt(CigarElement::getLength).sum();

        // NOTE: we store the percentage as an integer x1000 because that is what the read attributes support. 4 units of precision should be more than enough for this ratio in the first place whereas 3 is probably not (reads over 100 bases get binned).
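        // Worked example: NM=5 with a 3-base insertion and 100 aligned (M/X/=) bases gives an adjusted
        // NM of 2, so the stored value is (int) (1000.0 * 2 / 100) = 20, i.e. a 2% per-base mismatch rate.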
        read.setAttribute(MISMATCH_BASES_PERCENTAGE_TAG, (int) (MISMATCH_BASES_PERCENTAGE_ADJUSMTENT * adjustedNMScore / (read.getCigarElements().stream().filter(element -> element.getOperator().isAlignment()).mapToInt(CigarElement::getLength).sum() )));
    }
}