
package org.broadinstitute.hellbender.tools.walkers.contamination;
import htsjdk.samtools.util.OverlapDetector;
import org.apache.commons.lang3.mutable.MutableDouble;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.commons.math3.optim.univariate.UnivariatePointValuePair;
import org.apache.commons.math3.stat.descriptive.rank.Percentile;
import org.apache.commons.math3.util.FastMath;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.tools.walkers.qc.Pileup;
import org.broadinstitute.hellbender.utils.*;
import java.util.*;
import java.util.function.DoubleBinaryOperator;
import java.util.function.DoubleUnaryOperator;
import java.util.function.ToDoubleFunction;
import java.util.function.ToIntFunction;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
/**
 * This is the probabilistic contamination model that we use to distinguish homs from hets.
 * The model is similar to that of ContEst, in that it assumes that each contaminant read is independently drawn from
 * the population. However, it also accounts for allelic CNVs and doesn't just apply to hom alt sites.
 *
 * Because this model makes assumptions about the form of contamination and doesn't model CNVs precisely (although it
 * is robust to them in that they don't lead to bad solutions), we don't actually use it for the final contamination
 * estimate. Instead, we learn the model, then use it to predict which sites are hom alts. It's important to do this right
 * because contamination makes hom alts look more like hets, and allelic CNVs make hets look more like hom alts, hence
 * distinguishing correctly is not trivial.
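 *
 * A typical usage sketch (hypothetical driver code, not part of this class; within GATK the model is
 * driven by the CalculateContamination tool):
 * <pre>{@code
 * final List<PileupSummary> sites = ...;  // pileup summaries at common biallelic SNPs
 * final ContaminationModel model = new ContaminationModel(sites);
 * final Pair<Double, Double> contaminationAndStdError = model.calculateContaminationFromHoms(sites);
 * }</pre>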
 */
public class ContaminationModel {
    public static final double INITIAL_MAF_THRESHOLD = 0.40;
    public static final double MAF_TO_SWITCH_TO_HOM_REF = 0.25;
    public static final double MAF_TO_SWITCH_TO_UNSCRUPULOUS_HOM_REF = 0.20;
    public static final double UNSCRUPULOUS_HOM_REF_ALLELE_FRACTION = 0.15;
    public static final double UNSCRUPULOUS_HOM_REF_FRACTION_TO_REMOVE_FOR_POSSIBLE_LOH = 0.1;
    public static final double UNSCRUPULOUS_HOM_REF_PERCENTILE = 100 * (1 - UNSCRUPULOUS_HOM_REF_FRACTION_TO_REMOVE_FOR_POSSIBLE_LOH);
    public static final double MINIMUM_UNSCRUPULOUS_HOM_REF_ALT_FRACTION_THRESHOLD = 0.1;
    public static final double MAF_STEP_SIZE = 0.04;
    private static final double PRECISION_FOR_STANDARD_ERROR = 0.000001;

    private final double contamination;
    private final double errorRate;
    private final List<Double> minorAlleleFractions;
    private final List<List<PileupSummary>> segments;

    public static final int HOM_REF = 0;
    public static final int HOM_ALT = 3;

    private static final int NUM_ITERATIONS = 3;
    private static final double MIN_FRACTION_OF_SITES_TO_USE = 0.25;
    private static final double MIN_RELATIVE_ERROR = 0.2;
    private static final double MIN_ABSOLUTE_ERROR = 0.001;
    private static final List<Double> CONTAMINATION_INITIAL_GUESSES = Arrays.asList(0.02, 0.05, 0.1, 0.2);
    public ContaminationModel(List<PileupSummary> sites) {
        errorRate = calculateErrorRate(sites);

        // partition genome into minor allele fraction (MAF) segments to better distinguish hom alts from LoH hets.
        segments = ContaminationSegmenter.findSegments(sites);
        final int numSegments = segments.size();
        final List<Double> minorAlleleFractionsGuess = new ArrayList<>(Collections.nCopies(segments.size(), 0.5));
        final MutableDouble contaminationGuess = new MutableDouble(0);
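
        // Coordinate ascent: alternately (1) fit each segment's minor allele fraction given the current
        // contamination estimate and (2) refit the contamination given the segment MAFs, excluding probable
        // loss-of-heterozygosity segments from the contamination fit.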
        for (int n = 0; n < NUM_ITERATIONS; n++) {
            IntStream.range(0, numSegments).forEach(s -> minorAlleleFractionsGuess.set(s,
                    calculateMinorAlleleFraction(contaminationGuess.doubleValue(), errorRate, segments.get(s))));
            final Pair<List<List<PileupSummary>>, List<Double>> nonLOHSegmentsAndMafs = getNonLOHSegments(segments, minorAlleleFractionsGuess);
            contaminationGuess.setValue(calculateContamination(errorRate, nonLOHSegmentsAndMafs.getLeft(), nonLOHSegmentsAndMafs.getRight()));
        }
        minorAlleleFractions = minorAlleleFractionsGuess;
        contamination = contaminationGuess.doubleValue();
    }
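
    // Starting from a strict minor allele fraction threshold, relax the threshold until the retained
    // (putatively non-LoH) segments contain at least MIN_FRACTION_OF_SITES_TO_USE of all sites; if no
    // threshold achieves this, fall back to using every segment.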
    private static Pair<List<List<PileupSummary>>, List<Double>> getNonLOHSegments(final List<List<PileupSummary>> segments, final List<Double> mafs) {
        final int numSites = segments.stream().mapToInt(List::size).sum();
        for (double minMaf = INITIAL_MAF_THRESHOLD; minMaf > 0; minMaf -= MAF_STEP_SIZE) {
            final double threshold = minMaf;
            final int[] nonLOHIndices = IntStream.range(0, segments.size()).filter(s -> mafs.get(s) > threshold).toArray();
            final List<List<PileupSummary>> nonLOHSegments = Arrays.stream(nonLOHIndices).mapToObj(segments::get).collect(Collectors.toList());
            final List<Double> nonLOHMafs = Arrays.stream(nonLOHIndices).mapToObj(mafs::get).collect(Collectors.toList());
            final int numNonLOHSites = nonLOHSegments.stream().mapToInt(List::size).sum();
            if ((double) numNonLOHSites / numSites > MIN_FRACTION_OF_SITES_TO_USE) {
                return ImmutablePair.of(nonLOHSegments, nonLOHMafs);
            }
        }
        return ImmutablePair.of(segments, mafs);
    }
    /**
     * Calculate the contamination in a tumor sample using the hom alt and hom ref sites inferred from this model, which
     * could be derived from the tumor itself or a matched normal.
     * @param tumorSites pileup summaries from the tumor sample
     * @return a pair of the contamination estimate and its standard error
     */
    public Pair<Double, Double> calculateContaminationFromHoms(final List<PileupSummary> tumorSites) {
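        // Relax the minor allele fraction threshold until the estimate's standard error is acceptably small,
        // switching from the hom alt strategy to the (less reliable) hom ref strategies as the threshold
        // loosens.  See the javadoc of the Strategy enum below.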
        for (double minMaf = INITIAL_MAF_THRESHOLD; minMaf >= 0; minMaf -= MAF_STEP_SIZE) {
            final Pair<Double, Double> result;
            if (minMaf > MAF_TO_SWITCH_TO_HOM_REF) {
                result = calculateContamination(Strategy.HOM_ALT, tumorSites, minMaf);
            } else if (minMaf > MAF_TO_SWITCH_TO_UNSCRUPULOUS_HOM_REF) {
                result = calculateContamination(Strategy.HOM_REF, tumorSites, minMaf);
            } else {
                result = calculateContamination(Strategy.UNSCRUPULOUS_HOM_REF, tumorSites, minMaf);
            }
            if (!Double.isNaN(result.getLeft()) && result.getRight() < (result.getLeft() * MIN_RELATIVE_ERROR + MIN_ABSOLUTE_ERROR)) {
                return result;
            }
        }
        final Pair<Double, Double> result = calculateContamination(Strategy.UNSCRUPULOUS_HOM_REF, tumorSites, 0);
        return Double.isNaN(result.getLeft()) ? Pair.of(0.0, 1.0) : result;
    }
    /**
     * Depending on how many sites we have found for use in our estimate, we are more or less strict as to which calculation
     * to use. In ideal cases (exomes and genomes) we have enough hom alt sites to base the calculation on ref contamination
     * in hom alt sites. If this fails -- that is, if the hom alt calculation has too much uncertainty -- we calculate based
     * on the alt in hom ref signal, which is less reliable because it is more subject to error (or population-dependence) in
     * the population allele frequencies. Finally, we may resort to a hom ref calculation that uses a heuristic instead of
     * principled genotyping based on the minor allele fraction segmentation. This brings in additional hom ref sites that
     * fall outside the span of the het segmentation and ought to be necessary only for very small gene panels.
     */
    private enum Strategy {
        HOM_ALT, HOM_REF, UNSCRUPULOUS_HOM_REF
    }
    private Pair<Double, Double> calculateContamination(final Strategy strategy, final List<PileupSummary> tumorSites, final double minMaf) {
        final boolean useHomAlt = strategy == Strategy.HOM_ALT;
        final List<PileupSummary> genotypingHoms;
        if (strategy == Strategy.HOM_ALT) {
            genotypingHoms = homAlts(minMaf);
        } else if (strategy == Strategy.HOM_REF) {
            genotypingHoms = homRefs(minMaf);
        } else {
            final List<PileupSummary> candidateHomRefs = tumorSites.stream()
                    .filter(site -> site.getAltFraction() < UNSCRUPULOUS_HOM_REF_ALLELE_FRACTION)
                    .collect(Collectors.toList());
            final double altFractionThreshold = Math.max(MINIMUM_UNSCRUPULOUS_HOM_REF_ALT_FRACTION_THRESHOLD,
                    new Percentile(UNSCRUPULOUS_HOM_REF_PERCENTILE).evaluate(candidateHomRefs.stream().mapToDouble(PileupSummary::getAltFraction).toArray()));
            genotypingHoms = candidateHomRefs.stream().filter(site -> site.getAltFraction() <= altFractionThreshold).collect(Collectors.toList());
        }
        final List<PileupSummary> homs = subsetSites(tumorSites, genotypingHoms);
        final double tumorErrorRate = calculateErrorRate(tumorSites);

        // depth of ref in hom alt or alt in hom ref
        final ToIntFunction<PileupSummary> oppositeCount = useHomAlt ? PileupSummary::getRefCount : PileupSummary::getAltCount;
        final ToDoubleFunction<PileupSummary> oppositeAlleleFrequency = useHomAlt ? PileupSummary::getRefFrequency : PileupSummary::getAlleleFrequency;
        final long totalDepth = homs.stream().mapToLong(PileupSummary::getTotalCount).sum();
        // total read count of ref in hom alt or alt in hom ref, as the case may be
        final long oppositeDepth = homs.stream().mapToLong(oppositeCount::applyAsInt).sum();
        final long errorDepth = Math.round(totalDepth * tumorErrorRate / 3);
        final long contaminationOppositeDepth = Math.max(oppositeDepth - errorDepth, 0);

        final double totalDepthWeightedByOppositeFrequency = homs.stream()
                .mapToDouble(ps -> ps.getTotalCount() * oppositeAlleleFrequency.applyAsDouble(ps))
                .sum();
        final double contaminationEstimate = contaminationOppositeDepth / totalDepthWeightedByOppositeFrequency;
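
        // Worked example with illustrative numbers (hom alt strategy): if the hom alt sites have total depth
        // 10,000 and mean ref allele frequency 0.5, the denominator above is 5,000; if 100 ref reads remain
        // after subtracting the expected error reads, the contamination estimate is 100 / 5,000 = 0.02.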
        // coeff1 and coeff2 are the sums of (1-f)*d and f*(1-f)*d^2, respectively, over all hom alts, where f is
        // population allele frequency and d is total read depth. When using hom refs as a backup strategy we replace
        // 1-f with f in coeff1.
        final double coeff1 = homs.stream().mapToDouble(ps -> oppositeAlleleFrequency.applyAsDouble(ps) * ps.getTotalCount()).sum();
        final double coeff2 = homs.stream().mapToDouble(ps ->
                oppositeAlleleFrequency.applyAsDouble(ps) * (1 - oppositeAlleleFrequency.applyAsDouble(ps)) * MathUtils.square(ps.getTotalCount())
        ).sum();
        final DoubleUnaryOperator errorFunc = c -> homs.isEmpty() ? 1 : Math.sqrt(coeff1 * c * (1 - c) + coeff2 * c * c) / totalDepthWeightedByOppositeFrequency;

        // {@code errorFunc} is a formula for the standard error (see docs/mutect/mutect.pdf) of the contamination estimate,
        // but it depends on the true (unknown) contamination, not the estimate. For low contaminations simply plugging in
        // the estimated contamination to the formula does not work. In particular, an estimate of 0 yields a standard error
        // of 0. A more principled approach is to find a range of contaminations that is consistent with the error estimate.
        // For example, suppose our contamination estimate is 0.0. If the standard error as a function of contamination = 0.02 is 0.01,
        // then 0.02 is not consistent with an estimate of 0.0. If the standard error as a function of contamination = 0.005
        // is 0.005, then it is consistent with our estimate of 0.0, and furthermore because {@code errorFunc} is monotonic we
        // know it is the largest true contamination that would be consistent.
        // More precisely, if our contamination estimate is e then the highest consistent contamination is c such that
        // c = e + errorFunc(c), which we find via binary search. The lowest consistent contamination is found similarly,
        // and our improved standard error is max(highest - e, e - lowest). The binary search is far from optimized (in fact,
        // since errorFunc is the square root of a quadratic, we could solve this explicitly with a messy closed-form solution)
        // but it converges to a precision of 1e-6 in about 20 iterations, which is fast enough.
        final OptionalDouble upperEstimate = MathUtils.binarySearchFindZero(c -> c - errorFunc.applyAsDouble(c) - contaminationEstimate,
                contaminationEstimate, 1.0, PRECISION_FOR_STANDARD_ERROR);
        final OptionalDouble lowerEstimate = MathUtils.binarySearchFindZero(c -> c + errorFunc.applyAsDouble(c) - contaminationEstimate,
                0.0, contaminationEstimate, PRECISION_FOR_STANDARD_ERROR);

        // if for some reason the binary search failed -- and if my math is to be believed it should never fail -- we use
        // the standard error formula with the estimated contamination
        final double stdError = (upperEstimate.isPresent() && lowerEstimate.isPresent()) ?
                Math.max(upperEstimate.getAsDouble() - contaminationEstimate, contaminationEstimate - lowerEstimate.getAsDouble()) :
                errorFunc.applyAsDouble(contaminationEstimate);

        return Pair.of(Math.min(contaminationEstimate, 1.0), stdError);
    }
    private List<PileupSummary> getType(final int genotype, final double minMaf) {
        final int[] nonLOHIndices = IntStream.range(0, segments.size()).filter(s -> minorAlleleFractions.get(s) > minMaf).toArray();
        final List<List<PileupSummary>> nonLOHSegments = Arrays.stream(nonLOHIndices).mapToObj(segments::get).collect(Collectors.toList());
        final List<Double> nonLOHMafs = Arrays.stream(nonLOHIndices).mapToObj(minorAlleleFractions::get).collect(Collectors.toList());
        return IntStream.range(0, nonLOHSegments.size())
                .mapToObj(n -> nonLOHSegments.get(n).stream().filter(site -> probability(site, contamination, errorRate, nonLOHMafs.get(n), genotype) > 0.5))
                .flatMap(stream -> stream)
                .collect(Collectors.toList());
    }

    private List<PileupSummary> homAlts(final double minMaf) { return getType(HOM_ALT, minMaf); }

    private List<PileupSummary> homRefs(final double minMaf) { return getType(HOM_REF, minMaf); }
    public List<MinorAlleleFractionRecord> segmentationRecords() {
        return IntStream.range(0, segments.size()).mapToObj(n -> {
            final List<PileupSummary> segment = segments.get(n);
            final String contig = segment.get(0).getContig();
            final int start = segment.get(0).getStart();
            final int end = segment.get(segment.size() - 1).getEnd();
            final double maf = minorAlleleFractions.get(n);
            return new MinorAlleleFractionRecord(new SimpleInterval(contig, start, end), maf);
        }).collect(Collectors.toList());
    }
    private static double calculateErrorRate(final List<PileupSummary> sites) {
        final long totalBases = sites.stream().mapToInt(PileupSummary::getTotalCount).sum();
        final long otherAltBases = sites.stream().mapToInt(PileupSummary::getOtherAltCount).sum();
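        // "Other alt" bases capture only 2 of the 3 possible error substitutions at a biallelic site (errors
        // that happen to hit the ref or alt allele are indistinguishable from real signal), hence the factor
        // of 1.5 = 3/2 below.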
        return 1.5 * ((double) otherAltBases / totalBases);
    }
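
    // By definition the minor allele fraction lies in (0, 0.5]; maximize the segment log likelihood over
    // that range (the optimizer below searches [0.1, 0.5] starting from 0.4).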
    private static double calculateMinorAlleleFraction(final double contamination, final double errorRate, final List<PileupSummary> segment) {
        final DoubleUnaryOperator objective = maf -> segmentLogLikelihood(segment, contamination, errorRate, maf);
        return OptimizationUtils.max(objective, 0.1, 0.5, 0.4, 0.01, 0.01, 20).getPoint();
    }
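
    // The likelihood as a function of contamination may have local optima, so run the optimizer from
    // several initial guesses and keep the best point found.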
    private static double calculateContamination(final double errorRate, final List<List<PileupSummary>> segments, final List<Double> mafs) {
        final DoubleUnaryOperator objective = c -> modelLogLikelihood(segments, c, errorRate, mafs);
        final List<UnivariatePointValuePair> optima = CONTAMINATION_INITIAL_GUESSES.stream()
                .map(initial -> OptimizationUtils.max(objective, 0, 0.5, initial, 1.0e-4, 1.0e-4, 30))
                .collect(Collectors.toList());
        return Collections.max(optima, Comparator.comparingDouble(UnivariatePointValuePair::getValue)).getPoint();
    }
    /**
     * Array of likelihoods over the genotypes hom ref, alt minor, alt major, hom alt
     * @param site the pileup summary at the site, including its population allele frequency
     * @param c the contamination
     * @param errorRate the base error rate
     * @param maf the local minor allele fraction
     * @return a double[4], never null
     */
    private static double[] genotypeLikelihoods(final PileupSummary site, final double c, final double errorRate, double maf) {
        final double f = site.getAlleleFrequency();
        final int k = site.getAltCount();
        final int n = k + site.getRefCount();

        // sample genotypes in order hom ref, alt minor, alt major, hom alt
        final double[] samplePriors = new double[] {(1 - f) * (1 - f), f * (1 - f), f * (1 - f), f * f};
        final double[] sampleAFs = new double[] {errorRate / 3, maf, 1 - maf, 1 - errorRate};
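
        // Each read is drawn from the contaminant with probability c, in which case it is an alt with
        // probability equal to the population allele frequency f; otherwise it comes from the sample, where
        // the alt probability depends on the sample genotype.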
        return new IndexRange(0, 4).mapToDouble(sg -> samplePriors[sg] *
                MathUtils.binomialProbability(n, k, (1 - c) * sampleAFs[sg] + c * f));
    }
    /**
     * Probability of a genotype hom ref, alt minor, alt major, hom alt
     * @param genotype 0 for hom ref, 3 for hom alt etc
     */
    private static double probability(final PileupSummary site, final double contamination, final double errorRate, final double minorAlleleFraction, final int genotype) {
        final double[] likelihoods = genotypeLikelihoods(site, contamination, errorRate, minorAlleleFraction);
        return likelihoods[genotype] / MathUtils.sum(likelihoods);
    }
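
    // Log likelihood of all pileups in a segment, marginalizing each site over the four sample genotypes.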
    private static double segmentLogLikelihood(final List<PileupSummary> segment, final double contamination, final double errorRate, final double minorAlleleFraction) {
        return segment.stream().mapToDouble(site -> Math.log(MathUtils.sum(genotypeLikelihoods(site, contamination, errorRate, minorAlleleFraction)))).sum();
    }

    private static double modelLogLikelihood(final List<List<PileupSummary>> segments, final double contamination, final double errorRate, final List<Double> mafs) {
        Utils.validate(segments.size() == mafs.size(), "Must have one MAF per segment");
        return new IndexRange(0, segments.size()).sum(n -> segmentLogLikelihood(segments.get(n), contamination, errorRate, mafs.get(n)));
    }
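
    // Restrict sites to those overlapping the given loci; PileupSummary is a Locatable, so an htsjdk
    // OverlapDetector applies directly.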
    private static List<PileupSummary> subsetSites(final List<PileupSummary> sites, final List<PileupSummary> subsetLoci) {
        final OverlapDetector<PileupSummary> od = OverlapDetector.create(subsetLoci);
        return sites.stream().filter(od::overlapsAny).collect(Collectors.toList());
    }
}