org.broadinstitute.hellbender.tools.copynumber.models.AlleleFractionModeller Maven / Gradle / Ivy
package org.broadinstitute.hellbender.tools.copynumber.models;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AllelicCountCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.ParameterDecileCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleIntervalCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleLocatableMetadata;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SimpleSampleMetadata;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.ModeledSegment;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.mcmc.DecileCollection;
import org.broadinstitute.hellbender.utils.mcmc.GibbsSampler;
import org.broadinstitute.hellbender.utils.mcmc.ParameterSampler;
import org.broadinstitute.hellbender.utils.mcmc.ParameterizedModel;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
/**
* Given segments and counts of alt and ref reads over a list of het sites,
* infers the minor-allele fraction of each segment. For example, a segment
* with (alt,ref) counts (10,90), (11,93), (88,12), (90,10) probably has a minor-allele fraction
* somewhere around 0.1. The model takes into account allelic reference bias due to mapping etc. by learning
* a global gamma distribution on allelic bias ratios.
*
*
* We define the bias ratio of each het locus to be the expected ratio of
* mapped ref reads to mapped alt reads given equal amounts of DNA (that is, given
* a germline het). The model learns a common gamma distribution:
* bias ratio ~ Gamma(alpha = mu^2 / sigma^2, beta = mu / sigma^2)
* where mu and sigma^2 are the global mean and variance of bias ratios, and
* alpha, beta are the natural parameters of the gamma distribution.
*
*
* Each segment has a minor-allele fraction f, and for each het within the locus
* the number of alt reads is drawn from a binomial distribution with total count
* n = #alt reads + #ref reads and alt probability f / (f + (1 - f) * bias ratio) if the
* locus is alt minor and (1 - f) / (1 - f + f * bias ratio) if the locus is ref minor.
* We also allow a prior on minor-allele fraction to be specified by the alpha parameter of the 4-parameter
* beta distribution Beta(alpha, 1, 0, 1/2).
*
*
* Conceptually, the model contains latent variables corresponding to the bias ratio
* and indicators for alt minor/ref minor at each het locus. However, we integrate them
* out and the MCMC model below only contains the minor-allele fractions and
* the three hyperparameters of the model: the two parameters of the gamma distribution
* along with the global outlier probability.
*
*
* See docs/CNV/archived/archived-CNV-methods.pdf for a thorough description of the model.
*
* @author David Benjamin <[email protected]>
* @author Samuel Lee <[email protected]>
*/
public final class AlleleFractionModeller {
private static final double MAX_REASONABLE_MEAN_BIAS = AlleleFractionInitializer.MAX_REASONABLE_MEAN_BIAS;
private static final double MAX_REASONABLE_BIAS_VARIANCE = AlleleFractionInitializer.MAX_REASONABLE_BIAS_VARIANCE;
private static final double MAX_REASONABLE_OUTLIER_PROBABILITY = AlleleFractionInitializer.MAX_REASONABLE_OUTLIER_PROBABILITY;
private static final double MIN_MINOR_FRACTION_SAMPLING_WIDTH = 1E-3;
private final SampleLocatableMetadata metadata;
private final ParameterizedModel model;
private final List meanBiasSamples = new ArrayList<>();
private final List biasVarianceSamples = new ArrayList<>();
private final List outlierProbabilitySamples = new ArrayList<>();
private final List minorFractionsSamples = new ArrayList<>();
/**
* Constructs an allele-fraction model given allelic counts and segments.
* {@link AlleleFractionInitializer} is used for initialization and slice-sampling widths are estimated.
*/
AlleleFractionModeller(final AllelicCountCollection allelicCounts,
final SimpleIntervalCollection segments,
final AlleleFractionPrior prior) {
Utils.nonNull(allelicCounts);
Utils.nonNull(segments);
Utils.validateArg(allelicCounts.getMetadata().getSequenceDictionary().equals(segments.getMetadata().getSequenceDictionary()),
"Metadata of the allelic counts and the segments do not match.");
Utils.nonNull(prior);
metadata = allelicCounts.getMetadata();
final AlleleFractionSegmentedData data = new AlleleFractionSegmentedData(allelicCounts, segments);
//initialization gets us to the mode of the likelihood
final AlleleFractionState initialState = new AlleleFractionInitializer(data).getInitializedState();
final AlleleFractionGlobalParameters initialParameters = initialState.globalParameters();
final AlleleFractionState.MinorFractions initialMinorFractions = initialState.minorFractions();
//if we approximate conditionals as normal, we can guess the width from the curvature at the mode and use as the slice-sampling widths
final double meanBiasSamplingWidths = approximatePosteriorWidthAtMode(meanBias ->
AlleleFractionLikelihoods.logLikelihood(initialParameters.copyWithNewMeanBias(meanBias), initialMinorFractions, data), initialParameters.getMeanBias());
final double biasVarianceSamplingWidths = approximatePosteriorWidthAtMode(biasVariance ->
AlleleFractionLikelihoods.logLikelihood(initialParameters.copyWithNewBiasVariance(biasVariance), initialMinorFractions, data), initialParameters.getBiasVariance());
final double outlierProbabilitySamplingWidths = approximatePosteriorWidthAtMode(outlierProbability ->
AlleleFractionLikelihoods.logLikelihood(initialParameters.copyWithNewOutlierProbability(outlierProbability), initialMinorFractions, data), initialParameters.getOutlierProbability());
final List minorFractionsSliceSamplingWidths = IntStream.range(0, data.getNumSegments()).boxed()
.map(segment -> approximatePosteriorWidthAtMode(
f -> AlleleFractionLikelihoods.segmentLogLikelihood(initialParameters, f, data.getIndexedAllelicCountsInSegment(segment)), initialMinorFractions.get(segment)))
.map(w -> Math.max(w, MIN_MINOR_FRACTION_SAMPLING_WIDTH))
.collect(Collectors.toList());
final ParameterSampler meanBiasSampler =
new AlleleFractionSamplers.MeanBiasSampler(MAX_REASONABLE_MEAN_BIAS, meanBiasSamplingWidths);
final ParameterSampler biasVarianceSampler =
new AlleleFractionSamplers.BiasVarianceSampler(MAX_REASONABLE_BIAS_VARIANCE, biasVarianceSamplingWidths);
final ParameterSampler outlierProbabilitySampler =
new AlleleFractionSamplers.OutlierProbabilitySampler(MAX_REASONABLE_OUTLIER_PROBABILITY, outlierProbabilitySamplingWidths);
final ParameterSampler minorFractionsSampler =
new AlleleFractionSamplers.MinorFractionsSampler(prior, minorFractionsSliceSamplingWidths);
model = new ParameterizedModel.GibbsBuilder<>(initialState, data)
.addParameterSampler(AlleleFractionParameter.MEAN_BIAS, meanBiasSampler, Double.class)
.addParameterSampler(AlleleFractionParameter.BIAS_VARIANCE, biasVarianceSampler, Double.class)
.addParameterSampler(AlleleFractionParameter.OUTLIER_PROBABILITY, outlierProbabilitySampler, Double.class)
.addParameterSampler(AlleleFractionParameter.MINOR_ALLELE_FRACTIONS, minorFractionsSampler, AlleleFractionState.MinorFractions.class)
.build();
}
/**
* Adds {@code numSamples - numBurnIn} Markov-Chain Monte-Carlo samples of the parameter posteriors (generated using
* Gibbs sampling) to the collections held internally. The current {@link AlleleFractionState} held internally is used
* to initialize the Markov Chain.
* @param numSamples total number of samples per posterior
* @param numBurnIn number of burn-in samples to discard
*/
void fitMCMC(final int numSamples, final int numBurnIn) {
//run MCMC
final GibbsSampler gibbsSampler = new GibbsSampler<>(numSamples, model);
gibbsSampler.runMCMC();
//update posterior samples
meanBiasSamples.addAll(gibbsSampler.getSamples(AlleleFractionParameter.MEAN_BIAS, Double.class, numBurnIn));
biasVarianceSamples.addAll(gibbsSampler.getSamples(AlleleFractionParameter.BIAS_VARIANCE, Double.class, numBurnIn));
outlierProbabilitySamples.addAll(gibbsSampler.getSamples(AlleleFractionParameter.OUTLIER_PROBABILITY, Double.class, numBurnIn));
minorFractionsSamples.addAll(gibbsSampler.getSamples(AlleleFractionParameter.MINOR_ALLELE_FRACTIONS, AlleleFractionState.MinorFractions.class, numBurnIn));
}
List getMeanBiasSamples() {
return Collections.unmodifiableList(meanBiasSamples);
}
List getBiasVarianceSamples() {
return Collections.unmodifiableList(biasVarianceSamples);
}
List getOutlierProbabilitySamples() {
return Collections.unmodifiableList(outlierProbabilitySamples);
}
List getMinorFractionsSamples() {
return Collections.unmodifiableList(minorFractionsSamples);
}
/**
* Should only be called after {@link #fitMCMC} has been called.
*/
List getMinorAlleleFractionsPosteriorSummaries() {
if (minorFractionsSamples.isEmpty()) {
throw new IllegalStateException("Attempted to get posterior summaries for minor-allele fractions before MCMC was performed.");
}
final int numSegments = minorFractionsSamples.get(0).size();
final List posteriorSummaries = new ArrayList<>(numSegments);
for (int segmentIndex = 0; segmentIndex < numSegments; segmentIndex++) {
final int j = segmentIndex;
final List minorFractionSamples =
minorFractionsSamples.stream().map(s -> s.get(j)).collect(Collectors.toList());
posteriorSummaries.add(new ModeledSegment.SimplePosteriorSummary(minorFractionSamples));
}
return posteriorSummaries;
}
/**
* Should only be called after {@link #fitMCMC} has been called.
*/
ParameterDecileCollection getGlobalParameterDeciles() {
if (meanBiasSamples.isEmpty()) {
throw new IllegalStateException("Attempted to get posterior summaries for global parameters before MCMC was performed.");
}
final Map parameterToDecilesMap = new LinkedHashMap<>();
parameterToDecilesMap.put(AlleleFractionParameter.MEAN_BIAS, new DecileCollection(meanBiasSamples));
parameterToDecilesMap.put(AlleleFractionParameter.BIAS_VARIANCE, new DecileCollection(biasVarianceSamples));
parameterToDecilesMap.put(AlleleFractionParameter.OUTLIER_PROBABILITY, new DecileCollection(outlierProbabilitySamples));
return new ParameterDecileCollection<>(new SimpleSampleMetadata(metadata.getSampleName()), parameterToDecilesMap, AlleleFractionParameter.class);
}
//use width of a probability distribution given the position of its mode (estimated from Gaussian approximation) as step size
private static double approximatePosteriorWidthAtMode(final Function logPDF,
final double mode) {
final double absMode = Math.abs(mode);
final double epsilon = Math.min(1E-6, absMode / 2); //adjust scale if mode is very near zero
final double defaultWidth = absMode / 10; //if "mode" is not close to true mode of logPDF, approximation may not apply; just use 1 / 10 of absMode in this case
final double secondDerivative = (logPDF.apply(mode + epsilon) - 2 * logPDF.apply(mode) + logPDF.apply(mode - epsilon)) / (epsilon * epsilon);
return secondDerivative < 0 ? Math.sqrt(-1.0 / secondDerivative) : defaultWidth;
}
}