org.broadinstitute.hellbender.tools.walkers.validation.CalculateMixingFractions Maven / Gradle / Ivy
The newest version!
package org.broadinstitute.hellbender.tools.walkers.validation;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.*;
import org.apache.commons.lang3.mutable.MutableLong;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.utils.variant.VariantContextGetters;
import picard.cmdline.programgroups.VariantEvaluationProgramGroup;
import org.broadinstitute.hellbender.engine.*;
import org.broadinstitute.hellbender.utils.locusiterator.AlignmentStateMachine;
import org.broadinstitute.hellbender.utils.pileup.ReadPileup;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import java.io.File;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
/**
* Given a VCF of known variants from multiple samples, calculate how much each sample
* contributes to a pooled BAM. We first estimate each sample's mixing fraction as the average alt fraction
* in the bam of singleton hets belonging to that sample, then normalize these initial estimates to sum to one.
*
*
* In the CRSP sensitivity validation, we have a bam derived from a pool of 5, 10, or 20 samples
* and a vcf of all known variants in those samples. The pooled bam is a simulated tumor and
* the vcf of individual variants is our truth data. We annotate the truth data with the estimated
* allele fractions of each variant in the pooled bam in order to bin our results by "tumor" allele fraction.
* To estimate allele fractions we need to know the sample mixing fractions as an intermediate step.
*
*
*
* One could use this tool for generating precise truth data any time multiple samples with known ground
* truth are combined to simulate tumors or any situation in which low allele fractions occur.
*
*
* Example
*
*
* gatk --java-options "-Xmx4g" CalculateMixingFractions \
* -V input.vcf \
* -O output.table
*
*
* Created by David Benjamin on 1/30/17.
*/
@CommandLineProgramProperties(
summary = "Calculate proportions of different samples in a pooled bam by first estimating each sample's" +
" mixing fraction as the average alt fraction in the bam of singleton hets belonging to that sample, then" +
" normalizing these initial estimates to sum to one.",
oneLineSummary = "(Internal) Calculate proportions of different samples in a pooled bam",
programGroup = VariantEvaluationProgramGroup.class
)
@DocumentedFeature
public class CalculateMixingFractions extends VariantWalker {
@Argument(fullName= StandardArgumentDefinitions.OUTPUT_LONG_NAME,
shortName=StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
doc="The output table of samplemixing fractions",
optional=false)
private final File outputFile = null;
private final Map sampleCounts = new HashMap<>();
@Override
public void onTraversalStart() {
getHeaderForVariants().getGenotypeSamples().forEach(s -> sampleCounts.put(s, new AltAndTotalReadCounts()));
}
@Override
public void apply(final VariantContext vc, final ReadsContext readsContext, final ReferenceContext refContext, final FeatureContext fc) {
if (!isBiallelicSingletonHetSnp(vc)) {
return;
}
final Optional variantSample = StreamSupport.stream(vc.getGenotypes().spliterator(), false)
.filter(genotype -> genotype.isHet())
.map(genotype -> genotype.getSampleName())
.findFirst();
if (!variantSample.isPresent()) {
return;
}
final List reads = new ArrayList<>();
final List offsets = new ArrayList<>();
for (final GATKRead read : readsContext) {
if (read.failsVendorQualityCheck()) {
continue;
}
final AlignmentStateMachine asm = new AlignmentStateMachine(read);
while ( asm.stepForwardOnGenome() != null && asm.getGenomePosition() < vc.getStart()) {
}
if (asm.getGenomePosition() == vc.getStart()) {
reads.add(read);
offsets.add(asm.getReadOffset());
}
}
final ReadPileup pileup = new ReadPileup(vc, reads, offsets);
final byte altBase = vc.getAlternateAllele(0).getBases()[0];
final long altCount = StreamSupport.stream(pileup.spliterator(), false)
.filter(pe -> pe.getBase() == altBase)
.count();
final long totalCount = pileup.size();
sampleCounts.get(variantSample.get()).addCounts(altCount, totalCount);
}
@Override
public Object onTraversalSuccess() {
final double mixingFractionNormalizer = sampleCounts.values().stream()
.mapToDouble(AltAndTotalReadCounts::getAltFraction).sum();
final List mixingFractions = sampleCounts.entrySet().stream()
.map(e -> new MixingFraction(e.getKey(), e.getValue().getAltFraction() / mixingFractionNormalizer))
.collect(Collectors.toList());
MixingFraction.writeMixingFractions(mixingFractions, outputFile);
return "SUCCESS";
}
private class AltAndTotalReadCounts {
private final MutableLong altCount = new MutableLong(0);
private final MutableLong totalCount = new MutableLong(0);
public void addCounts(final long numAlts, final long numTotal) {
altCount.add(numAlts);
totalCount.add(numTotal);
}
public double getAltFraction() { return altCount.doubleValue() / totalCount.doubleValue(); }
}
private boolean isBiallelicSingletonHetSnp(final VariantContext vc) {
return vc.isBiallelic() && vc.isSNP()
&& ( (vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)
&& getArrayAttribute(vc, VCFConstants.ALLELE_COUNT_KEY)[0] == 1)
|| vc.getGenotypes().stream().filter(genotype -> genotype.isHet()).count() == 1);
}
private static double[] getArrayAttribute(final VariantContext vc, final String attribute) {
return VariantContextGetters.getAttributeAsDoubleArray(vc, attribute, () -> null, -1);
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy