package org.broadinstitute.hellbender.tools.walkers.contamination;

import org.apache.commons.lang3.tuple.Pair;
import org.apache.commons.math3.stat.descriptive.moment.Mean;
import org.apache.commons.math3.stat.descriptive.rank.Median;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.tools.walkers.mutect.filtering.FilterMutectCalls;
import picard.cmdline.programgroups.DiagnosticsAndQCProgramGroup;

import java.io.File;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

/**
 * <p>Calculates the fraction of reads coming from cross-sample contamination, given results from {@link GetPileupSummaries}.
 * The resulting contamination table is used with {@link FilterMutectCalls}.</p>
 *
 * <p>This tool is featured in the Somatic Short Mutation calling Best Practice Workflow.
 * See Tutorial#11136 for a step-by-step description of the workflow and Article#11127
 * for an overview of what traditional somatic calling entails. For the latest pipeline scripts, see the
 * Mutect2 WDL scripts directory.</p>
 *
 * <p>This tool borrows from ContEst by Cibulskis et al. the idea of estimating contamination
 * from ref reads at hom alt sites. However, ContEst uses a probabilistic model that assumes a diploid genotype with no copy number
 * variation and independent contaminating reads. That is, ContEst assumes that each contaminating read is drawn randomly and
 * independently from a different human. This tool uses a simpler estimate of contamination that relaxes these assumptions. In particular,
 * it works in the presence of copy number variations and with an arbitrary number of contaminating samples. In addition, this tool
 * is designed to work well with no matched normal data. However, one can run {@link GetPileupSummaries} on a matched normal bam file
 * and input the result to this tool. (A simplified sketch of the hom-alt estimate follows the class below.)</p>
 *
 * <h3>Usage examples</h3>
 *
 * <h4>Tumor-only mode</h4>
 *
 * <pre>
 * gatk CalculateContamination \
 *   -I pileups.table \
 *   -O contamination.table
 * </pre>
 *
 * <h4>Matched normal mode</h4>
 *
 * <pre>
 * gatk CalculateContamination \
 *   -I tumor-pileups.table \
 *   -matched normal-pileups.table \
 *   -O contamination.table
 * </pre>
 *
 * <p>The resulting table provides the fraction contamination, one line per sample, e.g. SampleID--TAB--Contamination.
 * The file has no header.</p>
 */
@CommandLineProgramProperties(
        summary = "Calculate the fraction of reads coming from cross-sample contamination",
        oneLineSummary = "Calculate the fraction of reads coming from cross-sample contamination",
        programGroup = DiagnosticsAndQCProgramGroup.class)
@DocumentedFeature
public class CalculateContamination extends CommandLineProgram {

    public static final Logger logger = LogManager.getLogger(CalculateContamination.class);

    private static final int MIN_COVERAGE = 10;
    private static final double DEFAULT_LOW_COVERAGE_RATIO_THRESHOLD = 1.0 / 2;
    private static final double DEFAULT_HIGH_COVERAGE_RATIO_THRESHOLD = 3.0;

    @Argument(fullName = StandardArgumentDefinitions.INPUT_LONG_NAME,
            shortName = StandardArgumentDefinitions.INPUT_SHORT_NAME,
            doc = "The input table")
    private File inputPileupSummariesTable;

    public static final String MATCHED_NORMAL_LONG_NAME = "matched-normal";
    public static final String MATCHED_NORMAL_SHORT_NAME = "matched";

    @Argument(fullName = MATCHED_NORMAL_LONG_NAME,
            shortName = MATCHED_NORMAL_SHORT_NAME,
            doc = "The matched normal input table", optional = true)
    private File matchedPileupSummariesTable = null;

    @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
            shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
            doc = "The output table")
    private final File outputTable = null;

    public static final String TUMOR_SEGMENTATION_LONG_NAME = "tumor-segmentation";
    public static final String TUMOR_SEGMENTATION_SHORT_NAME = "segments";

    @Argument(fullName = TUMOR_SEGMENTATION_LONG_NAME,
            shortName = TUMOR_SEGMENTATION_SHORT_NAME,
            doc = "The output table containing segmentation of the tumor by minor allele fraction", optional = true)
    private final File outputTumorSegmentation = null;

    public static final String LOW_COVERAGE_RATIO_THRESHOLD_NAME = "low-coverage-ratio-threshold";

    @Argument(fullName = LOW_COVERAGE_RATIO_THRESHOLD_NAME,
            doc = "The minimum coverage relative to the median.", optional = true)
    private final double lowCoverageRatioThreshold = DEFAULT_LOW_COVERAGE_RATIO_THRESHOLD;

    public static final String HIGH_COVERAGE_RATIO_THRESHOLD_NAME = "high-coverage-ratio-threshold";

    @Argument(fullName = HIGH_COVERAGE_RATIO_THRESHOLD_NAME,
            doc = "The maximum coverage relative to the mean.", optional = true)
    private final double highCoverageRatioThreshold = DEFAULT_HIGH_COVERAGE_RATIO_THRESHOLD;

    @Override
    public Object doWork() {
        final Pair<String, List<PileupSummary>> sampleAndsites = PileupSummary.readFromFile(inputPileupSummariesTable);
        final String sample = sampleAndsites.getLeft();
        final List<PileupSummary> sites = filterSitesByCoverage(sampleAndsites.getRight());

        // use the matched normal to genotype (i.e. find hom alt sites) if available
        final List<PileupSummary> genotypingSites = matchedPileupSummariesTable == null ? sites :
                filterSitesByCoverage(PileupSummary.readFromFile(matchedPileupSummariesTable).getRight());

        final ContaminationModel genotypingModel = new ContaminationModel(genotypingSites);

        if (outputTumorSegmentation != null) {
            final ContaminationModel tumorModel = matchedPileupSummariesTable == null ?
                    genotypingModel : new ContaminationModel(sites);
            MinorAlleleFractionRecord.writeToFile(sample, tumorModel.segmentationRecords(), outputTumorSegmentation);
        }

        final Pair<Double, Double> contaminationAndError = genotypingModel.calculateContaminationFromHoms(sites);
        ContaminationRecord.writeToFile(Arrays.asList(
                new ContaminationRecord(sample, contaminationAndError.getLeft(), contaminationAndError.getRight())), outputTable);

        return "SUCCESS";
    }

    private List<PileupSummary> filterSitesByCoverage(final List<PileupSummary> allSites) {
        // Just in case the intervals given to GetPileupSummaries contained un-covered sites, we remove them
        // so that a bunch of zeroes don't throw off the median coverage
        final List<PileupSummary> coveredSites = allSites.stream()
                .filter(s -> s.getTotalCount() > MIN_COVERAGE)
                .collect(Collectors.toList());

        final double[] coverage = coveredSites.stream().mapToDouble(PileupSummary::getTotalCount).toArray();
        final double medianCoverage = new Median().evaluate(coverage);
        final double meanCoverage = new Mean().evaluate(coverage);
        final double lowCoverageThreshold = medianCoverage * lowCoverageRatioThreshold;
        final double highCoverageThreshold = meanCoverage * highCoverageRatioThreshold;

        return coveredSites.stream()
                .filter(ps -> ps.getTotalCount() > lowCoverageThreshold && ps.getTotalCount() < highCoverageThreshold)
                .collect(Collectors.toList());
    }
}
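
The actual estimate lives in ContaminationModel.calculateContaminationFromHoms, which is not shown on this page. As a rough intuition for the hom-alt idea described in the Javadoc: at a site where the sample is homozygous for the alt allele, the sample itself contributes (almost) no ref reads, so any ref reads observed there are evidence of contamination, and a contaminating read drawn from the population is a ref read with probability equal to the population ref allele frequency. Below is a minimal, self-contained sketch of that first-order estimate. It is not GATK's model: the HomAltSite record and the numbers in main are hypothetical stand-ins, and the real ContaminationModel additionally handles sequencing error, copy-number segmentation, and the error bar it reports.

// Illustrative only: a back-of-the-envelope version of the hom-alt contamination
// estimate described in the class Javadoc. NOT the ContaminationModel used by GATK.
import java.util.List;

public class HomAltContaminationSketch {

    /** Hypothetical stand-in for a pileup row at a hom-alt site (not GATK's PileupSummary). */
    public record HomAltSite(int refCount, int totalCount, double refAlleleFrequency) {}

    /**
     * contamination ~= (observed ref reads at hom-alt sites)
     *                  / (ref reads expected if ALL reads came from contaminants),
     * where the denominator weights each site's depth by the population ref allele frequency.
     */
    public static double estimateContamination(final List<HomAltSite> homAltSites) {
        final double refReads = homAltSites.stream().mapToDouble(HomAltSite::refCount).sum();
        final double expectedRefReadsIfFullyContaminated = homAltSites.stream()
                .mapToDouble(s -> s.totalCount() * s.refAlleleFrequency())
                .sum();
        return refReads / expectedRefReadsIfFullyContaminated;
    }

    public static void main(String[] args) {
        // Two made-up hom-alt sites: e.g. 2 ref reads out of 100 where the ref allele frequency is 0.7.
        final List<HomAltSite> sites = List.of(
                new HomAltSite(2, 100, 0.7),
                new HomAltSite(1, 80, 0.5));
        System.out.printf("estimated contamination = %.4f%n", estimateContamination(sites));
        // prints: estimated contamination = 0.0273, i.e. roughly 2.7% contamination
    }
}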
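For downstream context, the contamination table written by this tool is consumed by FilterMutectCalls through its --contamination-table argument. A typical hand-off looks like the following; the file names are placeholders, and recent GATK versions also require the reference via -R:

gatk FilterMutectCalls \
  -R reference.fasta \
  -V unfiltered.vcf \
  --contamination-table contamination.table \
  -O filtered.vcf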



