package org.broadinstitute.hellbender.tools.walkers.contamination;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFHeader;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.CoverageAnalysisProgramGroup;
import org.broadinstitute.hellbender.engine.*;
import org.broadinstitute.hellbender.engine.filters.MappingQualityReadFilter;
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
import org.broadinstitute.hellbender.engine.filters.WellformedReadFilter;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.pileup.ReadPileup;
import org.broadinstitute.hellbender.utils.read.ReadUtils;
import org.broadinstitute.hellbender.utils.tsv.TableUtils;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
* Summarizes counts of reads that support reference, alternate and other alleles for given sites. Results can be used with {@link CalculateContamination}.
*
*
* The tool requires a common germline variant sites VCF, e.g. derived from the gnomAD resource, with population allele frequencies (AF) in the INFO field.
* This resource must contain only biallelic SNPs and can be an eight-column sites-only VCF.
* The tool ignores the filter status of the variant calls in this germline resource.
*
*
* This tool is featured in the Somatic Short Mutation calling Best Practice Workflow.
* See Tutorial#11136 for a
* step-by-step description of the workflow and Article#11127
* for an overview of what traditional somatic calling entails. For the latest pipeline scripts, see the
* Mutect2 WDL scripts directory.
* In particular, the mutect_resources.wdl script prepares a suitable resource from a larger dataset. An example excerpt is shown.
*
*
*
* #CHROM POS ID REF ALT QUAL FILTER INFO
* chr6 29942512 . G C 2974860 VQSRTrancheSNP99.80to99.90 AF=0.063
* chr6 29942517 . C A 2975860 VQSRTrancheSNP99.80to99.90 AF=0.062
* chr6 29942525 . G C 2975600 VQSRTrancheSNP99.60to99.80 AF=0.063
* chr6 29942547 rs114945359 G C 15667700 PASS AF=0.077
*
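* As a rough sketch (the exact commands in mutect_resources.wdl may differ), a resource of this form could be
* derived from a larger VCF with SelectVariants:
*
* gatk SelectVariants \
* -V gnomad.vcf.gz \
* --select-type-to-include SNP \
* --restrict-alleles-to BIALLELIC \
* -O common_biallelic.vcf.gz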
*
* Usage examples
*
*
* gatk GetPileupSummaries \
* -I tumor.bam \
* -V common_biallelic.vcf.gz \
* -L common_biallelic.vcf.gz \
* -O pileups.table
*
*
*
* gatk GetPileupSummaries \
* -I normal.bam \
* -V common_biallelic.vcf.gz \
* -L common_biallelic.vcf.gz \
* -O pileups.table
*
*
* Although the sites (-L) and variants (-V) resources will often be identical, this need not be the case. For example,
*
* gatk GetPileupSummaries \
* -I normal.bam \
* -V gnomad.vcf.gz \
* -L common_snps.interval_list \
* -O pileups.table
*
* attempts to get pileups at a list of common SNPs and emits output for those sites that are present in gnomAD, using the
* allele frequencies from gnomAD. Note that the sites may be a subset of the variants, the variants may be a subset of the sites,
* or they may overlap partially. In all cases pileup summaries are emitted for the overlap and nowhere else. The most common use
* case in which sites and variants differ is when the variants resource is a large file and the sites resource is an interval list subset from that file.
*
*
* GetPileupSummaries tabulates results into six columns as shown below.
* The alt_count and allele_frequency correspond to the ALT allele in the germline resource.
*
*
*
* contig position ref_count alt_count other_alt_count allele_frequency
* chr6 29942512 9 0 0 0.063
* chr6 29942517 13 1 0 0.062
* chr6 29942525 13 7 0 0.063
* chr6 29942547 36 0 0 0.077
*
*
*
* Note the default maximum population AF ({@code --maximum-population-allele-frequency} or {@code -max-af})
* is set to 0.2, which limits the sites the tool considers to those in the variants resource file whose
* AF is below 0.2. Likewise, the default minimum population AF ({@code --minimum-population-allele-frequency}
* or {@code -min-af}) is set to 0.01, which limits the sites the tool considers to those in the variants resource
* file whose AF is above 0.01.
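*
* For example (the cutoff values here are only illustrative), the defaults can be overridden on the command line:
*
* gatk GetPileupSummaries \
* -I tumor.bam \
* -V common_biallelic.vcf.gz \
* -L common_biallelic.vcf.gz \
* -min-af 0.05 \
* -max-af 0.5 \
* -O pileups.table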
*
*
*
* Finally, for those using mappers other than bwa mem or dragen-os, note that the {@code --minimum-mapping-quality}
* threshold defaults to 50, which limits the reads the tool considers when generating pileups. Certain mappers are
* known to assign scores below this threshold even to uniquely mapped reads. If your summary file contains only
* empty results, adjust the {@code --minimum-mapping-quality} parameter to suit your input files.
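*
* As a sketch, lowering the threshold (the value 20 here is only illustrative) might look like:
*
* gatk GetPileupSummaries \
* -I tumor.bam \
* -V common_biallelic.vcf.gz \
* -L common_biallelic.vcf.gz \
* --minimum-mapping-quality 20 \
* -O pileups.table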
*
*
*/
@CommandLineProgramProperties(
summary = "Tabulates pileup metrics for inferring contamination",
oneLineSummary = "Tabulates pileup metrics for inferring contamination",
programGroup = CoverageAnalysisProgramGroup.class)
@DocumentedFeature
public class GetPileupSummaries extends LocusWalker {
public static final String MAX_SITE_AF_LONG_NAME = "maximum-population-allele-frequency";
public static final String MIN_SITE_AF_LONG_NAME = "minimum-population-allele-frequency";
public static final String MAX_SITE_AF_SHORT_NAME = "max-af";
public static final String MIN_SITE_AF_SHORT_NAME = "min-af";
private static final double DEFAULT_MIN_POPULATION_AF = 0.01;
private static final double DEFAULT_MAX_POPULATION_AF = 0.2;
private static final int DEFAULT_MINIMUM_MAPPING_QUALITY = 50;
@Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
doc="The output table", optional=false)
private File outputTable;
@Argument(fullName = StandardArgumentDefinitions.VARIANT_LONG_NAME, shortName = StandardArgumentDefinitions.VARIANT_SHORT_NAME, doc = "A VCF file containing variants and allele frequencies")
public FeatureInput<VariantContext> variants;
@Argument(fullName = MIN_SITE_AF_LONG_NAME,
shortName = MIN_SITE_AF_SHORT_NAME,
doc = "Minimum population allele frequency of sites to consider. A low value increases accuracy at the expense of speed.", optional = true)
private double minPopulationAlleleFrequency = DEFAULT_MIN_POPULATION_AF;
@Argument(fullName = MAX_SITE_AF_LONG_NAME,
shortName = MAX_SITE_AF_SHORT_NAME,
doc = "Maximum population allele frequency of sites to consider.", optional = true)
private double maxPopulationAlleleFrequency = DEFAULT_MAX_POPULATION_AF;
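// Track whether the resource contained variants with and without an AF annotation, so traversal can fail
// at the end if no variant carried an allele frequency at all.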
private boolean sawVariantsWithoutAlleleFrequency = false;
private boolean sawVariantsWithAlleleFrequency = false;
PileupSummary.PileupSummaryTableWriter writer;
@Override
public boolean requiresReads() {
return true;
}
@Override
public boolean requiresReference() {
return false;
}
@Override
public boolean requiresIntervals() {
return true;
}
@Override
public boolean requiresFeatures() {
return true;
}
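// Restrict pileups to well-formed, mapped, non-duplicate primary alignments with good CIGARs, mates on the
// same contig (or no mapped mate), and mapping quality of at least DEFAULT_MINIMUM_MAPPING_QUALITY.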
@Override
public List<ReadFilter> getDefaultReadFilters() {
final List<ReadFilter> filters = new ArrayList<>();
filters.add(new MappingQualityReadFilter(DEFAULT_MINIMUM_MAPPING_QUALITY));
filters.add(ReadFilterLibrary.MAPPING_QUALITY_AVAILABLE);
filters.add(ReadFilterLibrary.MAPPING_QUALITY_NOT_ZERO);
filters.add(ReadFilterLibrary.MAPPED);
filters.add(ReadFilterLibrary.PRIMARY_LINE);
filters.add(ReadFilterLibrary.NOT_DUPLICATE);
filters.add(ReadFilterLibrary.PASSES_VENDOR_QUALITY_CHECK);
filters.add(ReadFilterLibrary.NON_ZERO_REFERENCE_LENGTH_ALIGNMENT);
filters.add(ReadFilterLibrary.MATE_ON_SAME_CONTIG_OR_NO_MAPPED_MATE);
filters.add(ReadFilterLibrary.GOOD_CIGAR);
filters.add(new WellformedReadFilter());
return filters;
}
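// Verify that the germline resource declares an AF INFO field, then open the output table writer and
// record the sample name taken from the read header.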
@Override
public void onTraversalStart() {
final boolean alleleFrequencyInHeader = ((VCFHeader) getHeaderForFeatures(variants)).getInfoHeaderLines().stream()
.anyMatch(line -> line.getID().equals(VCFConstants.ALLELE_FREQUENCY_KEY));
if (!alleleFrequencyInHeader) {
throw new UserException.BadInput("Population vcf does not have an allele frequency (AF) info field in its header.");
}
try {
writer = new PileupSummary.PileupSummaryTableWriter(IOUtils.fileToPath(outputTable));
final String sampleName = ReadUtils.getSamplesFromHeader(getHeaderForReads()).stream().findFirst().get();
writer.writeMetadata(TableUtils.SAMPLE_METADATA_TAG, sampleName);
} catch (IOException ex) {
throw new UserException.CouldNotCreateOutputFile(outputTable, ex);
}
}
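// At each locus, emit a pileup summary for the first overlapping variant, provided it is a biallelic SNP
// whose population allele frequency lies within the configured range.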
@Override
public void apply(AlignmentContext alignmentContext, ReferenceContext referenceContext, FeatureContext featureContext) {
final List<VariantContext> vcs = featureContext.getValues(variants);
if (vcs.isEmpty()) {
return;
}
final VariantContext vc = vcs.get(0);
if ( vc.isBiallelic() && vc.isSNP() && alleleFrequencyInRange(vc) ) {
final ReadPileup pileup = alignmentContext.getBasePileup();
try {
writer.writeRecord(new PileupSummary(vc, pileup));
} catch (final IOException ex) {
throw new UserException(String.format("Encountered an IO exception while writing to %s", outputTable));
}
}
}
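// Fail if the resource contained variants but none of them carried an AF annotation.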
@Override
public Object onTraversalSuccess() {
if (sawVariantsWithoutAlleleFrequency && !sawVariantsWithAlleleFrequency) {
throw new UserException.BadInput("No variants in population vcf had an allele frequency (AF) field.");
}
return "SUCCESS";
}
@Override
public void closeTool() {
try {
if (writer != null) {
writer.close();
}
} catch (IOException ex) {
throw new UserException(String.format("Encountered an IO exception while closing %s", outputTable));
}
}
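// A site is usable only if it has an AF annotation whose value lies strictly between the configured
// minimum and maximum population allele frequencies. Warns once if a variant lacks the AF field.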
private boolean alleleFrequencyInRange(final VariantContext vc) {
if (!vc.hasAttribute(VCFConstants.ALLELE_FREQUENCY_KEY)) {
if (!sawVariantsWithoutAlleleFrequency) {
logger.warn(String.format("Variant context at %s:%d lacks allele frequency (AF) field.", vc.getContig(), vc.getStart()));
sawVariantsWithoutAlleleFrequency = true;
}
return false;
} else {
sawVariantsWithAlleleFrequency = true;
final double alleleFrequency = vc.getAttributeAsDouble(VCFConstants.ALLELE_FREQUENCY_KEY, -1.0);
return minPopulationAlleleFrequency < alleleFrequency && alleleFrequency < maxPopulationAlleleFrequency;
}
}
}