package org.broadinstitute.hellbender.tools.walkers.coverage;

import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.util.Locatable;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.Advanced;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.barclay.argparser.BetaFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.CoverageAnalysisProgramGroup;
import org.broadinstitute.hellbender.engine.*;
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
import org.broadinstitute.hellbender.engine.filters.WellformedReadFilter;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.BaseUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.codecs.refseq.RefSeqFeature;
import org.broadinstitute.hellbender.utils.read.ReadUtils;

import java.io.IOException;
import java.util.*;

/**
 * Assess sequence coverage by a wide array of metrics, partitioned by sample, read group, or library
 *
 * <p>This tool processes a set of bam files to determine coverage at different levels of partitioning and
 * aggregation. Coverage can be analyzed per locus, per interval, per gene, or in total; can be partitioned by
 * sample, by read group, by technology, by center, or by library; and can be summarized by mean, median, quartiles,
 * and/or percentage of bases covered to or beyond a threshold. Additionally, reads and bases can be filtered by
 * mapping or base quality score.</p>
 *
 * <h3>Input</h3>
 * <ul>
 *     <li>One or more bam files (with proper headers) to be analyzed for coverage statistics</li>
 *     <li>(Optional) A REFSEQ file to aggregate coverage to the gene level (for information about creating the
 *         REFSEQ file, please consult the online documentation)</li>
 * </ul>
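 *
 * <p>As an illustration, the -I argument in the usage example below points at a .list file, which in
 * GATK convention names one input bam per line. A minimal sketch (these paths are hypothetical):</p>
 * <pre>
 * /data/project/sampleA.bam
 * /data/project/sampleB.bam
 * /data/project/sampleC.bam
 * </pre>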
 *
 * <h3>Output</h3>
 * <p>Tables pertaining to different coverage summaries. The suffix on each table file declares its contents:</p>
 * <ul>
 *     <li>no suffix: per locus coverage</li>
 *     <li>_summary: total, mean, median, quartiles, and threshold proportions, aggregated over all bases</li>
 *     <li>_statistics: coverage histograms (# locus with X coverage), aggregated over all bases</li>
 *     <li>_interval_summary: total, mean, median, quartiles, and threshold proportions, aggregated per interval</li>
 *     <li>_interval_statistics: 2x2 table of # of intervals covered to >= X depth in >= Y samples</li>
 *     <li>_gene_summary: total, mean, median, quartiles, and threshold proportions, aggregated per gene</li>
 *     <li>_gene_statistics: 2x2 table of # of genes covered to >= X depth in >= Y samples. In its current
 *         incarnation it will not include genes that are not at least partially covered
 *         (see --omit-genes-not-entirely-covered-by-traversal for details)</li>
 *     <li>_cumulative_coverage_counts: coverage histograms (# locus with >= X coverage), aggregated over all bases</li>
 *     <li>_cumulative_coverage_proportions: proportions of loci with >= X coverage, aggregated over all bases</li>
 * </ul>
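 *
 * <p>As a sketch of what to expect on disk, a run with -O my_doc produces a per-locus table plus files
 * carrying the suffixes above; the exact names also encode the partition type, so the listing below is
 * illustrative rather than exact:</p>
 * <pre>
 * my_doc                       per-locus coverage
 * my_doc_summary               summary statistics aggregated over all bases
 * my_doc_interval_summary      per-interval summary statistics (when -L is given)
 * </pre>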
 *
 * <h3>Notes</h3>
 * <ul>
 *     <li>DepthOfCoverage currently only supports typical nucleotide (and N) bases; IUPAC ambiguity codes or
 *         other non-ATCGN bases will cause exceptions</li>
 *     <li>Read filters are applied to the reads before they are counted in coverage information. By default,
 *         duplicate-marked and non-primary alignments are not counted. This can be disabled with
 *         --disable-tool-default-read-filters.</li>
 *     <li>In order to filter reads out by their mapping qualities, the recommended approach is to use the
 *         MappingQualityReadFilter with the --minimum-mapping-quality or --maximum-mapping-quality arguments
 *         specified (see the example after this list)</li>
 * </ul>
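 *
 * <p>For instance, a minimal sketch of restricting counted reads by mapping quality; the threshold of 20
 * is an arbitrary choice for illustration:</p>
 * <pre>
 * gatk DepthOfCoverage \
 *   -R reference.fasta \
 *   -I input.bam \
 *   -O file_name_base \
 *   --read-filter MappingQualityReadFilter \
 *   --minimum-mapping-quality 20
 * </pre>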
 *
 * <h3>Usage example</h3>
 * <pre>
 * gatk \
 *   DepthOfCoverage \
 *   -R reference.fasta \
 *   -O file_name_base \
 *   -I input_bams.list \
 *   [-gene-list refSeq.sorted.refseq] \
 *   [-partition-type readgroup] \
 *   [--summary-coverage-threshold 4 --summary-coverage-threshold 6 --summary-coverage-threshold 10] \
 *   [-L my_capture_genes.interval_list]
 * </pre>
 */
@CommandLineProgramProperties(
        summary = "Generate coverage summary information for reads data",
        oneLineSummary = "Generate coverage summary information for reads data",
        programGroup = CoverageAnalysisProgramGroup.class)
@BetaFeature
@DocumentedFeature
public class DepthOfCoverage extends LocusWalkerByInterval {
    private CoverageOutputWriter writer;

    // Map used to store running aggregate counts for intervals that are being recorded by DepthOfCoverage
    private Map<Locatable, DepthOfCoveragePartitionedDataStore> activeCoveragePartitioner = new HashMap<>();

    // Map used to store target tables for each partition which are used in the computation of median coverage scores
    private Map<DoCOutputType.Partition, int[][]> perIntervalStatisticsAggregationByPartitioning = new HashMap<>();

    // Map used to store target tables for each partition which are used in the computation of median coverage scores
    private Map<DoCOutputType.Partition, int[][]> perGeneStatisticsAggregationByPartitioning = new HashMap<>();

    // PartitionDataStore corresponding to every base traversed by the tool
    private DepthOfCoveragePartitionedDataStore coverageTotalsForEntireTraversal;

    // List of all of the samples to be output split by the partition type
    private Map<DoCOutputType.Partition, List<String>> globalIdentifierMap;

    /**
     * Base file name to which to write the coverage information
     */
    @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
            shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
            doc = "Base file location to which to write coverage summary information, must be a path to a file")
    private String baseFileName = null;

    /**
     * Bases with quality scores lower than this threshold will be skipped. The default value is 0, so no
     * bases are skipped by default.
     */
    @Argument(fullName = "min-base-quality",
            doc = "Minimum quality of bases to count towards depth",
            optional = true, minValue = 0, maxValue = Byte.MAX_VALUE)
    private byte minBaseQuality = 0;

    /**
     * Bases with quality scores higher than this threshold will be skipped. The default value is the largest
     * number that can be represented as a byte.
     */
    @Argument(fullName = "max-base-quality",
            doc = "Maximum quality of bases to count towards depth",
            optional = true, minValue = 0, maxValue = Byte.MAX_VALUE)
    private byte maxBaseQuality = Byte.MAX_VALUE;

    @Argument(fullName = "count-type",
            doc = "How should overlapping reads from the same fragment be handled? NOTE: currently only COUNT_READS is supported",
            optional = true)
    private CoverageUtils.CountPileupType countType = CoverageUtils.CountPileupType.COUNT_READS;

    /**
     * Instead of reporting depth, the program will report the base pileup at each locus
     */
    @Argument(fullName = "print-base-counts",
            doc = "Add base counts to per-locus output",
            optional = true)
    private boolean printBaseCounts = false;

    /**
     * Disabling the tabulation of locus statistics (# loci covered by sample by coverage) should speed up processing.
     */
    @Argument(fullName = "omit-locus-table",
            doc = "Do not calculate per-sample per-depth counts of loci",
            optional = true)
    private boolean omitLocusTable = false;

    /**
     * Disabling the tabulation of interval statistics (mean, median, quartiles AND # intervals by sample by
     * coverage) should speed up processing.
     */
    @Argument(fullName = "omit-interval-statistics",
            doc = "Do not calculate per-interval statistics",
            optional = true, mutex = "calculate-coverage-over-genes")
    private boolean omitIntervals = false;

    /**
     * Disabling the tabulation of total coverage at every base should speed up processing.
     */
    @Argument(fullName = "omit-depth-output-at-each-base",
            doc = "Do not output depth of coverage at each base",
            optional = true)
    private boolean omitDepthOutput = false;

    /**
     * This option simply disables writing separate files for per-sample summary statistics (total, mean, median,
     * quartile coverage per sample). These statistics are still calculated internally, so enabling this option
     * will not improve runtime.
     */
    @Argument(fullName = "omit-per-sample-statistics",
            doc = "Do not output the summary files per-sample",
            optional = true)
    private boolean omitSampleSummary = false;

    /**
     * Specify a RefSeq file for use in aggregating coverage statistics over genes.
     *
     * <p>This argument is incompatible with --omit-interval-statistics. A warning will be logged and no output
     * file will be produced for the gene list if these arguments are enabled together.</p>
     */
    @Argument(fullName = "calculate-coverage-over-genes",
            shortName = "gene-list",
            doc = "Calculate coverage statistics over this list of genes",
            optional = true, mutex = "omit-interval-statistics")
    private List<String> refSeqGeneListFiles = new ArrayList<>();

    /**
     * Remove genes from the gene summary output file if not all of their exon bases were covered by the traversal.
     */
    @Argument(fullName = "omit-genes-not-entirely-covered-by-traversal",
            doc = "Do not output gene summary if it was not completely covered by traversal intervals",
            optional = true)
    private boolean omitPartiallyCoveredGenes = false;

    /**
     * Output file format (e.g. csv, table, rtable); defaults to r-readable table.
     */
    @Argument(fullName = "output-format",
            doc = "The format of the output file",
            optional = true)
    CoverageOutputWriter.DEPTH_OF_COVERAGE_OUTPUT_FORMAT outputFormat = CoverageOutputWriter.DEPTH_OF_COVERAGE_OUTPUT_FORMAT.CSV;

    // ---------------------------------------------------------------------------
    //
    // Advanced arguments
    //
    // ---------------------------------------------------------------------------

    /**
     * Normally, sites where the reference is N (or another non-canonical base) are skipped. If this option is
     * enabled, these sites will be included in DoC calculations if there is coverage from neighboring reads.
     */
    @Advanced
    @Argument(fullName = "include-ref-n-sites",
            doc = "Include sites where the reference is N",
            optional = true)
    private boolean includeRefNBases = false;

    /**
     * Sets the low-coverage cutoff for granular binning. All loci with depth < START are counted in the first bin.
     */
    @Advanced
    @Argument(fullName = "start",
            doc = "Starting (left endpoint) for granular binning",
            optional = true, minValue = 0)
    private int start = 1;

    /**
     * Sets the high-coverage cutoff for granular binning. All loci with depth > STOP are counted in the last bin.
     */
    @Advanced
    @Argument(fullName = "stop",
            doc = "Ending (right endpoint) for granular binning",
            optional = true, minValue = 1)
    private int stop = 500;

    /**
     * Sets the number of bins for granular binning
     */
    @Advanced
    @Argument(fullName = "nBins",
            doc = "Number of bins to use for granular binning",
            optional = true, minValue = 0, minRecommendedValue = 1)
    private int nBins = 499;

    /**
     * By default, coverage is partitioned by sample, but it can be any combination of sample, readgroup and/or library.
     */
    @Argument(fullName = "partition-type",
            shortName = "pt",
            doc = "Partition type for depth of coverage",
            optional = true)
    private EnumSet<DoCOutputType.Partition> partitionTypes = EnumSet.of(DoCOutputType.Partition.sample);

    /**
     * Consider a spanning deletion as contributing to coverage. Also enables deletion counts in per-base output.
     */
    @Advanced
    @Argument(fullName = "include-deletions",
            doc = "Include information on deletions alongside other bases in output table counts",
            optional = true)
    private boolean includeDeletions = false;

    @Advanced
    @Argument(fullName = "ignore-deletion-sites",
            doc = "Ignore sites consisting only of deletions",
            optional = true)
    boolean ignoreDeletionSites = false;

    /**
     * For summary file outputs, report the percentage of bases covered to an amount equal to or greater than this
     * number (e.g. % bases >= CT for each sample). Defaults to 15; can take multiple arguments.
     */
    @Advanced
    @Argument(fullName = "summary-coverage-threshold",
            doc = "Coverage threshold (in percent) for summarizing statistics",
            optional = true)
    private List<Integer> coverageThresholds = new ArrayList<>(Collections.singleton(15));

    // We explicitly handle read N bases in the code
    @Override
    public boolean includeNs() {
        return true;
    }

    // We want to make sure to still generate coverage information over uncovered bases.
    @Override
    public boolean emitEmptyLoci() {
        return true;
    }

    @Override
    public boolean includeDeletions() {
        return includeDeletions && !ignoreDeletionSites;
    }

    // For now this tool requires a reference
    @Override
    public boolean requiresReference() {
        return true;
    }

    @Override
    public List<ReadFilter> getDefaultReadFilters() {
        final List<ReadFilter> defaultFilters = new ArrayList<>(2);
        defaultFilters.add(new WellformedReadFilter());
        defaultFilters.add(new ReadFilterLibrary.NotDuplicateReadFilter());
        defaultFilters.add(ReadFilterLibrary.NOT_SECONDARY_ALIGNMENT);
        defaultFilters.add(new ReadFilterLibrary.MappedReadFilter());
        return defaultFilters;
    }

    @Override
    public void onTraversalStart() {
        try {
            writer = new CoverageOutputWriter(outputFormat,
                    partitionTypes,
                    baseFileName,
                    !refSeqGeneListFiles.isEmpty(),
                    printBaseCounts,
                    omitDepthOutput,
                    omitIntervals,
                    omitSampleSummary,
                    omitLocusTable,
                    coverageThresholds);
        } catch (IOException e) {
            throw new UserException.CouldNotCreateOutputFile("Couldn't create output file, encountered exception: " + e.getMessage(), e);
        }

        globalIdentifierMap = makeGlobalIdentifierMap(partitionTypes);
        writer.writeCoverageOutputHeaders(globalIdentifierMap);
        ReadUtils.getSamplesFromHeader(getHeaderForReads());

        coverageTotalsForEntireTraversal = createCoveragePartitioner();
    }

    /**
     * @param alignmentContext current alignment context
     * @param referenceContext Reference bases spanning the current locus. Will be an empty, but non-null, context object
     *                         if there is no backing source of reference data (in which case all queries on it will return
     *                         an empty array/iterator). Can request extra bases of context around the current locus
     *                         by invoking {@link ReferenceContext#setWindow} on this object before calling {@link ReferenceContext#getBases}
     * @param featureContext   Features spanning the current locus. Will be an empty, but non-null, context object
     *                         if there is no backing source of Feature data (in which case all queries on it will return an
     *                         empty List)
     * @param activeIntervals  Intervals being tracked by the walker that overlap the current locus
     */
    @Override
    public void apply(AlignmentContext alignmentContext, ReferenceContext referenceContext, FeatureContext featureContext, Set<Locatable> activeIntervals) {
        // TODO evaluate consequences of supporting nonexistent references
        if (includeRefNBases || (hasReference() && BaseUtils.isRegularBase(referenceContext.getBase()))) {
            final Map<DoCOutputType.Partition, Map<String, int[]>> countsByPartition =
                    CoverageUtils.getBaseCountsByPartition(alignmentContext, minBaseQuality, maxBaseQuality, countType, partitionTypes, getHeaderForReads());

            if (!omitDepthOutput) {
                writer.writePerLocusDepthSummary(referenceContext.getInterval(), countsByPartition, globalIdentifierMap, includeDeletions);
            }

            // Update the traversing partitioners with this locus data:
            coverageTotalsForEntireTraversal.addLocusData(countsByPartition);

            // Update all of the active intervals that we are tracking separately with the generated counts
            for (Locatable loc : activeCoveragePartitioner.keySet()) {
                // For genes, we don't want to update the interval for non-exon bases
                if (loc.contains(alignmentContext)) {
                    activeCoveragePartitioner.get(loc).addLocusData(countsByPartition);
                }
            }
        }
    }

    /**
     * Ensure that we are tracking the newly provided interval with per-locus data
     *
     * @param activeInterval Locatable provided to the walker to be initialized
     */
    public void onIntervalStart(Locatable activeInterval) {
        DepthOfCoveragePartitionedDataStore partitioner = createCoveragePartitioner();
        // NOTE: we don't populate perIntervalStatisticsAggregationByPartitioning here because that gets populated on the fly later
        activeCoveragePartitioner.put(activeInterval, partitioner);
    }

    /**
     * When this method is called, we expect at some point in the past onIntervalStart() was called on the same Locatable
     * object and that we have been continuously tracking and adding per-base information for each base in the pileup
     * that overlaps those objects. Given the interval being closed out, we perform one of the following tasks:
     *
     *  - Write out per-interval summary information to each of the partition interval summary files and update the
     *    target summary table corresponding to each track.
     *
     *  - If it corresponded to a gene interval, write out to the gene summary tables the line corresponding to the
     *    coverage over targets contained within the gene's exome.
     *
     * @param activeInterval the interval being closed out
     */
    public void onIntervalEnd(final Locatable activeInterval) {
        final DepthOfCoveragePartitionedDataStore partitionerToRemove = activeCoveragePartitioner.remove(activeInterval);

        if (activeInterval instanceof SimpleInterval) {
            // For each partition type we are managing, make sure we update the active statistics table
            if (!omitIntervals) {
                for (DoCOutputType.Partition p : partitionTypes) {
                    // Write the per-interval depth information as necessary
                    final DepthOfCoverageStats coverageByAggregationPartitionType = partitionerToRemove.getCoverageByAggregationType(p);
                    writer.writePerIntervalDepthInformation(p, (SimpleInterval) activeInterval, coverageByAggregationPartitionType, globalIdentifierMap.get(p));

                    // Create a new table if necessary
                    if (!perIntervalStatisticsAggregationByPartitioning.containsKey(p)) {
                        perIntervalStatisticsAggregationByPartitioning.put(p,
                                new int[coverageByAggregationPartitionType.getHistograms().size()][coverageByAggregationPartitionType.getEndpoints().length + 1]);
                    }

                    // Update the target table to reflect the updated coverage information for this target
                    CoverageUtils.updateTargetTable(perIntervalStatisticsAggregationByPartitioning.get(p), coverageByAggregationPartitionType);
                }
            }
        } else if (activeInterval instanceof RefSeqFeature) {
            DepthOfCoverageStats coverageBySample = partitionerToRemove.getCoverageByAggregationType(DoCOutputType.Partition.sample);
            if (!omitPartiallyCoveredGenes || ((RefSeqFeature) activeInterval).getTotalExonLength() <= coverageBySample.getNumberOfLociCovered()) {
                writer.writePerGeneDepthInformation((RefSeqFeature) activeInterval, coverageBySample, globalIdentifierMap.get(DoCOutputType.Partition.sample));

                final DepthOfCoverageStats coverageByAggregationPartitionType = partitionerToRemove.getCoverageByAggregationType(DoCOutputType.Partition.sample);

                // Create a new table if necessary
                if (!perGeneStatisticsAggregationByPartitioning.containsKey(DoCOutputType.Partition.sample)) {
                    perGeneStatisticsAggregationByPartitioning.put(DoCOutputType.Partition.sample,
                            new int[coverageByAggregationPartitionType.getHistograms().size()][coverageByAggregationPartitionType.getEndpoints().length + 1]);
                }

                // Update the target table to reflect the updated coverage information for this target
                CoverageUtils.updateTargetTable(perGeneStatisticsAggregationByPartitioning.get(DoCOutputType.Partition.sample), coverageByAggregationPartitionType);
            }
        } else {
            throw new GATKException("Unrecognized Locatable object supplied for traversal, only RefSeqFeature and SimpleInterval are supported: " + activeInterval.toString());
        }
    }

    /**
     * When we finish traversal we should have the following held in state:
     * 1. {@link #perIntervalStatisticsAggregationByPartitioning} contains a map from partition to a doubly indexed
     *    integer array that corresponds to a count of the number of data sources with at least the given depth.
     *    Importantly this is expected to have had CoverageUtils.updateTargetTable() called on it once per interval.
     *
     * 2. {@link #coverageTotalsForEntireTraversal}, which we expect to have had update() called on it exactly once for
     *    every locus traversed by the locus walker. Consequently we expect the partitioner to contain partitioned
     *    counts corresponding to the entire window covered by the tool.
     */
    @Override
    public Object onTraversalSuccess() {
        // Write out accumulated interval summary statistics
        for (DoCOutputType.Partition partition : perIntervalStatisticsAggregationByPartitioning.keySet()) {
            writer.writeOutputIntervalStatistics(partition, perIntervalStatisticsAggregationByPartitioning.get(partition),
                    CoverageUtils.calculateCoverageHistogramBinEndpoints(start, stop, nBins));
        }

        // Write out accumulated gene summary statistics
        if (!refSeqGeneListFiles.isEmpty()) {
            for (DoCOutputType.Partition partition : perGeneStatisticsAggregationByPartitioning.keySet()) {
                writer.writeOutputGeneStatistics(perGeneStatisticsAggregationByPartitioning.get(partition),
                        CoverageUtils.calculateCoverageHistogramBinEndpoints(start, stop, nBins));
            }
        }

        if (!omitSampleSummary) {
            logger.info("Outputting summary info");
            for (DoCOutputType.Partition type : partitionTypes) {
                writer.writeCumulativeOutputSummaryFiles(coverageTotalsForEntireTraversal, type, globalIdentifierMap.get(type));
            }
        }

        if (!omitLocusTable) {
            logger.info("Outputting locus summary");
            for (DoCOutputType.Partition type : partitionTypes) {
                writer.writePerLocusCumulativeCoverageMetrics(coverageTotalsForEntireTraversal, type, globalIdentifierMap.get(type));
            }
        }

        writer.close();
        return "success";
    }

    // Initialize a coveragePartitioner object for storing interval data.
    private DepthOfCoveragePartitionedDataStore createCoveragePartitioner() {
        return new DepthOfCoveragePartitionedDataStore(partitionTypes, start, stop, nBins, includeDeletions, omitLocusTable, globalIdentifierMap);
    }

    /**
     * This makes a global sample map by identifier. That is, for every requested partition all of the samples
     * corresponding to that partition are listed and sorted lexicographically.
     *
     * @param types Partition types to generate samples for
     * @return A map of {@link DoCOutputType.Partition} to list of sorted samples
     */
    private Map<DoCOutputType.Partition, List<String>> makeGlobalIdentifierMap(Collection<DoCOutputType.Partition> types) {
        Map<DoCOutputType.Partition, List<String>> partitions = new LinkedHashMap<>();
        for (DoCOutputType.Partition t : types) {
            List<String> samplesForType = new ArrayList<>(getSamplesByPartitionFromReadHeader(t));
            samplesForType.sort(String::compareTo);
            partitions.put(t, Collections.unmodifiableList(samplesForType));
        }
        return Collections.unmodifiableMap(partitions);
    }

    // Return all of the samples denoted by a given partition based on the header.
    private HashSet<String> getSamplesByPartitionFromReadHeader(DoCOutputType.Partition type) {
        final HashSet<String> partition = new HashSet<>();
        final SAMFileHeader header = getHeaderForReads();
        if (type == DoCOutputType.Partition.sample) {
            partition.addAll(ReadUtils.getSamplesFromHeader(header));
        } else {
            for (SAMReadGroupRecord rg : header.getReadGroups()) {
                partition.add(CoverageUtils.getTypeID(rg, type));
            }
        }
        return partition;
    }

    /**
     * We want to keep track of per-interval information for both the user specified intervals and for user specified genes
     *
     * @return A combined list of the unmerged user specified intervals and any refSeqFeatures specified
     */
    @Override
    public List<Locatable> getIntervalObjectsToQueryOver() {
        List<SimpleInterval> userProvidedIntervals = intervalArgumentCollection.getIntervalsWithoutMerging(getBestAvailableSequenceDictionary());
        final List<Locatable> refSeqInputs = new ArrayList<>(userProvidedIntervals);

        for (String input : refSeqGeneListFiles) {
            FeatureDataSource<RefSeqFeature> refSeqReader = new FeatureDataSource<>(input);
            for (final RefSeqFeature geneRecord : refSeqReader) {
                refSeqInputs.add(geneRecord);
            }
            refSeqReader.close();
        }
        return refSeqInputs;
    }
}