
package org.broadinstitute.hellbender.tools.walkers.coverage;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.util.Locatable;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.Advanced;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.barclay.argparser.BetaFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.CoverageAnalysisProgramGroup;
import org.broadinstitute.hellbender.engine.*;
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
import org.broadinstitute.hellbender.engine.filters.WellformedReadFilter;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.BaseUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.codecs.refseq.RefSeqFeature;
import org.broadinstitute.hellbender.utils.read.ReadUtils;
import java.io.IOException;
import java.util.*;
/**
* Assess sequence coverage by a wide array of metrics, partitioned by sample, read group, or library
*
*
* This tool processes a set of bam files to determine coverage at different levels of partitioning and
* aggregation. Coverage can be analyzed per locus, per interval, per gene, or in total; can be partitioned by
* sample, by read group, by technology, by center, or by library; and can be summarized by mean, median, quartiles,
* and/or percentage of bases covered to or beyond a threshold. Additionally, reads and bases can be filtered by
* mapping or base quality score.
*
*
* Input
*
* - One or more bam files (with proper headers) to be analyzed for coverage statistics
* - (Optional) A REFSEQ file to aggregate coverage to the gene level (for information about creating the REFSEQ file, please consult the online documentation)
*
* Output
*
* Tables pertaining to different coverage summaries. The suffix on each table file indicates its contents:
*
*
* - no suffix: per locus coverage (see the illustrative layout after this list)
* - _summary: total, mean, median, quartiles, and threshold proportions, aggregated over all bases
* - _statistics: coverage histograms (# loci with X coverage), aggregated over all bases
* - _interval_summary: total, mean, median, quartiles, and threshold proportions, aggregated per interval
* - _interval_statistics: 2x2 table of # of intervals covered to >= X depth in >= Y samples
* - _gene_summary: total, mean, median, quartiles, and threshold proportions, aggregated per gene
* - _gene_statistics: 2x2 table of # of genes covered to >= X depth in >= Y samples. In its current incarnation it will not include genes that were not at least partially covered by the traversal (see --omit-genes-not-entirely-covered-by-traversal for details)
* - _cumulative_coverage_counts: coverage histograms (# loci with >= X coverage), aggregated over all bases
* - _cumulative_coverage_proportions: proportions of loci with >= X coverage, aggregated over all bases
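*
* For example, with the default sample partition the per-locus table (no suffix) is laid out roughly as follows.
* This sketch is illustrative only: the exact column headers depend on the requested partition types and the
* sample names present in the inputs.
*
*   Locus    Total_Depth  Average_Depth_sample  Depth_for_NA12878
*   1:10001  52           52.00                 52
*   1:10002  53           53.00                 53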
*
*
* Notes
*
* - DepthOfCoverage currently only supports typical nucleotide (and N) bases; IUPAC ambiguity codes or other non-ACGTN bases will cause exceptions
* - Read filters are applied to the reads before they are counted in coverage information. By default, duplicate-marked and non-primary alignments are not counted. This can be disabled with --disable-tool-default-read-filters.
* - In order to filter reads out by their mapping qualities, the recommended approach is to use the MappingQualityReadFilter with the --minimum-mapping-quality or --maximum-mapping-quality arguments specified, as sketched below
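*
* For example, to count only reads with mapping quality 20 or above (a sketch; MappingQualityReadFilter and
* its arguments come from the standard GATK read filter library):
*
*   gatk DepthOfCoverage \
*     -R reference.fasta \
*     -O file_name_base \
*     -I input.bam \
*     --read-filter MappingQualityReadFilter \
*     --minimum-mapping-quality 20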
*
*
* Usage example
*
* gatk \
* DepthOfCoverage \
* -R reference.fasta \
* -O file_name_base \
* -I input_bams.list \
* [-gene-list refSeq.sorted.refseq] \
* [-partition-type readgroup] \
* [--summary-coverage-threshold 4 --summary-coverage-threshold 6 --summary-coverage-threshold 10] \
* [-L my_capture_genes.interval_list]
*
*/
@CommandLineProgramProperties(
summary = "Generate coverage summary information for reads data",
oneLineSummary = "Generate coverage summary information for reads data",
programGroup = CoverageAnalysisProgramGroup.class)
@BetaFeature
@DocumentedFeature
public class DepthOfCoverage extends LocusWalkerByInterval {
private CoverageOutputWriter writer;
// Map used to store running aggregate counts for intervals that are being recorded by DepthOfCoverage
private Map<Locatable, DepthOfCoveragePartitionedDataStore> activeCoveragePartitioner = new HashMap<>();
// Map used to store per-interval target tables for each partition, which are used in the computation of median coverage scores
private Map<DoCOutputType.Partition, int[][]> perIntervalStatisticsAggregationByPartitioning = new HashMap<>();
// Map used to store per-gene target tables for each partition, which are used in the computation of median coverage scores
private Map<DoCOutputType.Partition, int[][]> perGeneStatisticsAggregationByPartitioning = new HashMap<>();
// PartitionDataStore corresponding to every base traversed by the tool
private DepthOfCoveragePartitionedDataStore coverageTotalsForEntireTraversal;
// Lists of all of the samples to be output, split by partition type
private Map<DoCOutputType.Partition, List<String>> globalIdentifierMap;
/**
* Base file name to use for the coverage output files
*/
@Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, doc = "Base file location to which to write coverage summary information, must be a path to a file")
private String baseFileName = null;
/**
* Bases with quality scores lower than this threshold will be skipped. The default value of 0 means that no bases are filtered out based on quality.
*/
@Argument(fullName = "min-base-quality", doc = "Minimum quality of bases to count towards depth", optional = true, minValue = 0, maxValue = Byte.MAX_VALUE)
private byte minBaseQuality = 0;
/**
* Bases with quality scores higher than this threshold will be skipped. The default value is the largest number that can be represented as a byte.
*/
@Argument(fullName = "max-base-quality", doc = "Maximum quality of bases to count towards depth", optional = true, minValue = 0, maxValue = Byte.MAX_VALUE)
private byte maxBaseQuality = Byte.MAX_VALUE;
@Argument(fullName = "count-type", doc = "How should overlapping reads from the same fragment be handled? NOTE: currently only COUNT_READS is supported", optional = true)
private CoverageUtils.CountPileupType countType = CoverageUtils.CountPileupType.COUNT_READS;
/**
* Instead of reporting depth, the program will report the base pileup at each locus
*/
@Argument(fullName = "print-base-counts", doc = "Add base counts to per-locus output", optional = true)
private boolean printBaseCounts = false;
/**
* Disabling the tabulation of locus statistics (# loci covered by sample by coverage) should speed up processing.
*/
@Argument(fullName = "omit-locus-table", doc = "Do not calculate per-sample per-depth counts of loci", optional = true)
private boolean omitLocusTable = false;
/**
* Disabling the tabulation of interval statistics (mean, median, quartiles AND # intervals by sample by coverage) should speed up processing.
*/
@Argument(fullName = "omit-interval-statistics", doc = "Do not calculate per-interval statistics", optional = true, mutex = "calculate-coverage-over-genes")
private boolean omitIntervals = false;
/**
* Disabling the tabulation of total coverage at every base should speed up processing.
*/
@Argument(fullName = "omit-depth-output-at-each-base", doc = "Do not output depth of coverage at each base", optional = true)
private boolean omitDepthOutput = false;
/**
* This option simply disables writing separate files for per-sample summary statistics (total, mean, median, quartile coverage per sample). These statistics are still calculated internally, so enabling this option will not improve runtime.
*/
@Argument(fullName = "omit-per-sample-statistics", doc = "Do not output the summary files per-sample", optional = true)
private boolean omitSampleSummary = false;
/**
* Specify a RefSeq file for use in aggregating coverage statistics over genes.
*
* This argument is incompatible with --omit-interval-statistics; specifying both will result in a command-line error.
*/
@Argument(fullName = "calculate-coverage-over-genes", shortName = "gene-list", doc = "Calculate coverage statistics over this list of genes", optional = true, mutex = "omit-interval-statistics")
private List<String> refSeqGeneListFiles = new ArrayList<>();
/**
* Remove a gene from the gene summary output file if not all of its exon bases were covered by the traversal.
*/
@Argument(fullName = "omit-genes-not-entirely-covered-by-traversal", doc = "Do not output gene summary if it was not completely covered by traversal intervals", optional = true)
private boolean omitPartiallyCoveredGenes = false;
/**
* Output file format (e.g. csv, table, rtable); defaults to csv.
*/
@Argument(fullName = "output-format", doc = "The format of the output file", optional = true)
CoverageOutputWriter.DEPTH_OF_COVERAGE_OUTPUT_FORMAT outputFormat = CoverageOutputWriter.DEPTH_OF_COVERAGE_OUTPUT_FORMAT.CSV;
// ---------------------------------------------------------------------------
//
// Advanced arguments
//
// ---------------------------------------------------------------------------
/**
* Normally, sites where the reference is N (or another non-canonical base) are skipped. If this option is enabled, these sites will be included in DoC calculations if there is coverage from neighboring reads.
*/
@Advanced
@Argument(fullName = "include-ref-n-sites", doc = "Include sites where the reference is N", optional = true)
private boolean includeRefNBases = false;
/**
* Sets the low-coverage cutoff for granular binning. All loci with depth < START are counted in the first bin.
*/
@Advanced
@Argument(fullName = "start", doc = "Starting (left endpoint) for granular binning", optional = true, minValue = 0)
private int start = 1;
/**
* Sets the high-coverage cutoff for granular binning. All loci with depth > STOP are counted in the last bin.
*/
@Advanced
@Argument(fullName = "stop", doc = "Ending (right endpoint) for granular binning", optional = true, minValue = 1)
private int stop = 500;
/**
* Sets the number of bins for granular binning
*/
@Advanced
@Argument(fullName = "nBins", doc = "Number of bins to use for granular binning", optional = true, minValue = 0, minRecommendedValue = 1)
private int nBins = 499;
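// Note: the histogram bin endpoints are computed from (start, stop, nBins) via
// CoverageUtils.calculateCoverageHistogramBinEndpoints (see onTraversalSuccess). Per the argument docs above,
// loci with depth < start fall into the first bin and loci with depth > stop into the last bin, so the
// defaults (1, 500, 499) presumably yield roughly one bin per depth unit across the [1, 500] range.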
/**
* By default, coverage is partitioned by sample, but it can be any combination of sample, readgroup and/or library.
*/
@Argument(fullName = "partition-type", shortName = "pt", doc = "Partition type for depth of coverage", optional = true)
private EnumSet<DoCOutputType.Partition> partitionTypes = EnumSet.of(DoCOutputType.Partition.sample);
/**
* Consider a spanning deletion as contributing to coverage. Also enables deletion counts in per-base output.
*/
@Advanced
@Argument(fullName = "include-deletions", doc = "Include information on deletions alongside other bases in output table counts", optional = true)
private boolean includeDeletions = false;
@Advanced
@Argument(fullName = "ignore-deletion-sites", doc = "Ignore sites consisting only of deletions", optional = true)
boolean ignoreDeletionSites = false;
/**
* For summary file outputs, report the percentage of bases covered to a depth equal to or greater than this number (e.g. % bases >= CT for each sample). Defaults to 15; may be specified multiple times.
*/
@Advanced
@Argument(fullName = "summary-coverage-threshold", doc = "Coverage threshold (in percent) for summarizing statistics", optional = true)
private List coverageThresholds = new ArrayList<>(Collections.singleton(15));
// We explicitly handle read N bases in the code
@Override
public boolean includeNs() {
return true;
}
// We want to make sure to still generate coverage information over uncovered bases.
@Override
public boolean emitEmptyLoci() {
return true;
}
@Override
public boolean includeDeletions() {
return includeDeletions && ! ignoreDeletionSites;
}
// For now this tool requires a reference
@Override
public boolean requiresReference() {
return true;
}
@Override
public List<ReadFilter> getDefaultReadFilters() {
final List<ReadFilter> defaultFilters = new ArrayList<>(4);
defaultFilters.add(new WellformedReadFilter());
defaultFilters.add(new ReadFilterLibrary.NotDuplicateReadFilter());
defaultFilters.add(ReadFilterLibrary.NOT_SECONDARY_ALIGNMENT);
defaultFilters.add(new ReadFilterLibrary.MappedReadFilter());
return defaultFilters;
}
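// A sketch of how these defaults interact with standard GATK engine arguments (flag names come from the GATK
// command-line engine, not from this tool): a single default filter can be switched off with, e.g.,
//   gatk DepthOfCoverage ... --disable-read-filter NotDuplicateReadFilter
// while --disable-tool-default-read-filters drops all of the tool defaults listed above at once.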
@Override
public void onTraversalStart() {
try {
writer = new CoverageOutputWriter(outputFormat,
partitionTypes,
baseFileName,
!refSeqGeneListFiles.isEmpty(),
printBaseCounts,
omitDepthOutput,
omitIntervals,
omitSampleSummary,
omitLocusTable,
coverageThresholds);
} catch (IOException e) {
throw new UserException.CouldNotCreateOutputFile("Couldn't create output file, encountered exception: " + e.getMessage(), e);
}
globalIdentifierMap = makeGlobalIdentifierMap(partitionTypes);
writer.writeCoverageOutputHeaders(globalIdentifierMap);
coverageTotalsForEntireTraversal = createCoveragePartitioner();
}
/**
*
* @param alignmentContext current alignment context
* @param referenceContext Reference bases spanning the current locus. Will be an empty, but non-null, context object
* if there is no backing source of reference data (in which case all queries on it will return
* an empty array/iterator). Can request extra bases of context around the current locus
* by invoking {@link ReferenceContext#setWindow} on this object before calling {@link ReferenceContext#getBases}
* @param featureContext Features spanning the current locus. Will be an empty, but non-null, context object
* if there is no backing source of Feature data (in which case all queries on it will return an
* empty List)
* @param activeIntervals Set of all intervals provided to the walker that overlap the current locus
*/
@Override
public void apply(AlignmentContext alignmentContext, ReferenceContext referenceContext, FeatureContext featureContext, Set<Locatable> activeIntervals) {
// TODO evaluate consequences of supporting nonexistent references
if (includeRefNBases || (hasReference() && BaseUtils.isRegularBase(referenceContext.getBase()))) {
final Map<DoCOutputType.Partition, Map<String, int[]>> countsByPartition = CoverageUtils.getBaseCountsByPartition(alignmentContext, minBaseQuality, maxBaseQuality, countType, partitionTypes, getHeaderForReads());
if (!omitDepthOutput) {
writer.writePerLocusDepthSummary(referenceContext.getInterval(), countsByPartition, globalIdentifierMap, includeDeletions);
}
// Update the traversing partitioners with this locus data:
coverageTotalsForEntireTraversal.addLocusData(countsByPartition);
// Update all of the active intervals that we are tracking separately with the generated counts
for (Locatable loc : activeCoveragePartitioner.keySet()) {
// For genes, we don't want to update the interval for non-exon bases
if (loc.contains(alignmentContext)) {
activeCoveragePartitioner.get(loc).addLocusData(countsByPartition);
}
}
}
}
/**
* Ensure that we are tracking the newly provided interval with per-locus data
*
* @param activeInterval Locatable provided to the walker to be initialized
*/
public void onIntervalStart(Locatable activeInterval) {
DepthOfCoveragePartitionedDataStore partitioner = createCoveragePartitioner();
//NOTE: we don't populate perIntervalStatisticsAggregationByPartitioning here because that gets populated on the fly later
activeCoveragePartitioner.put(activeInterval, partitioner);
}
/**
* When this method is called, we expect that at some point in the past onIntervalStart() was called on the same
* Locatable object and that we have since been continuously tracking and adding per-base information for each
* base in the pileup that overlaps that object. Given the interval being closed out, we perform one of the following tasks:
*
* - Write out per-interval summary information to each of the partition interval summary files and update the target
* summary table corresponding to each track.
*
* - If it corresponded to a gene interval, write out to the gene summary tables the line corresponding to the
* coverage over the exon targets contained within the gene.
*
* @param activeInterval the interval being closed out
*/
public void onIntervalEnd(final Locatable activeInterval) {
final DepthOfCoveragePartitionedDataStore partitionerToRemove = activeCoveragePartitioner.remove(activeInterval);
if (activeInterval instanceof SimpleInterval) {
// For each partition type we are managing, make sure we update the active statistics table
if (!omitIntervals) {
for (DoCOutputType.Partition p : partitionTypes) {
// Write the per-interval depth information as necessary
final DepthOfCoverageStats coverageByAggregationPartitionType = partitionerToRemove.getCoverageByAggregationType(p);
writer.writePerIntervalDepthInformation(p, (SimpleInterval) activeInterval, coverageByAggregationPartitionType, globalIdentifierMap.get(p));
// Create a new table if necessary
if (!perIntervalStatisticsAggregationByPartitioning.containsKey(p)) {
perIntervalStatisticsAggregationByPartitioning.put(p, new int[coverageByAggregationPartitionType.getHistograms().size()][coverageByAggregationPartitionType.getEndpoints().length + 1]);
}
// Update the target table to reflect the updated coverage information for this target
CoverageUtils.updateTargetTable(perIntervalStatisticsAggregationByPartitioning.get(p), coverageByAggregationPartitionType);
}
}
} else if (activeInterval instanceof RefSeqFeature) {
DepthOfCoverageStats coverageBySample = partitionerToRemove.getCoverageByAggregationType(DoCOutputType.Partition.sample);
if ( ! omitPartiallyCoveredGenes || ((RefSeqFeature)activeInterval).getTotalExonLength() <= coverageBySample.getNumberOfLociCovered()) {
writer.writePerGeneDepthInformation((RefSeqFeature) activeInterval, coverageBySample, globalIdentifierMap.get(DoCOutputType.Partition.sample));
// Create a new table if necessary (reusing coverageBySample, which already holds the sample-partition stats)
if (!perGeneStatisticsAggregationByPartitioning.containsKey(DoCOutputType.Partition.sample)) {
perGeneStatisticsAggregationByPartitioning.put(DoCOutputType.Partition.sample, new int[coverageBySample.getHistograms().size()][coverageBySample.getEndpoints().length + 1]);
}
// Update the target table to reflect the updated coverage information for this gene
CoverageUtils.updateTargetTable(perGeneStatisticsAggregationByPartitioning.get(DoCOutputType.Partition.sample), coverageBySample);
}
} else {
throw new GATKException("Unrecognized Locatable object supplied for traversal, only RefSeqFeature and SimpleInterval are supported: "+activeInterval.toString());
}
}
/**
* When we finish traversal we should have the following held in state:
* 1. {@link #perIntervalStatisticsAggregationByPartitioning}, which contains a map from partition to a doubly indexed integer array
* that corresponds to a count of the number of data sources with at least the given depth. Importantly, this is
* expected to have had CoverageUtils.updateTargetTable() called on it exactly once per completed interval.
*
* 2. {@link #coverageTotalsForEntireTraversal}, which we expect to have had update() called on it exactly once for
* every locus traversed by the locus walker. Consequently we expect this partitioner to contain partitioned counts
* corresponding to the entire territory covered by the tool.
*
*/
@Override
public Object onTraversalSuccess() {
// Write out accumulated interval summary statistics
for (DoCOutputType.Partition partition : perIntervalStatisticsAggregationByPartitioning.keySet()) {
writer.writeOutputIntervalStatistics(partition, perIntervalStatisticsAggregationByPartitioning.get(partition), CoverageUtils.calculateCoverageHistogramBinEndpoints(start,stop,nBins));
}
// Write out accumulated gene summary statistics
if (!refSeqGeneListFiles.isEmpty()) {
for (DoCOutputType.Partition partition : perGeneStatisticsAggregationByPartitioning.keySet()) {
writer.writeOutputGeneStatistics(perGeneStatisticsAggregationByPartitioning.get(partition), CoverageUtils.calculateCoverageHistogramBinEndpoints(start, stop, nBins));
}
}
if (!omitSampleSummary) {
logger.info("Outputting summary info");
for (DoCOutputType.Partition type : partitionTypes) {
writer.writeCumulativeOutputSummaryFiles(coverageTotalsForEntireTraversal, type, globalIdentifierMap.get(type));
}
}
if (!omitLocusTable) {
logger.info("Outputting locus summary");
for (DoCOutputType.Partition type : partitionTypes) {
writer.writePerLocusCumulativeCoverageMetrics(coverageTotalsForEntireTraversal, type, globalIdentifierMap.get(type));
}
}
writer.close();
return "success";
}
// Initialize a coveragePartitioner object for storing interval data.
private DepthOfCoveragePartitionedDataStore createCoveragePartitioner() {
return new DepthOfCoveragePartitionedDataStore(partitionTypes, start, stop, nBins, includeDeletions, omitLocusTable, globalIdentifierMap);
}
/**
* This makes a global sample map by identifier. That is, for every requested partition all of the samples corresponding
* to that partition are listed and sorted lexicographically.
*
* @param types Partition types to generate samples for
* @return A map of {@link DoCOutputType.Partition} to list of sorted samples
*/
private Map<DoCOutputType.Partition, List<String>> makeGlobalIdentifierMap(Collection<DoCOutputType.Partition> types) {
Map<DoCOutputType.Partition, List<String>> partitions = new LinkedHashMap<>();
for (DoCOutputType.Partition t : types) {
List<String> samplesForType = new ArrayList<>(getSamplesByPartitionFromReadHeader(t));
samplesForType.sort(String::compareTo);
partitions.put(t, Collections.unmodifiableList(samplesForType));
}
return Collections.unmodifiableMap(partitions);
}
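// For example, partitioning by sample and library might produce a map like the following, where the
// identifiers are illustrative and the iteration order mirrors the order of the requested partition types:
//   {sample -> [NA12878, NA12891], library -> [Pond-1, Pond-2]}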
// Return all of the samples denoted by a given partition based on the header.
private HashSet<String> getSamplesByPartitionFromReadHeader(DoCOutputType.Partition type) {
final HashSet<String> partition = new HashSet<>();
final SAMFileHeader header = getHeaderForReads();
if (type == DoCOutputType.Partition.sample) {
partition.addAll(ReadUtils.getSamplesFromHeader(header));
} else {
for (SAMReadGroupRecord rg : header.getReadGroups()) {
partition.add(CoverageUtils.getTypeID(rg, type));
}
}
return partition;
}
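// For non-sample partitions the identifier comes from the read group record via CoverageUtils.getTypeID;
// presumably this is the RG ID for the readgroup partition and the library (LB) tag for the library partition.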
/**
* We want to keep track of per-interval information for both the user-specified intervals and the user-specified genes
*
* @return A combined list of the unmerged user-specified intervals and any RefSeqFeature records loaded from the provided gene list files
*/
@Override
public List<Locatable> getIntervalObjectsToQueryOver() {
final List<? extends Locatable> userProvidedIntervals = intervalArgumentCollection.getIntervalsWithoutMerging(getBestAvailableSequenceDictionary());
final List<Locatable> refSeqInputs = new ArrayList<>(userProvidedIntervals);
for (String input : refSeqGeneListFiles) {
FeatureDataSource<RefSeqFeature> refSeqReader = new FeatureDataSource<>(input);
for (final RefSeqFeature geneRecord : refSeqReader) {
refSeqInputs.add(geneRecord);
}
refSeqReader.close();
}
return refSeqInputs;
}
}