// org.broadinstitute.hellbender.tools.copynumber.CollectReadCounts
// (Maven repository page header — extraction artifact, commented out so the file compiles)
package org.broadinstitute.hellbender.tools.copynumber;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Multiset;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.Locatable;
import htsjdk.samtools.util.OverlapDetector;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.argparser.WorkflowOutput;
import org.broadinstitute.barclay.argparser.WorkflowProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.CoverageAnalysisProgramGroup;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.ReadWalker;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.engine.filters.MappingQualityReadFilter;
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.tools.copynumber.arguments.CopyNumberArgumentValidationUtils;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.HDF5SimpleCountCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleCountCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.Metadata;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.MetadataUtils;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleLocatableMetadata;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.SimpleCount;
import org.broadinstitute.hellbender.utils.IntervalMergingRule;
import org.broadinstitute.hellbender.utils.IntervalUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
/**
 * Collects read counts at specified intervals. The count for each interval is calculated by counting
 * the number of read starts that lie in the interval.
 *
 * <h3>Inputs</h3>
 *
 * <ul>
 *     <li>
 *         SAM format read data
 *     </li>
 *     <li>
 *         Intervals at which counts will be collected.
 *         The argument {@code interval-merging-rule} must be set to {@link IntervalMergingRule#OVERLAPPING_ONLY}
 *         and all other common arguments for interval padding or merging must be set to their defaults.
 *     </li>
 *     <li>
 *         Output file format. This can be used to select TSV or HDF5 output.
 *     </li>
 * </ul>
 *
 * <h3>Outputs</h3>
 *
 * <ul>
 *     <li>
 *         Counts file.
 *         By default, the tool produces HDF5 format results. This can be changed with the {@code format} option
 *         to TSV format. Using HDF5 files with {@link CreateReadCountPanelOfNormals}
 *         can decrease runtime, by reducing time spent on IO, so this is the default output format.
 *         The HDF5 format contains information in the paths defined in {@link HDF5SimpleCountCollection}.
 *         HDF5 files may be viewed using hdfview or loaded in Python using PyTables or h5py.
 *         The TSV format has a SAM-style header containing a read group sample name, a sequence dictionary,
 *         a row specifying the column headers contained in
 *         {@link SimpleCountCollection.SimpleCountTableColumn}, and the corresponding entry rows.
 *     </li>
 * </ul>
 *
 * <h3>Usage examples</h3>
 *
 * <pre>
 *     gatk CollectReadCounts \
 *          -I sample.bam \
 *          -L intervals.interval_list \
 *          --interval-merging-rule OVERLAPPING_ONLY \
 *          -O sample.counts.hdf5
 * </pre>
 *
 * @author Andrey Smirnov &lt;[email protected]&gt;
 * @author Samuel Lee &lt;[email protected]&gt;
 */
@CommandLineProgramProperties(
        summary = "Collects read counts at specified intervals",
        oneLineSummary = "Collects read counts at specified intervals",
        programGroup = CoverageAnalysisProgramGroup.class
)
@DocumentedFeature
@WorkflowProperties
public final class CollectReadCounts extends ReadWalker {

    /** Supported output formats for the counts file. */
    public enum Format {
        TSV, HDF5
    }

    /** Reads with mapping quality below this threshold are filtered out by default. */
    private static final int DEFAULT_MINIMUM_MAPPING_QUALITY = 30;

    public static final String FORMAT_LONG_NAME = "format";

    @Argument(
            doc = "Output file for read counts.",
            fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
            shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME
    )
    @WorkflowOutput
    private File outputCountsFile = null;

    @Argument(
            doc = "Output file format.",
            fullName = FORMAT_LONG_NAME,
            optional = true
    )
    private Format format = Format.HDF5;

    /**
     * Metadata contained in the BAM file.
     */
    private SampleLocatableMetadata metadata;

    /** Intervals at which counts will be collected (resolved against the sequence dictionary). */
    private List<SimpleInterval> intervals;

    /** Contig of the most recently processed read; used to rebuild the overlap detector lazily per contig. */
    private String currentContig = null;

    /**
     * Overlap detector used to determine when read starts overlap with input intervals.
     */
    private CachedOverlapDetector<SimpleInterval> intervalCachedOverlapDetector;

    /** Per-interval counts of overlapping read starts. */
    private Multiset<SimpleInterval> intervalMultiset;

    @Override
    public boolean requiresIntervals() {
        return true;
    }

    @Override
    public List<ReadFilter> getDefaultReadFilters() {
        final List<ReadFilter> readFilters = new ArrayList<>(super.getDefaultReadFilters());
        readFilters.add(ReadFilterLibrary.MAPPED);
        readFilters.add(ReadFilterLibrary.NON_ZERO_REFERENCE_LENGTH_ALIGNMENT);
        readFilters.add(ReadFilterLibrary.NOT_DUPLICATE);
        readFilters.add(new MappingQualityReadFilter(DEFAULT_MINIMUM_MAPPING_QUALITY));
        return readFilters;
    }

    @Override
    public void onTraversalStart() {
        validateArguments();
        metadata = MetadataUtils.fromHeader(getHeaderForReads(), Metadata.Type.SAMPLE_LOCATABLE);
        final SAMSequenceDictionary sequenceDictionary = getBestAvailableSequenceDictionary();
        //this check is currently redundant, since the master dictionary is taken from the reads;
        //however, if any other dictionary is added in the future, such a check should be performed
        if (!CopyNumberArgumentValidationUtils.isSameDictionary(metadata.getSequenceDictionary(), sequenceDictionary)) {
            logger.warn("Sequence dictionary in BAM does not match the master sequence dictionary.");
        }
        intervals = intervalArgumentCollection.getIntervals(sequenceDictionary);
        intervalMultiset = HashMultiset.create(intervals.size());
        logger.info("Collecting read counts...");
    }

    /** Validates the interval-argument collection and checks that the output file is writable. */
    private void validateArguments() {
        CopyNumberArgumentValidationUtils.validateIntervalArgumentCollection(intervalArgumentCollection);
        CopyNumberArgumentValidationUtils.validateOutputFiles(outputCountsFile);
    }

    @Override
    public void apply(GATKRead read, ReferenceContext referenceContext, FeatureContext featureContext) {
        if (currentContig == null || !read.getContig().equals(currentContig)) {
            //if we are on a new contig, create an OverlapDetector covering the contig
            currentContig = read.getContig();
            final List<SimpleInterval> intervalsOnCurrentContig = intervals.stream()
                    .filter(i -> i.getContig().equals(currentContig))
                    .collect(Collectors.toList());
            intervalCachedOverlapDetector = new CachedOverlapDetector<>(intervalsOnCurrentContig);
        }
        //query with the single-base interval at the read start
        final SimpleInterval overlappingInterval = intervalCachedOverlapDetector.getOverlap(
                new SimpleInterval(read.getContig(), read.getStart(), read.getStart()));
        //if read doesn't overlap any of the provided intervals, do nothing
        if (overlappingInterval == null) {
            return;
        }
        intervalMultiset.add(overlappingInterval);
    }

    @Override
    public Object onTraversalSuccess() {
        logger.info(String.format("Writing read counts to %s...", outputCountsFile.getAbsolutePath()));
        final SimpleCountCollection readCounts = new SimpleCountCollection(
                metadata,
                ImmutableList.copyOf(intervals.stream() //making this an ImmutableList avoids a defensive copy in SimpleCountCollection
                        .map(i -> new SimpleCount(i, intervalMultiset.count(i)))
                        .iterator()));
        if (format == Format.HDF5) {
            readCounts.writeHDF5(outputCountsFile);
        } else {
            readCounts.write(outputCountsFile);
        }
        logger.info(String.format("%s complete.", getClass().getSimpleName()));
        return null;
    }

    /**
     * A simple wrapper around {@link OverlapDetector} to provide naive caching and ensure that overlap sets
     * only contain a single interval.
     *
     * <p>Declared {@code static}: it uses no enclosing-instance state, so a hidden reference to the
     * outer tool is avoided.</p>
     */
    private static final class CachedOverlapDetector<T extends Locatable> {
        private final OverlapDetector<T> overlapDetector;
        //last interval returned; checked first on the next query
        private T cachedResult;

        /**
         * @param intervals non-empty, mutually non-overlapping intervals
         */
        CachedOverlapDetector(final List<T> intervals) {
            Utils.nonEmpty(intervals);
            this.overlapDetector = OverlapDetector.create(intervals);
            //double check that intervals do not overlap
            Utils.validateArg(intervals.stream().noneMatch(i -> overlapDetector.getOverlaps(i).size() > 1),
                    "Input intervals may not be overlapping.");
            cachedResult = intervals.get(0);
        }

        /**
         * We check the previously cached result first. Assuming that queries will be made in sorted order,
         * this may slightly save on lookup time.
         * @return {@code null} if no interval overlaps {@code locatable}
         */
        T getOverlap(final Locatable locatable) {
            if (IntervalUtils.overlaps(cachedResult, locatable)) {
                return cachedResult;
            }
            final Set<T> overlaps = overlapDetector.getOverlaps(locatable);
            if (overlaps.size() > 1) {
                //should not reach here since intervals are checked for overlaps;
                //performing a redundant check to protect against future code changes
                throw new GATKException.ShouldNeverReachHereException("Intervals should be non-overlapping, " +
                        "so at most one interval should intersect with the start of a read.");
            }
            final T firstOverlap = overlaps.stream().findFirst().orElse(null);
            if (firstOverlap != null) {
                cachedResult = firstOverlap;
            }
            return firstOverlap;
        }
    }
}
// © 2015 - 2025 Weber Informatics LLC (Maven repository page footer — extraction artifact, commented out)