// org.broadinstitute.hellbender.tools.copynumber.CollectReadCounts
// (Maven repository page header — extraction artifact, commented out so the file compiles)
package org.broadinstitute.hellbender.tools.copynumber;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Multiset;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.Locatable;
import htsjdk.samtools.util.OverlapDetector;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.argparser.WorkflowOutput;
import org.broadinstitute.barclay.argparser.WorkflowProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.CoverageAnalysisProgramGroup;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.ReadWalker;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.engine.filters.MappingQualityReadFilter;
import org.broadinstitute.hellbender.engine.filters.ReadFilter;
import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.tools.copynumber.arguments.CopyNumberArgumentValidationUtils;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.HDF5SimpleCountCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleCountCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.Metadata;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.MetadataUtils;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SampleLocatableMetadata;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.SimpleCount;
import org.broadinstitute.hellbender.utils.IntervalMergingRule;
import org.broadinstitute.hellbender.utils.IntervalUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
/**
 * Collects read counts at specified intervals. The count for each interval is calculated by counting
 * the number of read starts that lie in the interval.
 *
 * <h3>Inputs</h3>
 *
 * <ul>
 *     <li>
 *         SAM format read data
 *     </li>
 *     <li>
 *         Intervals at which counts will be collected.
 *         The argument {@code interval-merging-rule} must be set to {@link IntervalMergingRule#OVERLAPPING_ONLY}
 *         and all other common arguments for interval padding or merging must be set to their defaults.
 *     </li>
 *     <li>
 *         Output file format. This can be used to select TSV or HDF5 output.
 *     </li>
 * </ul>
 *
 * <h3>Outputs</h3>
 *
 * <ul>
 *     <li>
 *         Counts file.
 *         By default, the tool produces HDF5 format results. This can be changed with the {@code format} option
 *         to TSV format. Using HDF5 files with {@link CreateReadCountPanelOfNormals}
 *         can decrease runtime, by reducing time spent on IO, so this is the default output format.
 *         The HDF5 format contains information in the paths defined in {@link HDF5SimpleCountCollection}.
 *         HDF5 files may be viewed using hdfview or loaded in Python using PyTables or h5py.
 *         The TSV format has a SAM-style header containing a read group sample name, a sequence dictionary,
 *         a row specifying the column headers contained in
 *         {@link SimpleCountCollection.SimpleCountTableColumn}, and the corresponding entry rows.
 *     </li>
 * </ul>
 *
 * <h3>Usage examples</h3>
 *
 * <pre>
 *     gatk CollectReadCounts \
 *          -I sample.bam \
 *          -L intervals.interval_list \
 *          --interval-merging-rule OVERLAPPING_ONLY \
 *          -O sample.counts.hdf5
 * </pre>
 *
 * @author Andrey Smirnov &lt;[email protected]&gt;
 * @author Samuel Lee &lt;[email protected]&gt;
 */
@CommandLineProgramProperties(
        summary = "Collects read counts at specified intervals",
        oneLineSummary = "Collects read counts at specified intervals",
        programGroup = CoverageAnalysisProgramGroup.class
)
@DocumentedFeature
@WorkflowProperties
public final class CollectReadCounts extends ReadWalker {

    /** Supported output formats for the counts file. */
    public enum Format {
        TSV, HDF5
    }

    /** Reads with mapping quality below this threshold are filtered out by default. */
    private static final int DEFAULT_MINIMUM_MAPPING_QUALITY = 30;

    public static final String FORMAT_LONG_NAME = "format";

    @Argument(
            doc = "Output file for read counts.",
            fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
            shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME
    )
    @WorkflowOutput
    private File outputCountsFile = null;

    @Argument(
            doc = "Output file format.",
            fullName = FORMAT_LONG_NAME,
            optional = true
    )
    private Format format = Format.HDF5;

    /**
     * Metadata contained in the BAM file.
     */
    private SampleLocatableMetadata metadata;

    /** Intervals at which counts will be collected (resolved against the sequence dictionary). */
    private List<SimpleInterval> intervals;

    /** Contig of the most recently processed read; used to rebuild the overlap detector lazily per contig. */
    private String currentContig = null;

    /**
     * Overlap detector used to determine when read starts overlap with input intervals.
     */
    private CachedOverlapDetector<SimpleInterval> intervalCachedOverlapDetector;

    /** Per-interval counts of overlapping read starts. */
    private Multiset<SimpleInterval> intervalMultiset;

    @Override
    public boolean requiresIntervals() {
        return true;
    }

    @Override
    public List<ReadFilter> getDefaultReadFilters() {
        final List<ReadFilter> readFilters = new ArrayList<>(super.getDefaultReadFilters());
        readFilters.add(ReadFilterLibrary.MAPPED);
        readFilters.add(ReadFilterLibrary.NON_ZERO_REFERENCE_LENGTH_ALIGNMENT);
        readFilters.add(ReadFilterLibrary.NOT_DUPLICATE);
        readFilters.add(new MappingQualityReadFilter(DEFAULT_MINIMUM_MAPPING_QUALITY));
        return readFilters;
    }

    @Override
    public void onTraversalStart() {
        validateArguments();
        metadata = MetadataUtils.fromHeader(getHeaderForReads(), Metadata.Type.SAMPLE_LOCATABLE);
        final SAMSequenceDictionary sequenceDictionary = getBestAvailableSequenceDictionary();
        //this check is currently redundant, since the master dictionary is taken from the reads;
        //however, if any other dictionary is added in the future, such a check should be performed
        if (!CopyNumberArgumentValidationUtils.isSameDictionary(metadata.getSequenceDictionary(), sequenceDictionary)) {
            logger.warn("Sequence dictionary in BAM does not match the master sequence dictionary.");
        }
        intervals = intervalArgumentCollection.getIntervals(sequenceDictionary);
        intervalMultiset = HashMultiset.create(intervals.size());
        logger.info("Collecting read counts...");
    }

    /** Validates the interval-argument collection and checks that the output file is writable. */
    private void validateArguments() {
        CopyNumberArgumentValidationUtils.validateIntervalArgumentCollection(intervalArgumentCollection);
        CopyNumberArgumentValidationUtils.validateOutputFiles(outputCountsFile);
    }

    @Override
    public void apply(GATKRead read, ReferenceContext referenceContext, FeatureContext featureContext) {
        if (currentContig == null || !read.getContig().equals(currentContig)) {
            //if we are on a new contig, create an OverlapDetector covering the contig
            currentContig = read.getContig();
            final List<SimpleInterval> intervalsOnCurrentContig = intervals.stream()
                    .filter(i -> i.getContig().equals(currentContig))
                    .collect(Collectors.toList());
            intervalCachedOverlapDetector = new CachedOverlapDetector<>(intervalsOnCurrentContig);
        }
        //query with the single-base interval at the read start
        final SimpleInterval overlappingInterval = intervalCachedOverlapDetector.getOverlap(
                new SimpleInterval(read.getContig(), read.getStart(), read.getStart()));
        //if read doesn't overlap any of the provided intervals, do nothing
        if (overlappingInterval == null) {
            return;
        }
        intervalMultiset.add(overlappingInterval);
    }

    @Override
    public Object onTraversalSuccess() {
        logger.info(String.format("Writing read counts to %s...", outputCountsFile.getAbsolutePath()));
        final SimpleCountCollection readCounts = new SimpleCountCollection(
                metadata,
                ImmutableList.copyOf(intervals.stream() //making this an ImmutableList avoids a defensive copy in SimpleCountCollection
                        .map(i -> new SimpleCount(i, intervalMultiset.count(i)))
                        .iterator()));
        if (format == Format.HDF5) {
            readCounts.writeHDF5(outputCountsFile);
        } else {
            readCounts.write(outputCountsFile);
        }
        logger.info(String.format("%s complete.", getClass().getSimpleName()));
        return null;
    }

    /**
     * A simple wrapper around {@link OverlapDetector} to provide naive caching and ensure that overlap sets
     * only contain a single interval.
     *
     * <p>Declared {@code static}: it uses no enclosing-instance state, so a hidden reference to the
     * outer tool is avoided.</p>
     */
    private static final class CachedOverlapDetector<T extends Locatable> {
        private final OverlapDetector<T> overlapDetector;
        //last interval returned; checked first on the next query
        private T cachedResult;

        /**
         * @param intervals non-empty, mutually non-overlapping intervals
         */
        CachedOverlapDetector(final List<T> intervals) {
            Utils.nonEmpty(intervals);
            this.overlapDetector = OverlapDetector.create(intervals);
            //double check that intervals do not overlap
            Utils.validateArg(intervals.stream().noneMatch(i -> overlapDetector.getOverlaps(i).size() > 1),
                    "Input intervals may not be overlapping.");
            cachedResult = intervals.get(0);
        }

        /**
         * We check the previously cached result first. Assuming that queries will be made in sorted order,
         * this may slightly save on lookup time.
         * @return {@code null} if no interval overlaps {@code locatable}
         */
        T getOverlap(final Locatable locatable) {
            if (IntervalUtils.overlaps(cachedResult, locatable)) {
                return cachedResult;
            }
            final Set<T> overlaps = overlapDetector.getOverlaps(locatable);
            if (overlaps.size() > 1) {
                //should not reach here since intervals are checked for overlaps;
                //performing a redundant check to protect against future code changes
                throw new GATKException.ShouldNeverReachHereException("Intervals should be non-overlapping, " +
                        "so at most one interval should intersect with the start of a read.");
            }
            final T firstOverlap = overlaps.stream().findFirst().orElse(null);
            if (firstOverlap != null) {
                cachedResult = firstOverlap;
            }
            return firstOverlap;
        }
    }
}
// © 2015 - 2025 Weber Informatics LLC (Maven repository page footer — extraction artifact, commented out)