org.broadinstitute.hellbender.tools.copynumber.AnnotateIntervals Maven / Gradle / Ivy
The newest version!
package org.broadinstitute.hellbender.tools.copynumber;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.CoordMath;
import htsjdk.samtools.util.Locatable;
import htsjdk.tribble.bed.BEDFeature;
import org.apache.commons.collections4.IteratorUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.CopyNumberProgramGroup;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.FeatureInput;
import org.broadinstitute.hellbender.engine.FeatureManager;
import org.broadinstitute.hellbender.engine.GATKTool;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.engine.ReferenceDataSource;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.copynumber.arguments.CopyNumberArgumentValidationUtils;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AnnotatedIntervalCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SimpleLocatableMetadata;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.AnnotatedInterval;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.annotation.AnnotationKey;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.annotation.AnnotationMap;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.annotation.CopyNumberAnnotations;
import org.broadinstitute.hellbender.utils.GenomeLoc;
import org.broadinstitute.hellbender.utils.GenomeLocParser;
import org.broadinstitute.hellbender.utils.IntervalMergingRule;
import org.broadinstitute.hellbender.utils.IntervalUtils;
import org.broadinstitute.hellbender.utils.Nucleotide;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
/**
* Annotates intervals with GC content, and optionally, mappability and segmental-duplication content.
* The output may optionally be used as input to {@link CreateReadCountPanelOfNormals}, {@link DenoiseReadCounts},
* and {@link GermlineCNVCaller}.
*
* Inputs
*
*
* -
* Reference FASTA file
*
* -
* Intervals to be annotated. Supported formats are described in
* Article#1319.
* The argument {@code interval-merging-rule} must be set to {@link IntervalMergingRule#OVERLAPPING_ONLY}
* and all other common arguments for interval padding or merging must be set to their defaults.
*
* -
* (Optional) Umap single-read mappability track.
* This is a BED file in .bed or .bed.gz format that identifies uniquely mappable regions of the genome.
* The track should correspond to the appropriate read length and overlapping intervals must be merged.
* See https://bismap.hoffmanlab.org/. If scores are provided,
* intervals will be annotated with the length-weighted average; note that NaN scores will be taken as unity.
* Otherwise, scores for covered and uncovered intervals will be taken as unity and zero, respectively.
*
* -
* (Optional) Segmental-duplication track.
* This is a BED file in .bed or .bed.gz format that identifies segmental-duplication regions of the genome.
* Overlapping intervals must be merged. If scores are provided, intervals will be annotated with the
* length-weighted average; note that NaN scores will be taken as unity. Otherwise, scores for covered and
* uncovered intervals will be taken as unity and zero, respectively.
*
*
*
* Outputs
*
*
* -
* Annotated-intervals file.
* This is a tab-separated values (TSV) file with a SAM-style header containing a sequence dictionary,
* a row specifying the column headers for the contained annotations (see {@link CopyNumberAnnotations}
* for possible annotations), and the corresponding entry rows.
*
*
*
* Usage examples
*
*
* gatk AnnotateIntervals \
* -R reference.fa \
* -L intervals.interval_list \
* --interval-merging-rule OVERLAPPING_ONLY \
* -O annotated_intervals.tsv
*
*
*
* gatk AnnotateIntervals \
* -R reference.fa \
* -L intervals.interval_list \
* --mappability-track mappability.bed.gz \
* --segmental-duplication-track segmental_duplication.bed.gz \
* --interval-merging-rule OVERLAPPING_ONLY \
* -O annotated_intervals.tsv
*
*
* @author Samuel Lee <[email protected]>
*/
@CommandLineProgramProperties(
summary = "Annotates intervals with GC content, mappability, and segmental-duplication content",
oneLineSummary = "Annotates intervals with GC content, mappability, and segmental-duplication content",
programGroup = CopyNumberProgramGroup.class
)
@DocumentedFeature
public final class AnnotateIntervals extends GATKTool {
private static final int DEFAULT_FEATURE_QUERY_LOOKAHEAD_IN_BP = 1_000_000;
public static final String MAPPABILITY_TRACK_PATH_LONG_NAME = "mappability-track";
public static final String SEGMENTAL_DUPLICATION_TRACK_PATH_LONG_NAME = "segmental-duplication-track";
public static final String FEATURE_QUERY_LOOKAHEAD = "feature-query-lookahead";
@Argument(
doc = "Output file for annotated intervals.",
fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME
)
private File outputAnnotatedIntervalsFile;
@Argument(
doc = "Path to Umap single-read mappability track in .bed or .bed.gz format (see https://bismap.hoffmanlab.org/). " +
"Overlapping intervals must be merged.",
fullName = MAPPABILITY_TRACK_PATH_LONG_NAME,
optional = true
)
private FeatureInput mappabilityTrackPath;
@Argument(
doc = "Path to segmental-duplication track in .bed or .bed.gz format. " +
"Overlapping intervals must be merged.",
fullName = SEGMENTAL_DUPLICATION_TRACK_PATH_LONG_NAME,
optional = true
)
private FeatureInput segmentalDuplicationTrackPath;
@Argument(
doc = "Number of bases to cache when querying feature tracks.",
fullName = FEATURE_QUERY_LOOKAHEAD,
optional = true
)
private int featureQueryLookahead = DEFAULT_FEATURE_QUERY_LOOKAHEAD_IN_BP;
@Override
public boolean requiresReference() {
return true;
}
@Override
public boolean requiresIntervals() {
return true;
}
private List intervals;
private SAMSequenceDictionary sequenceDictionary;
private ReferenceDataSource reference;
private FeatureManager features;
private List> annotators = new ArrayList<>();
private AnnotatedIntervalCollection annotatedIntervals;
@Override
public void onTraversalStart() {
validateArguments();
logger.info("Loading intervals for annotation...");
sequenceDictionary = getBestAvailableSequenceDictionary();
intervals = intervalArgumentCollection.getIntervals(sequenceDictionary);
logger.info("Loading resources for annotation...");
reference = ReferenceDataSource.of(referenceArguments.getReferencePath()); //the GATKTool ReferenceDataSource is package-protected, so we cannot access it directly
features = new FeatureManager( //the GATKTool FeatureManager is package-protected, so we cannot access it directly
this,
featureQueryLookahead,
cloudPrefetchBuffer,
cloudIndexPrefetchBuffer,
getGenomicsDBOptions());
// always perform GC-content annotation
logger.info("Adding GC-content annotator...");
annotators.add(new GCContentAnnotator());
// add optional annotators
if (mappabilityTrackPath != null) {
checkForOverlaps(features, mappabilityTrackPath, sequenceDictionary);
logger.info("Adding mappability annotator...");
annotators.add(new MappabilityAnnotator(mappabilityTrackPath));
}
if (segmentalDuplicationTrackPath != null) {
checkForOverlaps(features, segmentalDuplicationTrackPath, sequenceDictionary);
logger.info("Adding segmental-duplication-content annotator...");
annotators.add(new SegmentalDuplicationContentAnnotator(segmentalDuplicationTrackPath));
}
logger.info("Annotating intervals...");
}
private void validateArguments() {
CopyNumberArgumentValidationUtils.validateIntervalArgumentCollection(intervalArgumentCollection);
CopyNumberArgumentValidationUtils.validateOutputFiles(outputAnnotatedIntervalsFile);
}
private static void checkForOverlaps(final FeatureManager featureManager,
final FeatureInput featureTrackPath,
final SAMSequenceDictionary sequenceDictionary) {
final List features = IteratorUtils.toList(featureManager.getFeatureIterator(featureTrackPath));
final GenomeLocParser parser = new GenomeLocParser(sequenceDictionary);
final List genomeLocs = IntervalUtils.genomeLocsFromLocatables(parser, features);
final List mergedGenomeLocs = IntervalUtils.mergeIntervalLocations(
genomeLocs, IntervalMergingRule.OVERLAPPING_ONLY);
if (genomeLocs.size() != mergedGenomeLocs.size()) {
throw new UserException.BadInput(String.format("Feature track %s contains overlapping intervals; " +
"these should be merged.", featureTrackPath));
}
}
@Override
public void traverse() {
final List annotatedIntervalList = new ArrayList<>(intervals.size());
for (final SimpleInterval interval : intervals) {
if (interval.getLengthOnReference() == 0) {
throw new UserException.BadInput(String.format("Interval cannot have zero length: %s", interval));
}
final ReferenceContext referenceContext = new ReferenceContext(reference, interval);
final FeatureContext featureContext = new FeatureContext(features, interval);
final AnnotationMap annotations = new AnnotationMap(annotators.stream()
.collect(Collectors.mapping(
a -> Pair.of(
a.getAnnotationKey(),
a.applyAndValidate(interval, referenceContext, featureContext)),
Collectors.toList())));
annotatedIntervalList.add(new AnnotatedInterval(interval, annotations));
progressMeter.update(interval);
}
annotatedIntervals = new AnnotatedIntervalCollection(new SimpleLocatableMetadata(sequenceDictionary), annotatedIntervalList);
}
@Override
public Object onTraversalSuccess() {
reference.close();
features.close();
logger.info(String.format("Writing annotated intervals to %s...", outputAnnotatedIntervalsFile.getAbsolutePath()));
annotatedIntervals.write(outputAnnotatedIntervalsFile);
logger.info(String.format("%s complete.", getClass().getSimpleName()));
return super.onTraversalSuccess();
}
/**
* If additional annotators are added to this tool, they should follow this interface.
* Validation that the required resources are available should be performed before
* calling {@link IntervalAnnotator#apply}.
*/
abstract static class IntervalAnnotator {
public abstract AnnotationKey getAnnotationKey();
/**
* @param interval assumed to have non-zero length
*/
abstract T apply(final Locatable interval,
final ReferenceContext referenceContext,
final FeatureContext featureContext);
/**
* @param interval assumed to have non-zero length
*/
T applyAndValidate(final Locatable interval,
final ReferenceContext referenceContext,
final FeatureContext featureContext) {
return getAnnotationKey().validate(apply(interval, referenceContext, featureContext));
}
}
private static class GCContentAnnotator extends IntervalAnnotator {
@Override
public AnnotationKey getAnnotationKey() {
return CopyNumberAnnotations.GC_CONTENT;
}
/**
* @param interval assumed to have non-zero length
*/
@Override
Double apply(final Locatable interval,
final ReferenceContext referenceContext,
final FeatureContext featureContext) {
final Nucleotide.Counter counter = new Nucleotide.Counter();
counter.addAll(referenceContext.getBases());
final long gcCount = counter.get(Nucleotide.C) + counter.get(Nucleotide.G);
final long atCount = counter.get(Nucleotide.A) + counter.get(Nucleotide.T);
final long totalCount = gcCount + atCount;
return totalCount == 0 ? Double.NaN : gcCount / (double) totalCount;
}
}
/**
* If scores are provided, intervals will be annotated with the length-weighted average; note that NaN scores will
* be taken as unity. Otherwise, scores for covered and uncovered intervals will be taken as unity and zero,
* respectively.
*/
abstract static class BEDLengthWeightedAnnotator extends IntervalAnnotator {
private final FeatureInput trackPath;
BEDLengthWeightedAnnotator(final FeatureInput trackPath) {
this.trackPath = trackPath;
}
/**
* @param interval assumed to have non-zero length
*/
@Override
Double apply(final Locatable interval,
final ReferenceContext referenceContext,
final FeatureContext featureContext) {
double lengthWeightedSum = 0.;
final List features = featureContext.getValues(trackPath);
for (final BEDFeature feature : features) {
final double scoreOrNaN = (double) feature.getScore();
final double score = Double.isNaN(scoreOrNaN) ? 1. : scoreOrNaN; // missing or NaN score -> score = 1
lengthWeightedSum += score *
CoordMath.getOverlap(
feature.getStart(), feature.getEnd() - 1, // zero-based
interval.getStart(), interval.getEnd()); // one-based
}
return lengthWeightedSum / interval.getLengthOnReference();
}
}
private static class MappabilityAnnotator extends BEDLengthWeightedAnnotator {
MappabilityAnnotator(final FeatureInput mappabilityTrackPath) {
super(mappabilityTrackPath);
}
@Override
public AnnotationKey getAnnotationKey() {
return CopyNumberAnnotations.MAPPABILITY;
}
}
private static class SegmentalDuplicationContentAnnotator extends BEDLengthWeightedAnnotator {
SegmentalDuplicationContentAnnotator(final FeatureInput segmentalDuplicationTrackPath) {
super(segmentalDuplicationTrackPath);
}
@Override
public AnnotationKey getAnnotationKey() {
return CopyNumberAnnotations.SEGMENTAL_DUPLICATION_CONTENT;
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy