// org.broadinstitute.hellbender.cmdline.argumentcollections.IntervalArgumentCollection Maven / Gradle / Ivy
// The newest version!
package org.broadinstitute.hellbender.cmdline.argumentcollections;
import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.SAMSequenceDictionary;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineException;
import org.broadinstitute.hellbender.engine.TraversalParameters;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.*;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
/**
 * Intended to be used as an {@code @ArgumentCollection} for specifying intervals at the command line.
 * Subclasses must override {@link #getIntervalStrings()} and {@link #addToIntervalStrings(String)}.
 */
public abstract class IntervalArgumentCollection implements Serializable {
    private static final Logger logger = LogManager.getLogger(IntervalArgumentCollection.class);
    private static final long serialVersionUID = 1L;

    public static final String EXCLUDE_INTERVALS_LONG_NAME = "exclude-intervals";
    public static final String EXCLUDE_INTERVALS_SHORT_NAME = "XL";
    public static final String INTERVAL_SET_RULE_LONG_NAME = "interval-set-rule";
    public static final String INTERVAL_PADDING_LONG_NAME = "interval-padding";
    public static final String INTERVAL_EXCLUSION_PADDING_LONG_NAME = "interval-exclusion-padding";
    public static final String INTERVAL_MERGING_RULE_LONG_NAME = "interval-merging-rule";

    /**
     * Subclasses must provide a -L argument and override this to return the results of that argument.
     *
     * The -L argument specifies intervals to include in analysis and has the following semantics:
     * it can use samtools-style intervals either explicitly on the command line (e.g. -L 1 or -L 1:100-200) or
     * by loading in a file containing a list of intervals (e.g. -L myFile.intervals).
     * It can be specified multiple times.
     *
     * @return strings gathered from the command line -L argument to be parsed into intervals to include
     */
    protected abstract List<String> getIntervalStrings();

    /**
     * Add an extra interval string to the intervals to include.
     * ONLY for testing -- will throw if called after interval parsing has been performed.
     *
     * @param newInterval interval string to append to the include list
     */
    @VisibleForTesting
    protected abstract void addToIntervalStrings(String newInterval);

    /**
     * Use this argument to exclude certain parts of the genome from the analysis (like -L, but the opposite).
     * This argument can be specified multiple times. You can use samtools-style intervals either explicitly on the
     * command line (e.g. -XL 1 or -XL 1:100-200) or by loading in a file containing a list of intervals
     * (e.g. -XL myFile.intervals).
     *
     * Holds the strings gathered from the command line -XL argument, to be parsed into intervals to exclude.
     */
    // Interval list files such as Picard interval lists are structured and require specialized parsing that
    // is handled by IntervalUtils, so use suppressFileExpansion to bypass command line parser auto-expansion.
    @Argument(fullName = EXCLUDE_INTERVALS_LONG_NAME, shortName = EXCLUDE_INTERVALS_SHORT_NAME, doc = "One or more genomic intervals to exclude from processing",
            suppressFileExpansion = true, optional = true, common = true)
    protected final List<String> excludeIntervalStrings = new ArrayList<>();

    /**
     * By default, the program will take the UNION of all intervals specified using -L and/or -XL. However, you can
     * change this setting for -L, for example if you want to take the INTERSECTION of the sets instead. E.g. to
     * perform the analysis only on chromosome 1 exomes, you could specify -L exomes.intervals -L 1 --interval-set-rule
     * INTERSECTION. However, it is not possible to modify the merging approach for intervals passed using -XL (they will
     * always be merged using UNION).
     *
     * Note that if you specify both -L and -XL, the -XL interval set will be subtracted from the -L interval set.
     */
    @Argument(fullName = INTERVAL_SET_RULE_LONG_NAME, shortName = "isr", doc = "Set merging approach to use for combining interval inputs", common = true)
    protected IntervalSetRule intervalSetRule = IntervalSetRule.UNION;

    /**
     * Use this to add padding to the intervals specified using -L. For example, '-L 1:100' with a
     * padding value of 20 would turn into '-L 1:80-120'. This is typically used to add padding around targets when
     * analyzing exomes.
     */
    @Argument(fullName = INTERVAL_PADDING_LONG_NAME, shortName = "ip", doc = "Amount of padding (in bp) to add to each interval you are including.", common = true)
    protected int intervalPadding = 0;

    /**
     * Use this to add padding to the intervals specified using -XL. For example, '-XL 1:100' with a
     * padding value of 20 would turn into '-XL 1:80-120'. This is typically used to add padding around targets when
     * analyzing exomes.
     */
    @Argument(fullName = INTERVAL_EXCLUSION_PADDING_LONG_NAME, shortName= "ixp", doc = "Amount of padding (in bp) to add to each interval you are excluding.", common = true)
    protected int intervalExclusionPadding = 0;

    /**
     * By default, the program merges abutting intervals (i.e. intervals that are directly side-by-side but do not
     * actually overlap) into a single continuous interval. However you can change this behavior if you want them to be
     * treated as separate intervals instead.
     */
    @Argument(fullName = INTERVAL_MERGING_RULE_LONG_NAME, shortName = "imr", doc = "Interval merging rule for abutting intervals", optional = true)
    protected IntervalMergingRule intervalMergingRule = IntervalMergingRule.ALL;

    /**
     * Full parameters for traversal, including our parsed intervals and a flag indicating whether unmapped records
     * should be returned. Lazily initialized by {@link #getTraversalParameters(SAMSequenceDictionary)}.
     */
    protected TraversalParameters traversalParameters = null;

    /**
     * Get the intervals specified on the command line.
     *
     * @param sequenceDict used to validate intervals
     * @return a list of the given intervals after processing and validation
     * @throws GATKException if neither include nor exclude intervals were specified
     */
    public List<SimpleInterval> getIntervals( final SAMSequenceDictionary sequenceDict ) {
        return getTraversalParameters(sequenceDict).getIntervalsForTraversal();
    }

    /**
     * Get the interval set rule specified on the command line.
     */
    public IntervalSetRule getIntervalSetRule() {
        return intervalSetRule;
    }

    /**
     * Get the interval padding specified on the command line.
     */
    public int getIntervalPadding() {
        return intervalPadding;
    }

    /**
     * Get the interval exclusion padding specified on the command line.
     */
    public int getIntervalExclusionPadding() {
        return intervalExclusionPadding;
    }

    /**
     * Get the interval merging rule specified on the command line.
     */
    public IntervalMergingRule getIntervalMergingRule() {
        return intervalMergingRule;
    }

    /**
     * Returns the full set of traversal intervals specified on the command line, including parsed intervals without
     * merging intervals specified by the user on the command line. This is an advanced use case.
     *
     * NOTE: this currently does not account for interval exclusion arguments, and the result is not cached.
     *
     * @param sequenceDict used to validate intervals
     * @return the full set of intervals specified on the command line, without any merging performed
     * @throws GATKException if no intervals to include were specified
     */
    public List<SimpleInterval> getIntervalsWithoutMerging( final SAMSequenceDictionary sequenceDict ) {
        if (getIntervalStrings().isEmpty()) {
            throw new GATKException("Cannot call getIntervalsWithoutMerging() without specifying intervals to include.");
        }

        final List<GenomeLoc> intervals =
                IntervalUtils.loadIntervalsNonMerging(getIntervalStrings(), intervalPadding, new GenomeLocParser(sequenceDict));

        // Separate out requests for unmapped records from the rest of the intervals.
        // The list is not merged, so the unmapped placeholder may presumably appear more than once;
        // List.remove(Object) drops only the first match, hence the loop, so that no unmapped
        // placeholder is left behind to be converted into a SimpleInterval.
        boolean traverseUnmapped = false;
        while (intervals.contains(GenomeLoc.UNMAPPED)) {
            traverseUnmapped = true;
            intervals.remove(GenomeLoc.UNMAPPED);
        }

        return new TraversalParameters(IntervalUtils.convertGenomeLocsToSimpleIntervals(intervals), traverseUnmapped).getIntervalsForTraversal();
    }

    /**
     * Returns the full set of traversal parameters specified on the command line, including the parsed intervals
     * and a flag indicating whether unmapped records were requested.
     *
     * The parameters are parsed on the first call and cached; later calls return the cached value
     * without re-parsing, whatever dictionary is passed.
     *
     * @param sequenceDict used to validate intervals
     * @return the full set of traversal parameters specified on the command line
     * @throws GATKException if neither include nor exclude intervals were specified
     */
    public TraversalParameters getTraversalParameters( final SAMSequenceDictionary sequenceDict ) {
        if ( ! intervalsSpecified() ) {
            throw new GATKException("Cannot call getTraversalParameters() without specifying either intervals to include or exclude.");
        }

        if ( traversalParameters == null ) {
            traversalParameters = parseIntervals(new GenomeLocParser(sequenceDict), intervalMergingRule, intervalSetRule, excludeIntervalStrings);
        }

        return traversalParameters;
    }

    /**
     * Builds the traversal parameters from the include (-L) and exclude (-XL) interval strings:
     * loads the include set (or the whole reference territory when only -XL was given), subtracts the
     * exclude set, and splits off any request for unmapped records.
     *
     * @param genomeLocParser parser used to load and validate the interval strings
     * @param intervalMergingRule merging rule applied to both the include and exclude sets
     * @param intervalSetRule set rule applied to the include set (the exclude set always uses UNION)
     * @param excludeIntervalStrings interval strings to exclude
     * @return the parsed traversal parameters
     * @throws GATKException if neither include nor exclude intervals were specified
     * @throws CommandLineException.BadArgumentValue if the include intervals have an empty intersection, or
     *         the exclusions remove all included territory
     * @throws UserException if the exclude set contains the unmapped placeholder
     */
    private TraversalParameters parseIntervals(final GenomeLocParser genomeLocParser, final IntervalMergingRule intervalMergingRule,
                                               final IntervalSetRule intervalSetRule, final List<String> excludeIntervalStrings) {
        // fail fast if there are no interval arguments at all
        if (!intervalsSpecified()) {
            throw new GATKException("Cannot call parseIntervals() without specifying either intervals to include or exclude.");
        }

        GenomeLocSortedSet includeSortedSet;
        if (getIntervalStrings().isEmpty()) {
            // the -L argument isn't specified, which means that -XL was, since we checked intervalsSpecified()
            // therefore we set the include set to be the entire reference territory
            includeSortedSet = GenomeLocSortedSet.createSetFromSequenceDictionary(genomeLocParser.getSequenceDictionary());
        } else {
            try {
                includeSortedSet = IntervalUtils.loadIntervals(getIntervalStrings(), intervalSetRule, intervalMergingRule, intervalPadding, genomeLocParser);
            } catch (UserException.EmptyIntersection e) {
                throw new CommandLineException.BadArgumentValue("-L, --" + IntervalArgumentCollection.INTERVAL_SET_RULE_LONG_NAME, getIntervalStrings() + "," + intervalSetRule, "The specified intervals had an empty intersection");
            }
        }

        // exclusions are always combined with UNION, independent of the configured set rule
        final GenomeLocSortedSet excludeSortedSet = IntervalUtils.loadIntervals(excludeIntervalStrings, IntervalSetRule.UNION, intervalMergingRule, intervalExclusionPadding, genomeLocParser);
        if (excludeSortedSet.contains(GenomeLoc.UNMAPPED)) {
            throw new UserException("-XL unmapped is not currently supported");
        }

        GenomeLocSortedSet intervals;
        if (excludeSortedSet.isEmpty()) {
            // if no exclude arguments, can use the included set directly
            intervals = includeSortedSet;
        } else {
            // otherwise there are exclude arguments => must merge include and exclude GenomeLocSortedSets
            intervals = includeSortedSet.subtractRegions(excludeSortedSet);
            if (intervals.isEmpty()) {
                throw new CommandLineException.BadArgumentValue("-L,-XL", getIntervalStrings().toString() + ", " + excludeIntervalStrings.toString(), "The intervals specified for exclusion with -XL removed all territory specified by -L.");
            }

            // logging messages only printed when exclude (-XL) arguments are given
            final long toPruneSize = includeSortedSet.coveredSize();
            final long toExcludeSize = excludeSortedSet.coveredSize();
            final long intervalSize = intervals.coveredSize();
            logger.info(String.format("Initial include intervals span %d loci; exclude intervals span %d loci", toPruneSize, toExcludeSize));
            logger.info(String.format("Excluding %d loci from original intervals (%.2f%% reduction)",
                    toPruneSize - intervalSize, (toPruneSize - intervalSize) / (0.01 * toPruneSize)));
        }

        logger.info(String.format("Processing %d bp from intervals", intervals.coveredSize()));

        // Separate out requests for unmapped records from the rest of the intervals.
        boolean traverseUnmapped = false;
        if (intervals.contains(GenomeLoc.UNMAPPED)) {
            traverseUnmapped = true;
            intervals.remove(GenomeLoc.UNMAPPED);
        }

        return new TraversalParameters(IntervalUtils.convertGenomeLocsToSimpleIntervals(intervals.toList()), traverseUnmapped);
    }

    /**
     * Reports whether the include argument is exactly one interval list file.
     *
     * @return the single -L string when exactly one was given and {@code IntervalUtils.isGatkIntervalFile}
     *         accepts it; {@code null} otherwise
     */
    public String intervalListFileSpecified() {
        final List<String> includeStrings = getIntervalStrings();
        if (includeStrings.size() == 1 && IntervalUtils.isGatkIntervalFile(includeStrings.get(0))) {
            return includeStrings.get(0);
        } else {
            return null;
        }
    }

    /**
     * Have any intervals been specified for inclusion or exclusion
     *
     * @return true if at least one -L or -XL string was provided
     */
    public boolean intervalsSpecified() {
        return !( getIntervalStrings().isEmpty() && excludeIntervalStrings.isEmpty());
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy