All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.tools.copynumber.PreprocessIntervals Maven / Gradle / Ivy

The newest version!
package org.broadinstitute.hellbender.tools.copynumber;

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.IntervalList;
import org.apache.commons.math3.util.FastMath;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.GATKTool;
import org.broadinstitute.hellbender.engine.ReferenceDataSource;
import org.broadinstitute.hellbender.tools.copynumber.arguments.CopyNumberArgumentValidationUtils;
import org.broadinstitute.hellbender.utils.IntervalMergingRule;
import org.broadinstitute.hellbender.utils.IntervalUtils;
import org.broadinstitute.hellbender.utils.Nucleotide;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import picard.cmdline.programgroups.IntervalsManipulationProgramGroup;

import java.io.File;
import java.util.List;
import java.util.stream.Collectors;

/**
 * Prepares bins for coverage collection.
 *
 * 

* The input intervals are first checked for overlapping intervals, which are merged. * The resulting intervals are then padded. The padded intervals are then split into bins. * Finally, bins that contain only Ns are filtered out. *

* *

Inputs

* *
    *
  • * Reference FASTA file *
  • *
  • * Intervals to be preprocessed. * The argument {@code interval-merging-rule} must be set to {@link IntervalMergingRule#OVERLAPPING_ONLY} * and all other common arguments for interval padding or merging must be set to their defaults. * If no intervals are specified, then each contig will be assumed to be a single interval and binned accordingly; * this produces bins appropriate for whole genome sequencing analyses. *
  • *
  • * Padding length (in bp). * Use {@code padding} to specify the size of each of the regions added to both ends of the intervals that result * after overlapping intervals have been merged. Do not use the common {@code interval-padding} argument. * Intervals that would overlap after padding by the specified amount are instead only * padded until they are adjacent. *
  • *
  • * Bin length (in bp). * If this length is not commensurate with the length of a padded interval, then the last bin will be of * different length than the others in that interval. If zero is specified, then no binning will be performed; * this is generally appropriate for targeted analyses. *
  • *
* *

Outputs

* *
    *
  • * Preprocessed Picard interval-list file. *
  • *
* *

Usage examples

* * To pad intervals by 250 bases and disable binning (e.g., for targeted analyses): * *
 *     gatk PreprocessIntervals \
 *          -R reference.fa \
 *          -L intervals.interval_list \
 *          --bin-length 0 \
 *          --padding 250 \
 *          -O preprocessed_intervals.interval_list
 * 
* * To generate consecutive bins of 1000 bases from the reference (e.g., for whole genome sequencing analyses): * *
 *     gatk PreprocessIntervals \
 *          -R reference.fa \
 *          --bin-length 1000 \
 *          --padding 0 \
 *          -O preprocessed_intervals.interval_list
 * 
* * @author Marton Kanasz-Nagy <[email protected]> * @author Samuel Lee <[email protected]> */ @CommandLineProgramProperties( summary = "Prepares bins for coverage collection", oneLineSummary = "Prepares bins for coverage collection", programGroup = IntervalsManipulationProgramGroup.class ) @DocumentedFeature public final class PreprocessIntervals extends GATKTool { public static final String BIN_LENGTH_LONG_NAME = "bin-length"; public static final String PADDING_LONG_NAME = "padding"; @Argument( doc = "Length (in bp) of the bins. If zero, no binning will be performed.", fullName = BIN_LENGTH_LONG_NAME, optional = true, minValue = 0 ) private int binLength = 1000; @Argument( doc = "Length (in bp) of the padding regions on each side of the intervals.", fullName = PADDING_LONG_NAME, optional = true, minValue = 0 ) private int padding = 250; @Argument( doc = "Output Picard interval-list file containing the preprocessed intervals.", fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME ) private File outputPreprocessedIntervalsFile; @Override public boolean requiresReference() { return true; } @Override public void onTraversalStart() { validateArguments(); } @Override public void traverse() {} // no traversal for this tool @Override public Object onTraversalSuccess() { final SAMSequenceDictionary sequenceDictionary = getBestAvailableSequenceDictionary(); final List inputIntervals = hasUserSuppliedIntervals() ? intervalArgumentCollection.getIntervals(sequenceDictionary) : IntervalUtils.getAllIntervalsForReference(sequenceDictionary); // if the user didn't add any intervals, // we assume that they wanted to do whole genome sequencing logger.info("Padding intervals..."); final IntervalList paddedIntervalList = padIntervals(inputIntervals, padding, sequenceDictionary); logger.info("Generating bins..."); final IntervalList unfilteredBins = generateBins(paddedIntervalList, binLength, sequenceDictionary); logger.info("Filtering bins containing only Ns..."); final ReferenceDataSource reference = ReferenceDataSource.of(referenceArguments.getReferencePath()); final IntervalList bins = filterBinsContainingOnlyNs(unfilteredBins, reference); logger.info(String.format("Writing bins to %s...", outputPreprocessedIntervalsFile.getAbsolutePath())); bins.write(outputPreprocessedIntervalsFile); logger.info(String.format("%s complete.", getClass().getSimpleName())); return null; } private void validateArguments() { if (hasUserSuppliedIntervals()) { CopyNumberArgumentValidationUtils.validateIntervalArgumentCollection(intervalArgumentCollection); } CopyNumberArgumentValidationUtils.validateOutputFiles(outputPreprocessedIntervalsFile); } private static IntervalList padIntervals(final List inputIntervals, final int padding, final SAMSequenceDictionary sequenceDictionary) { final List paddedIntervals = inputIntervals.stream() .map(i -> new SimpleInterval( i.getContig(), Math.max(1, i.getStart() - padding), Math.min(i.getEnd() + padding, sequenceDictionary.getSequence(i.getContig()).getSequenceLength()))) .collect(Collectors.toList()); // alter the padded intervals in place to eliminate overlaps for (int i = 0; i < paddedIntervals.size() - 1; i++) { final SimpleInterval thisInterval = paddedIntervals.get(i); final SimpleInterval nextInterval = paddedIntervals.get(i + 1); if (thisInterval.overlaps(nextInterval)) { final int originalThisEnd = inputIntervals.get(i).getEnd(); final int originalNextStart = inputIntervals.get(i + 1).getStart(); final int newThisEnd = (originalThisEnd + originalNextStart) / 2; final int newNextStart = newThisEnd + 1; paddedIntervals.set(i, new SimpleInterval(thisInterval.getContig(), thisInterval.getStart(), newThisEnd)); paddedIntervals.set(i + 1, new SimpleInterval(nextInterval.getContig(), newNextStart, nextInterval.getEnd())); } } final IntervalList paddedIntervalList = new IntervalList(sequenceDictionary); paddedIntervals.forEach(i -> paddedIntervalList.add(new Interval(i.getContig(), i.getStart(), i.getEnd()))); return paddedIntervalList; } private static IntervalList generateBins(final IntervalList preparedIntervalList, final int binLength, final SAMSequenceDictionary sequenceDictionary) { if (binLength == 0) { return IntervalList.copyOf(preparedIntervalList); } final IntervalList bins = new IntervalList(sequenceDictionary); for (final Interval interval : preparedIntervalList) { for (int binStart = interval.getStart(); binStart <= interval.getEnd(); binStart += binLength) { final int binEnd = FastMath.min(binStart + binLength - 1, interval.getEnd()); bins.add(new Interval(interval.getContig(), binStart, binEnd)); } } return bins; } private static IntervalList filterBinsContainingOnlyNs(final IntervalList unfilteredBins, final ReferenceDataSource reference) { final IntervalList bins = new IntervalList(reference.getSequenceDictionary()); for (final Interval unfilteredBin : unfilteredBins) { if (!Utils.stream(reference.query(new SimpleInterval(unfilteredBin))).allMatch(b -> Nucleotide.decode(b) == Nucleotide.N)) { bins.add(unfilteredBin); } } return bins; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy