All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.tools.walkers.SplitIntervals Maven / Gradle / Ivy

There is a newer version: 4.6.0.0
Show newest version
package org.broadinstitute.hellbender.tools.walkers;

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.IntervalList;
import htsjdk.samtools.util.RuntimeIOException;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.argumentcollections.IntervalArgumentCollection;
import org.broadinstitute.hellbender.engine.GATKTool;
import org.broadinstitute.hellbender.utils.IntervalUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.param.ParamUtils;
import picard.cmdline.programgroups.IntervalsManipulationProgramGroup;
import picard.util.IntervalList.IntervalListScatterMode;
import picard.util.IntervalList.IntervalListScatterer;

import java.io.File;
import java.text.DecimalFormat;
import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

/**
 * This tool takes in intervals via the standard arguments of
 * {@link IntervalArgumentCollection} and splits them into interval files for scattering. The resulting files contain
 * equal number of bases.
 *
 * 

Standard GATK engine arguments include -L and -XL, interval padding, and interval set rule etc. * For example, for the -L argument, the tool accepts GATK-style intervals (.list or .intervals), BED files * and VCF files. See --subdivision-mode parameter for more options.

* *

Usage example

* *
 * gatk SplitIntervals \
 *   -R ref_fasta.fa \
 *   -L intervals.list \
 *   --scatter-count 10 \
 *   -O interval-files-folder
 * 
* *

* The -O argument specifies a directory name for the scatter intervals files. Each file will be named, e.g 0000-scattered.interval_list, * 0001-scattered.interval_list, 0002-scattered.interval_list and so on. * The default --scatter-count is 1 and so this value should be changed to utilize the tool's functionality. * Specify --subdivision-mode BALANCING_WITHOUT_INTERVAL_SUBDIVISION to avoid splitting input intervals -- that is, the set * of input intervals is split, but individual intervals are left intact. This may affect results when using assembly-based callers downstream. *

* * *
 *  * gatk SplitIntervals \
 *  *   -R ref_fasta.fa \
 *  *   -L adjacent_intervals.list \
 *  *   --scatter-count 10 \
 *  *   --interval-merge-rule OVERLAPPING_ONLY \
 *  *   --subdivision-mode BALANCING_WITHOUT_INTERVAL_SUBDIVISION_WITH_OVERFLOW
 *  *   -O interval-files-folder
 *  * 
* *

* Note that adjacent intervals will be merged by default. In cases where the desired behavior is to apportion a set * of small adjacent intervals with nearly uniform runtime among X new interval lists, the argument * `--interval-merge-rule OVERLAPPING_ONLY` should be included. *

* * */ @CommandLineProgramProperties( summary = "Split intervals into sub-interval files.", oneLineSummary = "Split intervals into sub-interval files.", programGroup = IntervalsManipulationProgramGroup.class ) @DocumentedFeature public class SplitIntervals extends GATKTool { public static final String SCATTER_COUNT_SHORT_NAME = "scatter"; public static final String SCATTER_COUNT_LONG_NAME = "scatter-count"; public static final String SUBDIVISION_MODE_SHORT_NAME = "mode"; public static final String SUBDIVISION_MODE_lONG_NAME = "subdivision-mode"; public static final String DONT_MIX_CONTIGS_LONG_NAME = "dont-mix-contigs"; public static final String MIN_CONTIG_SIZE_LONG_NAME = "min-contig-size"; public static final String INTERVAL_FILE_EXTENSION_FULL_NAME = "extension"; public static final String PICARD_INTERVAL_FILE_EXTENSION = "interval_list"; public static final String DEFAULT_EXTENSION = "-scattered." + PICARD_INTERVAL_FILE_EXTENSION; public static final int DEFAULT_NUMBER_OF_DIGITS = 4; //to preserve backward compatibility @Argument(fullName = SCATTER_COUNT_LONG_NAME, shortName = SCATTER_COUNT_SHORT_NAME, doc = "scatter count: number of output interval files to split into", optional = true) private int scatterCount = 1; @Argument(fullName = MIN_CONTIG_SIZE_LONG_NAME, doc = "Minimum contig size to keep if getting intervals from the reference", optional = true) private int minContigSize = 0; @Argument(fullName = SUBDIVISION_MODE_lONG_NAME, shortName = SUBDIVISION_MODE_SHORT_NAME, doc = "How to divide intervals.") private IntervalListScatterMode subdivisionMode = IntervalListScatterMode.INTERVAL_SUBDIVISION; @Argument(doc = "The directory into which to write the scattered interval sub-directories.", fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME) public File outputDir; @Argument(doc = "Extension to use when writing interval files", fullName = INTERVAL_FILE_EXTENSION_FULL_NAME, optional = true) public String extension = DEFAULT_EXTENSION; @Argument(doc = "Scattered interval files do not contain intervals from multiple contigs. This is applied after the initial scatter, so that the requested scatter count is a lower bound on the number of actual scattered files.", fullName = DONT_MIX_CONTIGS_LONG_NAME, optional = true) public boolean dontMixContigs = false; @Override public void onTraversalStart() { ParamUtils.isPositive(scatterCount, "scatter-count must be > 0."); if (!outputDir.exists() && !outputDir.mkdir()) { throw new RuntimeIOException("Unable to create directory: " + outputDir.getAbsolutePath()); } // in general dictionary will be from the reference, but using -I reads.bam or -F variants.vcf // to use the sequence dict from a bam or vcf is also supported final SAMSequenceDictionary sequenceDictionary = getBestAvailableSequenceDictionary(); final List intervals = hasUserSuppliedIntervals() ? intervalArgumentCollection.getIntervals(sequenceDictionary) : IntervalUtils.getAllIntervalsForReference(sequenceDictionary).stream() .filter(contig -> contig.getLengthOnReference() >= minContigSize) .collect(Collectors.toList()); final IntervalList intervalList = new IntervalList(sequenceDictionary); intervals.stream().map(si -> new Interval(si.getContig(), si.getStart(), si.getEnd())).forEach(intervalList::add); final IntervalListScatterer scatterer = subdivisionMode.make(); final List scattered = scatterer.scatter(intervalList, scatterCount); // optionally split interval lists that contain intervals from multiple contigs final List scatteredFinal = !dontMixContigs ? scattered : scattered.stream().flatMap(il -> il.getIntervals().stream() .collect(Collectors.groupingBy(Interval::getContig)).entrySet().stream() // group each interval list into sublists .sorted(Comparator.comparingInt(entry -> sequenceDictionary.getSequenceIndex(entry.getKey()))) // sort entries by contig .map(entry -> entry.getValue()) // discard the keys and just keep the lists of intervals .map(list -> { final IntervalList singleContigList = new IntervalList(sequenceDictionary); singleContigList.addall(list); return singleContigList; }) // turn the intervals back into an IntervalList ).collect(Collectors.toList()); final int maxNumberOfPlaces = Math.max((int)Math.floor(Math.log10(scatterCount-1))+1, DEFAULT_NUMBER_OF_DIGITS); final String formatString = "%0" + maxNumberOfPlaces + "d"; IntStream.range(0, scatteredFinal.size()).forEach(n -> scatteredFinal.get(n).write(new File(outputDir, String.format(formatString, n) + extension))); } @Override public void traverse() { } // no traversal for this tool! }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy