All Downloads are FREE. Search and download functionality uses the official Maven repository.

org.broadinstitute.hellbender.tools.SplitReads Maven / Gradle / Ivy

The newest version!
package org.broadinstitute.hellbender.tools;

import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.util.IOUtil;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.argparser.WorkflowProperties;
import org.broadinstitute.barclay.argparser.WorkflowOutput;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.GATKPath;
import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.ReadWalker;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.readersplitters.LibraryNameSplitter;
import org.broadinstitute.hellbender.tools.readersplitters.ReadGroupIdSplitter;
import org.broadinstitute.hellbender.tools.readersplitters.ReaderSplitter;
import org.broadinstitute.hellbender.tools.readersplitters.SampleNameSplitter;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.SAMFileGATKReadWriter;

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.function.Consumer;
import java.util.stream.Collectors;

/**
 * Outputs reads from a SAM/BAM/CRAM by read group, sample and library name
 *
 * 

* Note: Not to be confused with a tool that splits reads for RNA-Seq analysis *

* *

Input

*
    *
  • A single BAM file
  • *
* *

Outputs

*
    *
  • A collection of BAM files each corresponding to a group, sample and/or library of the original BAM file
  • *
* *

Usage Example

*

Split reads in BAM file by sample name, read group and library name

*
 *   gatk SplitReads \
 *     -I input.bam \
 *     -O outputDirectory \
 *     --split-sample \
 *     --split-read-group \
 *     --split-library-name
 * 
*/ @CommandLineProgramProperties( summary = "Outputs reads from a SAM/BAM/CRAM by read group, sample and library name", oneLineSummary = "Outputs reads from a SAM/BAM/CRAM by read group, sample and library name", programGroup = ReadDataManipulationProgramGroup.class ) @DocumentedFeature @WorkflowProperties public final class SplitReads extends ReadWalker { public static final String SAMPLE_SHORT_NAME = "SM"; public static final String READ_GROUP_SHORT_NAME = "RG"; public static final String LIBRARY_NAME_SHORT_NAME = "LB"; public static final String SAMPLE_LONG_NAME = "split-sample"; public static final String READ_GROUP_LONG_NAME = "split-read-group"; public static final String LIBRARY_NAME_LONG_NAME = "split-library-name"; public static final String UNKNOWN_OUT_PREFIX = "unknown"; @Argument( shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, doc = "The directory to output SAM/BAM/CRAM files." ) @WorkflowOutput public GATKPath OUTPUT_DIRECTORY; @Argument( fullName = SAMPLE_LONG_NAME, shortName = SAMPLE_SHORT_NAME, doc = "Split file by sample." ) public boolean SAMPLE; @Argument( fullName = READ_GROUP_LONG_NAME, shortName = READ_GROUP_SHORT_NAME, doc = "Split file by read group." ) public boolean READ_GROUP; @Argument( fullName = LIBRARY_NAME_LONG_NAME, shortName = LIBRARY_NAME_SHORT_NAME, doc = "Split file by library." 
) public boolean LIBRARY_NAME; private final List> splitters = new ArrayList<>(); private Map outs = null; @Override public void onTraversalStart() { IOUtil.assertDirectoryIsWritable(OUTPUT_DIRECTORY.toPath()); if ( readArguments.getReadPathSpecifiers().size() != 1 ) { throw new UserException("This tool only accepts a single SAM/BAM/CRAM as input"); } if (SAMPLE) { splitters.add(new SampleNameSplitter()); } if (READ_GROUP) { splitters.add(new ReadGroupIdSplitter()); } if (LIBRARY_NAME) { splitters.add(new LibraryNameSplitter()); } outs = createWriters(splitters); } @Override public void apply( GATKRead read, ReferenceContext referenceContext, FeatureContext featureContext ) { outs.computeIfAbsent(getKey(splitters, read), this::createUnknownOutOnDemand).addRead(read); } @Override public void closeTool() { if ( outs != null ) { outs.values().forEach(writer -> writer.close()); } } // Create an output stream on demand for holding any reads that do not have a value for one or more of the // attributes we're grouping by private SAMFileGATKReadWriter createUnknownOutOnDemand(String attributeValue) { if (!attributeValue.equals("."+UNKNOWN_OUT_PREFIX)) { // the only attribute value we should ever discover at runtime is the string ".unknown" which is // synthesized by "getkey" below when a splitter returns null because we're splitting on some // attribute for which a given read/group has no value; anything else indicates a coding error throw new GATKException.ShouldNeverReachHereException("Unrecognized attribute value found: " + attributeValue); } return prepareSAMFileWriter(attributeValue); } // Create a new output file and prepare and return the corresponding SAMFileGATKReadWriter. 
private SAMFileGATKReadWriter prepareSAMFileWriter(final String keyName) { final GATKPath pathSpec = readArguments.getReadPathSpecifiers().get(0); final GATKPath outFile = new GATKPath( OUTPUT_DIRECTORY.toPath().resolve( pathSpec.getBaseName().orElse("") + keyName + pathSpec.getExtension().get()).toString()); return createSAMWriter(outFile, true); } /** * Creates SAMFileWriter instances for the reader splitters based on the input file. * @param splitters Reader splitters. * @return A map of file name keys to SAMFileWriter. */ private Map createWriters(final List> splitters) { final Map outs = new LinkedHashMap<>(); final SAMFileHeader samFileHeaderIn = getHeaderForReads(); // Build up a list of key options at each level. final List> splitKeys = splitters.stream() .map(splitter -> splitter.getSplitsBy(samFileHeaderIn)) .collect(Collectors.toList()); // For every combination of keys, add a SAMFileWriter. addKey(splitKeys, 0, "", key -> { outs.put(key, prepareSAMFileWriter(key)); }); return outs; } /** * Recursively builds up a key, then when it reaches the bottom of the list, calls the adder on the generated key. * @param listKeys A outer list, where each inner list contains the output options for that level. * @param listIndex The current recursive index within the listKeys. * @param key The built up key recursively * @param adder Function to run on the recursively generated key once the bottom of the outer list is reached. */ private void addKey(final List> listKeys, final int listIndex, final String key, final Consumer adder) { if (listIndex < listKeys.size()) { for (final Object newKey : listKeys.get(listIndex)) { addKey(listKeys, listIndex + 1, key + "." + newKey, adder); } } else { adder.accept(key); } } /** * Traverses the splitters generating a key for this particular record. * @param splitters The list of splitters. * @param record The record to analyze. * @return The generated key that may then be used to find the appropriate SAMFileWriter. 
*/ private String getKey(final List> splitters, final GATKRead record) { // if a read is missing the value for the target split, return the constant "unknown" which will // result in a new output stream being created on demand to hold uncategorized reads return splitters.stream() .map(s -> { final Object key = s.getSplitBy(record, getHeaderForReads()); return key == null ? UNKNOWN_OUT_PREFIX : key.toString(); }) .reduce("", (acc, item) -> acc + "." + item); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy