All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.broadinstitute.hellbender.tools.SplitCRAM Maven / Gradle / Ivy

The newest version!
package org.broadinstitute.hellbender.tools;

import htsjdk.samtools.cram.build.CramContainerIterator;
import htsjdk.samtools.cram.build.CramIO;
import htsjdk.samtools.cram.structure.Container;
import htsjdk.samtools.cram.structure.CramHeader;
import org.broadinstitute.barclay.argparser.*;
import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.FlowBasedProgramGroup;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.exceptions.GATKException;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.regex.Pattern;

/***
 * SplitCRAM - split a cram file into smaller cram files (shards) containing a minimal number of records
 * while still respecting container boundaries.
 *
 * The tool operates on a CRAM container level and therefore is efficient but not exact in the number of
 * records on each output file (container boundaries are maintained)
 *
 * Note that CRAM files have relative record counters embedded in each container. These are not reset by
 * this tool. Therefore, the resulting files may not contain correct record counter values.
 *
 * 

Usage

* * ./gatk SplitCRAM \ * -I * input.cram * -O * output_%04d.cram * --shard-records * 5000000 * * * Notes: * 1. shard-records is optional. defaults to 10M * 2. output filename should contain a %d formatter pattern */ @CommandLineProgramProperties( summary = "Splits CRAM files efficiently by taking advantage of their container based structure", oneLineSummary = "Split CRAM files to smaller files efficiently", programGroup = FlowBasedProgramGroup.class ) @WorkflowProperties @ExperimentalFeature public class SplitCRAM extends CommandLineProgram { public static final int DEFAULT_SHARD_RECORDS = 10000000; public static final String SHARD_RECORDS_FULL_NAME = "shard-records"; public static final String SHARD_MAX_OUTPUT_COUNT = "shard-max-output-count"; public static final Pattern numeratorFormat = Pattern.compile("%[0-9]*d"); @Argument(fullName = StandardArgumentDefinitions.INPUT_LONG_NAME, shortName = StandardArgumentDefinitions.INPUT_SHORT_NAME, doc = "input cram file to split") private GATKPath cramInput = null; @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, doc = "output cram file template. should contain %d, which will be replaced by shard index", optional = true) private String cramOutputTemplate = "output_%04d.cram"; @Argument(fullName = SHARD_RECORDS_FULL_NAME, doc = "minimum threshold for number of records per shard.", optional = true) private long shardRecords = DEFAULT_SHARD_RECORDS; @Argument(fullName = SHARD_MAX_OUTPUT_COUNT, doc = "maximal number of output shards to output.", optional = true) private int shardMaxOutputCount; // locals CramContainerIterator cramContainerIterator; int shard; @Override protected void onStartup() { super.onStartup(); // check that output template contains a %d formatter if ( !numeratorFormat.matcher(cramOutputTemplate).find() ) { throw new IllegalArgumentException("output template missing a %d enumerator formatter: " + cramOutputTemplate); } } @Override protected Object doWork() { try (final CramContainerIterator cramContainerIterator = new CramContainerIterator(new BufferedInputStream(cramInput.getInputStream())) ){ // get header final CramHeader cramHeader = cramContainerIterator.getCramHeader(); // iterate while (cramContainerIterator.hasNext()) { int shardOuputCount = 0; try (final OutputStream os = nextOutputStream()) { // write headers CramIO.writeCramHeader(cramContainerIterator.getCramHeader(), os); Container.writeSAMFileHeaderContainer(cramContainerIterator.getCramHeader().getCRAMVersion(), cramContainerIterator.getSamFileHeader(), os); // iterate long records = 0; while (cramContainerIterator.hasNext() && (records < shardRecords)) { // get next container final Container container = cramContainerIterator.next(); // write container to output stream container.write(cramHeader.getCRAMVersion(), os); // update record count records += container.getContainerHeader().getNumberOfRecords(); } CramIO.writeCramEOF(cramContainerIterator.getCramHeader().getCRAMVersion(), os); shardOuputCount++; if ( shardMaxOutputCount != 0 && shardOuputCount >= shardMaxOutputCount ) { break; } } } } catch (IOException e) { throw new GATKException(e.getMessage(), e); } return null; } private OutputStream nextOutputStream() { final String filename = String.format(cramOutputTemplate, shard++); final GATKPath path = new GATKPath(filename); return new BufferedOutputStream(path.getOutputStream()); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy