
org.seqdoop.hadoop_bam.cli.plugins.Cat Maven / Gradle / Ivy


A Java library for the manipulation of files in common bioinformatics formats using the Hadoop MapReduce framework.
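
To pull the library in from a build tool, declare it as a dependency. A minimal Maven sketch follows; the coordinates (groupId org.seqdoop, artifactId hadoop-bam) are inferred from the package name rather than stated on this page, and 7.10.0 is the newest version noted here, so verify both against the repository:

    <!-- Coordinates inferred from the package name; verify before use. -->
    <dependency>
      <groupId>org.seqdoop</groupId>
      <artifactId>hadoop-bam</artifactId>
      <version>7.10.0</version>
    </dependency>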

// Copyright (c) 2012 Aalto University
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
// sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.

// File created: 2012-03-08 14:58:13

package org.seqdoop.hadoop_bam.cli.plugins;

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.Path;

import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SamInputResource;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.ValidationStringency;
import htsjdk.samtools.util.BlockCompressedInputStream;
import htsjdk.samtools.util.BlockCompressedOutputStream;
import htsjdk.samtools.util.BlockCompressedStreamConstants;

import org.seqdoop.hadoop_bam.custom.jargs.gnu.CmdLineParser;
import static org.seqdoop.hadoop_bam.custom.jargs.gnu.CmdLineParser.Option.*;

import org.seqdoop.hadoop_bam.SAMFormat;
import org.seqdoop.hadoop_bam.cli.CLIPlugin;
import org.seqdoop.hadoop_bam.cli.Utils;
import org.seqdoop.hadoop_bam.util.Pair;
import org.seqdoop.hadoop_bam.util.SAMOutputPreparer;

public final class Cat extends CLIPlugin {
	private static final List<Pair<CmdLineParser.Option, String>> optionDescs
		= new ArrayList<Pair<CmdLineParser.Option, String>>();

	private static final CmdLineParser.Option
		verboseOpt    = new BooleanOption('v', "verbose"),
		stringencyOpt = new  StringOption("validation-stringency=S");

	public Cat() {
		super("cat", "concatenation of partial SAM and BAM files", "1.1",
			"OUTPATH INPATH [INPATH...]", optionDescs,
			"Reads the SAM or BAM files in the given INPATHs and outputs "+
			"the reads contained within them directly to OUTPATH. Performs no "+
			"format conversions: simply concatenates the files. Note that BAM "+
			"files should not have terminator blocks in them, as they will also "+
			"be copied, which could confuse some tools."+
			"\n\n"+
			"Each INPATH can be a glob pattern as understood by Hadoop."+
			"\n\n"+
			"The header used in OUTPATH is selected from the first INPATH. The "+
			"output format is always the same as that of the INPATHs, and thus "+
			"is selected based on the first INPATH or its contents.");
	}
	static {
		optionDescs.add(new Pair<CmdLineParser.Option, String>(
			verboseOpt, "report progress verbosely"));
		optionDescs.add(new Pair<CmdLineParser.Option, String>(
			stringencyOpt, Utils.getStringencyOptHelp()));
	}
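
	// Illustrative invocation through the Hadoop-BAM CLI frontend; the jar
	// name and paths here are hypothetical:
	//
	//   hadoop jar hadoop-bam-cli.jar cat -v out.bam parts/part-*.bam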

	@Override protected int run(final CmdLineParser parser) {
		final List<String> args = parser.getRemainingArgs();
		if (args.isEmpty()) {
			System.err.println("cat :: OUTPATH not given.");
			return 3;
		}
		if (args.size() == 1) {
			System.err.println("cat :: no INPATHs given.");
			return 3;
		}

		final Path outPath = new Path(args.get(0));

		final List<String> ins = args.subList(1, args.size());

		final boolean verbose = parser.getBoolean(verboseOpt);

		final ValidationStringency stringency =
			Utils.toStringency(parser.getOptionValue(stringencyOpt, ValidationStringency.DEFAULT_STRINGENCY.toString()), "cat");
		if (stringency == null)
			return 3;

		final Configuration conf = getConf();

		// Expand the glob patterns.

		final List<Path> inputs = new ArrayList<Path>(ins.size());
		for (final String in : ins) {
			try {
				final Path p = new Path(in);
				// globStatus returns null when a non-glob path doesn't exist.
				final FileStatus[] stats = p.getFileSystem(conf).globStatus(p);
				if (stats != null)
					for (final FileStatus fstat : stats)
						inputs.add(fstat.getPath());
			} catch (IOException e) {
				System.err.printf(
					"cat :: Could not expand glob pattern '%s': %s\n",
					in, e.getMessage());
			}
		}

		if (inputs.isEmpty()) {
			System.err.println("cat :: no INPATHs matched any files.");
			return 3;
		}

		final Path input0 = inputs.get(0);

		// Infer the format from the first input path or contents.

		SAMFormat format = SAMFormat.inferFromFilePath(input0);
		if (format == null) {
			try {
				format = SAMFormat.inferFromData(
					input0.getFileSystem(conf).open(input0));
			} catch (IOException e) {
				System.err.printf("cat :: Could not read input '%s': %s\n",
				                  input0, e.getMessage());
				return 4;
			}
			if (format == null) {
				System.err.printf(
					"cat :: Unknown SAM format in input '%s'\n", inputs.get(0));
				return 4;
			}
		}

		// Choose the header.

		final SAMFileHeader header;
		try {
			final SamReader r = SamReaderFactory.makeDefault().validationStringency(stringency).
					open(SamInputResource.of(input0.getFileSystem(conf).open(input0)));

			header = r.getFileHeader();
			r.close();
		} catch (IOException e) {
			System.err.printf("cat :: Could not read input '%s': %s\n",
									input0, e.getMessage());
			return 5;
		}

		// Open the output.

		final OutputStream out;

		try {
			out = outPath.getFileSystem(conf).create(outPath);
		} catch (IOException e) {
			System.err.printf("cat :: Could not create output file: %s\n",
			                  e.getMessage());
			return 6;
		}

		// Output the header.

		try {
			// Don't use the returned stream, because we're concatenating directly
			// and don't want to apply another layer of compression to BAM.
			new SAMOutputPreparer().prepareForRecords(out, format, header);

		} catch (IOException e) {
			System.err.printf("cat :: Outputting header failed: %s\n",
									e.getMessage());
			return 7;
		}

		// Output the records from each file in the order given, converting if
		// necessary.

		int inIdx = 1;
		try { for (final Path inPath : inputs) {
			if (verbose) {
				System.out.printf("cat :: Concatenating path %d of %d...\n",
										inIdx++, inputs.size());
			}
			switch (format) {
				case SAM: {
					final InputStream in = inPath.getFileSystem(conf).open(inPath);

					// Use SamReader to grab the header, but ignore it, thus
					// ensuring that the header has been skipped.
					SamReaderFactory.makeDefault().open(SamInputResource.of(in)).getFileHeader();

					IOUtils.copyBytes(in, out, conf, false);
					in.close();
					break;
				}
				case BAM: {
					final FSDataInputStream in =
						inPath.getFileSystem(conf).open(inPath);

					// Find the block length, thankfully given to us by the BGZF
					// format. We need it in order to know how much gzipped data to
					// read after skipping the BAM header, so that we can only read
					// that much and then simply copy the remaining gzip blocks
					// directly.

					final ByteBuffer block =
						ByteBuffer.wrap(new byte[0xffff])
						          .order(ByteOrder.LITTLE_ENDIAN);
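					// The BGZF block size is carried in a 16-bit field, so in
					// practice 0xffff bytes cover the entire first block. Note
					// that the header-skipping below assumes the whole BAM
					// header fits within this first block.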

					// Don't use readFully here, since hitting EOF before the
					// buffer fills is fine. Stop once the buffer is full or
					// the stream ends.
					for (int read = 0, prev;
					     read < block.capacity()
					     && (prev = in.read(block.array(), read,
					                        block.capacity() - read)) != -1;)
					{
						read += prev;
					}

					// Find the BGZF subfield and extract the length from it.
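					// Gzip extra-field layout: XLEN is the 16-bit length at
					// offset 10; each subfield is SI1, SI2, a 16-bit SLEN, and
					// SLEN data bytes. The BGZF subfield has SI1='B', SI2='C'
					// (0x4342 when read little-endian) and carries BSIZE, the
					// total block size minus one, hence the +1 below.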
					int blockLength = 0;
					for (int xlen = (int)block.getShort(10) & 0xffff,
					         i    = 12,
					         end  = i + xlen;
					         i < end;)
					{
						final int slen = (int)block.getShort(i+2) & 0xffff;
						if (block.getShort(i) == 0x4342 && slen == 2) {
							blockLength = ((int)block.getShort(i+4) & 0xffff) + 1;
							break;
						}
						i += 4 + slen;
					}
					if (blockLength == 0)
						throw new IOException(
							"BGZF extra field not found in " + inPath);

					if (verbose) {
						System.err.printf(
							"cat ::   first block length %d\n", blockLength);
					}

					// Skip the BAM header. Can't use SAMFileReader because it'll
					// use its own BlockCompressedInputStream.

					final ByteArrayInputStream blockIn =
						new ByteArrayInputStream(block.array(), 0, blockLength);

					final BlockCompressedInputStream bin =
						new BlockCompressedInputStream(blockIn);

					// Theoretically we could write into the ByteBuffer we already
					// had, since BlockCompressedInputStream needs to read the
					// header before it can decompress any data and thereafter we
					// can freely overwrite the first 8 bytes of the header... but
					// that's a bit too nasty, so let's not.
					final ByteBuffer buf =
						ByteBuffer.wrap(new byte[8]).order(ByteOrder.LITTLE_ENDIAN);

					// Read the BAM magic number and the SAM header length, verify
					// the magic, and skip the SAM header.

					IOUtils.readFully(bin, buf.array(), 0, 8);

					final int magic     = buf.getInt(0),
					          headerLen = buf.getInt(4);
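
					// 0x014d4142 is the little-endian reading of the four-byte
					// BAM magic "BAM\1".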

					if (magic != 0x014d4142)
						throw new IOException("bad BAM magic number in " + inPath);

					IOUtils.skipFully(bin, headerLen);

					// Skip the reference sequences.
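					// Layout: an int32 count of references, then per reference
					// an int32 name length, the NUL-terminated name itself, and
					// an int32 sequence length; hence the getInt(0) + 4 skip.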

					IOUtils.readFully(bin, buf.array(), 0, 4);

					for (int i = buf.getInt(0); i-- > 0;) {
						// Read the reference name length and skip it along with the
						// reference length.
						IOUtils.readFully(bin, buf.array(), 0, 4);
						IOUtils.skipFully(bin, buf.getInt(0) + 4);
					}

					// Recompress the rest of this gzip block.

					final int remaining = bin.available();

					if (verbose)
						System.err.printf("cat ::   %d bytes to bgzip\n", remaining);

					if (remaining > 0) {
						// The overload of IOUtils.copyBytes that takes "long length"
						// was added only in Hadoop 0.20.205.0, which we don't want
						// to depend on, so copy manually.
						final byte[] remBuf = new byte[remaining];
						IOUtils.readFully(bin, remBuf, 0, remBuf.length);

						final BlockCompressedOutputStream bout =
							new BlockCompressedOutputStream(out, null);

						bout.write(remBuf);
						bout.flush();
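
						// bout is deliberately not closed: closing it would
						// write a BGZF terminator block here and close the
						// shared output stream.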
					}

					// Just copy the raw bytes comprising the remaining blocks.

					in.seek(blockLength);
					IOUtils.copyBytes(in, out, conf, false);
					in.close();
					break;
				}
			}
		}} catch (IOException e) {
			System.err.printf("cat :: Outputting records failed: %s\n",
			                  e.getMessage());
			return 8;
		}

		// For BAM, output the BGZF terminator.

		try {
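			// EMPTY_GZIP_BLOCK is the 28-byte empty BGZF block that readers
			// treat as an end-of-file marker for detecting truncated BAMs.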
			if (format == SAMFormat.BAM)
				out.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK);

			out.close();
		} catch (IOException e) {
			System.err.printf("cat :: Finishing output failed: %s\n",
			                  e.getMessage());
			return 9;
		}
		return 0;
	}
}



