org.seqdoop.hadoop_bam.BAMSplitGuesser Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of hadoop-bam Show documentation
A Java library for the manipulation of files in common bioinformatics formats using the Hadoop MapReduce framework.
There is a newer version: 7.10.0
// Copyright (c) 2011 Aalto University
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
// sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.

// File created: 2011-01-17 15:17:59

package org.seqdoop.hadoop_bam;

import htsjdk.samtools.SAMRecord;
import java.io.InputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.Arrays;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.GenericOptionsParser;

import htsjdk.samtools.BAMRecordCodec;
import htsjdk.samtools.FileTruncatedException;
import htsjdk.samtools.SAMFormatException;
import htsjdk.samtools.seekablestream.SeekableStream;
import htsjdk.samtools.util.BlockCompressedInputStream;
import htsjdk.samtools.util.RuntimeEOFException;
import htsjdk.samtools.util.RuntimeIOException;

import org.seqdoop.hadoop_bam.util.SAMHeaderReader;
import org.seqdoop.hadoop_bam.util.SeekableArrayStream;
import org.seqdoop.hadoop_bam.util.WrapSeekable;

/** A class for heuristically finding BAM record positions inside an area of
 * a BAM file.
 */
public class BAMSplitGuesser {
	private       SeekableStream             inFile, in;
	private       BlockCompressedInputStream bgzf;
	private final BAMRecordCodec             bamCodec;
	private final ByteBuffer                 buf;
	private final int                        referenceSequenceCount;

	// We want to go through this many BGZF blocks fully, checking that they
	// contain valid BAM records, when guessing a BAM record position.
	private final static byte BLOCKS_NEEDED_FOR_GUESS = 3;

	// Since the max size of a BGZF block is 0xffff (64K), and we might be just
	// one byte off from the start of the previous one, we need 0xfffe bytes for
	// the start, and then 0xffff times the number of blocks we want to go
	// through.
	private final static int MAX_BYTES_READ =
		BLOCKS_NEEDED_FOR_GUESS * 0xffff + 0xfffe;

	private final static int BGZF_MAGIC     = 0x04088b1f;
	private final static int BGZF_MAGIC_SUB = 0x00024342;
	private final static int BGZF_SUB_SIZE  = 4 + 2;

	private final static int SHORTEST_POSSIBLE_BAM_RECORD = 4*9 + 1 + 1 + 1;

	/** The stream must point to a valid BAM file, because the header is read
	 * from it.
	 */
	public BAMSplitGuesser(
			SeekableStream ss, Configuration conf)
		throws IOException
	{
		this(ss, ss, conf);

		// Secondary check that the header points to a BAM file: Picard can get
		// things wrong due to its autodetection.
		ss.seek(0);
		if (ss.read(buf.array(), 0, 4) != 4 || buf.getInt(0) != BGZF_MAGIC)
			throw new SAMFormatException("Does not seem like a BAM file");
	}

	public BAMSplitGuesser(
			SeekableStream ss, InputStream headerStream, Configuration conf)
		throws IOException
	{
		inFile = ss;

		buf = ByteBuffer.allocate(8);
		buf.order(ByteOrder.LITTLE_ENDIAN);

		referenceSequenceCount =
			SAMHeaderReader.readSAMHeaderFrom(headerStream, conf)
			.getSequenceDictionary().size();

		bamCodec = new BAMRecordCodec(null, new LazyBAMRecordFactory());
	}

	/** Finds a virtual BAM record position in the physical position range
	 * [beg,end). Returns end if no BAM record was found.
	 */
	public long guessNextBAMRecordStart(long beg, long end)
		throws IOException
	{
		// Buffer what we need to go through.

		byte[] arr = new byte[MAX_BYTES_READ];

		this.inFile.seek(beg);
		int totalRead = 0;
		for (int left = Math.min((int)(end - beg), arr.length); left > 0;) {
			final int r = inFile.read(arr, totalRead, left);
			if (r < 0)
				break;
			totalRead += r;
			left -= r;
		}
		arr = Arrays.copyOf(arr, totalRead);

		this.in = new SeekableArrayStream(arr);

		this.bgzf = new BlockCompressedInputStream(this.in);
		this.bgzf.setCheckCrcs(true);

		this.bamCodec.setInputStream(bgzf);

		final int firstBGZFEnd = Math.min((int)(end - beg), 0xffff);

		// cp: Compressed Position, indexes the entire BGZF input.
		for (int cp = 0;; ++cp) {
			final PosSize psz = guessNextBGZFPos(cp, firstBGZFEnd);
			if (psz == null)
				return end;

			final int  cp0     = cp = psz.pos;
			final long cp0Virt = (long)cp0 << 16;
			try {
				bgzf.seek(cp0Virt);

			// This has to catch Throwable, because it's possible to get an
			// OutOfMemoryError due to an overly large size.
			} catch (Throwable e) {
				// Guessed BGZF position incorrectly: try the next guess.
				continue;
			}

			// up: Uncompressed Position, indexes the data inside the BGZF block.
			for (int up = 0;; ++up) {
				final int up0 = up = guessNextBAMPos(cp0Virt, up, psz.size);

				if (up0 < 0) {
					// No BAM records found in the BGZF block: try the next BGZF
					// block.
					break;
				}

				// Verify that we can actually decode BLOCKS_NEEDED_FOR_GUESS worth
				// of records starting at (cp0,up0).
				bgzf.seek(cp0Virt | up0);
				boolean decodedAny = false;
				try {
					byte b = 0;
					int prevCP = cp0;
					while (b < BLOCKS_NEEDED_FOR_GUESS)
					{
						SAMRecord record = bamCodec.decode();
						if (record == null) {
							break;
						}
						record.getCigar(); // force decoding of CIGAR
						decodedAny = true;

						final int cp2 = (int)(bgzf.getFilePointer() >>> 16);
						if (cp2 != prevCP) {
							// The compressed position changed so we must be in a new
							// block.
							assert cp2 > prevCP;
							prevCP = cp2;
							++b;
						}
					}

					// Running out of records to verify is fine as long as we
					// verified at least something. It should only happen if we
					// couldn't fill the array.
					if (b < BLOCKS_NEEDED_FOR_GUESS) {
						assert arr.length < MAX_BYTES_READ;
						if (!decodedAny)
							continue;
					}
				}
                                  catch (SAMFormatException     e) { continue; }
				  catch (FileTruncatedException e) { continue; }
				  catch (OutOfMemoryError       e) { continue; }
				  catch (IllegalArgumentException e) { continue; }
				  catch (RuntimeIOException     e) { continue; }
				  catch (RuntimeEOFException    e) {
					// This can happen legitimately if the [beg,end) range is too
					// small to accommodate BLOCKS_NEEDED_FOR_GUESS and we get cut
					// off in the middle of a record. In that case, our stream
					// should have hit EOF as well. If we've then verified at least
					// something, go ahead with it and hope for the best.
					if (!decodedAny && this.in.eof())
						continue;
				}

				return beg+cp0 << 16 | up0;
			}
		}
	}

	private static class PosSize {
		public int pos;
		public int size;
		public PosSize(int p, int s) { pos = p; size = s; }
	}

	// Gives the compressed size on the side. Returns null if it doesn't find
	// anything.
	private PosSize guessNextBGZFPos(int p, int end) {
		try { for (;;) {
			for (;;) {
				in.seek(p);
				in.read(buf.array(), 0, 4);
				int n = buf.getInt(0);

				if (n == BGZF_MAGIC)
					break;

				// Skip ahead a bit more than 1 byte if you can.
				if (n >>> 8 == BGZF_MAGIC << 8 >>> 8)
					++p;
				else if (n >>> 16 == BGZF_MAGIC << 16 >>> 16)
					p += 2;
				else
					p += 3;

				if (p >= end)
					return null;
			}
			// Found what looks like a gzip block header: now get XLEN and
			// search for the BGZF subfield.
			final int p0 = p;
			p += 10;
			in.seek(p);
			in.read(buf.array(), 0, 2);
			p += 2;
			final int xlen   = getUShort(0);
			final int subEnd = p + xlen;

			while (p < subEnd) {
				in.read(buf.array(), 0, 4);

				if (buf.getInt(0) != BGZF_MAGIC_SUB) {
					p += 4 + getUShort(2);
					in.seek(p);
					continue;
				}

				// Found it: this is close enough to a BGZF block, make it
				// our guess.

				// But find out the size before returning. First, grab bsize:
				// we'll need it later.
				in.read(buf.array(), 0, 2);
				int bsize = getUShort(0);

				// Then skip the rest of the subfields.
				p += BGZF_SUB_SIZE;
				while (p < subEnd) {
					in.seek(p);
					in.read(buf.array(), 0, 4);
					p += 4 + getUShort(2);
				}
				if (p != subEnd) {
					// Cancel our guess because the xlen field didn't match the
					// data.
					break;
				}

				// Now skip past the compressed data and the CRC-32.
				p += bsize - xlen - 19 + 4;
				in.seek(p);
				in.read(buf.array(), 0, 4);
				return new PosSize(p0, buf.getInt(0));
			}
			// No luck: look for the next gzip block header. Start right after
			// where we last saw the identifiers, although we could probably
			// safely skip further ahead. (If we find the correct one right
			// now, the previous block contained 0x1f8b0804 bytes of data: that
			// seems... unlikely.)
			p = p0 + 4;

		}} catch (IOException e) {
			return null;
		}
	}

	private int guessNextBAMPos(long cpVirt, int up, int cSize) {
		// What we're actually searching for is what's at offset [4], not [0]. So
		// skip ahead by 4, thus ensuring that whenever we find a valid [0] it's
		// at position up or greater.
		up += 4;

		try {
			while (up + SHORTEST_POSSIBLE_BAM_RECORD - 4 < cSize) {
				bgzf.seek(cpVirt | up);
				bgzf.read(buf.array(), 0, 8);

				// If the first two checks fail we have what looks like a valid
				// reference sequence ID. Assume we're at offset [4] or [24], i.e.
				// the ID of either this read or its mate, respectively. So check
				// the next integer ([8] or [28]) to make sure it's a 0-based
				// leftmost coordinate.
				final int id  = buf.getInt(0);
				final int pos = buf.getInt(4);
				if (id < -1 || id > referenceSequenceCount || pos < -1) {
					++up;
					continue;
				}

				// Okay, we could be at [4] or [24]. Assuming we're at [4], check
				// that [24] is valid. Assume [4] because we should hit it first:
				// the only time we expect to hit [24] is at the beginning of the
				// split, as part of the first read we should skip.

				bgzf.seek(cpVirt | up+20);
				bgzf.read(buf.array(), 0, 8);

				final int nid  = buf.getInt(0);
				final int npos = buf.getInt(4);
				if (nid < -1 || nid > referenceSequenceCount || npos < -1) {
					++up;
					continue;
				}

				// So far so good: [4] and [24] seem okay. Now do something a bit
				// more involved: make sure that [36 + [12]&0xff - 1] == 0: that
				// is, the name of the read should be null terminated.

				// Move up to 0 just to make it less likely that we get confused
				// with offsets. Remember where we should continue from if we
				// reject this up.
				final int nextUP = up + 1;
				up -= 4;

				bgzf.seek(cpVirt | up+12);
				bgzf.read(buf.array(), 0, 4);

				final int nameLength = buf.getInt(0) & 0xff;
				if (nameLength < 1) {
					// Names are null-terminated so length must be at least one
					up = nextUP;
					continue;
				}

				final int nullTerminator = up + 36 + nameLength-1;

				if (nullTerminator >= cSize) {
					// This BAM record can't fit here. But maybe there's another in
					// the remaining space, so try again.
					up = nextUP;
					continue;
				}

				bgzf.seek(cpVirt | nullTerminator);
				bgzf.read(buf.array(), 0, 1);

				if (buf.get(0) != 0) {
					up = nextUP;
					continue;
				}

				// All of [4], [24], and [36 + [12]&0xff] look good. If [0] is also
				// sensible, that's good enough for us. "Sensible" to us means the
				// following:
				//
				// [0] >= 4*([16]&0xffff) + [20] + ([20]+1)/2 + 4*8 + ([12]&0xff)

				// Note that [0] is "length of the _remainder_ of the alignment
				// record", which is why this uses 4*8 instead of 4*9.
				int zeroMin = 4*8 + nameLength;

				bgzf.seek(cpVirt | up+16);
				bgzf.read(buf.array(), 0, 8);

				zeroMin += (buf.getInt(0) & 0xffff) * 4;
				zeroMin += buf.getInt(4) + (buf.getInt(4)+1)/2;

				bgzf.seek(cpVirt | up);
				bgzf.read(buf.array(), 0, 4);

				if (buf.getInt(0) < zeroMin) {
					up = nextUP;
					continue;
				}
				return up;
			}
		} catch (IOException e) {}
		return -1;
	}
	private int getUShort(final int idx) {
		return (int)buf.getShort(idx) & 0xffff;
	}

	public static void main(String[] args) throws IOException {
		final GenericOptionsParser parser;
		try {
			parser = new GenericOptionsParser(args);

		// This should be IOException but Hadoop 0.20.2 doesn't throw it...
		} catch (Exception e) {
			System.err.printf("Error in Hadoop arguments: %s\n", e.getMessage());
			System.exit(1);

			// Hooray for javac
			return;
		}

		args = parser.getRemainingArgs();
		//final Configuration conf = ContextUtil.getConfiguration(parser);
                final Configuration conf = parser.getConfiguration();

		long beg = 0;

		if (args.length < 2 || args.length > 3) {
			System.err.println(
				"Usage: BAMSplitGuesser path-or-uri header-path-or-uri [beg]");
			System.exit(2);
		}

		try {
			if (args.length > 2) beg = Long.decode(args[2]);
		} catch (NumberFormatException e) {
			System.err.println("Invalid beg offset.");
			if (e.getMessage() != null)
				System.err.println(e.getMessage());
			System.exit(2);
		}

		SeekableStream ss = WrapSeekable.openPath(conf, new Path(args[0]));
		SeekableStream hs = WrapSeekable.openPath(conf, new Path(args[1]));

		final long end = beg + MAX_BYTES_READ;

		System.out.printf(
			"Will look for a BGZF block within: [%1$#x,%2$#x) = [%1$d,%2$d)\n"+
			"Will then verify BAM data within:  [%1$#x,%3$#x) = [%1$d,%3$d)\n",
			beg, beg + 0xffff, end);

		final long g =
			new BAMSplitGuesser(ss, hs, conf).guessNextBAMRecordStart(beg, end);

		ss.close();

		if (g == end) {
			System.out.println(
				"Didn't find any acceptable BAM record in any BGZF block.");
			System.exit(1);
		}

		System.out.printf(
			"Accepted BGZF block at offset %1$#x (%1$d).\n"+
			"Accepted BAM record at offset %2$#x (%2$d) therein.\n",
			g >> 16, g & 0xffff);
	}
}