org.seqdoop.hadoop_bam.BAMSplitGuesser Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of hadoop-bam Show documentation
Show all versions of hadoop-bam Show documentation
A Java library for the manipulation of files in common bioinformatics formats using the Hadoop MapReduce framework.
// Copyright (c) 2011 Aalto University
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
// sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.
// File created: 2011-01-17 15:17:59
package org.seqdoop.hadoop_bam;
import htsjdk.samtools.BAMFileSpan;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMFileSpan;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMRecordHelper;
import htsjdk.samtools.SamInputResource;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.seekablestream.ByteArraySeekableStream;
import java.io.InputStream;
import java.io.IOException;
import java.util.Arrays;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.util.GenericOptionsParser;
import htsjdk.samtools.BAMRecordCodec;
import htsjdk.samtools.FileTruncatedException;
import htsjdk.samtools.SAMFormatException;
import htsjdk.samtools.seekablestream.SeekableStream;
import htsjdk.samtools.util.BlockCompressedInputStream;
import htsjdk.samtools.util.RuntimeEOFException;
import htsjdk.samtools.util.RuntimeIOException;
import org.seqdoop.hadoop_bam.util.SAMHeaderReader;
import org.seqdoop.hadoop_bam.util.WrapSeekable;
/** A class for heuristically finding BAM record positions inside an area of
* a BAM file.
*/
public class BAMSplitGuesser extends BaseSplitGuesser {
private SeekableStream inFile;
private BlockCompressedInputStream bgzf;
private final BAMRecordCodec bamCodec;
private final int referenceSequenceCount;
private final SAMFileHeader header;
// We want to go through this many BGZF blocks fully, checking that they
// contain valid BAM records, when guessing a BAM record position.
private final static byte BLOCKS_NEEDED_FOR_GUESS = 3;
// Since the max size of a BGZF block is 0xffff (64K), and we might be just
// one byte off from the start of the previous one, we need 0xfffe bytes for
// the start, and then 0xffff times the number of blocks we want to go
// through.
private final static int MAX_BYTES_READ =
BLOCKS_NEEDED_FOR_GUESS * 0xffff + 0xfffe;
private final static int SHORTEST_POSSIBLE_BAM_RECORD = 4*9 + 1 + 1 + 1;
/** The stream must point to a valid BAM file, because the header is read
* from it.
*/
public BAMSplitGuesser(
SeekableStream ss, Configuration conf)
throws IOException
{
this(ss, ss, conf);
// Secondary check that the header points to a BAM file: Picard can get
// things wrong due to its autodetection.
ss.seek(0);
if (ss.read(buf.array(), 0, 4) != 4 || buf.getInt(0) != BGZF_MAGIC)
throw new SAMFormatException("Does not seem like a BAM file");
}
public BAMSplitGuesser(
SeekableStream ss, InputStream headerStream, Configuration conf)
throws IOException
{
inFile = ss;
header = SAMHeaderReader.readSAMHeaderFrom(headerStream, conf);
referenceSequenceCount = header.getSequenceDictionary().size();
bamCodec = new BAMRecordCodec(null, new LazyBAMRecordFactory());
}
/** Finds a virtual BAM record position in the physical position range
* [beg,end). Returns end if no BAM record was found.
*/
public long guessNextBAMRecordStart(long beg, long end)
throws IOException
{
// Use a reader to skip through the headers at the beginning of a BAM file, since
// the headers may exceed MAX_BYTES_READ in length. Don't close the reader
// otherwise it will close the underlying stream, which we continue to read from
// on subsequent calls to this method.
if (beg == 0) {
this.inFile.seek(beg);
SamReader open = SamReaderFactory.makeDefault().setUseAsyncIo(false)
.open(SamInputResource.of(inFile));
SAMFileSpan span = open.indexing().getFilePointerSpanningReads();
if (span instanceof BAMFileSpan) {
return ((BAMFileSpan) span).getFirstOffset();
}
}
// Buffer what we need to go through.
byte[] arr = new byte[MAX_BYTES_READ];
this.inFile.seek(beg);
int totalRead = 0;
for (int left = Math.min((int)(end - beg), arr.length); left > 0;) {
final int r = inFile.read(arr, totalRead, left);
if (r < 0)
break;
totalRead += r;
left -= r;
}
arr = Arrays.copyOf(arr, totalRead);
this.in = new ByteArraySeekableStream(arr);
this.bgzf = new BlockCompressedInputStream(this.in);
this.bgzf.setCheckCrcs(true);
this.bamCodec.setInputStream(bgzf);
final int firstBGZFEnd = Math.min((int)(end - beg), 0xffff);
// cp: Compressed Position, indexes the entire BGZF input.
for (int cp = 0;; ++cp) {
final PosSize psz = guessNextBGZFPos(cp, firstBGZFEnd);
if (psz == null)
return end;
final int cp0 = cp = psz.pos;
final long cp0Virt = (long)cp0 << 16;
try {
bgzf.seek(cp0Virt);
// This has to catch Throwable, because it's possible to get an
// OutOfMemoryError due to an overly large size.
} catch (Throwable e) {
// Guessed BGZF position incorrectly: try the next guess.
continue;
}
// up: Uncompressed Position, indexes the data inside the BGZF block.
for (int up = 0;; ++up) {
final int up0 = up = guessNextBAMPos(cp0Virt, up, psz.size);
if (up0 < 0) {
// No BAM records found in the BGZF block: try the next BGZF
// block.
break;
}
// Verify that we can actually decode BLOCKS_NEEDED_FOR_GUESS worth
// of records starting at (cp0,up0).
bgzf.seek(cp0Virt | up0);
boolean decodedAny = false;
try {
byte b = 0;
int prevCP = cp0;
while (b < BLOCKS_NEEDED_FOR_GUESS)
{
SAMRecord record = bamCodec.decode();
if (record == null) {
break;
}
record.setHeaderStrict(header);
SAMRecordHelper.eagerDecode(record); // force decoding of fields
decodedAny = true;
final int cp2 = (int)(bgzf.getFilePointer() >>> 16);
if (cp2 != prevCP) {
// The compressed position changed so we must be in a new
// block.
assert cp2 > prevCP;
prevCP = cp2;
++b;
}
}
// Running out of records to verify is fine as long as we
// verified at least something. It should only happen if we
// couldn't fill the array.
if (b < BLOCKS_NEEDED_FOR_GUESS) {
assert arr.length < MAX_BYTES_READ;
if (!decodedAny)
continue;
}
}
catch (SAMFormatException e) { continue; }
catch (OutOfMemoryError e) { continue; }
catch (IllegalArgumentException e) { continue; }
catch (IndexOutOfBoundsException e) { continue; }
catch (RuntimeIOException e) { continue; }
// EOF can happen legitimately if the [beg,end) range is too
// small to accommodate BLOCKS_NEEDED_FOR_GUESS and we get cut
// off in the middle of a record. In that case, our stream
// should have hit EOF as well. If we've then verified at least
// something, go ahead with it and hope for the best.
catch (FileTruncatedException e) {
if (!decodedAny && this.in.eof())
continue;
}
catch (RuntimeEOFException e) {
if (!decodedAny && this.in.eof())
continue;
}
return beg+cp0 << 16 | up0;
}
}
}
private int guessNextBAMPos(long cpVirt, int up, int cSize) {
// What we're actually searching for is what's at offset [4], not [0]. So
// skip ahead by 4, thus ensuring that whenever we find a valid [0] it's
// at position up or greater.
up += 4;
try {
while (up + SHORTEST_POSSIBLE_BAM_RECORD - 4 < cSize) {
bgzf.seek(cpVirt | up);
IOUtils.readFully(bgzf, buf.array(), 0, 8);
// If the first two checks fail we have what looks like a valid
// reference sequence ID. Assume we're at offset [4] or [24], i.e.
// the ID of either this read or its mate, respectively. So check
// the next integer ([8] or [28]) to make sure it's a 0-based
// leftmost coordinate.
final int id = buf.getInt(0);
final int pos = buf.getInt(4);
if (id < -1 || id > referenceSequenceCount || pos < -1) {
++up;
continue;
}
// Okay, we could be at [4] or [24]. Assuming we're at [4], check
// that [24] is valid. Assume [4] because we should hit it first:
// the only time we expect to hit [24] is at the beginning of the
// split, as part of the first read we should skip.
bgzf.seek(cpVirt | up+20);
IOUtils.readFully(bgzf, buf.array(), 0, 8);
final int nid = buf.getInt(0);
final int npos = buf.getInt(4);
if (nid < -1 || nid > referenceSequenceCount || npos < -1) {
++up;
continue;
}
// So far so good: [4] and [24] seem okay. Now do something a bit
// more involved: make sure that [36 + [12]&0xff - 1] == 0: that
// is, the name of the read should be null terminated.
// Move up to 0 just to make it less likely that we get confused
// with offsets. Remember where we should continue from if we
// reject this up.
final int nextUP = up + 1;
up -= 4;
bgzf.seek(cpVirt | up+12);
IOUtils.readFully(bgzf, buf.array(), 0, 4);
final int nameLength = buf.getInt(0) & 0xff;
if (nameLength < 1) {
// Names are null-terminated so length must be at least one
up = nextUP;
continue;
}
final int nullTerminator = up + 36 + nameLength-1;
if (nullTerminator >= cSize) {
// This BAM record can't fit here. But maybe there's another in
// the remaining space, so try again.
up = nextUP;
continue;
}
bgzf.seek(cpVirt | nullTerminator);
IOUtils.readFully(bgzf, buf.array(), 0, 1);
if (buf.get(0) != 0) {
up = nextUP;
continue;
}
// All of [4], [24], and [36 + [12]&0xff] look good. If [0] is also
// sensible, that's good enough for us. "Sensible" to us means the
// following:
//
// [0] >= 4*([16]&0xffff) + [20] + ([20]+1)/2 + 4*8 + ([12]&0xff)
// Note that [0] is "length of the _remainder_ of the alignment
// record", which is why this uses 4*8 instead of 4*9.
int zeroMin = 4*8 + nameLength;
bgzf.seek(cpVirt | up+16);
IOUtils.readFully(bgzf, buf.array(), 0, 8);
zeroMin += (buf.getInt(0) & 0xffff) * 4;
zeroMin += buf.getInt(4) + (buf.getInt(4)+1)/2;
bgzf.seek(cpVirt | up);
IOUtils.readFully(bgzf, buf.array(), 0, 4);
if (buf.getInt(0) < zeroMin) {
up = nextUP;
continue;
}
return up;
}
} catch (IOException e) {}
return -1;
}
public static void main(String[] args) throws IOException {
final GenericOptionsParser parser;
try {
parser = new GenericOptionsParser(args);
// This should be IOException but Hadoop 0.20.2 doesn't throw it...
} catch (Exception e) {
System.err.printf("Error in Hadoop arguments: %s\n", e.getMessage());
System.exit(1);
// Hooray for javac
return;
}
args = parser.getRemainingArgs();
final Configuration conf = parser.getConfiguration();
long beg = 0;
if (args.length < 2 || args.length > 3) {
System.err.println(
"Usage: BAMSplitGuesser path-or-uri header-path-or-uri [beg]");
System.exit(2);
}
try {
if (args.length > 2) beg = Long.decode(args[2]);
} catch (NumberFormatException e) {
System.err.println("Invalid beg offset.");
if (e.getMessage() != null)
System.err.println(e.getMessage());
System.exit(2);
}
SeekableStream ss = WrapSeekable.openPath(conf, new Path(args[0]));
SeekableStream hs = WrapSeekable.openPath(conf, new Path(args[1]));
final long end = beg + MAX_BYTES_READ;
System.out.printf(
"Will look for a BGZF block within: [%1$#x,%2$#x) = [%1$d,%2$d)\n"+
"Will then verify BAM data within: [%1$#x,%3$#x) = [%1$d,%3$d)\n",
beg, beg + 0xffff, end);
final long g =
new BAMSplitGuesser(ss, hs, conf).guessNextBAMRecordStart(beg, end);
ss.close();
if (g == end) {
System.out.println(
"Didn't find any acceptable BAM record in any BGZF block.");
System.exit(1);
}
System.out.printf(
"Accepted BGZF block at offset %1$#x (%1$d).\n"+
"Accepted BAM record at offset %2$#x (%2$d) therein.\n",
g >> 16, g & 0xffff);
}
}