
org.seqdoop.hadoop_bam.util.BGZFSplitGuesser Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of hadoop-bam Show documentation
Show all versions of hadoop-bam Show documentation
A Java library for the manipulation of files in common bioinformatics formats using the Hadoop MapReduce framework.
// Copyright (c) 2011 Aalto University
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
// sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.
// File created: 2011-05-31 11:40:06
package org.seqdoop.hadoop_bam.util;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.Arrays;
import htsjdk.samtools.util.BlockCompressedInputStream;
import org.apache.hadoop.fs.FSDataInputStream;
/**
 * Guesses the offset of the next BGZF block inside a byte range of a file.
 *
 * <p>A window of the file is buffered in memory, scanned for byte patterns
 * that look like a BGZF block header, and each candidate is then verified by
 * asking htsjdk's {@link BlockCompressedInputStream} to seek to it with CRC
 * checking enabled.
 *
 * <p>Instances keep mutable scratch state ({@code in}, {@code buf}) and use the
 * supplied stream's position, so they are not written for concurrent use.
 */
public class BGZFSplitGuesser {
    // The file being examined; positioned with seek() on every guess.
    private final FSDataInputStream inFile;

    // In-memory view of the window currently being scanned.
    private SeekableArrayStream in;

    // Little-endian scratch buffer used to decode header fields.
    private final ByteBuffer buf;

    // The bytes 1f 8b 08 04 read as a little-endian int: the first four bytes
    // of a gzip member header that has an extra field.
    private final static int BGZF_MAGIC = 0x04088b1f;

    // The bytes 42 43 02 00 ('B', 'C', then a 16-bit length of 2) read as a
    // little-endian int: the header of the BGZF extra subfield.
    private final static int BGZF_MAGIC_SUB = 0x00024342;

    // Subfield header (4 bytes) plus its 2-byte payload. Not referenced
    // elsewhere in this class.
    private final static int BGZF_SUB_SIZE = 4 + 2;

    /**
     * Creates a guesser reading from the given stream.
     *
     * @param is the stream to search; it is seeked and read by
     *           {@link #guessNextBGZFBlockStart(long, long)} and never closed
     *           by this class
     */
    public BGZFSplitGuesser(FSDataInputStream is) {
        inFile = is;
        buf = ByteBuffer.allocate(8);
        buf.order(ByteOrder.LITTLE_ENDIAN);
    }

    /**
     * Looks in the range [beg,end) for the start of a BGZF block.
     *
     * @param beg file offset at which to start looking
     * @param end file offset at which to stop looking
     * @return the file offset of the first candidate that decompresses
     *         successfully, or {@code end} if no BGZF block was found
     * @throws IOException if seeking or reading the underlying file fails
     */
    public long guessNextBGZFBlockStart(long beg, long end)
        throws IOException
    {
        // Buffer what we need to go through. Since the max size of a BGZF block
        // is 0xffff (64K), and we might be just one byte off from the start of
        // the previous one, we need 0xfffe bytes for the start, and then 0xffff
        // for the block we're looking for.
        byte[] arr = new byte[2*0xffff - 1];

        this.inFile.seek(beg);

        // Clamp while still in long arithmetic: narrowing (end - beg) to int
        // first would wrap for ranges of 2 GiB or more.
        final long span = Math.max(0L, end - beg);
        final int wanted = (int)Math.min(span, (long)arr.length);

        // A single read() may legally return fewer bytes than requested, or a
        // negative count at end of file, so keep reading until the window is
        // full or the stream stops producing data.
        int filled = 0;
        while (filled < wanted) {
            final int r = inFile.read(arr, filled, wanted - filled);
            if (r <= 0)
                break;
            filled += r;
        }
        arr = Arrays.copyOf(arr, filled);

        this.in = new SeekableArrayStream(arr);

        final BlockCompressedInputStream bgzf =
            new BlockCompressedInputStream(this.in);
        bgzf.setCheckCrcs(true);

        final int firstBGZFEnd = (int)Math.min(span, 0xffffL);

        for (int pos = 0;;) {
            pos = guessNextBGZFPos(pos, firstBGZFEnd);
            if (pos < 0)
                return end;

            try {
                // Seek in order to trigger decompression of the block and a CRC
                // check. The shift places pos in the block-address part of an
                // htsjdk virtual file pointer, above the 16-bit in-block
                // offset.
                bgzf.seek((long)pos << 16);

            // This has to catch Throwable, because it's possible to get an
            // OutOfMemoryError due to an overly large size.
            } catch (Throwable e) {
                // Guessed BGZF position incorrectly: try the next guess.
                ++pos;
                continue;
            }
            return beg + pos;
        }
    }

    /**
     * Scans the buffered window, starting at {@code p}, for something that
     * looks like a BGZF block header.
     *
     * @param p   offset in the window at which to start scanning
     * @param end offset at which the scan for a header start gives up
     * @return the offset of the candidate header, or a negative number if
     *         nothing was found or the window ran out of data
     * @throws IOException if the in-memory stream reports an error
     */
    private int guessNextBGZFPos(int p, int end)
        throws IOException
    {
        for (;;) {
            for (;;) {
                in.seek(p);
                // Fewer than four bytes left: no header can start here or
                // later, and decoding stale buffer contents would be wrong.
                if (!fillBuf(4))
                    return -1;

                int n = buf.getInt(0);

                if (n == BGZF_MAGIC)
                    break;

                // Skip ahead a bit more than 1 byte if you can.
                if (n >>> 8 == BGZF_MAGIC << 8 >>> 8)
                    ++p;
                else if (n >>> 16 == BGZF_MAGIC << 16 >>> 16)
                    p += 2;
                else
                    p += 3;

                if (p >= end)
                    return -1;
            }
            // Found what looks like a gzip block header: now get XLEN and
            // search for the BGZF subfield.
            final int p0 = p;

            if (hasBGZFSubfield(p0)) {
                // Found it: this is close enough to a BGZF block, make it
                // our guess.
                return p0;
            }
            // No luck: look for the next gzip block header. Start right after
            // where we last saw the identifiers, although we could probably
            // safely skip further ahead. (If we find the correct one right
            // now, the previous block contained 0x1f8b0804 bytes of data: that
            // seems... unlikely.)
            p = p0 + 4;
        }
    }

    // Walks the extra-field subfields of the gzip header at headerPos and
    // reports whether one of them carries the BGZF subfield header. A header
    // that is cut off by the end of the window counts as "not found".
    private boolean hasBGZFSubfield(final int headerPos)
        throws IOException
    {
        // XLEN is the 16-bit value 10 bytes into the header.
        int p = headerPos + 10;
        in.seek(p);
        if (!fillBuf(2))
            return false;
        p += 2;

        final int subEnd = p + getUShort(0);

        while (p < subEnd) {
            if (!fillBuf(4))
                return false;

            if (buf.getInt(0) == BGZF_MAGIC_SUB)
                return true;

            // Not ours: skip the 4-byte subfield header and its payload.
            p += 4 + getUShort(2);
            in.seek(p);
        }
        return false;
    }

    // Reads exactly len bytes from the current position of the window into the
    // start of buf. Returns false if the stream ends (or makes no progress)
    // before len bytes were read, in which case buf must not be decoded.
    private boolean fillBuf(final int len)
        throws IOException
    {
        for (int got = 0; got < len;) {
            final int r = in.read(buf.array(), got, len - got);
            if (r <= 0)
                return false;
            got += r;
        }
        return true;
    }

    // Decodes the unsigned 16-bit little-endian value at buf[idx].
    private int getUShort(final int idx) {
        return (int)buf.getShort(idx) & 0xffff;
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy