org.seqdoop.hadoop_bam.BCFSplitGuesser Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of hadoop-bam Show documentation
Show all versions of hadoop-bam Show documentation
A Java library for the manipulation of files in common bioinformatics formats using the Hadoop MapReduce framework.
// Copyright (c) 2013 Aalto University
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
// sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.
// File created: 2013-06-28 13:49:57
package org.seqdoop.hadoop_bam;
import htsjdk.samtools.seekablestream.ByteArraySeekableStream;
import java.io.BufferedInputStream;
import java.io.InputStream;
import java.io.IOException;
import java.util.Arrays;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.util.GenericOptionsParser;
import htsjdk.samtools.FileTruncatedException;
import htsjdk.samtools.seekablestream.SeekableStream;
import htsjdk.samtools.util.BlockCompressedInputStream;
import htsjdk.samtools.util.RuntimeEOFException;
import htsjdk.tribble.TribbleException;
import htsjdk.tribble.readers.PositionalBufferedStream;
import htsjdk.variant.bcf2.BCF2Codec;
import htsjdk.variant.vcf.VCFHeader;
import org.seqdoop.hadoop_bam.util.WrapSeekable;
/** A class for heuristically finding BCF record positions inside an area of
* a BCF file. Handles both compressed and uncompressed BCF.
*/
public class BCFSplitGuesser extends BaseSplitGuesser {
// cin is the compressed input: a BlockCompressedInputStream for compressed
// BCF, otherwise equal to in. Unfortunately the closest common type is then
// InputStream, which is why we have the cinSeek() method.
private InputStream cin;
private SeekableStream inFile;
private final boolean bgzf;
private final BCF2Codec bcfCodec = new BCF2Codec();
private final int contigDictionaryLength, genotypeSampleCount;
// The amount of data we verify for uncompressed BCF.
private final static int UNCOMPRESSED_BYTES_NEEDED = 0x80000;
// We want to go through this many BGZF blocks fully, checking that they
// contain valid BCF records, when guessing a BCF record position.
private final static byte BGZF_BLOCKS_NEEDED_FOR_GUESS = 2;
// Since the max size of a BGZF block is 0xffff (64K), and we might be just
// one byte off from the start of the previous one, we need 0xfffe bytes for
// the start, and then 0xffff times the number of blocks we want to go
// through.
private final static int BGZF_MAX_BYTES_READ =
BGZF_BLOCKS_NEEDED_FOR_GUESS * 0xffff + 0xfffe;
// This is probably too conservative.
private final static int SHORTEST_POSSIBLE_BCF_RECORD = 4*8 + 1;
/** The stream must point to a valid BCF file, because the header is read
* from it.
*/
public BCFSplitGuesser(SeekableStream ss) throws IOException {
this(ss, ss);
}
public BCFSplitGuesser(SeekableStream ss, InputStream headerStream)
throws IOException
{
inFile = ss;
InputStream bInFile = new BufferedInputStream(inFile);
bgzf = BlockCompressedInputStream.isValidFile(bInFile);
if (bgzf)
bInFile = new BlockCompressedInputStream(bInFile);
// Excess buffering here but it can't be helped that BCF2Codec only takes
// PositionalBufferedStream.
final VCFHeader header =
(VCFHeader)bcfCodec.readHeader(
new PositionalBufferedStream(bInFile)).getHeaderValue();
contigDictionaryLength = header.getContigLines().size();
genotypeSampleCount = header.getNGenotypeSamples();
}
public boolean isBGZF() { return bgzf; }
private void cinSeek(long virt) throws IOException {
if (bgzf)
((BlockCompressedInputStream)cin).seek(virt);
else
((SeekableStream)cin).seek(virt);
}
/** Finds a (virtual in the case of BGZF) BCF record position in the
* physical position range [beg,end). Returns end if no BCF record was
* found.
*/
public long guessNextBCFRecordStart(long beg, long end)
throws IOException
{
// Buffer what we need to go through.
byte[] arr = new byte[
bgzf ? BGZF_MAX_BYTES_READ : UNCOMPRESSED_BYTES_NEEDED];
this.inFile.seek(beg);
int totalRead = 0;
for (int left = Math.min((int)(end - beg), arr.length); left > 0;) {
final int r = inFile.read(arr, totalRead, left);
if (r < 0)
break;
totalRead += r;
left -= r;
}
arr = Arrays.copyOf(arr, totalRead);
this.in = new ByteArraySeekableStream(arr);
final int firstBGZFEnd;
if (this.bgzf) {
firstBGZFEnd = Math.min((int)(end - beg), 0xffff);
BlockCompressedInputStream bgzfStream =
new BlockCompressedInputStream(this.in);
bgzfStream.setCheckCrcs(true);
this.cin = bgzfStream;
} else {
this.cin = this.in;
firstBGZFEnd = 0; // Actually unused
}
// cp: Compressed Position, indexes the entire BGZF input. If
// we have uncompressed BCF, this loop does nothing.
for (int cp = 0;; ++cp) {
final int cp0;
final long cp0Virt;
final int blockLen;
if (this.bgzf) {
final PosSize psz = guessNextBGZFPos(cp, firstBGZFEnd);
if (psz == null)
break;
cp0 = cp = psz.pos;
cp0Virt = (long)cp0 << 16;
try {
cinSeek(cp0Virt);
// This has to catch Throwable, because it's possible to get an
// OutOfMemoryError due to an overly large size.
} catch (Throwable e) {
// Guessed BGZF position incorrectly: try the next guess.
continue;
}
blockLen = psz.size;
} else {
cp0 = 0; // Actually unused
cp0Virt = 0;
blockLen = Math.max(arr.length, UNCOMPRESSED_BYTES_NEEDED);
}
// up: Uncompressed Position, indexes the data inside the BGZF block.
for (int up = 0;; ++up) {
final int up0 = up = guessNextBCFPos(cp0Virt, up, blockLen);
if (up0 < 0) {
// No BCF records found in the BGZF block: try the next BGZF
// block.
break;
}
// Verification time.
cinSeek(cp0Virt | up0);
final PositionalBufferedStream pbIn =
new PositionalBufferedStream(cin);
boolean decodedAny = false;
try {
if (bgzf) {
byte b = 0;
int prevCP = cp0;
while (b < BGZF_BLOCKS_NEEDED_FOR_GUESS && pbIn.peek() != -1)
{
bcfCodec.decode(pbIn);
decodedAny = true;
final int cp2 = (int)
(((BlockCompressedInputStream)cin).getFilePointer()
>>> 16);
if (cp2 != prevCP) {
// The compressed position changed so we must be in a
// new block.
assert cp2 > prevCP;
cp = cp2;
++b;
}
}
// Running out of records to verify is fine as long as we
// verified at least something. It should only happen if we
// couldn't fill the array.
if (b < BGZF_BLOCKS_NEEDED_FOR_GUESS) {
assert arr.length < BGZF_MAX_BYTES_READ;
if (!decodedAny)
continue;
}
} else {
while (pbIn.getPosition() - up0 < UNCOMPRESSED_BYTES_NEEDED
&& pbIn.peek() != -1)
{
bcfCodec.decode(pbIn);
decodedAny = true;
}
// As in the BGZF case.
if (pbIn.getPosition() - up0 < UNCOMPRESSED_BYTES_NEEDED) {
assert arr.length < UNCOMPRESSED_BYTES_NEEDED;
if (!decodedAny)
continue;
}
}
} catch (FileTruncatedException e) { continue; }
catch (OutOfMemoryError e) { continue; }
catch (RuntimeEOFException e) { continue; }
catch (TribbleException e) {
// This is the way in which BCF2Codec reports unexpected EOF.
// Unfortunately, it also reports every other kind of error with
// the same exception. It even wraps IOException in
// TribbleException!
//
// We need to catch EOF in the middle of a record, which can
// happen legitimately if the [beg,end) range is too small and
// cuts off a record. First, require decodedAny, and then, assume
// that this exception means EOF if the stream has hit EOF.
if (!(decodedAny && pbIn.peek() == -1))
continue;
}
return this.bgzf ? beg+cp0 << 16 | up0 : beg + up0;
}
if (!this.bgzf)
break;
}
return end;
}
private int guessNextBCFPos(long cpVirt, int up, int cSize) {
try {
for (; up + SHORTEST_POSSIBLE_BCF_RECORD < cSize; ++up) {
// Note! The BCF2 spec has a table listing the fields and their
// types, but QUAL is misplaced there! It should be before
// n_allele_info, not after n_fmt_sample! The "Putting it all
// together" section shows the correct field order.
// Check that [0] and [4] are big enough to make sense.
cinSeek(cpVirt | up);
IOUtils.readFully(cin, buf.array(), 0, 8);
final long sharedLen = getUInt(0);
final long indivLen = getUInt(4);
if (sharedLen + indivLen < (long)SHORTEST_POSSIBLE_BCF_RECORD)
continue;
// Check that [8] looks like a valid CHROM field and that [12] is a
// 0-based leftmost coordinate.
cinSeek(cpVirt | up+8);
IOUtils.readFully(cin, buf.array(), 0, 8);
final int chrom = buf.getInt(0);
final int pos = buf.getInt(4);
if (chrom < 0 || chrom >= contigDictionaryLength || pos < 0)
continue;
// [24] and [26] are lengths and should thus be nonnegative.
cinSeek(cpVirt | up+24);
IOUtils.readFully(cin, buf.array(), 0, 4);
final int alleleInfo = buf.getInt(0);
final int alleleCount = alleleInfo >> 16;
final int infoCount = alleleInfo & 0xffff;
if (alleleCount < 0) // don't check infoCount since it is always nonnegative
continue;
// Make sure that [28] matches to the same value in the header.
cinSeek(cpVirt | up+28);
IOUtils.readFully(cin, buf.array(), 0, 1);
final short nSamples = getUByte(0);
if ((int)nSamples != genotypeSampleCount)
continue;
// Check that the ID string has a sensible type encoding. That is,
// it should claim to be a character string: [32] & 0x0f == 0x07.
// Further, if it has length 15 or more, i.e. [32] & 0xf0 == 0xf0,
// then it should be followed by an integer, i.e. [33] & 0x0f
// should be in the range [1, 3], and the value of that integer
// should be in the range [15, [0] - x) where x is the guaranteed
// number of bytes in the first part of this record (before the
// genotype block).
cinSeek(cpVirt | up+32);
IOUtils.readFully(cin, buf.array(), 0, 6);
final byte idType = buf.get(0);
if ((idType & 0x0f) != 0x07)
continue;
if ((idType & 0xf0) == 0xf0) {
final byte idLenType = buf.get(1);
final long idLen;
switch (idLenType & 0x0f) {
case 0x01: idLen = getUByte (2); break;
case 0x02: idLen = getUShort(2); break;
case 0x03: idLen = getUInt (2); break;
default: continue;
}
if (idLen < 15
|| idLen > sharedLen - (4*8 + alleleCount + infoCount*2))
continue;
}
// Good enough.
return up;
}
} catch (IOException e) {
// fall through
}
return -1;
}
private long getUInt(final int idx) {
return (long)buf.getInt(idx) & 0xffffffff;
}
private short getUByte(final int idx) {
return (short)((short)buf.get(idx) & 0xff);
}
public static void main(String[] args) throws IOException {
final GenericOptionsParser parser;
try {
parser = new GenericOptionsParser(args);
// This should be IOException but Hadoop 0.20.2 doesn't throw it...
} catch (Exception e) {
System.err.printf("Error in Hadoop arguments: %s\n", e.getMessage());
System.exit(1);
return;
}
args = parser.getRemainingArgs();
final Configuration conf = parser.getConfiguration();
long beg = 0;
if (args.length < 2 || args.length > 3) {
System.err.println(
"Usage: BCFSplitGuesser path-or-uri header-path-or-uri [beg]");
System.exit(2);
}
try {
if (args.length > 2) beg = Long.decode(args[2]);
} catch (NumberFormatException e) {
System.err.println("Invalid beg offset.");
if (e.getMessage() != null)
System.err.println(e.getMessage());
System.exit(2);
}
SeekableStream ss = WrapSeekable.openPath(conf, new Path(args[0]));
SeekableStream hs = WrapSeekable.openPath(conf, new Path(args[1]));
final BCFSplitGuesser guesser = new BCFSplitGuesser(ss, hs);
final long end;
if (guesser.isBGZF()) {
end = beg + BGZF_MAX_BYTES_READ;
System.out.printf(
"This looks like a BGZF-compressed BCF file.\n"+
"Will look for a BGZF block within: [%1$#x,%2$#x) = [%1$d,%2$d)\n"+
"Will then verify BCF data within: [%1$#x,%3$#x) = [%1$d,%3$d)\n",
beg, beg + 0xffff, end);
} else {
end = beg + UNCOMPRESSED_BYTES_NEEDED;
System.out.printf(
"This looks like an uncompressed BCF file.\n"+
"Will look for a BCF record within: [%1$#x,%2$#x) = [%1$d,%2$d)\n"+
"And then will verify all following data in that range.\n",
beg, end);
}
final long g = guesser.guessNextBCFRecordStart(beg, end);
ss.close();
if (g == end) {
System.out.println(
"Didn't find any acceptable BCF record in any BGZF block.");
System.exit(1);
}
if (guesser.isBGZF())
System.out.printf(
"Accepted BGZF block at offset %1$#x (%1$d).\n"+
"Accepted BCF record at offset %2$#x (%2$d) therein.\n",
g >> 16, g & 0xffff);
else
System.out.printf("Accepted BCF record at offset %1$#x (%1$d).\n", g);
}
}