
org.broadinstitute.hellbender.utils.reference.TwoBitReference Maven / Gradle / Ivy
The newest version!
package org.broadinstitute.hellbender.utils.reference;
import com.google.common.io.ByteStreams;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.util.PeekableIterator;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.IntervalMergingRule;
import org.broadinstitute.hellbender.utils.IntervalUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.gcs.BucketUtils;
import java.io.IOException;
import java.io.Serializable;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.file.Files;
import java.util.*;
/**
* A class for reading a genomic reference stored in .2bit format. This format is defined here:
*
* http://genome.ucsc.edu/FAQ/FAQformat.html#format7
*
* Note that this class stores the entire .2bit reference in memory (in packed form) to facilitate
* Spark broadcasts.
*
* Supported public operations are:
* {@link #getSequenceDictionary}
* {@link #getReferenceBases(SimpleInterval)}
* {@link #close}
*
* The constructor gives the option to either preserve lowercase (masked) bases, or uppercase them.
* Ambiguity codes are not supported by the 2bit format.
*/
public class TwoBitReference implements Serializable, AutoCloseable {
private static final long serialVersionUID = 1L;
// Magic value at the start of a 2bit file
public static final int TWO_BIT_SIGNATURE = 0x1A412743;
// Version of the 2bit format we support
public static final int TWO_BIT_SUPPORTED_VERSION = 0;
// File extension for 2bit files
public static final String TWO_BIT_EXTENSION = ".2bit";
// Size of the 2bit header in bytes
public static final int HEADER_LENGTH_IN_BYTES = 16;
private final GATKPath referencePath;
// If true, preserve lowercase (masked) bases, otherwise uppercase all bases
private final boolean preserveCase;
// The complete in-memory contents of the 2bit file. Initialized as read-only in the constructor
private final ByteBuffer rawBytes;
// Byte order to use, as defined in the 2bit header
private ByteOrder byteOrder;
// The total number of sequences in the file
private int sequenceCount;
// LinkedHashMap because the order of the sequence records does matter
private LinkedHashMap sequenceRecords;
private SAMSequenceDictionary sequenceDictionary;
/**
* Creates a TwoBitReference that will uppercase all bases
*
* @param referencePath path to the 2bit reference
*/
public TwoBitReference( final GATKPath referencePath ) {
this(referencePath, false);
}
/**
* Creates a TwoBitReference, and specifies whether bases should be uppercased
*
* @param referencePath path to the 2bit reference
* @param preserveCase if true, preserve lowercase (masked) bases, otherwise uppercase all bases
*/
public TwoBitReference( final GATKPath referencePath, final boolean preserveCase ) {
Utils.nonNull(referencePath);
Utils.validateArg(referencePath.getURI().getPath().endsWith(TWO_BIT_EXTENSION), "Twobit reference must end with a " + TWO_BIT_EXTENSION + " extension");
// Since the Java NIO APIs we're relying on use signed integers for file positions, we're
// limited to supporting files up to MAX_INT (2^31 - 1) bytes in size, even though the 2bit
// format should theoretically allow for files up to 2^32 bytes (ie., MAX_UNSIGNED_INT).
try {
final long referenceSizeInBytes = Files.size(referencePath.toPath());
if ( referenceSizeInBytes > Integer.MAX_VALUE ) {
throw new UserException.CouldNotReadInputFile(referencePath,
"We currently only support 2bit references up to " + Integer.MAX_VALUE +
" bytes in size, but this file is " + referenceSizeInBytes + " bytes. " +
"This limitation may be removed in a future release.");
}
} catch ( IOException e ) {
throw new UserException.CouldNotReadInputFile(referencePath, "Error while checking the size of the 2bit reference file", e);
}
this.referencePath = referencePath;
this.preserveCase = preserveCase;
try {
// Load the raw contents of the 2bit file into a read-only in-memory ByteBuffer
rawBytes = ByteBuffer.wrap(ByteStreams.toByteArray(referencePath.getInputStream())).asReadOnlyBuffer();
} catch ( IOException e ) {
throw new UserException.CouldNotReadInputFile(referencePath, "Unable to load bytes from 2bit input file", e);
}
sequenceRecords = new LinkedHashMap<>();
readHeader();
readSequenceRecordIndexAndMetadata();
sequenceDictionary = new SAMSequenceDictionary();
for ( final TwoBitSequenceRecord twoBitSequenceRecord : sequenceRecords.values() ) {
sequenceDictionary.addSequence(new SAMSequenceRecord(twoBitSequenceRecord.getSequenceName(), twoBitSequenceRecord.getDNASize()));
}
}
/**
* Reads the 2bit file header, and sets byte order for the rest of the file
*
* The file begins with a 16-byte header containing the following fields:
*
* signature - the number 0x1A412743 in the architecture of the machine that created the file
* version - zero for now. Readers should abort if they see a version number higher than 0
* sequenceCount - the number of sequences in the file
* reserved - always zero for now
*/
private void readHeader() {
setByteOrder(ByteOrder.LITTLE_ENDIAN);
int signature = rawBytes.getInt();
if ( signature != TWO_BIT_SIGNATURE ) {
setByteOrder(ByteOrder.BIG_ENDIAN);
rawBytes.position(0);
signature = rawBytes.getInt();
if ( signature != TWO_BIT_SIGNATURE ) {
throw new UserException.MalformedFile(referencePath,
"File does not start with the required 2bit signature value " + TWO_BIT_SIGNATURE);
}
}
final int version = rawBytes.getInt();
if ( version != TWO_BIT_SUPPORTED_VERSION ) {
throw new UserException.CouldNotReadInputFile(referencePath,
"File is version " + version + ", but we only support version " + TWO_BIT_SUPPORTED_VERSION);
}
this.sequenceCount = rawBytes.getInt();
if ( sequenceCount < 0 ) {
throw new UserException.MalformedFile(referencePath,
"Negative sequence count in 2bit header: " + sequenceCount);
}
final int reservedValue = rawBytes.getInt();
if ( reservedValue != 0 ) {
throw new UserException.MalformedFile(referencePath,
"Reserved header value must be 0 according to the 2bit spec, but found " + reservedValue);
}
}
/**
* Reads and parses the sequence record index and metadata
*
* The header is followed by a file index, which contains one entry for each sequence. Each index entry contains three fields:
*
* nameSize - a byte containing the length of the name field
* name - the sequence name itself (in ASCII-compatible byte string), of variable length depending on nameSize
* offset - the 32-bit offset of the sequence data relative to the start of the file, not aligned to any 4-byte padding boundary
*
* The index is followed by the sequence records, which contain nine fields:
*
* dnaSize - number of bases of DNA in the sequence
* nBlockCount - the number of blocks of Ns in the file (representing unknown sequence)
* nBlockStarts - an array of length nBlockCount of 32 bit integers indicating the (0-based) starting position of a block of Ns
* nBlockSizes - an array of length nBlockCount of 32 bit integers indicating the length of a block of Ns
* maskBlockCount - the number of masked (lower-case) blocks
* maskBlockStarts - an array of length maskBlockCount of 32 bit integers indicating the (0-based) starting position of a masked block
* maskBlockSizes - an array of length maskBlockCount of 32 bit integers indicating the length of a masked block
* reserved - always zero for now
* packedDna - the DNA packed to two bits per base, represented as so: T - 00, C - 01, A - 10, G - 11. The first base is in the most significant 2-bit byte; the last base is in the least significant 2 bits. For example, the sequence TCAG is represented as 00011011.
*/
private void readSequenceRecordIndexAndMetadata() {
final ByteBuffer bytes = independentBufferView();
bytes.position(HEADER_LENGTH_IN_BYTES); // seek past the header
// First read the index, with the names of each sequence and start offsets for the sequence records
for ( int i = 0; i < sequenceCount; i++ ) {
final byte nameSize = rawBytes.get();
final byte[] nameBytes = new byte[nameSize];
rawBytes.get(nameBytes);
final String sequenceName = new String(nameBytes);
final int sequenceRecordStartOffset = rawBytes.getInt();
sequenceRecords.put(sequenceName, new TwoBitSequenceRecord(sequenceName, i, sequenceRecordStartOffset));
}
// Next read the metadata for each sequence (DNA size, N blocks, masked blocks, and start offset for the
// actual bases) from the sequence records
for ( final TwoBitSequenceRecord sequenceRecord : sequenceRecords.values() ) {
bytes.position(sequenceRecord.getSequenceRecordStartOffset());
sequenceRecord.setDNASize(bytes.getInt());
final int nBlockCount = bytes.getInt();
final int[] nBlockStarts = new int[nBlockCount];
final int[] nBlockSizes = new int[nBlockCount];
for ( int i = 0; i < nBlockCount; i++ ) {
nBlockStarts[i] = bytes.getInt();
}
for ( int i = 0; i < nBlockCount; i++ ) {
nBlockSizes[i] = bytes.getInt();
}
sequenceRecord.setNBlocks(convertTwoBitBlocksToSortedMergedIntervals(sequenceRecord.getSequenceName(), nBlockStarts, nBlockSizes));
final int maskBlockCount = bytes.getInt();
final int[] maskBlockStarts = new int[maskBlockCount];
final int[] maskBlockSizes = new int[maskBlockCount];
for ( int i = 0; i < maskBlockCount; i++ ) {
maskBlockStarts[i] = bytes.getInt();
}
for ( int i = 0; i < maskBlockCount; i++ ) {
maskBlockSizes[i] = bytes.getInt();
}
sequenceRecord.setMaskedBlocks(convertTwoBitBlocksToSortedMergedIntervals(sequenceRecord.getSequenceName(), maskBlockStarts, maskBlockSizes));
final int reservedValue = bytes.getInt();
if ( reservedValue != 0 ) {
throw new UserException.MalformedFile(referencePath,
"Reserved value for sequence " + sequenceRecord.getSequenceName() + " must be zero, but is non-zero");
}
sequenceRecord.setSequenceBasesStartOffset(bytes.position());
}
}
private List convertTwoBitBlocksToSortedMergedIntervals( final String contig, final int[] zeroBasedBlockStarts, final int[] blockSizes ) {
final List intervals = new ArrayList<>();
for ( int i = 0; i < zeroBasedBlockStarts.length; i++ ) {
final int oneBasedInclusiveStart = zeroBasedBlockStarts[i] + 1;
final int oneBasedInclusiveEnd = oneBasedInclusiveStart + blockSizes[i] - 1;
intervals.add(new SimpleInterval(contig, oneBasedInclusiveStart, oneBasedInclusiveEnd));
}
return IntervalUtils.sortAndMergeIntervalsFromSameContig(intervals, IntervalMergingRule.ALL);
}
private void setByteOrder( final ByteOrder byteOrder ) {
this.byteOrder = byteOrder;
rawBytes.order(byteOrder);
}
/**
* @return The sequence dictionary for this TwoBit reference
*/
public SAMSequenceDictionary getSequenceDictionary() {
return sequenceDictionary;
}
/**
* @param interval Genomic interval to query (must be completely on the reference)
* @return the reference bases covering the specified interval
*/
public ReferenceSequence getReferenceBases( final SimpleInterval interval ) {
final TwoBitSequenceRecord sequenceRecord = sequenceRecords.get(interval.getContig());
Utils.nonNull(sequenceRecord, "Sequence " + interval.getContig() + " not present in 2bit file " + referencePath);
if ( interval.getEnd() > sequenceRecord.getDNASize() ) {
throw new IllegalArgumentException("Query interval " + interval + " ends after the end of sequence " +
sequenceRecord.getSequenceName() + " (length " + sequenceRecord.getDNASize() + ")");
}
final byte[] sequenceBases = new byte[interval.getLengthOnReference()];
final TwoBitBaseDecoder decoder = new TwoBitBaseDecoder(independentBufferView(), sequenceRecord, interval, preserveCase);
int baseOffset = 0;
while ( decoder.hasNext() ) {
if ( baseOffset >= sequenceBases.length ) {
throw new IllegalStateException("2bit base decoder returned an unexpected number of bases");
}
sequenceBases[baseOffset] = decoder.next();
baseOffset++;
}
return new ReferenceSequence(sequenceRecord.getSequenceName(), sequenceRecord.getSequenceIndex(), sequenceBases);
}
/**
* @return An independent view of the ByteBuffer containing the 2bit file contents, with its own
* position, limit, and mark values. The actual bytes in the buffer are not copied.
*/
private ByteBuffer independentBufferView() {
final ByteBuffer newView = rawBytes.asReadOnlyBuffer(); // does not duplicate the content, just creates a new view of it
newView.order(byteOrder); // byte order is not preserved by asReadOnlyBuffer()
newView.clear(); // resets the position/limit/mark, does not erase the buffer contents
return newView;
}
@Override
public void close() {
// No-op for now, just here to allow seamless use in Autocloseable contexts.
// In a future version that does not store the entire reference in memory, we will need this.
}
/**
* An Iterator that decodes the raw TwoBit reference for a specified interval.
* Each interval requires a new Iterator instance.
*/
private static class TwoBitBaseDecoder implements Iterator, Iterable, Serializable {
private static final long serialVersionUID = 1L;
// A two-bit base encoding used as an index into this array will produce the corresponding base
private static final Byte[] twoBitEncodingToBase = { 'T', 'C', 'A', 'G' };
private final TwoBitSequenceRecord sequenceRecord;
private final ByteBuffer rawBytes;
private final SimpleInterval intervalToDecode;
private final boolean preserveCase;
// 1-based inclusive genomic position within our sequence's contig
private int currentOneBasedGenomicPosition;
// Raw value of the current packed byte we're positioned on
private byte currentByte;
// 0-based bit offset from the most significant bit in the current byte, marking the start of the next 2bit encoding
// Can be 0, 2, 4, or 6
private int currentBitOffset;
// The base that will be returned by the next call to next(), or null if the iteration is complete
private Byte nextBase;
private PeekableIterator nBlockIterator;
private PeekableIterator maskedBlockIterator;
/**
* @param rawBytes Raw byte buffer containing the complete TwoBit data. Should be an independent view of the
* buffer not shared with any other concurrent code.
* @param sequenceRecord TwoBitSequenceRecord for the contig we're accessing
* @param intervalToDecode Query interval to decode. Must match the TwoBitSequenceRecord's contig.
* @param preserveCase If true, preserve lowercase (masked) bases, otherwise convert all bases to uppercase.
*/
public TwoBitBaseDecoder( final ByteBuffer rawBytes, final TwoBitSequenceRecord sequenceRecord, final SimpleInterval intervalToDecode, final boolean preserveCase ) {
Utils.validate(intervalToDecode.getEnd() <= sequenceRecord.getDNASize(), "Interval " + intervalToDecode + " ends past the end of contig " + sequenceRecord.getSequenceName());
Utils.validate(intervalToDecode.getContig().equals(sequenceRecord.getSequenceName()), "Interval " + intervalToDecode + " is on the wrong contig for sequence " + sequenceRecord.getSequenceName());
this.sequenceRecord = sequenceRecord;
this.intervalToDecode = intervalToDecode;
this.preserveCase = preserveCase;
this.currentOneBasedGenomicPosition = intervalToDecode.getStart();
this.rawBytes = rawBytes;
rawBytes.position(sequenceRecord.getSequenceBasesStartOffset() + ((currentOneBasedGenomicPosition - 1) / 4));
this.currentByte = rawBytes.get();
this.currentBitOffset = ((currentOneBasedGenomicPosition - 1) % 4) * 2;
nBlockIterator = new PeekableIterator<>(sequenceRecord.getNBlocks().iterator());
maskedBlockIterator = new PeekableIterator<>(sequenceRecord.getMaskedBlocks().iterator());
advanceBlockIterators();
loadNextBase();
}
@Override
public Iterator iterator() {
return this;
}
@Override
public boolean hasNext() {
return nextBase != null;
}
@Override
public Byte next() {
if ( ! hasNext() ) {
throw new NoSuchElementException("next() called when iterator exhausted");
}
Byte toReturn = nextBase;
loadNextBase();
return toReturn;
}
private void loadNextBase() {
if ( iterationComplete() ) {
nextBase = null;
return;
}
nextBase = decodeBaseAtCurrentPosition();
if ( isPositionedAtNBase() ) {
nextBase = 'N';
} else if ( isPositionedAtMaskedBase() && preserveCase ) { // Masked 'N' is not supported
nextBase = (byte) Character.toLowerCase(nextBase);
}
advancePosition();
}
private Byte decodeBaseAtCurrentPosition() {
// The DNA is packed to two bits per base, represented as so: T - 00, C - 01, A - 10, G - 11.
// The first base is in the most significant 2-bit byte; the last base is in the least significant 2 bits.
return twoBitEncodingToBase[(currentByte >> (6 - currentBitOffset)) & 3];
}
private boolean iterationComplete() {
return currentOneBasedGenomicPosition > intervalToDecode.getEnd();
}
private void advancePosition() {
currentOneBasedGenomicPosition++;
if ( iterationComplete() ) {
return;
}
// Bit offsets can be 0, 2, 4, or 6, corresponding to the start bit for each of the four bases encoded in a byte
if ( currentBitOffset <= 4 ) {
currentBitOffset += 2;
} else { // Advance to the next full byte
currentByte = rawBytes.get();
currentBitOffset = 0;
}
advanceBlockIterators();
}
private void advanceBlockIterators() {
advanceBlockIterator(nBlockIterator);
advanceBlockIterator(maskedBlockIterator);
}
private void advanceBlockIterator( final PeekableIterator iterator ) {
while ( iterator.hasNext() && iterator.peek().getEnd() < currentOneBasedGenomicPosition ) {
iterator.next(); // we've passed this interval on the genome, so discard it
}
}
private boolean isPositionedAtNBase() {
final SimpleInterval currentNBlock = nBlockIterator.peek();
return currentNBlock != null && currentNBlock.getStart() <= currentOneBasedGenomicPosition &&
currentNBlock.getEnd() >= currentOneBasedGenomicPosition;
}
private boolean isPositionedAtMaskedBase() {
final SimpleInterval currentMaskedBlock = maskedBlockIterator.peek();
return currentMaskedBlock != null && currentMaskedBlock.getStart() <= currentOneBasedGenomicPosition &&
currentMaskedBlock.getEnd() >= currentOneBasedGenomicPosition;
}
}
/**
* Stores the raw metadata for a specific sequence from the TwoBit file
*/
private static class TwoBitSequenceRecord implements Serializable {
private static final long serialVersionUID = 1L;
private final String sequenceName;
private final int sequenceIndex;
private final int sequenceRecordStartOffset;
private int sequenceBasesStartOffset;
private int dnaSize;
private List nBlocks;
private List maskedBlocks;
public TwoBitSequenceRecord( final String sequenceName, final int sequenceIndex, final int sequenceRecordStartOffset ) {
this.sequenceName = sequenceName;
this.sequenceIndex = sequenceIndex;
this.sequenceRecordStartOffset = sequenceRecordStartOffset;
}
public String getSequenceName() {
return sequenceName;
}
public int getSequenceIndex() {
return sequenceIndex;
}
public int getSequenceRecordStartOffset() {
return sequenceRecordStartOffset;
}
public void setDNASize( final int dnaSize ) {
this.dnaSize = dnaSize;
}
public int getDNASize() {
return dnaSize;
}
public void setSequenceBasesStartOffset( final int sequenceBasesStartOffset ) {
this.sequenceBasesStartOffset = sequenceBasesStartOffset;
}
public int getSequenceBasesStartOffset() {
return sequenceBasesStartOffset;
}
public void setNBlocks( final List nBlocks ) {
this.nBlocks = nBlocks;
}
public List getNBlocks() {
return nBlocks;
}
public void setMaskedBlocks( final List maskedBlocks ) {
this.maskedBlocks = maskedBlocks;
}
public List getMaskedBlocks() {
return maskedBlocks;
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy