All downloads are free. Search and download functionality uses the official Maven repository.

htsjdk.variant.bcf2.BCF2Decoder Maven / Gradle / Ivy

There is a newer version: 4.1.3
Show newest version
/*
* Copyright (c) 2012 The Broad Institute
* 
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
* 
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* 
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

package htsjdk.variant.bcf2;

import htsjdk.tribble.TribbleException;
import htsjdk.variant.utils.GeneralUtils;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;

public final class BCF2Decoder {
    // Raw bytes of the block currently being decoded.  Starts out null, is assigned by
    // setRecordBytes() and is reset to null by skipNextBlock().
    byte[] recordBytes = null;
    // Stream over recordBytes that the decode methods read from.  Created by
    // setRecordBytes() and reset to null by skipNextBlock().
    ByteArrayInputStream recordStream = null;

    /**
     * Create a decoder with no block loaded.  A block has to be supplied through
     * readNextBlock() or setRecordBytes() before the decode methods have anything
     * to read from.
     */
    public BCF2Decoder() {
        // intentionally empty: recordBytes and recordStream keep their null initial values
    }

    /**
     * Create a decoder that is immediately set up to decode values from the
     * supplied bytes.  Intended for testing.
     *
     * @param recordBytes the bytes of the block to decode; passed to setRecordBytes() unchanged
     */
    protected BCF2Decoder(final byte[] recordBytes) {
        this.setRecordBytes(recordBytes);
    }

    // ----------------------------------------------------------------------
    //
    // Routines to load, set, skip blocks of underlying data we are decoding
    //
    // ----------------------------------------------------------------------

    /**
     * Load the next block from the stream and make it the block this decoder
     * decodes values from.
     *
     * @param blockSizeInBytes number of bytes that make up the block; must not be negative
     * @param stream the stream to pull the block's bytes from
     * @throws TribbleException if blockSizeInBytes is negative or the block cannot be read in full
     */
    public void readNextBlock(final int blockSizeInBytes, final InputStream stream) {
        if ( blockSizeInBytes < 0 ) {
            throw new TribbleException("Invalid block size " + blockSizeInBytes);
        }

        final byte[] block = readRecordBytes(blockSizeInBytes, stream);
        setRecordBytes(block);
    }

    /**
     * Skips the next block in the input stream without decoding it, and
     * invalidates the block currently held by this decoder.
     *
     * InputStream.skip() is allowed to skip fewer bytes than requested (possibly
     * zero) even when more data is available, so the skip is repeated until the
     * whole block has been passed over.  When skip() makes no progress a single
     * byte is read instead, which is the only reliable way to tell a stalled
     * skip apart from end of stream.
     *
     * @param blockSizeInBytes number of bytes to skip; must not be negative
     * @param stream the stream to skip forward in
     * @throws TribbleException if blockSizeInBytes is negative, if the stream ends
     *                          before the whole block was skipped, or on an I/O error
     */
    public void skipNextBlock(final int blockSizeInBytes, final InputStream stream) {
        // same sanity check readNextBlock applies to the block size
        if ( blockSizeInBytes < 0 ) throw new TribbleException("Invalid block size " + blockSizeInBytes);

        try {
            long remaining = blockSizeInBytes;
            int nSkipAttempts = 0;

            while ( remaining > 0 ) {
                nSkipAttempts++;
                final long skipped = stream.skip(remaining);
                if ( skipped > 0 ) {
                    remaining -= skipped;
                } else if ( stream.read() == -1 ) {
                    // skip() made no progress and read() confirms we are at end of stream
                    break;
                } else {
                    // skip() stalled but the stream still had data: the read consumed one byte
                    remaining--;
                }
            }

            validateReadBytes((int)(blockSizeInBytes - remaining), nSkipAttempts, blockSizeInBytes);
        } catch ( IOException e ) {
            throw new TribbleException("I/O error while reading BCF2 file", e);
        }

        // the previously loaded block is no longer the current one
        this.recordBytes = null;
        this.recordStream = null;
    }

    /**
     * Access the bytes of the block currently being decoded.
     *
     * @return the decoder's own byte[] for the current block (not a copy), or null
     *         when no block is loaded
     */
    public byte[] getRecordBytes() {
        return this.recordBytes;
    }

    /**
     * Size of the block currently being decoded.
     *
     * @return the length in bytes of the current block's byte[]
     * @throws NullPointerException if no block is loaded (recordBytes is null)
     */
    public int getBlockSize() {
        final byte[] block = this.recordBytes;
        return block.length;
    }

    /**
     * Tell whether every byte of the current block has been consumed.
     *
     * @return true when the current block's stream reports no unread bytes
     */
    public boolean blockIsFullyDecoded() {
        final int unreadBytes = recordStream.available();
        return unreadBytes == 0;
    }

    /**
     * Make the given bytes the current block: from now on values are decoded
     * from a fresh stream positioned at the start of this array.
     *
     * @param recordBytes the bytes of the block to decode
     */
    public void setRecordBytes(final byte[] recordBytes) {
        this.recordBytes = recordBytes;
        recordStream = new ByteArrayInputStream(this.recordBytes);
    }

    // ----------------------------------------------------------------------
    //
    // High-level decoder
    //
    // ----------------------------------------------------------------------

    /**
     * Decode the next typed value in the current block: first its type
     * descriptor byte, then the value that descriptor announces.
     *
     * @return the decoded value, or null when the value is missing
     * @throws IOException if reading from the block fails
     */
    public final Object decodeTypedValue() throws IOException {
        return decodeTypedValue(readTypeDescriptor());
    }

    /**
     * Decode the value announced by an already-read type descriptor.  The
     * number of elements is determined from the descriptor first (reading
     * further bytes from the block when the count is not stored inline).
     *
     * @param typeDescriptor the type descriptor byte preceding the value
     * @return the decoded value, or null when the value is missing
     * @throws IOException if reading from the block fails
     */
    public final Object decodeTypedValue(final byte typeDescriptor) throws IOException {
        return decodeTypedValue(typeDescriptor, decodeNumberOfElements(typeDescriptor));
    }

    /**
     * Decode size elements of the type encoded in typeDescriptor.
     *
     * The result depends on the type and the element count:
     * <ul>
     *   <li>size == 0: null (missing value)</li>
     *   <li>CHAR type: whatever decodeLiteralString(size) produces</li>
     *   <li>size == 1: the single decoded value, or null if it is missing</li>
     *   <li>otherwise: a list of the non-missing values, or null if all of them are missing</li>
     * </ul>
     *
     * Note that every missing element is dropped from the list, wherever it
     * occurs, so the list may be shorter than size.
     *
     * @param typeDescriptor the type descriptor byte; only its type is used here
     * @param size the number of elements to decode
     * @return see description
     * @throws IOException if reading from the block fails
     */
    public final Object decodeTypedValue(final byte typeDescriptor, final int size) throws IOException {
        if ( size == 0 ) {
            // missing value => null in java
            return null;
        }

        final BCF2Type type = BCF2Utils.decodeType(typeDescriptor);
        if ( type == BCF2Type.CHAR ) {
            // special case string decoding for efficiency
            return decodeLiteralString(size);
        }
        if ( size == 1 ) {
            return decodeSingleValue(type);
        }

        final ArrayList<Object> values = new ArrayList<Object>(size);
        for ( int i = 0; i < size; i++ ) {
            final Object val = decodeSingleValue(type);
            // auto-pruning: missing values are left out of the result
            if ( val != null ) values.add(val);
        }
        return values.isEmpty() ? null : values; // return null when all of the values are missing
    }

    /**
     * Decode one element of the given type from the current block.
     *
     * @param type the type of the element to decode
     * @return null if the element equals the type's missing value; otherwise an
     *         Integer for INT8/INT16/INT32, a Double for FLOAT, and an Integer
     *         holding the low 8 bits of the value for CHAR
     * @throws TribbleException if the type is none of the above
     * @throws IOException if reading from the block fails
     */
    public final Object decodeSingleValue(final BCF2Type type) throws IOException {
        // TODO -- decodeTypedValue should integrate this routine
        final int raw = decodeInt(type);
        if ( raw == type.getMissingBytes() ) {
            return null;
        }

        switch ( type ) {
            case INT8:
            case INT16:
            case INT32:
                return raw;
            case FLOAT:
                return rawFloatToFloat(raw);
            case CHAR:
                // TODO -- I cannot imagine why we'd get here, as string needs to be special cased
                return raw & 0xFF;
            default:
                throw new TribbleException("BCF2 codec doesn't know how to decode type " + type );
        }
    }

    // ----------------------------------------------------------------------
    //
    // Decode raw primitive data types (ints, floats, and strings)
    //
    // ----------------------------------------------------------------------

    /**
     * Decode a string occupying exactly size bytes of the current block.
     *
     * The string ends at the first 0 byte, or after size bytes if there is
     * none.  The bytes are converted with the String(byte[], int, int)
     * constructor, i.e. using the platform default charset.
     *
     * @param size the number of bytes the string occupies in the block; expected to be &gt; 0
     * @return null if the first byte is 0; otherwise the decoded String, or the
     *         result of BCF2Utils.explodeStringList when BCF2Utils.isCollapsedString
     *         accepts the string
     * @throws TribbleException if fewer than size bytes remain in the current block,
     *                          or if reading from the block fails
     */
    private final Object decodeLiteralString(final int size) {
        assert size > 0;

        // TODO -- assumes size > 0
        final byte[] bytes = new byte[size]; // TODO -- in principle should just grab bytes from underlying array
        try {
            // read() reports -1 once the block is exhausted; treat that as zero bytes read
            final int nRead = Math.max(recordStream.read(bytes), 0);
            if ( nRead < size )
                // a short read means the block is truncated; decoding the zero-filled
                // remainder would silently yield a shortened (or missing) string
                throw new TribbleException(String.format(
                        "Truncated BCF2 string: expected %d bytes but only %d remain in the current block",
                        size, nRead));

            // the string stops at the first 0 byte, if any
            int goodLength = 0;
            for ( ; goodLength < bytes.length ; goodLength++ )
                if ( bytes[goodLength] == 0 ) break;

            if ( goodLength == 0 )
                return null;
            else {
                final String s = new String(bytes, 0, goodLength);
                return BCF2Utils.isCollapsedString(s) ? BCF2Utils.explodeStringList(s) : s;
            }
        } catch ( IOException e ) {
            throw new TribbleException("readByte failure", e);
        }
    }

    /**
     * Determine how many elements the value described by typeDescriptor has.
     *
     * @param typeDescriptor the type descriptor byte of the value
     * @return the element count: taken from the descriptor itself when it is
     *         stored inline, otherwise decoded from the block as a typed int
     *         (-1 if that int is missing)
     * @throws IOException if reading from the block fails
     */
    public final int decodeNumberOfElements(final byte typeDescriptor) throws IOException {
        if ( ! BCF2Utils.sizeIsOverflow(typeDescriptor) ) {
            // the size is inline, so just decode it
            return BCF2Utils.decodeSize(typeDescriptor);
        }

        // the size follows as its own typed int.  Using -1 for a missing size
        // ensures we explode immediately with a bad size
        final byte sizeDescriptor = readTypeDescriptor();
        return decodeInt(sizeDescriptor, -1);
    }

    /**
     * Decode a single int whose encoding is given by typeDescriptor, mapping
     * the type's missing value to a caller-chosen substitute.  The descriptor
     * is expected to describe an inline, single-element value.
     *
     * @param typeDescriptor the type descriptor byte; only its type is used here
     * @param missingValue the value to return when the decoded int is the type's missing value
     * @return the decoded int, or missingValue if it is missing
     * @throws IOException if reading from the block fails
     */
    public final int decodeInt(final byte typeDescriptor, final int missingValue) throws IOException {
        final BCF2Type type = BCF2Utils.decodeType(typeDescriptor);
        final int decoded = decodeInt(type);
        if ( decoded == type.getMissingBytes() ) {
            return missingValue;
        }
        return decoded;
    }

    /**
     * Read one raw value of the given type from the current block.  No
     * missing-value handling is applied.
     *
     * @param type the type whose reader is used
     * @return the int produced by type.read() on the current block's stream
     * @throws IOException if reading from the block fails
     */
    public final int decodeInt(final BCF2Type type) throws IOException {
        final int raw = type.read(recordStream);
        return raw;
    }

    /**
     * Low-level reader for a vector of ints.
     *
     * Exactly size values of the given type are consumed from the block
     * (none when size == 0).  The result is:
     * <ul>
     *   <li>null when size == 0</li>
     *   <li>null when the first value is MISSING (all values are treated as missing)</li>
     *   <li>otherwise the values up to, but not including, the first MISSING one</li>
     * </ul>
     *
     * @param size the number of values stored in the block
     * @param type the type the values are encoded with
     * @param maybeDest optional caller-owned array to decode into.  It is used only
     *                  when it is non-null and at least size long; otherwise a new
     *                  array is allocated.  When it is used and no value is MISSING,
     *                  maybeDest itself is returned.  When a MISSING value cuts the
     *                  vector short, the result is always a freshly allocated copy.
     * @return see description
     * @throws IOException if reading from the block fails
     */
    public final int[] decodeIntArray(final int size, final BCF2Type type, int[] maybeDest) throws IOException {
        if ( size == 0 ) {
            return null;
        }

        // the caller's array can only be reused when it is big enough for every element
        final int[] reusable = ( maybeDest != null && maybeDest.length >= size ) ? maybeDest : null;

        final int first = decodeInt(type);
        final int missing = type.getMissingBytes();

        if ( first == missing ) {
            // fast path: the first element is missing, so consume and discard the rest
            int consumed = 1;
            while ( consumed < size ) {
                decodeInt(type);
                consumed++;
            }
            return null;
        }

        // at least one real element exists, so an int[] is worth having
        final int[] values = reusable != null ? reusable : new int[size];
        values[0] = first;

        int idx = 1;
        while ( idx < size ) {
            final int current = decodeInt(type);
            values[idx] = current;
            if ( current == missing ) {
                // consume the remaining values so the stream ends up past this vector
                for ( int rest = idx + 1; rest < size; rest++ ) {
                    decodeInt(type);
                }
                // auto-pruning: keep only the idx values that preceded the missing one
                return Arrays.copyOf(values, idx);
            }
            idx++;
        }

        return values; // none of the elements were MISSING
    }

    /**
     * Decode a vector of ints whose encoding is given by a type descriptor,
     * always decoding into a newly allocated array.
     *
     * @param typeDescriptor the type descriptor byte; only its type is used here
     * @param size the number of values stored in the block
     * @return the decoded values, as described for decodeIntArray(int, BCF2Type, int[])
     * @throws IOException if reading from the block fails
     */
    public final int[] decodeIntArray(final byte typeDescriptor, final int size) throws IOException {
        return decodeIntArray(size, BCF2Utils.decodeType(typeDescriptor), null);
    }

    /**
     * Reinterpret the bits of an int as a float and widen the result to double.
     *
     * @param rawFloat the bit pattern of a float
     * @return the float with that bit pattern, as a double
     */
    private double rawFloatToFloat(final int rawFloat) {
        final float asFloat = Float.intBitsToFloat(rawFloat);
        return asFloat; // implicit float -> double widening
    }

    // ----------------------------------------------------------------------
    //
    // Utility functions
    //
    // ----------------------------------------------------------------------

    /**
     * Read the size of the next block from inputStream, using the INT32 reader.
     *
     * @param inputStream the stream positioned at a block size
     * @return the value read
     * @throws IOException if reading from the stream fails
     */
    public final int readBlockSize(final InputStream inputStream) throws IOException {
        final int blockSize = BCF2Type.INT32.read(inputStream);
        return blockSize;
    }

    /**
     * Read all bytes for a BCF record block into a byte[], and return it
     *
     * A single read() may deliver fewer bytes than requested, so the stream is
     * read repeatedly until the buffer is full or the stream ends.  The number
     * of read calls is tracked so that it can be reported if the block turns
     * out to be incomplete.
     *
     * @param blockSizeInBytes number of bytes to read; must not be negative
     * @param inputStream the stream to read from
     * @return a non-null byte[] containing exactly blockSizeInBytes bytes from the inputStream
     * @throws TribbleException if the stream ends before blockSizeInBytes bytes were read,
     *                          or on an I/O error
     */
    private static byte[] readRecordBytes(final int blockSizeInBytes, final InputStream inputStream) {
        assert blockSizeInBytes >= 0;

        final byte[] record = new byte[blockSizeInBytes];
        try {
            int bytesRead = 0;
            int nReadAttempts = 0; // keep track of how many times we've read

            // because we might not read enough bytes from the file in a single go, do it in a loop until we get EOF
            while ( bytesRead < blockSizeInBytes ) {
                nReadAttempts++;
                final int read1 = inputStream.read(record, bytesRead, blockSizeInBytes - bytesRead);
                if ( read1 == -1 )
                    // EOF with bytesRead < blockSizeInBytes, so this always throws and ends the loop
                    validateReadBytes(bytesRead, nReadAttempts, blockSizeInBytes);
                else
                    bytesRead += read1;
            }

            if ( GeneralUtils.DEBUG_MODE_ENABLED && nReadAttempts > 1 ) { // TODO -- remove me
                System.err.println("Required multiple read attempts to actually get the entire BCF2 block, unexpected behavior");
            }

            validateReadBytes(bytesRead, nReadAttempts, blockSizeInBytes);
        } catch ( IOException e ) {
            throw new TribbleException("I/O error while reading BCF2 file", e);
        }

        return record;
    }

    /**
     * Check that as many bytes were read as were expected, and fail otherwise.
     *
     * @param actuallyRead the number of bytes obtained so far
     * @param nReadAttempts the number of read attempts made; only used in the error message
     * @param expected the number of bytes that were required; must not be negative
     * @throws TribbleException if actuallyRead is less than expected
     */
    private static void validateReadBytes(final int actuallyRead, final int nReadAttempts, final int expected) {
        assert expected >= 0;

        if ( actuallyRead >= expected ) {
            return;
        }

        final String message = String.format(
                "Failed to read next complete record: expected %d bytes but read only %d after %d iterations",
                expected, actuallyRead, nReadAttempts);
        throw new TribbleException(message);
    }

    /**
     * Read the next byte of the current block as a type descriptor.
     *
     * @return the byte obtained from BCF2Utils.readByte on the current block's stream
     * @throws IOException if reading from the block fails
     */
    public final byte readTypeDescriptor() throws IOException {
        final byte descriptor = BCF2Utils.readByte(recordStream);
        return descriptor;
    }
}