All downloads are free. Search and download functionality uses the official Maven repository.

htsjdk.samtools.BinaryTagCodec Maven / Gradle / Ivy

/*
 * The MIT License
 *
 * Copyright (c) 2009 The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
package htsjdk.samtools;

import htsjdk.samtools.util.BinaryCodec;
import htsjdk.samtools.util.StringUtil;

import java.lang.reflect.Array;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;

/**
 * Converter between disk and in-memory representation of a SAMRecord tag.
 *
 * <p>Writing goes through the {@link BinaryCodec} supplied to the constructor. Reading is done by the static
 * {@code readTags()} method, which decodes directly from a byte array and does not use a BinaryCodec.
 */
public class BinaryTagCodec {
    // Size in bytes of the fixed part of the disk representation of a tag,
    // i.e. the number of bytes occupied by the tag name and tag type fields.
    private static final int FIXED_TAG_SIZE = 3;

    // Size in bytes of the fixed part of the value of a binary array,
    // i.e. the number of bytes occupied by the array type (1 byte) and the array length (4 bytes).
    private static final int FIXED_BINARY_ARRAY_TAG_SIZE = 5;

    // Integers are stored in the smallest size that will hold them.
    // The unsigned maxima are derived from the signed ones: 2 * MAX + 1 (e.g. 2 * 127 + 1 == 255).
    private static final long MAX_INT = Integer.MAX_VALUE;
    private static final long MAX_UINT = MAX_INT * 2 + 1;
    private static final long MAX_SHORT = Short.MAX_VALUE;
    private static final long MAX_USHORT = MAX_SHORT * 2 + 1;
    private static final long MAX_BYTE = Byte.MAX_VALUE;
    private static final long MAX_UBYTE = MAX_BYTE * 2 + 1;

    // Source or sink for disk representation.
    final BinaryCodec binaryCodec;

    /**
     * For writing tags.
     * For reading tags, a BinaryCodec is not used.  See readTags() below.
     * @param binaryCodec where to write the file rep of the tags
     */
    public BinaryTagCodec(final BinaryCodec binaryCodec) {
        this.binaryCodec = binaryCodec;
    }

    /**
     * Computes how many bytes the value part of a tag occupies on disk (excluding tag name and tag type).
     *
     * @param attributeValue In-memory representation of a tag value.
     * @return Size in bytes to store the value on disk.
     * @throws IllegalArgumentException if the value is of a type that cannot be written as a tag.
     */
    private static int getBinaryValueSize(final Object attributeValue) {
        switch (getTagValueType(attributeValue)) {
            case 'Z':
                // One byte per character plus one extra byte (the reader expects a null terminator).
                return ((String) attributeValue).length() + 1;
            case 'A':
                return 1;
            case 'I':
            case 'i':
                return 4;
            case 's':
            case 'S':
                return 2;
            case 'c':
            case 'C':
                return 1;
            case 'f':
                return 4;
            case 'H':
                // getTagValueType() in this class never returns 'H', so this branch is kept only for
                // completeness: two characters per byte plus one extra byte.
                final byte[] byteArray = (byte[]) attributeValue;
                return byteArray.length * 2 + 1;
            case 'B':
                final int numElements = Array.getLength(attributeValue);
                final int elementSize;
                if (attributeValue instanceof byte[]) {
                    elementSize = 1;
                } else if (attributeValue instanceof short[]) {
                    elementSize = 2;
                } else if (attributeValue instanceof int[]) {
                    elementSize = 4;
                } else if (attributeValue instanceof float[]) {
                    elementSize = 4;
                } else {
                    throw new IllegalArgumentException("Unsupported array type: " + attributeValue.getClass());
                }
                return numElements * elementSize + FIXED_BINARY_ARRAY_TAG_SIZE;
            default:
                throw new IllegalArgumentException("When writing BAM, unrecognized tag type " +
                        attributeValue.getClass().getName());
        }
    }

    /**
     * @param value In-memory representation of a tag value.
     * @return Size in bytes to store the tag name, tag type and tag value on disk.
     * @throws IllegalArgumentException if the value is of a type that cannot be written as a tag.
     */
    static int getTagSize(final Object value) {
        return FIXED_TAG_SIZE + getBinaryValueSize(value);
    }

    /**
     * Maps an in-memory tag value to its one-character on-disk type code.
     *
     * <p>Strings map to 'Z', Characters to 'A', Floats to 'f', integral boxed types (Byte, Short, Integer, Long)
     * to the smallest integer code that holds the value, and byte[]/short[]/int[]/float[] to 'B'.
     * The 'H' (hex string) tag type is never written anymore, because the 'B' style is more compact;
     * a byte[] is therefore reported as 'B'.
     *
     * @param value In-memory representation of a tag value.
     * @return One-character disk representation of tag type.
     * @throws IllegalArgumentException if the value is a non-integral, non-Float Number, an integer outside
     *         the encodable range, or any other unsupported type.
     */
    static char getTagValueType(final Object value) {
        if (value instanceof String) {
            return 'Z';
        } else if (value instanceof Character) {
            return 'A';
        } else if (value instanceof Float) {
            return 'f';
        } else if (value instanceof Number) {
            if (!(value instanceof Byte || value instanceof Short || value instanceof Integer || value instanceof Long)) {
                throw new IllegalArgumentException("Unrecognized tag type " + value.getClass().getName());
            }
            return getIntegerType(((Number) value).longValue());
        } else if (value instanceof byte[] || value instanceof short[] || value instanceof int[] || value instanceof float[]) {
            return 'B';
        } else {
            throw new IllegalArgumentException("When writing BAM, unrecognized tag type " +
                    value.getClass().getName());
        }
    }

    /**
     * Picks the narrowest integer type code for a value. Thresholds are tested from largest to smallest, so
     * each comparison only needs to check the lower bound of its range.
     *
     * @param val Integer tag value.
     * @return Tag type corresponding to the smallest integer type that will hold the given value.
     * @throws IllegalArgumentException if the value is above the unsigned 32-bit maximum or below
     *         {@link Integer#MIN_VALUE}.
     */
    static private char getIntegerType(final long val) {
        if (val > MAX_UINT) {
            throw new IllegalArgumentException("Integer attribute value too large to be encoded in BAM");
        }
        if (val > MAX_INT) {
            return 'I';
        }
        if (val > MAX_USHORT) {
            return 'i';
        }
        if (val > MAX_SHORT) {
            return 'S';
        }
        if (val > MAX_UBYTE) {
            return 's';
        }
        if (val > MAX_BYTE) {
            return 'C';
        }
        if (val >= Byte.MIN_VALUE) {
            return 'c';
        }
        if (val >= Short.MIN_VALUE) {
            return 's';
        }
        if (val >= Integer.MIN_VALUE) {
            return 'i';
        }
        throw new IllegalArgumentException("Integer attribute value too negative to be encoded in BAM");
    }

    /**
     * Write the given tag name and value to disk.
     *
     * <p>The tag name is written as a short, followed by the one-byte type code chosen by
     * {@link #getTagValueType(Object)}, followed by the value in the encoding for that type.
     *
     * @param tag Tag name packed into a short.
     * @param value In-memory tag value; must be of a type accepted by {@link #getTagValueType(Object)}.
     * @param isUnsignedArray Only used for array values: selects the unsigned element type code
     *        ('C', 'S', 'I') instead of the signed one ('c', 's', 'i').
     * @throws IllegalArgumentException if the value cannot be encoded as a tag.
     */
    public void writeTag(final short tag, final Object value, final boolean isUnsignedArray) {
        binaryCodec.writeShort(tag);
        final char tagValueType = getTagValueType(value);
        binaryCodec.writeByte(tagValueType);

        // Integer type codes depend on the magnitude of the value, not on its boxed class, so every integer
        // branch goes through Number rather than casting to one specific wrapper type.
        switch (tagValueType) {
            case 'Z':
                // NOTE(review): the boolean arguments presumably mean "no length prefix, null-terminated",
                // which is what readNullTerminatedString() expects -- confirm against BinaryCodec.
                binaryCodec.writeString((String) value, false, true);
                break;
            case 'A':
                binaryCodec.writeByte(((Character) value));
                break;
            case 'I':
                binaryCodec.writeUInt(((Number) value).longValue());
                break;
            case 'i':
                binaryCodec.writeInt(((Number) value).intValue());
                break;
            case 's':
                binaryCodec.writeShort(((Number) value).shortValue());
                break;
            case 'S':
                binaryCodec.writeUShort(((Number) value).intValue());
                break;
            case 'c':
                binaryCodec.writeByte(((Number) value).byteValue());
                break;
            case 'C':
                // A Short or Long in 128..255 also maps to 'C'; casting to Integer would fail for those.
                binaryCodec.writeUByte(((Number) value).shortValue());
                break;
            case 'f':
                binaryCodec.writeFloat((Float) value);
                break;
            // Writing 'H' (hex string) is no longer supported; byte arrays are written as 'B'.
            case 'B':
                writeArray(value, isUnsignedArray);
                break;
            default:
                throw new IllegalArgumentException("When writing BAM, unrecognized tag type " +
                        value.getClass().getName());
        }
    }

    /**
     * Writes the value part of a 'B' tag: a one-byte element type code, the element count as an int, and
     * then each element in order.
     *
     * @param value A byte[], short[], int[] or float[].
     * @param isUnsignedArray Whether to write the upper-case (unsigned) element type code for integer arrays.
     *        Ignored for float arrays.
     * @throws SAMException if value is not one of the supported array types.
     */
    private void writeArray(final Object value, final boolean isUnsignedArray) {
        if (value instanceof byte[]) {
            binaryCodec.writeByte(isUnsignedArray ? 'C' : 'c');
            final byte[] array = (byte[]) value;
            binaryCodec.writeInt(array.length);
            for (final byte element : array) {
                binaryCodec.writeByte(element);
            }
        } else if (value instanceof short[]) {
            binaryCodec.writeByte(isUnsignedArray ? 'S' : 's');
            final short[] array = (short[]) value;
            binaryCodec.writeInt(array.length);
            for (final short element : array) {
                binaryCodec.writeShort(element);
            }
        } else if (value instanceof int[]) {
            binaryCodec.writeByte(isUnsignedArray ? 'I' : 'i');
            final int[] array = (int[]) value;
            binaryCodec.writeInt(array.length);
            for (final int element : array) {
                binaryCodec.writeInt(element);
            }
        } else if (value instanceof float[]) {
            binaryCodec.writeByte('f');
            final float[] array = (float[]) value;
            binaryCodec.writeInt(array.length);
            for (final float element : array) {
                binaryCodec.writeFloat(element);
            }
        } else {
            throw new SAMException("Unrecognized array value type: " + value.getClass());
        }
    }

    /**
     * Convert tags from little-endian disk representation to in-memory representation.
     * @param binaryRep Byte buffer containing file representation of tags.
     * @param offset Where in binaryRep tags start.
     * @param length How many bytes in binaryRep are tag storage.
     * @param validationStringency Passed through to the value readers for reporting validation errors.
     * @return Head of the linked list of decoded tags, or null if the given range contains no tags.
     * @throws SAMFormatException if a tag type or array element type is unrecognized, or an array length
     *         is negative or larger than the remaining tag data.
     */
    public static SAMBinaryTagAndValue readTags(final byte[] binaryRep, final int offset,
                                                final int length, final ValidationStringency validationStringency) {
        final ByteBuffer byteBuffer = ByteBuffer.wrap(binaryRep, offset, length);
        byteBuffer.order(ByteOrder.LITTLE_ENDIAN);

        SAMBinaryTagAndValue head = null;
        SAMBinaryTagAndValue tail = null;

        while (byteBuffer.hasRemaining()) {
            final short tag = byteBuffer.getShort();
            final byte tagType = byteBuffer.get();
            final SAMBinaryTagAndValue tmp;
            if (tagType != 'B') {
                tmp = new SAMBinaryTagAndValue(tag, readSingleValue(tagType, byteBuffer, validationStringency));
            } else {
                final TagValueAndUnsignedArrayFlag valueAndFlag = readArray(byteBuffer, validationStringency);
                if (valueAndFlag.isUnsignedArray) {
                    tmp = new SAMBinaryTagAndUnsignedArrayValue(tag, valueAndFlag.value);
                } else {
                    tmp = new SAMBinaryTagAndValue(tag, valueAndFlag.value);
                }
            }

            // If samjdk wrote the BAM then the attributes will be in lowest->highest tag order, so inserting at the
            // head each time will be very inefficient. To fix that we check here to see if the tag should go right on
            // the tail and if so stick it there, else insert it through the head.
            if (head == null) {
                head = tmp;
                tail = tmp;
            } else if (tmp.tag > tail.tag) {
                tail.insert(tmp);
                tail = tmp;
            } else {
                // NOTE(review): insert() presumably returns the (possibly new) head of the list -- confirm
                // against SAMBinaryTagAndValue.
                head = head.insert(tmp);
            }
        }

        return head;
    }

    /**
     * Read value of specified non-array type.
     * @param tagType What type to read.
     * @param byteBuffer Little-endian byte buffer to read value from.
     * @param validationStringency How to report an out-of-range unsigned 32-bit value.
     * @return Value in in-memory Object form: String for 'Z', Character for 'A', Float for 'f', byte[] for 'H',
     *         and Integer for the integer types, except that an 'I' value above {@link Integer#MAX_VALUE}
     *         is returned as a Long.
     * @throws SAMFormatException if tagType is not a recognized type code.
     */
    private static Object readSingleValue(final byte tagType, final ByteBuffer byteBuffer,
                                          final ValidationStringency validationStringency) {
        switch (tagType) {
            case 'Z':
                return readNullTerminatedString(byteBuffer);
            case 'A':
                return (char) byteBuffer.get();
            case 'I':
                // Mask to reinterpret the 32 bits read as an unsigned value held in a long.
                final long val = byteBuffer.getInt() & 0xffffffffL;
                if (val <= Integer.MAX_VALUE) {
                    return (int) val;
                }
                // If it won't fit into a signed integer, but is within range for an unsigned 32-bit integer,
                // return it directly as a long
                if (!SAMUtils.isValidUnsignedIntegerAttribute(val)) {
                    SAMUtils.processValidationError(new SAMValidationError(SAMValidationError.Type.TAG_VALUE_TOO_LARGE,
                            "Unsigned integer is out of range for a 32-bit unsigned value: " + val, null), validationStringency);
                }
                return val;
            case 'i':
                return byteBuffer.getInt();
            case 's':
                return (int) byteBuffer.getShort();
            case 'S':
                // Convert to unsigned short stored in an int
                return byteBuffer.getShort() & 0xffff;
            case 'c':
                return (int) byteBuffer.get();
            case 'C':
                // Convert to unsigned byte stored in an int
                return (int) byteBuffer.get() & 0xff;
            case 'f':
                return byteBuffer.getFloat();
            case 'H':
                // 'H' is still readable even though it is no longer written.
                final String hexRep = readNullTerminatedString(byteBuffer);
                return StringUtil.hexStringToBytes(hexRep);
            default:
                throw new SAMFormatException("Unrecognized tag type: " + (char) tagType);
        }
    }

    /**
     * Read the value of a 'B' (array) tag: a one-byte element type, an int element count, then the elements.
     * An upper-case element type marks the array as unsigned; the elements are stored in the same Java
     * array type either way.
     *
     * @param byteBuffer Little-endian byte buffer to read value from.
     * @param validationStringency Currently unused by this method.
     * @return CVO containing the value in in-memory Object form, and a flag indicating whether it is unsigned or not.
     * @throws SAMFormatException if the element type is unrecognized, or the element count is negative or
     *         needs more bytes than remain in the buffer.
     */
    private static TagValueAndUnsignedArrayFlag readArray(final ByteBuffer byteBuffer,
                                                          final ValidationStringency validationStringency) {
        final byte arrayType = byteBuffer.get();
        final boolean isUnsigned = Character.isUpperCase(arrayType);
        final int length = byteBuffer.getInt();
        final Object value;
        switch (Character.toLowerCase(arrayType)) {
            case 'c': {
                checkArrayLength(length, 1, byteBuffer);
                final byte[] array = new byte[length];
                value = array;
                byteBuffer.get(array);
                break;
            }
            case 's': {
                checkArrayLength(length, 2, byteBuffer);
                final short[] array = new short[length];
                value = array;
                for (int i = 0; i < length; ++i) {
                    array[i] = byteBuffer.getShort();
                }
                break;
            }
            case 'i': {
                checkArrayLength(length, 4, byteBuffer);
                final int[] array = new int[length];
                value = array;
                for (int i = 0; i < length; ++i) {
                    array[i] = byteBuffer.getInt();
                }
                break;
            }
            case 'f': {
                checkArrayLength(length, 4, byteBuffer);
                final float[] array = new float[length];
                value = array;
                for (int i = 0; i < length; ++i) {
                    array[i] = byteBuffer.getFloat();
                }
                break;
            }
            default:
                throw new SAMFormatException("Unrecognized tag array type: " + (char) arrayType);
        }
        return new TagValueAndUnsignedArrayFlag(value, isUnsigned);
    }

    /**
     * Rejects an on-disk array length that cannot be valid before any array is allocated for it, so that a
     * corrupt length field cannot trigger a negative-size or enormous allocation.
     *
     * @param length Element count read from disk.
     * @param elementSize Size in bytes of one element.
     * @param byteBuffer Buffer positioned at the first element.
     * @throws SAMFormatException if length is negative or the elements would not fit in the remaining bytes.
     */
    private static void checkArrayLength(final int length, final int elementSize, final ByteBuffer byteBuffer) {
        // Multiply as long so that a large length cannot overflow int and slip past the check.
        if (length < 0 || (long) length * elementSize > byteBuffer.remaining()) {
            throw new SAMFormatException("Invalid tag array length " + length + ": only " +
                    byteBuffer.remaining() + " bytes of tag data remain");
        }
    }

    /**
     * Reads bytes up to (and consuming) the next zero byte and converts the bytes before it to a String.
     *
     * @param byteBuffer Buffer positioned at the first byte of the string.
     * @return The decoded string, without the terminator.
     * @throws java.nio.BufferUnderflowException if the buffer ends before a zero byte is found.
     */
    private static String readNullTerminatedString(final ByteBuffer byteBuffer) {
        // Count the number of bytes in the string
        byteBuffer.mark();
        final int startPosition = byteBuffer.position();
        while (byteBuffer.get() != 0) {
            // Keep scanning; get() advances the position past each byte, including the terminator.
        }
        final int endPosition = byteBuffer.position();

        // Don't count null terminator
        final byte[] buf = new byte[endPosition - startPosition - 1];
        // Go back to the start of the string and read out the bytes
        byteBuffer.reset();
        byteBuffer.get(buf);
        // Skip over the null terminator
        byteBuffer.get();
        return StringUtil.bytesToString(buf);
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy