All Downloads are FREE. Search and download functionalities are using the official Maven repository.

io.dahgan.stream.Decoders.kt Maven / Gradle / Ivy

The newest version!
package io.dahgan.stream

/**
 * Decodes a UTF-32 (LE or BE) byte array to unicode characters.
 *
 * Instances are created through [be] or [le]. The [combine] function receives the four bytes of
 * one UTF-32 code unit, in stream order and as unsigned values, and returns the assembled value.
 */
class UTF32Decoder
private constructor(val combine: (Int, Int, Int, Int) -> Int) : Decoder {

    companion object {
        /**
         * Creates a decoder for UTF-32BE input.
         *
         * Its combine function merges the four bytes of a UTF-32BE character,
         * treating the first byte as the most significant one.
         */
        fun be(): UTF32Decoder = UTF32Decoder { first, second, third, fourth ->
            fourth + 256 * (third + 256 * (second + 256 * first))
        }

        /**
         * Creates a decoder for UTF-32LE input.
         *
         * Its combine function merges the four bytes of a UTF-32LE character,
         * treating the first byte as the least significant one.
         */
        fun le(): UTF32Decoder = UTF32Decoder { first, second, third, fourth ->
            first + 256 * (second + 256 * (third + 256 * fourth))
        }
    }

    /**
     * Decodes the four bytes starting at [offset] into a single unicode character.
     *
     * The returned character carries the offset of the byte following the decoded unit.
     *
     * @throws IllegalArgumentException if fewer than four bytes remain, or if the combined value
     * is not a Unicode scalar value (negative, above U+10FFFF, or a surrogate code point).
     * @see Decoder#decode
     */
    override fun decode(bytes: ByteArray, offset: Int): UniChar {
        if (hasFewerThan(offset, 4, bytes)) {
            throw IllegalArgumentException("UTF-32 input contains invalid number of bytes")
        }

        val first = bytes[offset].toUnsignedInt()
        val second = bytes[offset + 1].toUnsignedInt()
        val third = bytes[offset + 2].toUnsignedInt()
        val fourth = bytes[offset + 3].toUnsignedInt()

        val code = combine(first, second, third, fourth)

        // A most-significant byte of 0x80 or more overflows Int and yields a negative value;
        // anything above U+10FFFF or inside the surrogate block is not a valid UTF-32 code unit either.
        if (code !in 0..0x10FFFF || code in 0xD800..0xDFFF) {
            throw IllegalArgumentException("UTF-32 input contains invalid code point")
        }

        return UniChar(offset + 4, code)
    }
}

/**
 * Turns a UTF-16 byte array (big- or little-endian) into unicode characters.
 *
 * Instances come from [be] or [le]. The [combine] function receives the two bytes of one
 * 16-bit code unit, in stream order and as unsigned values, and returns the assembled unit.
 */
class UTF16Decoder
private constructor(val combine: (Int, Int) -> Int) : Decoder {

    /**
     * Reads one code unit at [offset]; a lead surrogate is merged with the unit that follows it,
     * a stray trail surrogate is rejected, and any other unit is returned unchanged.
     *
     * @see Decoder#decode
     */
    override fun decode(bytes: ByteArray, offset: Int): UniChar {
        val unit = undo(bytes, offset)
        return when (unit.code) {
            in 0xD800..0xDBFF -> combineLead(unit, bytes, unit.offset)
            in 0xDC00..0xDFFF ->
                throw IllegalArgumentException("UTF-16 contains trail surrogate without lead surrogate")
            else -> unit
        }
    }

    /**
     * Reads a single 16-bit code unit at [offset], without any surrogate handling.
     */
    private fun undo(bytes: ByteArray, offset: Int): UniChar {
        require(!hasFewerThan(offset, 2, bytes)) { "UTF-16 input contains odd number of bytes" }

        val unit = combine(bytes[offset].toUnsignedInt(), bytes[offset + 1].toUnsignedInt())
        return UniChar(offset + 2, unit)
    }

    /**
     * Merges a lead and a trail surrogate into the code point they encode.
     */
    private fun combineSurrogates(lead: Int, trail: Int): Int = lead * 1024 + trail + SURROGATE_OFFSET

    /**
     * Pairs the given lead surrogate with the code unit found at [offset],
     * which has to be a trail surrogate, and returns the combined character.
     */
    private fun combineLead(lead: UniChar, bytes: ByteArray, offset: Int): UniChar {
        require(!hasFewerThan(offset, 2, bytes)) { "UTF-16 contains lead surrogate as final character" }

        val trail = undo(bytes, offset)
        require(trail.code in 0xDC00..0xDFFF) { "UTF-16 contains lead surrogate without trail surrogate" }

        return UniChar(trail.offset, combineSurrogates(lead.code, trail.code))
    }

    companion object {
        /**
         * Surrogate offset constant, taken from the unicode FAQs.
         */
        private const val SURROGATE_OFFSET = 0x10000 - (0xD800 * 1024) - 0xDC00

        /**
         * Creates a decoder for UTF-16BE input.
         *
         * Its combine function merges the two bytes of a UTF-16BE code unit (high byte first).
         */
        fun be(): UTF16Decoder = UTF16Decoder { high, low -> high * 256 + low }

        /**
         * Creates a decoder for UTF-16LE input.
         *
         * Its combine function merges the two bytes of a UTF-16LE code unit (low byte first).
         */
        fun le(): UTF16Decoder = UTF16Decoder { low, high -> high * 256 + low }
    }
}

/**
 * Decodes a UTF-8 byte array to unicode characters.
 *
 * Besides checking the structure of each sequence (lead byte followed by the right number of
 * continuation bytes), the decoder rejects sequences that are ill-formed UTF-8: overlong
 * encodings, encoded surrogate code points and code points beyond U+10FFFF.
 */
class UTF8Decoder : Decoder {

    /**
     * Decodes the UTF-8 sequence starting at [offset] into a single unicode character.
     *
     * The returned character carries the offset of the byte following the decoded sequence.
     *
     * @throws IllegalArgumentException if no byte is available at [offset] or the sequence is not valid UTF-8.
     * @see Decoder#decode
     */
    override fun decode(bytes: ByteArray, offset: Int): UniChar {
        if (hasFewerThan(offset, 1, bytes)) {
            throw IllegalArgumentException("UTF-8 input contains invalid number of bytes")
        }

        val first = bytes[offset].toUnsignedInt()

        return when {
            first < 0x80 -> UniChar(offset + 1, first)
            // 0x80..0xBF are continuation bytes and can never start a sequence.
            first < 0xC0 -> throw IllegalArgumentException("UTF-8 input contains invalid first byte")
            first < 0xE0 -> decodeTwoUTF8(first, offset + 1, bytes)
            first < 0xF0 -> decodeThreeUTF8(first, offset + 1, bytes)
            first < 0xF8 -> decodeFourUTF8(first, offset + 1, bytes)
            else -> throw IllegalArgumentException("UTF-8 input contains invalid first byte")
        }
    }

    /**
     * Tells whether the given unsigned byte value is a UTF-8 continuation byte (0x80..0xBF).
     */
    private fun isContinuation(byte: Int): Boolean = byte in 0x80..0xBF

    /**
     * Decodes a two-byte UTF-8 character, where the first byte is already available
     * and the second one is located at [offset].
     */
    private fun decodeTwoUTF8(first: Int, offset: Int, bytes: ByteArray): UniChar {
        if (hasFewerThan(offset, 1, bytes)) {
            throw IllegalArgumentException("UTF-8 double byte char is missing second byte at eof")
        }

        val second = bytes[offset].toUnsignedInt()

        if (!isContinuation(second)) {
            throw IllegalArgumentException("UTF-8 double byte char has invalid second byte")
        }

        val code = combineTwoUTF8(first, second)

        // Values below 0x80 fit in a single byte, so a two-byte form (lead byte 0xC0 or 0xC1) is overlong.
        if (code < 0x80) {
            throw IllegalArgumentException("UTF-8 double byte char is an overlong encoding")
        }

        return UniChar(offset + 1, code)
    }

    /**
     * Combines the first and second bytes of a two-byte UTF-8 char into a single unicode char.
     */
    private fun combineTwoUTF8(first: Int, second: Int) =
            (first - 0xC0) * 64 + (second - 0x80)

    /**
     * Decodes a three-byte UTF-8 character, where the first byte is already available
     * and the second and third ones are located at [offset] and the position after it.
     */
    private fun decodeThreeUTF8(first: Int, offset: Int, bytes: ByteArray): UniChar {
        if (hasFewerThan(offset, 2, bytes)) {
            throw IllegalArgumentException("UTF-8 triple byte char is missing bytes at eof")
        }

        val second = bytes[offset].toUnsignedInt()
        val third = bytes[offset + 1].toUnsignedInt()

        if (!isContinuation(second)) {
            throw IllegalArgumentException("UTF-8 triple byte char has invalid second byte")
        }
        if (!isContinuation(third)) {
            throw IllegalArgumentException("UTF-8 triple byte char has invalid third byte")
        }

        val code = combineThreeUTF8(first, second, third)

        // Values below 0x800 fit in two bytes, so a three-byte form is overlong.
        if (code < 0x800) {
            throw IllegalArgumentException("UTF-8 triple byte char is an overlong encoding")
        }
        // Surrogate code points only exist as UTF-16 code units and must not be encoded in UTF-8.
        if (code in 0xD800..0xDFFF) {
            throw IllegalArgumentException("UTF-8 triple byte char encodes a surrogate code point")
        }

        return UniChar(offset + 2, code)
    }

    /**
     * Combines the first, second and third bytes of a three-byte UTF-8 char into a single unicode char.
     */
    private fun combineThreeUTF8(first: Int, second: Int, third: Int) =
            (first - 0xE0) * 4096 + (second - 0x80) * 64 + (third - 0x80)

    /**
     * Decodes a four-byte UTF-8 character, where the first byte is already available
     * and the second, third and fourth ones are located at [offset] and the two positions after it.
     */
    private fun decodeFourUTF8(first: Int, offset: Int, bytes: ByteArray): UniChar {
        if (hasFewerThan(offset, 3, bytes)) {
            throw IllegalArgumentException("UTF-8 quad byte char is missing bytes at eof")
        }

        val second = bytes[offset].toUnsignedInt()
        val third = bytes[offset + 1].toUnsignedInt()
        val fourth = bytes[offset + 2].toUnsignedInt()

        if (!isContinuation(second)) {
            throw IllegalArgumentException("UTF-8 quad byte char has invalid second byte")
        }
        if (!isContinuation(third)) {
            throw IllegalArgumentException("UTF-8 quad byte char has invalid third byte")
        }
        if (!isContinuation(fourth)) {
            throw IllegalArgumentException("UTF-8 quad byte char has invalid fourth byte")
        }

        val code = combineFourUTF8(first, second, third, fourth)

        // Values below 0x10000 fit in three bytes, so a four-byte form is overlong.
        if (code < 0x10000) {
            throw IllegalArgumentException("UTF-8 quad byte char is an overlong encoding")
        }
        // Lead bytes 0xF5..0xF7, and 0xF4 followed by 0x90 or more, reach past the last code point.
        if (code > 0x10FFFF) {
            throw IllegalArgumentException("UTF-8 quad byte char encodes a code point beyond U+10FFFF")
        }

        return UniChar(offset + 3, code)
    }

    /**
     * Combines the first, second, third and fourth bytes of a four-byte UTF-8 char into a single unicode char.
     */
    private fun combineFourUTF8(first: Int, second: Int, third: Int, fourth: Int) =
            (first - 0xF0) * 262144 + (second - 0x80) * 4096 + (third - 0x80) * 64 + (fourth - 0x80)
}

/**
 * Tells whether fewer than [n] bytes remain in [bytes] when reading starts at [offset].
 */
private fun hasFewerThan(offset: Int, n: Int, bytes: ByteArray): Boolean {
    val remaining = bytes.size - offset
    return remaining < n
}

/**
 * Widens this byte to an [Int] and keeps only its low eight bits,
 * so the result always lies in the range 0..255.
 */
private fun Byte.toUnsignedInt(): Int {
    val widened = toInt()
    return widened and 0xFF
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy