All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.jetbrains.kotlin.library.encodings.WobblyTF8.kt Maven / Gradle / Ivy

There is a newer version: 2.0.0
Show newest version
/*
 * Copyright 2010-2021 JetBrains s.r.o. and Kotlin Programming Language contributors.
 * Use of this source code is governed by the Apache 2.0 license that can be found in the license/LICENSE.txt file.
 */

@file:Suppress("NOTHING_TO_INLINE")

package org.jetbrains.kotlin.library.encodings

/**
 * Implementation of Wobbly Transformation Format 8, a superset of UTF-8
 * that encodes surrogate code points if they are not in a pair.
 *
 * See also https://simonsapin.github.io/wtf-8/
 */
object WobblyTF8 {

    fun encode(string: String): ByteArray {
        val stringLength = string.length
        if (stringLength == 0) return EMPTY_BYTE_ARRAY

        val buffer = ByteArray(stringLength * 3) // Allocate for the worse case.
        var writtenBytes = 0

        var index = 0
        while (index < stringLength) {
            val char1 = string[index++]

            if (char1 < '\u0080') {
                // U+0000..U+007F -> 0xxxxxxx
                // 7 meaningful bits -> 1 byte
                buffer[writtenBytes++] = char1.code
            } else if (char1 < '\u0800') {
                // U+0080..U+07FF -> 110xxxxx 10xxxxxx
                // 11 meaningful bits -> 2 bytes
                val codePoint = char1.code
                buffer[writtenBytes++] = (codePoint ushr 6) or 0b1100_0000
                buffer[writtenBytes++] = (codePoint and 0b0011_1111) or 0b1000_0000
            } else {
                if (Character.isHighSurrogate(char1) && index < stringLength) {
                    val char2 = string[index]
                    if (Character.isLowSurrogate(char2)) {
                        // a pair of surrogates, encode as in traditional UTF8
                        // U+D800..U+DBFF + U+DC00..U+DFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                        // 21 meaningful bits -> 4 bytes
                        index++
                        val codePoint = Character.toCodePoint(char1, char2)
                        buffer[writtenBytes++] = (codePoint ushr 18) or 0b1111_0000
                        buffer[writtenBytes++] = ((codePoint ushr 12) and 0b0011_1111) or 0b1000_0000
                        buffer[writtenBytes++] = ((codePoint ushr 6) and 0b0011_1111) or 0b1000_0000
                        buffer[writtenBytes++] = (codePoint and 0b0011_1111) or 0b1000_0000
                        continue
                    }
                }

                // U+0800..U+FFFF -> 1110xxxx 10xxxxxx 10xxxxxx
                // 16 meaningful bits -> 3 bytes
                val codePoint = char1.code
                buffer[writtenBytes++] = (codePoint ushr 12) or 0b1110_0000
                buffer[writtenBytes++] = ((codePoint ushr 6) and 0b0011_1111) or 0b1000_0000
                buffer[writtenBytes++] = (codePoint and 0b0011_1111) or 0b1000_0000
            }
        }

        return if (buffer.size == writtenBytes) buffer else buffer.copyOf(writtenBytes)
    }

    fun decode(array: ByteArray): String {
        val arraySize = array.size
        if (arraySize == 0) return EMPTY_STRING

        val buffer = CharArray(arraySize) // Allocate for the worse case.
        var charsWritten = 0

        var index = 0
        while (index < arraySize) {
            val byte1 = array.readByteAsInt(index++)

            if (byte1 and 0b1000_0000 == 0) {
                // 0xxxxxxx -> U+0000..U+007F
                // 1 byte -> 7 meaningful bits
                buffer[charsWritten++] = byte1
                continue
            } else if (byte1 ushr 5 == 0b000_0110) {
                // 110xxxxx 10xxxxxx -> U+0080..U+07FF
                // 2 bytes -> 11 meaningful bits
                if (index < arraySize) {
                    val byte2 = array.readByteAsInt(index)
                    if (isValidContinuation(byte2)) {
                        index++
                        buffer[charsWritten++] = ((byte1 and 0b0001_1111) shl 6) or (byte2 and 0b0011_1111)
                        continue
                    }
                }
            } else if (byte1 ushr 4 == 0b0000_1110) {
                // 1110xxxx 10xxxxxx 10xxxxxx -> U+0800..U+FFFF
                // 3 bytes -> 16 meaningful bits
                if (index < arraySize) {
                    val byte2 = array.readByteAsInt(index)
                    if (isValidContinuation(byte2)) {
                        index++
                        if (index < arraySize) {
                            val byte3 = array.readByteAsInt(index)
                            if (isValidContinuation(byte3)) {
                                index++
                                buffer[charsWritten++] = ((byte1 and 0b0000_1111) shl 12) or
                                        ((byte2 and 0b0011_1111) shl 6) or
                                        (byte3 and 0b0011_1111)
                                continue
                            }
                        }
                    }
                }
            } else if (byte1 ushr 3 == 0b0001_1110) {
                // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx -> U+D800..U+DBFF + U+DC00..U+DFFF
                // 4 bytes -> 21 meaningful bits
                if (index < arraySize) {
                    val byte2 = array.readByteAsInt(index)
                    if (isValidContinuation(byte2)) {
                        index++
                        if (index < arraySize) {
                            val byte3 = array.readByteAsInt(index)
                            if (isValidContinuation(byte3)) {
                                index++
                                if (index < arraySize) {
                                    val byte4 = array.readByteAsInt(index)
                                    if (isValidContinuation(byte4)) {
                                        index++
                                        val codePoint = ((byte1 and 0b0000_0111) shl 18) or
                                                ((byte2 and 0b0011_1111) shl 12) or
                                                ((byte3 and 0b0011_1111) shl 6) or
                                                (byte4 and 0b0011_1111)
                                        buffer[charsWritten++] = Character.highSurrogate(codePoint)
                                        buffer[charsWritten++] = Character.lowSurrogate(codePoint)
                                        continue
                                    }
                                }
                            }
                        }
                    }
                }
            }

            // unexpected end of the byte sequence or unexpected bit pattern
            buffer[charsWritten++] = REPLACEMENT_CHAR
        }

        return if (buffer.size == charsWritten) String(buffer) else String(buffer, 0, charsWritten)
    }

    private fun ByteArray.readByteAsInt(index: Int): Int = this[index].toInt() and 0b1111_1111

    private operator fun ByteArray.set(index: Int, value: Int) {
        this[index] = value.toByte()
    }

    private operator fun CharArray.set(index: Int, value: Int) {
        this[index] = value.toChar()
    }

    private fun isValidContinuation(byteN: Int) = byteN ushr 6 == 0b0000_0010

    private val EMPTY_BYTE_ARRAY = byteArrayOf()
    private const val EMPTY_STRING = ""

    private const val REPLACEMENT_CHAR = '\ufffd'
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy