commonMain.okio.Utf8.kt Maven / Gradle / Ivy
/*
* Copyright (C) 2017 Square, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Okio assumes most applications use UTF-8 exclusively, and offers optimized implementations of
 * common operations on UTF-8 strings.
 *
 * | Operation | [ByteString] | [Buffer], [BufferedSink], [BufferedSource] |
 * | --- | --- | --- |
 * | Encode a string | [ByteString.encodeUtf8] | [BufferedSink.writeUtf8] |
 * | Encode a code point | | [BufferedSink.writeUtf8CodePoint] |
 * | Decode a string | [ByteString.utf8] | [BufferedSource.readUtf8], [BufferedSource.readUtf8] |
 * | Decode a code point | | [BufferedSource.readUtf8CodePoint] |
 * | Decode until the next `\r\n` or `\n` | | [BufferedSource.readUtf8LineStrict], [BufferedSource.readUtf8LineStrict] |
 * | Decode until the next `\r\n`, `\n`, or `EOF` | | [BufferedSource.readUtf8Line] |
 * | Measure the bytes in a UTF-8 string | [Utf8.size], [Utf8.size] | |
 */
@file:JvmName("Utf8")
package okio
import kotlin.jvm.JvmName
import kotlin.jvm.JvmOverloads
/**
 * Computes how many bytes the chars in `[beginIndex, endIndex)` of this string occupy once encoded
 * as UTF-8 the way [BufferedSink.writeUtf8] encodes them.
 *
 * A surrogate that is not part of a well-formed high/low pair inside the range counts as a single
 * byte, because it is encoded as the one-byte replacement `'?'`.
 *
 * @param beginIndex index of the first char to measure, inclusive. Defaults to `0`.
 * @param endIndex index just past the last char to measure. Defaults to the string's length.
 * @return the encoded length in bytes.
 * @throws IllegalArgumentException if `beginIndex` is negative, `endIndex` is smaller than
 *     `beginIndex`, or `endIndex` is larger than the string's length.
 */
@JvmOverloads
@JvmName("size")
fun String.utf8Size(beginIndex: Int = 0, endIndex: Int = length): Long {
  require(beginIndex >= 0) { "beginIndex < 0: $beginIndex" }
  require(endIndex >= beginIndex) { "endIndex < beginIndex: $endIndex < $beginIndex" }
  require(endIndex <= length) { "endIndex > string.length: $endIndex > $length" }

  var byteCount = 0L
  var position = beginIndex
  while (position < endIndex) {
    val unit = this[position].code
    if (unit !in 0xd800..0xdfff) {
      // Not a surrogate: the char encodes on its own as 1, 2 or 3 bytes (7, 11 or 16 bits).
      byteCount += when {
        unit < 0x80 -> 1
        unit < 0x800 -> 2
        else -> 3
      }
      position += 1
    } else {
      // Surrogate: peek at the following char, treating the end of the range as "no low surrogate".
      val following = if (position + 1 < endIndex) this[position + 1].code else 0
      val isWellFormedPair = unit <= 0xdbff && following in 0xdc00..0xdfff
      if (isWellFormedPair) {
        // A 21-bit character with 4 bytes, consuming both chars of the pair.
        byteCount += 4
        position += 2
      } else {
        // A malformed surrogate, which yields '?'.
        byteCount += 1
        position += 1
      }
    }
  }
  return byteCount
}
// Single byte (ASCII '?') emitted in place of a malformed UTF-16 surrogate when encoding to UTF-8.
internal const val REPLACEMENT_BYTE: Byte = '?'.code.toByte()
// Unicode replacement character U+FFFD, emitted when malformed UTF-8 is decoded to UTF-16 chars.
internal const val REPLACEMENT_CHARACTER: Char = '\ufffd'
// Code point form of REPLACEMENT_CHARACTER, emitted when malformed UTF-8 is decoded to code points.
internal const val REPLACEMENT_CODE_POINT: Int = REPLACEMENT_CHARACTER.code
/**
 * Returns true when [codePoint] falls in one of the two ISO control ranges, `0x00..0x1F` or
 * `0x7F..0x9F`.
 */
@Suppress("NOTHING_TO_INLINE") // Syntactic sugar.
internal inline fun isIsoControl(codePoint: Int): Boolean = when (codePoint) {
  in 0x00..0x1F, in 0x7F..0x9F -> true
  else -> false
}
/**
 * Returns true if [byte] has the bit pattern `0b10xxxxxx`, which marks a UTF-8 continuation byte
 * rather than the first byte of an encoded character.
 */
@Suppress("NOTHING_TO_INLINE") // Syntactic sugar.
internal inline fun isUtf8Continuation(byte: Byte): Boolean =
  // Isolate the two header bits and compare them with the continuation marker.
  (byte and 0xc0) == 0x80
// TODO combine with Buffer.writeUtf8?
// TODO combine with Buffer.writeUtf8CodePoint?
/**
 * Transcodes the UTF-16 chars in `[beginIndex, endIndex)` of this string to UTF-8 and hands every
 * encoded byte to [yield], in output order.
 *
 * A surrogate that is not part of a well-formed high/low pair inside the range is encoded as the
 * single byte [REPLACEMENT_BYTE]. The indices are not validated by this function.
 */
internal inline fun String.processUtf8Bytes(
  beginIndex: Int,
  endIndex: Int,
  yield: (Byte) -> Unit
) {
  var cursor = beginIndex
  while (cursor < endIndex) {
    val unit = this[cursor].code
    if (unit < 0x80) {
      // 7 bits -> 1 byte: 0xxxxxxx
      yield(unit.toByte())
      cursor++
      // ASCII usually comes in runs, so drain the run without going back through the dispatch.
      while (cursor < endIndex && this[cursor] < '\u0080') {
        yield(this[cursor++].code.toByte())
      }
    } else if (unit < 0x800) {
      // 11 bits -> 2 bytes: 110xxxxx 10xxxxxx
      yield(((unit shr 6) or 0xc0).toByte())
      yield(((unit and 0x3f) or 0x80).toByte())
      cursor++
    } else if (unit !in 0xd800..0xdfff) {
      // 16 bits -> 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
      yield(((unit shr 12) or 0xe0).toByte())
      yield((((unit shr 6) and 0x3f) or 0x80).toByte())
      yield(((unit and 0x3f) or 0x80).toByte())
      cursor++
    } else {
      // The char is a surrogate. It is only valid as a high surrogate immediately followed, inside
      // the range, by a low surrogate.
      val startsValidPair = unit <= 0xdbff &&
        cursor + 1 < endIndex &&
        this[cursor + 1] in '\udc00'..'\udfff'
      if (startsValidPair) {
        // UTF-16 high surrogate: 110110xxxxxxxxxx (10 bits)
        // UTF-16 low surrogate:  110111yyyyyyyyyy (10 bits)
        // Unicode code point:    00010000000000000000 + xxxxxxxxxxyyyyyyyyyy (21 bits)
        val low = this[cursor + 1].code
        val codePoint = 0x010000 + ((unit - 0xd800) shl 10) + (low - 0xdc00)
        // 21 bits -> 4 bytes: 11110xxx 10xxxxxx 10xxyyyy 10yyyyyy
        yield(((codePoint shr 18) or 0xf0).toByte())
        yield((((codePoint shr 12) and 0x3f) or 0x80).toByte())
        yield((((codePoint shr 6) and 0x3f) or 0x80).toByte())
        yield(((codePoint and 0x3f) or 0x80).toByte())
        cursor += 2
      } else {
        // Invalid UTF-16: emit the replacement byte and move past this one char only.
        yield(REPLACEMENT_BYTE)
        cursor++
      }
    }
  }
}
// TODO combine with Buffer.readUtf8CodePoint?
/**
 * Decodes the UTF-8 bytes in `[beginIndex, endIndex)` of this array and passes each decoded code
 * point to [yield], in input order.
 *
 * A byte that cannot start a sequence (an unexpected continuation byte or an unknown header)
 * produces [REPLACEMENT_CODE_POINT] and is skipped on its own. Multi-byte sequences are delegated
 * to [process2Utf8Bytes], [process3Utf8Bytes] and [process4Utf8Bytes], whose return value is the
 * number of bytes to advance.
 */
internal inline fun ByteArray.processUtf8CodePoints(
  beginIndex: Int,
  endIndex: Int,
  yield: (Int) -> Unit
) {
  var offset = beginIndex
  while (offset < endIndex) {
    val lead = this[offset]
    if (lead >= 0) {
      // 0b0xxxxxxx: plain ASCII.
      yield(lead.toInt())
      offset++
      // ASCII usually comes in runs, so drain the run right here.
      while (offset < endIndex && this[offset] >= 0) {
        yield(this[offset++].toInt())
      }
    } else if ((lead shr 5) == -2) {
      // 0b110xxxxx: lead byte of a 2-byte sequence.
      val consumed = process2Utf8Bytes(offset, endIndex) { codePoint -> yield(codePoint) }
      offset += consumed
    } else if ((lead shr 4) == -2) {
      // 0b1110xxxx: lead byte of a 3-byte sequence.
      val consumed = process3Utf8Bytes(offset, endIndex) { codePoint -> yield(codePoint) }
      offset += consumed
    } else if ((lead shr 3) == -2) {
      // 0b11110xxx: lead byte of a 4-byte sequence.
      val consumed = process4Utf8Bytes(offset, endIndex) { codePoint -> yield(codePoint) }
      offset += consumed
    } else {
      // 0b10xxxxxx - Unexpected continuation
      // 0b11111xxx - Unknown encoding
      yield(REPLACEMENT_CODE_POINT)
      offset++
    }
  }
}
// Value added to the high UTF-16 surrogate after shifting the code point right by 10 bits.
// Evaluates to 0xd7c0: the 0xd800 high-surrogate prefix minus the 0x010000 offset shifted right by
// 10, so that one addition both removes the offset and applies the prefix.
internal const val HIGH_SURROGATE_HEADER = 0xd800 - (0x010000 ushr 10)
// Value added to the low UTF-16 surrogate after masking the code point down to its low 10 bits.
// NOTE(review): "LOG" looks like a typo for "LOW"; the name is left as-is because it is referenced
// by name elsewhere.
internal const val LOG_SURROGATE_HEADER = 0xdc00
// TODO combine with Buffer.readUtf8?
/**
 * Decodes the UTF-8 bytes in `[beginIndex, endIndex)` of this array and passes the resulting
 * UTF-16 chars to [yield], in input order.
 *
 * Code points decoded from a 4-byte sequence are delivered as a high surrogate followed by a low
 * surrogate. Malformed input is delivered as [REPLACEMENT_CHARACTER].
 */
internal inline fun ByteArray.processUtf16Chars(
  beginIndex: Int,
  endIndex: Int,
  yield: (Char) -> Unit
) {
  var offset = beginIndex
  while (offset < endIndex) {
    val lead = this[offset]
    if (lead >= 0) {
      // 0b0xxxxxxx: plain ASCII.
      yield(lead.toInt().toChar())
      offset++
      // ASCII usually comes in runs. Draining the run here is almost double the performance of
      // going back through the outer loop.
      while (offset < endIndex && this[offset] >= 0) {
        yield(this[offset++].toInt().toChar())
      }
    } else if ((lead shr 5) == -2) {
      // 0b110xxxxx: lead byte of a 2-byte sequence.
      val consumed = process2Utf8Bytes(offset, endIndex) { codePoint -> yield(codePoint.toChar()) }
      offset += consumed
    } else if ((lead shr 4) == -2) {
      // 0b1110xxxx: lead byte of a 3-byte sequence.
      val consumed = process3Utf8Bytes(offset, endIndex) { codePoint -> yield(codePoint.toChar()) }
      offset += consumed
    } else if ((lead shr 3) == -2) {
      // 0b11110xxx: lead byte of a 4-byte sequence.
      val consumed = process4Utf8Bytes(offset, endIndex) { codePoint ->
        if (codePoint == REPLACEMENT_CODE_POINT) {
          yield(REPLACEMENT_CHARACTER)
        } else {
          // Unicode code point:    00010000000000000000 + xxxxxxxxxxyyyyyyyyyy (21 bits)
          // UTF-16 high surrogate: 110110xxxxxxxxxx (10 bits)
          // UTF-16 low surrogate:  110111yyyyyyyyyy (10 bits)
          yield(((codePoint ushr 10) + HIGH_SURROGATE_HEADER).toChar())
          yield(((codePoint and 0x03ff) + LOG_SURROGATE_HEADER).toChar())
        }
      }
      offset += consumed
    } else {
      // 0b10xxxxxx - Unexpected continuation
      // 0b11111xxx - Unknown encoding
      yield(REPLACEMENT_CHARACTER)
      offset++
    }
  }
}
// ===== UTF-8 Encoding and Decoding ===== //
/*
The following 3 methods take advantage of using XOR on numbers stored in 2's
complement to quickly and efficiently combine the important data of UTF-8 encoded
bytes. This is best explained using an example, so let's take the following
encoded character '∇' = \u2207.
Using the Unicode code point for this character, 0x2207, we will split the
binary representation into 3 sections as follows:
    0x2207 = 0b0010 0010 0000 0111
               xxxx yyyy yyzz zzzz
Now take each section of bits and add the appropriate header:
utf8(0x2207) = 0b1110 xxxx 0b10yy yyyy 0b10zz zzzz
= 0b1110 0010 0b1000 1000 0b1000 0111
= 0xe2 0x88 0x87
We have now just encoded this as a 3 byte UTF-8 character. More information
about different sizes of characters can be found here:
https://en.wikipedia.org/wiki/UTF-8
Encoding was pretty easy, but decoding is a bit more complicated. We need to
first determine the number of bytes used to represent the character, strip all
the headers, and then combine all the bits into a single integer. Let's use the
character we just encoded and work backwards, taking advantage of 2's complement
integer representation and the XOR function.
Let's look at the decimal representation of these bytes:
0xe2, 0x88, 0x87 = -30, -120, -121
The first interesting thing to notice is that UTF-8 headers all start with 1 -
except for ASCII which is encoded as a single byte - which means all non-ASCII
UTF-8 bytes will be negative. So converting these to integers results in a lot
of 1's added because they are stored as 2's complement:
0xe2 = -30 = 0xffff ffe2
0x88 = -120 = 0xffff ff88
0x87 = -121 = 0xffff ff87
Now let's XOR these with their corresponding UTF-8 byte headers to see what
happens:
0xffff ffe2 xor 0xffff ffe0 = 0x0000 0002
0xffff ff88 xor 0xffff ff80 = 0x0000 0008
0xffff ff87 xor 0xffff ff80 = 0x0000 0007
***This is why we must first convert the byte header mask to a byte and then
back to an integer, so it is properly converted to a 2's complement negative
number which can be applied to each byte.***
Now let's look at the binary representation to see how we can combine these to
create the Unicode code point:
0b0000 0010 0b0000 1000 0b0000 0111
0b1110 xxxx 0b10yy yyyy 0b10zz zzzz
Combining each section will require some bit shifting, but then they can just
be OR'd together. They can also be XOR'd together which makes use of a single,
COMMUTATIVE, operator through the entire calculation.
         << 12 = 00000010
         <<  6 =       00001000
         <<  0 =             00000111
           XOR = 00000010001000000111
    code point = 0b0010 0010 0000 0111
               = 0x2207
And there we have it! The decoded UTF-8 character '∇'! And because the XOR
operator is commutative, we can re-arrange all this XOR and shifting to create
a single mask that can be applied to 3-byte UTF-8 characters after their bytes
have been shifted and XOR'd together.
*/
// Mask used to remove byte headers from a 2 byte encoded UTF-8 character. It is XOR'd with the
// sign-extended bytes after they have been shifted into position, cancelling both the header bits
// and the sign-extension bits in one step.
internal const val MASK_2BYTES = 0x0f80
// MASK_2BYTES =
//    (0xc0.toByte() shl 6) xor
//    (0x80.toByte().toInt())
/**
 * Decodes the 2-byte UTF-8 sequence whose lead byte is at [beginIndex] and passes exactly one code
 * point to [yield]: the decoded value, or [REPLACEMENT_CODE_POINT] when the sequence is truncated
 * by [endIndex], has a non-continuation second byte, or is an overlong encoding.
 *
 * @return the number of bytes consumed: 1 for a truncated or malformed sequence, otherwise 2.
 */
internal inline fun ByteArray.process2Utf8Bytes(
  beginIndex: Int,
  endIndex: Int,
  yield: (Int) -> Unit
): Int {
  // Underflow: the lead byte is the only byte left in range.
  if (endIndex <= beginIndex + 1) {
    yield(REPLACEMENT_CODE_POINT)
    return 1
  }

  val lead = this[beginIndex]
  val second = this[beginIndex + 1]
  if (!isUtf8Continuation(second)) {
    // Malformed: consume only the lead byte so the next byte is re-examined as a fresh start.
    yield(REPLACEMENT_CODE_POINT)
    return 1
  }

  // Combine the sign-extended bytes and strip their headers with a single mask.
  val decoded = MASK_2BYTES xor second.toInt() xor (lead.toInt() shl 6)

  // Anything below 0x80 fits in one byte, so a 2-byte form of it is overlong and rejected.
  yield(if (decoded < 0x80) REPLACEMENT_CODE_POINT else decoded)
  return 2
}
// Mask used to remove byte headers from a 3 byte encoded UTF-8 character. It is XOR'd with the
// sign-extended bytes after they have been shifted into position, cancelling both the header bits
// and the sign-extension bits in one step.
internal const val MASK_3BYTES = -0x01e080
// MASK_3BYTES =
//    (0xe0.toByte() shl 12) xor
//    (0x80.toByte() shl 6) xor
//    (0x80.toByte().toInt())
/**
 * Decodes the 3-byte UTF-8 sequence whose lead byte is at [beginIndex] and passes exactly one code
 * point to [yield]: the decoded value, or [REPLACEMENT_CODE_POINT] when the sequence is truncated
 * by [endIndex], contains a non-continuation byte, is an overlong encoding, or decodes to a
 * surrogate.
 *
 * @return the number of bytes consumed. For a truncated or malformed sequence this is the count of
 *     bytes up to, but not including, the first missing or non-continuation byte; otherwise 3.
 */
internal inline fun ByteArray.process3Utf8Bytes(
  beginIndex: Int,
  endIndex: Int,
  yield: (Int) -> Unit
): Int {
  if (endIndex <= beginIndex + 2) {
    // Underflow: at most 2 of the 3 bytes are inside the range.
    yield(REPLACEMENT_CODE_POINT)
    // Consume the second byte too only when it exists and is a continuation byte.
    val secondIsContinuation =
      endIndex > beginIndex + 1 && isUtf8Continuation(this[beginIndex + 1])
    return if (secondIsContinuation) 2 else 1
  }

  val lead = this[beginIndex]
  val second = this[beginIndex + 1]
  if (!isUtf8Continuation(second)) {
    yield(REPLACEMENT_CODE_POINT)
    return 1
  }
  val third = this[beginIndex + 2]
  if (!isUtf8Continuation(third)) {
    yield(REPLACEMENT_CODE_POINT)
    return 2
  }

  // Combine the sign-extended bytes and strip their headers with a single mask.
  val decoded = MASK_3BYTES xor
    third.toInt() xor
    (second.toInt() shl 6) xor
    (lead.toInt() shl 12)

  // Reject overlong code points (below 0x800) and partial surrogates (0xd800..0xdfff).
  val rejected = decoded < 0x800 || decoded in 0xd800..0xdfff
  yield(if (rejected) REPLACEMENT_CODE_POINT else decoded)
  return 3
}
// Mask used to remove byte headers from a 4 byte encoded UTF-8 character. It is XOR'd with the
// sign-extended bytes after they have been shifted into position, cancelling both the header bits
// and the sign-extension bits in one step.
internal const val MASK_4BYTES = 0x381f80
// MASK_4BYTES =
//    (0xf0.toByte() shl 18) xor
//    (0x80.toByte() shl 12) xor
//    (0x80.toByte() shl 6) xor
//    (0x80.toByte().toInt())
/**
 * Decodes the 4-byte UTF-8 sequence whose lead byte is at [beginIndex] and passes exactly one code
 * point to [yield]: the decoded value, or [REPLACEMENT_CODE_POINT] when the sequence is truncated
 * by [endIndex], contains a non-continuation byte, or decodes to a value outside
 * `0x10000..0x10ffff`.
 *
 * @return the number of bytes consumed. For a truncated or malformed sequence this is the count of
 *     bytes up to, but not including, the first missing or non-continuation byte; otherwise 4.
 */
internal inline fun ByteArray.process4Utf8Bytes(
  beginIndex: Int,
  endIndex: Int,
  yield: (Int) -> Unit
): Int {
  if (endIndex <= beginIndex + 3) {
    // Underflow: at most 3 of the 4 bytes are inside the range.
    yield(REPLACEMENT_CODE_POINT)
    // Consume the leading bytes that exist and are continuation bytes; stop at the first that
    // is missing or malformed.
    return when {
      endIndex <= beginIndex + 1 || !isUtf8Continuation(this[beginIndex + 1]) -> 1
      endIndex <= beginIndex + 2 || !isUtf8Continuation(this[beginIndex + 2]) -> 2
      else -> 3
    }
  }

  val lead = this[beginIndex]
  val second = this[beginIndex + 1]
  if (!isUtf8Continuation(second)) {
    yield(REPLACEMENT_CODE_POINT)
    return 1
  }
  val third = this[beginIndex + 2]
  if (!isUtf8Continuation(third)) {
    yield(REPLACEMENT_CODE_POINT)
    return 2
  }
  val fourth = this[beginIndex + 3]
  if (!isUtf8Continuation(fourth)) {
    yield(REPLACEMENT_CODE_POINT)
    return 3
  }

  // Combine the sign-extended bytes and strip their headers with a single mask.
  val decoded = MASK_4BYTES xor
    fourth.toInt() xor
    (third.toInt() shl 6) xor
    (second.toInt() shl 12) xor
    (lead.toInt() shl 18)

  // Only 0x10000..0x10ffff is accepted. Larger values exceed the Unicode maximum; smaller values
  // are overlong encodings, which also covers the surrogate range.
  yield(if (decoded in 0x10000..0x10ffff) decoded else REPLACEMENT_CODE_POINT)
  return 4
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy