All Downloads are FREE. Search and download functionalities are using the official Maven repository.

commonMain.Utf8.kt Maven / Gradle / Ivy

There is a newer version: 0.6.0
Show newest version
/*
 * Copyright 2017-2023 JetBrains s.r.o. and respective authors and developers.
 * Use of this source code is governed by the Apache 2.0 license that can be found in the LICENCE file.
 */

/*
 * Copyright (C) 2017 Square, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * `kotlinx-io` assumes most applications use UTF-8 exclusively, and offers optimized implementations of
 * common operations on UTF-8 strings.
 *
 * <table>
 * <tr>
 * <th></th>
 * <th>[ByteString]</th>
 * <th>[Buffer], [BufferedSink], [BufferedSource]</th>
 * </tr>
 * <tr>
 * <td>Encode a string</td>
 * <td>[ByteString.encodeUtf8]</td>
 * <td>[BufferedSink.writeUtf8]</td>
 * </tr>
 * <tr>
 * <td>Encode a code point</td>
 * <td></td>
 * <td>[BufferedSink.writeUtf8CodePoint]</td>
 * </tr>
 * <tr>
 * <td>Decode a string</td>
 * <td>[ByteString.utf8]</td>
 * <td>[BufferedSource.readUtf8], [BufferedSource.readUtf8]</td>
 * </tr>
 * <tr>
 * <td>Decode a code point</td>
 * <td></td>
 * <td>[BufferedSource.readUtf8CodePoint]</td>
 * </tr>
 * <tr>
 * <td>Decode until the next `\r\n` or `\n`</td>
 * <td></td>
 * <td>[BufferedSource.readUtf8LineStrict],
 * [BufferedSource.readUtf8LineStrict]</td>
 * </tr>
 * <tr>
 * <td>Decode until the next `\r\n`, `\n`, or `EOF`</td>
 * <td></td>
 * <td>[BufferedSource.readUtf8Line]</td>
 * </tr>
 * <tr>
 * <td>Measure the bytes in a UTF-8 string</td>
 * <td colspan="2">[Utf8.size], [Utf8.size]</td>
 * </tr>
 * </table>
 */

package kotlinx.io

import kotlinx.io.internal.*
import kotlinx.io.unsafe.UnsafeBufferOperations
import kotlinx.io.unsafe.withData
import kotlin.math.min

/**
 * Returns the number of bytes used to encode the slice of `string` as UTF-8 when using [Sink.writeString].
 *
 * A malformed surrogate (a lone low surrogate, or a high surrogate that is not followed by a low one inside
 * the slice) is counted as a single byte, matching the `?` emitted for it by the writer.
 *
 * @param startIndex the index (inclusive) of the first character to encode, `0` by default.
 * @param endIndex the index (exclusive) of the character past the last character to encode, `string.length` by default.
 *
 * @throws IndexOutOfBoundsException when [startIndex] or [endIndex] is out of range of string indices.
 * @throws IllegalArgumentException when `startIndex > endIndex`.
 *
 * @sample kotlinx.io.samples.KotlinxIoCoreCommonSamples.utf8SizeSample
 */
internal fun String.utf8Size(startIndex: Int = 0, endIndex: Int = length): Long {
    checkBounds(length, startIndex, endIndex)

    var result = 0L
    var i = startIndex
    while (i < endIndex) {
        val c = this[i].code

        if (c < 0x80) {
            // A 7-bit character with 1 byte.
            result++
            i++
        } else if (c < 0x800) {
            // An 11-bit character with 2 bytes.
            result += 2
            i++
        } else if (c < 0xd800 || c > 0xdfff) {
            // A 16-bit character with 3 bytes.
            result += 3
            i++
        } else {
            // `c` is a surrogate; peek at the next char (0 when the slice ends here).
            val low = if (i + 1 < endIndex) this[i + 1].code else 0
            if (c > 0xdbff || low < 0xdc00 || low > 0xdfff) {
                // A malformed surrogate, which yields '?'.
                result++
                i++
            } else {
                // A 21-bit character with 4 bytes.
                result += 4
                i += 2
            }
        }
    }

    return result
}

/**
 * Encodes [codePoint] in UTF-8 and writes it to this sink.
 *
 * [codePoint] should represent valid Unicode code point, meaning that its value should be within the Unicode codespace
 * (`U+000000` .. `U+10ffff`), otherwise [IllegalArgumentException] will be thrown.
 *
 * Note that in general, a value retrieved from [Char.code] could not be written directly
 * as it may be a part of a [surrogate pair](https://www.unicode.org/faq/utf_bom.html#utf16-2) (that could be
 * detected using [Char.isSurrogate], or [Char.isHighSurrogate] and [Char.isLowSurrogate]).
 * Such a pair of characters needs to be manually converted back to a single code point
 * which then could be written to a [Sink].
 * Without such a conversion, data written to a [Sink] can not be converted back
 * to a string from which a surrogate pair was retrieved.
 *
 * More specifically, all code points mapping to UTF-16 surrogates (`U+d800`..`U+dfff`)
 * will be written as `?` characters (`U+003f`).
 *
 * @param codePoint the codePoint to be written.
 *
 * @throws IllegalStateException when the sink is closed.
 * @throws IllegalArgumentException when [codePoint] value is negative, or greater than `U+10ffff`.
 * @throws IOException when some I/O error occurs.
 *
 * @sample kotlinx.io.samples.KotlinxIoCoreCommonSamples.writeUtf8CodePointSample
 * @sample kotlinx.io.samples.KotlinxIoCoreCommonSamples.writeSurrogatePair
 */
@OptIn(DelicateIoApi::class)
public fun Sink.writeCodePointValue(codePoint: Int): Unit =
    writeToInternalBuffer { it.commonWriteUtf8CodePoint(codePoint) }

/**
 * Encodes the characters at [startIndex] up to [endIndex] from [string] in UTF-8 and writes it to this sink.
 *
 * @param string the string to be encoded.
 * @param startIndex the index (inclusive) of the first character to encode, 0 by default.
 * @param endIndex the index (exclusive) of the character past the last character to encode,
 * `string.length` by default.
 *
 * @throws IndexOutOfBoundsException when [startIndex] or [endIndex] is out of range of [string] indices.
 * @throws IllegalArgumentException when `startIndex > endIndex`.
 * @throws IllegalStateException when the sink is closed.
 * @throws IOException when some I/O error occurs.
 *
 * @sample kotlinx.io.samples.KotlinxIoCoreCommonSamples.writeUtf8Sample
 */
@OptIn(DelicateIoApi::class)
public fun Sink.writeString(string: String, startIndex: Int = 0, endIndex: Int = string.length) {
    checkBounds(string.length, startIndex, endIndex)

    writeToInternalBuffer { it.commonWriteUtf8(startIndex, endIndex, string::get) }
}

/**
 * Encodes the characters at [startIndex] up to [endIndex] from [chars] in UTF-8 and writes it to this sink.
 *
 * @param chars the string to be encoded.
 * @param startIndex the index (inclusive) of the first character to encode, 0 by default.
 * @param endIndex the index (exclusive) of the character past the last character to encode,
 * `chars.length` by default.
 *
 * @throws IndexOutOfBoundsException when [startIndex] or [endIndex] is out of range of [chars] indices.
 * @throws IllegalArgumentException when `startIndex > endIndex`.
 * @throws IllegalStateException when the sink is closed.
 * @throws IOException when some I/O error occurs.
 *
 * @sample kotlinx.io.samples.KotlinxIoCoreCommonSamples.writeUtf8SeqSample
 */
@OptIn(DelicateIoApi::class)
public fun Sink.writeString(chars: CharSequence, startIndex: Int = 0, endIndex: Int = chars.length) {
    checkBounds(chars.length, startIndex, endIndex)

    writeToInternalBuffer { it.commonWriteUtf8(startIndex, endIndex, chars::get) }
}

/**
 * Removes all bytes from this source, decodes them as UTF-8, and returns the string.
 *
 * Returns the empty string if this source is empty.
 *
 * @throws IllegalArgumentException when the source holds more than [Int.MAX_VALUE] bytes.
 * @throws IllegalStateException when the source is closed.
 * @throws IOException when some I/O error occurs.
 *
 * @sample kotlinx.io.samples.KotlinxIoCoreCommonSamples.readUtf8
 */
@OptIn(InternalIoApi::class)
public fun Source.readString(): String {
    request(Long.MAX_VALUE) // Request all data
    return buffer.commonReadUtf8(buffer.size)
}

/**
 * Removes all bytes from this buffer, decodes them as UTF-8, and returns the string.
 *
 * Returns the empty string if this buffer is empty.
 *
 * @throws IllegalArgumentException when the buffer holds more than [Int.MAX_VALUE] bytes.
 *
 * @sample kotlinx.io.samples.KotlinxIoCoreCommonSamples.readUtf8
 */
public fun Buffer.readString(): String {
    return commonReadUtf8(size)
}

/**
 * Removes [byteCount] bytes from this source, decodes them as UTF-8, and returns the string.
 *
 * @param byteCount the number of bytes to read from the source for string decoding.
 *
 * @throws IllegalArgumentException when [byteCount] is negative or exceeds [Int.MAX_VALUE].
 * @throws EOFException when the source is exhausted before reading [byteCount] bytes from it.
 * @throws IllegalStateException when the source is closed.
 * @throws IOException when some I/O error occurs.
 *
 * @sample kotlinx.io.samples.KotlinxIoCoreCommonSamples.readUtf8
 */
@OptIn(InternalIoApi::class)
public fun Source.readString(byteCount: Long): String {
    require(byteCount)
    return buffer.commonReadUtf8(byteCount)
}

/**
 * Decodes a single code point value from UTF-8 code units, reading between 1 and 4 bytes as necessary.
 *
 * If this source is exhausted before a complete code point can be read, this throws an
 * [EOFException] and consumes no input.
 *
 * If this source starts with an ill-formed UTF-8 code units sequence, this method will remove
 * 1 or more non-UTF-8 bytes and return the replacement character (`U+fffd`).
 *
 * The replacement character (`U+fffd`) will be also returned if the source starts with a well-formed
 * code units sequences, but a decoded value does not pass further validation, such as
 * the value is out of range (beyond the `0x10ffff` limit of Unicode), maps to UTF-16 surrogates (`U+d800`..`U+dfff`),
 * or an overlong encoding is detected (such as `0xc080` for the NUL character in modified UTF-8).
 *
 * Note that in general, returned value may not be directly converted to [Char] as it may be out
 * of [Char]'s values range and should be manually converted to a
 * [surrogate pair](https://www.unicode.org/faq/utf_bom.html#utf16-2).
 *
 * @throws EOFException when the source is exhausted before a complete code point can be read.
 * @throws IllegalStateException when the source is closed.
 * @throws IOException when some I/O error occurs.
 *
 * @sample kotlinx.io.samples.KotlinxIoCoreCommonSamples.readUtf8CodePointSample
 * @sample kotlinx.io.samples.KotlinxIoCoreCommonSamples.surrogatePairs
 */
@OptIn(InternalIoApi::class)
public fun Source.readCodePointValue(): Int {
    if (this is Buffer) {
        return commonReadUtf8CodePoint()
    }
    require(1)

    // The lead byte announces the sequence length; buffer the whole sequence before decoding.
    val b0 = buffer[0].toInt()
    when {
        b0 and 0xe0 == 0xc0 -> require(2)
        b0 and 0xf0 == 0xe0 -> require(3)
        b0 and 0xf8 == 0xf0 -> require(4)
    }

    return buffer.commonReadUtf8CodePoint()
}

/**
 * Removes and returns UTF-8 encoded characters up to but not including the next line break. A line break is
 * either `"\n"` or `"\r\n"`; these characters are not included in the result.
 *
 * On the end of the stream this method returns null. If the source doesn't end with a line break, then
 * an implicit line break is assumed. Null is returned once the source is exhausted.
 *
 * @throws IllegalStateException when the source is closed.
 * @throws IOException when some I/O error occurs.
 *
 * @sample kotlinx.io.samples.KotlinxIoCoreCommonSamples.readLinesSample
 */
@OptIn(InternalIoApi::class)
public fun Source.readLine(): String? {
    if (!request(1)) return null

    var lfIndex = this.indexOf('\n'.code.toByte())
    return when (lfIndex) {
        // No line feed at all: the remaining data is the last line.
        -1L -> readString()
        0L -> {
            skip(1)
            ""
        }

        else -> {
            var skipBytes = 1
            // Treat "\r\n" as a single line break.
            if (buffer[lfIndex - 1] == '\r'.code.toByte()) {
                lfIndex -= 1
                skipBytes += 1
            }
            val string = readString(lfIndex)
            skip(skipBytes.toLong())
            string
        }
    }
}

/**
 * Removes and returns UTF-8 encoded characters up to but not including the next line break, throwing
 * [EOFException] if a line break was not encountered. A line break is either `"\n"` or `"\r\n"`;
 * these characters are not included in the result.
 *
 * The returned string will have at most [limit] UTF-8 bytes, and the maximum number of bytes
 * scanned is `limit + 2`. With `limit == 0` only an empty line can be returned, so the source has to start
 * with a line break; otherwise an [EOFException] is thrown.
 *
 * No bytes are discarded if the match fails.
 *
 * @param limit the maximum UTF-8 bytes constituting a returned string.
 *
 * @throws EOFException when the source does not contain a string consisting of at most [limit] bytes followed by
 * line break characters.
 * @throws IllegalStateException when the source is closed.
 * @throws IllegalArgumentException when [limit] is negative.
 * @throws IOException when some I/O error occurs.
 *
 * @sample kotlinx.io.samples.KotlinxIoCoreCommonSamples.readLinesSample
 */
@OptIn(InternalIoApi::class)
public fun Source.readLineStrict(limit: Long = Long.MAX_VALUE): String {
    require(limit >= 0) { "limit ($limit) < 0" }
    require(1)

    var lfIndex = indexOf('\n'.code.toByte(), startIndex = 0, endIndex = limit)

    if (lfIndex == 0L) {
        skip(1)
        return ""
    }

    if (lfIndex > 0) {
        var skipBytes = 1L
        // Treat "\r\n" as a single line break.
        if (buffer[lfIndex - 1] == '\r'.code.toByte()) {
            lfIndex -= 1
            skipBytes += 1
        }
        val str = readString(lfIndex)
        skip(skipBytes)
        return str
    }

    // we reached the end of the source before hitting the limit
    if (buffer.size < limit) throw EOFException()
    // we can't read data anymore
    if (limit == Long.MAX_VALUE) throw EOFException()
    // there is no more data
    if (!request(limit + 1)) throw EOFException()

    val b = buffer[limit]
    if (b == '\n'.code.toByte()) {
        val str = readString(limit)
        skip(1)
        return str
    }
    // check if the last byte is CR and the byte past it is LF
    if (b != '\r'.code.toByte() || !request(limit + 2)) throw EOFException()
    if (buffer[limit + 1] != '\n'.code.toByte()) throw EOFException()
    val res = readString(limit)
    skip(2)
    return res
}

/**
 * Decodes one code point from the head of this buffer and removes the bytes that were examined.
 *
 * Ill-formed, truncated-by-a-non-continuation-byte, overlong, surrogate and out-of-range sequences
 * all yield [REPLACEMENT_CODE_POINT].
 *
 * @throws EOFException when the buffer holds fewer bytes than the lead byte announces.
 */
private fun Buffer.commonReadUtf8CodePoint(): Int {
    require(1)

    val b0 = this[0]
    var codePoint: Int
    val byteCount: Int
    val min: Int // Smallest code point that legitimately needs `byteCount` bytes.

    when {
        b0 and 0x80 == 0 -> {
            // 0xxxxxxx
            codePoint = b0 and 0x7f
            byteCount = 1 // 7 bits (ASCII).
            min = 0x0
        }

        b0 and 0xe0 == 0xc0 -> {
            // 110xxxxx
            codePoint = b0 and 0x1f
            byteCount = 2 // 11 bits (5 + 6).
            min = 0x80
        }

        b0 and 0xf0 == 0xe0 -> {
            // 1110xxxx
            codePoint = b0 and 0x0f
            byteCount = 3 // 16 bits (4 + 6 + 6).
            min = 0x800
        }

        b0 and 0xf8 == 0xf0 -> {
            // 11110xxx
            codePoint = b0 and 0x07
            byteCount = 4 // 21 bits (3 + 6 + 6 + 6).
            min = 0x10000
        }

        else -> {
            // We expected the first byte of a code point but got something else.
            skip(1)
            return REPLACEMENT_CODE_POINT
        }
    }

    if (size < byteCount) {
        throw EOFException("size < $byteCount: $size (to read code point prefixed 0x${b0.toHexString()})")
    }

    // Read the continuation bytes. If we encounter a non-continuation byte, the sequence consumed
    // thus far is truncated and is decoded as the replacement character. That non-continuation byte
    // is left in the buffer for processing by the next call to this function.
    for (i in 1 until byteCount) {
        val b = this[i.toLong()]
        if (b and 0xc0 == 0x80) {
            // 10xxxxxx
            codePoint = codePoint shl 6
            codePoint = codePoint or (b and 0x3f)
        } else {
            skip(i.toLong())
            return REPLACEMENT_CODE_POINT
        }
    }

    skip(byteCount.toLong())

    return when {
        codePoint > 0x10ffff -> {
            REPLACEMENT_CODE_POINT // Reject code points larger than the Unicode maximum.
        }

        codePoint in 0xd800..0xdfff -> {
            REPLACEMENT_CODE_POINT // Reject partial surrogates.
        }

        codePoint < min -> {
            REPLACEMENT_CODE_POINT // Reject overlong code points.
        }

        else -> codePoint
    }
}

/**
 * Transcodes the UTF-16 chars produced by [charAt] for indices `beginIndex until endIndex` to UTF-8 and
 * appends the bytes to this buffer. Malformed surrogates are written as a single `?` byte.
 */
@OptIn(UnsafeIoApi::class)
private inline fun Buffer.commonWriteUtf8(beginIndex: Int, endIndex: Int, charAt: (Int) -> Char) {
    // Transcode UTF-16 chars to UTF-8 bytes.
    var i = beginIndex
    while (i < endIndex) {
        var c = charAt(i).code

        when {
            c < 0x80 -> {
                UnsafeBufferOperations.writeToTail(this, 1) { ctx, segment ->
                    // Maps a char index to an offset in the writable area: offset = index - (i at entry).
                    val segmentOffset = -i
                    val runLimit = minOf(endIndex, i + segment.remainingCapacity)

                    // Emit a 7-bit character with 1 byte.
                    ctx.setUnchecked(segment, segmentOffset + i++, c.toByte()) // 0xxxxxxx

                    // Fast-path contiguous runs of ASCII characters. This is ugly, but yields a ~4x performance
                    // improvement over independent calls to writeByte().
                    while (i < runLimit) {
                        c = charAt(i).code
                        if (c >= 0x80) break
                        ctx.setUnchecked(segment, segmentOffset + i++, c.toByte()) // 0xxxxxxx
                    }

                    i + segmentOffset // Equivalent to i - (previous i).
                }
            }

            c < 0x800 -> {
                // Emit a 11-bit character with 2 bytes.
                UnsafeBufferOperations.writeToTail(this, 2) { ctx, segment ->
                    ctx.setUnchecked(
                        segment, 0,
                        (c shr 6 or 0xc0).toByte(), // 110xxxxx
                        (c and 0x3f or 0x80).toByte() // 10xxxxxx
                    )
                    2
                }
                i++
            }

            c < 0xd800 || c > 0xdfff -> {
                // Emit a 16-bit character with 3 bytes.
                UnsafeBufferOperations.writeToTail(this, 3) { ctx, segment ->
                    ctx.setUnchecked(
                        segment, 0,
                        (c shr 12 or 0xe0).toByte(), // 1110xxxx
                        (c shr 6 and 0x3f or 0x80).toByte(), // 10xxxxxx
                        (c and 0x3f or 0x80).toByte() // 10xxxxxx
                    )
                    3
                }
                i++
            }

            else -> {
                // c is a surrogate. Make sure it is a high surrogate & that its successor is a low
                // surrogate. If not, the UTF-16 is invalid, in which case we emit a replacement
                // character.
                val low = (if (i + 1 < endIndex) charAt(i + 1).code else 0)
                if (c > 0xdbff || low !in 0xdc00..0xdfff) {
                    writeByte('?'.code.toByte())
                    i++
                } else {
                    // UTF-16 high surrogate: 110110xxxxxxxxxx (10 bits)
                    // UTF-16 low surrogate:  110111yyyyyyyyyy (10 bits)
                    // Unicode code point:    00010000000000000000 + xxxxxxxxxxyyyyyyyyyy (21 bits)
                    val codePoint = 0x010000 + (c and 0x03ff shl 10 or (low and 0x03ff))

                    // Emit a 21-bit character with 4 bytes.
                    UnsafeBufferOperations.writeToTail(this, 4) { ctx, segment ->
                        ctx.setUnchecked(
                            segment, 0,
                            (codePoint shr 18 or 0xf0).toByte(), // 11110xxx
                            (codePoint shr 12 and 0x3f or 0x80).toByte(), // 10xxxxxx
                            (codePoint shr 6 and 0x3f or 0x80).toByte(), // 10xxyyyy
                            (codePoint and 0x3f or 0x80).toByte() // 10yyyyyy
                        )
                        4
                    }
                    i += 2
                }
            }
        }
    }
}

/**
 * Encodes [codePoint] in UTF-8 and appends it to this buffer. Code points in the surrogate range
 * (`0xd800..0xdfff`) are written as a single `?` byte.
 *
 * @throws IllegalArgumentException when [codePoint] is negative or greater than `0x10ffff`.
 */
@OptIn(UnsafeIoApi::class)
private fun Buffer.commonWriteUtf8CodePoint(codePoint: Int) {
    when {
        codePoint < 0 || codePoint > 0x10ffff -> {
            throw IllegalArgumentException(
                "Code point value is out of Unicode codespace 0..0x10ffff: 0x${codePoint.toHexString()} ($codePoint)"
            )
        }

        codePoint < 0x80 -> {
            // Emit a 7-bit code point with 1 byte.
            writeByte(codePoint.toByte())
        }

        codePoint < 0x800 -> {
            // Emit a 11-bit code point with 2 bytes.
            UnsafeBufferOperations.writeToTail(this, 2) { ctx, segment ->
                ctx.setUnchecked(segment, 0, (codePoint shr 6 or 0xc0).toByte()) // 110xxxxx
                ctx.setUnchecked(segment, 1, (codePoint and 0x3f or 0x80).toByte()) // 10xxxxxx
                2
            }
        }

        codePoint in 0xd800..0xdfff -> {
            // Emit a replacement character for a partial surrogate.
            writeByte('?'.code.toByte())
        }

        codePoint < 0x10000 -> {
            // Emit a 16-bit code point with 3 bytes.
            UnsafeBufferOperations.writeToTail(this, 3) { ctx, segment ->
                ctx.setUnchecked(segment, 0, (codePoint shr 12 or 0xe0).toByte()) // 1110xxxx
                ctx.setUnchecked(segment, 1, (codePoint shr 6 and 0x3f or 0x80).toByte()) // 10xxxxxx
                ctx.setUnchecked(segment, 2, (codePoint and 0x3f or 0x80).toByte()) // 10xxxxxx
                3
            }
        }

        else -> { // [0x10000, 0x10ffff]
            // Emit a 21-bit code point with 4 bytes.
            UnsafeBufferOperations.writeToTail(this, 4) { ctx, segment ->
                ctx.setUnchecked(segment, 0, (codePoint shr 18 or 0xf0).toByte()) // 11110xxx
                ctx.setUnchecked(segment, 1, (codePoint shr 12 and 0x3f or 0x80).toByte()) // 10xxxxxx
                ctx.setUnchecked(segment, 2, (codePoint shr 6 and 0x3f or 0x80).toByte()) // 10xxyyyy
                ctx.setUnchecked(segment, 3, (codePoint and 0x3f or 0x80).toByte()) // 10yyyyyy
                4
            }
        }
    }
}

/**
 * Removes [byteCount] bytes from the head of this buffer and decodes them as UTF-8.
 *
 * @throws IllegalArgumentException when [byteCount] exceeds [Int.MAX_VALUE].
 */
@OptIn(UnsafeIoApi::class)
private fun Buffer.commonReadUtf8(byteCount: Long): String {
    // Invariant: byteCount was request()'ed into this buffer beforehand
    if (byteCount == 0L) return ""
    // The code below narrows byteCount with toInt(); reject values that would be silently truncated.
    require(byteCount <= Int.MAX_VALUE) { "byteCount ($byteCount) > Int.MAX_VALUE" }

    UnsafeBufferOperations.forEachSegment(this) { ctx, segment ->
        if (segment.size >= byteCount) {
            // Fast path: the whole string sits in the first segment, decode it in place.
            var result = ""
            ctx.withData(segment) { data, pos, limit ->
                result = data.commonToUtf8String(pos, min(limit, pos + byteCount.toInt()))
                skip(byteCount)
                return result
            }
        }
        // If the string spans multiple segments, delegate to readByteArray()
        return readByteArray(byteCount.toInt()).commonToUtf8String()
    }
    error("Unreacheable")
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy