All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.datasketches.memory.Utf8 Maven / Gradle / Ivy

There is a newer version: 5.0.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.datasketches.memory;

import static java.lang.Character.isSurrogate;
import static java.lang.Character.isSurrogatePair;
import static java.lang.Character.toCodePoint;
import static org.apache.datasketches.memory.UnsafeUtil.unsafe;

import java.io.IOException;
import java.nio.BufferOverflowException;
import java.nio.CharBuffer;

/**
 * Encoding and decoding implementations of {@link WritableMemory#putCharsToUtf8} and
 * {@link Memory#getCharsFromUtf8}.
 *
 * 

This is specifically designed to reduce the production of intermediate objects (garbage), * thus significantly reducing pressure on the JVM Garbage Collector. * *

UTF-8 encoding/decoding is adapted from * https://github.com/protocolbuffers/protobuf/blob/master/java/core/src/main/java/com/google/protobuf/Utf8.java * *

Copyright 2008 Google Inc. All rights reserved. * https://developers.google.com/protocol-buffers/ * See LICENSE. * * @author Lee Rhodes * @author Roman Leventov */ final class Utf8 { private Utf8() { } //Decode static final int getCharsFromUtf8(final long offsetBytes, final int utf8LengthBytes, final Appendable dst, final long cumBaseOffset, final Object unsafeObj) throws IOException, Utf8CodingException { if ((dst instanceof CharBuffer) && ((CharBuffer) dst).hasArray()) { return getCharBufferCharsFromUtf8(offsetBytes, ((CharBuffer) dst), utf8LengthBytes, cumBaseOffset, unsafeObj); } //Decode Direct CharBuffers and all other Appendables final long address = cumBaseOffset + offsetBytes; // Optimize for 100% ASCII (Hotspot loves small simple top-level loops like this). // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII). // Need to keep this loop int-indexed, because it's faster for Hotspot JIT, it doesn't insert // savepoint polls on each iteration. int i = 0; for (; i < utf8LengthBytes; i++) { final byte b = unsafe.getByte(unsafeObj, address + i); if (!DecodeUtil.isOneByte(b)) { break; } dst.append((char) b); } if (i == utf8LengthBytes) { return i; } return getNonAsciiCharsFromUtf8(dst, address + i, address + utf8LengthBytes, unsafeObj, cumBaseOffset) + i; } /* * Optimize for heap CharBuffer manually, because Hotspot JIT doesn't itself unfold this * abstraction well (doesn't hoist array bound checks, etc.) */ private static int getCharBufferCharsFromUtf8(final long offsetBytes, final CharBuffer cbuf, final int utf8LengthBytes, final long cumBaseOffset, final Object unsafeObj) { final char[] carr = cbuf.array(); final int startCpos = cbuf.position() + cbuf.arrayOffset(); int cpos = startCpos; final int clim = cbuf.arrayOffset() + cbuf.limit(); final long address = cumBaseOffset + offsetBytes; int i = 0; //byte index // Optimize for 100% ASCII (Hotspot loves small simple top-level loops like this). // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII). final int cbufNoCheckLimit = Math.min(utf8LengthBytes, clim - cpos); // Need to keep this loop int-indexed, because it's faster for Hotspot JIT, it doesn't insert // savepoint polls on each iteration. for (; i < cbufNoCheckLimit; i++) { final byte b = unsafe.getByte(unsafeObj, address + i); if (!DecodeUtil.isOneByte(b)) { break; } // Not checking CharBuffer bounds! carr[cpos++] = (char) b; } for (; i < utf8LengthBytes; i++) { final byte b = unsafe.getByte(unsafeObj, address + i); if (!DecodeUtil.isOneByte(b)) { break; } checkCharBufferPos(cbuf, cpos, clim); carr[cpos++] = (char) b; } if (i == utf8LengthBytes) { cbuf.position(cpos - cbuf.arrayOffset()); return cpos - startCpos; } return getCharBufferNonAsciiCharsFromUtf8(cbuf, carr, cpos, clim, address + i, address + utf8LengthBytes, unsafeObj, cumBaseOffset) - cbuf.arrayOffset(); } private static int getCharBufferNonAsciiCharsFromUtf8(final CharBuffer cbuf, final char[] carr, int cpos, final int clim, long address, final long addressLimit, final Object unsafeObj, final long cumBaseOffset) { while (address < addressLimit) { final byte byte1 = unsafe.getByte(unsafeObj, address++); if (DecodeUtil.isOneByte(byte1)) { checkCharBufferPos(cbuf, cpos, clim); carr[cpos++] = (char) byte1; // It's common for there to be multiple ASCII characters in a run mixed in, so add an // extra optimized loop to take care of these runs. while (address < addressLimit) { final byte b = unsafe.getByte(unsafeObj, address); if (!DecodeUtil.isOneByte(b)) { break; } address++; checkCharBufferPos(cbuf, cpos, clim); carr[cpos++] = (char) b; } } else if (DecodeUtil.isTwoBytes(byte1)) { if (address >= addressLimit) { cbuf.position(cpos - cbuf.arrayOffset()); final long off = address - cumBaseOffset; final long limit = addressLimit - cumBaseOffset; throw Utf8CodingException.shortUtf8DecodeByteSequence(byte1, off, limit, 2); } checkCharBufferPos(cbuf, cpos, clim); DecodeUtil.handleTwoBytesCharBuffer( byte1, /* byte2 */ unsafe.getByte(unsafeObj, address++), cbuf, carr, cpos); cpos++; } else if (DecodeUtil.isThreeBytes(byte1)) { if (address >= (addressLimit - 1)) { cbuf.position(cpos - cbuf.arrayOffset()); final long off = address - cumBaseOffset; final long limit = addressLimit - cumBaseOffset; throw Utf8CodingException.shortUtf8DecodeByteSequence(byte1, off, limit, 3); } checkCharBufferPos(cbuf, cpos, clim); DecodeUtil.handleThreeBytesCharBuffer( byte1, /* byte2 */ unsafe.getByte(unsafeObj, address++), /* byte3 */ unsafe.getByte(unsafeObj, address++), cbuf, carr, cpos); cpos++; } else { if (address >= (addressLimit - 2)) { cbuf.position(cpos - cbuf.arrayOffset()); final long off = address - cumBaseOffset; final long limit = addressLimit - cumBaseOffset; throw Utf8CodingException.shortUtf8DecodeByteSequence(byte1, off, limit, 4); } if (cpos >= (clim - 1)) { cbuf.position(cpos - cbuf.arrayOffset()); throw new BufferOverflowException(); } DecodeUtil.handleFourBytesCharBuffer( byte1, /* byte2 */ unsafe.getByte(unsafeObj, address++), /* byte3 */ unsafe.getByte(unsafeObj, address++), /* byte4 */ unsafe.getByte(unsafeObj, address++), cbuf, carr, cpos); cpos += 2; } } cbuf.position(cpos - cbuf.arrayOffset()); return cpos; } //Decodes into Appendable destination //returns num of chars decoded private static int getNonAsciiCharsFromUtf8(final Appendable dst, long address, final long addressLimit, final Object unsafeObj, final long cumBaseOffset) throws IOException { int chars = 0; while (address < addressLimit) { final byte byte1 = unsafe.getByte(unsafeObj, address++); if (DecodeUtil.isOneByte(byte1)) { dst.append((char) byte1); chars++; // It's common for there to be multiple ASCII characters in a run mixed in, so add an // extra optimized loop to take care of these runs. while (address < addressLimit) { final byte b = unsafe.getByte(unsafeObj, address); if (!DecodeUtil.isOneByte(b)) { break; } address++; dst.append((char) b); chars++; } } else if (DecodeUtil.isTwoBytes(byte1)) { if (address >= addressLimit) { final long off = address - cumBaseOffset; final long limit = addressLimit - cumBaseOffset; throw Utf8CodingException.shortUtf8DecodeByteSequence(byte1, off, limit, 2); } DecodeUtil.handleTwoBytes( byte1, /* byte2 */ unsafe.getByte(unsafeObj, address++), dst); chars++; } else if (DecodeUtil.isThreeBytes(byte1)) { if (address >= (addressLimit - 1)) { final long off = address - cumBaseOffset; final long limit = addressLimit - cumBaseOffset; throw Utf8CodingException.shortUtf8DecodeByteSequence(byte1, off, limit, 3); } DecodeUtil.handleThreeBytes( byte1, /* byte2 */ unsafe.getByte(unsafeObj, address++), /* byte3 */ unsafe.getByte(unsafeObj, address++), dst); chars++; } else { if (address >= (addressLimit - 2)) { final long off = address - cumBaseOffset; final long limit = addressLimit - cumBaseOffset; throw Utf8CodingException.shortUtf8DecodeByteSequence(byte1, off, limit, 4); } DecodeUtil.handleFourBytes( byte1, /* byte2 */ unsafe.getByte(unsafeObj, address++), /* byte3 */ unsafe.getByte(unsafeObj, address++), /* byte4 */ unsafe.getByte(unsafeObj, address++), dst); chars += 2; } } return chars; } private static void checkCharBufferPos(final CharBuffer cbuf, final int cpos, final int clim) { if (cpos == clim) { cbuf.position(cpos - cbuf.arrayOffset()); throw new BufferOverflowException(); } } /******************/ //Encode static long putCharsToUtf8(final long offsetBytes, final CharSequence src, final long capacityBytes, final long cumBaseOffset, final Object unsafeObj) { int cIdx = 0; //src character index long bIdx = cumBaseOffset + offsetBytes; //byte index long bCnt = 0; //bytes inserted final long byteLimit = cumBaseOffset + capacityBytes; //unsafe index limit final int utf16Length = src.length(); //Quickly dispatch an ASCII sequence for (char c; (cIdx < utf16Length) && ((cIdx + bIdx) < byteLimit) && ((c = src.charAt(cIdx)) < 0x80); cIdx++, bCnt++) { unsafe.putByte(unsafeObj, bIdx + cIdx, (byte) c); } //encountered a non-ascii character if (cIdx == utf16Length) { //done. // next relative byte index in memory is (bIdx + utf16Length) - cumBaseOffset. return bCnt; } bIdx += cIdx; //bytes == characters for ascii for (char c; cIdx < utf16Length; cIdx++) { //process the remaining characters c = src.charAt(cIdx); if ((c < 0x80) && (bIdx < byteLimit)) { //Encode ASCII, 0 through 0x007F. unsafe.putByte(unsafeObj, bIdx++, (byte) c); bCnt++; } else //c MUST BE >= 0x0080 || j >= byteLimit if ((c < 0x800) && (bIdx < (byteLimit - 1))) { //Encode 0x80 through 0x7FF. //This is for almost all Latin-script alphabets plus Greek, Cyrillic, Hebrew, Arabic, etc. //We must have target space for at least 2 Utf8 bytes. unsafe.putByte(unsafeObj, bIdx++, (byte) ((0xF << 6) | (c >>> 6))); unsafe.putByte(unsafeObj, bIdx++, (byte) (0x80 | (0x3F & c))); bCnt += 2; } else //c > 0x800 || j >= byteLimit - 1 || j >= byteLimit if ( !isSurrogate(c) && (bIdx < (byteLimit - 2)) ) { //Encode the remainder of the BMP that are not surrogates: // 0x0800 thru 0xD7FF; 0xE000 thru 0xFFFF, the max single-char code point //We must have target space for at least 3 Utf8 bytes. unsafe.putByte(unsafeObj, bIdx++, (byte) ((0xF << 5) | (c >>> 12))); unsafe.putByte(unsafeObj, bIdx++, (byte) (0x80 | (0x3F & (c >>> 6)))); unsafe.putByte(unsafeObj, bIdx++, (byte) (0x80 | (0x3F & c))); bCnt += 3; } else { //c is a surrogate || j >= byteLimit - 2 || j >= byteLimit - 1 || j >= byteLimit //At this point we are either: // 1) Attempting to encode Code Points outside the BMP. // // The only way to properly encode code points outside the BMP into Utf8 bytes is to use // High/Low pairs of surrogate characters. Therefore, we must have at least 2 source // characters remaining, at least 4 bytes of memory space remaining, and the next 2 // characters must be a valid surrogate pair. // // 2) There is insufficient Memory space to encode the current character from one of the // ifs above. // // We proceed assuming (1). If the following test fails, we move to an exception. final char low; if ( (cIdx <= (utf16Length - 2)) && (bIdx <= (byteLimit - 4)) && isSurrogatePair(c, low = src.charAt(cIdx + 1)) ) { //we are good cIdx++; //skip over low surrogate final int codePoint = toCodePoint(c, low); unsafe.putByte(unsafeObj, bIdx++, (byte) ((0xF << 4) | (codePoint >>> 18))); unsafe.putByte(unsafeObj, bIdx++, (byte) (0x80 | (0x3F & (codePoint >>> 12)))); unsafe.putByte(unsafeObj, bIdx++, (byte) (0x80 | (0x3F & (codePoint >>> 6)))); unsafe.putByte(unsafeObj, bIdx++, (byte) (0x80 | (0x3F & codePoint))); bCnt += 4; } else { //We are going to throw an exception. So we have time to figure out // what was wrong and hopefully throw an intelligent message! //check the BMP code point cases and their required memory limits if ( ((c < 0X0080) && (bIdx >= byteLimit)) || ((c < 0x0800) && (bIdx >= (byteLimit - 1))) || ((c < 0xFFFF) && (bIdx >= (byteLimit - 2))) ) { throw Utf8CodingException.outOfMemory(); } if (cIdx > (utf16Length - 2)) { //the last char is an unpaired surrogate throw Utf8CodingException.unpairedSurrogate(c); } if (bIdx > (byteLimit - 4)) { //4 Memory bytes required to encode a surrogate pair. final int remaining = (int) ((bIdx - byteLimit) + 4L); throw Utf8CodingException.shortUtf8EncodeByteLength(remaining); } if (!isSurrogatePair(c, src.charAt(cIdx + 1)) ) { //Not a surrogate pair. throw Utf8CodingException.illegalSurrogatePair(c, src.charAt(cIdx + 1)); } //This should not happen :) throw new IllegalArgumentException("Unknown Utf8 encoding exception"); } } } //final long localOffsetBytes = bIdx - cumBaseOffset; return bCnt; } /*****************/ /** * Utility methods for decoding UTF-8 bytes into {@link String}. Callers are responsible for * extracting bytes (possibly using Unsafe methods), and checking remaining bytes. All other * UTF-8 validity checks and codepoint conversions happen in this class. * * @see Wikipedia: UTF-8 */ private static class DecodeUtil { /** * Returns whether this is a single-byte UTF-8 encoding. * This is for ASCII. * *

Code Plane 0, Code Point range U+0000 to U+007F. * *

Bit Patterns: *

  • Byte 1: '0xxxxxxx'
  • *
* @param b the byte being tested * @return true if this is a single-byte UTF-8 encoding, i.e., b is ≥ 0. */ private static boolean isOneByte(final byte b) { return b >= 0; } /** * Returns whether this is the start of a two-byte UTF-8 encoding. One-byte encoding must * already be excluded. * This is for almost all Latin-script alphabets plus Greek, Cyrillic, Hebrew, Arabic, etc. * *

Code Plane 0, Code Point range U+0080 to U+07FF. * *

Bit Patterns: *

  • Byte 1: '110xxxxx'
  • *
  • Byte 2: '10xxxxxx'
  • *
* *

All bytes must be < 0xE0. * * @param b the byte being tested * @return true if this is the start of a two-byte UTF-8 encoding. */ private static boolean isTwoBytes(final byte b) { return b < (byte) 0xE0; } /** * Returns whether this is the start of a three-byte UTF-8 encoding. Two-byte encoding must * already be excluded. * This is for the rest of the BMP, which includes most common Chinese, Japanese and Korean * characters. * *

Code Plane 0, Code Point range U+0800 to U+FFFF. * *

Bit Patterns: *

  • Byte 1: '1110xxxx'
  • *
  • Byte 2: '10xxxxxx'
  • *
  • Byte 3: '10xxxxxx'
  • *
* All bytes must be less than 0xF0. * * @param b the byte being tested * @return true if this is the start of a three-byte UTF-8 encoding, i.e., b ≥ 0XF0. */ private static boolean isThreeBytes(final byte b) { return b < (byte) 0xF0; } /* * Note that if three-byte UTF-8 coding has been excluded and if the current byte is * ≥ 0XF0, it must be the start of a four-byte UTF-8 encoding. * This is for the less common CJKV characters, historic scripts, math symbols, emoji, etc. * *

Code Plane 1 through 16, Code Point range U+10000 to U+10FFFF. * *

Bit Patterns: *

  • Byte 1: '11110xxx'
  • *
  • Byte 2: '10xxxxxx'
  • *
  • Byte 3: '10xxxxxx'
  • *
  • Byte 4: '10xxxxxx'
  • *
*/ private static void handleTwoBytes( final byte byte1, final byte byte2, final Appendable dst) throws IOException, Utf8CodingException { // Simultaneously checks for illegal trailing-byte in leading position (<= '11000000') and // overlong 2-byte, '11000001'. if ((byte1 < (byte) 0xC2) || isNotTrailingByte(byte2)) { final byte[] out = new byte[] {byte1, byte2}; throw Utf8CodingException.illegalUtf8DecodeByteSequence(out); } dst.append((char) (((byte1 & 0x1F) << 6) | trailingByteValue(byte2))); } private static void handleTwoBytesCharBuffer( final byte byte1, final byte byte2, final CharBuffer cb, final char[] ca, final int cp) throws Utf8CodingException { // Simultaneously checks for illegal trailing-byte in leading position (<= '11000000') and // overlong 2-byte, '11000001'. if ((byte1 < (byte) 0xC2) || isNotTrailingByte(byte2)) { final byte[] out = new byte[] {byte1, byte2}; cb.position(cp - cb.arrayOffset()); throw Utf8CodingException.illegalUtf8DecodeByteSequence(out); } ca[cp] = (char) (((byte1 & 0x1F) << 6) | trailingByteValue(byte2)); } private static void handleThreeBytes( final byte byte1, final byte byte2, final byte byte3, final Appendable dst) throws IOException, Utf8CodingException { if (isNotTrailingByte(byte2) // overlong? 5 most significant bits must not all be zero || ((byte1 == (byte) 0xE0) && (byte2 < (byte) 0xA0)) // check for illegal surrogate codepoints || ((byte1 == (byte) 0xED) && (byte2 >= (byte) 0xA0)) || isNotTrailingByte(byte3)) { final byte[] out = new byte[] {byte1, byte2, byte3}; throw Utf8CodingException.illegalUtf8DecodeByteSequence(out); } dst.append((char) (((byte1 & 0x0F) << 12) | (trailingByteValue(byte2) << 6) | trailingByteValue(byte3))); } private static void handleThreeBytesCharBuffer( final byte byte1, final byte byte2, final byte byte3, final CharBuffer cb, final char[] ca, final int cp) throws Utf8CodingException { if (isNotTrailingByte(byte2) // overlong? 5 most significant bits must not all be zero || ((byte1 == (byte) 0xE0) && (byte2 < (byte) 0xA0)) // check for illegal surrogate codepoints || ((byte1 == (byte) 0xED) && (byte2 >= (byte) 0xA0)) || isNotTrailingByte(byte3)) { cb.position(cp - cb.arrayOffset()); final byte[] out = new byte[] {byte1, byte2, byte3}; throw Utf8CodingException.illegalUtf8DecodeByteSequence(out); } ca[cp] = (char) (((byte1 & 0x0F) << 12) | (trailingByteValue(byte2) << 6) | trailingByteValue(byte3)); } private static void handleFourBytes( final byte byte1, final byte byte2, final byte byte3, final byte byte4, final Appendable dst) throws IOException, Utf8CodingException { if (isNotTrailingByte(byte2) // Check that 1 <= plane <= 16. Tricky optimized form of: // valid 4-byte leading byte? // if (byte1 > (byte) 0xF4 || // overlong? 4 most significant bits must not all be zero // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || // codepoint larger than the highest code point (U+10FFFF)? // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) || ((((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0) || isNotTrailingByte(byte3) || isNotTrailingByte(byte4)) { final byte[] out = new byte[] { byte1, byte2, byte3, byte4 }; throw Utf8CodingException.illegalUtf8DecodeByteSequence(out); } final int codepoint = ((byte1 & 0x07) << 18) | (trailingByteValue(byte2) << 12) | (trailingByteValue(byte3) << 6) | trailingByteValue(byte4); dst.append(DecodeUtil.highSurrogate(codepoint)); dst.append(DecodeUtil.lowSurrogate(codepoint)); } private static void handleFourBytesCharBuffer( final byte byte1, final byte byte2, final byte byte3, final byte byte4, final CharBuffer cb, final char[] ca, final int cp) throws Utf8CodingException { if (isNotTrailingByte(byte2) // Check that 1 <= plane <= 16. Tricky optimized form of: // valid 4-byte leading byte? // if (byte1 > (byte) 0xF4 || // overlong? 4 most significant bits must not all be zero // byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 || // codepoint larger than the highest code point (U+10FFFF)? // byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F) || ((((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0) || isNotTrailingByte(byte3) || isNotTrailingByte(byte4)) { cb.position(cp - cb.arrayOffset()); final byte[] out = new byte[] { byte1, byte2, byte3, byte4 }; throw Utf8CodingException.illegalUtf8DecodeByteSequence(out); } final int codepoint = ((byte1 & 0x07) << 18) | (trailingByteValue(byte2) << 12) | (trailingByteValue(byte3) << 6) | trailingByteValue(byte4); ca[cp] = DecodeUtil.highSurrogate(codepoint); ca[cp + 1] = DecodeUtil.lowSurrogate(codepoint); } /* * Returns whether the byte is not a valid continuation of the form '10XXXXXX'. */ private static boolean isNotTrailingByte(final byte b) { return b > (byte) 0xBF; } /* * Returns the actual value of the trailing byte (removes the prefix '10') for composition. */ private static int trailingByteValue(final byte b) { return b & 0x3F; } private static char highSurrogate(final int codePoint) { return (char) ((Character.MIN_HIGH_SURROGATE - (Character.MIN_SUPPLEMENTARY_CODE_POINT >>> 10)) + (codePoint >>> 10)); } private static char lowSurrogate(final int codePoint) { return (char) (Character.MIN_LOW_SURROGATE + (codePoint & 0x3ff)); } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy