org.apache.datasketches.memory.Utf8 Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of datasketches-memory Show documentation
High-performance native memory access.
There is a newer version: 5.0.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.datasketches.memory;

import static java.lang.Character.isSurrogate;
import static java.lang.Character.isSurrogatePair;
import static java.lang.Character.toCodePoint;
import static org.apache.datasketches.memory.UnsafeUtil.unsafe;

import java.io.IOException;
import java.nio.BufferOverflowException;
import java.nio.CharBuffer;

/**
 * Encoding and decoding implementations of {@link WritableMemory#putCharsToUtf8} and
 * {@link Memory#getCharsFromUtf8}.
 *
 * <p>This is specifically designed to reduce the production of intermediate objects (garbage),
 * thus significantly reducing pressure on the JVM Garbage Collector.
 *
 * <p>UTF-8 encoding/decoding is adapted from
 * https://github.com/protocolbuffers/protobuf/blob/master/java/core/src/main/java/com/google/protobuf/Utf8.java
 *
 * <p>Copyright 2008 Google Inc.  All rights reserved.
 * https://developers.google.com/protocol-buffers/
 * See LICENSE.
 *
 * @author Lee Rhodes
 * @author Roman Leventov
 */
final class Utf8 {

  private Utf8() { }

  //Decode
  /**
   * Decodes a span of UTF-8 bytes and appends the resulting UTF-16 chars to {@code dst}.
   *
   * <p>When {@code dst} is a {@link CharBuffer} that reports {@code hasArray()}, the work is
   * handed to {@code getCharBufferCharsFromUtf8} and that method's result is returned as is.
   * Every other destination is fed one char at a time through {@link Appendable#append(char)}.
   *
   * @param offsetBytes added to {@code cumBaseOffset} to form the address of the first byte read
   * @param utf8LengthBytes the number of UTF-8 bytes to decode
   * @param dst the destination of the decoded chars
   * @param cumBaseOffset added to {@code offsetBytes} to form the address of the first byte read
   * @param unsafeObj the object passed to {@code unsafe.getByte} together with each address
   * @return on the {@code Appendable} path, the number of chars appended
   * @throws IOException if {@code dst.append} throws it
   * @throws Utf8CodingException if thrown while decoding a multi-byte sequence
   */
  static final int getCharsFromUtf8(final long offsetBytes, final int utf8LengthBytes,
      final Appendable dst, final long cumBaseOffset, final Object unsafeObj)
          throws IOException, Utf8CodingException {

    if (dst instanceof CharBuffer) {
      final CharBuffer candidate = (CharBuffer) dst;
      if (candidate.hasArray()) {
        return getCharBufferCharsFromUtf8(offsetBytes, candidate, utf8LengthBytes,
            cumBaseOffset, unsafeObj);
      }
    }

    // From here on: direct CharBuffers and all other Appendables.
    final long startAddress = cumBaseOffset + offsetBytes;

    // Fast path for pure ASCII: a small, simple, int-indexed loop. The int index is deliberate,
    // it keeps Hotspot JIT from inserting safepoint polls on each iteration.
    // The loop ends at the first byte >= 0x80 (i.e. non-ASCII) or when all bytes are consumed.
    int asciiCount = 0;
    while (asciiCount < utf8LengthBytes) {
      final byte current = unsafe.getByte(unsafeObj, startAddress + asciiCount);
      if (!DecodeUtil.isOneByte(current)) {
        break;
      }
      dst.append((char) current);
      asciiCount++;
    }

    if (asciiCount != utf8LengthBytes) {
      // Something other than plain ASCII remains; the general decoder finishes the job.
      final int remainingChars = getNonAsciiCharsFromUtf8(dst, startAddress + asciiCount,
          startAddress + utf8LengthBytes, unsafeObj, cumBaseOffset);
      return remainingChars + asciiCount;
    }
    return asciiCount;
  }

  /**
   * Decodes UTF-8 bytes straight into the backing array of a heap {@link CharBuffer}.
   *
   * <p>This path is optimized manually, because Hotspot JIT doesn't itself unfold the CharBuffer
   * abstraction well (doesn't hoist array bound checks, etc.)
   *
   * <p>The buffer's position is advanced past the chars that were written, both on normal
   * return and before a {@link BufferOverflowException} is raised by the bounds checks.
   *
   * @param offsetBytes added to {@code cumBaseOffset} to form the address of the first byte read
   * @param cbuf the destination; its {@code array()} is written directly
   * @param utf8LengthBytes the number of UTF-8 bytes to decode
   * @param cumBaseOffset added to {@code offsetBytes} to form the address of the first byte read
   * @param unsafeObj the object passed to {@code unsafe.getByte} together with each address
   * @return the number of chars written into the buffer, independent of the buffer's starting
   *     position
   */
  private static int getCharBufferCharsFromUtf8(final long offsetBytes, final CharBuffer cbuf,
        final int utf8LengthBytes, final long cumBaseOffset, final Object unsafeObj) {
    final char[] carr = cbuf.array();
    final int arrayOffset = cbuf.arrayOffset();
    final int startCpos = cbuf.position() + arrayOffset;
    int cpos = startCpos;
    final int clim = arrayOffset + cbuf.limit();
    final long address = cumBaseOffset + offsetBytes;
    int i = 0; //byte index

    // Optimize for 100% ASCII (Hotspot loves small simple top-level loops like this).
    // This simple loop stops when we encounter a byte >= 0x80 (i.e. non-ASCII).
    // Within cbufNoCheckLimit iterations the destination cannot overflow, so no bounds check.
    final int cbufNoCheckLimit = Math.min(utf8LengthBytes, clim - cpos);
    // Need to keep this loop int-indexed, because it's faster for Hotspot JIT, it doesn't insert
    // safepoint polls on each iteration.
    for (; i < cbufNoCheckLimit; i++) {
      final byte b = unsafe.getByte(unsafeObj, address + i);
      if (!DecodeUtil.isOneByte(b)) {
        break;
      }
      // Not checking CharBuffer bounds!
      carr[cpos++] = (char) b;
    }

    // Any ASCII bytes beyond the no-check window are written with an explicit bounds check.
    for (; i < utf8LengthBytes; i++) {
      final byte b = unsafe.getByte(unsafeObj, address + i);
      if (!DecodeUtil.isOneByte(b)) {
        break;
      }
      checkCharBufferPos(cbuf, cpos, clim);
      carr[cpos++] = (char) b;
    }
    if (i == utf8LengthBytes) {
      cbuf.position(cpos - arrayOffset);
      return cpos - startCpos;
    }

    // The general decoder returns the final index into carr. Subtracting startCpos (rather than
    // only the array offset) yields the count of chars written, matching the all-ASCII return
    // above even when the buffer's starting position was greater than zero.
    final int endCpos = getCharBufferNonAsciiCharsFromUtf8(cbuf, carr, cpos, clim, address + i,
        address + utf8LengthBytes, unsafeObj, cumBaseOffset);
    return endCpos - startCpos;
  }

  /**
   * General UTF-8 decoder that writes into the backing array of a heap {@link CharBuffer}.
   *
   * <p>Before any exception raised directly by this method, and on normal return, the buffer's
   * position is set to {@code cpos - cbuf.arrayOffset()}.
   *
   * @param cbuf the buffer whose position is maintained
   * @param carr the array the decoded chars are stored into
   * @param cpos index into {@code carr} of the next char to write
   * @param clim index into {@code carr} at which writing must stop
   * @param address address of the next byte to read
   * @param addressLimit address one past the last byte to read
   * @param unsafeObj the object passed to {@code unsafe.getByte} together with each address
   * @param cumBaseOffset subtracted from addresses when reporting offsets in exceptions
   * @return the index into {@code carr} one past the last char written
   */
  private static int getCharBufferNonAsciiCharsFromUtf8(final CharBuffer cbuf, final char[] carr,
      int cpos, final int clim, long address, final long addressLimit, final Object unsafeObj,
      final long cumBaseOffset) {

    while (address < addressLimit) {
      final byte lead = unsafe.getByte(unsafeObj, address++);

      if (DecodeUtil.isOneByte(lead)) {
        checkCharBufferPos(cbuf, cpos, clim);
        carr[cpos++] = (char) lead;
        // ASCII chars commonly arrive in runs mixed in with other text, so drain the whole run
        // here with a tight loop before going back to the general dispatch.
        for (byte next;
            (address < addressLimit)
                && DecodeUtil.isOneByte(next = unsafe.getByte(unsafeObj, address));
            address++) {
          checkCharBufferPos(cbuf, cpos, clim);
          carr[cpos++] = (char) next;
        }
        continue;
      }

      // Total length of the sequence announced by the lead byte.
      final int seqLen;
      if (DecodeUtil.isTwoBytes(lead)) {
        seqLen = 2;
      } else if (DecodeUtil.isThreeBytes(lead)) {
        seqLen = 3;
      } else {
        seqLen = 4;
      }

      // The lead byte is already consumed; (seqLen - 1) more bytes must lie below addressLimit.
      if (address >= (addressLimit - (seqLen - 2))) {
        cbuf.position(cpos - cbuf.arrayOffset());
        throw Utf8CodingException.shortUtf8DecodeByteSequence(
            lead, address - cumBaseOffset, addressLimit - cumBaseOffset, seqLen);
      }

      if (seqLen == 4) {
        // A four-byte sequence becomes a surrogate pair and therefore needs room for two chars.
        if (cpos >= (clim - 1)) {
          cbuf.position(cpos - cbuf.arrayOffset());
          throw new BufferOverflowException();
        }
        final byte second = unsafe.getByte(unsafeObj, address++);
        final byte third = unsafe.getByte(unsafeObj, address++);
        final byte fourth = unsafe.getByte(unsafeObj, address++);
        DecodeUtil.handleFourBytesCharBuffer(lead, second, third, fourth, cbuf, carr, cpos);
        cpos += 2;
        continue;
      }

      // Two- and three-byte sequences each produce exactly one char.
      checkCharBufferPos(cbuf, cpos, clim);
      final byte b2 = unsafe.getByte(unsafeObj, address++);
      if (seqLen == 2) {
        DecodeUtil.handleTwoBytesCharBuffer(lead, b2, cbuf, carr, cpos);
      } else {
        final byte b3 = unsafe.getByte(unsafeObj, address++);
        DecodeUtil.handleThreeBytesCharBuffer(lead, b2, b3, cbuf, carr, cpos);
      }
      cpos++;
    }

    cbuf.position(cpos - cbuf.arrayOffset());
    return cpos;
  }

  //Decodes into Appendable destination
  //returns num of chars decoded
  /**
   * General UTF-8 decoder that appends the decoded chars to an {@link Appendable}.
   *
   * @param dst the destination of the decoded chars
   * @param address address of the next byte to read
   * @param addressLimit address one past the last byte to read
   * @param unsafeObj the object passed to {@code unsafe.getByte} together with each address
   * @param cumBaseOffset subtracted from addresses when reporting offsets in exceptions
   * @return the number of chars appended; a four-byte sequence counts as two chars
   * @throws IOException if {@code dst.append} throws it
   */
  private static int getNonAsciiCharsFromUtf8(final Appendable dst, long address,
      final long addressLimit, final Object unsafeObj, final long cumBaseOffset)
          throws IOException {
    int appended = 0;

    while (address < addressLimit) {
      final byte lead = unsafe.getByte(unsafeObj, address++);

      if (DecodeUtil.isOneByte(lead)) {
        dst.append((char) lead);
        appended++;
        // ASCII chars commonly arrive in runs mixed in with other text, so drain the whole run
        // here with a tight loop before going back to the general dispatch.
        for (byte next;
            (address < addressLimit)
                && DecodeUtil.isOneByte(next = unsafe.getByte(unsafeObj, address));
            address++) {
          dst.append((char) next);
          appended++;
        }
        continue;
      }

      // Total length of the sequence announced by the lead byte.
      final int seqLen;
      if (DecodeUtil.isTwoBytes(lead)) {
        seqLen = 2;
      } else if (DecodeUtil.isThreeBytes(lead)) {
        seqLen = 3;
      } else {
        seqLen = 4;
      }

      // The lead byte is already consumed; (seqLen - 1) more bytes must lie below addressLimit.
      if (address >= (addressLimit - (seqLen - 2))) {
        throw Utf8CodingException.shortUtf8DecodeByteSequence(
            lead, address - cumBaseOffset, addressLimit - cumBaseOffset, seqLen);
      }

      final byte b2 = unsafe.getByte(unsafeObj, address++);
      if (seqLen == 2) {
        DecodeUtil.handleTwoBytes(lead, b2, dst);
        appended++;
        continue;
      }

      final byte b3 = unsafe.getByte(unsafeObj, address++);
      if (seqLen == 3) {
        DecodeUtil.handleThreeBytes(lead, b2, b3, dst);
        appended++;
        continue;
      }

      final byte b4 = unsafe.getByte(unsafeObj, address++);
      DecodeUtil.handleFourBytes(lead, b2, b3, b4, dst);
      appended += 2; // high and low surrogate
    }
    return appended;
  }

  /**
   * Guards a single-char write into a heap CharBuffer's backing array. When the write index has
   * reached the limit index, the buffer's position is set to {@code cpos - cbuf.arrayOffset()}
   * and a {@link BufferOverflowException} is thrown; otherwise this does nothing.
   *
   * @param cbuf the buffer whose position is updated before throwing
   * @param cpos the index at which the caller is about to write
   * @param clim the index at which writing must stop
   */
  private static void checkCharBufferPos(final CharBuffer cbuf, final int cpos, final int clim) {
    if (cpos != clim) {
      return;
    }
    final int bufferPosition = cpos - cbuf.arrayOffset();
    cbuf.position(bufferPosition);
    throw new BufferOverflowException();
  }

  /******************/
  //Encode
  /**
   * Encodes the UTF-16 chars of {@code src} as UTF-8 bytes, writing each byte with
   * {@code unsafe.putByte(unsafeObj, address, value)}.
   *
   * <p>Writing starts at address {@code cumBaseOffset + offsetBytes}. No byte is written at or
   * beyond address {@code cumBaseOffset + capacityBytes}; if the encoding does not fit, or if
   * {@code src} contains a surrogate that is not part of a valid high/low pair, an exception
   * obtained from {@code Utf8CodingException} is thrown. Bytes written before the failure are
   * not undone.
   *
   * @param offsetBytes added to {@code cumBaseOffset} to form the address of the first byte
   * @param src the chars to encode
   * @param capacityBytes added to {@code cumBaseOffset} to form the exclusive address limit
   * @param cumBaseOffset the base that {@code offsetBytes} and {@code capacityBytes} are added to
   * @param unsafeObj the object passed to {@code unsafe.putByte} together with each address
   * @return the number of bytes written
   */
  static long putCharsToUtf8(final long offsetBytes, final CharSequence src,
        final long capacityBytes, final long cumBaseOffset, final Object unsafeObj) {

    int cIdx = 0; //src character index
    long bIdx = cumBaseOffset + offsetBytes; //byte index
    long bCnt = 0; //bytes inserted

    final long byteLimit = cumBaseOffset + capacityBytes; //unsafe index limit

    final int utf16Length = src.length();
    //Quickly dispatch an ASCII sequence. In this loop one char is exactly one byte, so the
    // target address is bIdx + cIdx and bIdx itself is not advanced until afterwards.
    for (char c;
        (cIdx < utf16Length) && ((cIdx + bIdx) < byteLimit) && ((c = src.charAt(cIdx)) < 0x80);
        cIdx++, bCnt++) {
      unsafe.putByte(unsafeObj, bIdx + cIdx, (byte) c);
    }
    if (cIdx == utf16Length) { //everything was ASCII and it all fit: done.
      return bCnt;
    }
    //either a non-ascii character was encountered or the byte limit was reached
    bIdx += cIdx; //bytes == characters for ascii

    for (; cIdx < utf16Length; cIdx++) { //process the remaining characters
      final char c = src.charAt(cIdx);

      if ((c < 0x80) && (bIdx < byteLimit)) {
        //Encode ASCII, 0 through 0x007F.
        unsafe.putByte(unsafeObj, bIdx++, (byte) c);
        bCnt++;
      }
      else if ((c < 0x800) && (bIdx < (byteLimit - 1))) {
        //Here: c >= 0x0080 || bIdx >= byteLimit
        //Encode 0x80 through 0x7FF.
        //This is for almost all Latin-script alphabets plus Greek, Cyrillic, Hebrew, Arabic, etc.
        //We must have target space for at least 2 Utf8 bytes.
        unsafe.putByte(unsafeObj, bIdx++, (byte) (0xC0 | (c >>> 6)));   //110xxxxx
        unsafe.putByte(unsafeObj, bIdx++, (byte) (0x80 | (0x3F & c))); //10xxxxxx
        bCnt += 2;
      }
      else if ( !isSurrogate(c) && (bIdx < (byteLimit - 2)) ) {
        //Here: c >= 0x0800 || bIdx >= byteLimit - 1
        //Encode the remainder of the BMP that are not surrogates:
        //  0x0800 thru 0xD7FF; 0xE000 thru 0xFFFF, the max single-char code point
        //We must have target space for at least 3 Utf8 bytes.
        unsafe.putByte(unsafeObj, bIdx++, (byte) (0xE0 | (c >>> 12)));          //1110xxxx
        unsafe.putByte(unsafeObj, bIdx++, (byte) (0x80 | (0x3F & (c >>> 6)))); //10xxxxxx
        unsafe.putByte(unsafeObj, bIdx++, (byte) (0x80 | (0x3F & c)));         //10xxxxxx
        bCnt += 3;
      }
      else {
        //Here: c is a surrogate || bIdx >= byteLimit - 2

        //At this point we are either:
        // 1) Attempting to encode Code Points outside the BMP.
        //
        //    The only way to properly encode code points outside the BMP into Utf8 bytes is to use
        //    High/Low pairs of surrogate characters. Therefore, we must have at least 2 source
        //    characters remaining, at least 4 bytes of memory space remaining, and the next 2
        //    characters must be a valid surrogate pair.
        //
        // 2) There is insufficient Memory space to encode the current character from one of the
        //    ifs above.
        //
        // We proceed assuming (1). If the following test fails, we move to an exception.

        final char low;
        if ( (cIdx <= (utf16Length - 2))
            && (bIdx <= (byteLimit - 4))
            && isSurrogatePair(c, low = src.charAt(cIdx + 1)) ) { //we are good
          cIdx++; //skip over low surrogate
          final int codePoint = toCodePoint(c, low);
          unsafe.putByte(unsafeObj, bIdx++, (byte) (0xF0 | (codePoint >>> 18)));          //11110xxx
          unsafe.putByte(unsafeObj, bIdx++, (byte) (0x80 | (0x3F & (codePoint >>> 12))));
          unsafe.putByte(unsafeObj, bIdx++, (byte) (0x80 | (0x3F & (codePoint >>> 6))));
          unsafe.putByte(unsafeObj, bIdx++, (byte) (0x80 | (0x3F & codePoint)));
          bCnt += 4;
        }
        else {
          //We are going to throw an exception. So we have time to figure out
          // what was wrong and hopefully throw an intelligent message!

          //Check the BMP code point cases and their required memory limits.
          //The last clause applies to every char value up to and including 0xFFFF, so that a
          // non-surrogate U+FFFF that does not fit is reported as out-of-memory rather than
          // being misdiagnosed below as a surrogate problem.
          if (   ((c < 0x0080) && (bIdx >= byteLimit))
              || ((c < 0x0800) && (bIdx >= (byteLimit - 1)))
              || (bIdx >= (byteLimit - 2)) ) {
            throw Utf8CodingException.outOfMemory();
          }

          if (cIdx > (utf16Length - 2)) { //the last char is an unpaired surrogate
            throw Utf8CodingException.unpairedSurrogate(c);
          }

          if (bIdx > (byteLimit - 4)) {
            //4 Memory bytes required to encode a surrogate pair.
            final int remaining = (int) ((bIdx - byteLimit) + 4L);
            throw Utf8CodingException.shortUtf8EncodeByteLength(remaining);
          }

          if (!isSurrogatePair(c, src.charAt(cIdx + 1)) ) {
            //Not a surrogate pair.
            throw Utf8CodingException.illegalSurrogatePair(c, src.charAt(cIdx + 1));
          }

          //This should not happen :)
          throw new IllegalArgumentException("Unknown Utf8 encoding exception");
        }
      }
    }
    return bCnt;
  }

  /*****************/
  /**
   * Utility methods for decoding UTF-8 bytes into UTF-16 chars. Callers are responsible for
   * extracting bytes (possibly using Unsafe methods), and checking remaining bytes. All other
   * UTF-8 validity checks and codepoint conversions happen in this class.
   *
   * @see <a href="https://en.wikipedia.org/wiki/UTF-8">Wikipedia: UTF-8</a>
   */
  private static class DecodeUtil {

    //Lead-byte classification

    /**
     * Returns whether this is a single-byte UTF-8 encoding.
     * This is for ASCII.
     *
     * <p>Code Plane 0, Code Point range U+0000 to U+007F.
     *
     * <p>Bit Patterns:
     * <ul><li>Byte 1: '0xxxxxxx'</li>
     * </ul>
     *
     * @param b the byte being tested
     * @return true if this is a single-byte UTF-8 encoding, i.e., b is &ge; 0.
     */
    private static boolean isOneByte(final byte b) {
      return b >= 0;
    }

    /**
     * Returns whether this is the start of a two-byte UTF-8 encoding. One-byte encoding must
     * already be excluded.
     * This is for almost all Latin-script alphabets plus Greek, Cyrillic, Hebrew, Arabic, etc.
     *
     * <p>Code Plane 0, Code Point range U+0080 to U+07FF.
     *
     * <p>Bit Patterns:
     * <ul><li>Byte 1: '110xxxxx'</li>
     * <li>Byte 2: '10xxxxxx'</li>
     * </ul>
     *
     * <p>The test is a signed byte comparison against 0xE0.
     *
     * @param b the byte being tested
     * @return true if this is the start of a two-byte UTF-8 encoding, i.e., b &lt; 0xE0.
     */
    private static boolean isTwoBytes(final byte b) {
      return b < (byte) 0xE0;
    }

    /**
     * Returns whether this is the start of a three-byte UTF-8 encoding. Two-byte encoding must
     * already be excluded.
     * This is for the rest of the BMP, which includes most common Chinese, Japanese and Korean
     * characters.
     *
     * <p>Code Plane 0, Code Point range U+0800 to U+FFFF.
     *
     * <p>Bit Patterns:
     * <ul><li>Byte 1: '1110xxxx'</li>
     * <li>Byte 2: '10xxxxxx'</li>
     * <li>Byte 3: '10xxxxxx'</li>
     * </ul>
     *
     * <p>The test is a signed byte comparison against 0xF0.
     *
     * @param b the byte being tested
     * @return true if this is the start of a three-byte UTF-8 encoding, i.e., b &lt; 0xF0.
     */
    private static boolean isThreeBytes(final byte b) {
      return b < (byte) 0xF0;
    }

    /*
     * Note that if three-byte UTF-8 coding has been excluded and if the current byte is
     * >= 0xF0, it must be the start of a four-byte UTF-8 encoding.
     * This is for the less common CJKV characters, historic scripts, math symbols, emoji, etc.
     *
     * Code Plane 1 through 16, Code Point range U+10000 to U+10FFFF.
     *
     * Bit Patterns:
     *   Byte 1: '11110xxx'
     *   Byte 2: '10xxxxxx'
     *   Byte 3: '10xxxxxx'
     *   Byte 4: '10xxxxxx'
     */

    //Two-byte sequences

    /*
     * Validates a two-byte sequence and appends the decoded char to dst.
     */
    private static void handleTwoBytes(
        final byte byte1, final byte byte2,
        final Appendable dst)
        throws IOException, Utf8CodingException {
      if (isIllegalTwoBytes(byte1, byte2)) {
        throw Utf8CodingException.illegalUtf8DecodeByteSequence(new byte[] {byte1, byte2});
      }
      dst.append(composeTwoBytes(byte1, byte2));
    }

    /*
     * Validates a two-byte sequence and stores the decoded char at ca[cp]. On an illegal
     * sequence the buffer position is set to cp - cb.arrayOffset() before throwing.
     */
    private static void handleTwoBytesCharBuffer(
        final byte byte1, final byte byte2,
        final CharBuffer cb, final char[] ca, final int cp)
        throws Utf8CodingException {
      if (isIllegalTwoBytes(byte1, byte2)) {
        cb.position(cp - cb.arrayOffset());
        throw Utf8CodingException.illegalUtf8DecodeByteSequence(new byte[] {byte1, byte2});
      }
      ca[cp] = composeTwoBytes(byte1, byte2);
    }

    /*
     * Simultaneously checks for illegal trailing-byte in leading position (<= '11000000') and
     * overlong 2-byte, '11000001'; also rejects a second byte that is not a continuation.
     */
    private static boolean isIllegalTwoBytes(final byte byte1, final byte byte2) {
      return (byte1 < (byte) 0xC2) || isNotTrailingByte(byte2);
    }

    private static char composeTwoBytes(final byte byte1, final byte byte2) {
      return (char) (((byte1 & 0x1F) << 6) | trailingByteValue(byte2));
    }

    //Three-byte sequences

    /*
     * Validates a three-byte sequence and appends the decoded char to dst.
     */
    private static void handleThreeBytes(
        final byte byte1, final byte byte2, final byte byte3,
        final Appendable dst)
        throws IOException, Utf8CodingException {
      if (isIllegalThreeBytes(byte1, byte2, byte3)) {
        throw Utf8CodingException.illegalUtf8DecodeByteSequence(
            new byte[] {byte1, byte2, byte3});
      }
      dst.append(composeThreeBytes(byte1, byte2, byte3));
    }

    /*
     * Validates a three-byte sequence and stores the decoded char at ca[cp]. On an illegal
     * sequence the buffer position is set to cp - cb.arrayOffset() before throwing.
     */
    private static void handleThreeBytesCharBuffer(
        final byte byte1, final byte byte2, final byte byte3,
        final CharBuffer cb, final char[] ca, final int cp)
        throws Utf8CodingException {
      if (isIllegalThreeBytes(byte1, byte2, byte3)) {
        cb.position(cp - cb.arrayOffset());
        throw Utf8CodingException.illegalUtf8DecodeByteSequence(
            new byte[] {byte1, byte2, byte3});
      }
      ca[cp] = composeThreeBytes(byte1, byte2, byte3);
    }

    private static boolean isIllegalThreeBytes(
        final byte byte1, final byte byte2, final byte byte3) {
      return isNotTrailingByte(byte2)
          // overlong? 5 most significant bits must not all be zero
          || ((byte1 == (byte) 0xE0) && (byte2 < (byte) 0xA0))
          // check for illegal surrogate codepoints
          || ((byte1 == (byte) 0xED) && (byte2 >= (byte) 0xA0))
          || isNotTrailingByte(byte3);
    }

    private static char composeThreeBytes(
        final byte byte1, final byte byte2, final byte byte3) {
      return (char)
          (((byte1 & 0x0F) << 12) | (trailingByteValue(byte2) << 6) | trailingByteValue(byte3));
    }

    //Four-byte sequences

    /*
     * Validates a four-byte sequence and appends the resulting high and low surrogates to dst.
     */
    private static void handleFourBytes(
        final byte byte1, final byte byte2, final byte byte3, final byte byte4,
        final Appendable dst)
        throws IOException, Utf8CodingException {
      if (isIllegalFourBytes(byte1, byte2, byte3, byte4)) {
        throw Utf8CodingException.illegalUtf8DecodeByteSequence(
            new byte[] { byte1, byte2, byte3, byte4 });
      }
      final int codepoint = composeFourBytes(byte1, byte2, byte3, byte4);
      dst.append(DecodeUtil.highSurrogate(codepoint));
      dst.append(DecodeUtil.lowSurrogate(codepoint));
    }

    /*
     * Validates a four-byte sequence and stores the resulting high and low surrogates at
     * ca[cp] and ca[cp + 1]. On an illegal sequence the buffer position is set to
     * cp - cb.arrayOffset() before throwing.
     */
    private static void handleFourBytesCharBuffer(
        final byte byte1, final byte byte2, final byte byte3, final byte byte4,
        final CharBuffer cb, final char[] ca, final int cp)
        throws Utf8CodingException {
      if (isIllegalFourBytes(byte1, byte2, byte3, byte4)) {
        cb.position(cp - cb.arrayOffset());
        throw Utf8CodingException.illegalUtf8DecodeByteSequence(
            new byte[] { byte1, byte2, byte3, byte4 });
      }
      final int codepoint = composeFourBytes(byte1, byte2, byte3, byte4);
      ca[cp] = DecodeUtil.highSurrogate(codepoint);
      ca[cp + 1] = DecodeUtil.lowSurrogate(codepoint);
    }

    private static boolean isIllegalFourBytes(
        final byte byte1, final byte byte2, final byte byte3, final byte byte4) {
      return isNotTrailingByte(byte2)
          // Check that 1 <= plane <= 16. Tricky optimized form of:
          //   valid 4-byte leading byte?
          // if (byte1 > (byte) 0xF4 ||
          //   overlong? 4 most significant bits must not all be zero
          //     byte1 == (byte) 0xF0 && byte2 < (byte) 0x90 ||
          //   codepoint larger than the highest code point (U+10FFFF)?
          //     byte1 == (byte) 0xF4 && byte2 > (byte) 0x8F)
          || ((((byte1 << 28) + (byte2 - (byte) 0x90)) >> 30) != 0)
          || isNotTrailingByte(byte3)
          || isNotTrailingByte(byte4);
    }

    private static int composeFourBytes(
        final byte byte1, final byte byte2, final byte byte3, final byte byte4) {
      return ((byte1 & 0x07) << 18)
          | (trailingByteValue(byte2) << 12)
          | (trailingByteValue(byte3) << 6)
          | trailingByteValue(byte4);
    }

    //Shared primitives

    /*
     * Returns whether the byte is not a valid continuation of the form '10XXXXXX'.
     */
    private static boolean isNotTrailingByte(final byte b) {
      return (b & 0xC0) != 0x80;
    }

    /*
     * Returns the actual value of the trailing byte (removes the prefix '10') for composition.
     */
    private static int trailingByteValue(final byte b) {
      return b & 0x3F;
    }

    /*
     * Returns the leading (high) surrogate of the UTF-16 pair for a supplementary code point.
     */
    private static char highSurrogate(final int codePoint) {
      return Character.highSurrogate(codePoint);
    }

    /*
     * Returns the trailing (low) surrogate of the UTF-16 pair for a supplementary code point.
     */
    private static char lowSurrogate(final int codePoint) {
      return Character.lowSurrogate(codePoint);
    }
  }

}