io.rsocket.util.CharByteBufUtil Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of rsocket-core Show documentation
Core functionality for the RSocket library
The newest version!
package io.rsocket.util;

import static io.netty.util.internal.StringUtil.isSurrogate;

import io.netty.buffer.ByteBuf;
import io.netty.util.CharsetUtil;
import io.netty.util.internal.MathUtil;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
import java.util.Arrays;

public class CharByteBufUtil {

  private static final byte WRITE_UTF_UNKNOWN = (byte) '?';

  private CharByteBufUtil() {}

  /**
   * Returns the exact bytes length of UTF8 character sequence.
   *
   * This method is producing the exact length according to {@link #writeUtf8(ByteBuf, char[])}.
   */
  public static int utf8Bytes(final char[] seq) {
    return utf8ByteCount(seq, 0, seq.length);
  }

  /**
   * This method is producing the exact length according to {@link #writeUtf8(ByteBuf, char[], int,
   * int)}.
   */
  public static int utf8Bytes(final char[] seq, int start, int end) {
    return utf8ByteCount(checkCharSequenceBounds(seq, start, end), start, end);
  }

  private static int utf8ByteCount(final char[] seq, int start, int end) {
    int i = start;
    // ASCII fast path
    while (i < end && seq[i] < 0x80) {
      ++i;
    }
    // !ASCII is packed in a separate method to let the ASCII case be smaller
    return i < end ? (i - start) + utf8BytesNonAscii(seq, i, end) : i - start;
  }

  private static int utf8BytesNonAscii(final char[] seq, final int start, final int end) {
    int encodedLength = 0;
    for (int i = start; i < end; i++) {
      final char c = seq[i];
      // making it 100% branchless isn't rewarding due to the many bit operations necessary!
      if (c < 0x800) {
        // branchless version of: (c <= 127 ? 0:1) + 1
        encodedLength += ((0x7f - c) >>> 31) + 1;
      } else if (isSurrogate(c)) {
        if (!Character.isHighSurrogate(c)) {
          encodedLength++;
          // WRITE_UTF_UNKNOWN
          continue;
        }
        final char c2;
        try {
          // Surrogate Pair consumes 2 characters. Optimistically try to get the next character to
          // avoid
          // duplicate bounds checking with charAt.
          c2 = seq[++i];
        } catch (IndexOutOfBoundsException ignored) {
          encodedLength++;
          // WRITE_UTF_UNKNOWN
          break;
        }
        if (!Character.isLowSurrogate(c2)) {
          // WRITE_UTF_UNKNOWN + (Character.isHighSurrogate(c2) ? WRITE_UTF_UNKNOWN : c2)
          encodedLength += 2;
          continue;
        }
        // See http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G2630.
        encodedLength += 4;
      } else {
        encodedLength += 3;
      }
    }
    return encodedLength;
  }

  private static char[] checkCharSequenceBounds(char[] seq, int start, int end) {
    if (MathUtil.isOutOfBounds(start, end - start, seq.length)) {
      throw new IndexOutOfBoundsException(
          "expected: 0 <= start("
              + start
              + ") <= end ("
              + end
              + ") <= seq.length("
              + seq.length
              + ')');
    }
    return seq;
  }

  /**
   * Encode a {@code char[]} in UTF-8 and write it
   * into {@link ByteBuf}.
   *
   * This method returns the actual number of bytes written.
   */
  public static int writeUtf8(ByteBuf buf, char[] seq) {
    return writeUtf8(buf, seq, 0, seq.length);
  }

  /**
   * Equivalent to {@link #writeUtf8(ByteBuf, char[]) writeUtf8(buf, seq.subSequence(start, end),
   * reserveBytes)} but avoids subsequence object allocation if possible.
   *
   * @return actual number of bytes written
   */
  public static int writeUtf8(ByteBuf buf, char[] seq, int start, int end) {
    return writeUtf8(buf, buf.writerIndex(), checkCharSequenceBounds(seq, start, end), start, end);
  }

  // Fast-Path implementation
  static int writeUtf8(ByteBuf buffer, int writerIndex, char[] seq, int start, int end) {
    int oldWriterIndex = writerIndex;

    // We can use the _set methods as these not need to do any index checks and reference checks.
    // This is possible as we called ensureWritable(...) before.
    for (int i = start; i < end; i++) {
      char c = seq[i];
      if (c < 0x80) {
        buffer.setByte(writerIndex++, (byte) c);
      } else if (c < 0x800) {
        buffer.setByte(writerIndex++, (byte) (0xc0 | (c >> 6)));
        buffer.setByte(writerIndex++, (byte) (0x80 | (c & 0x3f)));
      } else if (isSurrogate(c)) {
        if (!Character.isHighSurrogate(c)) {
          buffer.setByte(writerIndex++, WRITE_UTF_UNKNOWN);
          continue;
        }
        final char c2;
        if (seq.length > ++i) {
          // Surrogate Pair consumes 2 characters. Optimistically try to get the next character to
          // avoid
          // duplicate bounds checking with charAt. If an IndexOutOfBoundsException is thrown we
          // will
          // re-throw a more informative exception describing the problem.
          c2 = seq[i];
        } else {
          buffer.setByte(writerIndex++, WRITE_UTF_UNKNOWN);
          break;
        }
        // Extra method to allow inlining the rest of writeUtf8 which is the most likely code path.
        writerIndex = writeUtf8Surrogate(buffer, writerIndex, c, c2);
      } else {
        buffer.setByte(writerIndex++, (byte) (0xe0 | (c >> 12)));
        buffer.setByte(writerIndex++, (byte) (0x80 | ((c >> 6) & 0x3f)));
        buffer.setByte(writerIndex++, (byte) (0x80 | (c & 0x3f)));
      }
    }
    buffer.writerIndex(writerIndex);
    return writerIndex - oldWriterIndex;
  }

  private static int writeUtf8Surrogate(ByteBuf buffer, int writerIndex, char c, char c2) {
    if (!Character.isLowSurrogate(c2)) {
      buffer.setByte(writerIndex++, WRITE_UTF_UNKNOWN);
      buffer.setByte(writerIndex++, Character.isHighSurrogate(c2) ? WRITE_UTF_UNKNOWN : c2);
      return writerIndex;
    }
    int codePoint = Character.toCodePoint(c, c2);
    // See http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G2630.
    buffer.setByte(writerIndex++, (byte) (0xf0 | (codePoint >> 18)));
    buffer.setByte(writerIndex++, (byte) (0x80 | ((codePoint >> 12) & 0x3f)));
    buffer.setByte(writerIndex++, (byte) (0x80 | ((codePoint >> 6) & 0x3f)));
    buffer.setByte(writerIndex++, (byte) (0x80 | (codePoint & 0x3f)));
    return writerIndex;
  }

  public static char[] readUtf8(ByteBuf byteBuf, int length) {
    CharsetDecoder charsetDecoder = CharsetUtil.UTF_8.newDecoder();
    int en = (int) (length * (double) charsetDecoder.maxCharsPerByte());
    char[] ca = new char[en];

    CharBuffer charBuffer = CharBuffer.wrap(ca);
    ByteBuffer byteBuffer =
        byteBuf.nioBufferCount() == 1
            ? byteBuf.internalNioBuffer(byteBuf.readerIndex(), length)
            : byteBuf.nioBuffer(byteBuf.readerIndex(), length);
    byteBuffer.mark();
    try {
      CoderResult cr = charsetDecoder.decode(byteBuffer, charBuffer, true);
      if (!cr.isUnderflow()) cr.throwException();
      cr = charsetDecoder.flush(charBuffer);
      if (!cr.isUnderflow()) cr.throwException();

      byteBuffer.reset();
      byteBuf.skipBytes(length);

      return safeTrim(charBuffer.array(), charBuffer.position());
    } catch (CharacterCodingException x) {
      // Substitution is always enabled,
      // so this shouldn't happen
      throw new IllegalStateException("unable to decode char array from the given buffer", x);
    }
  }

  private static char[] safeTrim(char[] ca, int len) {
    if (len == ca.length) return ca;
    else return Arrays.copyOf(ca, len);
  }
}