All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.dinky.shaded.paimon.data.BinaryString Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.dinky.shaded.paimon.data;

import org.dinky.shaded.paimon.annotation.Public;
import org.dinky.shaded.paimon.memory.MemorySegment;
import org.dinky.shaded.paimon.memory.MemorySegmentUtils;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;

import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

import static org.dinky.shaded.paimon.memory.MemorySegmentUtils.allocateReuseBytes;
import static org.dinky.shaded.paimon.memory.MemorySegmentUtils.allocateReuseChars;
import static org.dinky.shaded.paimon.utils.Preconditions.checkArgument;

/**
 * A string which is backed by {@link MemorySegment}s.
 *
 * @since 0.4.0
 */
@Public
public final class BinaryString extends BinarySection implements Comparable {

    private static final long serialVersionUID = 1L;

    public static final BinaryString EMPTY_UTF8 = BinaryString.fromBytes(encodeUTF8(""));

    public BinaryString(MemorySegment[] segments, int offset, int sizeInBytes) {
        super(segments, offset, sizeInBytes);
        this.segments = segments;
        this.offset = offset;
        this.sizeInBytes = sizeInBytes;
    }

    // ------------------------------------------------------------------------------------------
    // Construction Utilities
    // ------------------------------------------------------------------------------------------

    public static BinaryString fromAddress(MemorySegment[] segments, int offset, int numBytes) {
        return new BinaryString(segments, offset, numBytes);
    }

    @Nullable
    public static BinaryString fromString(String str) {
        if (str == null) {
            return null;
        }
        return fromBytes(encodeUTF8(str));
    }

    /** Creates a {@link BinaryString} instance from the given UTF-8 bytes. */
    public static BinaryString fromBytes(byte[] bytes) {
        return fromBytes(bytes, 0, bytes.length);
    }

    /**
     * Creates a {@link BinaryString} instance from the given UTF-8 bytes with offset and number of
     * bytes.
     */
    public static BinaryString fromBytes(byte[] bytes, int offset, int numBytes) {
        return new BinaryString(new MemorySegment[] {MemorySegment.wrap(bytes)}, offset, numBytes);
    }

    /** Creates a {@link BinaryString} instance that contains `length` spaces. */
    public static BinaryString blankString(int length) {
        byte[] spaces = new byte[length];
        Arrays.fill(spaces, (byte) ' ');
        return fromBytes(spaces);
    }

    // ------------------------------------------------------------------------------------------
    // Public Interfaces
    // ------------------------------------------------------------------------------------------

    @Override
    public String toString() {
        byte[] bytes = allocateReuseBytes(sizeInBytes);
        MemorySegmentUtils.copyToBytes(segments, offset, bytes, 0, sizeInBytes);
        return decodeUTF8(bytes, 0, sizeInBytes);
    }

    /**
     * Compares two strings lexicographically. Since UTF-8 uses groups of six bits, it is sometimes
     * useful to use octal notation which uses 3-bit groups. With a calculator which can convert
     * between hexadecimal and octal it can be easier to manually create or interpret UTF-8 compared
     * with using binary. So we just compare the binary.
     */
    @Override
    public int compareTo(@Nonnull BinaryString other) {
        if (segments.length == 1 && other.segments.length == 1) {
            int len = Math.min(sizeInBytes, other.sizeInBytes);
            MemorySegment seg1 = segments[0];
            MemorySegment seg2 = other.segments[0];

            for (int i = 0; i < len; i++) {
                int res = (seg1.get(offset + i) & 0xFF) - (seg2.get(other.offset + i) & 0xFF);
                if (res != 0) {
                    return res;
                }
            }
            return sizeInBytes - other.sizeInBytes;
        }

        // if there are multi segments.
        return compareMultiSegments(other);
    }

    /** Find the boundaries of segments, and then compare MemorySegment. */
    private int compareMultiSegments(BinaryString other) {

        if (sizeInBytes == 0 || other.sizeInBytes == 0) {
            return sizeInBytes - other.sizeInBytes;
        }

        int len = Math.min(sizeInBytes, other.sizeInBytes);

        MemorySegment seg1 = segments[0];
        MemorySegment seg2 = other.segments[0];

        int segmentSize = segments[0].size();
        int otherSegmentSize = other.segments[0].size();

        int sizeOfFirst1 = segmentSize - offset;
        int sizeOfFirst2 = otherSegmentSize - other.offset;

        int varSegIndex1 = 1;
        int varSegIndex2 = 1;

        // find the first segment of this string.
        while (sizeOfFirst1 <= 0) {
            sizeOfFirst1 += segmentSize;
            seg1 = segments[varSegIndex1++];
        }

        while (sizeOfFirst2 <= 0) {
            sizeOfFirst2 += otherSegmentSize;
            seg2 = other.segments[varSegIndex2++];
        }

        int offset1 = segmentSize - sizeOfFirst1;
        int offset2 = otherSegmentSize - sizeOfFirst2;

        int needCompare = Math.min(Math.min(sizeOfFirst1, sizeOfFirst2), len);

        while (needCompare > 0) {
            // compare in one segment.
            for (int i = 0; i < needCompare; i++) {
                int res = (seg1.get(offset1 + i) & 0xFF) - (seg2.get(offset2 + i) & 0xFF);
                if (res != 0) {
                    return res;
                }
            }
            if (needCompare == len) {
                break;
            }
            len -= needCompare;
            // next segment
            if (sizeOfFirst1 < sizeOfFirst2) { // I am smaller
                seg1 = segments[varSegIndex1++];
                offset1 = 0;
                offset2 += needCompare;
                sizeOfFirst1 = segmentSize;
                sizeOfFirst2 -= needCompare;
            } else if (sizeOfFirst1 > sizeOfFirst2) { // other is smaller
                seg2 = other.segments[varSegIndex2++];
                offset2 = 0;
                offset1 += needCompare;
                sizeOfFirst2 = otherSegmentSize;
                sizeOfFirst1 -= needCompare;
            } else { // same, should go ahead both.
                seg1 = segments[varSegIndex1++];
                seg2 = other.segments[varSegIndex2++];
                offset1 = 0;
                offset2 = 0;
                sizeOfFirst1 = segmentSize;
                sizeOfFirst2 = otherSegmentSize;
            }
            needCompare = Math.min(Math.min(sizeOfFirst1, sizeOfFirst2), len);
        }

        checkArgument(needCompare == len);

        return sizeInBytes - other.sizeInBytes;
    }

    // ------------------------------------------------------------------------------------------
    // Public methods on BinaryString
    // ------------------------------------------------------------------------------------------

    /** Returns the number of UTF-8 code points in the string. */
    public int numChars() {
        if (inFirstSegment()) {
            int len = 0;
            for (int i = 0; i < sizeInBytes; i += numBytesForFirstByte(getByteOneSegment(i))) {
                len++;
            }
            return len;
        } else {
            return numCharsMultiSegs();
        }
    }

    private int numCharsMultiSegs() {
        int len = 0;
        int segSize = segments[0].size();
        BinaryString.SegmentAndOffset index = firstSegmentAndOffset(segSize);
        int i = 0;
        while (i < sizeInBytes) {
            int charBytes = numBytesForFirstByte(index.value());
            i += charBytes;
            len++;
            index.skipBytes(charBytes, segSize);
        }
        return len;
    }

    /**
     * Returns the {@code byte} value at the specified index. An index ranges from {@code 0} to
     * {@code sizeInBytes - 1}.
     *
     * @param index the index of the {@code byte} value.
     * @return the {@code byte} value at the specified index of this UTF-8 bytes.
     * @exception IndexOutOfBoundsException if the {@code index} argument is negative or not less
     *     than the length of this UTF-8 bytes.
     */
    public byte byteAt(int index) {
        int globalOffset = offset + index;
        int size = segments[0].size();
        if (globalOffset < size) {
            return segments[0].get(globalOffset);
        } else {
            return segments[globalOffset / size].get(globalOffset % size);
        }
    }

    /** Copy a new {@code BinaryString}. */
    public BinaryString copy() {
        byte[] copy = MemorySegmentUtils.copyToBytes(segments, offset, sizeInBytes);
        return BinaryString.fromBytes(copy);
    }

    /**
     * Returns a binary string that is a substring of this binary string. The substring begins at
     * the specified {@code beginIndex} and extends to the character at index {@code endIndex - 1}.
     *
     * 

Examples: * *

* *
     * fromString("hamburger").substring(4, 8) returns binary string "urge"
     * fromString("smiles").substring(1, 5) returns binary string "mile"
     * 
* *
* * @param beginIndex the beginning index, inclusive. * @param endIndex the ending index, exclusive. * @return the specified substring, return EMPTY_UTF8 when index out of bounds instead of * StringIndexOutOfBoundsException. */ public BinaryString substring(int beginIndex, int endIndex) { if (endIndex <= beginIndex || beginIndex >= sizeInBytes) { return EMPTY_UTF8; } if (inFirstSegment()) { MemorySegment segment = segments[0]; int i = 0; int c = 0; while (i < sizeInBytes && c < beginIndex) { i += numBytesForFirstByte(segment.get(i + offset)); c += 1; } int j = i; while (i < sizeInBytes && c < endIndex) { i += numBytesForFirstByte(segment.get(i + offset)); c += 1; } if (i > j) { byte[] bytes = new byte[i - j]; segment.get(offset + j, bytes, 0, i - j); return fromBytes(bytes); } else { return EMPTY_UTF8; } } else { return substringMultiSegs(beginIndex, endIndex); } } private BinaryString substringMultiSegs(final int start, final int until) { int segSize = segments[0].size(); BinaryString.SegmentAndOffset index = firstSegmentAndOffset(segSize); int i = 0; int c = 0; while (i < sizeInBytes && c < start) { int charSize = numBytesForFirstByte(index.value()); i += charSize; index.skipBytes(charSize, segSize); c += 1; } int j = i; while (i < sizeInBytes && c < until) { int charSize = numBytesForFirstByte(index.value()); i += charSize; index.skipBytes(charSize, segSize); c += 1; } if (i > j) { return fromBytes(MemorySegmentUtils.copyToBytes(segments, offset + j, i - j)); } else { return EMPTY_UTF8; } } /** * Returns true if and only if this BinaryString contains the specified sequence of bytes * values. * * @param s the sequence to search for * @return true if this BinaryString contains {@code s}, false otherwise */ public boolean contains(final BinaryString s) { if (s.sizeInBytes == 0) { return true; } int find = MemorySegmentUtils.find( segments, offset, sizeInBytes, s.segments, s.offset, s.sizeInBytes); return find != -1; } /** * Tests if this BinaryString starts with the specified prefix. * * @param prefix the prefix. * @return {@code true} if the bytes represented by the argument is a prefix of the bytes * represented by this string; {@code false} otherwise. Note also that {@code true} will be * returned if the argument is an empty BinaryString or is equal to this {@code * BinaryString} object as determined by the {@link #equals(Object)} method. */ public boolean startsWith(final BinaryString prefix) { return matchAt(prefix, 0); } /** * Tests if this BinaryString ends with the specified suffix. * * @param suffix the suffix. * @return {@code true} if the bytes represented by the argument is a suffix of the bytes * represented by this object; {@code false} otherwise. Note that the result will be {@code * true} if the argument is the empty string or is equal to this {@code BinaryString} object * as determined by the {@link #equals(Object)} method. */ public boolean endsWith(final BinaryString suffix) { return matchAt(suffix, sizeInBytes - suffix.sizeInBytes); } /** * Returns a string whose value is this string, with any leading and trailing whitespace * removed. * * @return A string whose value is this string, with any leading and trailing white space * removed, or this string if it has no leading or trailing white space. */ public BinaryString trim() { if (inFirstSegment()) { int s = 0; int e = this.sizeInBytes - 1; // skip all of the space (0x20) in the left side while (s < this.sizeInBytes && getByteOneSegment(s) == 0x20) { s++; } // skip all of the space (0x20) in the right side while (e >= s && getByteOneSegment(e) == 0x20) { e--; } if (s > e) { // empty string return EMPTY_UTF8; } else { return copyBinaryStringInOneSeg(s, e - s + 1); } } else { return trimMultiSegs(); } } private BinaryString trimMultiSegs() { int s = 0; int e = this.sizeInBytes - 1; int segSize = segments[0].size(); BinaryString.SegmentAndOffset front = firstSegmentAndOffset(segSize); // skip all of the space (0x20) in the left side while (s < this.sizeInBytes && front.value() == 0x20) { s++; front.nextByte(segSize); } BinaryString.SegmentAndOffset behind = lastSegmentAndOffset(segSize); // skip all of the space (0x20) in the right side while (e >= s && behind.value() == 0x20) { e--; behind.previousByte(segSize); } if (s > e) { // empty string return EMPTY_UTF8; } else { return copyBinaryString(s, e); } } /** * Returns the index within this string of the first occurrence of the specified substring, * starting at the specified index. * * @param str the substring to search for. * @param fromIndex the index from which to start the search. * @return the index of the first occurrence of the specified substring, starting at the * specified index, or {@code -1} if there is no such occurrence. */ public int indexOf(BinaryString str, int fromIndex) { if (str.sizeInBytes == 0) { return 0; } if (inFirstSegment()) { // position in byte int byteIdx = 0; // position is char int charIdx = 0; while (byteIdx < sizeInBytes && charIdx < fromIndex) { byteIdx += numBytesForFirstByte(getByteOneSegment(byteIdx)); charIdx++; } do { if (byteIdx + str.sizeInBytes > sizeInBytes) { return -1; } if (MemorySegmentUtils.equals( segments, offset + byteIdx, str.segments, str.offset, str.sizeInBytes)) { return charIdx; } byteIdx += numBytesForFirstByte(getByteOneSegment(byteIdx)); charIdx++; } while (byteIdx < sizeInBytes); return -1; } else { return indexOfMultiSegs(str, fromIndex); } } private int indexOfMultiSegs(BinaryString str, int fromIndex) { // position in byte int byteIdx = 0; // position is char int charIdx = 0; int segSize = segments[0].size(); BinaryString.SegmentAndOffset index = firstSegmentAndOffset(segSize); while (byteIdx < sizeInBytes && charIdx < fromIndex) { int charBytes = numBytesForFirstByte(index.value()); byteIdx += charBytes; charIdx++; index.skipBytes(charBytes, segSize); } do { if (byteIdx + str.sizeInBytes > sizeInBytes) { return -1; } if (MemorySegmentUtils.equals( segments, offset + byteIdx, str.segments, str.offset, str.sizeInBytes)) { return charIdx; } int charBytes = numBytesForFirstByte(index.segment.get(index.offset)); byteIdx += charBytes; charIdx++; index.skipBytes(charBytes, segSize); } while (byteIdx < sizeInBytes); return -1; } /** * Converts all of the characters in this {@code BinaryString} to upper case. * * @return the {@code BinaryString}, converted to uppercase. */ public BinaryString toUpperCase() { if (sizeInBytes == 0) { return EMPTY_UTF8; } int size = segments[0].size(); BinaryString.SegmentAndOffset segmentAndOffset = startSegmentAndOffset(size); byte[] bytes = new byte[sizeInBytes]; bytes[0] = (byte) Character.toTitleCase(segmentAndOffset.value()); for (int i = 0; i < sizeInBytes; i++) { byte b = segmentAndOffset.value(); if (numBytesForFirstByte(b) != 1) { // fallback return javaToUpperCase(); } int upper = Character.toUpperCase((int) b); if (upper > 127) { // fallback return javaToUpperCase(); } bytes[i] = (byte) upper; segmentAndOffset.nextByte(size); } return fromBytes(bytes); } private BinaryString javaToUpperCase() { return fromString(toString().toUpperCase()); } /** * Converts all of the characters in this {@code BinaryString} to lower case. * * @return the {@code BinaryString}, converted to lowercase. */ public BinaryString toLowerCase() { if (sizeInBytes == 0) { return EMPTY_UTF8; } int size = segments[0].size(); BinaryString.SegmentAndOffset segmentAndOffset = startSegmentAndOffset(size); byte[] bytes = new byte[sizeInBytes]; bytes[0] = (byte) Character.toTitleCase(segmentAndOffset.value()); for (int i = 0; i < sizeInBytes; i++) { byte b = segmentAndOffset.value(); if (numBytesForFirstByte(b) != 1) { // fallback return javaToLowerCase(); } int lower = Character.toLowerCase((int) b); if (lower > 127) { // fallback return javaToLowerCase(); } bytes[i] = (byte) lower; segmentAndOffset.nextByte(size); } return fromBytes(bytes); } private BinaryString javaToLowerCase() { return fromString(toString().toLowerCase()); } // ------------------------------------------------------------------------------------------ // Internal methods on BinaryString // ------------------------------------------------------------------------------------------ byte getByteOneSegment(int i) { return segments[0].get(offset + i); } boolean inFirstSegment() { return sizeInBytes + offset <= segments[0].size(); } private boolean matchAt(final BinaryString s, int pos) { return (inFirstSegment() && s.inFirstSegment()) ? matchAtOneSeg(s, pos) : matchAtVarSeg(s, pos); } private boolean matchAtOneSeg(final BinaryString s, int pos) { return s.sizeInBytes + pos <= sizeInBytes && pos >= 0 && segments[0].equalTo(s.segments[0], offset + pos, s.offset, s.sizeInBytes); } private boolean matchAtVarSeg(final BinaryString s, int pos) { return s.sizeInBytes + pos <= sizeInBytes && pos >= 0 && MemorySegmentUtils.equals( segments, offset + pos, s.segments, s.offset, s.sizeInBytes); } BinaryString copyBinaryStringInOneSeg(int start, int len) { byte[] newBytes = new byte[len]; segments[0].get(offset + start, newBytes, 0, len); return fromBytes(newBytes); } BinaryString copyBinaryString(int start, int end) { int len = end - start + 1; byte[] newBytes = new byte[len]; MemorySegmentUtils.copyToBytes(segments, offset + start, newBytes, 0, len); return fromBytes(newBytes); } BinaryString.SegmentAndOffset firstSegmentAndOffset(int segSize) { int segIndex = offset / segSize; return new BinaryString.SegmentAndOffset(segIndex, offset % segSize); } BinaryString.SegmentAndOffset lastSegmentAndOffset(int segSize) { int lastOffset = offset + sizeInBytes - 1; int segIndex = lastOffset / segSize; return new BinaryString.SegmentAndOffset(segIndex, lastOffset % segSize); } private BinaryString.SegmentAndOffset startSegmentAndOffset(int segSize) { return inFirstSegment() ? new BinaryString.SegmentAndOffset(0, offset) : firstSegmentAndOffset(segSize); } /** CurrentSegment and positionInSegment. */ class SegmentAndOffset { int segIndex; MemorySegment segment; int offset; private SegmentAndOffset(int segIndex, int offset) { this.segIndex = segIndex; this.segment = segments[segIndex]; this.offset = offset; } private void assignSegment() { segment = segIndex >= 0 && segIndex < segments.length ? segments[segIndex] : null; } void previousByte(int segSize) { offset--; if (offset == -1) { segIndex--; assignSegment(); offset = segSize - 1; } } void nextByte(int segSize) { offset++; checkAdvance(segSize); } private void checkAdvance(int segSize) { if (offset == segSize) { advance(); } } private void advance() { segIndex++; assignSegment(); offset = 0; } void skipBytes(int n, int segSize) { int remaining = segSize - this.offset; if (remaining > n) { this.offset += n; } else { while (true) { int toSkip = Math.min(remaining, n); n -= toSkip; if (n <= 0) { this.offset += toSkip; checkAdvance(segSize); return; } advance(); remaining = segSize - this.offset; } } } byte value() { return this.segment.get(this.offset); } } /** * Returns the number of bytes for a code point with the first byte as `b`. * * @param b The first byte of a code point */ static int numBytesForFirstByte(final byte b) { if (b >= 0) { // 1 byte, 7 bits: 0xxxxxxx return 1; } else if ((b >> 5) == -2 && (b & 0x1e) != 0) { // 2 bytes, 11 bits: 110xxxxx 10xxxxxx return 2; } else if ((b >> 4) == -2) { // 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx return 3; } else if ((b >> 3) == -2) { // 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx return 4; } else { // Skip the first byte disallowed in UTF-8 // Handling errors quietly, same semantics to java String. return 1; } } // ------------------------------------------------------------------------ // UTF-8 encoding and decoding (High-speed version, 30% faster+) // ------------------------------------------------------------------------ private static final int MAX_BYTES_PER_CHAR = 3; /** This method must have the same result with JDK's String.getBytes. */ public static byte[] encodeUTF8(String str) { byte[] bytes = allocateReuseBytes(str.length() * MAX_BYTES_PER_CHAR); int len = encodeUTF8(str, bytes); return Arrays.copyOf(bytes, len); } public static int encodeUTF8(String str, byte[] bytes) { int offset = 0; int len = str.length(); int sl = offset + len; int dp = 0; int dlASCII = dp + Math.min(len, bytes.length); // ASCII only optimized loop while (dp < dlASCII && str.charAt(offset) < '\u0080') { bytes[dp++] = (byte) str.charAt(offset++); } while (offset < sl) { char c = str.charAt(offset++); if (c < 0x80) { // Have at most seven bits bytes[dp++] = (byte) c; } else if (c < 0x800) { // 2 bytes, 11 bits bytes[dp++] = (byte) (0xc0 | (c >> 6)); bytes[dp++] = (byte) (0x80 | (c & 0x3f)); } else if (Character.isSurrogate(c)) { final int uc; int ip = offset - 1; if (Character.isHighSurrogate(c)) { if (sl - ip < 2) { uc = -1; } else { char d = str.charAt(ip + 1); if (Character.isLowSurrogate(d)) { uc = Character.toCodePoint(c, d); } else { // for some illegal character // the jdk will ignore the origin character and cast it to '?' // this acts the same with jdk return defaultEncodeUTF8(str, bytes); } } } else { if (Character.isLowSurrogate(c)) { // for some illegal character // the jdk will ignore the origin character and cast it to '?' // this acts the same with jdk return defaultEncodeUTF8(str, bytes); } else { uc = c; } } if (uc < 0) { bytes[dp++] = (byte) '?'; } else { bytes[dp++] = (byte) (0xf0 | ((uc >> 18))); bytes[dp++] = (byte) (0x80 | ((uc >> 12) & 0x3f)); bytes[dp++] = (byte) (0x80 | ((uc >> 6) & 0x3f)); bytes[dp++] = (byte) (0x80 | (uc & 0x3f)); offset++; // 2 chars } } else { // 3 bytes, 16 bits bytes[dp++] = (byte) (0xe0 | ((c >> 12))); bytes[dp++] = (byte) (0x80 | ((c >> 6) & 0x3f)); bytes[dp++] = (byte) (0x80 | (c & 0x3f)); } } return dp; } public static int defaultEncodeUTF8(String str, byte[] bytes) { try { byte[] buffer = str.getBytes("UTF-8"); System.arraycopy(buffer, 0, bytes, 0, buffer.length); return buffer.length; } catch (UnsupportedEncodingException e) { throw new RuntimeException("encodeUTF8 error", e); } } public static String decodeUTF8(byte[] input, int offset, int byteLen) { char[] chars = allocateReuseChars(byteLen); int len = decodeUTF8Strict(input, offset, byteLen, chars); if (len < 0) { return defaultDecodeUTF8(input, offset, byteLen); } return new String(chars, 0, len); } public static int decodeUTF8Strict(byte[] sa, int sp, int len, char[] da) { final int sl = sp + len; int dp = 0; int dlASCII = Math.min(len, da.length); // ASCII only optimized loop while (dp < dlASCII && sa[sp] >= 0) { da[dp++] = (char) sa[sp++]; } while (sp < sl) { int b1 = sa[sp++]; if (b1 >= 0) { // 1 byte, 7 bits: 0xxxxxxx da[dp++] = (char) b1; } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) { // 2 bytes, 11 bits: 110xxxxx 10xxxxxx if (sp < sl) { int b2 = sa[sp++]; if ((b2 & 0xc0) != 0x80) { // isNotContinuation(b2) return -1; } else { da[dp++] = (char) (((b1 << 6) ^ b2) ^ (((byte) 0xC0 << 6) ^ ((byte) 0x80))); } continue; } return -1; } else if ((b1 >> 4) == -2) { // 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx if (sp + 1 < sl) { int b2 = sa[sp++]; int b3 = sa[sp++]; if ((b1 == (byte) 0xe0 && (b2 & 0xe0) == 0x80) || (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80) { // isMalformed3(b1, b2, b3) return -1; } else { char c = (char) ((b1 << 12) ^ (b2 << 6) ^ (b3 ^ (((byte) 0xE0 << 12) ^ ((byte) 0x80 << 6) ^ ((byte) 0x80)))); if (Character.isSurrogate(c)) { return -1; } else { da[dp++] = c; } } continue; } return -1; } else if ((b1 >> 3) == -2) { // 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx if (sp + 2 < sl) { int b2 = sa[sp++]; int b3 = sa[sp++]; int b4 = sa[sp++]; int uc = ((b1 << 18) ^ (b2 << 12) ^ (b3 << 6) ^ (b4 ^ (((byte) 0xF0 << 18) ^ ((byte) 0x80 << 12) ^ ((byte) 0x80 << 6) ^ ((byte) 0x80)))); // isMalformed4 and shortest form check if (((b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 || (b4 & 0xc0) != 0x80) || !Character.isSupplementaryCodePoint(uc)) { return -1; } else { da[dp++] = Character.highSurrogate(uc); da[dp++] = Character.lowSurrogate(uc); } continue; } return -1; } else { return -1; } } return dp; } public static String decodeUTF8(MemorySegment input, int offset, int byteLen) { char[] chars = allocateReuseChars(byteLen); int len = decodeUTF8Strict(input, offset, byteLen, chars); if (len < 0) { byte[] bytes = allocateReuseBytes(byteLen); input.get(offset, bytes, 0, byteLen); return defaultDecodeUTF8(bytes, 0, byteLen); } return new String(chars, 0, len); } public static int decodeUTF8Strict(MemorySegment segment, int sp, int len, char[] da) { final int sl = sp + len; int dp = 0; int dlASCII = Math.min(len, da.length); // ASCII only optimized loop while (dp < dlASCII && segment.get(sp) >= 0) { da[dp++] = (char) segment.get(sp++); } while (sp < sl) { int b1 = segment.get(sp++); if (b1 >= 0) { // 1 byte, 7 bits: 0xxxxxxx da[dp++] = (char) b1; } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) { // 2 bytes, 11 bits: 110xxxxx 10xxxxxx if (sp < sl) { int b2 = segment.get(sp++); if ((b2 & 0xc0) != 0x80) { // isNotContinuation(b2) return -1; } else { da[dp++] = (char) (((b1 << 6) ^ b2) ^ (((byte) 0xC0 << 6) ^ ((byte) 0x80))); } continue; } return -1; } else if ((b1 >> 4) == -2) { // 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx if (sp + 1 < sl) { int b2 = segment.get(sp++); int b3 = segment.get(sp++); if ((b1 == (byte) 0xe0 && (b2 & 0xe0) == 0x80) || (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80) { // isMalformed3(b1, b2, b3) return -1; } else { char c = (char) ((b1 << 12) ^ (b2 << 6) ^ (b3 ^ (((byte) 0xE0 << 12) ^ ((byte) 0x80 << 6) ^ ((byte) 0x80)))); if (Character.isSurrogate(c)) { return -1; } else { da[dp++] = c; } } continue; } return -1; } else if ((b1 >> 3) == -2) { // 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx if (sp + 2 < sl) { int b2 = segment.get(sp++); int b3 = segment.get(sp++); int b4 = segment.get(sp++); int uc = ((b1 << 18) ^ (b2 << 12) ^ (b3 << 6) ^ (b4 ^ (((byte) 0xF0 << 18) ^ ((byte) 0x80 << 12) ^ ((byte) 0x80 << 6) ^ ((byte) 0x80)))); // isMalformed4 and shortest form check if (((b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 || (b4 & 0xc0) != 0x80) || !Character.isSupplementaryCodePoint(uc)) { return -1; } else { da[dp++] = Character.highSurrogate(uc); da[dp++] = Character.lowSurrogate(uc); } continue; } return -1; } else { return -1; } } return dp; } public static String defaultDecodeUTF8(byte[] bytes, int offset, int len) { return new String(bytes, offset, len, StandardCharsets.UTF_8); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy