org.dinky.shaded.paimon.data.BinaryString Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of dinky-shaded-paimon Show documentation
The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.dinky.shaded.paimon.data;

import org.dinky.shaded.paimon.annotation.Public;
import org.dinky.shaded.paimon.memory.MemorySegment;
import org.dinky.shaded.paimon.memory.MemorySegmentUtils;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;

import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

import static org.dinky.shaded.paimon.memory.MemorySegmentUtils.allocateReuseBytes;
import static org.dinky.shaded.paimon.memory.MemorySegmentUtils.allocateReuseChars;
import static org.dinky.shaded.paimon.utils.Preconditions.checkArgument;

/**
 * A string which is backed by {@link MemorySegment}s.
 *
 * @since 0.4.0
 */
@Public
public final class BinaryString extends BinarySection implements Comparable {

    private static final long serialVersionUID = 1L;

    public static final BinaryString EMPTY_UTF8 = BinaryString.fromBytes(encodeUTF8(""));

    public BinaryString(MemorySegment[] segments, int offset, int sizeInBytes) {
        super(segments, offset, sizeInBytes);
        this.segments = segments;
        this.offset = offset;
        this.sizeInBytes = sizeInBytes;
    }

    // ------------------------------------------------------------------------------------------
    // Construction Utilities
    // ------------------------------------------------------------------------------------------

    public static BinaryString fromAddress(MemorySegment[] segments, int offset, int numBytes) {
        return new BinaryString(segments, offset, numBytes);
    }

    @Nullable
    public static BinaryString fromString(String str) {
        if (str == null) {
            return null;
        }
        return fromBytes(encodeUTF8(str));
    }

    /** Creates a {@link BinaryString} instance from the given UTF-8 bytes. */
    public static BinaryString fromBytes(byte[] bytes) {
        return fromBytes(bytes, 0, bytes.length);
    }

    /**
     * Creates a {@link BinaryString} instance from the given UTF-8 bytes with offset and number of
     * bytes.
     */
    public static BinaryString fromBytes(byte[] bytes, int offset, int numBytes) {
        return new BinaryString(new MemorySegment[] {MemorySegment.wrap(bytes)}, offset, numBytes);
    }

    /** Creates a {@link BinaryString} instance that contains `length` spaces. */
    public static BinaryString blankString(int length) {
        byte[] spaces = new byte[length];
        Arrays.fill(spaces, (byte) ' ');
        return fromBytes(spaces);
    }

    // ------------------------------------------------------------------------------------------
    // Public Interfaces
    // ------------------------------------------------------------------------------------------

    @Override
    public String toString() {
        byte[] bytes = allocateReuseBytes(sizeInBytes);
        MemorySegmentUtils.copyToBytes(segments, offset, bytes, 0, sizeInBytes);
        return decodeUTF8(bytes, 0, sizeInBytes);
    }

    /**
     * Compares two strings lexicographically. Since UTF-8 uses groups of six bits, it is sometimes
     * useful to use octal notation which uses 3-bit groups. With a calculator which can convert
     * between hexadecimal and octal it can be easier to manually create or interpret UTF-8 compared
     * with using binary. So we just compare the binary.
     */
    @Override
    public int compareTo(@Nonnull BinaryString other) {
        if (segments.length == 1 && other.segments.length == 1) {
            int len = Math.min(sizeInBytes, other.sizeInBytes);
            MemorySegment seg1 = segments[0];
            MemorySegment seg2 = other.segments[0];

            for (int i = 0; i < len; i++) {
                int res = (seg1.get(offset + i) & 0xFF) - (seg2.get(other.offset + i) & 0xFF);
                if (res != 0) {
                    return res;
                }
            }
            return sizeInBytes - other.sizeInBytes;
        }

        // if there are multi segments.
        return compareMultiSegments(other);
    }

    /** Find the boundaries of segments, and then compare MemorySegment. */
    private int compareMultiSegments(BinaryString other) {

        if (sizeInBytes == 0 || other.sizeInBytes == 0) {
            return sizeInBytes - other.sizeInBytes;
        }

        int len = Math.min(sizeInBytes, other.sizeInBytes);

        MemorySegment seg1 = segments[0];
        MemorySegment seg2 = other.segments[0];

        int segmentSize = segments[0].size();
        int otherSegmentSize = other.segments[0].size();

        int sizeOfFirst1 = segmentSize - offset;
        int sizeOfFirst2 = otherSegmentSize - other.offset;

        int varSegIndex1 = 1;
        int varSegIndex2 = 1;

        // find the first segment of this string.
        while (sizeOfFirst1 <= 0) {
            sizeOfFirst1 += segmentSize;
            seg1 = segments[varSegIndex1++];
        }

        while (sizeOfFirst2 <= 0) {
            sizeOfFirst2 += otherSegmentSize;
            seg2 = other.segments[varSegIndex2++];
        }

        int offset1 = segmentSize - sizeOfFirst1;
        int offset2 = otherSegmentSize - sizeOfFirst2;

        int needCompare = Math.min(Math.min(sizeOfFirst1, sizeOfFirst2), len);

        while (needCompare > 0) {
            // compare in one segment.
            for (int i = 0; i < needCompare; i++) {
                int res = (seg1.get(offset1 + i) & 0xFF) - (seg2.get(offset2 + i) & 0xFF);
                if (res != 0) {
                    return res;
                }
            }
            if (needCompare == len) {
                break;
            }
            len -= needCompare;
            // next segment
            if (sizeOfFirst1 < sizeOfFirst2) { // I am smaller
                seg1 = segments[varSegIndex1++];
                offset1 = 0;
                offset2 += needCompare;
                sizeOfFirst1 = segmentSize;
                sizeOfFirst2 -= needCompare;
            } else if (sizeOfFirst1 > sizeOfFirst2) { // other is smaller
                seg2 = other.segments[varSegIndex2++];
                offset2 = 0;
                offset1 += needCompare;
                sizeOfFirst2 = otherSegmentSize;
                sizeOfFirst1 -= needCompare;
            } else { // same, should go ahead both.
                seg1 = segments[varSegIndex1++];
                seg2 = other.segments[varSegIndex2++];
                offset1 = 0;
                offset2 = 0;
                sizeOfFirst1 = segmentSize;
                sizeOfFirst2 = otherSegmentSize;
            }
            needCompare = Math.min(Math.min(sizeOfFirst1, sizeOfFirst2), len);
        }

        checkArgument(needCompare == len);

        return sizeInBytes - other.sizeInBytes;
    }

    // ------------------------------------------------------------------------------------------
    // Public methods on BinaryString
    // ------------------------------------------------------------------------------------------

    /** Returns the number of UTF-8 code points in the string. */
    public int numChars() {
        if (inFirstSegment()) {
            int len = 0;
            for (int i = 0; i < sizeInBytes; i += numBytesForFirstByte(getByteOneSegment(i))) {
                len++;
            }
            return len;
        } else {
            return numCharsMultiSegs();
        }
    }

    private int numCharsMultiSegs() {
        int len = 0;
        int segSize = segments[0].size();
        BinaryString.SegmentAndOffset index = firstSegmentAndOffset(segSize);
        int i = 0;
        while (i < sizeInBytes) {
            int charBytes = numBytesForFirstByte(index.value());
            i += charBytes;
            len++;
            index.skipBytes(charBytes, segSize);
        }
        return len;
    }

    /**
     * Returns the {@code byte} value at the specified index. An index ranges from {@code 0} to
     * {@code sizeInBytes - 1}.
     *
     * @param index the index of the {@code byte} value.
     * @return the {@code byte} value at the specified index of this UTF-8 bytes.
     * @exception IndexOutOfBoundsException if the {@code index} argument is negative or not less
     *     than the length of this UTF-8 bytes.
     */
    public byte byteAt(int index) {
        int globalOffset = offset + index;
        int size = segments[0].size();
        if (globalOffset < size) {
            return segments[0].get(globalOffset);
        } else {
            return segments[globalOffset / size].get(globalOffset % size);
        }
    }

    /** Copy a new {@code BinaryString}. */
    public BinaryString copy() {
        byte[] copy = MemorySegmentUtils.copyToBytes(segments, offset, sizeInBytes);
        return BinaryString.fromBytes(copy);
    }

    /**
     * Returns a binary string that is a substring of this binary string. The substring begins at
     * the specified {@code beginIndex} and extends to the character at index {@code endIndex - 1}.
     *
     * Examples:
     *
     * 

     *
     *      * fromString("hamburger").substring(4, 8) returns binary string "urge"
     * fromString("smiles").substring(1, 5) returns binary string "mile"
     * 
     *
     * 
     *
     * @param beginIndex the beginning index, inclusive.
     * @param endIndex the ending index, exclusive.
     * @return the specified substring, return EMPTY_UTF8 when index out of bounds instead of
     *     StringIndexOutOfBoundsException.
     */
    public BinaryString substring(int beginIndex, int endIndex) {
        if (endIndex <= beginIndex || beginIndex >= sizeInBytes) {
            return EMPTY_UTF8;
        }
        if (inFirstSegment()) {
            MemorySegment segment = segments[0];
            int i = 0;
            int c = 0;
            while (i < sizeInBytes && c < beginIndex) {
                i += numBytesForFirstByte(segment.get(i + offset));
                c += 1;
            }

            int j = i;
            while (i < sizeInBytes && c < endIndex) {
                i += numBytesForFirstByte(segment.get(i + offset));
                c += 1;
            }

            if (i > j) {
                byte[] bytes = new byte[i - j];
                segment.get(offset + j, bytes, 0, i - j);
                return fromBytes(bytes);
            } else {
                return EMPTY_UTF8;
            }
        } else {
            return substringMultiSegs(beginIndex, endIndex);
        }
    }

    private BinaryString substringMultiSegs(final int start, final int until) {
        int segSize = segments[0].size();
        BinaryString.SegmentAndOffset index = firstSegmentAndOffset(segSize);
        int i = 0;
        int c = 0;
        while (i < sizeInBytes && c < start) {
            int charSize = numBytesForFirstByte(index.value());
            i += charSize;
            index.skipBytes(charSize, segSize);
            c += 1;
        }

        int j = i;
        while (i < sizeInBytes && c < until) {
            int charSize = numBytesForFirstByte(index.value());
            i += charSize;
            index.skipBytes(charSize, segSize);
            c += 1;
        }

        if (i > j) {
            return fromBytes(MemorySegmentUtils.copyToBytes(segments, offset + j, i - j));
        } else {
            return EMPTY_UTF8;
        }
    }

    /**
     * Returns true if and only if this BinaryString contains the specified sequence of bytes
     * values.
     *
     * @param s the sequence to search for
     * @return true if this BinaryString contains {@code s}, false otherwise
     */
    public boolean contains(final BinaryString s) {
        if (s.sizeInBytes == 0) {
            return true;
        }
        int find =
                MemorySegmentUtils.find(
                        segments, offset, sizeInBytes, s.segments, s.offset, s.sizeInBytes);
        return find != -1;
    }

    /**
     * Tests if this BinaryString starts with the specified prefix.
     *
     * @param prefix the prefix.
     * @return {@code true} if the bytes represented by the argument is a prefix of the bytes
     *     represented by this string; {@code false} otherwise. Note also that {@code true} will be
     *     returned if the argument is an empty BinaryString or is equal to this {@code
     *     BinaryString} object as determined by the {@link #equals(Object)} method.
     */
    public boolean startsWith(final BinaryString prefix) {
        return matchAt(prefix, 0);
    }

    /**
     * Tests if this BinaryString ends with the specified suffix.
     *
     * @param suffix the suffix.
     * @return {@code true} if the bytes represented by the argument is a suffix of the bytes
     *     represented by this object; {@code false} otherwise. Note that the result will be {@code
     *     true} if the argument is the empty string or is equal to this {@code BinaryString} object
     *     as determined by the {@link #equals(Object)} method.
     */
    public boolean endsWith(final BinaryString suffix) {
        return matchAt(suffix, sizeInBytes - suffix.sizeInBytes);
    }

    /**
     * Returns a string whose value is this string, with any leading and trailing whitespace
     * removed.
     *
     * @return A string whose value is this string, with any leading and trailing white space
     *     removed, or this string if it has no leading or trailing white space.
     */
    public BinaryString trim() {
        if (inFirstSegment()) {
            int s = 0;
            int e = this.sizeInBytes - 1;
            // skip all of the space (0x20) in the left side
            while (s < this.sizeInBytes && getByteOneSegment(s) == 0x20) {
                s++;
            }
            // skip all of the space (0x20) in the right side
            while (e >= s && getByteOneSegment(e) == 0x20) {
                e--;
            }
            if (s > e) {
                // empty string
                return EMPTY_UTF8;
            } else {
                return copyBinaryStringInOneSeg(s, e - s + 1);
            }
        } else {
            return trimMultiSegs();
        }
    }

    private BinaryString trimMultiSegs() {
        int s = 0;
        int e = this.sizeInBytes - 1;
        int segSize = segments[0].size();
        BinaryString.SegmentAndOffset front = firstSegmentAndOffset(segSize);
        // skip all of the space (0x20) in the left side
        while (s < this.sizeInBytes && front.value() == 0x20) {
            s++;
            front.nextByte(segSize);
        }
        BinaryString.SegmentAndOffset behind = lastSegmentAndOffset(segSize);
        // skip all of the space (0x20) in the right side
        while (e >= s && behind.value() == 0x20) {
            e--;
            behind.previousByte(segSize);
        }
        if (s > e) {
            // empty string
            return EMPTY_UTF8;
        } else {
            return copyBinaryString(s, e);
        }
    }

    /**
     * Returns the index within this string of the first occurrence of the specified substring,
     * starting at the specified index.
     *
     * @param str the substring to search for.
     * @param fromIndex the index from which to start the search.
     * @return the index of the first occurrence of the specified substring, starting at the
     *     specified index, or {@code -1} if there is no such occurrence.
     */
    public int indexOf(BinaryString str, int fromIndex) {
        if (str.sizeInBytes == 0) {
            return 0;
        }
        if (inFirstSegment()) {
            // position in byte
            int byteIdx = 0;
            // position is char
            int charIdx = 0;
            while (byteIdx < sizeInBytes && charIdx < fromIndex) {
                byteIdx += numBytesForFirstByte(getByteOneSegment(byteIdx));
                charIdx++;
            }
            do {
                if (byteIdx + str.sizeInBytes > sizeInBytes) {
                    return -1;
                }
                if (MemorySegmentUtils.equals(
                        segments, offset + byteIdx, str.segments, str.offset, str.sizeInBytes)) {
                    return charIdx;
                }
                byteIdx += numBytesForFirstByte(getByteOneSegment(byteIdx));
                charIdx++;
            } while (byteIdx < sizeInBytes);

            return -1;
        } else {
            return indexOfMultiSegs(str, fromIndex);
        }
    }

    private int indexOfMultiSegs(BinaryString str, int fromIndex) {
        // position in byte
        int byteIdx = 0;
        // position is char
        int charIdx = 0;
        int segSize = segments[0].size();
        BinaryString.SegmentAndOffset index = firstSegmentAndOffset(segSize);
        while (byteIdx < sizeInBytes && charIdx < fromIndex) {
            int charBytes = numBytesForFirstByte(index.value());
            byteIdx += charBytes;
            charIdx++;
            index.skipBytes(charBytes, segSize);
        }
        do {
            if (byteIdx + str.sizeInBytes > sizeInBytes) {
                return -1;
            }
            if (MemorySegmentUtils.equals(
                    segments, offset + byteIdx, str.segments, str.offset, str.sizeInBytes)) {
                return charIdx;
            }
            int charBytes = numBytesForFirstByte(index.segment.get(index.offset));
            byteIdx += charBytes;
            charIdx++;
            index.skipBytes(charBytes, segSize);
        } while (byteIdx < sizeInBytes);

        return -1;
    }

    /**
     * Converts all of the characters in this {@code BinaryString} to upper case.
     *
     * @return the {@code BinaryString}, converted to uppercase.
     */
    public BinaryString toUpperCase() {
        if (sizeInBytes == 0) {
            return EMPTY_UTF8;
        }
        int size = segments[0].size();
        BinaryString.SegmentAndOffset segmentAndOffset = startSegmentAndOffset(size);
        byte[] bytes = new byte[sizeInBytes];
        bytes[0] = (byte) Character.toTitleCase(segmentAndOffset.value());
        for (int i = 0; i < sizeInBytes; i++) {
            byte b = segmentAndOffset.value();
            if (numBytesForFirstByte(b) != 1) {
                // fallback
                return javaToUpperCase();
            }
            int upper = Character.toUpperCase((int) b);
            if (upper > 127) {
                // fallback
                return javaToUpperCase();
            }
            bytes[i] = (byte) upper;
            segmentAndOffset.nextByte(size);
        }
        return fromBytes(bytes);
    }

    private BinaryString javaToUpperCase() {
        return fromString(toString().toUpperCase());
    }

    /**
     * Converts all of the characters in this {@code BinaryString} to lower case.
     *
     * @return the {@code BinaryString}, converted to lowercase.
     */
    public BinaryString toLowerCase() {
        if (sizeInBytes == 0) {
            return EMPTY_UTF8;
        }
        int size = segments[0].size();
        BinaryString.SegmentAndOffset segmentAndOffset = startSegmentAndOffset(size);
        byte[] bytes = new byte[sizeInBytes];
        bytes[0] = (byte) Character.toTitleCase(segmentAndOffset.value());
        for (int i = 0; i < sizeInBytes; i++) {
            byte b = segmentAndOffset.value();
            if (numBytesForFirstByte(b) != 1) {
                // fallback
                return javaToLowerCase();
            }
            int lower = Character.toLowerCase((int) b);
            if (lower > 127) {
                // fallback
                return javaToLowerCase();
            }
            bytes[i] = (byte) lower;
            segmentAndOffset.nextByte(size);
        }
        return fromBytes(bytes);
    }

    private BinaryString javaToLowerCase() {
        return fromString(toString().toLowerCase());
    }

    // ------------------------------------------------------------------------------------------
    // Internal methods on BinaryString
    // ------------------------------------------------------------------------------------------

    byte getByteOneSegment(int i) {
        return segments[0].get(offset + i);
    }

    boolean inFirstSegment() {
        return sizeInBytes + offset <= segments[0].size();
    }

    private boolean matchAt(final BinaryString s, int pos) {
        return (inFirstSegment() && s.inFirstSegment())
                ? matchAtOneSeg(s, pos)
                : matchAtVarSeg(s, pos);
    }

    private boolean matchAtOneSeg(final BinaryString s, int pos) {
        return s.sizeInBytes + pos <= sizeInBytes
                && pos >= 0
                && segments[0].equalTo(s.segments[0], offset + pos, s.offset, s.sizeInBytes);
    }

    private boolean matchAtVarSeg(final BinaryString s, int pos) {
        return s.sizeInBytes + pos <= sizeInBytes
                && pos >= 0
                && MemorySegmentUtils.equals(
                        segments, offset + pos, s.segments, s.offset, s.sizeInBytes);
    }

    BinaryString copyBinaryStringInOneSeg(int start, int len) {
        byte[] newBytes = new byte[len];
        segments[0].get(offset + start, newBytes, 0, len);
        return fromBytes(newBytes);
    }

    BinaryString copyBinaryString(int start, int end) {
        int len = end - start + 1;
        byte[] newBytes = new byte[len];
        MemorySegmentUtils.copyToBytes(segments, offset + start, newBytes, 0, len);
        return fromBytes(newBytes);
    }

    BinaryString.SegmentAndOffset firstSegmentAndOffset(int segSize) {
        int segIndex = offset / segSize;
        return new BinaryString.SegmentAndOffset(segIndex, offset % segSize);
    }

    BinaryString.SegmentAndOffset lastSegmentAndOffset(int segSize) {
        int lastOffset = offset + sizeInBytes - 1;
        int segIndex = lastOffset / segSize;
        return new BinaryString.SegmentAndOffset(segIndex, lastOffset % segSize);
    }

    private BinaryString.SegmentAndOffset startSegmentAndOffset(int segSize) {
        return inFirstSegment()
                ? new BinaryString.SegmentAndOffset(0, offset)
                : firstSegmentAndOffset(segSize);
    }

    /** CurrentSegment and positionInSegment. */
    class SegmentAndOffset {
        int segIndex;
        MemorySegment segment;
        int offset;

        private SegmentAndOffset(int segIndex, int offset) {
            this.segIndex = segIndex;
            this.segment = segments[segIndex];
            this.offset = offset;
        }

        private void assignSegment() {
            segment = segIndex >= 0 && segIndex < segments.length ? segments[segIndex] : null;
        }

        void previousByte(int segSize) {
            offset--;
            if (offset == -1) {
                segIndex--;
                assignSegment();
                offset = segSize - 1;
            }
        }

        void nextByte(int segSize) {
            offset++;
            checkAdvance(segSize);
        }

        private void checkAdvance(int segSize) {
            if (offset == segSize) {
                advance();
            }
        }

        private void advance() {
            segIndex++;
            assignSegment();
            offset = 0;
        }

        void skipBytes(int n, int segSize) {
            int remaining = segSize - this.offset;
            if (remaining > n) {
                this.offset += n;
            } else {
                while (true) {
                    int toSkip = Math.min(remaining, n);
                    n -= toSkip;
                    if (n <= 0) {
                        this.offset += toSkip;
                        checkAdvance(segSize);
                        return;
                    }
                    advance();
                    remaining = segSize - this.offset;
                }
            }
        }

        byte value() {
            return this.segment.get(this.offset);
        }
    }

    /**
     * Returns the number of bytes for a code point with the first byte as `b`.
     *
     * @param b The first byte of a code point
     */
    static int numBytesForFirstByte(final byte b) {
        if (b >= 0) {
            // 1 byte, 7 bits: 0xxxxxxx
            return 1;
        } else if ((b >> 5) == -2 && (b & 0x1e) != 0) {
            // 2 bytes, 11 bits: 110xxxxx 10xxxxxx
            return 2;
        } else if ((b >> 4) == -2) {
            // 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
            return 3;
        } else if ((b >> 3) == -2) {
            // 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            return 4;
        } else {
            // Skip the first byte disallowed in UTF-8
            // Handling errors quietly, same semantics to java String.
            return 1;
        }
    }

    // ------------------------------------------------------------------------
    // UTF-8 encoding and decoding (High-speed version, 30% faster+)
    // ------------------------------------------------------------------------

    private static final int MAX_BYTES_PER_CHAR = 3;

    /** This method must have the same result with JDK's String.getBytes. */
    public static byte[] encodeUTF8(String str) {
        byte[] bytes = allocateReuseBytes(str.length() * MAX_BYTES_PER_CHAR);
        int len = encodeUTF8(str, bytes);
        return Arrays.copyOf(bytes, len);
    }

    public static int encodeUTF8(String str, byte[] bytes) {
        int offset = 0;
        int len = str.length();
        int sl = offset + len;
        int dp = 0;
        int dlASCII = dp + Math.min(len, bytes.length);

        // ASCII only optimized loop
        while (dp < dlASCII && str.charAt(offset) < '\u0080') {
            bytes[dp++] = (byte) str.charAt(offset++);
        }

        while (offset < sl) {
            char c = str.charAt(offset++);
            if (c < 0x80) {
                // Have at most seven bits
                bytes[dp++] = (byte) c;
            } else if (c < 0x800) {
                // 2 bytes, 11 bits
                bytes[dp++] = (byte) (0xc0 | (c >> 6));
                bytes[dp++] = (byte) (0x80 | (c & 0x3f));
            } else if (Character.isSurrogate(c)) {
                final int uc;
                int ip = offset - 1;
                if (Character.isHighSurrogate(c)) {
                    if (sl - ip < 2) {
                        uc = -1;
                    } else {
                        char d = str.charAt(ip + 1);
                        if (Character.isLowSurrogate(d)) {
                            uc = Character.toCodePoint(c, d);
                        } else {
                            // for some illegal character
                            // the jdk will ignore the origin character and cast it to '?'
                            // this acts the same with jdk
                            return defaultEncodeUTF8(str, bytes);
                        }
                    }
                } else {
                    if (Character.isLowSurrogate(c)) {
                        // for some illegal character
                        // the jdk will ignore the origin character and cast it to '?'
                        // this acts the same with jdk
                        return defaultEncodeUTF8(str, bytes);
                    } else {
                        uc = c;
                    }
                }

                if (uc < 0) {
                    bytes[dp++] = (byte) '?';
                } else {
                    bytes[dp++] = (byte) (0xf0 | ((uc >> 18)));
                    bytes[dp++] = (byte) (0x80 | ((uc >> 12) & 0x3f));
                    bytes[dp++] = (byte) (0x80 | ((uc >> 6) & 0x3f));
                    bytes[dp++] = (byte) (0x80 | (uc & 0x3f));
                    offset++; // 2 chars
                }
            } else {
                // 3 bytes, 16 bits
                bytes[dp++] = (byte) (0xe0 | ((c >> 12)));
                bytes[dp++] = (byte) (0x80 | ((c >> 6) & 0x3f));
                bytes[dp++] = (byte) (0x80 | (c & 0x3f));
            }
        }
        return dp;
    }

    public static int defaultEncodeUTF8(String str, byte[] bytes) {
        try {
            byte[] buffer = str.getBytes("UTF-8");
            System.arraycopy(buffer, 0, bytes, 0, buffer.length);
            return buffer.length;
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException("encodeUTF8 error", e);
        }
    }

    public static String decodeUTF8(byte[] input, int offset, int byteLen) {
        char[] chars = allocateReuseChars(byteLen);
        int len = decodeUTF8Strict(input, offset, byteLen, chars);
        if (len < 0) {
            return defaultDecodeUTF8(input, offset, byteLen);
        }
        return new String(chars, 0, len);
    }

    public static int decodeUTF8Strict(byte[] sa, int sp, int len, char[] da) {
        final int sl = sp + len;
        int dp = 0;
        int dlASCII = Math.min(len, da.length);

        // ASCII only optimized loop
        while (dp < dlASCII && sa[sp] >= 0) {
            da[dp++] = (char) sa[sp++];
        }

        while (sp < sl) {
            int b1 = sa[sp++];
            if (b1 >= 0) {
                // 1 byte, 7 bits: 0xxxxxxx
                da[dp++] = (char) b1;
            } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) {
                // 2 bytes, 11 bits: 110xxxxx 10xxxxxx
                if (sp < sl) {
                    int b2 = sa[sp++];
                    if ((b2 & 0xc0) != 0x80) { // isNotContinuation(b2)
                        return -1;
                    } else {
                        da[dp++] = (char) (((b1 << 6) ^ b2) ^ (((byte) 0xC0 << 6) ^ ((byte) 0x80)));
                    }
                    continue;
                }
                return -1;
            } else if ((b1 >> 4) == -2) {
                // 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
                if (sp + 1 < sl) {
                    int b2 = sa[sp++];
                    int b3 = sa[sp++];
                    if ((b1 == (byte) 0xe0 && (b2 & 0xe0) == 0x80)
                            || (b2 & 0xc0) != 0x80
                            || (b3 & 0xc0) != 0x80) { // isMalformed3(b1, b2, b3)
                        return -1;
                    } else {
                        char c =
                                (char)
                                        ((b1 << 12)
                                                ^ (b2 << 6)
                                                ^ (b3
                                                        ^ (((byte) 0xE0 << 12)
                                                                ^ ((byte) 0x80 << 6)
                                                                ^ ((byte) 0x80))));
                        if (Character.isSurrogate(c)) {
                            return -1;
                        } else {
                            da[dp++] = c;
                        }
                    }
                    continue;
                }
                return -1;
            } else if ((b1 >> 3) == -2) {
                // 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                if (sp + 2 < sl) {
                    int b2 = sa[sp++];
                    int b3 = sa[sp++];
                    int b4 = sa[sp++];
                    int uc =
                            ((b1 << 18)
                                    ^ (b2 << 12)
                                    ^ (b3 << 6)
                                    ^ (b4
                                            ^ (((byte) 0xF0 << 18)
                                                    ^ ((byte) 0x80 << 12)
                                                    ^ ((byte) 0x80 << 6)
                                                    ^ ((byte) 0x80))));
                    // isMalformed4 and shortest form check
                    if (((b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 || (b4 & 0xc0) != 0x80)
                            || !Character.isSupplementaryCodePoint(uc)) {
                        return -1;
                    } else {
                        da[dp++] = Character.highSurrogate(uc);
                        da[dp++] = Character.lowSurrogate(uc);
                    }
                    continue;
                }
                return -1;
            } else {
                return -1;
            }
        }
        return dp;
    }

    public static String decodeUTF8(MemorySegment input, int offset, int byteLen) {
        char[] chars = allocateReuseChars(byteLen);
        int len = decodeUTF8Strict(input, offset, byteLen, chars);
        if (len < 0) {
            byte[] bytes = allocateReuseBytes(byteLen);
            input.get(offset, bytes, 0, byteLen);
            return defaultDecodeUTF8(bytes, 0, byteLen);
        }
        return new String(chars, 0, len);
    }

    public static int decodeUTF8Strict(MemorySegment segment, int sp, int len, char[] da) {
        final int sl = sp + len;
        int dp = 0;
        int dlASCII = Math.min(len, da.length);

        // ASCII only optimized loop
        while (dp < dlASCII && segment.get(sp) >= 0) {
            da[dp++] = (char) segment.get(sp++);
        }

        while (sp < sl) {
            int b1 = segment.get(sp++);
            if (b1 >= 0) {
                // 1 byte, 7 bits: 0xxxxxxx
                da[dp++] = (char) b1;
            } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) {
                // 2 bytes, 11 bits: 110xxxxx 10xxxxxx
                if (sp < sl) {
                    int b2 = segment.get(sp++);
                    if ((b2 & 0xc0) != 0x80) { // isNotContinuation(b2)
                        return -1;
                    } else {
                        da[dp++] = (char) (((b1 << 6) ^ b2) ^ (((byte) 0xC0 << 6) ^ ((byte) 0x80)));
                    }
                    continue;
                }
                return -1;
            } else if ((b1 >> 4) == -2) {
                // 3 bytes, 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
                if (sp + 1 < sl) {
                    int b2 = segment.get(sp++);
                    int b3 = segment.get(sp++);
                    if ((b1 == (byte) 0xe0 && (b2 & 0xe0) == 0x80)
                            || (b2 & 0xc0) != 0x80
                            || (b3 & 0xc0) != 0x80) { // isMalformed3(b1, b2, b3)
                        return -1;
                    } else {
                        char c =
                                (char)
                                        ((b1 << 12)
                                                ^ (b2 << 6)
                                                ^ (b3
                                                        ^ (((byte) 0xE0 << 12)
                                                                ^ ((byte) 0x80 << 6)
                                                                ^ ((byte) 0x80))));
                        if (Character.isSurrogate(c)) {
                            return -1;
                        } else {
                            da[dp++] = c;
                        }
                    }
                    continue;
                }
                return -1;
            } else if ((b1 >> 3) == -2) {
                // 4 bytes, 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                if (sp + 2 < sl) {
                    int b2 = segment.get(sp++);
                    int b3 = segment.get(sp++);
                    int b4 = segment.get(sp++);
                    int uc =
                            ((b1 << 18)
                                    ^ (b2 << 12)
                                    ^ (b3 << 6)
                                    ^ (b4
                                            ^ (((byte) 0xF0 << 18)
                                                    ^ ((byte) 0x80 << 12)
                                                    ^ ((byte) 0x80 << 6)
                                                    ^ ((byte) 0x80))));
                    // isMalformed4 and shortest form check
                    if (((b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 || (b4 & 0xc0) != 0x80)
                            || !Character.isSupplementaryCodePoint(uc)) {
                        return -1;
                    } else {
                        da[dp++] = Character.highSurrogate(uc);
                        da[dp++] = Character.lowSurrogate(uc);
                    }
                    continue;
                }
                return -1;
            } else {
                return -1;
            }
        }
        return dp;
    }

    public static String defaultDecodeUTF8(byte[] bytes, int offset, int len) {
        return new String(bytes, offset, len, StandardCharsets.UTF_8);
    }
}