All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.oracle.truffle.api.strings.JCodingsImpl Maven / Gradle / Ivy

/*
 * Copyright (c) 2021, 2022, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * The Universal Permissive License (UPL), Version 1.0
 *
 * Subject to the condition set forth below, permission is hereby granted to any
 * person obtaining a copy of this software, associated documentation and/or
 * data (collectively the "Software"), free of charge and under any and all
 * copyright rights in the Software, and any and all patent rights owned or
 * freely licensable by each licensor hereunder covering either (i) the
 * unmodified Software as contributed to or provided by such licensor, or (ii)
 * the Larger Works (as defined below), to deal in both
 *
 * (a) the Software, and
 *
 * (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if
 * one is included with the Software each a "Larger Work" to which the Software
 * is contributed by such licensors),
 *
 * without restriction, including without limitation the rights to copy, create
 * derivative works of, display, perform, and distribute the Software and make,
 * use, sell, offer for sale, import, export, have made, and have sold the
 * Software and the Larger Work(s), and to sublicense the foregoing rights on
 * either these or other terms.
 *
 * This license is subject to the following condition:
 *
 * The above copyright notice and either this complete permission notice or at a
 * minimum a reference to the UPL must be included in all copies or substantial
 * portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
package com.oracle.truffle.api.strings;

import static com.oracle.truffle.api.CompilerDirectives.CompilationFinal;
import static com.oracle.truffle.api.strings.AbstractTruffleString.checkArrayRange;
import static com.oracle.truffle.api.strings.TStringGuards.isStride0;
import static com.oracle.truffle.api.strings.TStringGuards.isStride1;
import static com.oracle.truffle.api.strings.TStringGuards.isUTF16;
import static com.oracle.truffle.api.strings.TStringGuards.isUTF16Or32;
import static com.oracle.truffle.api.strings.TStringGuards.isUTF32;
import static com.oracle.truffle.api.strings.TStringGuards.isUTF8;
import static com.oracle.truffle.api.strings.TruffleString.ErrorHandling;

import java.util.Arrays;

import org.graalvm.collections.EconomicMap;
import org.graalvm.shadowed.org.jcodings.EncodingDB;
import org.graalvm.shadowed.org.jcodings.Ptr;
import org.graalvm.shadowed.org.jcodings.transcode.EConv;
import org.graalvm.shadowed.org.jcodings.transcode.EConvFlags;
import org.graalvm.shadowed.org.jcodings.transcode.EConvResult;
import org.graalvm.shadowed.org.jcodings.transcode.TranscoderDB;
import org.graalvm.shadowed.org.jcodings.util.CaseInsensitiveBytesHash;

import com.oracle.truffle.api.CompilerDirectives;
import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary;
import com.oracle.truffle.api.nodes.Node;
import com.oracle.truffle.api.profiles.BranchProfile;
import com.oracle.truffle.api.profiles.ConditionProfile;

final class JCodingsImpl implements JCodings {

    private static final class EncodingWrapper implements Encoding {
        private final org.graalvm.shadowed.org.jcodings.Encoding encoding;

        private EncodingWrapper(org.graalvm.shadowed.org.jcodings.Encoding encoding) {
            this.encoding = encoding;
        }
    }

    private static final int MAX_J_CODINGS_INDEX_VALUE = 0x7f;

    @CompilationFinal private static final EconomicMap J_CODINGS_MAP = createJCodingsMap();

    @TruffleBoundary
    private static EconomicMap createJCodingsMap() {
        CaseInsensitiveBytesHash encodings = EncodingDB.getEncodings();
        if (encodings.size() > MAX_J_CODINGS_INDEX_VALUE) {
            throw new RuntimeException(String.format("Assumption broken: org.graalvm.shadowed.org.jcodings has more than %d encodings (actual: %d)!", MAX_J_CODINGS_INDEX_VALUE, encodings.size()));
        }
        EconomicMap allEncodings = EconomicMap.create(encodings.size());
        for (EncodingDB.Entry entry : encodings) {
            org.graalvm.shadowed.org.jcodings.Encoding enc = entry.getEncoding();
            int i = enc.getIndex();
            if (i < 0 || i >= encodings.size()) {
                throw new RuntimeException(String.format(
                                "Assumption broken: index of org.graalvm.shadowed.org.jcodings encoding \"%s\" is greater than number of encodings (index: %d, number of encodings: %d)!", enc, i,
                                encodings.size()));
            }
            allEncodings.put(toEnumName(enc.toString()), new EncodingWrapper(enc));
        }
        return allEncodings;
    }

    @Override
    public Encoding get(String encodingName) {
        return J_CODINGS_MAP.get(encodingName);
    }

    @Override
    public Encoding get(TruffleString.Encoding encoding) {
        return encoding.jCoding;
    }

    @Override
    public String name(Encoding jCoding) {
        return unwrap(jCoding).toString();
    }

    @Override
    public int minLength(Encoding jCoding) {
        return unwrap(jCoding).minLength();
    }

    @Override
    public int maxLength(Encoding jCoding) {
        return unwrap(jCoding).maxLength();
    }

    @Override
    public boolean isFixedWidth(Encoding jCoding) {
        return unwrap(jCoding).isFixedWidth();
    }

    @Override
    public boolean isSingleByte(Encoding jCoding) {
        return unwrap(jCoding).isSingleByte();
    }

    @Override
    @TruffleBoundary
    public int getCodePointLength(Encoding jCoding, int codepoint) {
        return unwrap(jCoding).codeToMbcLength(codepoint);
    }

    @Override
    @TruffleBoundary
    public int getPreviousCodePointIndex(Encoding jCoding, byte[] array, int arrayBegin, int index, int arrayEnd) {
        return unwrap(jCoding).prevCharHead(array, arrayBegin, index, arrayEnd);
    }

    @Override
    @TruffleBoundary
    public int getCodePointLength(Encoding jCoding, byte[] array, int index, int arrayLength) {
        return unwrap(jCoding).length(array, index, arrayLength);
    }

    @Override
    @TruffleBoundary
    public int readCodePoint(Encoding jCoding, byte[] array, int index, int arrayEnd) {
        return unwrap(jCoding).mbcToCode(array, index, arrayEnd);
    }

    @Override
    @TruffleBoundary
    public int writeCodePoint(Encoding jCoding, int codepoint, byte[] array, int index) {
        return unwrap(jCoding).codeToMbc(codepoint, array, index);
    }

    @Override
    @TruffleBoundary
    public int codePointIndexToRaw(Node location, AbstractTruffleString a, byte[] arrayA, int extraOffsetRaw, int index, boolean isLength, Encoding jCoding) {
        if (isFixedWidth(jCoding)) {
            return index * minLength(jCoding);
        }
        int offset = a.byteArrayOffset() + extraOffsetRaw;
        int end = a.byteArrayOffset() + a.length();
        int cpi = 0;
        int i = 0;
        while (i < a.length() - extraOffsetRaw) {
            if (cpi == index) {
                return i;
            }
            int length = unwrap(jCoding).length(arrayA, offset + i, end);
            if (length < 1) {
                if (length < -1) {
                    // broken multibyte codepoint at end of string
                    if (isLength) {
                        return a.length() - extraOffsetRaw;
                    } else {
                        throw InternalErrors.indexOutOfBounds();
                    }
                } else {
                    i++;
                }
            } else {
                i += length;
            }
            cpi++;
            TStringConstants.truffleSafePointPoll(location, cpi);
        }
        return TStringInternalNodes.CodePointIndexToRawNode.atEnd(a, extraOffsetRaw, index, isLength, cpi);
    }

    @Override
    public int decode(AbstractTruffleString a, byte[] arrayA, int rawIndex, Encoding jCoding, ErrorHandling errorHandling) {
        int p = a.byteArrayOffset() + rawIndex;
        int end = a.byteArrayOffset() + a.length();
        int length = getCodePointLength(jCoding, arrayA, p, end);
        if (length < 1) {
            return Encodings.invalidCodepointReturnValue(errorHandling);
        }
        return readCodePoint(jCoding, arrayA, p, end);
    }

    @Override
    public long calcStringAttributes(Node location, Object array, int offset, int length, TruffleString.Encoding encoding, ConditionProfile validCharacterProfile, ConditionProfile fixedWidthProfile) {
        if (TStringGuards.is7BitCompatible(encoding) && TStringOps.calcStringAttributesLatin1(location, array, offset, length) == TSCodeRange.get7Bit()) {
            return StringAttributes.create(length, TSCodeRange.get7Bit());
        }
        byte[] bytes = JCodings.asByteArray(array);
        int offsetBytes = array instanceof AbstractTruffleString.NativePointer ? offset - ((AbstractTruffleString.NativePointer) array).offset() : offset;
        Encoding enc = get(encoding);
        int codeRange = isSingleByte(enc) ? TSCodeRange.getValidFixedWidth() : TSCodeRange.getValidMultiByte();
        int characters = 0;
        int p = offsetBytes;
        final int end = offsetBytes + length;
        int loopCount = 0;
        for (; p < end; characters++) {
            final int lengthOfCurrentCharacter = getCodePointLength(enc, bytes, p, end);
            if (validCharacterProfile.profile(lengthOfCurrentCharacter > 0 && p + lengthOfCurrentCharacter <= end)) {
                p += lengthOfCurrentCharacter;
            } else {
                codeRange = isSingleByte(enc) ? TSCodeRange.getBrokenFixedWidth() : TSCodeRange.getBrokenMultiByte();
                // If a string is detected as broken, and we already know the character length
                // due to a fixed width encoding, there's no value in visiting any more ptr.
                if (fixedWidthProfile.profile(isFixedWidth(enc))) {
                    characters = (length + minLength(enc) - 1) / minLength(enc);
                    return StringAttributes.create(characters, codeRange);
                } else {
                    p += minLength(enc);
                }
            }
            TStringConstants.truffleSafePointPoll(location, ++loopCount);
        }
        return StringAttributes.create(characters, codeRange);
    }

    @TruffleBoundary
    private static EConv getEconvTranscoder(Encoding jCodingSrc, Encoding jCodingDst) {
        return TranscoderDB.open(unwrap(jCodingSrc).getName(), unwrap(jCodingDst).getName(), EConvFlags.INVALID_REPLACE | EConvFlags.UNDEF_REPLACE);
    }

    @TruffleBoundary
    private static void econvSetReplacement(Encoding jCodingDst, EConv econv, byte[] replacement) {
        econv.setReplacement(replacement, 0, replacement.length, unwrap(jCodingDst).getName());
    }

    @TruffleBoundary
    private static EConvResult econvConvert(byte[] arrayA, byte[] buffer, EConv econv, Ptr srcPtr, Ptr dstPtr, int inStop) {
        return econv.convert(arrayA, srcPtr, inStop, buffer, dstPtr, buffer.length, 0);
    }

    private static final byte[] CONVERSION_REPLACEMENT = {'?'};
    private static final byte[] CONVERSION_REPLACEMENT_UTF_8 = {(byte) 0xEF, (byte) 0xBF, (byte) 0xBD};
    private static final byte[] CONVERSION_REPLACEMENT_UTF_16 = TStringGuards.littleEndian() ? new byte[]{(byte) 0xFD, (byte) 0xFF} : new byte[]{(byte) 0xFF, (byte) 0xFD};
    private static final byte[] CONVERSION_REPLACEMENT_UTF_32 = TStringGuards.littleEndian() ? new byte[]{(byte) 0xFD, (byte) 0xFF, 0, 0} : new byte[]{0, 0, (byte) 0xFF, (byte) 0xFD};

    @Override
    public TruffleString transcode(Node location, AbstractTruffleString a, Object arrayA, int codePointLengthA, TruffleString.Encoding targetEncoding,
                    BranchProfile outOfMemoryProfile,
                    ConditionProfile nativeProfile,
                    TStringInternalNodes.FromBufferWithStringCompactionNode fromBufferWithStringCompactionNode) {
        final TruffleString.Encoding encoding = TruffleString.Encoding.get(a.encoding());
        final JCodings.Encoding jCodingSrc;
        if (isUTF16Or32(encoding) && isStride0(a)) {
            jCodingSrc = TruffleString.Encoding.ISO_8859_1.jCoding;
        } else if (isUTF32(encoding) && isStride1(a)) {
            jCodingSrc = TruffleString.Encoding.UTF_16.jCoding;
        } else {
            jCodingSrc = JCodings.getInstance().get(encoding);
        }
        JCodings.Encoding jCodingDst = JCodings.getInstance().get(targetEncoding);
        byte[] buffer = new byte[(int) Math.min(TStringConstants.MAX_ARRAY_SIZE, ((long) codePointLengthA) * JCodings.getInstance().maxLength(jCodingDst))];
        int length = 0;
        EConv econv = getEconvTranscoder(jCodingSrc, jCodingDst);
        boolean undefinedConversion = false;
        if (econv == null) {
            undefinedConversion = true;
            int loopCount = 0;
            for (int i = 0; i < codePointLengthA; i++) {
                int ret = JCodings.getInstance().writeCodePoint(jCodingDst, isUTF8(targetEncoding) || isUTF16Or32(targetEncoding) ? 0xfffd : '?', buffer, length);
                assert ret > 0;
                length += ret;
                TStringConstants.truffleSafePointPoll(location, ++loopCount);
            }
        } else {
            final byte[] replacement;
            if (isUTF8(targetEncoding)) {
                replacement = CONVERSION_REPLACEMENT_UTF_8;
            } else if (isUTF16(targetEncoding)) {
                replacement = CONVERSION_REPLACEMENT_UTF_16;
            } else if (isUTF32(targetEncoding)) {
                replacement = CONVERSION_REPLACEMENT_UTF_32;
            } else {
                replacement = CONVERSION_REPLACEMENT;
            }
            final Ptr srcPtr = new Ptr();
            final Ptr dstPtr = new Ptr();
            srcPtr.p = a.byteArrayOffset();
            dstPtr.p = 0;
            int inStop = a.byteArrayOffset() + (a.length() << a.stride());
            if (arrayA instanceof AbstractTruffleString.NativePointer) {
                ((AbstractTruffleString.NativePointer) arrayA).materializeByteArray(a, nativeProfile);
            }
            byte[] bytes = JCodings.asByteArray(arrayA);
            EConvResult result = econvConvert(bytes, buffer, econv, srcPtr, dstPtr, inStop);
            while (!result.isFinished()) {
                if (result.isUndefinedConversion()) {
                    undefinedConversion = true;
                    econvSetReplacement(jCodingDst, econv, replacement);
                } else if (result.isDestinationBufferFull()) {
                    if (buffer.length == TStringConstants.MAX_ARRAY_SIZE) {
                        outOfMemoryProfile.enter();
                        throw InternalErrors.outOfMemory();
                    }
                    buffer = Arrays.copyOf(buffer, (int) Math.min(TStringConstants.MAX_ARRAY_SIZE, ((long) buffer.length) << 1));
                } else {
                    throw CompilerDirectives.shouldNotReachHere();
                }
                result = econvConvert(bytes, buffer, econv, srcPtr, dstPtr, inStop);
            }
            length = dstPtr.p;
        }
        checkArrayRange(buffer, 0, length);
        return fromBufferWithStringCompactionNode.execute(
                        buffer, 0, length, targetEncoding, length != buffer.length || targetEncoding.isSupported(), undefinedConversion || a.isMutable());
    }

    @TruffleBoundary
    private static String toEnumName(String encodingName) {
        if ("ASCII-8BIT".equals(encodingName)) {
            return "BYTES";
        }
        String capitalized = encodingName;
        if (Character.isLowerCase(encodingName.charAt(0))) {
            capitalized = Character.toUpperCase(encodingName.charAt(0)) + encodingName.substring(1);
        }
        return capitalized.replace('-', '_');
    }

    private static org.graalvm.shadowed.org.jcodings.Encoding unwrap(Encoding wrapped) {
        return ((EncodingWrapper) wrapped).encoding;
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy