All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.oracle.graal.python.PythonFileDetector Maven / Gradle / Ivy

/*
 * Copyright (c) 2017, 2022, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * The Universal Permissive License (UPL), Version 1.0
 *
 * Subject to the condition set forth below, permission is hereby granted to any
 * person obtaining a copy of this software, associated documentation and/or
 * data (collectively the "Software"), free of charge and under any and all
 * copyright rights in the Software, and any and all patent rights owned or
 * freely licensable by each licensor hereunder covering either (i) the
 * unmodified Software as contributed to or provided by such licensor, or (ii)
 * the Larger Works (as defined below), to deal in both
 *
 * (a) the Software, and
 *
 * (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if
 * one is included with the Software each a "Larger Work" to which the Software
 * is contributed by such licensors),
 *
 * without restriction, including without limitation the rights to copy, create
 * derivative works of, display, perform, and distribute the Software and make,
 * use, sell, offer for sale, import, export, have made, and have sold the
 * Software and the Larger Work(s), and to sublicense the foregoing rights on
 * either these or other terms.
 *
 * This license is subject to the following condition:
 *
 * The above copyright notice and either this complete permission notice or at a
 * minimum a reference to the UPL must be included in all copies or substantial
 * portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
package com.oracle.graal.python;

import static com.oracle.graal.python.nodes.StringLiterals.J_PY_EXTENSION;
import static com.oracle.graal.python.nodes.StringLiterals.T_UTF_UNDERSCORE_8;
import static com.oracle.graal.python.util.PythonUtils.TS_ENCODING;
import static com.oracle.graal.python.util.PythonUtils.toTruffleStringUncached;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.oracle.graal.python.util.CharsetMapping;
import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary;
import com.oracle.truffle.api.TruffleFile;
import com.oracle.truffle.api.strings.TruffleString;

/**
 * Truffle file type detector for Python sources.
 * <p>
 * Reports the Python MIME type for files whose name ends with {@code J_PY_EXTENSION} and
 * determines the source encoding by inspecting an optional UTF-8 byte order mark and an encoding
 * declaration comment in the first two lines of the source.
 */
public final class PythonFileDetector implements TruffleFile.FileTypeDetector {

    /** The three UTF-8 byte-order-mark bytes as they appear after decoding the input as Latin-1. */
    private static final String LATIN_1_DECODED_BOM = new String(new byte[]{(byte) 0xEF, (byte) 0xBB, (byte) 0xBF}, StandardCharsets.ISO_8859_1);
    /** Matches a comment line carrying an encoding declaration; group 1 is the declared name. */
    private static final Pattern CODING_DECLARATION = Pattern.compile("^[ \t\f]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+).*");
    /** Matches a line that holds nothing but optional whitespace and an optional comment. */
    private static final Pattern COMMENT_OR_EMPTY_LINE = Pattern.compile("^[ \t\f]*(?:#.*)?");

    /**
     * Signals that a source declares an encoding that is unknown, or one that conflicts with a
     * UTF-8 byte order mark found at the start of the source.
     */
    public static class InvalidEncodingException extends RuntimeException {
        private static final long serialVersionUID = 1L;

        private final String encodingName;

        /**
         * @param encodingName the offending encoding name; it is embedded in the exception message
         */
        public InvalidEncodingException(String encodingName) {
            super("Invalid or unsupported encoding: " + encodingName);
            this.encodingName = encodingName;
        }

        /**
         * @return the encoding name this exception was constructed with
         */
        public String getEncodingName() {
            return encodingName;
        }
    }

    /**
     * Maps a file to the Python MIME type based purely on its name.
     *
     * @param file the file to classify
     * @return {@code PythonLanguage.MIME_TYPE} if the file name ends with {@code J_PY_EXTENSION},
     *         otherwise {@code null}
     */
    @Override
    public String findMimeType(TruffleFile file) throws IOException {
        String name = file.getName();
        boolean looksLikePython = name != null && name.endsWith(J_PY_EXTENSION);
        return looksLikePython ? PythonLanguage.MIME_TYPE : null;
    }

    /**
     * Detects the encoding of a file, falling back to Latin-1 when the declared encoding is
     * invalid.
     *
     * @param file the file to inspect
     * @return the detected charset, or ISO-8859-1 if the strict detection rejected the declaration
     * @throws IOException if reading the file header fails
     */
    @Override
    public Charset findEncoding(TruffleFile file) throws IOException {
        Charset detected;
        try {
            detected = findEncodingStrict(file);
        } catch (InvalidEncodingException e) {
            // A SyntaxError cannot be raised from here; the parser re-validates the declaration
            // later. Latin-1 is returned so that no decoding error occurs before the parser runs,
            // since Truffle would otherwise default to UTF-8.
            detected = StandardCharsets.ISO_8859_1;
        }
        return detected;
    }

    /**
     * Detects the encoding of a file by reading its header.
     *
     * @param file the file to inspect
     * @return the declared charset, or UTF-8 if nothing is declared
     * @throws IOException if the file cannot be opened or read
     * @throws InvalidEncodingException if the declared encoding is unknown or conflicts with a BOM
     */
    @TruffleBoundary
    public static Charset findEncodingStrict(TruffleFile file) throws IOException {
        // Latin-1 decodes every byte, so the header can be read without decoding failures on
        // non-ASCII content.
        try (BufferedReader headerReader = file.newBufferedReader(StandardCharsets.ISO_8859_1)) {
            return findEncodingStrict(headerReader);
        }
    }

    /**
     * Detects the encoding declared in source text that is already available as a string.
     * <p>
     * NOTE(review): the BOM check compares against the Latin-1 decoding of the BOM bytes, so a BOM
     * is only recognized if the string carries those three characters - confirm against callers.
     *
     * @param source the source text
     * @return the declared charset, or UTF-8 if nothing is declared
     * @throws InvalidEncodingException if the declared encoding is unknown or conflicts with a BOM
     */
    @TruffleBoundary
    public static Charset findEncodingStrict(String source) {
        try (BufferedReader headerReader = new BufferedReader(new StringReader(source))) {
            return findEncodingStrict(headerReader);
        } catch (IOException e) {
            // Not expected when reading from an in-memory string
            throw new RuntimeException(e);
        }
    }

    /**
     * Detects the encoding declared in the first {@code sourceLen} bytes of a byte array.
     *
     * @param source the raw source bytes
     * @param sourceLen the number of bytes, counted from index 0, that belong to the source
     * @return the declared charset, or UTF-8 if nothing is declared
     * @throws InvalidEncodingException if the declared encoding is unknown or conflicts with a BOM
     */
    @TruffleBoundary
    public static Charset findEncodingStrict(byte[] source, int sourceLen) {
        // Latin-1 decodes every byte, so the header can be read without decoding failures on
        // non-ASCII content.
        try (BufferedReader headerReader = new BufferedReader(new InputStreamReader(new ByteArrayInputStream(source, 0, sourceLen), StandardCharsets.ISO_8859_1))) {
            return findEncodingStrict(headerReader);
        } catch (IOException e) {
            // Not expected when reading from an in-memory byte array
            throw new RuntimeException(e);
        }
    }

    /**
     * Detects the encoding from the first two lines supplied by {@code reader}.
     * <p>
     * A leading BOM (in its Latin-1 decoded form) is stripped from the first line. The second line
     * is consulted only when the first line is blank or a plain comment.
     *
     * @param reader the reader positioned at the start of the source; at most two lines are read
     * @return the declared charset, or UTF-8 if nothing is declared
     * @throws IOException if reading from {@code reader} fails
     * @throws InvalidEncodingException if the declared encoding is unknown or conflicts with a BOM
     */
    @TruffleBoundary
    public static Charset findEncodingStrict(BufferedReader reader) throws IOException {
        // Only the first two lines are considered, like CPython
        String header = reader.readLine();
        if (header == null) {
            return StandardCharsets.UTF_8;
        }

        boolean bomSeen = header.startsWith(LATIN_1_DECODED_BOM);
        if (bomSeen) {
            header = header.substring(LATIN_1_DECODED_BOM.length());
        }

        Charset fromFirstLine = declaredCharset(header, bomSeen);
        if (fromFirstLine != null) {
            return fromFirstLine;
        }

        // A declaration on the second line only counts when the first line carries no code
        if (!COMMENT_OR_EMPTY_LINE.matcher(header).matches()) {
            return StandardCharsets.UTF_8;
        }
        Charset fromSecondLine = declaredCharset(reader.readLine(), bomSeen);
        return fromSecondLine != null ? fromSecondLine : StandardCharsets.UTF_8;
    }

    /**
     * Extracts and resolves the encoding declared on a single line.
     *
     * @param line the line to examine; may be {@code null} when the input has ended
     * @param bomSeen whether the source started with a UTF-8 byte order mark
     * @return the resolved charset, or {@code null} if the line is {@code null} or carries no
     *         encoding declaration
     * @throws InvalidEncodingException if the declared name cannot be resolved, or if a BOM was
     *             seen and the normalized name is not {@code T_UTF_UNDERSCORE_8}
     */
    private static Charset declaredCharset(String line, boolean bomSeen) {
        if (line == null) {
            return null;
        }
        Matcher declaration = CODING_DECLARATION.matcher(line);
        if (!declaration.matches()) {
            return null;
        }

        String declaredName = declaration.group(1);
        TruffleString normalizedName = CharsetMapping.normalizeUncached(toTruffleStringUncached(declaredName));
        if (bomSeen) {
            // A UTF-8 BOM combined with a different declared encoding is a SyntaxError.
            // Note that CPython ignores UTF-8 aliases for the BOM check.
            boolean declaresUtf8 = normalizedName.equalsUncached(T_UTF_UNDERSCORE_8, TS_ENCODING);
            if (!declaresUtf8) {
                throw new InvalidEncodingException(declaredName + " with BOM");
            }
        }

        Charset resolved = CharsetMapping.getCharsetNormalized(normalizedName);
        if (resolved != null) {
            return resolved;
        }
        throw new InvalidEncodingException(declaredName);
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy