All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.oracle.graal.python.pegparser.sst.StringLiteralUtils Maven / Gradle / Ivy

There is a newer version: 24.1.1
Show newest version
/*
 * Copyright (c) 2019, 2023, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * The Universal Permissive License (UPL), Version 1.0
 *
 * Subject to the condition set forth below, permission is hereby granted to any
 * person obtaining a copy of this software, associated documentation and/or
 * data (collectively the "Software"), free of charge and under any and all
 * copyright rights in the Software, and any and all patent rights owned or
 * freely licensable by each licensor hereunder covering either (i) the
 * unmodified Software as contributed to or provided by such licensor, or (ii)
 * the Larger Works (as defined below), to deal in both
 *
 * (a) the Software, and
 *
 * (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if
 * one is included with the Software each a "Larger Work" to which the Software
 * is contributed by such licensors),
 *
 * without restriction, including without limitation the rights to copy, create
 * derivative works of, display, perform, and distribute the Software and make,
 * use, sell, offer for sale, import, export, have made, and have sold the
 * Software and the Larger Work(s), and to sublicense the foregoing rights on
 * either these or other terms.
 *
 * This license is subject to the following condition:
 *
 * The above copyright notice and either this complete permission notice or at a
 * minimum a reference to the UPL must be included in all copies or substantial
 * portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

package com.oracle.graal.python.pegparser.sst;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import org.graalvm.shadowed.com.ibm.icu.lang.UCharacter;
import com.oracle.graal.python.pegparser.ErrorCallback;
import com.oracle.graal.python.pegparser.ErrorCallback.WarningType;
import com.oracle.graal.python.pegparser.FExprParser;
import com.oracle.graal.python.pegparser.PythonStringFactory;
import com.oracle.graal.python.pegparser.tokenizer.SourceRange;

public abstract class StringLiteralUtils {
    private static final ExprTy[] EMPTY_SST_ARRAY = new ExprTy[0];

    private static final String CANNOT_MIX_MESSAGE = "cannot mix bytes and nonbytes literals";

    private static class BytesBuilder {
        List bytes = new ArrayList<>();
        int len = 0;

        void append(byte[] b) {
            len += b.length;
            bytes.add(b);
        }

        byte[] build() {
            byte[] output = new byte[len];
            int offset = 0;
            for (byte[] bs : bytes) {
                System.arraycopy(bs, 0, output, offset, bs.length);
                offset += bs.length;
            }
            return output;
        }
    }

    public static  ExprTy createStringLiteral(String[] values, SourceRange[] sourceRanges, FExprParser exprParser, ErrorCallback errorCallback, PythonStringFactory stringFactory,
                    int featureVersion) {
        assert values.length == sourceRanges.length && values.length > 0;
        PythonStringFactory.PythonStringBuilder sb = null;
        BytesBuilder bb = null;
        boolean isFormatString = false;
        ArrayList formatStringParts = null;
        SourceRange sourceRange = sourceRanges[0].withEnd(sourceRanges[sourceRanges.length - 1]);
        SourceRange sbSourceRange = null;
        boolean hasUPrefix = false;
        for (int index = 0; index < values.length; ++index) {
            String text = values[index];
            boolean isRaw = false;
            boolean isBytes = false;
            boolean isFormat = false;

            int strStartIndex = 1;
            int strEndIndex = text.length() - 1;

            OUTER: for (int i = 0; i < 3; i++) {
                char chr = Character.toLowerCase(text.charAt(i));
                switch (chr) {
                    case 'r':
                        isRaw = true;
                        break;
                    // unicode case (default)
                    case 'u':
                        hasUPrefix = index == 0;
                        break;
                    case 'b':
                        isBytes = true;
                        break;
                    case 'f':
                        isFormat = true;
                        break;
                    case '\'':
                    case '"':
                        strStartIndex = i + 1;
                        break OUTER;
                    default:
                        break;
                }
            }

            if (isFormat && featureVersion < 6) {
                errorCallback.onError(ErrorCallback.ErrorType.Syntax, sourceRange, "Format strings are only supported in Python 3.6 and greater");
            }

            if (text.endsWith("'''") || text.endsWith("\"\"\"")) {
                strStartIndex += 2;
                strEndIndex -= 2;
            }

            text = text.substring(strStartIndex, strEndIndex);
            if (isBytes) {
                for (int i = 0; i < text.length(); i++) {
                    if (text.charAt(i) >= 0x80) {
                        errorCallback.onError(ErrorCallback.ErrorType.Syntax, sourceRange, "bytes can only contain ASCII literal characters");
                    }
                }
                if (sb != null || isFormatString) {
                    errorCallback.onError(sourceRange, CANNOT_MIX_MESSAGE);
                    if (sb != null) {
                        return new ExprTy.Constant(ConstantValue.ofRaw(sb.build()), hasUPrefix ? "u" : null, sourceRange);
                    } else if (bb != null) {
                        return new ExprTy.Constant(ConstantValue.ofBytes(bb.build()), null, sourceRange);
                    } else {
                        return new ExprTy.Constant(ConstantValue.ofBytes(text.getBytes()), null, sourceRange);
                    }
                }
                if (bb == null) {
                    bb = new BytesBuilder();
                }
                if (isRaw) {
                    bb.append(text.getBytes());
                } else {
                    bb.append(decodeEscapeToBytes(errorCallback, text, sourceRange));
                }
            } else {
                if (bb != null) {
                    errorCallback.onError(sourceRange, CANNOT_MIX_MESSAGE);
                    return new ExprTy.Constant(ConstantValue.ofBytes(bb.build()), null, sourceRange);
                }
                if (isFormat) {
                    isFormatString = true;
                    if (formatStringParts == null) {
                        formatStringParts = new ArrayList<>();
                    }
                    if (sb != null && !sb.isEmpty()) {
                        formatStringParts.add(new ExprTy.Constant(ConstantValue.ofRaw(sb.build()), hasUPrefix ? "u" : null, sbSourceRange));
                        sb = null;
                    }

                    FormatStringParser.parse(formatStringParts, text, sourceRanges[index].shiftStartRight(strStartIndex), isRaw, exprParser, errorCallback, stringFactory, featureVersion, hasUPrefix);
                } else {
                    if (sb == null) {
                        sb = stringFactory.createBuilder(text.length());
                        sbSourceRange = sourceRanges[index];
                    }
                    if (!isRaw) {
                        sb.appendPythonString(unescapeString(sourceRanges[index], errorCallback, text, stringFactory));
                    } else {
                        sb.appendString(text);
                    }
                    sbSourceRange = sbSourceRange.withEnd(sourceRanges[index]);
                }
            }
        }

        if (bb != null) {
            return new ExprTy.Constant(ConstantValue.ofBytes(bb.build()), null, sourceRange);
        } else if (isFormatString) {
            assert formatStringParts != null; // guaranteed due to how isFormatString is set
            if (sb != null && !sb.isEmpty()) {
                formatStringParts.add(new ExprTy.Constant(ConstantValue.ofRaw(sb.build()), hasUPrefix ? "u" : null, sbSourceRange));
            }
            ExprTy[] formatParts = formatStringParts.toArray(EMPTY_SST_ARRAY);
            return new ExprTy.JoinedStr(formatParts, sourceRange);
        }
        return new ExprTy.Constant(ConstantValue.ofRaw(sb == null ? stringFactory.emptyString() : sb.build()), hasUPrefix ? "u" : null, sourceRange);
    }

    private static final class FormatStringParser {
        // error messages from parsing
        public static final String ERROR_MESSAGE_EMPTY_EXPRESSION = "f-string: empty expression not allowed";
        public static final String ERROR_MESSAGE_NESTED_TOO_DEEPLY = "f-string: expressions nested too deeply";
        public static final String ERROR_MESSAGE_SINGLE_BRACE = "f-string: single '}' is not allowed";
        public static final String ERROR_MESSAGE_INVALID_CONVERSION = "f-string: invalid conversion character: expected 's', 'r', or 'a'";
        public static final String ERROR_MESSAGE_UNTERMINATED_STRING = "f-string: unterminated string";
        public static final String ERROR_MESSAGE_BACKSLASH_IN_EXPRESSION = "f-string expression part cannot include a backslash";
        public static final String ERROR_MESSAGE_HASH_IN_EXPRESSION = "f-string expression part cannot include '#'";
        public static final String ERROR_MESSAGE_CLOSING_PAR_DOES_NOT_MATCH = "f-string: closing parenthesis '%c' does not match opening parenthesis '%c'";
        public static final String ERROR_MESSAGE_UNMATCHED_PAR = "f-string: unmatched '%c'";
        public static final String ERROR_MESSAGE_TOO_MANY_NESTED_PARS = "f-string: too many nested parenthesis";
        public static final String ERROR_MESSAGE_EXPECTING_CLOSING_BRACE = "f-string: expecting '}'";
        public static final String ERROR_MESSAGE_SELF_DOCUMENTING_EXPRESSION = "f-string: self documenting expressions are only supported in Python 3.8 and greater";

        // token types and Token data holder (public for testing purposes)
        public static final byte TOKEN_TYPE_STRING = 1;
        public static final byte TOKEN_TYPE_EXPRESSION = 2;
        public static final byte TOKEN_TYPE_EXPRESSION_STR = 3;
        public static final byte TOKEN_TYPE_EXPRESSION_REPR = 4;
        public static final byte TOKEN_TYPE_EXPRESSION_ASCII = 5;

        public static final class Token {
            public byte type;
            /**
             * Start and end index within the parsed f-string.
             */
            public final int startIndex;
            public final int endIndex;
            /**
             * Count how many tokens follow as tokens of format specifier. So the next expression or
             * string is not the next token, but the next token + the value under this index. Value
             * is useful/defined only for expression tokens.
             */
            public int formatTokensCount;

            public Token(byte type, int startIndex, int endIndex) {
                this.type = type;
                this.startIndex = startIndex;
                this.endIndex = endIndex;
            }

            @Override
            public String toString() {
                StringBuilder sb = new StringBuilder("Token(");
                sb.append("Type: ").append(type);
                sb.append(", [").append(startIndex).append(", ").append(endIndex).append("], Inner tokens: ");
                sb.append(formatTokensCount).append(')');
                return sb.toString();
            }

            SourceRange getSourceRange(String text, SourceRange textSourceRange) {
                int column = textSourceRange.startColumn;
                int line = textSourceRange.startLine;
                for (int i = 0; i < startIndex; ++i) {
                    if (text.charAt(i) == '\n') {
                        line++;
                        column = 0;
                    } else {
                        column++;
                    }
                }
                int startColumn = column;
                int startLine = line;
                for (int i = startIndex; i < endIndex; ++i) {
                    if (text.charAt(i) == '\n') {
                        line++;
                        column = 0;
                    } else {
                        column++;
                    }
                }
                return new SourceRange(startLine, startColumn, line, column);
            }
        }

        private static ExprTy createFormatStringLiteralSSTNodeFromToken(ArrayList tokens, int tokenIndex, String text, SourceRange textSourceRange, boolean isRawString,
                        ErrorCallback errorCallback, FExprParser exprParser, PythonStringFactory stringFactory, boolean hasUPrefix) {
            Token token = tokens.get(tokenIndex);
            String code = text.substring(token.startIndex, token.endIndex);
            if (token.type == TOKEN_TYPE_STRING) {
                Object o;
                if (!isRawString) {
                    o = unescapeString(token.getSourceRange(text, textSourceRange), errorCallback, code, stringFactory);
                } else {
                    o = stringFactory.fromJavaString(code);
                }
                return new ExprTy.Constant(ConstantValue.ofRaw(o), hasUPrefix ? "u" : null, token.getSourceRange(text, textSourceRange));
            }
            int specTokensCount = token.formatTokensCount;
            // the expression has to be wrapped in ()
            code = "(" + code + ")";
            // TODO: pass isInteractive flag
            ExprTy expression = exprParser.parse(code, token.getSourceRange(text, textSourceRange));
            if (expression == null) {
                // TODO handle and raise appropriate error msg
                errorCallback.onError(ErrorCallback.ErrorType.Syntax, textSourceRange, "invalid syntax");
                return null;
            }
            ExprTy specifier = null;
            SourceRange specifierSourceRange = null;
            if (specTokensCount > 0) {
                ExprTy[] specifierParts = new ExprTy[specTokensCount];
                int specifierTokenStartIndex = tokenIndex + 1;
                Token specToken = tokens.get(specifierTokenStartIndex);
                specifierSourceRange = specToken.getSourceRange(text, textSourceRange);
                int realCount = 0;
                int i;

                for (i = 0; i < specTokensCount; i++) {
                    specToken = tokens.get(specifierTokenStartIndex + i);
                    specifierParts[realCount++] = createFormatStringLiteralSSTNodeFromToken(tokens, specifierTokenStartIndex + i, text, textSourceRange, isRawString, errorCallback, exprParser,
                                    stringFactory, hasUPrefix);
                    i = i + specToken.formatTokensCount;
                }

                if (realCount < i) {
                    specifierParts = Arrays.copyOf(specifierParts, realCount);
                }

                specifierSourceRange = specifierSourceRange.withEnd(specToken.getSourceRange(text, textSourceRange));
                specifier = new ExprTy.JoinedStr(specifierParts, specifierSourceRange);
            }
            int conversionType;
            switch (token.type) {
                case TOKEN_TYPE_EXPRESSION_STR:
                    conversionType = 's';
                    break;
                case TOKEN_TYPE_EXPRESSION_REPR:
                    conversionType = 'r';
                    break;
                case TOKEN_TYPE_EXPRESSION_ASCII:
                    conversionType = 'a';
                    break;
                default:
                    conversionType = -1;
            }
            SourceRange range = token.getSourceRange(text, textSourceRange);
            if (specifierSourceRange != null) {
                range = range.withEnd(specifierSourceRange);
            }
            if (conversionType != -1) {
                range = range.shiftEndRight(2);
            }
            return new ExprTy.FormattedValue(expression, conversionType, specifier, range);
        }

        /**
         * Parses f-string into an array of {@link ExprTy}. The nodes can end up being
         * {@link ExprTy.Constant} or {@link ExprTy.FormattedValue}.
         */
        public static void parse(ArrayList formatStringParts, String text, SourceRange textSourceRange, boolean isRawString, FExprParser exprParser, ErrorCallback errorCallback,
                        PythonStringFactory stringFactory, int featureVersion, boolean hasUPrefix) {
            int estimatedTokensCount = 1;
            for (int i = 0; i < text.length(); i++) {
                char c = text.charAt(i);
                if (c == '{' || c == '}' || c == ':') {
                    estimatedTokensCount++;
                    if (estimatedTokensCount > 32) {
                        // don't get too crazy...
                        break;
                    }
                }
            }

            // create tokens
            ArrayList tokens = new ArrayList<>(estimatedTokensCount);
            if (createTokens(tokens, errorCallback, 0, text, textSourceRange, isRawString, 0, featureVersion) < 0) {
                return;
            }

            // create nodes from the tokens
            int tokenIndex = 0;
            Token token;

            while (tokenIndex < tokens.size()) {
                token = tokens.get(tokenIndex);
                ExprTy part = createFormatStringLiteralSSTNodeFromToken(tokens, tokenIndex, text, textSourceRange, isRawString, errorCallback, exprParser, stringFactory, hasUPrefix);
                formatStringParts.add(part);
                tokenIndex = tokenIndex + 1 + token.formatTokensCount;
            }

        }

        // parsing the format string
        private static final int STATE_TEXT = 1;  // in text
        private static final int STATE_AFTER_OPEN_BRACE = 2; // just after {
        private static final int STATE_AFTER_CLOSE_BRACE = 3; // just after }
        private static final int STATE_AFTER_EXCLAMATION = 4; // just after !
        private static final int STATE_AFTER_COLON = 5; // just after :
        private static final int STATE_EXPRESSION = 6; // in {}
        private static final int STATE_UNKNOWN = 7;

        private static final int MAX_PAR_NESTING = 200;

        /**
         * This is the parser of the fstring.
         *
         * @param tokens list where the parsed tokens will be stored
         * @param errorCallback it's needed for raising syntax errors
         * @param startIndex start parsing from this index
         * @param text text to be parsed
         * @param isRawString whether the String is raw, i.e., escape sequences should be
         *            interpreted as a verbatim text
         * @param recursionLevel recursive calls are used for parsing the formatting string, which
         *            may contain other expressions. Depending on the recursive level some rules
         *            apply differently.
         * @return the index of the last processed character or {@code -1} on error
         */
        public static int createTokens(ArrayList tokens, ErrorCallback errorCallback, int startIndex, String text, SourceRange textSourceRange, boolean isRawString, int recursionLevel,
                        int featureVersion) {
            int state = STATE_TEXT;

            int braceLevel = 0;
            int braceLevelInExpression = 0;
            char[] bracesInExpression = new char[MAX_PAR_NESTING];
            int len = text.length();
            int index = startIndex;
            int start = startIndex;
            boolean toplevel = recursionLevel == 0;
            // currentExpression is set by '=' or '!' handlers, which create the expression token,
            // and
            // is read by the ':', which either needs to create the expression token itself or
            // should
            // reuse the created by '=' or '!' if preceded by '=' or '!', e.g., f'{expr!s:10<}'
            Token currentExpression = null;
            parserLoop: while (index < len) {
                char ch = text.charAt(index);
                switch (state) {
                    case STATE_TEXT:
                        switch (ch) {
                            case '\\':
                                if (isRawString) {
                                    break;
                                }
                                if (lookahead(text, index, len, '\\')) {
                                    // double "\\" is skipped, note that "\\\N{...}" should still be
                                    // treated as \N escape sequence
                                    index++;
                                } else if (lookahead(text, index, len, '{')) {
                                    warnInvalidEscapeSequence(errorCallback, textSourceRange, text.charAt(index + 1));
                                } else if (lookahead(text, index, len, 'N', '{')) {
                                    // skip escape sequence \N{...}, it should not be treated as an
                                    // expression inside f-string, but \\N{...} should be left
                                    // intact
                                    index += 2;
                                    while (index < len && text.charAt(index) != '}') {
                                        index++;
                                    }
                                    if (index >= len) {
                                        // Missing the closing brace. The escape sequence is
                                        // malformed,
                                        // which will be reported by the String escaping code later,
                                        // here we just end the parsing
                                        index = len - 1;
                                        break parserLoop;
                                    }
                                }
                                break;
                            case '{':
                                if (start < index) {
                                    tokens.add(new Token(TOKEN_TYPE_STRING, start, index));
                                }
                                state = STATE_AFTER_OPEN_BRACE;
                                start = index + 1;
                                braceLevel++;
                                break;
                            case '}':
                                braceLevel--;
                                if (braceLevel == -1) {
                                    if (!toplevel) {
                                        // We are parsing a format specifier (nested f-string) and
                                        // here
                                        // we reached the closing brace of the top-level f-string,
                                        // i.e.,
                                        // the end of the nested f-string too
                                        break parserLoop;
                                    }
                                }
                                state = STATE_AFTER_CLOSE_BRACE;
                                break;
                        }
                        break;
                    case STATE_AFTER_OPEN_BRACE:
                        if (ch == '}' || ch == '=') {
                            errorCallback.onError(textSourceRange, ERROR_MESSAGE_EMPTY_EXPRESSION);
                            return -1;
                        }
                        if (ch == '{' && toplevel) {
                            // '{' escaping works only when parsing the expression, not when parsing
                            // the
                            // format (i.e., when we are in the recursive call)
                            state = STATE_TEXT;
                            braceLevel--;
                        } else if (recursionLevel == 2) {
                            // we are inside formatting expression of another formatting expression,
                            // example: f'{42:{42:{42}}}'. This level of nesting is not allowed.
                            errorCallback.onError(textSourceRange, ERROR_MESSAGE_NESTED_TOO_DEEPLY);
                            return -1;
                        } else {
                            index--;
                            state = STATE_EXPRESSION;
                            braceLevelInExpression = 0;
                            currentExpression = null;
                        }
                        break;
                    case STATE_AFTER_CLOSE_BRACE:
                        if (toplevel && ch == '}') {
                            // after '}' should in this moment follow second '}', only allowed when
                            // parsing the expression, not when parsing the format
                            if (start < index) {
                                tokens.add(new Token(TOKEN_TYPE_STRING, start, index));
                            }
                            braceLevel++;
                            if (braceLevel == 0) {
                                state = STATE_TEXT;
                            }
                            start = index + 1;
                        } else {
                            errorCallback.onError(textSourceRange, ERROR_MESSAGE_SINGLE_BRACE);
                            return -1;
                        }
                        break;
                    case STATE_EXPRESSION:
                        if (index + 1 < len) {
                            // Some patterns of two characters, such as '!=', should be skipped
                            if ((ch == '!' || ch == '<' || ch == '>' || ch == '=') && (text.charAt(index + 1) == '=')) {
                                index += 2;
                                continue;
                            }
                        }
                        switch (ch) {
                            case '{':
                            case '(':
                            case '[':
                                bracesInExpression[braceLevelInExpression] = ch;
                                braceLevelInExpression++;
                                if (braceLevelInExpression >= MAX_PAR_NESTING) {
                                    errorCallback.onError(textSourceRange, ERROR_MESSAGE_TOO_MANY_NESTED_PARS);
                                    return -1;
                                }
                                break;
                            case ')':
                            case ']':
                                if (braceLevelInExpression == 0) {
                                    errorCallback.onError(textSourceRange, ERROR_MESSAGE_UNMATCHED_PAR, ch);
                                    return -1;
                                }
                                braceLevelInExpression--;
                                char expected = ch == ')' ? '(' : '[';
                                if (bracesInExpression[braceLevelInExpression] != expected) {
                                    errorCallback.onError(textSourceRange, ERROR_MESSAGE_CLOSING_PAR_DOES_NOT_MATCH, ch, bracesInExpression[braceLevelInExpression]);
                                    return -1;
                                }
                                break;
                            case '}':
                                if (braceLevelInExpression == 0) {
                                    Token t = createExpressionToken(errorCallback, text, textSourceRange, start, index);
                                    if (t == null) {
                                        return -1;
                                    }
                                    tokens.add(t);
                                    braceLevel--;
                                    state = STATE_TEXT;
                                    start = index + 1;
                                } else {
                                    braceLevelInExpression--;
                                    if (bracesInExpression[braceLevelInExpression] != '{') {
                                        errorCallback.onError(textSourceRange, ERROR_MESSAGE_CLOSING_PAR_DOES_NOT_MATCH, '}', bracesInExpression[braceLevelInExpression]);
                                        return -1;
                                    }
                                }
                                break;
                            case '=':
                                if (braceLevelInExpression == 0) {
                                    if (featureVersion < 8) {
                                        errorCallback.onError(textSourceRange, ERROR_MESSAGE_SELF_DOCUMENTING_EXPRESSION);
                                        return -1;
                                    }
                                    // The "=" mode, e.g., f'{1+1=}' produces "1+1=2"
                                    // Python allows '=' to be followed by whitespace, but nothing
                                    // else
                                    // "=" inside format specification
                                    int expressionEndIndex = index;
                                    index++;
                                    while (index < len && Character.isWhitespace(text.charAt(index))) {
                                        index++;
                                    }

                                    // Have we reached a legal end character of an expression?
                                    if (index >= len) {
                                        errorCallback.onError(textSourceRange, ERROR_MESSAGE_EXPECTING_CLOSING_BRACE);
                                        return -1;
                                    }
                                    char endChar = text.charAt(index);
                                    if (endChar != '}' && endChar != ':' && endChar != '!') {
                                        errorCallback.onError(textSourceRange, ERROR_MESSAGE_EXPECTING_CLOSING_BRACE);
                                        return -1;
                                    }

                                    // add verbatim text of the expression (including the "=" and
                                    // any
                                    // spaces after it) and the expression itself
                                    tokens.add(new Token(TOKEN_TYPE_STRING, start, index));
                                    currentExpression = createExpressionToken(errorCallback, text, textSourceRange, start, expressionEndIndex);
                                    if (currentExpression == null) {
                                        return -1;
                                    }
                                    tokens.add(currentExpression);
                                    if (endChar == '}') {
                                        // "debug" expressions are by default converted using
                                        // "repr",
                                        // but as long as there is no format
                                        currentExpression.type = TOKEN_TYPE_EXPRESSION_REPR;
                                        // we're done with the expression
                                        braceLevel--;
                                        state = STATE_TEXT;
                                        start = index + 1;
                                        currentExpression = null;
                                    } else if (endChar == '!') {
                                        // parse the format specifier, this state expects to see the
                                        // expression token in currentExpression
                                        state = STATE_AFTER_EXCLAMATION;
                                    } else {
                                        // endChar must be ':'
                                        // parse ':' again, the ':' handler checks the
                                        // currentExpression
                                        start = index;
                                        state = STATE_AFTER_COLON;
                                    }
                                }
                                break;
                            case '\'':
                            case '"':
                                index = skipString(errorCallback, text, textSourceRange, index, len, ch);
                                if (index < 0) {
                                    return index;
                                }
                                break;
                            case '!':
                                state = STATE_AFTER_EXCLAMATION;
                                currentExpression = createExpressionToken(errorCallback, text, textSourceRange, start, index);
                                if (currentExpression == null) {
                                    return -1;
                                }
                                tokens.add(currentExpression);
                                break;
                            case ':':
                                if (braceLevelInExpression == 0) {
                                    currentExpression = createExpressionToken(errorCallback, text, textSourceRange, start, index);
                                    if (currentExpression == null) {
                                        return -1;
                                    }
                                    tokens.add(currentExpression);
                                    state = STATE_AFTER_COLON;
                                }
                                break;
                            case '#':
                                errorCallback.onError(textSourceRange, ERROR_MESSAGE_HASH_IN_EXPRESSION);
                                return -1;
                            case '\\':
                                errorCallback.onError(textSourceRange, ERROR_MESSAGE_BACKSLASH_IN_EXPRESSION);
                                return -1;
                            default:
                                break;
                        }
                        break;
                    case STATE_AFTER_EXCLAMATION:
                        assert currentExpression != null;
                        switch (ch) {
                            case 's':
                                currentExpression.type = TOKEN_TYPE_EXPRESSION_STR;
                                break;
                            case 'r':
                                currentExpression.type = TOKEN_TYPE_EXPRESSION_REPR;
                                break;
                            case 'a':
                                currentExpression.type = TOKEN_TYPE_EXPRESSION_ASCII;
                                break;
                            default:
                                errorCallback.onError(textSourceRange, ERROR_MESSAGE_INVALID_CONVERSION);
                                return -1;
                        }
                        start = index + 2;
                        index++;
                        char next = index < len ? text.charAt(index) : Character.MAX_VALUE;
                        switch (next) {
                            case ':':
                                state = STATE_AFTER_COLON;
                                break;
                            case '}':
                                // We're done with the expression
                                state = STATE_TEXT;
                                braceLevel--;
                                break;
                            default:
                                errorCallback.onError(textSourceRange, ERROR_MESSAGE_EXPECTING_CLOSING_BRACE);
                                return -1;
                        }
                        break;
                    case STATE_AFTER_COLON:
                        assert currentExpression != null;
                        int tokensSizeBefore = tokens.size();
                        index = createTokens(tokens, errorCallback, index, text, textSourceRange, isRawString, recursionLevel + 1, featureVersion);
                        if (index < 0) {
                            return -1;
                        }
                        currentExpression.formatTokensCount = tokens.size() - tokensSizeBefore;
                        if (index >= len || text.charAt(index) != '}') {
                            errorCallback.onError(textSourceRange, ERROR_MESSAGE_EXPECTING_CLOSING_BRACE);
                            return -1;
                        }
                        braceLevel--;
                        state = STATE_TEXT;
                        start = index + 1;
                        break;
                    case STATE_UNKNOWN:
                        if (ch == '}') {
                            state = STATE_TEXT;
                            start = index + 1;
                        }
                        break;
                }
                index++;
            }
            switch (state) {
                case STATE_TEXT:
                    if (start < index) {
                        // handle the end of the string
                        tokens.add(new Token(TOKEN_TYPE_STRING, start, index));
                    }
                    break;
                case STATE_AFTER_CLOSE_BRACE:
                    errorCallback.onError(textSourceRange, ERROR_MESSAGE_SINGLE_BRACE);
                    return -1;
                case STATE_AFTER_EXCLAMATION:
                case STATE_AFTER_OPEN_BRACE:
                case STATE_AFTER_COLON:
                    errorCallback.onError(textSourceRange, ERROR_MESSAGE_EXPECTING_CLOSING_BRACE);
                    return -1;
                case STATE_EXPRESSION:
                    // expression is not allowed to span multiple f-strings: f'{3+' f'1}' is not
                    // the same as f'{3+1}'
                    errorCallback.onError(textSourceRange, ERROR_MESSAGE_EXPECTING_CLOSING_BRACE);
                    return -1;
            }
            return index;
        }

        private static boolean lookahead(String text, int index, int len, char c1) {
            return index + 1 < len && text.charAt(index + 1) == c1;
        }

        private static boolean lookahead(String text, int index, int len, char c1, char c2) {
            return index + 2 < len && text.charAt(index + 1) == c1 && text.charAt(index + 2) == c2;
        }

        /**
         * Skips a string literal. Checks for all the valid quotation styles.
         */
        private static int skipString(ErrorCallback errorCallback, String text, SourceRange sourceRange, int startIndex, int len, char startq) {
            boolean triple = false;
            boolean inString = true;
            int index = startIndex + 1;
            if (index < len && startq == text.charAt(index)) {
                if (lookahead(text, index, len, startq)) {
                    // we are in ''' or """ string, fully consume the quotes
                    triple = true;
                    index += 2;
                } else {
                    // we are in empty string "" or ''
                    inString = false;
                }
            }
            if (inString) {
                while (index < len) {
                    char ch = text.charAt(index);
                    if (ch == '\\') {
                        errorCallback.onError(sourceRange, ERROR_MESSAGE_BACKSLASH_IN_EXPRESSION);
                        return -1;
                    }
                    if (ch == startq) {
                        if (triple) {
                            // single quote should be ignored in a triple quoted string
                            if (lookahead(text, index, len, startq, startq)) {
                                inString = false;
                                break;
                            }
                        } else {
                            inString = false;
                            break;
                        }
                    }
                    index++;
                }
                if (inString) {
                    errorCallback.onError(sourceRange, ERROR_MESSAGE_UNTERMINATED_STRING);
                    return -1;
                }
            }
            return index;
        }

        /**
         * Return an expression Token or {@code null} on error.
         */
        private static Token createExpressionToken(ErrorCallback errorCallback, String text, SourceRange sourceRange, int start, int end) {
            if (start >= end) {
                errorCallback.onError(sourceRange, ERROR_MESSAGE_EMPTY_EXPRESSION);
                return null;
            }
            boolean onlyWhiteSpaces = true;
            for (int index = start; index < end; index++) {
                if (!Character.isWhitespace(text.charAt(index))) {
                    onlyWhiteSpaces = false;
                    break;
                }
            }
            if (onlyWhiteSpaces) {
                errorCallback.onError(sourceRange, ERROR_MESSAGE_EMPTY_EXPRESSION);
                return null;
            }
            return new Token(TOKEN_TYPE_EXPRESSION, start, end);
        }
    }

    private static byte[] decodeEscapeToBytes(ErrorCallback errors, String string, SourceRange sourceRange) {
        StringBuilder sb = decodeEscapes(errors, string, sourceRange, false);
        byte[] bytes = new byte[sb.length()];
        for (int i = 0; i < sb.length(); i++) {
            bytes[i] = (byte) sb.charAt(i);
        }
        return bytes;
    }

    public static StringBuilder decodeEscapes(ErrorCallback errors, String string, SourceRange sourceRange, boolean regexMode) {
        // see _PyBytes_DecodeEscape from
        // https://github.com/python/cpython/blob/master/Objects/bytesobject.c
        // TODO: for the moment we assume ASCII
        StringBuilder charList = new StringBuilder();
        int length = string.length();
        boolean wasDeprecationWarning = false;
        for (int i = 0; i < length; i++) {
            char chr = string.charAt(i);
            if (chr != '\\') {
                charList.append(chr);
                continue;
            }

            i++;
            if (i >= length) {
                errors.onError(ErrorCallback.ErrorType.Value, sourceRange, TRAILING_S_IN_STR, "\\");
                return charList;
            }

            chr = string.charAt(i);
            switch (chr) {
                case '\n':
                    break;
                case '\\':
                    if (regexMode) {
                        charList.append('\\');
                    }
                    charList.append('\\');
                    break;
                case '\'':
                    charList.append('\'');
                    break;
                case '\"':
                    charList.append('\"');
                    break;
                case 'b':
                    charList.append('\b');
                    break;
                case 'f':
                    charList.append('\014');
                    break; /* FF */
                case 't':
                    charList.append('\t');
                    break;
                case 'n':
                    charList.append('\n');
                    break;
                case 'r':
                    charList.append('\r');
                    break;
                case 'v':
                    charList.append('\013');
                    break; /* VT */
                case 'a':
                    charList.append('\007');
                    break; /* BEL */
                case '0':
                case '1':
                case '2':
                case '3':
                case '4':
                case '5':
                case '6':
                case '7':
                    if (!regexMode) {
                        int code = chr - '0';
                        if (i + 1 < length) {
                            char nextChar = string.charAt(i + 1);
                            if ('0' <= nextChar && nextChar <= '7') {
                                code = (code << 3) + nextChar - '0';
                                i++;

                                if (i + 1 < length) {
                                    nextChar = string.charAt(i + 1);
                                    if ('0' <= nextChar && nextChar <= '7') {
                                        code = (code << 3) + nextChar - '0';
                                        i++;
                                    }
                                }
                            }
                        }
                        charList.append((char) code);
                    } else {
                        // this mode is required for regex substitute to disambiguate from
                        // backreferences
                        charList.append('\\');
                        charList.append(chr);
                    }
                    break;
                case 'x':
                    if (i + 2 < length) {
                        try {
                            int b = Integer.parseInt(string.substring(i + 1, i + 3), 16);
                            assert b >= 0x00 && b <= 0xFF;
                            charList.append((char) b);
                            i += 2;
                            break;
                        } catch (NumberFormatException e) {
                            // fall through
                        }
                    }
                    errors.onError(ErrorCallback.ErrorType.Value, sourceRange, INVALID_ESCAPE_AT, "\\x", i);
                    return charList;
                default:
                    if (regexMode) {
                        if (chr == 'g' || (chr >= '0' && chr <= '9')) {
                            // only allow backslashes, named group references and numbered group
                            // references in regex mode
                            charList.append('\\');
                            charList.append(chr);
                        } else {
                            errors.onError(ErrorCallback.ErrorType.Value, sourceRange, INVALID_ESCAPE_AT, "\\x", i);
                            return charList;
                        }
                    } else {
                        charList.append('\\');
                        charList.append(chr);
                        if (!wasDeprecationWarning) {
                            wasDeprecationWarning = true;
                            warnInvalidEscapeSequence(errors, sourceRange, chr);
                        }
                    }
            }
        }

        return charList;
    }

    private static  T unescapeString(SourceRange sourceRange, ErrorCallback errorCallback, String st, PythonStringFactory stringFactory) {
        if (!st.contains("\\")) {
            return stringFactory.fromJavaString(st);
        }
        PythonStringFactory.PythonStringBuilder sb = stringFactory.createBuilder(st.length());
        boolean wasDeprecationWarning = false;
        for (int i = 0; i < st.length(); i++) {
            char ch = st.charAt(i);
            if (ch == '\\') {
                char nextChar = (i == st.length() - 1) ? '\\' : st.charAt(i + 1);
                // Octal escape?
                if (nextChar >= '0' && nextChar <= '7') {
                    String code = "" + nextChar;
                    i++;
                    if ((i < st.length() - 1) && st.charAt(i + 1) >= '0' && st.charAt(i + 1) <= '7') {
                        code += st.charAt(i + 1);
                        i++;
                        if ((i < st.length() - 1) && st.charAt(i + 1) >= '0' && st.charAt(i + 1) <= '7') {
                            code += st.charAt(i + 1);
                            i++;
                        }
                    }
                    sb.appendCodePoint(Integer.parseInt(code, 8));
                    continue;
                }
                switch (nextChar) {
                    case '\\':
                        ch = '\\';
                        break;
                    case 'a':
                        ch = '\u0007';
                        break;
                    case 'b':
                        ch = '\b';
                        break;
                    case 'f':
                        ch = '\f';
                        break;
                    case 'n':
                        ch = '\n';
                        break;
                    case 'r':
                        ch = '\r';
                        break;
                    case 't':
                        ch = '\t';
                        break;
                    case 'v':
                        ch = '\u000b';
                        break;
                    case '\"':
                        ch = '\"';
                        break;
                    case '\'':
                        ch = '\'';
                        break;
                    case '\r':
                        nextChar = (i == st.length() - 2) ? '\\' : st.charAt(i + 2);
                        if (nextChar == '\n') {
                            i++;
                        }
                        i++;
                        continue;
                    case '\n':
                        i++;
                        continue;
                    // Hex Unicode: u????
                    case 'u':
                        int code = getHexValue(st, sourceRange, i + 2, 4, errorCallback);
                        if (code < 0) {
                            return stringFactory.fromJavaString(st);
                        }
                        sb.appendCodePoint(code);
                        i += 5;
                        continue;
                    // Hex Unicode: U????????
                    case 'U':
                        code = getHexValue(st, sourceRange, i + 2, 8, errorCallback);
                        if (Character.isValidCodePoint(code)) {
                            sb.appendCodePoint(code);
                        } else {
                            errorCallback.onError(ErrorCallback.ErrorType.Encoding, sourceRange, String.format(UNICODE_ERROR + ILLEGAL_CHARACTER, i, i + 9));
                            return stringFactory.fromJavaString(st);
                        }
                        i += 9;
                        continue;
                    // Hex Unicode: x??
                    case 'x':
                        code = getHexValue(st, sourceRange, i + 2, 2, errorCallback);
                        if (code < 0) {
                            return stringFactory.fromJavaString(st);
                        }
                        sb.appendCodePoint(code);
                        i += 3;
                        continue;
                    case 'N':
                        // a character from Unicode Data Database
                        i = doCharacterName(st, sourceRange, sb, i + 2, errorCallback);
                        if (i < 0) {
                            return stringFactory.fromJavaString(st);
                        }
                        continue;
                    default:
                        if (!wasDeprecationWarning) {
                            wasDeprecationWarning = true;
                            warnInvalidEscapeSequence(errorCallback, sourceRange, nextChar);
                        }
                        sb.appendCodePoint(ch);
                        sb.appendCodePoint(nextChar);
                        i++;
                        continue;
                }
                i++;
            }
            sb.appendCodePoint(ch);
        }
        return sb.build();
    }

    private static int getHexValue(String text, SourceRange sourceRange, int start, int len, ErrorCallback errorCb) {
        int digit;
        int result = 0;
        for (int index = start; index < (start + len); index++) {
            if (index < text.length()) {
                digit = Character.digit(text.charAt(index), 16);
                if (digit == -1) {
                    // Like cpython, raise error with the wrong character first,
                    // even if there are not enough characters
                    return createTruncatedError(sourceRange, start - 2, index - 1, len, errorCb);
                }
                result = result * 16 + digit;
            } else {
                return createTruncatedError(sourceRange, start - 2, index - 1, len, errorCb);
            }
        }
        return result;
    }

    private static int createTruncatedError(SourceRange sourceRange, int startIndex, int endIndex, int len, ErrorCallback errorCb) {
        String truncatedMessage = null;
        switch (len) {
            case 2:
                truncatedMessage = TRUNCATED_XXX_ERROR;
                break;
            case 4:
                truncatedMessage = TRUNCATED_UXXXX_ERROR;
                break;
            case 8:
                truncatedMessage = TRUNCATED_UXXXXXXXX_ERROR;
                break;
        }
        errorCb.onError(ErrorCallback.ErrorType.Encoding, sourceRange, UNICODE_ERROR + truncatedMessage, startIndex, endIndex);
        return -1;
    }

    /**
     * Replace '/N{Unicode Character Name}' with the code point of the character.
     *
     * @param text a text that contains /N{...} escape sequence
     * @param sb string builder where the result code point will be written
     * @param offset this is offset of the open brace
     * @return offset of the close brace or {@code -1} if an error was signaled
     */
    private static int doCharacterName(String text, SourceRange sourceRange, PythonStringFactory.PythonStringBuilder sb, int offset, ErrorCallback errorCallback) {
        if (offset >= text.length()) {
            errorCallback.onError(ErrorCallback.ErrorType.Encoding, sourceRange, UNICODE_ERROR + MALFORMED_ERROR, offset - 2, offset - 1);
            return -1;
        }
        char ch = text.charAt(offset);
        if (ch != '{') {
            errorCallback.onError(ErrorCallback.ErrorType.Encoding, sourceRange, UNICODE_ERROR + MALFORMED_ERROR, offset - 2, offset - 1);
            return -1;
        }
        int closeIndex = text.indexOf("}", offset + 1);
        if (closeIndex == -1) {
            errorCallback.onError(ErrorCallback.ErrorType.Encoding, sourceRange, UNICODE_ERROR + MALFORMED_ERROR, offset - 2, offset - 1);
            return -1;
        }
        String charName = text.substring(offset + 1, closeIndex).toUpperCase();
        int cp = getCodePoint(charName);
        if (cp >= 0) {
            sb.appendCodePoint(cp);
        } else {
            errorCallback.onError(ErrorCallback.ErrorType.Encoding, sourceRange, UNICODE_ERROR + UNKNOWN_UNICODE_ERROR, offset - 2, closeIndex);
            return -1;
        }
        return closeIndex;
    }

    // Names for most control characters that mean 0-31, not some symbol
    private static final Map CONTROL_CHAR_NAMES = new HashMap<>(32);
    static {
        CONTROL_CHAR_NAMES.put("NULL", 0x0000);
        CONTROL_CHAR_NAMES.put("START OF HEADING", 0x0001);
        CONTROL_CHAR_NAMES.put("START OF TEXT", 0x0002);
        CONTROL_CHAR_NAMES.put("END OF TEXT", 0x0003);
        CONTROL_CHAR_NAMES.put("END OF TRANSMISSION", 0x0004);
        CONTROL_CHAR_NAMES.put("ENQUIRY", 0x0005);
        CONTROL_CHAR_NAMES.put("ACKNOWLEDGE", 0x0006);
        CONTROL_CHAR_NAMES.put("BELL", 0x0007);
        CONTROL_CHAR_NAMES.put("BACKSPACE", 0x0008);
        CONTROL_CHAR_NAMES.put("CHARACTER TABULATION", 0x0009);
        CONTROL_CHAR_NAMES.put("LINE FEED", 0x000A);
        CONTROL_CHAR_NAMES.put("LINE TABULATION", 0x000B);
        CONTROL_CHAR_NAMES.put("FORM FEED", 0x000C);
        CONTROL_CHAR_NAMES.put("CARRIAGE RETURN", 0x000D);
        CONTROL_CHAR_NAMES.put("SHIFT OUT", 0x000E);
        CONTROL_CHAR_NAMES.put("SHIFT IN", 0x000F);
        CONTROL_CHAR_NAMES.put("DATA LINK ESCAPE", 0x0010);
        CONTROL_CHAR_NAMES.put("DEVICE CONTROL ONE", 0x0011);
        CONTROL_CHAR_NAMES.put("DEVICE CONTROL TWO", 0x0012);
        CONTROL_CHAR_NAMES.put("DEVICE CONTROL THREE", 0x0013);
        CONTROL_CHAR_NAMES.put("DEVICE CONTROL FOUR", 0x0014);
        CONTROL_CHAR_NAMES.put("NEGATIVE ACKNOWLEDGE", 0x0015);
        CONTROL_CHAR_NAMES.put("SYNCHRONOUS IDLE", 0x0016);
        CONTROL_CHAR_NAMES.put("END OF TRANSMISSION BLOCK", 0x0017);
        CONTROL_CHAR_NAMES.put("CANCEL", 0x0018);
        CONTROL_CHAR_NAMES.put("END OF MEDIUM", 0x0019);
        CONTROL_CHAR_NAMES.put("SUBSTITUTE", 0x001A);
        CONTROL_CHAR_NAMES.put("ESCAPE", 0x001B);
        CONTROL_CHAR_NAMES.put("INFORMATION SEPARATOR FOUR", 0x001C);
        CONTROL_CHAR_NAMES.put("INFORMATION SEPARATOR THREE", 0x001D);
        CONTROL_CHAR_NAMES.put("INFORMATION SEPARATOR TWO", 0x001E);
        CONTROL_CHAR_NAMES.put("INFORMATION SEPARATOR ONE", 0x001F);
        CONTROL_CHAR_NAMES.put("BYTE ORDER MARK", 0xFEFF);
    }

    public static int getCodePoint(String charName) {
        int possibleChar = CONTROL_CHAR_NAMES.getOrDefault(charName.toUpperCase(Locale.ROOT), -1);
        if (possibleChar > -1) {
            return possibleChar;
        }
        possibleChar = UCharacter.getCharFromName(charName);
        if (possibleChar > -1) {
            return possibleChar;
        }
        possibleChar = UCharacter.getCharFromExtendedName(charName);
        if (possibleChar > -1) {
            return possibleChar;
        }
        possibleChar = UCharacter.getCharFromNameAlias(charName);
        if (possibleChar > -1) {
            return possibleChar;
        }
        return -1;
    }

    public static void warnInvalidEscapeSequence(ErrorCallback errorCallback, SourceRange sourceRange, char nextChar) {
        errorCallback.onWarning(WarningType.Deprecation, sourceRange, "invalid escape sequence '\\%c'", nextChar);
    }

    private static final String UNICODE_ERROR = "(unicode error) 'unicodeescape' codec can't decode bytes in position %d-%d:";
    private static final String ILLEGAL_CHARACTER = "illegal Unicode character";
    private static final String TRAILING_S_IN_STR = "Trailing %s in string";
    private static final String MALFORMED_ERROR = " malformed \\N character escape";
    private static final String TRUNCATED_XXX_ERROR = "truncated \\xXX escape";
    private static final String TRUNCATED_UXXXX_ERROR = "truncated \\uXXXX escape";
    private static final String TRUNCATED_UXXXXXXXX_ERROR = "truncated \\UXXXXXXXX escape";
    private static final String UNKNOWN_UNICODE_ERROR = " unknown Unicode character name";
    private static final String INVALID_ESCAPE_AT = "invalid %s escape at position %d";
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy