All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.joni.Parser Maven / Gradle / Ivy

The newest version!
/*
 * Permission is hereby granted, free of charge, to any person obtaining a copy of
 * this software and associated documentation files (the "Software"), to deal in
 * the Software without restriction, including without limitation the rights to
 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
 * of the Software, and to permit persons to whom the Software is furnished to do
 * so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
package org.joni;

import static org.joni.BitStatus.bsOnAtSimple;
import static org.joni.BitStatus.bsOnOff;
import static org.joni.Option.isAsciiRange;
import static org.joni.Option.isDontCaptureGroup;
import static org.joni.Option.isIgnoreCase;
import static org.joni.Option.isPosixBracketAllRange;

import org.jcodings.ObjPtr;
import org.jcodings.Ptr;
import org.jcodings.constants.CharacterType;
import org.jcodings.constants.PosixBracket;
import org.jcodings.exception.InternalException;
import org.jcodings.unicode.UnicodeCodeRange;
import org.joni.ast.AnchorNode;
import org.joni.ast.AnyCharNode;
import org.joni.ast.BackRefNode;
import org.joni.ast.CClassNode;
import org.joni.ast.CClassNode.CCSTATE;
import org.joni.ast.CClassNode.CCStateArg;
import org.joni.ast.CClassNode.CCVALTYPE;
import org.joni.ast.CTypeNode;
import org.joni.ast.CallNode;
import org.joni.ast.EncloseNode;
import org.joni.ast.ListNode;
import org.joni.ast.Node;
import org.joni.ast.QuantifierNode;
import org.joni.ast.StringNode;
import org.joni.constants.internal.AnchorType;
import org.joni.constants.internal.EncloseType;
import org.joni.constants.internal.NodeType;
import org.joni.constants.internal.TokenType;
import org.joni.exception.ErrorMessages;

class Parser extends Lexer {
    // Side-channel status set by parser methods, which themselves return the parsed node.
    // As written by parseEnclose and read by parseExp in this file:
    //   0 = ordinary node, 1 = plain (non-capturing) group, 2 = option-only group.
    // Original author's note: this approach will not affect recursive calls.
    protected int returnCode;

    /**
     * Creates a parser by forwarding every argument, unchanged and in the same order,
     * to the {@code Lexer} superclass constructor.
     */
    protected Parser(Regex regex, Syntax syntax, byte[]bytes, int p, int end, WarnCallback warnings) {
        super(regex, syntax, bytes, p, end, warnings);
    }

    private static final int POSIX_BRACKET_NAME_MIN_LEN            = 4;
    private static final int POSIX_BRACKET_CHECK_LIMIT_LENGTH      = 20;
    private static final byte BRACKET_END[]                        = ":]".getBytes();
    /**
     * Tries to read a POSIX bracket expression at the current position.
     * <p>
     * NOTE(review): the {@code for} statement inside the length check is visibly truncated in
     * this copy of the file, so the name-matching logic (and any return path other than the
     * final one) cannot be documented from here. Restore it from upstream before relying on
     * this method.
     *
     * @param cc    character class passed in by the caller (not referenced by the code still visible)
     * @param ascCc second character class passed alongside {@code cc} (not referenced by the code still visible)
     * @return {@code true} on the visible fall-through path, which the original comment describes
     *         as "is not POSIX bracket, but no error"
     */
    private boolean parsePosixBracket(CClassNode cc, CClassNode ascCc) {
        // presumably records the current position for the restore() at the end
        mark();

        // an optional leading '^' is consumed and remembered in 'not'
        boolean not;
        if (peekIs('^')) {
            inc();
            not = true;
        } else {
            not = false;
        }
        if (enc.strLength(bytes, p, stop) >= POSIX_BRACKET_NAME_MIN_LEN + 3) { // else goto not_posix_bracket
            // true only when the ASCII-range option is set and POSIX-bracket-all-range is not
            boolean asciiRange = isAsciiRange(env.option) && !isPosixBracketAllRange(env.option);

            // NOTE(review): the next line is truncated in this copy (loop header and body are missing text)
            for (int i=0; i POSIX_BRACKET_CHECK_LIMIT_LENGTH) break;
        }

        // a ':' immediately followed by ']' at this point is reported as an invalid POSIX bracket type
        if (c == ':' && left()) {
            inc();
            if (left()) {
                fetch();
                if (c == ']') newSyntaxException(INVALID_POSIX_BRACKET_TYPE);
            }
        }
        restore();
        return true; /* 1: is not POSIX bracket, but no error. */
    }

    /**
     * Scans forward from the current position looking for {@code code}, then puts the
     * position back via {@code restore()} regardless of the outcome.
     * <p>
     * When {@code ignoreEscaped} is set, the loop iteration that follows an escape
     * character only clears the escape flag and fetches nothing.
     *
     * @param code          code point to look for
     * @param ignoreEscaped whether the escape flag is honoured as described above
     * @return {@code true} if a fetched character equals {@code code}
     */
    private boolean codeExistCheck(int code, boolean ignoreEscaped) {
        mark();

        boolean found = false;
        boolean afterEscape = false;
        while (!found && left()) {
            if (afterEscape && ignoreEscaped) {
                // this iteration only resets the flag; nothing is fetched
                afterEscape = false;
                continue;
            }
            fetch();
            if (c == code) {
                found = true;
            } else if (c == syntax.metaCharTable.esc) {
                afterEscape = true;
            }
        }

        restore();
        return found;
    }

    /**
     * Parses a bracketed character class, consuming tokens with {@code fetchTokenInCC()} until
     * a {@code CC_CLOSE} token is seen, and returns the resulting {@link CClassNode}.
     * <p>
     * NOTE(review): part of the {@code RAW_BYTE} case and everything between it and the nested
     * class handling is truncated in this copy of the file (see the marker inside the switch).
     * The generic type argument of {@code ObjPtr} also appears to have been stripped.
     *
     * @param ascNode out-parameter; when the ignore-case option is on, its {@code p} field is set
     *                to a second {@code CClassNode} that is passed along to the same operations
     *                as the main class
     * @return the parsed character class, with its negation flag set or cleared according to a
     *         leading unescaped {@code ^}
     */
    private CClassNode parseCharClass(ObjPtr ascNode) {
        final boolean neg;
        CClassNode cc, prevCc = null, ascCc = null, ascPrevCc = null, workCc = null, ascWorkCc = null;
        CCStateArg arg = new CCStateArg();

        // a leading, unescaped '^' marks the class as negated
        fetchTokenInCC();
        if (token.type == TokenType.CHAR && token.getC() == '^' && !token.escaped) {
            neg = true;
            fetchTokenInCC();
        } else {
            neg = false;
        }

        // a ']' in first position (non-ECMAScript syntax): error if no later ']' exists,
        // otherwise warn and treat this one as an ordinary character
        if (token.type == TokenType.CC_CLOSE && !syntax.op3OptionECMAScript()) {
            if (!codeExistCheck(']', true)) newSyntaxException(EMPTY_CHAR_CLASS);
            env.ccEscWarn("]");
            token.type = TokenType.CHAR; /* allow []...] */
        }

        cc = new CClassNode();
        // the companion class exists only under ignore-case and is exposed through ascNode.p
        if (isIgnoreCase(env.option)) {
            ascCc = ascNode.p = new CClassNode();
        }

        boolean andStart = false;
        arg.state = CCSTATE.START;
        while (token.type != TokenType.CC_CLOSE) {
            // when a case sets this, the fetchTokenInCC() at the bottom of the loop is skipped
            boolean fetched = false;

            switch (token.type) {
            case CHAR:
                // note: 'len' is assigned inside the condition, only when the first test is false
                final int len;
                if (token.getCode() >= BitSet.SINGLE_BYTE_SIZE || (len = enc.codeToMbcLength(token.getC())) > 1) {
                    arg.inType = CCVALTYPE.CODE_POINT;
                } else {
                    arg.inType = CCVALTYPE.SB; // sb_char:
                }
                arg.to = token.getC();
                arg.toIsRaw = false;
                parseCharClassValEntry2(cc, ascCc, arg); // goto val_entry2
                break;

            case RAW_BYTE:
                if (!enc.isSingleByte() && token.base != 0) { /* tok->base != 0 : octal or hexadec. */
                    byte[]buf = new byte[Config.ENC_MBC_CASE_FOLD_MAXLEN];
                    int psave = p;
                    int base = token.base;
                    buf[0] = (byte)token.getC();
                    int i;
                    // NOTE(review): the next three lines are truncated in this copy; the rest of the
                    // RAW_BYTE case and the start of the nested-class case are missing
                    for (i=1; i len) { /* fetch back */
                        p = psave;
                        for (i=1; i ascPtr = new ObjPtr();
                // nested class: parse recursively and OR the result into the current class
                CClassNode acc = parseCharClass(ascPtr);
                cc.or(acc, env);
                if (ascPtr.p != null) {
                    ascCc.or(ascPtr.p, env);
                }
                break;

            case CC_AND:     /* && */
                // flush a pending value before starting the next operand
                if (arg.state == CCSTATE.VALUE) {
                    arg.to = 0;
                    arg.toIsRaw = false;
                    cc.nextStateValue(arg, ascCc, env);
                }
                /* initialize local variables */
                andStart = true;
                arg.state = CCSTATE.START;
                if (prevCc != null) {
                    // second or later '&&': fold the operand just parsed into the accumulator
                    prevCc.and(cc, env);
                    if (ascCc != null) {
                        ascPrevCc.and(ascCc, env);
                    }
                } else {
                    // first '&&': what was parsed so far becomes the accumulator, and a
                    // scratch class takes over as the current operand
                    prevCc = cc;
                    if (workCc == null) workCc = new CClassNode();
                    cc = workCc;
                    if (ascCc != null) {
                        ascPrevCc = ascCc;
                        if (ascWorkCc == null) ascWorkCc = new CClassNode();
                        ascCc = ascWorkCc;
                    }
                }
                cc.clear();
                if (ascCc != null) ascCc.clear();
                break;

            case EOT:
                // no break follows; presumably newSyntaxException always throws
                newSyntaxException(PREMATURE_END_OF_CHAR_CLASS);

            default:
                newInternalException(PARSER_BUG);
            } // switch

            if (!fetched) fetchTokenInCC();

        } // while

        // flush a value still pending when the closing bracket was reached
        if (arg.state == CCSTATE.VALUE) {
            arg.to = 0;
            arg.toIsRaw = false;
            cc.nextStateValue(arg, ascCc, env);
        }

        // complete an '&&' chain: AND the last operand into the accumulator and return that
        if (prevCc != null) {
            prevCc.and(cc, env);
            cc = prevCc;
            if (ascCc != null) {
                ascPrevCc.and(ascCc, env);
                ascCc = ascPrevCc;
            }
        }

        if (neg) {
            cc.setNot();
            if (ascCc != null) ascCc.setNot();
        } else {
            cc.clearNot();
            if (ascCc != null) ascCc.clearNot();
        }

        // for a negated, non-empty class under this syntax flag, 0x0a is added to the stored set
        // (bit set for a one-byte encoding of 0x0a, code range otherwise); presumably this keeps
        // the negated class from matching a newline
        if (cc.isNot() && syntax.notNewlineInNegativeCC()) {
            if (!cc.isEmpty()) { // ???
                final int NEW_LINE = 0x0a;
                if (enc.isNewLine(NEW_LINE)) {
                    if (enc.codeToMbcLength(NEW_LINE) == 1) {
                        cc.bs.set(env, NEW_LINE);
                    } else {
                        cc.addCodeRange(env, NEW_LINE, NEW_LINE);
                    }
                }
            }
        }

        return cc;
    }

    /**
     * Feeds the current token's character into the class state machine as a
     * single-byte, non-raw value.
     */
    private void parseCharClassSbChar(CClassNode cc, CClassNode ascCc, CCStateArg arg) {
        arg.toIsRaw = false;
        arg.to = token.getC();
        arg.inType = CCVALTYPE.SB;
        // corresponds to "goto val_entry2" in the original control flow
        parseCharClassValEntry2(cc, ascCc, arg);
    }

    /**
     * Submits a literal {@code '-'} as the next, non-raw value of the class being built.
     */
    private void parseCharClassRangeEndVal(CClassNode cc, CClassNode ascCc, CCStateArg arg) {
        arg.toIsRaw = false;
        arg.to = '-';
        // corresponds to "goto val_entry" in the original control flow
        parseCharClassValEntry(cc, ascCc, arg);
    }

    /**
     * Classifies {@code arg.to} as single-byte or code point from its encoded length,
     * then hands the value on to {@code parseCharClassValEntry2}.
     */
    private void parseCharClassValEntry(CClassNode cc, CClassNode ascCc, CCStateArg arg) {
        final int encodedLength = enc.codeToMbcLength(arg.to);
        if (encodedLength == 1) {
            arg.inType = CCVALTYPE.SB;
        } else {
            arg.inType = CCVALTYPE.CODE_POINT;
        }
        parseCharClassValEntry2(cc, ascCc, arg); // val_entry2:
    }

    /**
     * Common tail of the value-entry helpers: delegates to
     * {@code cc.nextStateValue(arg, ascCc, env)} and does nothing else.
     */
    private void parseCharClassValEntry2(CClassNode cc, CClassNode ascCc, CCStateArg arg) {
        cc.nextStateValue(arg, ascCc, env);
    }

    /**
     * Parses the construct that follows an opening parenthesis and returns its node.
     * <p>
     * Besides the returned node, the method reports through {@code returnCode}:
     * 1 for a plain group (the sub-expression itself is returned), 2 for an option-only group
     * such as {@code (?i)}, and 0 for everything else.
     * <p>
     * NOTE(review): the {@code '<'} case is truncated in this copy of the file; the lines that
     * remain there are a fragment and are kept exactly as found.
     *
     * @param term token type passed on to {@code parseSubExp} for the group body
     * @return the node built for the group
     */
    private Node parseEnclose(TokenType term) {
        Node node = null;

        if (!left()) newSyntaxException(END_PATTERN_WITH_UNMATCHED_PARENTHESIS);

        // working copy of the options; modified by the option-letter handling below
        int option = env.option;

        if (peekIs('?') && syntax.op2QMarkGroupEffect()) {
            inc();
            if (!left()) newSyntaxException(END_PATTERN_IN_GROUP);

            boolean listCapture = false;

            // c now holds the character after "(?" and selects the group kind
            fetch();
            switch(c) {
            case ':':  /* (?:...) grouping only */
                fetchToken(); // group:
                node = parseSubExp(term);
                returnCode = 1; /* group */
                return node;
            case '=':
                node = new AnchorNode(AnchorType.PREC_READ);
                break;
            case '!':  /*         preceding read */
                node = new AnchorNode(AnchorType.PREC_READ_NOT);
                // under ECMAScript syntax the node is pushed here and popped after the body is parsed
                if (syntax.op3OptionECMAScript()) {
                    env.pushPrecReadNotNode(node);
                }
                break;
            case '>':  /* (?>...) stop backtrack */
                node = new EncloseNode(EncloseType.STOP_BACKTRACK); // node_new_enclose
                break;
            case '~': /* (?~...) absent operator */
                if (syntax.op2QMarkTildeAbsent()) {
                    node = new EncloseNode(EncloseType.ABSENT);
                    break;
                } else {
                    // no break on this path; presumably newSyntaxException always throws
                    newSyntaxException(UNDEFINED_GROUP_OPTION);
                }
            case '\'':
                if (Config.USE_NAMED_GROUP) {
                    if (syntax.op2QMarkLtNamedGroup()) {
                        listCapture = false; // goto named_group1
                        node = parseEncloseNamedGroup2(listCapture);
                        break;
                    } else {
                        newSyntaxException(UNDEFINED_GROUP_OPTION);
                    }
                } // USE_NAMED_GROUP
                break;
            // NOTE(review): the body of this case is truncated in this copy of the file
            case '<':  /* look behind (?<=...), (?...) */
                            }
                            unfetch();
                        }
                    } // USE_NAMED_GROUP
                    EncloseNode en = EncloseNode.newMemory(env.option, false);
                    int num = env.addMemEntry();
                    if (num >= BitStatus.BIT_STATUS_BITS_NUM) newValueException(GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY);
                    en.regNum = num;
                    node = en;
                } else {
                    newSyntaxException(UNDEFINED_GROUP_OPTION);
                }
                break;

            case '(':   /* conditional expression: (?(cond)yes), (?(cond)yes|no) */
                if (left() && syntax.op2QMarkLParenCondition()) {
                    int num = -1;
                    // stays -1 unless the condition is written as a name
                    int name = -1;
                    fetch();
                    if (enc.isDigit(c)) { /* (n) */
                        unfetch();
                        num = fetchName('(', true);
                        if (syntax.strictCheckBackref()) {
                            if (num > env.numMem || env.memNodes == null || env.memNodes[num] == null) newValueException(INVALID_BACKREF);
                        }
                    } else {
                        if (Config.USE_NAMED_GROUP) {
                            if (c == '<' || c == '\'') {    /* (), ('name') */
                                name = p;
                                fetchNamedBackrefToken();
                                inc();
                                // several refs for the name: use the first array entry; otherwise the single ref
                                num = token.getBackrefNum() > 1 ? token.getBackrefRefs()[0] : token.getBackrefRef1();
                            }
                        } else { // USE_NAMED_GROUP
                            newSyntaxException(INVALID_CONDITION_PATTERN);
                        }
                    }
                    EncloseNode en = new EncloseNode(EncloseType.CONDITION);
                    en.regNum = num;
                    if (name != -1) en.setNameRef();
                    node = en;
                } else {
                    newSyntaxException(UNDEFINED_GROUP_OPTION);
                }
                break;

            case '^': /* loads default options */
                if (left() && syntax.op2OptionPerl()) {
                    /* d-imsx */
                    option = bsOnOff(option, Option.ASCII_RANGE, true);
                    option = bsOnOff(option, Option.IGNORECASE, true);
                    option = bsOnOff(option, Option.SINGLELINE, false);
                    option = bsOnOff(option, Option.MULTILINE, true);
                    option = bsOnOff(option, Option.EXTEND, true);
                    fetch();
                } else {
                    newSyntaxException(UNDEFINED_GROUP_OPTION);
                }
                // no break: continues into the option-letter handling below with c being
                // the character just fetched

            // case 'p': #ifdef USE_POSIXLINE_OPTION
            case '-':
            case 'i':
            case 'm':
            case 's':
            case 'x':
            case 'a':
            case 'd':
            case 'l':
            case 'u':
                // set once a '-' has been seen; passed to bsOnOff for the letters that follow
                boolean neg = false;
                while (true) {
                    switch(c) {
                    case ':':
                    case ')':
                        break;
                    case '-':
                        neg = true;
                        break;
                    case 'x':
                        option = bsOnOff(option, Option.EXTEND, neg);
                        break;
                    case 'i':
                        option = bsOnOff(option, Option.IGNORECASE, neg);
                        break;
                    case 's':
                        if (syntax.op2OptionPerl()) {
                            option = bsOnOff(option, Option.MULTILINE, neg);
                        } else {
                            newSyntaxException(UNDEFINED_GROUP_OPTION);
                        }
                        break;
                    case 'm':
                        if (syntax.op2OptionPerl()) {
                            option = bsOnOff(option, Option.SINGLELINE, !neg);
                        } else if (syntax.op2OptionRuby()) {
                            option = bsOnOff(option, Option.MULTILINE, neg);
                        } else {
                            newSyntaxException(UNDEFINED_GROUP_OPTION);
                        }
                        break;
                    // case 'p': #ifdef USE_POSIXLINE_OPTION // not defined
                    // option = bsOnOff(option, Option.MULTILINE|Option.SINGLELINE, neg);
                    // break;

                    case 'a':     /* limits \d, \s, \w and POSIX brackets to ASCII range */
                        if ((syntax.op2OptionPerl() || syntax.op2OptionRuby()) && !neg) {
                            option = bsOnOff(option, Option.ASCII_RANGE, false);
                            option = bsOnOff(option, Option.POSIX_BRACKET_ALL_RANGE, true);
                            option = bsOnOff(option, Option.WORD_BOUND_ALL_RANGE, true);
                            break;
                        } else {
                            // no break on this path; presumably newSyntaxException always throws
                            newSyntaxException(UNDEFINED_GROUP_OPTION);
                        }
                    case 'u':
                        if ((syntax.op2OptionPerl() || syntax.op2OptionRuby()) && !neg) {
                            option = bsOnOff(option, Option.ASCII_RANGE, true);
                            option = bsOnOff(option, Option.POSIX_BRACKET_ALL_RANGE, true);
                            option = bsOnOff(option, Option.WORD_BOUND_ALL_RANGE, true);
                            break;
                        } else {
                            newSyntaxException(UNDEFINED_GROUP_OPTION);
                        }

                    case 'd':
                        if (syntax.op2OptionPerl() && !neg) {
                            option = bsOnOff(option, Option.ASCII_RANGE, true);
                        } else if (syntax.op2OptionRuby() && !neg) {
                            option = bsOnOff(option, Option.ASCII_RANGE, false);
                            option = bsOnOff(option, Option.POSIX_BRACKET_ALL_RANGE, false);
                            option = bsOnOff(option, Option.WORD_BOUND_ALL_RANGE, false);
                        } else {
                            newSyntaxException(UNDEFINED_GROUP_OPTION);
                        }
                        break;

                    case 'l':
                        if (syntax.op2OptionPerl() && !neg) {
                            option = bsOnOff(option, Option.ASCII_RANGE, true);
                        } else {
                            newSyntaxException(UNDEFINED_GROUP_OPTION);
                        }
                        break;
                    default:
                        newSyntaxException(UNDEFINED_GROUP_OPTION);
                    } // switch

                    if (c == ')') {
                        // option-only group: return an option node without a target (returnCode 2)
                        node = EncloseNode.newOption(option);
                        returnCode = 2; /* option only */
                        return node;
                    } else if (c == ':') {
                        // options scoped to this group: apply them while parsing the body, then restore
                        int prev = env.option;
                        env.option = option;
                        fetchToken();
                        Node target = parseSubExp(term);
                        env.option = prev;
                        EncloseNode en = EncloseNode.newOption(option);
                        en.setTarget(target);
                        node = en;
                        returnCode = 0;
                        return node;
                    }
                    if (!left()) newSyntaxException(END_PATTERN_IN_GROUP);
                    fetch();
                } // while

            default:
                newSyntaxException(UNDEFINED_GROUP_OPTION);
            } // switch

        } else {
            // plain "(...)": with the don't-capture option it is a bare group, otherwise a capture
            if (isDontCaptureGroup(env.option)) {
                fetchToken(); // goto group
                node = parseSubExp(term);
                returnCode = 1; /* group */
                return node;
            }
            EncloseNode en = EncloseNode.newMemory(env.option, false);
            en.regNum = env.addMemEntry();
            node = en;
        }

        // shared tail for every case that left the switch with break: parse the body and attach it
        fetchToken();
        Node target = parseSubExp(term);

        if (node.getType() == NodeType.ANCHOR) {
            AnchorNode an = (AnchorNode)node;
            an.setTarget(target);
            if (syntax.op3OptionECMAScript() && an.type == AnchorType.PREC_READ_NOT) {
                env.popPrecReadNotNode(an);
            }
        } else {
            EncloseNode en = (EncloseNode)node;
            en.setTarget(target);
            if (en.type == EncloseType.MEMORY) {
                if (syntax.op3OptionECMAScript()) {
                    en.containingAnchor = env.currentPrecReadNotNode();
                }
                /* Don't move this to previous of parse_subexp() */
                env.setMemNode(en.regNum, en);
            } else if (en.type == EncloseType.CONDITION) {
                if (target.getType() != NodeType.ALT) { /* convert (?(cond)yes) to (?(cond)yes|empty) */
                    en.setTarget(ListNode.newAlt(target, ListNode.newAlt(StringNode.EMPTY, null)));
                }
            }
        }
        returnCode = 0;
        return node; // ??
    }

    /**
     * Reads a group name starting at the current position, registers it with the regex under
     * a freshly allocated memory entry, and returns the memory enclose node for it.
     *
     * @param listCapture when set, the group number is range-checked against
     *                    {@code BitStatus.BIT_STATUS_BITS_NUM} and recorded in
     *                    {@code env.captureHistory}
     * @return the new memory {@code EncloseNode}, with {@code regNum} set to the group number
     */
    private Node parseEncloseNamedGroup2(boolean listCapture) {
        final int nameStart = p;
        fetchName(c, false);
        final int nameStop = value;

        final int groupNum = env.addMemEntry();
        if (listCapture) {
            if (groupNum >= BitStatus.BIT_STATUS_BITS_NUM) {
                newValueException(GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY);
            }
        }

        regex.nameAdd(bytes, nameStart, nameStop, groupNum, syntax);

        final EncloseNode memory = EncloseNode.newMemory(env.option, true);
        memory.regNum = groupNum;

        if (listCapture) {
            env.captureHistory = bsOnAtSimple(env.captureHistory, groupNum);
        }
        env.numNamed++;
        return memory;
    }

    /**
     * Walks {@code bytes} one encoded character at a time from {@code from} up to {@code to},
     * looking for a position whose character equals {@code s[0]}.
     * <p>
     * NOTE(review): the inner comparison loop is truncated in this copy of the file, so how the
     * remaining {@code n - 1} code points are matched cannot be confirmed from here.
     *
     * @param s        code points to look for; only {@code s[0]} is used by the code still visible
     * @param n        presumably the number of code points in {@code s}; verify against upstream
     * @param from     byte offset at which the scan starts
     * @param to       byte offset at which the scan stops (exclusive)
     * @param nextChar may have its {@code p} field updated on a match (see the inline condition)
     * @return the byte offset of the match, or -1 if the scan reaches {@code to}
     */
    private int findStrPosition(int[]s, int n, int from, int to, Ptr nextChar) {
        int x;
        int q;
        // local 'p' shadows the field of the same name used elsewhere in this class
        int p = from;
        int i;
        while (p < to) {
            x = enc.mbcToCode(bytes, p, to);
            // q is the offset of the character after the one at p
            q = p + enc.length(bytes, p, to);
            if (x == s[0]) {
                // NOTE(review): the next line is truncated in this copy of the file
                for (i=1; i= n) {
                    if (bytes[nextChar.p] != 0) nextChar.p = q; // we may need zero term semantics...
                    return p;
                }
            }
            p = q;
        }
        return -1;
    }

    /**
     * Parses one expression element for the current token.
     * <p>
     * Token kinds that represent string data ({@code STRING}, {@code RAW_BYTE},
     * {@code CODE_POINT}, and a character class that reduces to a single character) return
     * directly through the string helpers. Every other kind builds a node, fetches the next
     * token and finishes in {@code parseExpRepeat}.
     *
     * @param term token type that terminates the enclosing sub-expression
     * @return {@code StringNode.EMPTY} when the current token is {@code term}, {@code ALT} or
     *         {@code EOT}; otherwise the node produced for the element
     */
    private Node parseExp(TokenType term) {
        if (token.type == term) return StringNode.EMPTY; // goto end_of_token
        Node node = null;
        // set when parseEnclose reports a plain group (returnCode 1); passed on to the helpers
        boolean group = false;

        switch(token.type) {
        case ALT:
        case EOT:
            return StringNode.EMPTY; // end_of_token:, node_new_empty

        case SUBEXP_OPEN:
            node = parseEnclose(TokenType.SUBEXP_CLOSE);
            if (returnCode == 1) {
                group = true;
            } else if (returnCode == 2) { /* option only */
                // the options of an option-only group apply to the rest of the current
                // sub-expression; the previous options are restored once it has been parsed
                int prev = env.option;
                EncloseNode en = (EncloseNode)node;
                env.option = en.option;
                fetchToken();
                Node target = parseSubExp(term);
                env.option = prev;
                en.setTarget(target);
                return node;
            }
            break;
        case SUBEXP_CLOSE:
            // an unmatched ')' is an error unless the syntax allows it, in which case it is
            // handled as string data
            if (!syntax.allowUnmatchedCloseSubexp()) newSyntaxException(UNMATCHED_CLOSE_PARENTHESIS);
            if (token.escaped) {
                return parseExpTkRawByte(group); // goto tk_raw_byte
            } else {
                return parseExpTkByte(group); // goto tk_byte
            }
        case LINEBREAK:
            node = parseLineBreak();
            break;

        case EXTENDED_GRAPHEME_CLUSTER:
            node = parseExtendedGraphemeCluster();
            break;

        case KEEP:
            node = new AnchorNode(AnchorType.KEEP);
            break;

        case STRING:
            return parseExpTkByte(group); // tk_byte:

        case RAW_BYTE:
            return parseExpTkRawByte(group); // tk_raw_byte:

        case CODE_POINT:
            return parseStringLoop(StringNode.fromCodePoint(token.getCode(), enc), group);

        case QUOTE_OPEN:
            node = parseQuoteOpen();
            break;

        case CHAR_TYPE:
            node = parseCharType(node);
            break;

        case CHAR_PROPERTY:
            node = parseCharProperty();
            break;

        case CC_OPEN: {
            // typed holder (instead of a raw ObjPtr) so that ascPtr.p is a CClassNode, not an Object
            ObjPtr<CClassNode> ascPtr = new ObjPtr<CClassNode>();
            CClassNode cc = parseCharClass(ascPtr);
            // a class that amounts to exactly one character is handled as string data
            int code = cc.isOneChar();
            if (code != -1) return parseStringLoop(StringNode.fromCodePoint(code, enc), group);

            node = cc;
            if (isIgnoreCase(env.option)) node = cClassCaseFold(node, cc, ascPtr.p);
            break;
            }

        case ANYCHAR:
            node = new AnyCharNode();
            break;

        case ANYCHAR_ANYTIME:
            node = parseAnycharAnytime();
            break;

        case BACKREF:
            node = parseBackref();
            break;

        case CALL:
            if (Config.USE_SUBEXP_CALL) node = parseCall();
            break;

        case ANCHOR:
            node = new AnchorNode(token.getAnchorSubtype(), token.getAnchorASCIIRange());
            break;

        case OP_REPEAT:
        case INTERVAL:
            // a repeat operator with nothing before it: error, empty target, or plain
            // string data, depending on the syntax flags
            if (syntax.contextIndepRepeatOps()) {
                if (syntax.contextInvalidRepeatOps()) {
                    newSyntaxException(TARGET_OF_REPEAT_OPERATOR_NOT_SPECIFIED);
                } else {
                    node = StringNode.EMPTY; // node_new_empty
                }
            } else {
                return parseExpTkByte(group); // goto tk_byte
            }
            break;

        default:
            newInternalException(PARSER_BUG);
        } //switch

        //targetp = node;

        fetchToken(); // re_entry:

        return parseExpRepeat(node, group); // repeat:
    }

    /**
     * Builds the node for a generic line break: a stop-backtrack group whose alternatives are
     * the two-character sequence 0x0D 0x0A and a character class covering 0x0A-0x0D (plus
     * 0x85, 0x2028 and 0x2029 when the encoding is Unicode).
     *
     * @return the {@code STOP_BACKTRACK} enclose node wrapping both alternatives
     */
    private Node parseLineBreak() {
        // first alternative: 0x0D followed by 0x0A, encoded into one raw string node
        final byte[] crlfBytes = new byte[Config.ENC_CODE_TO_MBC_MAXLEN * 2];
        final int crLength = enc.codeToMbc(0x0D, crlfBytes, 0);
        final int lfLength = enc.codeToMbc(0x0A, crlfBytes, crLength);
        final StringNode crlf = new StringNode(crlfBytes, 0, crLength + lfLength);
        crlf.setRaw();

        // second alternative: [\x0A-\x0D] or [\x0A-\x0D\x{85}\x{2028}\x{2029}]
        final CClassNode singleBreak = new CClassNode();
        if (enc.minLength() <= 1) {
            singleBreak.bs.setRange(env, 0x0A, 0x0D);
        } else {
            singleBreak.addCodeRange(env, 0x0A, 0x0D);
        }
        if (enc.isUnicode()) {
            /* UTF-8, UTF-16BE/LE, UTF-32BE/LE */
            singleBreak.addCodeRange(env, 0x85, 0x85);
            singleBreak.addCodeRange(env, 0x2028, 0x2029);
        }

        /* (?>...) */
        final EncloseNode atomic = new EncloseNode(EncloseType.STOP_BACKTRACK);
        final ListNode alternatives = ListNode.newAlt(crlf, ListNode.newAlt(singleBreak, null));
        atomic.setTarget(alternatives);
        return atomic;
    }

    /**
     * Adds the character type of {@code range} to {@code cc} by delegating to
     * {@code cc.addCType}, passing {@code not} through and {@code false} for the third argument.
     */
    private void addPropertyToCC(CClassNode cc, UnicodeCodeRange range, boolean not) {
        cc.addCType(range.getCType(), not, false, env, this);
    }

    /**
     * Stores at {@code nodes[np]} a new character class holding the (non-negated)
     * character type of {@code range}.
     */
    private void createPropertyNode(Node[]nodes, int np, UnicodeCodeRange range) {
        final CClassNode propertyClass = new CClassNode();
        // same call addPropertyToCC(propertyClass, range, false) makes
        propertyClass.addCType(range.getCType(), false, false, env, this);
        nodes[np] = propertyClass;
    }

    /**
     * Replaces {@code nodes[np]} with a quantifier over it, repeating between
     * {@code lower} and {@code upper} times.
     */
    private void quantifierNode(Node[]nodes, int np, int lower, int upper) {
        final QuantifierNode repeated = new QuantifierNode(lower, upper, false);
        final Node target = nodes[np];
        repeated.setTarget(target);
        nodes[np] = repeated;
    }

    /**
     * Creates a property class for {@code range} at {@code nodes[np]} and wraps it in a
     * quantifier selected by {@code repetitions}.
     *
     * @param repetitions one of {@code '?'} (0..1), {@code '+'} (1..infinite),
     *                    {@code '*'} (0..infinite) or {@code '2'} (exactly 2)
     * @throws InternalException for any other {@code repetitions} value; the property node has
     *                           already been stored at {@code nodes[np]} by then
     */
    private void quantifierPropertyNode(Node[]nodes, int np, UnicodeCodeRange range, char repetitions) {
        createPropertyNode(nodes, np, range);

        final int lower;
        final int upper;
        switch (repetitions) {
        case '?':
            lower = 0;
            upper = 1;
            break;
        case '+':
            lower = 1;
            upper = QuantifierNode.REPEAT_INFINITE;
            break;
        case '*':
            lower = 0;
            upper = QuantifierNode.REPEAT_INFINITE;
            break;
        case '2':
            lower = 2;
            upper = 2;
            break;
        default:
            throw new InternalException(ErrorMessages.PARSER_BUG);
        }

        quantifierNode(nodes, np, lower, upper);
    }

    /**
     * Links the null-terminated run of nodes starting at {@code nodes[nodeArray]} into a chain
     * and leaves the head of that chain in {@code nodes[np]}; the consumed slots are set to null.
     * The chain is built back to front. An empty run leaves {@code nodes[np]} untouched.
     *
     * @param list      {@code true} to link with {@code ListNode.newList}, {@code false} for
     *                  {@code ListNode.newAlt}
     * @param nodes     scratch array holding the run and receiving the result
     * @param np        index that receives the head of the chain
     * @param nodeArray index of the first node of the run
     */
    private void createNodeFromArray(boolean list, Node[] nodes, int np, int nodeArray) {
        // length of the run: stops at the first null slot
        int count = 0;
        while (nodes[nodeArray + count] != null) {
            count++;
        }

        ListNode tail = null;
        for (int idx = nodeArray + count - 1; idx >= nodeArray; idx--) {
            final Node element = nodes[idx];
            if (list) {
                nodes[np] = ListNode.newList(element, tail);
            } else {
                nodes[np] = ListNode.newAlt(element, tail);
            }
            nodes[idx] = null;
            // re-read from the array, after the slot was cleared, exactly as the writes happened
            tail = (ListNode)nodes[np];
        }
    }

    /**
     * Links the null-terminated run of nodes starting at {@code nodes[nodeArray]} into an
     * alternation chain, built back to front, and sets the consumed slots to null.
     *
     * @param nodes     scratch array holding the run
     * @param nodeArray index of the first node of the run
     * @return the head of the chain, or {@code null} when the run is empty
     */
    private ListNode createNodeFromArray(Node[]nodes, int nodeArray) {
        // length of the run: stops at the first null slot
        int count = 0;
        while (nodes[nodeArray + count] != null) {
            count++;
        }

        ListNode head = null;
        for (int idx = nodeArray + count - 1; idx >= nodeArray; idx--) {
            head = ListNode.newAlt(nodes[idx], head);
            nodes[idx] = null;
        }
        return head;
    }

    private static final int NODE_COMMON_SIZE = 16;
    private Node parseExtendedGraphemeCluster() {
        final Node[] nodes = new Node[NODE_COMMON_SIZE];
        final int anyTargetPosition;
        int alts = 0;

        StringNode strNode = new StringNode(Config.ENC_CODE_TO_MBC_MAXLEN * 2);
        strNode.setRaw();
        strNode.catCode(0x0D, enc);
        strNode.catCode(0x0A, enc);
        nodes[alts] = strNode;

        if (Config.USE_UNICODE_PROPERTIES && enc.isUnicode()) {
            CClassNode cc;
            cc = new CClassNode();
            nodes[alts + 1] = cc;
            addPropertyToCC(cc, UnicodeCodeRange.GRAPHEMECLUSTERBREAK_CONTROL, false);
            if (enc.minLength() > 1) {
                cc.addCodeRange(env, 0x000A, 0x000A);
                cc.addCodeRange(env, 0x000D, 0x000D);
            } else {
                cc.bs.set(0x0A);
                cc.bs.set(0x0D);
            }

            {
                int list = alts + 3;
                quantifierPropertyNode(nodes, list + 0, UnicodeCodeRange.GRAPHEMECLUSTERBREAK_PREPEND, '*');
                {
                    int coreAlts = list + 2;
                    {
                        int HList = coreAlts + 1;
                        quantifierPropertyNode(nodes, HList + 0, UnicodeCodeRange.GRAPHEMECLUSTERBREAK_L, '*');
                        {
                            int HAlt2 = HList + 2;
                            quantifierPropertyNode(nodes, HAlt2 + 0, UnicodeCodeRange.GRAPHEMECLUSTERBREAK_V, '+');
                            {
                                int HList2 = HAlt2 + 2;
                                createPropertyNode(nodes, HList2 + 0, UnicodeCodeRange.GRAPHEMECLUSTERBREAK_LV);
                                quantifierPropertyNode(nodes, HList2 + 1, UnicodeCodeRange.GRAPHEMECLUSTERBREAK_V, '*');
                                createNodeFromArray(true, nodes, HAlt2 + 1, HList2);
                            }
                            createPropertyNode(nodes, HAlt2 + 2, UnicodeCodeRange.GRAPHEMECLUSTERBREAK_LVT);
                            createNodeFromArray(false, nodes, HList + 1, HAlt2);
                        }
                        quantifierPropertyNode(nodes, HList + 2, UnicodeCodeRange.GRAPHEMECLUSTERBREAK_T, '*');
                        createNodeFromArray(true, nodes, coreAlts + 0, HList);
                    }
                    quantifierPropertyNode(nodes, coreAlts + 1, UnicodeCodeRange.GRAPHEMECLUSTERBREAK_L, '+');
                    quantifierPropertyNode(nodes, coreAlts + 2, UnicodeCodeRange.GRAPHEMECLUSTERBREAK_T, '+');
                    quantifierPropertyNode(nodes, coreAlts + 3, UnicodeCodeRange.REGIONALINDICATOR, '2');
                    {
                        int XPList = coreAlts + 5;
                        createPropertyNode(nodes, XPList + 0, UnicodeCodeRange.EXTENDEDPICTOGRAPHIC);
                        {
                            int ExList = XPList + 2;
                            quantifierPropertyNode(nodes, ExList + 0, UnicodeCodeRange.GRAPHEMECLUSTERBREAK_EXTEND, '*');
                            strNode = new StringNode(Config.ENC_CODE_TO_MBC_MAXLEN);
                            strNode.setRaw();
                            strNode.catCode(0x200D, enc);
                            nodes[ExList + 1] = strNode;
                            createPropertyNode(nodes, ExList + 2, UnicodeCodeRange.EXTENDEDPICTOGRAPHIC);
                            createNodeFromArray(true, nodes, XPList + 1, ExList);
                        }
                        quantifierNode(nodes, XPList + 1, 0, QuantifierNode.REPEAT_INFINITE);
                        createNodeFromArray(true, nodes, coreAlts + 4, XPList);
                    }
                    cc = new CClassNode();
                    nodes[coreAlts + 5] = cc;
                    if (enc.minLength() > 1) {
                        addPropertyToCC(cc, UnicodeCodeRange.GRAPHEMECLUSTERBREAK_CONTROL, false);
                        cc.addCodeRange(env, 0x000A, 0x000A);
                        cc.addCodeRange(env, 0x000D, 0x000D);
                        cc.mbuf = CodeRangeBuffer.notCodeRangeBuff(env, cc.mbuf);
                    } else {
                        addPropertyToCC(cc, UnicodeCodeRange.GRAPHEMECLUSTERBREAK_CONTROL, true);
                        cc.bs.clear(0x0A);
                        cc.bs.clear(0x0D);
                    }
                    createNodeFromArray(false, nodes, list + 1, coreAlts);
                }
                createPropertyNode(nodes, list + 2, UnicodeCodeRange.GRAPHEMECLUSTERBREAK_EXTEND);
                cc = (CClassNode)nodes[list + 2];
                addPropertyToCC(cc, UnicodeCodeRange.GRAPHEMECLUSTERBREAK_SPACINGMARK, false);
                cc.addCodeRange(env, 0x200D, 0x200D);
                quantifierNode(nodes, list + 2, 0, QuantifierNode.REPEAT_INFINITE);
                createNodeFromArray(true, nodes, alts + 2, list);

            }
            anyTargetPosition = 3;
        } else { // enc.isUnicode()
            anyTargetPosition = 1;
        }

        Node any = new AnyCharNode();
        EncloseNode option = EncloseNode.newOption(bsOnOff(env.option, Option.MULTILINE, false));
        option.setTarget(any);
        nodes[anyTargetPosition] = option;

        Node topAlt = createNodeFromArray(nodes, alts);
        EncloseNode enclose = new EncloseNode(EncloseType.STOP_BACKTRACK);
        enclose.setTarget(topAlt);

        if (Config.USE_UNICODE_PROPERTIES && enc.isUnicode()) {
            option = EncloseNode.newOption(bsOnOff(env.option, Option.IGNORECASE, true));
            option.setTarget(enclose);
            return option;
        } else {
            return enclose;
        }
    }

    /**
     * Handles the "tk_byte" case: starts a string node from the current token
     * and hands it to {@link #parseStringLoop} so that following string tokens
     * can be merged into it.
     *
     * @param group forwarded unchanged to {@code parseStringLoop}
     * @return the node produced by {@code parseStringLoop}
     */
    private Node parseExpTkByte(boolean group) {
        // tk_byte: the node is built over the pattern bytes from token.backP to p
        final StringNode str = new StringNode(bytes, token.backP, p);
        return parseStringLoop(str, group);
    }

    /**
     * Keeps fetching tokens and folds every following STRING or CODE_POINT
     * token into {@code node}; stops at the first token of any other type and
     * then lets {@link #parseExpRepeat} deal with trailing repeat operators.
     *
     * @param node  string node to extend
     * @param group forwarded unchanged to {@code parseExpRepeat}
     * @return the result of {@code parseExpRepeat(node, group)}
     */
    private Node parseStringLoop(StringNode node, boolean group) {
        for (;;) {
            fetchToken();
            final TokenType type = token.type;

            if (type == TokenType.CODE_POINT) {
                node.catCode(token.getCode(), enc);
                continue;
            }
            if (type != TokenType.STRING) {
                break;
            }

            if (token.backP != node.end) {
                // not contiguous with what the node already covers: append a copy of the bytes
                node.catBytes(bytes, token.backP, p);
            } else {
                // token starts exactly where the node ends: just widen the node's range
                node.end = p;
            }
        }
        // string_end: continue with the repeat handling
        return parseExpRepeat(node, group);
    }

    /**
     * Handles the "tk_raw_byte" case: collects consecutive RAW_BYTE tokens into
     * a raw string node until the collected bytes form one complete character
     * according to {@code enc.length(...)}.
     *
     * If a non RAW_BYTE token shows up before the character is complete,
     * {@code newValueException(TOO_SHORT_MULTI_BYTE_STRING)} is invoked.
     *
     * @param group forwarded unchanged to {@code parseExpRepeat}
     * @return the result of {@code parseExpRepeat} for the collected node
     */
    private Node parseExpTkRawByte(boolean group) {
        // tk_raw_byte:
        final StringNode raw = new StringNode();
        raw.setRaw();
        raw.catByte((byte)token.getC());

        // 'collected' is the number of raw bytes appended so far
        for (int collected = 1; ; collected++) {
            final boolean complete = collected >= enc.minLength()
                    && collected == enc.length(raw.bytes, raw.p, raw.end);
            if (complete) {
                fetchToken();
                raw.clearRaw();
                // string_end:
                return parseExpRepeat(raw, group);
            }

            fetchToken();
            if (token.type != TokenType.RAW_BYTE) {
                /* Padding short characters (USE_PAD_TO_SHORT_BYTE_CHAR) is deliberately
                   not done here: it is wrong for little endian encodings. */
                newValueException(TOO_SHORT_MULTI_BYTE_STRING);
            }
            raw.catByte((byte)token.getC());
        }
    }

    /**
     * Consumes every OP_REPEAT / INTERVAL token that follows {@code target}
     * and wraps the target in the corresponding quantifier nodes.
     *
     * The value returned by {@code QuantifierNode.setQuantifier} decides what
     * happens with each quantifier:
     * <ul>
     *   <li>0 (or 1 when the ECMAScript syntax option is on): the quantifier
     *       becomes the new target;</li>
     *   <li>2 (split case, e.g. {@code /abc+/}): a two element list of the
     *       target and the quantifier is built and the remaining repeat tokens
     *       are handled by {@link #parseExpRepeatForCar};</li>
     *   <li>anything else: the target is left as it is.</li>
     * </ul>
     * A possessive repeat is additionally wrapped in a STOP_BACKTRACK enclose.
     *
     * @param target node the repeat operators apply to
     * @param group  passed to {@code setQuantifier}; when false and the
     *               ECMAScript syntax option is on, quantifying a quantifier
     *               is reported via NESTED_REPEAT_NOT_ALLOWED
     * @return the resulting node
     */
    private Node parseExpRepeat(Node target, boolean group) {
        // repeat:
        while (token.type == TokenType.OP_REPEAT || token.type == TokenType.INTERVAL) {
            if (isInvalidQuantifier(target)) {
                newSyntaxException(TARGET_OF_REPEAT_OPERATOR_INVALID);
            }
            if (!group && syntax.op3OptionECMAScript() && target.getType() == NodeType.QTFR) {
                newSyntaxException(NESTED_REPEAT_NOT_ALLOWED);
            }

            final QuantifierNode quantifier = new QuantifierNode(
                    token.getRepeatLower(), token.getRepeatUpper(), token.type == TokenType.INTERVAL);
            quantifier.greedy = token.getRepeatGreedy();
            final int ret = quantifier.setQuantifier(target, group, env, bytes, getBegin(), getEnd());

            Node quantified = quantifier;
            if (token.getRepeatPossessive()) {
                // node_new_enclose
                final EncloseNode atomic = new EncloseNode(EncloseType.STOP_BACKTRACK);
                atomic.setTarget(quantifier);
                quantified = atomic;
            }

            if (ret == 2) {
                /* split case: /abc+/ */
                final ListNode head = ListNode.newList(target, null);
                final ListNode rest = ListNode.newList(quantified, null);
                head.setTail(rest);

                fetchToken();
                return parseExpRepeatForCar(head, rest, group);
            }

            if (ret == 0 || (ret == 1 && syntax.op3OptionECMAScript())) {
                target = quantified;
            }
            fetchToken(); // goto re_entry
        }
        return target;
    }

    /**
     * Continuation of {@link #parseExpRepeat} after a split: further repeat
     * tokens are applied to the value held by the list cell {@code target},
     * while {@code top} is what finally gets returned.
     *
     * When {@code setQuantifier} returns 0 the cell's value is replaced by the
     * new quantifier; a return value of 2 is not expected here and trips an
     * assertion.
     *
     * @param top    node returned to the caller once no repeat token is left
     * @param target list cell whose value is being quantified
     * @param group  passed to {@code setQuantifier}
     * @return {@code top}
     */
    private Node parseExpRepeatForCar(Node top, ListNode target, boolean group) {
        // repeat:
        while (token.type == TokenType.OP_REPEAT || token.type == TokenType.INTERVAL) {
            if (isInvalidQuantifier(target.value)) {
                newSyntaxException(TARGET_OF_REPEAT_OPERATOR_INVALID);
            }

            final QuantifierNode quantifier = new QuantifierNode(
                    token.getRepeatLower(), token.getRepeatUpper(), token.type == TokenType.INTERVAL);
            quantifier.greedy = token.getRepeatGreedy();
            final int ret = quantifier.setQuantifier(target.value, group, env, bytes, getBegin(), getEnd());

            Node quantified = quantifier;
            if (token.getRepeatPossessive()) {
                // node_new_enclose
                final EncloseNode atomic = new EncloseNode(EncloseType.STOP_BACKTRACK);
                atomic.setTarget(quantifier);
                quantified = atomic;
            }

            if (ret == 2) {
                /* split case: /abc+/ */
                assert false;
            } else if (ret == 0) {
                target.setValue(quantified);
            }
            fetchToken(); // goto re_entry
        }
        return top;
    }

    /**
     * Tells whether {@code node} may not be the target of a repeat operator.
     *
     * Always false when {@code Config.USE_NO_INVALID_QUANTIFIER} is set.
     * Otherwise an anchor is invalid, an alternation is invalid as soon as one
     * of its branches is invalid, and everything else (including enclose
     * nodes and, as written, every list) is accepted.
     *
     * @param node candidate repeat target
     * @return true if quantifying {@code node} must be rejected
     */
    private boolean isInvalidQuantifier(Node node) {
        if (Config.USE_NO_INVALID_QUANTIFIER) return false;

        switch (node.getType()) {
        case NodeType.ANCHOR:
            return true;

        case NodeType.ENCLOSE:
            /* allow enclosed elements */
            /* return is_invalid_quantifier_target(NENCLOSE(node)->target); */
            return false;

        case NodeType.LIST:
            // NOTE: both the early exit and the fall-out of the loop yield false,
            // so a list is never reported as invalid.
            for (ListNode item = (ListNode)node; item != null; item = item.tail) {
                if (!isInvalidQuantifier(item.value)) return false;
            }
            return false;

        case NodeType.ALT:
            for (ListNode branch = (ListNode)node; branch != null; branch = branch.tail) {
                if (isInvalidQuantifier(branch.value)) return true;
            }
            return false;

        default:
            return false;
        }
    }

    /**
     * Builds a string node for a quoted section that starts at the current
     * position {@code p}. The end is located with {@code findStrPosition},
     * searching for the syntax's escape character followed by {@code 'E'};
     * when no terminator is found the section runs up to {@code stop}.
     * Afterwards {@code p} is moved to the position reported through the
     * {@code Ptr} out-parameter (or to {@code stop} if nothing was found).
     *
     * @return string node over the quoted bytes
     */
    private Node parseQuoteOpen() {
        final int[] terminator = new int[]{syntax.metaCharTable.esc, 'E'};
        final int from = p;
        final Ptr after = new Ptr();

        int to = findStrPosition(terminator, terminator.length, from, stop, after);
        if (to == -1) {
            // no closing sequence: everything up to the end of the pattern is quoted
            to = stop;
            after.p = stop;
        }

        final Node quoted = new StringNode(bytes, from, to);
        p = after.p;
        return quoted;
    }

    /**
     * Creates the node for a character-type token.
     *
     * WORD yields a {@code CTypeNode}; SPACE, DIGIT and XDIGIT yield a
     * {@code CClassNode} filled via {@code addCType} and negated when the
     * token asks for it. Any other type is reported through
     * {@code newInternalException(PARSER_BUG)}.
     *
     * @param node value handed back unchanged for an unexpected character type
     * @return the newly created node
     */
    private Node parseCharType(Node node) {
        final int ctype = token.getPropCType();
        switch (ctype) {
        case CharacterType.WORD:
            return new CTypeNode(ctype, token.getPropNot(), isAsciiRange(env.option));

        case CharacterType.SPACE:
        case CharacterType.DIGIT:
        case CharacterType.XDIGIT:
            final CClassNode charClass = new CClassNode();
            charClass.addCType(ctype, false, isAsciiRange(env.option), env, this);
            if (token.getPropNot()) {
                charClass.setNot();
            }
            return charClass;

        default:
            newInternalException(PARSER_BUG);
            return node;
        }
    }

    /**
     * Runs {@code enc.applyAllCaseFold} with {@code ApplyCaseFold.INSTANCE}
     * over the given character classes. If that run left an
     * {@code altRoot} in the argument object, {@code node} and that root are
     * combined into an alternation.
     *
     * @param node  node to return, possibly as the first alternative
     * @param cc    character class passed to {@code ApplyCaseFoldArg}
     * @param ascCc second character class passed to {@code ApplyCaseFoldArg}
     * @return {@code node} itself, or an alternation of {@code node} and the
     *         produced {@code altRoot}
     */
    private Node cClassCaseFold(Node node, CClassNode cc, CClassNode ascCc) {
        final ApplyCaseFoldArg foldArg = new ApplyCaseFoldArg(env, cc, ascCc);
        enc.applyAllCaseFold(env.caseFoldFlag, ApplyCaseFold.INSTANCE, foldArg);

        if (foldArg.altRoot == null) {
            return node;
        }
        return ListNode.newAlt(node, foldArg.altRoot);
    }

    /**
     * Builds a character class for a character-property token. The class is
     * filled from the ctype returned by {@code fetchCharPropertyToCType()} and
     * negated when the token requests it. With the IGNORECASE option on, and
     * for every ctype except ASCII, the class is additionally run through
     * {@link #cClassCaseFold}.
     *
     * @return the character class, or whatever {@code cClassCaseFold} returns
     */
    private Node parseCharProperty() {
        final int ctype = fetchCharPropertyToCType();

        final CClassNode charClass = new CClassNode();
        charClass.addCType(ctype, false, false, env, this);
        if (token.getPropNot()) {
            charClass.setNot();
        }

        // case folding is skipped when case is significant or for the ASCII property
        if (!isIgnoreCase(env.option) || ctype == CharacterType.ASCII) {
            return charClass;
        }
        return cClassCaseFold(charClass, charClass, charClass);
    }

    /**
     * Creates an any-char node repeated from zero up to
     * {@code QuantifierNode.REPEAT_INFINITE} times.
     *
     * @return the quantifier node wrapping a fresh {@code AnyCharNode}
     */
    private Node parseAnycharAnytime() {
        final QuantifierNode anyTimes = new QuantifierNode(0, QuantifierNode.REPEAT_INFINITE, false);
        anyTimes.setTarget(new AnyCharNode());
        return anyTimes;
    }

    /**
     * Creates the node for a back-reference token.
     *
     * Normally this is a {@code BackRefNode} over the token's reference(s).
     * Under the ECMAScript syntax option, for a single reference and with
     * {@code env.memNodes} available, the referenced group is inspected first:
     * if it has a {@code containingAnchor} that is not one of
     * {@code env.precReadNotNodes}, the back-reference is replaced by
     * {@code StringNode.EMPTY}.
     *
     * @return the back-reference node or {@code StringNode.EMPTY}
     */
    private Node parseBackref() {
        final boolean inspectGroup = syntax.op3OptionECMAScript()
                && token.getBackrefNum() == 1
                && env.memNodes != null;

        if (!inspectGroup) {
            final int[] refs = token.getBackrefNum() > 1
                    ? token.getBackrefRefs()
                    : new int[]{token.getBackrefRef1()};
            return newBackRef(refs);
        }

        final EncloseNode referenced = env.memNodes[token.getBackrefRef1()];
        boolean ignore = false;
        if (referenced != null && referenced.containingAnchor != null) {
            // ignored unless the containing anchor is listed in precReadNotNodes
            ignore = true;
            for (Node candidate : env.precReadNotNodes) {
                if (candidate == referenced.containingAnchor) {
                    ignore = false;
                    break;
                }
            }
        }

        if (ignore) {
            return StringNode.EMPTY;
        }
        return newBackRef(new int[]{token.getBackrefRef1()});
    }

    /**
     * Creates a {@code BackRefNode} for {@code backRefs}, taking the remaining
     * constructor arguments (reference count, by-name flag, exist-level and
     * level) from the current token and the scan environment.
     *
     * @param backRefs group numbers the back-reference points to
     * @return the new back-reference node
     */
    private BackRefNode newBackRef(int[]backRefs) {
        final int refCount = token.getBackrefNum();
        return new BackRefNode(
            refCount,
            backRefs,
            token.getBackrefByName(),
            token.getBackrefExistLevel(),
            token.getBackrefLevel(),
            env
        );
    }

    /**
     * Creates a {@code CallNode} for a call token and counts it in
     * {@code env.numCall}.
     *
     * A negative group number, or one flagged by the token as relative, is
     * converted with {@code backrefRelToAbs} (a positive relative number is
     * decremented by one first). A converted number that is not positive is
     * reported via {@code newValueException(INVALID_BACKREF)}.
     *
     * @return the new call node
     */
    private Node parseCall() {
        int groupNum = token.getCallGNum();

        final boolean relative = groupNum < 0 || token.getCallRel();
        if (relative) {
            final int offset = groupNum > 0 ? groupNum - 1 : groupNum;
            groupNum = backrefRelToAbs(offset);
            if (groupNum <= 0) {
                newValueException(INVALID_BACKREF);
            }
        }

        final Node call = new CallNode(bytes, token.getCallNameP(), token.getCallNameEnd(), groupNum);
        env.numCall++;
        return call;
    }

    /**
     * Parses one branch: a run of expressions that ends at EOT, at
     * {@code term} or at an ALT token.
     *
     * A branch made of a single expression is returned as is. Otherwise the
     * expressions are chained into a list; an expression that is itself a
     * list is spliced in rather than nested.
     *
     * @param term token type that closes the enclosing sub-expression
     * @return the single expression, or the head of the list of expressions
     */
    private Node parseBranch(TokenType term) {
        final Node first = parseExp(term);
        if (token.type == TokenType.EOT || token.type == term || token.type == TokenType.ALT) {
            return first;
        }

        final ListNode head = ListNode.newList(first, null);
        ListNode last = head;
        // the terminator test above has just failed, so at least one more expression follows
        do {
            final Node next = parseExp(term);
            if (next.getType() != NodeType.LIST) {
                last.setTail(ListNode.newList(next, null));
                last = last.tail;
            } else {
                // splice the returned list in and move to its final cell
                ListNode cell = (ListNode)next;
                last.setTail(cell);
                while (cell.tail != null) {
                    cell = cell.tail;
                }
                last = cell;
            }
        } while (!(token.type == TokenType.EOT || token.type == term || token.type == TokenType.ALT));

        return head;
    }

    /**
     * Parses a sub-expression: one branch, or several branches separated by
     * ALT tokens which are then chained into an alternation.
     *
     * If parsing does not stop on {@code term}, the problem is reported
     * through {@link #parseSubExpError}.
     *
     * @param term terminating token type (term_tok: TK_EOT or TK_SUBEXP_CLOSE)
     * @return the single branch, or the head of the alternation
     */
    private Node parseSubExp(TokenType term) {
        final Node first = parseBranch(term);

        if (token.type == term) {
            return first;
        }
        if (token.type != TokenType.ALT) {
            parseSubExpError(term);
            return null; // not reached
        }

        final ListNode head = ListNode.newAlt(first, null);
        ListNode last = head;
        do {
            fetchToken();
            final Node branch = parseBranch(term);
            last.setTail(ListNode.newAlt(branch, null));
            last = last.tail;
        } while (token.type == TokenType.ALT);

        if (token.type != term) {
            parseSubExpError(term);
        }
        return head;
    }

    /**
     * Reports a sub-expression that did not end on its terminator: an
     * unmatched-parenthesis syntax error when a SUBEXP_CLOSE was expected,
     * otherwise an internal PARSER_BUG error.
     *
     * @param term the terminator that was expected
     */
    private void parseSubExpError(TokenType term) {
        if (term != TokenType.SUBEXP_CLOSE) {
            newInternalException(PARSER_BUG);
            return;
        }
        newSyntaxException(END_PATTERN_WITH_UNMATCHED_PARENTHESIS);
    }

    /**
     * Parses the whole pattern up to EOT.
     *
     * When {@code Config.USE_SUBEXP_CALL} is on and at least one call was
     * counted in {@code env.numCall}, the parsed tree is wrapped in a memory
     * enclose with register number 0, which is also stored in
     * {@code env.memNodes[0]} (the array is allocated on demand).
     *
     * @return the root of the parsed tree
     */
    protected final Node parseRegexp() {
        fetchToken();
        final Node pattern = parseSubExp(TokenType.EOT);

        if (!Config.USE_SUBEXP_CALL || env.numCall <= 0) {
            return pattern;
        }

        /* Capture the pattern itself. It is used for (?R), (?0) and \g<0>. */
        final EncloseNode whole = EncloseNode.newMemory(env.option, false);
        whole.regNum = 0;
        whole.setTarget(pattern);
        if (env.memNodes == null) {
            env.memNodes = new EncloseNode[Config.SCANENV_MEMNODES_SIZE];
        }
        env.memNodes[0] = whole;
        return whole;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy