
/*
* Copyright (c) 2018, 2020, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* The Universal Permissive License (UPL), Version 1.0
*
* Subject to the condition set forth below, permission is hereby granted to any
* person obtaining a copy of this software, associated documentation and/or
* data (collectively the "Software"), free of charge and under any and all
* copyright rights in the Software, and any and all patent rights owned or
* freely licensable by each licensor hereunder covering either (i) the
* unmodified Software as contributed to or provided by such licensor, or (ii)
* the Larger Works (as defined below), to deal in both
*
* (a) the Software, and
*
* (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if
* one is included with the Software (each a "Larger Work" to which the Software
* is contributed by such licensors),
*
* without restriction, including without limitation the rights to copy, create
* derivative works of, display, perform, and distribute the Software and make,
* use, sell, offer for sale, import, export, have made, and have sold the
* Software and the Larger Work(s), and to sublicense the foregoing rights on
* either these or other terms.
*
* This license is subject to the following condition:
*
* The above copyright notice and either this complete permission notice or at a
* minimum a reference to the UPL must be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
package com.oracle.truffle.regex.tregex.parser.ast;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.stream.Stream;
import org.graalvm.collections.EconomicMap;
import org.graalvm.collections.Equivalence;
import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary;
import com.oracle.truffle.api.source.SourceSection;
import com.oracle.truffle.regex.RegexFlags;
import com.oracle.truffle.regex.RegexOptions;
import com.oracle.truffle.regex.RegexSource;
import com.oracle.truffle.regex.UnsupportedRegexException;
import com.oracle.truffle.regex.charset.CodePointSet;
import com.oracle.truffle.regex.charset.Constants;
import com.oracle.truffle.regex.tregex.TRegexOptions;
import com.oracle.truffle.regex.tregex.automaton.StateIndex;
import com.oracle.truffle.regex.tregex.automaton.StateSet;
import com.oracle.truffle.regex.tregex.parser.Counter;
import com.oracle.truffle.regex.tregex.parser.RegexProperties;
import com.oracle.truffle.regex.tregex.parser.Token;
import com.oracle.truffle.regex.tregex.parser.ast.visitors.ASTDebugDumpVisitor;
import com.oracle.truffle.regex.tregex.parser.ast.visitors.CopyVisitor;
import com.oracle.truffle.regex.tregex.string.AbstractStringBuffer;
import com.oracle.truffle.regex.tregex.string.Encodings.Encoding;
import com.oracle.truffle.regex.tregex.util.json.Json;
import com.oracle.truffle.regex.tregex.util.json.JsonArray;
import com.oracle.truffle.regex.tregex.util.json.JsonConvertible;
import com.oracle.truffle.regex.tregex.util.json.JsonValue;
import com.oracle.truffle.regex.util.CompilationFinalBitSet;
public final class RegexAST implements StateIndex<RegexASTNode>, JsonConvertible {
/**
* Original pattern as seen by the parser.
*/
private final RegexSource source;
private final RegexFlags flags;
private final RegexOptions options;
private final Counter.ThresholdCounter nodeCount = new Counter.ThresholdCounter(TRegexOptions.TRegexParserTreeMaxSize, "parse tree explosion");
private final Counter.ThresholdCounter groupCount = new Counter.ThresholdCounter(TRegexOptions.TRegexMaxNumberOfCaptureGroups, "too many capture groups");
private final Counter quantifierCount = new Counter();
private final Counter zeroWidthQuantifierCount = new Counter();
private final RegexProperties properties = new RegexProperties();
private RegexASTNode[] nodes;
/**
* AST as parsed from the expression.
*/
private Group root;
/**
* Possibly wrapped root for NFA generation (see {@link #createPrefix()}).
*/
private Group wrappedRoot;
private Group[] captureGroups;
private final LookAroundIndex lookArounds = new LookAroundIndex();
private final List<PositionAssertion> reachableCarets = new ArrayList<>();
private final List<PositionAssertion> reachableDollars = new ArrayList<>();
private StateSet nfaAnchoredInitialStates;
private StateSet hardPrefixNodes;
private final EconomicMap<GroupBoundaries, GroupBoundaries> groupBoundariesDeduplicationMap = EconomicMap.create();
private int negativeLookaheads = 0;
private int negativeLookbehinds = 0;
private final EconomicMap<RegexASTNode, List<SourceSection>> sourceSections;
public RegexAST(RegexSource source, RegexFlags flags, RegexOptions options) {
this.source = source;
this.flags = flags;
this.options = options;
sourceSections = options.isDumpAutomata() ? EconomicMap.create(Equivalence.IDENTITY_WITH_SYSTEM_HASHCODE) : null;
}
public RegexSource getSource() {
return source;
}
public RegexFlags getFlags() {
return flags;
}
public RegexOptions getOptions() {
return options;
}
public Encoding getEncoding() {
return source.getEncoding();
}
public Group getRoot() {
return root;
}
public void setRoot(Group root) {
this.root = root;
}
public Group getWrappedRoot() {
return wrappedRoot;
}
public boolean rootIsWrapped() {
return wrappedRoot != null && root != wrappedRoot;
}
public Counter.ThresholdCounter getNodeCount() {
return nodeCount;
}
public int getNumberOfNodes() {
return nodeCount.getCount();
}
public Counter.ThresholdCounter getGroupCount() {
return groupCount;
}
/**
* @return the number of capturing groups in the AST, including group 0.
*/
public int getNumberOfCaptureGroups() {
return groupCount.getCount();
}
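// Example: because group 0 denotes the implicit group spanning the whole match (see the
// javadoc above), a pattern such as /(a)(b)/ is reported here as having 3 capture groups
// (numbers 0, 1 and 2).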
public Counter getQuantifierCount() {
return quantifierCount;
}
public Counter getZeroWidthQuantifierCount() {
return zeroWidthQuantifierCount;
}
public Group getGroupByBoundaryIndex(int index) {
if (captureGroups == null) {
captureGroups = new Group[getNumberOfCaptureGroups()];
for (RegexASTNode n : nodes) {
if (n instanceof Group && ((Group) n).isCapturing()) {
captureGroups[((Group) n).getGroupNumber()] = (Group) n;
}
}
}
return captureGroups[index / 2];
}
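// Note: the index / 2 mapping above assumes the usual boundary-index convention, i.e. capture
// group n owns boundary index 2 * n (group start) and 2 * n + 1 (group end), so boundary
// indices 2 and 3 both resolve to capture group 1.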
public RegexProperties getProperties() {
return properties;
}
public boolean isLiteralString() {
Group r = getRoot();
RegexProperties p = getProperties();
return !((p.hasBackReferences() || p.hasAlternations() || p.hasLookAroundAssertions() || r.hasLoops()) || ((r.startsWithCaret() || r.endsWithDollar()) && getFlags().isMultiline())) &&
(!p.hasCharClasses() || p.charClassesCanBeMatchedWithMask());
}
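// Examples for the predicate above: /abc/ qualifies as a literal string, while /ab+c/ (loop),
// /a|b/ (alternation), /(?=a)b/ (look-around) and /(a)\1/ (back-reference) do not; /^abc/ is
// only rejected when the multiline flag is set. Character classes are tolerated as long as
// they can be matched with a bit mask, e.g. the two-element classes produced by /abc/i.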
@Override
public int getNumberOfStates() {
return nodes.length;
}
@Override
public int getId(RegexASTNode state) {
return state.getId();
}
@Override
public RegexASTNode getState(int id) {
return nodes[id];
}
public void setIndex(RegexASTNode[] index) {
this.nodes = index;
}
/**
* @return length of prefix possibly generated by {@link #createPrefix()}.
*/
public int getWrappedPrefixLength() {
if (rootIsWrapped()) {
// The single alternative in the wrappedRoot is composed of N non-optional prefix
// matchers, 1 group of optional matchers and the original root. By
// taking size() - 2, we get the number of non-optional prefix matchers.
return wrappedRoot.getFirstAlternative().size() - 2;
}
return 0;
}
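// Worked example for size() - 2: with a prefix length of 2, the wrapped root's single
// alternative consists of [_any_][_any_], one group of optional matchers and the original
// root, i.e. 4 terms in total, so 4 - 2 = 2.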
/**
* @return first element of sequence of optional any-char matchers possibly generated by
* {@link #createPrefix()}.
*/
public RegexASTNode getEntryAfterPrefix() {
if (rootIsWrapped()) {
return wrappedRoot.getFirstAlternative().getTerms().get(getWrappedPrefixLength());
}
return wrappedRoot;
}
public LookAroundIndex getLookArounds() {
return lookArounds;
}
public List<PositionAssertion> getReachableCarets() {
return reachableCarets;
}
public List<PositionAssertion> getReachableDollars() {
return reachableDollars;
}
public StateSet getNfaAnchoredInitialStates() {
return nfaAnchoredInitialStates;
}
public StateSet getHardPrefixNodes() {
return hardPrefixNodes;
}
public RegexASTRootNode createRootNode() {
final RegexASTRootNode node = new RegexASTRootNode();
createNFAHelperNodes(node);
return node;
}
public BackReference createBackReference(int groupNumber) {
return register(new BackReference(groupNumber));
}
public CharacterClass createCharacterClass(CodePointSet matcherBuilder) {
assert getEncoding().getFullSet().contains(matcherBuilder);
return register(new CharacterClass(matcherBuilder));
}
public Group createGroup() {
return register(new Group());
}
public Group createCaptureGroup(int groupNumber) {
return register(new Group(groupNumber));
}
public LookAheadAssertion createLookAheadAssertion(boolean negated) {
final LookAheadAssertion assertion = new LookAheadAssertion(negated);
createNFAHelperNodes(assertion);
return register(assertion);
}
public LookBehindAssertion createLookBehindAssertion(boolean negated) {
final LookBehindAssertion assertion = new LookBehindAssertion(negated);
createNFAHelperNodes(assertion);
return register(assertion);
}
public void createNFAHelperNodes(RegexASTSubtreeRootNode rootNode) {
nodeCount.inc(4);
PositionAssertion anchored = new PositionAssertion(PositionAssertion.Type.CARET);
rootNode.setAnchoredInitialState(anchored);
MatchFound unAnchored = new MatchFound();
rootNode.setUnAnchoredInitialState(unAnchored);
MatchFound end = new MatchFound();
rootNode.setMatchFound(end);
PositionAssertion anchoredEnd = new PositionAssertion(PositionAssertion.Type.DOLLAR);
rootNode.setAnchoredFinalState(anchoredEnd);
}
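// The four synthetic states created above (anchored/unanchored entry and anchored/unanchored
// exit of the subtree) are never passed through register(), which is why the method counts
// them explicitly via nodeCount.inc(4).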
public PositionAssertion createPositionAssertion(PositionAssertion.Type type) {
return register(new PositionAssertion(type));
}
public Sequence createSequence() {
return register(new Sequence());
}
public BackReference register(BackReference backReference) {
nodeCount.inc();
properties.setBackReferences();
return backReference;
}
public CharacterClass register(CharacterClass characterClass) {
nodeCount.inc();
updatePropsCC(characterClass);
return characterClass;
}
public void updatePropsCC(CharacterClass characterClass) {
if (!characterClass.getCharSet().matchesSingleChar()) {
if (!characterClass.getCharSet().matches2CharsWith1BitDifference()) {
properties.unsetCharClassesCanBeMatchedWithMask();
}
if (!getEncoding().isFixedCodePointWidth(characterClass.getCharSet())) {
properties.setFixedCodePointWidth(false);
}
properties.setCharClasses();
}
if (Constants.SURROGATES.intersects(characterClass.getCharSet())) {
properties.setLoneSurrogates();
}
}
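// Example for the "matched with mask" check above: [Aa] contains 0x41 and 0x61, which differ
// in exactly one bit (0x20), so the class can be matched by OR-ing the input with 0x20 and
// comparing against 0x61; [ab] (0x61 vs 0x62) differs in two bits and therefore clears
// charClassesCanBeMatchedWithMask.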
public Group register(Group group) {
nodeCount.inc();
if (group.isCapturing() && group.getGroupNumber() != 0) {
properties.setCaptureGroups();
}
return group;
}
public LookAheadAssertion register(LookAheadAssertion lookAheadAssertion) {
nodeCount.inc();
properties.setLookAheadAssertions();
if (lookAheadAssertion.isNegated()) {
negativeLookaheads++;
properties.setNegativeLookAheadAssertions();
}
return lookAheadAssertion;
}
public LookBehindAssertion register(LookBehindAssertion lookBehindAssertion) {
nodeCount.inc();
properties.setLookBehindAssertions();
if (lookBehindAssertion.isNegated()) {
negativeLookbehinds++;
properties.setNegativeLookBehindAssertions();
}
return lookBehindAssertion;
}
public void invertNegativeLookAround(LookAroundAssertion assertion) {
assert assertion.isNegated();
assertion.setNegated(false);
if (assertion.isLookAheadAssertion()) {
assert negativeLookaheads > 0;
if (--negativeLookaheads == 0) {
properties.setNegativeLookAheadAssertions(false);
}
} else {
assert negativeLookbehinds > 0;
if (--negativeLookbehinds == 0) {
properties.setNegativeLookBehindAssertions(false);
}
}
}
public PositionAssertion register(PositionAssertion positionAssertion) {
nodeCount.inc();
return positionAssertion;
}
public Sequence register(Sequence sequence) {
nodeCount.inc();
return sequence;
}
public boolean isNFAInitialState(RegexASTNode node) {
return node.getId() >= 1 && node.getId() <= getWrappedPrefixLength() * 2 + 2;
}
private void createNFAInitialStates() {
if (nfaAnchoredInitialStates != null) {
return;
}
hardPrefixNodes = StateSet.create(this);
nfaAnchoredInitialStates = StateSet.create(this);
int nextID = 1;
MatchFound mf = new MatchFound();
initNodeId(mf, nextID++);
mf.setNext(getEntryAfterPrefix());
PositionAssertion pos = new PositionAssertion(PositionAssertion.Type.CARET);
initNodeId(pos, nextID++);
nfaAnchoredInitialStates.add(pos);
pos.setNext(getEntryAfterPrefix());
for (int i = getWrappedPrefixLength() - 1; i >= 0; i--) {
RegexASTNode prefixNode = getWrappedRoot().getFirstAlternative().getTerms().get(i);
hardPrefixNodes.add(prefixNode);
mf = new MatchFound();
initNodeId(mf, nextID++);
mf.setNext(prefixNode);
pos = new PositionAssertion(PositionAssertion.Type.CARET);
initNodeId(pos, nextID++);
nfaAnchoredInitialStates.add(pos);
pos.setNext(prefixNode);
}
}
public MatchFound getNFAUnAnchoredInitialState(int prefixOffset) {
createNFAInitialStates();
assert nodes[prefixOffset * 2 + 1] != null;
return (MatchFound) nodes[prefixOffset * 2 + 1];
}
public PositionAssertion getNFAAnchoredInitialState(int prefixOffset) {
createNFAInitialStates();
assert nodes[prefixOffset * 2 + 2] != null;
return (PositionAssertion) nodes[prefixOffset * 2 + 2];
}
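// Sketch of the id layout produced by createNFAInitialStates() and relied on by the two
// lookups above: ids start at 1 and are handed out in MatchFound/PositionAssertion pairs per
// prefix offset, so prefixOffset 0 maps to ids 1 (unanchored) and 2 (anchored), prefixOffset 1
// to ids 3 and 4, and so on. This is also the id range checked by isNFAInitialState()
// (1 .. 2 * prefixLength + 2).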
/**
* Inserts a prefix of matchers that match any characters at the beginning of the AST. The
* length of the prefix is determined by the look-behind assertions present in the regex. Any
* necessary context that could be matched by the look-behind assertions but not by the original
* regex can be captured by the prefix. Example: {@code
* regex: /(?<=ab)/
* -> prefix length: 2
* -> result: /(?:[_any_][_any_](?:|[_any_](?:|[_any_])))(?<=ab)/
* -> the non-optional [_any_] matchers will be used if fromIndex > 0,
* the optional matchers will always be used
* }
*/
public void createPrefix() {
if (root.startsWithCaret() || properties.hasNonLiteralLookBehindAssertions()) {
wrappedRoot = root;
return;
}
int prefixLength = 0;
for (LookAroundAssertion lb : lookArounds) {
if (lb instanceof LookAheadAssertion) {
continue;
}
int minPath = lb.getMinPath();
RegexASTSubtreeRootNode laParent = lb.getSubTreeParent();
while (!(laParent instanceof RegexASTRootNode)) {
if (laParent instanceof LookBehindAssertion) {
throw new UnsupportedRegexException("nested look-behind assertions");
}
minPath += laParent.getMinPath();
laParent = laParent.getSubTreeParent();
}
prefixLength = Math.max(prefixLength, lb.getLiteralLength() - minPath);
}
if (prefixLength == 0) {
wrappedRoot = root;
return;
}
final Group wrapRoot = createGroup();
wrapRoot.setPrefix();
final Sequence wrapRootSeq = createSequence();
wrapRoot.add(wrapRootSeq);
wrapRootSeq.setPrefix();
// create non-optional matchers ([_any_][_any_]...)
for (int i = 0; i < prefixLength; i++) {
wrapRootSeq.add(createPrefixAnyMatcher());
}
Group prevOpt = null;
// create optional matchers ((?:|[_any_](?:|[_any_]))...)
for (int i = 0; i < prefixLength; i++) {
Group opt = createGroup();
opt.setPrefix();
opt.add(createSequence());
opt.add(createSequence());
opt.getFirstAlternative().setPrefix();
opt.getAlternatives().get(1).setPrefix();
opt.getAlternatives().get(1).add(createPrefixAnyMatcher());
if (prevOpt != null) {
opt.getAlternatives().get(1).add(prevOpt);
}
prevOpt = opt;
}
root.getSubTreeParent().setGroup(wrapRoot);
wrapRootSeq.add(prevOpt);
wrapRootSeq.add(root);
wrappedRoot = wrapRoot;
}
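// Worked example for the prefixLength computation above (assuming getMinPath() yields the
// minimum number of characters matched before the assertion): for /(?<=ab)x/ the look-behind
// has literal length 2 and minPath 0, so prefixLength = 2 and the root gets wrapped in two
// mandatory plus two optional any-char matchers; for /xy(?<=ab)/ the two characters already
// cover the look-behind (2 - 2 = 0), so the root is left unwrapped.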
public void hidePrefix() {
if (wrappedRoot != root) {
root.getSubTreeParent().setGroup(root);
}
}
public void unhidePrefix() {
if (wrappedRoot != root) {
root.getSubTreeParent().setGroup(wrappedRoot);
}
}
public GroupBoundaries createGroupBoundaries(CompilationFinalBitSet updateIndices, CompilationFinalBitSet clearIndices) {
GroupBoundaries staticInstance = GroupBoundaries.getStaticInstance(updateIndices, clearIndices);
if (staticInstance != null) {
return staticInstance;
}
GroupBoundaries lookup = new GroupBoundaries(updateIndices, clearIndices);
if (groupBoundariesDeduplicationMap.containsKey(lookup)) {
return groupBoundariesDeduplicationMap.get(lookup);
} else {
GroupBoundaries gb = new GroupBoundaries(updateIndices.copy(), clearIndices.copy());
groupBoundariesDeduplicationMap.put(gb, gb);
return gb;
}
}
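// Note on the deduplication above: the transient 'lookup' instance wraps the caller's
// (possibly reused) bit sets and is only used to probe the map; on a miss, defensive copies
// are stored as both key and value, so all transitions with identical update/clear sets share
// a single GroupBoundaries object.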
/**
* Creates a {@link CharacterClass} node which matches any character and whose 'prefix' flag is
* set to true.
*/
private CharacterClass createPrefixAnyMatcher() {
final CharacterClass anyMatcher = createCharacterClass(getEncoding().getFullSet());
anyMatcher.setPrefix();
return anyMatcher;
}
private void addToIndex(RegexASTNode node) {
assert node.getId() >= 0;
assert node.getId() < nodes.length;
assert nodes[node.getId()] == null;
nodes[node.getId()] = node;
}
private void initNodeId(RegexASTNode node, int id) {
node.setId(id);
addToIndex(node);
}
/**
* Get a list of all source sections associated with the given {@link RegexASTNode}. The parser
* will map nodes to source sections in the following way:
*
* - {@link Group}: sections of the respective opening and closing brackets, in that order.
* For example, the source sections of a look-ahead assertion will be {@code ["(?=", ")"]}.
* Groups generated by the parser, e.g. {@code (?:a|)} generated for {@code a?}, don't have
* source sections.
* - {@link CharacterClass}: normally these nodes correspond to a single
* {@link com.oracle.truffle.regex.tregex.parser.Token.CharacterClass Token.CharacterClass}, but
* the parser may optimize redundant nodes away and add their source sections to existing nodes.
* Example: {@code a|b} will be optimized to {@code [ab]}, which will be mapped to both original
* characters.
* - {@link Sequence}, {@link MatchFound}, {@link RegexASTSubtreeRootNode}: no mapping.
* - {@link PositionAssertion}, {@link BackReference}: mapped to their respective
* {@link Token}s.
* - Nodes generated by {@link CopyVisitor} are mapped to the same source sections as their
* counterparts.
* - Nodes inserted as substitutions for e.g. {@code \b} will simply point to the source
* section they are substituting.
* - Source sections of {@link com.oracle.truffle.regex.tregex.parser.Token.Quantifier
* quantifiers} are mapped to their respective {@link Term}.
*
*/
public List<SourceSection> getSourceSections(RegexASTNode node) {
return options.isDumpAutomata() ? sourceSections.get(node) : null;
}
public void addSourceSection(RegexASTNode node, Token token) {
if (options.isDumpAutomata() && token != null && token.getSourceSection() != null) {
getOrCreateSourceSections(node).add(token.getSourceSection());
}
}
public void addSourceSections(RegexASTNode node, Collection<SourceSection> src) {
if (options.isDumpAutomata() && src != null) {
getOrCreateSourceSections(node).addAll(src);
}
}
private List<SourceSection> getOrCreateSourceSections(RegexASTNode node) {
List<SourceSection> sections = sourceSections.get(node);
if (sections == null) {
sections = new ArrayList<>();
sourceSections.put(node, sections);
}
return sections;
}
public InnerLiteral extractInnerLiteral() {
assert properties.hasInnerLiteral();
int literalEnd = properties.getInnerLiteralEnd();
int literalStart = properties.getInnerLiteralStart();
AbstractStringBuffer literal = getEncoding().createStringBuffer(literalEnd - literalStart);
AbstractStringBuffer mask = getEncoding().createStringBuffer(literalEnd - literalStart);
boolean hasMask = false;
for (int i = literalStart; i < literalEnd; i++) {
CharacterClass cc = root.getFirstAlternative().getTerms().get(i).asCharacterClass();
assert cc.getCharSet().matchesSingleChar() || cc.getCharSet().matches2CharsWith1BitDifference();
assert getEncoding().isFixedCodePointWidth(cc.getCharSet());
cc.extractSingleChar(literal, mask);
hasMask |= cc.getCharSet().matches2CharsWith1BitDifference();
}
return new InnerLiteral(literal.materialize(), hasMask ? mask.materialize() : null, root.getFirstAlternative().get(literalStart).getMaxPath() - 1);
}
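// Illustrative sketch of the extraction above: every term in the literal range matches either
// a single code point or two code points differing in one bit (see the assertions), so the
// literal can be matched with a masked comparison. For an inner literal like [f][Oo][Oo], the
// literal buffer receives one character per position and the mask marks the differing bit
// (0x20) in the last two positions; the exact buffer contents depend on extractSingleChar,
// which is not shown here.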
@TruffleBoundary
@Override
public JsonValue toJson() {
return Json.obj(Json.prop("source", source),
Json.prop("root", root),
Json.prop("debugAST", ASTDebugDumpVisitor.getDump(wrappedRoot)),
Json.prop("wrappedRoot", wrappedRoot),
Json.prop("reachableCarets", reachableCarets),
Json.prop("startsWithCaret", root.startsWithCaret()),
Json.prop("endsWithDollar", root.endsWithDollar()),
Json.prop("reachableDollars", reachableDollars),
Json.prop("properties", properties));
}
@TruffleBoundary
public static JsonArray sourceSectionsToJson(List<SourceSection> sourceSections) {
if (sourceSections == null) {
return Json.array();
}
return sourceSectionsToJson(sourceSections.stream());
}
@TruffleBoundary
public static JsonArray sourceSectionsToJson(Stream<SourceSection> sourceSections) {
if (sourceSections == null) {
return Json.array();
}
return Json.array(sourceSections.map(x -> Json.obj(
Json.prop("start", x.getCharIndex()),
Json.prop("end", x.getCharEndIndex()))));
}
}