All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.oracle.truffle.regex.tregex.automaton.StateTransitionCanonicalizer Maven / Gradle / Ivy

/*
 * Copyright (c) 2018, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * The Universal Permissive License (UPL), Version 1.0
 *
 * Subject to the condition set forth below, permission is hereby granted to any
 * person obtaining a copy of this software, associated documentation and/or
 * data (collectively the "Software"), free of charge and under any and all
 * copyright rights in the Software, and any and all patent rights owned or
 * freely licensable by each licensor hereunder covering either (i) the
 * unmodified Software as contributed to or provided by such licensor, or (ii)
 * the Larger Works (as defined below), to deal in both
 *
 * (a) the Software, and
 *
 * (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if
 * one is included with the Software each a "Larger Work" to which the Software
 * is contributed by such licensors),
 *
 * without restriction, including without limitation the rights to copy, create
 * derivative works of, display, perform, and distribute the Software and make,
 * use, sell, offer for sale, import, export, have made, and have sold the
 * Software and the Larger Work(s), and to sublicense the foregoing rights on
 * either these or other terms.
 *
 * This license is subject to the following condition:
 *
 * The above copyright notice and either this complete permission notice or at a
 * minimum a reference to the UPL must be included in all copies or substantial
 * portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
package com.oracle.truffle.regex.tregex.automaton;

import java.util.Arrays;
import java.util.Iterator;

import com.oracle.truffle.regex.charset.CodePointSet;
import com.oracle.truffle.regex.tregex.buffer.CompilationBuffer;
import com.oracle.truffle.regex.tregex.buffer.ObjectArrayBuffer;
import com.oracle.truffle.regex.util.CompilationFinalBitSet;

/**
 * This class provides an algorithm for converting a list of NFA transitions into a set of DFA
 * transitions.
 *
 * @param <TB> represents a DFA transition fragment. This type is used for both intermediate and
 *            final results.
 *
 * @see TransitionBuilder
 * @see TransitionSet
 */
public abstract class StateTransitionCanonicalizer, S extends AbstractState, T extends AbstractTransition, TB extends TransitionBuilder> {

    private final ObjectArrayBuffer argTransitions = new ObjectArrayBuffer<>();
    private final ObjectArrayBuffer argCharSets = new ObjectArrayBuffer<>();

    private static final int INITIAL_CAPACITY = 8;
    @SuppressWarnings("unchecked") private ObjectArrayBuffer[] transitionLists = new ObjectArrayBuffer[INITIAL_CAPACITY];
    @SuppressWarnings("unchecked") private StateSet[] targetStateSets = new StateSet[INITIAL_CAPACITY];
    private CodePointSet[] matcherBuilders = new CodePointSet[INITIAL_CAPACITY];
    private CompilationFinalBitSet leadsToFinalState = new CompilationFinalBitSet(INITIAL_CAPACITY);
    private int resultLength = 0;

    private final SI stateIndex;
    private final boolean forward;
    private final boolean prioritySensitive;

    public StateTransitionCanonicalizer(SI stateIndex, boolean forward, boolean prioritySensitive) {
        this.stateIndex = stateIndex;
        this.forward = forward;
        this.prioritySensitive = prioritySensitive;
    }

    /**
     * If priority-sensitive mode, transition sets are pruned after transitions to final states.
     * Also, target state sets are considered equal iff their order is equal as well.
     */
    protected boolean isPrioritySensitive() {
        return prioritySensitive;
    }

    /**
     * Submits an argument to be processed by {@link #run(CompilationBuffer)}.
     */
    public void addArgument(T transition, CodePointSet charSet) {
        argTransitions.add(transition);
        argCharSets.add(charSet);
    }

    /**
     * Runs the NFA to DFA transition conversion algorithm on the NFA transitions given by previous
     * calls to {@link #addArgument(AbstractTransition, CodePointSet)}. This algorithm has two
     * phases:
     * 
    *
  1. Merge NFA transitions according to their expected character sets. The result of this * phase is a list of {@link TransitionBuilder}s whose {@link CodePointSet}s have no more * intersections.
  2. *
  3. Merge {@link TransitionBuilder}s generated by the first phase if their target * state is equal and {@link #canMerge(TransitionBuilder, TransitionBuilder)} returns * {@code true}.
  4. *
* * @return a set of transition builders representing the DFA transitions generated from the * given NFA transitions. */ public TB[] run(CompilationBuffer compilationBuffer) { calcDisjointTransitions(compilationBuffer); TB[] result = mergeSameTargets(compilationBuffer); resultLength = 0; leadsToFinalState.clear(); argTransitions.clear(); argCharSets.clear(); return result; } /** * Merges NFA transitions according to their expected character sets as returned * {@link TransitionBuilder#getCodePointSet()}, in the following way:
*
    *
  • The result of the algorithm is a list of transitions where no two elements have an * intersection in their respective set of expected characters. We initially define the result * as an empty list.
  • *
  • For every element e of the input list, we compare the expected character sets of * e and every element in the result list. If an intersection with an element * r of the result list is found, we distinguish the following two cases, where the * character set of an element x is denoted as x.cs and the transition set of * an element x is denoted as x.ts: *
      *
    • If e.cs contains r.cs, e.ts is merged into r.ts using * TransitionSet#addAll(TransitionSet).
    • *
    • Otherwise, a new transition containing e.ts and r.ts and the * intersection of e.cs and r.cs is added to the result list. This new * transition is created using TransitionBuilder#createMerged(TransitionBuilder, CodePointSet). * The intersection of e.cs and r.cs is removed from r.cs.
    • *
    *
  • *
  • Every time an intersection is found, that intersection is removed from e.cs. If * e.cs is not empty after e has been compared to every element of the result * list, e is added to the result list.
  • *
  • The result list at all times fulfills the property that no two elements in the list * intersect.
  • *
* This algorithm has an important property: every entry in the input list will be merged with * the elements it intersects with in the order of the input list. This means that * given an input list {@code [1, 2, 3]} where all elements intersect with each other, element * {@code 2} will be merged with {@code 1} before {@code 3} is merged with {@code 1}. This * property is crucial for generating priority-sensitive DFAs, since we track priorities of NFA * transitions by their order! * * Example:
* *
     * input: [
     *   {transitionSet {1}, matcherBuilder [ab]},
     *   {transitionSet {2}, matcherBuilder [bc]}
     * ]
     * output: [
     *   {transitionSet {1},    matcherBuilder [a]},
     *   {transitionSet {1, 2}, matcherBuilder [b]},
     *   {transitionSet {2},    matcherBuilder [c]}
     * ]
     * 
*/ @SuppressWarnings("unchecked") private void calcDisjointTransitions(CompilationBuffer compilationBuffer) { for (int i = 0; i < argTransitions.length(); i++) { T argTransition = argTransitions.get(i); CodePointSet argCharSet = argCharSets.get(i); int currentResultLength = resultLength; for (int j = 0; j < currentResultLength; j++) { CodePointSet.IntersectAndSubtractResult result = matcherBuilders[j].intersectAndSubtract(argCharSet, compilationBuffer); CodePointSet rSubtractedMatcher = result.subtractedA; CodePointSet eSubtractedMatcher = result.subtractedB; CodePointSet intersection = result.intersection; if (intersection.matchesSomething()) { if (rSubtractedMatcher.matchesNothing()) { addTransitionTo(j, argTransition); } else { createSlot(); matcherBuilders[j] = rSubtractedMatcher; matcherBuilders[resultLength] = intersection; targetStateSets[resultLength] = targetStateSets[j].copy(); transitionLists[resultLength].addAll(transitionLists[j]); if (isPrioritySensitive() && leadsToFinalState.get(j)) { leadsToFinalState.set(resultLength); } addTransitionTo(resultLength, argTransition); resultLength++; } argCharSet = eSubtractedMatcher; if (eSubtractedMatcher.matchesNothing()) { break; } } } if (argCharSet.matchesSomething()) { createSlot(); targetStateSets[resultLength] = StateSet.create(stateIndex); matcherBuilders[resultLength] = argCharSet; addTransitionTo(resultLength, argTransition); resultLength++; } } } private void createSlot() { if (transitionLists.length <= resultLength) { transitionLists = Arrays.copyOf(transitionLists, resultLength * 2); targetStateSets = Arrays.copyOf(targetStateSets, resultLength * 2); matcherBuilders = Arrays.copyOf(matcherBuilders, resultLength * 2); } if (transitionLists[resultLength] == null) { transitionLists[resultLength] = new ObjectArrayBuffer<>(); } transitionLists[resultLength].clear(); } private void addTransitionTo(int i, T transition) { if (isPrioritySensitive() && leadsToFinalState.get(i)) { return; } if 
(targetStateSets[i].add(transition.getTarget(forward))) { transitionLists[i].add(transition); if (isPrioritySensitive() && ((BasicState) transition.getTarget(forward)).hasTransitionToUnAnchoredFinalState(forward)) { leadsToFinalState.set(i); } } } /** * Merges transitions calculated by {@link #calcDisjointTransitions(CompilationBuffer)} if their * target state set is equal and * {@link #canMerge(TransitionBuilder, TransitionBuilder)} returns {@code true}. */ @SuppressWarnings("unchecked") private TB[] mergeSameTargets(CompilationBuffer compilationBuffer) { ObjectArrayBuffer resultBuffer1 = compilationBuffer.getObjectBuffer1(); resultBuffer1.ensureCapacity(resultLength); for (int i = 0; i < resultLength; i++) { assert matcherBuilders[i].matchesSomething(); resultBuffer1.add(createTransitionBuilder(transitionLists[i].toArray(createTransitionArray(transitionLists[i].length())), targetStateSets[i], matcherBuilders[i])); } if (isPrioritySensitive() && leadsToFinalState.isEmpty()) { // no transitions were pruned, so no equal transition sets are possible return resultBuffer1.toArray(createResultArray(resultBuffer1.length())); } resultBuffer1.sort((TB o1, TB o2) -> { TransitionSet t1 = o1.getTransitionSet(); TransitionSet t2 = o2.getTransitionSet(); int cmp = t1.size() - t2.size(); if (cmp != 0) { return cmp; } if (isPrioritySensitive()) { // Transition sets are equal iff they lead to the same target states in the same // order. for (int i = 0; i < t1.size(); i++) { cmp = t1.getTransition(i).getTarget(forward).getId() - t2.getTransition(i).getTarget(forward).getId(); if (cmp != 0) { return cmp; } } return cmp; } else { // Transition sets are equal iff they lead to the same set of target states. // Here, we abuse the fact that our state set iterators yield states ordered by id. 
Iterator i1 = t1.getTargetStateSet().iterator(); Iterator i2 = t2.getTargetStateSet().iterator(); while (i1.hasNext()) { assert i2.hasNext(); cmp = i1.next().getId() - i2.next().getId(); if (cmp != 0) { return cmp; } } return cmp; } }); ObjectArrayBuffer resultBuffer2 = compilationBuffer.getObjectBuffer2(); TB last = null; for (TB tb : resultBuffer1) { if (last != null && canMerge(last, tb)) { last.setMatcherBuilder(last.getCodePointSet().union(tb.getCodePointSet(), compilationBuffer)); } else { resultBuffer2.add(tb); last = tb; } } return resultBuffer2.toArray(createResultArray(resultBuffer2.length())); } protected abstract TB createTransitionBuilder(T[] transitions, StateSet targetStateSet, CodePointSet matcherBuilder); /** * Returns {@code true} if two DFA transitions are allowed to be merged into one. */ protected abstract boolean canMerge(TB a, TB b); protected abstract T[] createTransitionArray(int size); /** * Returns an array suitable for holding the result of {@link #run(CompilationBuffer)}. */ protected abstract TB[] createResultArray(int size); }