
com.oracle.truffle.regex.tregex.automaton.StateTransitionCanonicalizer Maven / Gradle / Ivy
/*
* Copyright (c) 2018, 2020, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* The Universal Permissive License (UPL), Version 1.0
*
* Subject to the condition set forth below, permission is hereby granted to any
* person obtaining a copy of this software, associated documentation and/or
* data (collectively the "Software"), free of charge and under any and all
* copyright rights in the Software, and any and all patent rights owned or
* freely licensable by each licensor hereunder covering either (i) the
* unmodified Software as contributed to or provided by such licensor, or (ii)
* the Larger Works (as defined below), to deal in both
*
* (a) the Software, and
*
* (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if
* one is included with the Software each a "Larger Work" to which the Software
* is contributed by such licensors),
*
* without restriction, including without limitation the rights to copy, create
* derivative works of, display, perform, and distribute the Software and make,
* use, sell, offer for sale, import, export, have made, and have sold the
* Software and the Larger Work(s), and to sublicense the foregoing rights on
* either these or other terms.
*
* This license is subject to the following condition:
*
* The above copyright notice and either this complete permission notice or at a
* minimum a reference to the UPL must be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
package com.oracle.truffle.regex.tregex.automaton;
import java.util.Arrays;
import java.util.Iterator;
import com.oracle.truffle.regex.charset.CodePointSet;
import com.oracle.truffle.regex.tregex.buffer.CompilationBuffer;
import com.oracle.truffle.regex.tregex.buffer.ObjectArrayBuffer;
import com.oracle.truffle.regex.util.CompilationFinalBitSet;
/**
* This class provides an algorithm for converting a list of NFA transitions into a set of DFA
* transitions.
*
* @param represents a DFA transition fragment. This type is used for both intermediate and
* final results.
*
* @see TransitionBuilder
* @see TransitionSet
*/
public abstract class StateTransitionCanonicalizer, S extends AbstractState, T extends AbstractTransition, TB extends TransitionBuilder> {
private final ObjectArrayBuffer argTransitions = new ObjectArrayBuffer<>();
private final ObjectArrayBuffer argCharSets = new ObjectArrayBuffer<>();
private static final int INITIAL_CAPACITY = 8;
@SuppressWarnings("unchecked") private ObjectArrayBuffer[] transitionLists = new ObjectArrayBuffer[INITIAL_CAPACITY];
@SuppressWarnings("unchecked") private StateSet[] targetStateSets = new StateSet[INITIAL_CAPACITY];
private CodePointSet[] matcherBuilders = new CodePointSet[INITIAL_CAPACITY];
private CompilationFinalBitSet leadsToFinalState = new CompilationFinalBitSet(INITIAL_CAPACITY);
private int resultLength = 0;
private final SI stateIndex;
private final boolean forward;
private final boolean prioritySensitive;
public StateTransitionCanonicalizer(SI stateIndex, boolean forward, boolean prioritySensitive) {
this.stateIndex = stateIndex;
this.forward = forward;
this.prioritySensitive = prioritySensitive;
}
/**
* If priority-sensitive mode, transition sets are pruned after transitions to final states.
* Also, target state sets are considered equal iff their order is equal as well.
*/
protected boolean isPrioritySensitive() {
return prioritySensitive;
}
/**
* Submits an argument to be processed by {@link #run(CompilationBuffer)}.
*/
public void addArgument(T transition, CodePointSet charSet) {
argTransitions.add(transition);
argCharSets.add(charSet);
}
/**
* Runs the NFA to DFA transition conversion algorithm on the NFA transitions given by previous
* calls to {@link #addArgument(AbstractTransition, CodePointSet)}. This algorithm has two
* phases:
*
* - Merge NFA transitions according to their expected character sets. The result of this
* phase is a list of {@link TransitionBuilder}s whose {@link CodePointSet}s have no more
* intersections.
* - Merge {@link TransitionBuilder}s generated by the first phase if their target
* state is equal and {@link #canMerge(TransitionBuilder, TransitionBuilder)} returns
* {@code true}.
*
*
* @return a set of transition builders representing the DFA transitions generated from the
* given NFA transitions.
*/
public TB[] run(CompilationBuffer compilationBuffer) {
calcDisjointTransitions(compilationBuffer);
TB[] result = mergeSameTargets(compilationBuffer);
resultLength = 0;
leadsToFinalState.clear();
argTransitions.clear();
argCharSets.clear();
return result;
}
/**
* Merges NFA transitions according to their expected character sets as returned
* {@link TransitionBuilder#getCodePointSet()}, in the following way:
*
* - The result of the algorithm is a list of transitions where no two elements have an
* intersection in their respective set of expected characters. We initially define the result
* as an empty list.
* - For every element e of the input list, we compare the expected character sets of
* e and every element in the result list. If an intersection with an element
* r of the result list is found, we distinguish the following two cases, where the
* character set of an element x is denoted as x.cs and the transition set of
* an element x is denoted as x.ts:
*
* - If e.cs contains r.cs, e.ts is merged into r.ts using
* TransitionSet#addAll(TransitionSet).
* - Otherwise, a new transition containing e.ts and r.ts and the
* intersection of e.cs and r.cs is added to the result list. This new
* transition is created using TransitionBuilder#createMerged(TransitionBuilder, CodePointSet).
* The intersection of e.cs and r.cs is removed from r.cs.
*
*
* - Every time an intersection is found, that intersection is removed from e.cs. If
* e.cs is not empty after e has been compared to every element of the result
* list, e is added to the result list.
* - The result list at all times fulfills the property that no two elements in the list
* intersect.
*
* This algorithm has an important property: every entry in the input list will be merged with
* the elements it intersects with in the order of the input list. This means that
* given an input list {@code [1, 2, 3]} where all elements intersect with each other, element
* {@code 2} will be merged with {@code 1} before {@code 3} is merged with {@code 1}. This
* property is crucial for generating priority-sensitive DFAs, since we track priorities of NFA
* transitions by their order!
*
* Example:
*
*
* input: [
* {transitionSet {1}, matcherBuilder [ab]},
* {transitionSet {2}, matcherBuilder [bc]}
* ]
* output: [
* {transitionSet {1}, matcherBuilder [a]},
* {transitionSet {1, 2}, matcherBuilder [b]},
* {transitionSet {2}, matcherBuilder [c]}
* ]
*
*/
@SuppressWarnings("unchecked")
private void calcDisjointTransitions(CompilationBuffer compilationBuffer) {
for (int i = 0; i < argTransitions.length(); i++) {
T argTransition = argTransitions.get(i);
CodePointSet argCharSet = argCharSets.get(i);
int currentResultLength = resultLength;
for (int j = 0; j < currentResultLength; j++) {
CodePointSet.IntersectAndSubtractResult result = matcherBuilders[j].intersectAndSubtract(argCharSet, compilationBuffer);
CodePointSet rSubtractedMatcher = result.subtractedA;
CodePointSet eSubtractedMatcher = result.subtractedB;
CodePointSet intersection = result.intersection;
if (intersection.matchesSomething()) {
if (rSubtractedMatcher.matchesNothing()) {
addTransitionTo(j, argTransition);
} else {
createSlot();
matcherBuilders[j] = rSubtractedMatcher;
matcherBuilders[resultLength] = intersection;
targetStateSets[resultLength] = targetStateSets[j].copy();
transitionLists[resultLength].addAll(transitionLists[j]);
if (isPrioritySensitive() && leadsToFinalState.get(j)) {
leadsToFinalState.set(resultLength);
}
addTransitionTo(resultLength, argTransition);
resultLength++;
}
argCharSet = eSubtractedMatcher;
if (eSubtractedMatcher.matchesNothing()) {
break;
}
}
}
if (argCharSet.matchesSomething()) {
createSlot();
targetStateSets[resultLength] = StateSet.create(stateIndex);
matcherBuilders[resultLength] = argCharSet;
addTransitionTo(resultLength, argTransition);
resultLength++;
}
}
}
private void createSlot() {
if (transitionLists.length <= resultLength) {
transitionLists = Arrays.copyOf(transitionLists, resultLength * 2);
targetStateSets = Arrays.copyOf(targetStateSets, resultLength * 2);
matcherBuilders = Arrays.copyOf(matcherBuilders, resultLength * 2);
}
if (transitionLists[resultLength] == null) {
transitionLists[resultLength] = new ObjectArrayBuffer<>();
}
transitionLists[resultLength].clear();
}
private void addTransitionTo(int i, T transition) {
if (isPrioritySensitive() && leadsToFinalState.get(i)) {
return;
}
if (targetStateSets[i].add(transition.getTarget(forward))) {
transitionLists[i].add(transition);
if (isPrioritySensitive() && ((BasicState, ?>) transition.getTarget(forward)).hasTransitionToUnAnchoredFinalState(forward)) {
leadsToFinalState.set(i);
}
}
}
/**
* Merges transitions calculated by {@link #calcDisjointTransitions(CompilationBuffer)} if their
* target state set is equal and
* {@link #canMerge(TransitionBuilder, TransitionBuilder)} returns {@code true}.
*/
@SuppressWarnings("unchecked")
private TB[] mergeSameTargets(CompilationBuffer compilationBuffer) {
ObjectArrayBuffer resultBuffer1 = compilationBuffer.getObjectBuffer1();
resultBuffer1.ensureCapacity(resultLength);
for (int i = 0; i < resultLength; i++) {
assert matcherBuilders[i].matchesSomething();
resultBuffer1.add(createTransitionBuilder(transitionLists[i].toArray(createTransitionArray(transitionLists[i].length())), targetStateSets[i], matcherBuilders[i]));
}
if (isPrioritySensitive() && leadsToFinalState.isEmpty()) {
// no transitions were pruned, so no equal transition sets are possible
return resultBuffer1.toArray(createResultArray(resultBuffer1.length()));
}
resultBuffer1.sort((TB o1, TB o2) -> {
TransitionSet t1 = o1.getTransitionSet();
TransitionSet t2 = o2.getTransitionSet();
int cmp = t1.size() - t2.size();
if (cmp != 0) {
return cmp;
}
if (isPrioritySensitive()) {
// Transition sets are equal iff they lead to the same target states in the same
// order.
for (int i = 0; i < t1.size(); i++) {
cmp = t1.getTransition(i).getTarget(forward).getId() - t2.getTransition(i).getTarget(forward).getId();
if (cmp != 0) {
return cmp;
}
}
return cmp;
} else {
// Transition sets are equal iff they lead to the same set of target states.
// Here, we abuse the fact that our state set iterators yield states ordered by id.
Iterator i1 = t1.getTargetStateSet().iterator();
Iterator i2 = t2.getTargetStateSet().iterator();
while (i1.hasNext()) {
assert i2.hasNext();
cmp = i1.next().getId() - i2.next().getId();
if (cmp != 0) {
return cmp;
}
}
return cmp;
}
});
ObjectArrayBuffer resultBuffer2 = compilationBuffer.getObjectBuffer2();
TB last = null;
for (TB tb : resultBuffer1) {
if (last != null && canMerge(last, tb)) {
last.setMatcherBuilder(last.getCodePointSet().union(tb.getCodePointSet(), compilationBuffer));
} else {
resultBuffer2.add(tb);
last = tb;
}
}
return resultBuffer2.toArray(createResultArray(resultBuffer2.length()));
}
protected abstract TB createTransitionBuilder(T[] transitions, StateSet targetStateSet, CodePointSet matcherBuilder);
/**
* Returns {@code true} if two DFA transitions are allowed to be merged into one.
*/
protected abstract boolean canMerge(TB a, TB b);
protected abstract T[] createTransitionArray(int size);
/**
* Returns an array suitable for holding the result of {@link #run(CompilationBuffer)}.
*/
protected abstract TB[] createResultArray(int size);
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy