org.apache.lucene.util.automaton.Operations Maven / Gradle / Ivy
/* * dk.brics.automaton * * Copyright (c) 2001-2009 Anders Moeller * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ package org.apache.lucene.util.automaton; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; import org.apache.lucene.util.IntsRef; import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.RamUsageEstimator; import java.util.ArrayList; import java.util.Arrays; import java.util.BitSet; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Set; /** * Automata operations. * * @lucene.experimental */ final public class Operations { /** * Default maximum number of states that {@link Operations#determinize} should create. */ public static final int DEFAULT_MAX_DETERMINIZED_STATES = 10000; private Operations() {} /** * Returns an automaton that accepts the concatenation of the languages of the * given automata. *
or more concatenated * repetitions of the language of the given automaton. ** Complexity: linear in total number of states. */ static public Automaton concatenate(Automaton a1, Automaton a2) { return concatenate(Arrays.asList(a1, a2)); } /** * Returns an automaton that accepts the concatenation of the languages of the * given automata. *
* Complexity: linear in total number of states. */ static public Automaton concatenate(List
l) { Automaton result = new Automaton(); // First pass: create all states for(Automaton a : l) { if (a.getNumStates() == 0) { result.finishState(); return result; } int numStates = a.getNumStates(); for(int s=0;s * Complexity: linear in number of states. */ static public Automaton optional(Automaton a) { Automaton result = new Automaton(); result.createState(); result.setAccept(0, true); if (a.getNumStates() > 0) { result.copy(a); result.addEpsilon(0, 1); } result.finishState(); return result; } /** * Returns an automaton that accepts the Kleene star (zero or more * concatenated repetitions) of the language of the given automaton. Never * modifies the input automaton language. * * Complexity: linear in number of states. */ static public Automaton repeat(Automaton a) { if (a.getNumStates() == 0) { // Repeating the empty automata will still only accept the empty automata. return a; } Automaton.Builder builder = new Automaton.Builder(); builder.createState(); builder.setAccept(0, true); builder.copy(a); Transition t = new Transition(); int count = a.initTransition(0, t); for(int i=0;i
min * Complexity: linear in number of states and in
min
. */
  static public Automaton repeat(Automaton a, int count) {
    if (count == 0) {
      // No mandatory repetitions: identical to the Kleene star overload.
      return repeat(a);
    }
    // Build the sequence a, a, ..., a (count times) followed by repeat(a),
    // then concatenate the whole list. The element type must be Automaton:
    // the list receives Automaton values and is handed to concatenate(List).
    List<Automaton> as = new ArrayList<>();
    while (count-- > 0) {
      as.add(a);
    }
    as.add(repeat(a));
    return concatenate(as);
  }

  /**
   * Returns an automaton that accepts between min
and * max
(including both) concatenated repetitions of the language * of the given automaton. ** Complexity: linear in number of states and in
min
and * max
. */ static public Automaton repeat(Automaton a, int min, int max) { if (min > max) { return Automata.makeEmpty(); } Automaton b; if (min == 0) { b = Automata.makeEmptyString(); } else if (min == 1) { b = new Automaton(); b.copy(a); } else { Listas = new ArrayList<>(); for(int i=0;i prevAcceptStates = toSet(b, 0); Automaton.Builder builder = new Automaton.Builder(); builder.copy(b); for(int i=min;i toSet(Automaton a, int offset) { int numStates = a.getNumStates(); BitSet isAccept = a.getAcceptStates(); Set result = new HashSet (); int upto = 0; while (upto < numStates && (upto = isAccept.nextSetBit(upto)) != -1) { result.add(offset+upto); upto++; } return result; } /** * Returns a (deterministic) automaton that accepts the complement of the * language of the given automaton. * * Complexity: linear in number of states if already deterministic and * exponential otherwise. * @param maxDeterminizedStates maximum number of states determinizing the * automaton can result in. Set higher to allow more complex queries and * lower to prevent memory exhaustion. */ static public Automaton complement(Automaton a, int maxDeterminizedStates) { a = totalize(determinize(a, maxDeterminizedStates)); int numStates = a.getNumStates(); for (int p=0;p
a1 and the complement of the language of * a2
. As a side-effect, the automata may be determinized, if not * already deterministic. ** Complexity: quadratic in number of states if a2 already deterministic and * exponential in number of a2's states otherwise. */ static public Automaton minus(Automaton a1, Automaton a2, int maxDeterminizedStates) { if (Operations.isEmpty(a1) || a1 == a2) { return Automata.makeEmpty(); } if (Operations.isEmpty(a2)) { return a1; } return intersection(a1, complement(a2, maxDeterminizedStates)); } /** * Returns an automaton that accepts the intersection of the languages of the * given automata. Never modifies the input automata languages. *
* Complexity: quadratic in number of states. */ static public Automaton intersection(Automaton a1, Automaton a2) { if (a1 == a2) { return a1; } if (a1.getNumStates() == 0) { return a1; } if (a2.getNumStates() == 0) { return a2; } Transition[][] transitions1 = a1.getSortedTransitions(); Transition[][] transitions2 = a2.getSortedTransitions(); Automaton c = new Automaton(); c.createState(); LinkedList
worklist = new LinkedList<>(); HashMap newstates = new HashMap<>(); StatePair p = new StatePair(0, 0, 0); worklist.add(p); newstates.put(p, p); while (worklist.size() > 0) { p = worklist.removeFirst(); c.setAccept(p.s, a1.isAccept(p.s1) && a2.isAccept(p.s2)); Transition[] t1 = transitions1[p.s1]; Transition[] t2 = transitions2[p.s2]; for (int n1 = 0, b2 = 0; n1 < t1.length; n1++) { while (b2 < t2.length && t2[b2].max < t1[n1].min) b2++; for (int n2 = b2; n2 < t2.length && t1[n1].max >= t2[n2].min; n2++) if (t2[n2].max >= t1[n1].min) { StatePair q = new StatePair(t1[n1].dest, t2[n2].dest); StatePair r = newstates.get(q); if (r == null) { q.s = c.createState(); worklist.add(q); newstates.put(q, q); r = q; } int min = t1[n1].min > t2[n2].min ? t1[n1].min : t2[n2].min; int max = t1[n1].max < t2[n2].max ? t1[n1].max : t2[n2].max; c.addTransition(p.s, r.s, min, max); } } } c.finishState(); return removeDeadStates(c); } /** Returns true if these two automata accept exactly the * same language. This is a costly computation! Note * also that a1 and a2 will be determinized as a side * effect. Both automata must be determinized and have * no dead states! */ public static boolean sameLanguage(Automaton a1, Automaton a2) { if (a1 == a2) { return true; } return subsetOf(a2, a1) && subsetOf(a1, a2); } // TODO: move to test-framework? /** Returns true if this automaton has any states that cannot * be reached from the initial state or cannot reach an accept state. * Cost is O(numTransitions+numStates). */ public static boolean hasDeadStates(Automaton a) { BitSet liveStates = getLiveStates(a); int numLive = liveStates.cardinality(); int numStates = a.getNumStates(); assert numLive <= numStates: "numLive=" + numLive + " numStates=" + numStates + " " + liveStates; return numLive < numStates; } // TODO: move to test-framework? /** Returns true if there are dead states reachable from an initial state. 
*/ public static boolean hasDeadStatesFromInitial(Automaton a) { BitSet reachableFromInitial = getLiveStatesFromInitial(a); BitSet reachableFromAccept = getLiveStatesToAccept(a); reachableFromInitial.andNot(reachableFromAccept); return reachableFromInitial.isEmpty() == false; } // TODO: move to test-framework? /** Returns true if there are dead states that reach an accept state. */ public static boolean hasDeadStatesToAccept(Automaton a) { BitSet reachableFromInitial = getLiveStatesFromInitial(a); BitSet reachableFromAccept = getLiveStatesToAccept(a); reachableFromAccept.andNot(reachableFromInitial); return reachableFromAccept.isEmpty() == false; } /** * Returns true if the language of a1
is a subset of the language * of a2
. Both automata must be determinized and must have no dead * states. ** Complexity: quadratic in number of states. */ public static boolean subsetOf(Automaton a1, Automaton a2) { if (a1.isDeterministic() == false) { throw new IllegalArgumentException("a1 must be deterministic"); } if (a2.isDeterministic() == false) { throw new IllegalArgumentException("a2 must be deterministic"); } assert hasDeadStatesFromInitial(a1) == false; assert hasDeadStatesFromInitial(a2) == false; if (a1.getNumStates() == 0) { // Empty language is alwyas a subset of any other language return true; } else if (a2.getNumStates() == 0) { return isEmpty(a1); } // TODO: cutover to iterators instead Transition[][] transitions1 = a1.getSortedTransitions(); Transition[][] transitions2 = a2.getSortedTransitions(); LinkedList
worklist = new LinkedList<>(); HashSet visited = new HashSet<>(); StatePair p = new StatePair(0, 0); worklist.add(p); visited.add(p); while (worklist.size() > 0) { p = worklist.removeFirst(); if (a1.isAccept(p.s1) && a2.isAccept(p.s2) == false) { return false; } Transition[] t1 = transitions1[p.s1]; Transition[] t2 = transitions2[p.s2]; for (int n1 = 0, b2 = 0; n1 < t1.length; n1++) { while (b2 < t2.length && t2[b2].max < t1[n1].min) { b2++; } int min1 = t1[n1].min, max1 = t1[n1].max; for (int n2 = b2; n2 < t2.length && t1[n1].max >= t2[n2].min; n2++) { if (t2[n2].min > min1) { return false; } if (t2[n2].max < Character.MAX_CODE_POINT) { min1 = t2[n2].max + 1; } else { min1 = Character.MAX_CODE_POINT; max1 = Character.MIN_CODE_POINT; } StatePair q = new StatePair(t1[n1].dest, t2[n2].dest); if (!visited.contains(q)) { worklist.add(q); visited.add(q); } } if (min1 <= max1) { return false; } } } return true; } /** * Returns an automaton that accepts the union of the languages of the given * automata. * * Complexity: linear in number of states. */ public static Automaton union(Automaton a1, Automaton a2) { return union(Arrays.asList(a1, a2)); } /** * Returns an automaton that accepts the union of the languages of the given * automata. *
* Complexity: linear in number of states. */ public static Automaton union(Collection
l) { Automaton result = new Automaton(); // Create initial state: result.createState(); // Copy over all automata for(Automaton a : l) { result.copy(a); } // Add epsilon transition from new initial state int stateOffset = 1; for(Automaton a : l) { if (a.getNumStates() == 0) { continue; } result.addEpsilon(0, stateOffset); stateOffset += a.getNumStates(); } result.finishState(); return removeDeadStates(result); } // Simple custom ArrayList private final static class TransitionList { // dest, min, max int[] transitions = new int[3]; int next; public void add(Transition t) { if (transitions.length < next+3) { transitions = ArrayUtil.grow(transitions, next+3); } transitions[next] = t.dest; transitions[next+1] = t.min; transitions[next+2] = t.max; next += 3; } } // Holds all transitions that start on this int point, or // end at this point-1 private final static class PointTransitions implements Comparable { int point; final TransitionList ends = new TransitionList(); final TransitionList starts = new TransitionList(); @Override public int compareTo(PointTransitions other) { return point - other.point; } public void reset(int point) { this.point = point; ends.next = 0; starts.next = 0; } @Override public boolean equals(Object other) { return ((PointTransitions) other).point == point; } @Override public int hashCode() { return point; } } private final static class PointTransitionSet { int count; PointTransitions[] points = new PointTransitions[5]; private final static int HASHMAP_CUTOVER = 30; private final HashMap map = new HashMap<>(); private boolean useHash = false; private PointTransitions next(int point) { // 1st time we are seeing this point if (count == points.length) { final PointTransitions[] newArray = new PointTransitions[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; System.arraycopy(points, 0, newArray, 0, count); points = newArray; } PointTransitions points0 = points[count]; if (points0 == null) { points0 = points[count] = new 
PointTransitions(); } points0.reset(point); count++; return points0; } private PointTransitions find(int point) { if (useHash) { final Integer pi = point; PointTransitions p = map.get(pi); if (p == null) { p = next(point); map.put(pi, p); } return p; } else { for(int i=0;i 1) ArrayUtil.timSort(points, 0, count); } public void add(Transition t) { find(t.min).starts.add(t); find(1+t.max).ends.add(t); } @Override public String toString() { StringBuilder s = new StringBuilder(); for(int i=0;i 0) { s.append(' '); } s.append(points[i].point).append(':').append(points[i].starts.next/3).append(',').append(points[i].ends.next/3); } return s.toString(); } } /** * Determinizes the given automaton. * * Worst case complexity: exponential in number of states. * @param maxDeterminizedStates Maximum number of states created when * determinizing. Higher numbers allow this operation to consume more * memory but allow more complex automatons. Use * DEFAULT_MAX_DETERMINIZED_STATES as a decent default if you don't know * how many to allow. * @throws TooComplexToDeterminizeException if determinizing a creates an * automaton with more than maxDeterminizedStates */ public static Automaton determinize(Automaton a, int maxDeterminizedStates) { if (a.isDeterministic()) { // Already determinized return a; } if (a.getNumStates() <= 1) { // Already determinized return a; } // subset construction Automaton.Builder b = new Automaton.Builder(); //System.out.println("DET:"); //a.writeDot("/l/la/lucene/core/detin.dot"); SortedIntSet.FrozenIntSet initialset = new SortedIntSet.FrozenIntSet(0, 0); // Create state 0: b.createState(); LinkedList
worklist = new LinkedList<>(); Map newstate = new HashMap<>(); worklist.add(initialset); b.setAccept(0, a.isAccept(0)); newstate.put(initialset, 0); // like Set final PointTransitionSet points = new PointTransitionSet(); // like SortedMap final SortedIntSet statesSet = new SortedIntSet(5); Transition t = new Transition(); while (worklist.size() > 0) { SortedIntSet.FrozenIntSet s = worklist.removeFirst(); //System.out.println("det: pop set=" + s); // Collate all outgoing transitions by min/1+max: for(int i=0;i 0) { assert lastPoint != -1; statesSet.computeHash(); Integer q = newstate.get(statesSet); if (q == null) { q = b.createState(); if (q >= maxDeterminizedStates) { throw new TooComplexToDeterminizeException(a, maxDeterminizedStates); } final SortedIntSet.FrozenIntSet p = statesSet.freeze(q); //System.out.println(" make new state=" + q + " -> " + p + " accCount=" + accCount); worklist.add(p); b.setAccept(q, accCount > 0); newstate.put(p, q); } else { assert (accCount > 0 ? true:false) == b.isAccept(q): "accCount=" + accCount + " vs existing accept=" + b.isAccept(q) + " states=" + statesSet; } // System.out.println(" add trans src=" + r + " dest=" + q + " min=" + lastPoint + " max=" + (point-1)); b.addTransition(r, q, lastPoint, point-1); } // process transitions that end on this point // (closes an overlapping interval) int[] transitions = points.points[i].ends.transitions; int limit = points.points[i].ends.next; for(int j=0;j workList = new LinkedList<>(); BitSet seen = new BitSet(a.getNumStates()); workList.add(0); seen.set(0); Transition t = new Transition(); while (workList.isEmpty() == false) { int state = workList.removeFirst(); if (a.isAccept(state)) { return false; } int count = a.initTransition(state, t); for(int i=0;i * Complexity: linear in the length of the string. * * Note: for full performance, use the {@link RunAutomaton} class. 
*/
  public static boolean run(Automaton a, String s) {
    assert a.isDeterministic();
    final int length = s.length();
    int current = 0;
    int pos = 0;
    // Walk the string one code point (not one char) at a time.
    while (pos < length) {
      final int codePoint = s.codePointAt(pos);
      current = a.step(current, codePoint);
      if (current == -1) {
        // step returned -1 for this code point: reject.
        return false;
      }
      pos += Character.charCount(codePoint);
    }
    return a.isAccept(current);
  }

  /**
   * Returns true if the given string (expressed as unicode codepoints) is accepted by the automaton. The input must be deterministic.
   *
* Complexity: linear in the length of the string. *
* Note: for full performance, use the {@link RunAutomaton} class. */ public static boolean run(Automaton a, IntsRef s) { assert a.isDeterministic(); int state = 0; for (int i=0;i
workList = new LinkedList<>(); live.set(0); workList.add(0); Transition t = new Transition(); while (workList.isEmpty() == false) { int s = workList.removeFirst(); int count = a.initTransition(s, t); for(int i=0;i workList = new LinkedList<>(); BitSet live = new BitSet(numStates); BitSet acceptBits = a.getAcceptStates(); int s = 0; while (s < numStates && (s = acceptBits.nextSetBit(s)) != -1) { live.set(s); workList.add(s); s++; } while (workList.isEmpty() == false) { s = workList.removeFirst(); int count = a2.initTransition(s, t); for(int i=0;i visited = new HashSet<>(); int s = 0; boolean done; Transition t = new Transition(); do { done = true; visited.add(s); if (a.isAccept(s) == false && a.getNumTransitions(s) == 1) { a.getTransition(s, 0, t); if (t.min == t.max && !visited.contains(t.dest)) { b.appendCodePoint(t.min); s = t.dest; done = false; } } } while (!done); return b.toString(); } // TODO: this currently requites a determinized machine, // but it need not -- we can speed it up by walking the // NFA instead. it'd still be fail fast. /** * Returns the longest BytesRef that is a prefix of all accepted strings and * visits each state at most once. The automaton must be deterministic. * * @return common prefix, which can be an empty (length 0) BytesRef (never null) */ public static BytesRef getCommonPrefixBytesRef(Automaton a) { BytesRefBuilder builder = new BytesRefBuilder(); HashSet visited = new HashSet<>(); int s = 0; boolean done; Transition t = new Transition(); do { done = true; visited.add(s); if (a.isAccept(s) == false && a.getNumTransitions(s) == 1) { a.getTransition(s, 0, t); if (t.min == t.max && !visited.contains(t.dest)) { builder.append((byte) t.min); s = t.dest; done = false; } } } while (!done); return builder.get(); } /** If this automaton accepts a single input, return it. Else, return null. * The automaton must be deterministic. 
*/ public static IntsRef getSingleton(Automaton a) { if (a.isDeterministic() == false) { throw new IllegalArgumentException("input automaton must be deterministic"); } IntsRefBuilder builder = new IntsRefBuilder(); HashSet visited = new HashSet<>(); int s = 0; Transition t = new Transition(); while (true) { visited.add(s); if (a.isAccept(s) == false) { if (a.getNumTransitions(s) == 1) { a.getTransition(s, 0, t); if (t.min == t.max && !visited.contains(t.dest)) { builder.append(t.min); s = t.dest; continue; } } } else if (a.getNumTransitions(s) == 0) { return builder.get(); } // Automaton accepts more than one string: return null; } } /** * Returns the longest BytesRef that is a suffix of all accepted strings. * Worst case complexity: exponential in number of states (this calls * determinize). * @param maxDeterminizedStates maximum number of states determinizing the * automaton can result in. Set higher to allow more complex queries and * lower to prevent memory exhaustion. * @return common suffix, which can be an empty (length 0) BytesRef (never null) */ public static BytesRef getCommonSuffixBytesRef(Automaton a, int maxDeterminizedStates) { // reverse the language of the automaton, then reverse its common prefix. Automaton r = Operations.determinize(reverse(a), maxDeterminizedStates); BytesRef ref = getCommonPrefixBytesRef(r); reverseBytes(ref); return ref; } private static void reverseBytes(BytesRef ref) { if (ref.length <= 1) return; int num = ref.length >> 1; for (int i = ref.offset; i < ( ref.offset + num ); i++) { byte b = ref.bytes[i]; ref.bytes[i] = ref.bytes[ref.offset * 2 + ref.length - i - 1]; ref.bytes[ref.offset * 2 + ref.length - i - 1] = b; } } /** Returns an automaton accepting the reverse language. */ public static Automaton reverse(Automaton a) { return reverse(a, null); } /** Reverses the automaton, returning the new initial states. 
*/ static Automaton reverse(Automaton a, Set initialStates) { if (Operations.isEmpty(a)) { return new Automaton(); } int numStates = a.getNumStates(); // Build a new automaton with all edges reversed Automaton.Builder builder = new Automaton.Builder(); // Initial node; we'll add epsilon transitions in the end: builder.createState(); for(int s=0;s maxi) { result.addTransition(i, deadState, maxi, t.min-1); } if (t.max + 1 > maxi) { maxi = t.max + 1; } } if (maxi <= Character.MAX_CODE_POINT) { result.addTransition(i, deadState, maxi, Character.MAX_CODE_POINT); } } result.finishState(); return result; } /** Returns the topological sort of all states reachable from * the initial state. Behavior is undefined if this * automaton has cycles. CPU cost is O(numTransitions), * and the implementation is recursive so an automaton * matching long strings may exhaust the java stack. */ public static int[] topoSortStates(Automaton a) { if (a.getNumStates() == 0) { return new int[0]; } int numStates = a.getNumStates(); int[] states = new int[numStates]; final BitSet visited = new BitSet(numStates); int upto = topoSortStatesRecurse(a, visited, states, 0, 0); if (upto < states.length) { // There were dead states int[] newStates = new int[upto]; System.arraycopy(states, 0, newStates, 0, upto); states = newStates; } // Reverse the order: for(int i=0;i