All downloads are free. The search and download functionality uses the official Maven repository.

org.apache.lucene.util.automaton.XOperations Maven / Gradle / Ivy

/*
 * dk.brics.automaton
 * 
 * Copyright (c) 2001-2009 Anders Moeller
 * All rights reserved.
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 * 
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

package org.apache.lucene.util.automaton;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.BitSet;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.XBytesRefBuilder;
import org.apache.lucene.util.XIntsRefBuilder;

/**
 * Automata operations.
 * 
 * @lucene.experimental
 */
final public class XOperations {
  /**
   * Default maximum number of states that {@link XOperations#determinize} should create.
   * Meant to be passed as the {@code maxDeterminizedStates} argument by callers
   * that have no better bound of their own.
   */
  public static final int DEFAULT_MAX_DETERMINIZED_STATES = 10000;

  /** Private so the class cannot be instantiated; its operations are exposed as static methods. */
  private XOperations() {}

  /**
   * Builds an automaton whose language is the language of {@code a1} followed
   * by the language of {@code a2}. This is a convenience overload that hands
   * both operands, in order, to {@link #concatenate(List)}.
   * <p>
   * Complexity: linear in total number of states.
   */
  public static XAutomaton concatenate(XAutomaton a1, XAutomaton a2) {
    final List<XAutomaton> operands = Arrays.asList(a1, a2);
    return concatenate(operands);
  }

  /**
   * Returns an automaton that accepts the concatenation of the languages of the
   * given automata.
   *

* Complexity: linear in total number of states. */ static public XAutomaton concatenate(List l) { XAutomaton result = new XAutomaton(); // First pass: create all states for(XAutomaton a : l) { if (a.getNumStates() == 0) { result.finishState(); return result; } int numStates = a.getNumStates(); for(int s=0;s * Complexity: linear in number of states. */ static public XAutomaton optional(XAutomaton a) { XAutomaton result = new XAutomaton(); result.createState(); result.setAccept(0, true); if (a.getNumStates() > 0) { result.copy(a); result.addEpsilon(0, 1); } result.finishState(); return result; } /** * Returns an automaton that accepts the Kleene star (zero or more * concatenated repetitions) of the language of the given automaton. Never * modifies the input automaton language. *

* Complexity: linear in number of states. */ static public XAutomaton repeat(XAutomaton a) { if (a.getNumStates() == 0) { // Repeating the empty automata will still only accept the empty automata. return a; } XAutomaton.Builder builder = new XAutomaton.Builder(); builder.createState(); builder.setAccept(0, true); builder.copy(a); XTransition t = new XTransition(); int count = a.initTransition(0, t); for(int i=0;imin or more concatenated * repetitions of the language of the given automaton. *

/**
   * Returns an automaton that accepts {@code min} or more concatenated
   * repetitions of the language of the given automaton.
   * <p>
   * The result is the concatenation of {@code min} references to {@code a}
   * followed by {@link #repeat(XAutomaton)} of {@code a}. When {@code min} is
   * zero the call is exactly {@code repeat(a)}; a negative {@code min} adds no
   * leading copies, so only the {@code repeat(a)} part is concatenated.
   * <p>
   * Complexity: linear in number of states and in min.
   *
   * @param a   automaton whose language is repeated
   * @param min minimum number of repetitions
   * @return automaton accepting at least {@code min} repetitions
   */
  public static XAutomaton repeat(XAutomaton a, int min) {
    if (min == 0) {
      return repeat(a);
    }
    final List<XAutomaton> parts = new ArrayList<>();
    // Mandatory part: min back-to-back occurrences of a.
    for (int i = 0; i < min; i++) {
      parts.add(a);
    }
    // Optional tail: any number of further occurrences.
    parts.add(repeat(a));
    return concatenate(parts);
  }

  /**
   * Returns an automaton that accepts between {@code min} and
   * {@code max} (including both) concatenated repetitions of the language
   * of the given automaton.
   *

* Complexity: linear in number of states and in min and * max. */ static public XAutomaton repeat(XAutomaton a, int min, int max) { if (min > max) { return XAutomata.makeEmpty(); } XAutomaton b; if (min == 0) { b = XAutomata.makeEmptyString(); } else if (min == 1) { b = new XAutomaton(); b.copy(a); } else { List as = new ArrayList<>(); for(int i=0;i prevAcceptStates = toSet(b, 0); for(int i=min;i toSet(XAutomaton a, int offset) { int numStates = a.getNumStates(); BitSet isAccept = a.getAcceptStates(); Set result = new HashSet(); int upto = 0; while (upto < numStates && (upto = isAccept.nextSetBit(upto)) != -1) { result.add(offset+upto); upto++; } return result; } /** * Returns a (deterministic) automaton that accepts the complement of the * language of the given automaton. *

* Complexity: linear in number of states if already deterministic and * exponential otherwise. * @param maxDeterminizedStates maximum number of states determinizing the * automaton can result in. Set higher to allow more complex queries and * lower to prevent memory exhaustion. */ static public XAutomaton complement(XAutomaton a, int maxDeterminizedStates) { a = totalize(determinize(a, maxDeterminizedStates)); int numStates = a.getNumStates(); for (int p=0;pa1 and the complement of the language of * a2. As a side-effect, the automata may be determinized, if not * already deterministic. *

/**
   * Returns an automaton that accepts the strings of {@code a1} that are not
   * accepted by {@code a2}, computed as the intersection of {@code a1} with
   * the complement of {@code a2}.
   * <p>
   * Complexity: quadratic in number of states if a2 already deterministic and
   * exponential in number of a2's states otherwise.
   *
   * @param maxDeterminizedStates forwarded unchanged to {@code complement}
   */
  public static XAutomaton minus(XAutomaton a1, XAutomaton a2, int maxDeterminizedStates) {
    // Nothing can remain when a1 accepts nothing or is subtracted from itself.
    final boolean nothingLeft = XOperations.isEmpty(a1) || a1 == a2;
    if (nothingLeft) {
      return XAutomata.makeEmpty();
    }
    if (!XOperations.isEmpty(a2)) {
      final XAutomaton notA2 = complement(a2, maxDeterminizedStates);
      return intersection(a1, notA2);
    }
    // Subtracting the empty language leaves a1 as it is.
    return a1;
  }

  /**
   * Returns an automaton that accepts the intersection of the languages of the
   * given automata. Never modifies the input automata languages.
   *

* Complexity: quadratic in number of states. */ static public XAutomaton intersection(XAutomaton a1, XAutomaton a2) { if (a1 == a2) { return a1; } if (a1.getNumStates() == 0) { return a1; } if (a2.getNumStates() == 0) { return a2; } XTransition[][] transitions1 = a1.getSortedTransitions(); XTransition[][] transitions2 = a2.getSortedTransitions(); XAutomaton c = new XAutomaton(); c.createState(); LinkedList worklist = new LinkedList<>(); HashMap newstates = new HashMap<>(); XStatePair p = new XStatePair(0, 0, 0); worklist.add(p); newstates.put(p, p); while (worklist.size() > 0) { p = worklist.removeFirst(); c.setAccept(p.s, a1.isAccept(p.s1) && a2.isAccept(p.s2)); XTransition[] t1 = transitions1[p.s1]; XTransition[] t2 = transitions2[p.s2]; for (int n1 = 0, b2 = 0; n1 < t1.length; n1++) { while (b2 < t2.length && t2[b2].max < t1[n1].min) b2++; for (int n2 = b2; n2 < t2.length && t1[n1].max >= t2[n2].min; n2++) if (t2[n2].max >= t1[n1].min) { XStatePair q = new XStatePair(t1[n1].dest, t2[n2].dest); XStatePair r = newstates.get(q); if (r == null) { q.s = c.createState(); worklist.add(q); newstates.put(q, q); r = q; } int min = t1[n1].min > t2[n2].min ? t1[n1].min : t2[n2].min; int max = t1[n1].max < t2[n2].max ? t1[n1].max : t2[n2].max; c.addTransition(p.s, r.s, min, max); } } } c.finishState(); return removeDeadStates(c); } /** Returns true if these two automata accept exactly the * same language. This is a costly computation! Note * also that a1 and a2 will be determinized as a side * effect. Both automata must be determinized and have * no dead states! */ public static boolean sameLanguage(XAutomaton a1, XAutomaton a2) { if (a1 == a2) { return true; } return subsetOf(a2, a1) && subsetOf(a1, a2); } // TODO: move to test-framework? /** Returns true if this automaton has any states that cannot * be reached from the initial state or cannot reach an accept state. * Cost is O(numTransitions+numStates). 
*/ public static boolean hasDeadStates(XAutomaton a) { BitSet liveStates = getLiveStates(a); int numLive = liveStates.cardinality(); int numStates = a.getNumStates(); assert numLive <= numStates: "numLive=" + numLive + " numStates=" + numStates + " " + liveStates; return numLive < numStates; } // TODO: move to test-framework? /** Returns true if there are dead states reachable from an initial state. */ public static boolean hasDeadStatesFromInitial(XAutomaton a) { BitSet reachableFromInitial = getLiveStatesFromInitial(a); BitSet reachableFromAccept = getLiveStatesToAccept(a); reachableFromInitial.andNot(reachableFromAccept); return reachableFromInitial.isEmpty() == false; } // TODO: move to test-framework? /** Returns true if there are dead states that reach an accept state. */ public static boolean hasDeadStatesToAccept(XAutomaton a) { BitSet reachableFromInitial = getLiveStatesFromInitial(a); BitSet reachableFromAccept = getLiveStatesToAccept(a); reachableFromAccept.andNot(reachableFromInitial); return reachableFromAccept.isEmpty() == false; } /** * Returns true if the language of a1 is a subset of the language * of a2. Both automata must be determinized and must have no dead * states. *

* Complexity: quadratic in number of states. */ public static boolean subsetOf(XAutomaton a1, XAutomaton a2) { if (a1.isDeterministic() == false) { throw new IllegalArgumentException("a1 must be deterministic"); } if (a2.isDeterministic() == false) { throw new IllegalArgumentException("a2 must be deterministic"); } assert hasDeadStatesFromInitial(a1) == false; assert hasDeadStatesFromInitial(a2) == false; if (a1.getNumStates() == 0) { // Empty language is alwyas a subset of any other language return true; } else if (a2.getNumStates() == 0) { return isEmpty(a1); } // TODO: cutover to iterators instead XTransition[][] transitions1 = a1.getSortedTransitions(); XTransition[][] transitions2 = a2.getSortedTransitions(); LinkedList worklist = new LinkedList<>(); HashSet visited = new HashSet<>(); XStatePair p = new XStatePair(0, 0); worklist.add(p); visited.add(p); while (worklist.size() > 0) { p = worklist.removeFirst(); if (a1.isAccept(p.s1) && a2.isAccept(p.s2) == false) { return false; } XTransition[] t1 = transitions1[p.s1]; XTransition[] t2 = transitions2[p.s2]; for (int n1 = 0, b2 = 0; n1 < t1.length; n1++) { while (b2 < t2.length && t2[b2].max < t1[n1].min) { b2++; } int min1 = t1[n1].min, max1 = t1[n1].max; for (int n2 = b2; n2 < t2.length && t1[n1].max >= t2[n2].min; n2++) { if (t2[n2].min > min1) { return false; } if (t2[n2].max < Character.MAX_CODE_POINT) { min1 = t2[n2].max + 1; } else { min1 = Character.MAX_CODE_POINT; max1 = Character.MIN_CODE_POINT; } XStatePair q = new XStatePair(t1[n1].dest, t2[n2].dest); if (!visited.contains(q)) { worklist.add(q); visited.add(q); } } if (min1 <= max1) { return false; } } } return true; } /** * Returns an automaton that accepts the union of the languages of the given * automata. *

* Complexity: linear in number of states.
   */
  public static XAutomaton union(XAutomaton a1, XAutomaton a2) {
    // Two-operand convenience form: wrap both operands and use the collection overload.
    final List<XAutomaton> operands = Arrays.asList(a1, a2);
    return union(operands);
  }

  /**
   * Returns an automaton that accepts the union of the languages of the given
   * automata.
   *

* Complexity: linear in number of states. */ public static XAutomaton union(Collection l) { XAutomaton result = new XAutomaton(); // Create initial state: result.createState(); // Copy over all automata for(XAutomaton a : l) { result.copy(a); } // Add epsilon transition from new initial state int stateOffset = 1; for(XAutomaton a : l) { if (a.getNumStates() == 0) { continue; } result.addEpsilon(0, stateOffset); stateOffset += a.getNumStates(); } result.finishState(); return removeDeadStates(result); } // Simple custom ArrayList private final static class TransitionList { // dest, min, max int[] transitions = new int[3]; int next; public void add(XTransition t) { if (transitions.length < next+3) { transitions = ArrayUtil.grow(transitions, next+3); } transitions[next] = t.dest; transitions[next+1] = t.min; transitions[next+2] = t.max; next += 3; } } // Holds all transitions that start on this int point, or // end at this point-1 private final static class PointTransitions implements Comparable { int point; final TransitionList ends = new TransitionList(); final TransitionList starts = new TransitionList(); @Override public int compareTo(PointTransitions other) { return point - other.point; } public void reset(int point) { this.point = point; ends.next = 0; starts.next = 0; } @Override public boolean equals(Object other) { return ((PointTransitions) other).point == point; } @Override public int hashCode() { return point; } } private final static class PointTransitionSet { int count; PointTransitions[] points = new PointTransitions[5]; private final static int HASHMAP_CUTOVER = 30; private final HashMap map = new HashMap<>(); private boolean useHash = false; private PointTransitions next(int point) { // 1st time we are seeing this point if (count == points.length) { final PointTransitions[] newArray = new PointTransitions[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; System.arraycopy(points, 0, newArray, 0, count); points = newArray; } 
PointTransitions points0 = points[count]; if (points0 == null) { points0 = points[count] = new PointTransitions(); } points0.reset(point); count++; return points0; } private PointTransitions find(int point) { if (useHash) { final Integer pi = point; PointTransitions p = map.get(pi); if (p == null) { p = next(point); map.put(pi, p); } return p; } else { for(int i=0;i 1) ArrayUtil.timSort(points, 0, count); } public void add(XTransition t) { find(t.min).starts.add(t); find(1+t.max).ends.add(t); } @Override public String toString() { StringBuilder s = new StringBuilder(); for(int i=0;i 0) { s.append(' '); } s.append(points[i].point).append(':').append(points[i].starts.next/3).append(',').append(points[i].ends.next/3); } return s.toString(); } } /** * Determinizes the given automaton. *

* Worst case complexity: exponential in number of states. * @param maxDeterminizedStates Maximum number of states created when * determinizing. Higher numbers allow this operation to consume more * memory but allow more complex automatons. Use * DEFAULT_MAX_DETERMINIZED_STATES as a decent default if you don't know * how many to allow. * @throws XTooComplexToDeterminizeException if determinizing a creates an * automaton with more than maxDeterminizedStates */ public static XAutomaton determinize(XAutomaton a, int maxDeterminizedStates) { if (a.isDeterministic()) { // Already determinized return a; } if (a.getNumStates() <= 1) { // Already determinized return a; } // subset construction XAutomaton.Builder b = new XAutomaton.Builder(); //System.out.println("DET:"); //a.writeDot("/l/la/lucene/core/detin.dot"); XSortedIntSet.FrozenIntSet initialset = new XSortedIntSet.FrozenIntSet(0, 0); // Create state 0: b.createState(); LinkedList worklist = new LinkedList<>(); Map newstate = new HashMap<>(); worklist.add(initialset); b.setAccept(0, a.isAccept(0)); newstate.put(initialset, 0); // like Set final PointTransitionSet points = new PointTransitionSet(); // like SortedMap final XSortedIntSet statesSet = new XSortedIntSet(5); XTransition t = new XTransition(); while (worklist.size() > 0) { XSortedIntSet.FrozenIntSet s = worklist.removeFirst(); //System.out.println("det: pop set=" + s); // Collate all outgoing transitions by min/1+max: for(int i=0;i 0) { assert lastPoint != -1; statesSet.computeHash(); Integer q = newstate.get(statesSet); if (q == null) { q = b.createState(); if (q >= maxDeterminizedStates) { throw new XTooComplexToDeterminizeException(a, maxDeterminizedStates); } final XSortedIntSet.FrozenIntSet p = statesSet.freeze(q); //System.out.println(" make new state=" + q + " -> " + p + " accCount=" + accCount); worklist.add(p); b.setAccept(q, accCount > 0); newstate.put(p, q); } else { assert (accCount > 0 ? 
true:false) == b.isAccept(q): "accCount=" + accCount + " vs existing accept=" + b.isAccept(q) + " states=" + statesSet; } // System.out.println(" add trans src=" + r + " dest=" + q + " min=" + lastPoint + " max=" + (point-1)); b.addTransition(r, q, lastPoint, point-1); } // process transitions that end on this point // (closes an overlapping interval) int[] transitions = points.points[i].ends.transitions; int limit = points.points[i].ends.next; for(int j=0;j workList = new LinkedList<>(); live.set(0); workList.add(0); XTransition t = new XTransition(); while (workList.isEmpty() == false) { int s = workList.removeFirst(); int count = a.initTransition(s, t); for(int i=0;i workList = new LinkedList<>(); BitSet live = new BitSet(numStates); BitSet acceptBits = a.getAcceptStates(); int s = 0; while (s < numStates && (s = acceptBits.nextSetBit(s)) != -1) { live.set(s); workList.add(s); s++; } while (workList.isEmpty() == false) { s = workList.removeFirst(); int count = a2.initTransition(s, t); for(int i=0;i visited = new HashSet<>(); int s = 0; boolean done; XTransition t = new XTransition(); do { done = true; visited.add(s); if (a.isAccept(s) == false && a.getNumTransitions(s) == 1) { a.getTransition(s, 0, t); if (t.min == t.max && !visited.contains(t.dest)) { b.appendCodePoint(t.min); s = t.dest; done = false; } } } while (!done); return b.toString(); } // TODO: this currently requites a determinized machine, // but it need not -- we can speed it up by walking the // NFA instead. it'd still be fail fast. /** * Returns the longest BytesRef that is a prefix of all accepted strings and * visits each state at most once. The automaton must be deterministic. 
* * @return common prefix */ public static BytesRef getCommonPrefixBytesRef(XAutomaton a) { XBytesRefBuilder builder = new XBytesRefBuilder(); HashSet visited = new HashSet<>(); int s = 0; boolean done; XTransition t = new XTransition(); do { done = true; visited.add(s); if (a.isAccept(s) == false && a.getNumTransitions(s) == 1) { a.getTransition(s, 0, t); if (t.min == t.max && !visited.contains(t.dest)) { builder.append((byte) t.min); s = t.dest; done = false; } } } while (!done); return builder.get(); } /** * Returns the longest BytesRef that is a suffix of all accepted strings. * Worst case complexity: exponential in number of states (this calls * determinize). * @param maxDeterminizedStates maximum number of states determinizing the * automaton can result in. Set higher to allow more complex queries and * lower to prevent memory exhaustion. * @return common suffix */ public static BytesRef getCommonSuffixBytesRef(XAutomaton a, int maxDeterminizedStates) { // reverse the language of the automaton, then reverse its common prefix. XAutomaton r = XOperations.determinize(reverse(a), maxDeterminizedStates); BytesRef ref = getCommonPrefixBytesRef(r); reverseBytes(ref); return ref; } private static void reverseBytes(BytesRef ref) { if (ref.length <= 1) return; int num = ref.length >> 1; for (int i = ref.offset; i < ( ref.offset + num ); i++) { byte b = ref.bytes[i]; ref.bytes[i] = ref.bytes[ref.offset * 2 + ref.length - i - 1]; ref.bytes[ref.offset * 2 + ref.length - i - 1] = b; } } /** Returns an automaton accepting the reverse language. */ public static XAutomaton reverse(XAutomaton a) { return reverse(a, null); } /** Reverses the automaton, returning the new initial states. 
*/ static XAutomaton reverse(XAutomaton a, Set initialStates) { if (XOperations.isEmpty(a)) { return new XAutomaton(); } int numStates = a.getNumStates(); // Build a new automaton with all edges reversed XAutomaton.Builder builder = new XAutomaton.Builder(); // Initial node; we'll add epsilon transitions in the end: builder.createState(); for(int s=0;s t.max) { // We've exhaused the current transition's labels; // move to next transitions: transition++; if (transition >= a.getNumTransitions(state)) { // We're done iterating transitions leaving this state return -1; } a.getTransition(state, transition, t); label = t.min; to = t.dest; } return label++; } } private static PathNode getNode(PathNode[] nodes, int index) { assert index < nodes.length; if (nodes[index] == null) { nodes[index] = new PathNode(); } return nodes[index]; } // TODO: this is a dangerous method ... Automaton could be // huge ... and it's better in general for caller to // enumerate & process in a single walk: /** Returns the set of accepted strings, up to at most * limit strings. If more than limit * strings are accepted, the first limit strings found are returned. If limit == -1, then * the limit is infinite. If the {@link XAutomaton} has * cycles then this method might throw {@code * IllegalArgumentException} but that is not guaranteed * when the limit is set. 
*/ public static Set getFiniteStrings(XAutomaton a, int limit) { Set results = new HashSet<>(); if (limit == -1 || limit > 0) { // OK } else { throw new IllegalArgumentException("limit must be -1 (which means no limit), or > 0; got: " + limit); } if (a.isAccept(0)) { // Special case the empty string, as usual: results.add(new IntsRef()); } if (a.getNumTransitions(0) > 0 && (limit == -1 || results.size() < limit)) { int numStates = a.getNumStates(); // Tracks which states are in the current path, for // cycle detection: BitSet pathStates = new BitSet(numStates); // Stack to hold our current state in the // recursion/iteration: PathNode[] nodes = new PathNode[4]; pathStates.set(0); PathNode root = getNode(nodes, 0); root.resetState(a, 0); XIntsRefBuilder string = new XIntsRefBuilder(); string.append(0); while (string.length() > 0) { PathNode node = nodes[string.length()-1]; // Get next label leaving the current node: int label = node.nextLabel(a); if (label != -1) { string.setIntAt(string.length()-1, label); if (a.isAccept(node.to)) { // This transition leads to an accept state, // so we save the current string: results.add(string.toIntsRef()); if (results.size() == limit) { break; } } if (a.getNumTransitions(node.to) != 0) { // Now recurse: the destination of this transition has // outgoing transitions: if (pathStates.get(node.to)) { throw new IllegalArgumentException("automaton has cycles"); } pathStates.set(node.to); // Push node onto stack: if (nodes.length == string.length()) { PathNode[] newNodes = new PathNode[ArrayUtil.oversize(nodes.length+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; System.arraycopy(nodes, 0, newNodes, 0, nodes.length); nodes = newNodes; } getNode(nodes, string.length()).resetState(a, node.to); string.setLength(string.length() + 1); string.grow(string.length()); } } else { // No more transitions leaving this state, // pop/return back to previous state: assert pathStates.get(node.state); pathStates.clear(node.state); 
string.setLength(string.length() - 1); } } } return results; } /** Returns a new automaton accepting the same language with added * transitions to a dead state so that from every state and every label * there is a transition. */ static XAutomaton totalize(XAutomaton a) { XAutomaton result = new XAutomaton(); int numStates = a.getNumStates(); for(int i=0;i maxi) { result.addTransition(i, deadState, maxi, t.min-1); } if (t.max + 1 > maxi) { maxi = t.max + 1; } } if (maxi <= Character.MAX_CODE_POINT) { result.addTransition(i, deadState, maxi, Character.MAX_CODE_POINT); } } result.finishState(); return result; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy