All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.util.automaton.BasicOperations Maven / Gradle / Ivy

There is a newer version: 1.9.8
Show newest version
/*
 * dk.brics.automaton
 * 
 * Copyright (c) 2001-2009 Anders Moeller
 * All rights reserved.
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 * 
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * COPIED FROM APACHE LUCENE 4.7.2
 *
 * Git URL: git@github.com:apache/lucene.git, tag: releases/lucene-solr/4.7.2, path: lucene/core/src/java
 *
 * (see https://issues.apache.org/jira/browse/OAK-10786 for details)
 */

package org.apache.lucene.util.automaton;

import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.RamUsageEstimator;

import java.util.ArrayList;
import java.util.BitSet;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
 * Basic automata operations.
 * 
 * @lucene.experimental
 */
final public class BasicOperations {
  
  // Private no-arg constructor: this final class is not meant to be instantiated.
  private BasicOperations() {}
  
  /**
   * Returns an automaton that accepts the concatenation of the languages of the
   * given automata.
   * 

* Complexity: linear in number of states. */ static public Automaton concatenate(Automaton a1, Automaton a2) { if (a1.isSingleton() && a2.isSingleton()) return BasicAutomata .makeString(a1.singleton + a2.singleton); if (isEmpty(a1) || isEmpty(a2)) return BasicAutomata.makeEmpty(); // adding epsilon transitions with the NFA concatenation algorithm // in this case always produces a resulting DFA, preventing expensive // redundant determinize() calls for this common case. boolean deterministic = a1.isSingleton() && a2.isDeterministic(); if (a1 == a2) { a1 = a1.cloneExpanded(); a2 = a2.cloneExpanded(); } else { a1 = a1.cloneExpandedIfRequired(); a2 = a2.cloneExpandedIfRequired(); } for (State s : a1.getAcceptStates()) { s.accept = false; s.addEpsilon(a2.initial); } a1.deterministic = deterministic; //a1.clearHashCode(); a1.clearNumberedStates(); a1.checkMinimizeAlways(); return a1; } /** * Returns an automaton that accepts the concatenation of the languages of the * given automata. *

* Complexity: linear in total number of states. */ static public Automaton concatenate(List l) { if (l.isEmpty()) return BasicAutomata.makeEmptyString(); boolean all_singleton = true; for (Automaton a : l) if (!a.isSingleton()) { all_singleton = false; break; } if (all_singleton) { StringBuilder b = new StringBuilder(); for (Automaton a : l) b.append(a.singleton); return BasicAutomata.makeString(b.toString()); } else { for (Automaton a : l) if (BasicOperations.isEmpty(a)) return BasicAutomata.makeEmpty(); Set ids = new HashSet(); for (Automaton a : l) ids.add(System.identityHashCode(a)); boolean has_aliases = ids.size() != l.size(); Automaton b = l.get(0); if (has_aliases) b = b.cloneExpanded(); else b = b.cloneExpandedIfRequired(); Set ac = b.getAcceptStates(); boolean first = true; for (Automaton a : l) if (first) first = false; else { if (a.isEmptyString()) continue; Automaton aa = a; if (has_aliases) aa = aa.cloneExpanded(); else aa = aa.cloneExpandedIfRequired(); Set ns = aa.getAcceptStates(); for (State s : ac) { s.accept = false; s.addEpsilon(aa.initial); if (s.accept) ns.add(s); } ac = ns; } b.deterministic = false; //b.clearHashCode(); b.clearNumberedStates(); b.checkMinimizeAlways(); return b; } } /** * Returns an automaton that accepts the union of the empty string and the * language of the given automaton. *

* Complexity: linear in number of states. */ static public Automaton optional(Automaton a) { a = a.cloneExpandedIfRequired(); State s = new State(); s.addEpsilon(a.initial); s.accept = true; a.initial = s; a.deterministic = false; //a.clearHashCode(); a.clearNumberedStates(); a.checkMinimizeAlways(); return a; } /** * Returns an automaton that accepts the Kleene star (zero or more * concatenated repetitions) of the language of the given automaton. Never * modifies the input automaton language. *

* Complexity: linear in number of states. */ static public Automaton repeat(Automaton a) { a = a.cloneExpanded(); State s = new State(); s.accept = true; s.addEpsilon(a.initial); for (State p : a.getAcceptStates()) p.addEpsilon(s); a.initial = s; a.deterministic = false; //a.clearHashCode(); a.clearNumberedStates(); a.checkMinimizeAlways(); return a; } /** * Returns an automaton that accepts min or more concatenated * repetitions of the language of the given automaton. *

* Complexity: linear in number of states and in min. */ static public Automaton repeat(Automaton a, int min) { if (min == 0) return repeat(a); List as = new ArrayList(); while (min-- > 0) as.add(a); as.add(repeat(a)); return concatenate(as); } /** * Returns an automaton that accepts between min and * max (including both) concatenated repetitions of the language * of the given automaton. *

* Complexity: linear in number of states and in min and * max. */ static public Automaton repeat(Automaton a, int min, int max) { if (min > max) return BasicAutomata.makeEmpty(); max -= min; a.expandSingleton(); Automaton b; if (min == 0) b = BasicAutomata.makeEmptyString(); else if (min == 1) b = a.clone(); else { List as = new ArrayList(); while (min-- > 0) as.add(a); b = concatenate(as); } if (max > 0) { Automaton d = a.clone(); while (--max > 0) { Automaton c = a.clone(); for (State p : c.getAcceptStates()) p.addEpsilon(d.initial); d = c; } for (State p : b.getAcceptStates()) p.addEpsilon(d.initial); b.deterministic = false; //b.clearHashCode(); b.clearNumberedStates(); b.checkMinimizeAlways(); } return b; } /** * Returns a (deterministic) automaton that accepts the complement of the * language of the given automaton. *

* Complexity: linear in number of states (if already deterministic). */ static public Automaton complement(Automaton a) { a = a.cloneExpandedIfRequired(); a.determinize(); a.totalize(); for (State p : a.getNumberedStates()) p.accept = !p.accept; a.removeDeadTransitions(); return a; } /** * Returns a (deterministic) automaton that accepts the intersection of the * language of a1 and the complement of the language of * a2. As a side-effect, the automata may be determinized, if not * already deterministic. *

* Complexity: quadratic in number of states (if already deterministic). */ static public Automaton minus(Automaton a1, Automaton a2) { if (BasicOperations.isEmpty(a1) || a1 == a2) return BasicAutomata .makeEmpty(); if (BasicOperations.isEmpty(a2)) return a1.cloneIfRequired(); if (a1.isSingleton()) { if (BasicOperations.run(a2, a1.singleton)) return BasicAutomata.makeEmpty(); else return a1.cloneIfRequired(); } return intersection(a1, a2.complement()); } /** * Returns an automaton that accepts the intersection of the languages of the * given automata. Never modifies the input automata languages. *

* Complexity: quadratic in number of states. */ static public Automaton intersection(Automaton a1, Automaton a2) { if (a1.isSingleton()) { if (BasicOperations.run(a2, a1.singleton)) return a1.cloneIfRequired(); else return BasicAutomata.makeEmpty(); } if (a2.isSingleton()) { if (BasicOperations.run(a1, a2.singleton)) return a2.cloneIfRequired(); else return BasicAutomata.makeEmpty(); } if (a1 == a2) return a1.cloneIfRequired(); Transition[][] transitions1 = a1.getSortedTransitions(); Transition[][] transitions2 = a2.getSortedTransitions(); Automaton c = new Automaton(); LinkedList worklist = new LinkedList(); HashMap newstates = new HashMap(); StatePair p = new StatePair(c.initial, a1.initial, a2.initial); worklist.add(p); newstates.put(p, p); while (worklist.size() > 0) { p = worklist.removeFirst(); p.s.accept = p.s1.accept && p.s2.accept; Transition[] t1 = transitions1[p.s1.number]; Transition[] t2 = transitions2[p.s2.number]; for (int n1 = 0, b2 = 0; n1 < t1.length; n1++) { while (b2 < t2.length && t2[b2].max < t1[n1].min) b2++; for (int n2 = b2; n2 < t2.length && t1[n1].max >= t2[n2].min; n2++) if (t2[n2].max >= t1[n1].min) { StatePair q = new StatePair(t1[n1].to, t2[n2].to); StatePair r = newstates.get(q); if (r == null) { q.s = new State(); worklist.add(q); newstates.put(q, q); r = q; } int min = t1[n1].min > t2[n2].min ? t1[n1].min : t2[n2].min; int max = t1[n1].max < t2[n2].max ? t1[n1].max : t2[n2].max; p.s.addTransition(new Transition(min, max, r.s)); } } } c.deterministic = a1.deterministic && a2.deterministic; c.removeDeadTransitions(); c.checkMinimizeAlways(); return c; } /** Returns true if these two automata accept exactly the * same language. This is a costly computation! Note * also that a1 and a2 will be determinized as a side * effect. 
*/ public static boolean sameLanguage(Automaton a1, Automaton a2) { if (a1 == a2) { return true; } if (a1.isSingleton() && a2.isSingleton()) { return a1.singleton.equals(a2.singleton); } else if (a1.isSingleton()) { // subsetOf is faster if the first automaton is a singleton return subsetOf(a1, a2) && subsetOf(a2, a1); } else { return subsetOf(a2, a1) && subsetOf(a1, a2); } } /** * Returns true if the language of a1 is a subset of the language * of a2. As a side-effect, a2 is determinized if * not already marked as deterministic. *

* Complexity: quadratic in number of states. */ public static boolean subsetOf(Automaton a1, Automaton a2) { if (a1 == a2) return true; if (a1.isSingleton()) { if (a2.isSingleton()) return a1.singleton.equals(a2.singleton); return BasicOperations.run(a2, a1.singleton); } a2.determinize(); Transition[][] transitions1 = a1.getSortedTransitions(); Transition[][] transitions2 = a2.getSortedTransitions(); LinkedList worklist = new LinkedList(); HashSet visited = new HashSet(); StatePair p = new StatePair(a1.initial, a2.initial); worklist.add(p); visited.add(p); while (worklist.size() > 0) { p = worklist.removeFirst(); if (p.s1.accept && !p.s2.accept) { return false; } Transition[] t1 = transitions1[p.s1.number]; Transition[] t2 = transitions2[p.s2.number]; for (int n1 = 0, b2 = 0; n1 < t1.length; n1++) { while (b2 < t2.length && t2[b2].max < t1[n1].min) b2++; int min1 = t1[n1].min, max1 = t1[n1].max; for (int n2 = b2; n2 < t2.length && t1[n1].max >= t2[n2].min; n2++) { if (t2[n2].min > min1) { return false; } if (t2[n2].max < Character.MAX_CODE_POINT) min1 = t2[n2].max + 1; else { min1 = Character.MAX_CODE_POINT; max1 = Character.MIN_CODE_POINT; } StatePair q = new StatePair(t1[n1].to, t2[n2].to); if (!visited.contains(q)) { worklist.add(q); visited.add(q); } } if (min1 <= max1) { return false; } } } return true; } /** * Returns an automaton that accepts the union of the languages of the given * automata. *

* Complexity: linear in number of states. */ public static Automaton union(Automaton a1, Automaton a2) { if ((a1.isSingleton() && a2.isSingleton() && a1.singleton .equals(a2.singleton)) || a1 == a2) return a1.cloneIfRequired(); if (a1 == a2) { a1 = a1.cloneExpanded(); a2 = a2.cloneExpanded(); } else { a1 = a1.cloneExpandedIfRequired(); a2 = a2.cloneExpandedIfRequired(); } State s = new State(); s.addEpsilon(a1.initial); s.addEpsilon(a2.initial); a1.initial = s; a1.deterministic = false; //a1.clearHashCode(); a1.clearNumberedStates(); a1.checkMinimizeAlways(); return a1; } /** * Returns an automaton that accepts the union of the languages of the given * automata. *

* Complexity: linear in number of states. */ public static Automaton union(Collection l) { Set ids = new HashSet(); for (Automaton a : l) ids.add(System.identityHashCode(a)); boolean has_aliases = ids.size() != l.size(); State s = new State(); for (Automaton b : l) { if (BasicOperations.isEmpty(b)) continue; Automaton bb = b; if (has_aliases) bb = bb.cloneExpanded(); else bb = bb.cloneExpandedIfRequired(); s.addEpsilon(bb.initial); } Automaton a = new Automaton(); a.initial = s; a.deterministic = false; //a.clearHashCode(); a.clearNumberedStates(); a.checkMinimizeAlways(); return a; } // Simple custom ArrayList private final static class TransitionList { Transition[] transitions = new Transition[2]; int count; public void add(Transition t) { if (transitions.length == count) { Transition[] newArray = new Transition[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; System.arraycopy(transitions, 0, newArray, 0, count); transitions = newArray; } transitions[count++] = t; } } // Holds all transitions that start on this int point, or // end at this point-1 private final static class PointTransitions implements Comparable { int point; final TransitionList ends = new TransitionList(); final TransitionList starts = new TransitionList(); @Override public int compareTo(PointTransitions other) { return point - other.point; } public void reset(int point) { this.point = point; ends.count = 0; starts.count = 0; } @Override public boolean equals(Object other) { return ((PointTransitions) other).point == point; } @Override public int hashCode() { return point; } } private final static class PointTransitionSet { int count; PointTransitions[] points = new PointTransitions[5]; private final static int HASHMAP_CUTOVER = 30; private final HashMap map = new HashMap(); private boolean useHash = false; private PointTransitions next(int point) { // 1st time we are seeing this point if (count == points.length) { final PointTransitions[] newArray = new 
PointTransitions[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; System.arraycopy(points, 0, newArray, 0, count); points = newArray; } PointTransitions points0 = points[count]; if (points0 == null) { points0 = points[count] = new PointTransitions(); } points0.reset(point); count++; return points0; } private PointTransitions find(int point) { if (useHash) { final Integer pi = point; PointTransitions p = map.get(pi); if (p == null) { p = next(point); map.put(pi, p); } return p; } else { for(int i=0;i 1) ArrayUtil.timSort(points, 0, count); } public void add(Transition t) { find(t.min).starts.add(t); find(1+t.max).ends.add(t); } @Override public String toString() { StringBuilder s = new StringBuilder(); for(int i=0;i 0) { s.append(' '); } s.append(points[i].point).append(':').append(points[i].starts.count).append(',').append(points[i].ends.count); } return s.toString(); } } /** * Determinizes the given automaton. *

* Worst case complexity: exponential in number of states. */ public static void determinize(Automaton a) { if (a.deterministic || a.isSingleton()) { return; } final State[] allStates = a.getNumberedStates(); // subset construction final boolean initAccept = a.initial.accept; final int initNumber = a.initial.number; a.initial = new State(); SortedIntSet.FrozenIntSet initialset = new SortedIntSet.FrozenIntSet(initNumber, a.initial); LinkedList worklist = new LinkedList(); Map newstate = new HashMap(); worklist.add(initialset); a.initial.accept = initAccept; newstate.put(initialset, a.initial); int newStateUpto = 0; State[] newStatesArray = new State[5]; newStatesArray[newStateUpto] = a.initial; a.initial.number = newStateUpto; newStateUpto++; // like Set final PointTransitionSet points = new PointTransitionSet(); // like SortedMap final SortedIntSet statesSet = new SortedIntSet(5); while (worklist.size() > 0) { SortedIntSet.FrozenIntSet s = worklist.removeFirst(); // Collate all outgoing transitions by min/1+max: for(int i=0;i 0) { assert lastPoint != -1; statesSet.computeHash(); State q = newstate.get(statesSet); if (q == null) { q = new State(); final SortedIntSet.FrozenIntSet p = statesSet.freeze(q); worklist.add(p); if (newStateUpto == newStatesArray.length) { final State[] newArray = new State[ArrayUtil.oversize(1+newStateUpto, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; System.arraycopy(newStatesArray, 0, newArray, 0, newStateUpto); newStatesArray = newArray; } newStatesArray[newStateUpto] = q; q.number = newStateUpto; newStateUpto++; q.accept = accCount > 0; newstate.put(p, q); } else { assert (accCount > 0 ? 
true:false) == q.accept: "accCount=" + accCount + " vs existing accept=" + q.accept + " states=" + statesSet; } r.addTransition(new Transition(lastPoint, point-1, q)); } // process transitions that end on this point // (closes an overlapping interval) Transition[] transitions = points.points[i].ends.transitions; int limit = points.points[i].ends.count; for(int j=0;j

Related Artifacts
Related Groups
-->


© 2015 - 2025 Weber Informatics LLC | Privacy Policy