All downloads are free. Search and download functionality uses the official Maven repository.

dk.brics.automaton.StringUnionOperations Maven / Gradle / Ivy

Go to download

This package contains a full DFA/NFA implementation with Unicode alphabet and support for all standard regular expression operations.

There is a newer version: 1.12-4
Show newest version
package dk.brics.automaton;

import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.IdentityHashMap;

/**
 * Operations for building minimal deterministic automata from sets of strings. 
 * The algorithm requires sorted input data, but is very fast (nearly linear with the input size).
 * 
 * @author Dawid Weiss
 */
final public class StringUnionOperations {

	/**
	 * Lexicographic order of input sequences.
	 */
	public final static Comparator LEXICOGRAPHIC_ORDER = new Comparator() {
		public int compare(CharSequence s1, CharSequence s2) {
			final int lens1 = s1.length();
			final int lens2 = s2.length();
			final int max = Math.min(lens1, lens2);

			for (int i = 0; i < max; i++) {
				final char c1 = s1.charAt(i);
				final char c2 = s2.charAt(i);
				if (c1 != c2)
					return c1 - c2;
			}
			return lens1 - lens2;
		}
	};

	/**
	 * State with char labels on transitions.
	 */
	final static class State {

		/** An empty set of labels. */
		private final static char[] NO_LABELS = new char[0];

		/** An empty set of states. */
		private final static State[] NO_STATES = new State[0];

		/**
		 * Labels of outgoing transitions. Indexed identically to {@link #states}.
		 * Labels must be sorted lexicographically.
		 */
		char[] labels = NO_LABELS;

		/**
		 * States reachable from outgoing transitions. Indexed identically to
		 * {@link #labels}.
		 */
		State[] states = NO_STATES;

		/**
		 * true if this state corresponds to the end of at least one
		 * input sequence.
		 */
		boolean is_final;

		/**
		 * Returns the target state of a transition leaving this state and labeled
		 * with label. If no such transition exists, returns
		 * null.
		 */
		public State getState(char label) {
			final int index = Arrays.binarySearch(labels, label);
			return index >= 0 ? states[index] : null; 
		}

		/**
		 * Returns an array of outgoing transition labels. The array is sorted in 
		 * lexicographic order and indexes correspond to states returned from 
		 * {@link #getStates()}.
		 */
		public char [] getTransitionLabels() {
			return this.labels;
		}

		/**
		 * Returns an array of outgoing transitions from this state. The returned
		 * array must not be changed.
		 */
		public State[] getStates() {
			return this.states;
		}

		/**
		 * Two states are equal if:
		 * 
    *
  • they have an identical number of outgoing transitions, labeled with * the same labels
  • *
  • corresponding outgoing transitions lead to the same states (to states * with an identical right-language). *
*/ @Override public boolean equals(Object obj) { final State other = (State) obj; return is_final == other.is_final && Arrays.equals(this.labels, other.labels) && referenceEquals(this.states, other.states); } /** * Return true if this state has any children (outgoing * transitions). */ public boolean hasChildren() { return labels.length > 0; } /** * Is this state a final state in the automaton? */ public boolean isFinal() { return is_final; } /** * Compute the hash code of the current status of this state. */ @Override public int hashCode() { int hash = is_final ? 1 : 0; hash ^= hash * 31 + this.labels.length; for (char c : this.labels) hash ^= hash * 31 + c; /* * Compare the right-language of this state using reference-identity of * outgoing states. This is possible because states are interned (stored * in registry) and traversed in post-order, so any outgoing transitions * are already interned. */ for (State s : this.states) { hash ^= System.identityHashCode(s); } return hash; } /** * Create a new outgoing transition labeled label and return * the newly created target state for this transition. */ State newState(char label) { assert Arrays.binarySearch(labels, label) < 0 : "State already has transition labeled: " + label; labels = copyOf(labels, labels.length + 1); states = copyOf(states, states.length + 1); labels[labels.length - 1] = label; return states[states.length - 1] = new State(); } /** * Return the most recent transitions's target state. */ State lastChild() { assert hasChildren() : "No outgoing transitions."; return states[states.length - 1]; } /** * Return the associated state if the most recent transition * is labeled with label. */ State lastChild(char label) { final int index = labels.length - 1; State s = null; if (index >= 0 && labels[index] == label) { s = states[index]; } assert s == getState(label); return s; } /** * Replace the last added outgoing transition's target state with the given * state. 
*/ void replaceLastChild(State state) { assert hasChildren() : "No outgoing transitions."; states[states.length - 1] = state; } /** * JDK1.5-replacement of {@link Arrays#copyOf(char[], int)} */ private static char[] copyOf(char[] original, int newLength) { char[] copy = new char[newLength]; System.arraycopy(original, 0, copy, 0, Math.min(original.length, newLength)); return copy; } /** * JDK1.5-replacement of {@link Arrays#copyOf(char[], int)} */ public static State[] copyOf(State[] original, int newLength) { State[] copy = new State[newLength]; System.arraycopy(original, 0, copy, 0, Math.min(original.length, newLength)); return copy; } /** * Compare two lists of objects for reference-equality. */ private static boolean referenceEquals(Object[] a1, Object[] a2) { if (a1.length != a2.length) return false; for (int i = 0; i < a1.length; i++) if (a1[i] != a2[i]) return false; return true; } } /** * "register" for state interning. */ private HashMap register = new HashMap(); /** * Root automaton state. */ private State root = new State(); /** * Previous sequence added to the automaton in {@link #add(CharSequence)}. */ private StringBuilder previous; /** * Add another character sequence to this automaton. The sequence must be * lexicographically larger or equal compared to any previous sequences * added to this automaton (the input must be sorted). */ public void add(CharSequence current) { assert register != null : "Automaton already built."; assert current.length() > 0 : "Input sequences must not be empty."; assert previous == null || LEXICOGRAPHIC_ORDER.compare(previous, current) <= 0 : "Input must be sorted: " + previous + " >= " + current; assert setPrevious(current); // Descend in the automaton (find matching prefix). 
int pos = 0, max = current.length(); State next, state = root; while (pos < max && (next = state.lastChild(current.charAt(pos))) != null) { state = next; pos++; } if (state.hasChildren()) replaceOrRegister(state); addSuffix(state, current, pos); } /** * Finalize the automaton and return the root state. No more strings can be * added to the builder after this call. * * @return Root automaton state. */ public State complete() { if (this.register == null) throw new IllegalStateException(); if (root.hasChildren()) replaceOrRegister(root); register = null; return root; } /** * Internal recursive traversal for conversion. */ private static dk.brics.automaton.State convert(State s, IdentityHashMap visited) { dk.brics.automaton.State converted = visited.get(s); if (converted != null) return converted; converted = new dk.brics.automaton.State(); converted.setAccept(s.is_final); visited.put(s, converted); int i = 0; char [] labels = s.labels; for (StringUnionOperations.State target : s.states) { converted.addTransition(new Transition(labels[i++], convert(target, visited))); } return converted; } /** * Build a minimal, deterministic automaton from a sorted list of strings. */ public static dk.brics.automaton.State build(CharSequence[] input) { final StringUnionOperations builder = new StringUnionOperations(); for (CharSequence chs : input) builder.add(chs); return convert(builder.complete(), new IdentityHashMap()); } /** * Copy current into an internal buffer. */ private boolean setPrevious(CharSequence current) { if (previous == null) previous = new StringBuilder(); previous.setLength(0); previous.append(current); return true; } /** * Replace last child of state with an already registered * state or register the last child state. 
*/ private void replaceOrRegister(State state) { final State child = state.lastChild(); if (child.hasChildren()) replaceOrRegister(child); final State registered = register.get(child); if (registered != null) { state.replaceLastChild(registered); } else { register.put(child, child); } } /** * Add a suffix of current starting at fromIndex * (inclusive) to state state. */ private void addSuffix(State state, CharSequence current, int fromIndex) { final int len = current.length(); for (int i = fromIndex; i < len; i++) { state = state.newState(current.charAt(i)); } state.is_final = true; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy