dk.brics.automaton.StringUnionOperations Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of automaton Show documentation
Show all versions of automaton Show documentation
This package contains a full DFA/NFA implementation with Unicode
alphabet and support for all standard regular expression operations.
package dk.brics.automaton;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.IdentityHashMap;
/**
* Operations for building minimal deterministic automata from sets of strings.
* The algorithm requires sorted input data, but is very fast (nearly linear with the input size).
*
* @author Dawid Weiss
*/
final public class StringUnionOperations {
/**
 * Lexicographic order of input sequences.
 *
 * <p>Sequences are compared {@code char} by {@code char}; the first differing
 * position decides the order. If one sequence is a prefix of the other, the
 * shorter one sorts first.
 */
public final static Comparator<CharSequence> LEXICOGRAPHIC_ORDER = new Comparator<CharSequence>() {
    public int compare(CharSequence s1, CharSequence s2) {
        final int lens1 = s1.length();
        final int lens2 = s2.length();
        // Only the common prefix can be compared position by position.
        final int common = Math.min(lens1, lens2);
        for (int i = 0; i < common; i++) {
            final char c1 = s1.charAt(i);
            final char c2 = s2.charAt(i);
            if (c1 != c2) {
                // char values are widened to int, so the subtraction cannot overflow.
                return c1 - c2;
            }
        }
        // Equal on the common prefix: the shorter sequence is the smaller one.
        return lens1 - lens2;
    }
};
/**
 * State with {@code char} labels on transitions.
 */
final static class State {
    /** An empty set of labels. */
    private final static char[] NO_LABELS = new char[0];

    /** An empty set of states. */
    private final static State[] NO_STATES = new State[0];

    /**
     * Labels of outgoing transitions. Indexed identically to {@link #states}.
     * Labels must be sorted lexicographically.
     */
    char[] labels = NO_LABELS;

    /**
     * States reachable from outgoing transitions. Indexed identically to
     * {@link #labels}.
     */
    State[] states = NO_STATES;

    /**
     * {@code true} if this state corresponds to the end of at least one
     * input sequence.
     */
    boolean is_final;

    /**
     * Returns the target state of a transition leaving this state and labeled
     * with {@code label}. If no such transition exists, returns {@code null}.
     */
    public State getState(char label) {
        // Binary search is valid because labels are kept sorted.
        final int index = Arrays.binarySearch(labels, label);
        return index >= 0 ? states[index] : null;
    }

    /**
     * Returns an array of outgoing transition labels. The array is sorted in
     * lexicographic order and indexes correspond to states returned from
     * {@link #getStates()}. The internal array is returned without copying.
     */
    public char[] getTransitionLabels() {
        return this.labels;
    }

    /**
     * Returns an array of outgoing transitions from this state. The returned
     * array is the internal one and must not be changed.
     */
    public State[] getStates() {
        return this.states;
    }

    /**
     * Two states are equal if:
     * <ul>
     * <li>they agree on being final or not,</li>
     * <li>they have an identical number of outgoing transitions, labeled with
     * the same labels,</li>
     * <li>corresponding outgoing transitions lead to the same states (compared
     * by reference).</li>
     * </ul>
     * Any object that is not a {@code State} (including {@code null}) is never
     * equal to a state.
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        // Guard the cast: equals must return false, not throw, for null or
        // objects of another type.
        if (!(obj instanceof State)) {
            return false;
        }
        final State other = (State) obj;
        return is_final == other.is_final
            && Arrays.equals(this.labels, other.labels)
            && referenceEquals(this.states, other.states);
    }

    /**
     * Return {@code true} if this state has any children (outgoing
     * transitions).
     */
    public boolean hasChildren() {
        return labels.length > 0;
    }

    /**
     * Is this state a final state in the automaton?
     */
    public boolean isFinal() {
        return is_final;
    }

    /**
     * Compute the hash code of the current status of this state. The value
     * depends on the mutable fields, so it changes when transitions are added
     * or the final flag is set.
     */
    @Override
    public int hashCode() {
        int hash = is_final ? 1 : 0;
        hash ^= hash * 31 + this.labels.length;
        for (char c : this.labels)
            hash ^= hash * 31 + c;

        /*
         * Hash the outgoing states by reference identity, matching the
         * reference comparison performed in equals().
         */
        for (State s : this.states) {
            hash ^= System.identityHashCode(s);
        }
        return hash;
    }

    /**
     * Create a new outgoing transition labeled {@code label} and return
     * the newly created target state for this transition. The transition is
     * appended after all existing ones.
     */
    State newState(char label) {
        assert Arrays.binarySearch(labels, label) < 0 : "State already has transition labeled: "
            + label;
        // Grow both parallel arrays by one slot and fill the last position.
        labels = copyOf(labels, labels.length + 1);
        states = copyOf(states, states.length + 1);
        labels[labels.length - 1] = label;
        return states[states.length - 1] = new State();
    }

    /**
     * Return the most recent transition's target state.
     */
    State lastChild() {
        assert hasChildren() : "No outgoing transitions.";
        return states[states.length - 1];
    }

    /**
     * Return the associated state if the most recent transition
     * is labeled with {@code label}, otherwise {@code null}.
     */
    State lastChild(char label) {
        final int index = labels.length - 1;
        State s = null;
        if (index >= 0 && labels[index] == label) {
            s = states[index];
        }
        // Sanity check: looking only at the last transition must agree with a
        // full lookup.
        assert s == getState(label);
        return s;
    }

    /**
     * Replace the last added outgoing transition's target state with the given
     * state.
     */
    void replaceLastChild(State state) {
        assert hasChildren() : "No outgoing transitions.";
        states[states.length - 1] = state;
    }

    /**
     * JDK1.5-replacement of {@link Arrays#copyOf(char[], int)}
     */
    private static char[] copyOf(char[] original, int newLength) {
        char[] copy = new char[newLength];
        System.arraycopy(original, 0, copy, 0, Math.min(original.length,
            newLength));
        return copy;
    }

    /**
     * JDK1.5-replacement of {@link Arrays#copyOf(Object[], int)}
     */
    public static State[] copyOf(State[] original, int newLength) {
        State[] copy = new State[newLength];
        System.arraycopy(original, 0, copy, 0, Math.min(original.length, newLength));
        return copy;
    }

    /**
     * Compare two arrays of objects for reference-equality, element by element.
     */
    private static boolean referenceEquals(Object[] a1, Object[] a2) {
        if (a1.length != a2.length)
            return false;
        for (int i = 0; i < a1.length; i++)
            if (a1[i] != a2[i])
                return false;
        return true;
    }
}
/**
 * "register" for state interning. Each registered state is stored as both key
 * and value, so a lookup with an equal state yields the canonical instance.
 * Set to {@code null} once the automaton has been completed.
 */
private HashMap<State, State> register = new HashMap<State, State>();

/**
 * Root automaton state.
 */
private State root = new State();

/**
 * Previous sequence added to the automaton in {@link #add(CharSequence)}.
 * Only maintained from within an {@code assert}, i.e. when assertions are
 * enabled.
 */
private StringBuilder previous;
/**
 * Add another character sequence to this automaton. The sequence must be
 * lexicographically larger or equal compared to any previous sequences
 * added to this automaton (the input must be sorted).
 *
 * <p>Non-emptiness and ordering of the input are verified with assertions
 * only, so they are checked only when assertions are enabled.
 *
 * @param current the sequence to add
 * @throws IllegalStateException if {@link #complete()} has already been called
 */
public void add(CharSequence current) {
    // Fail fast even with assertions disabled, mirroring complete(): adding to
    // a finished builder would otherwise dereference the cleared register or
    // mutate states of the automaton that has already been returned.
    if (register == null)
        throw new IllegalStateException("Automaton already built.");
    assert current.length() > 0 : "Input sequences must not be empty.";
    assert previous == null || LEXICOGRAPHIC_ORDER.compare(previous, current) <= 0 :
        "Input must be sorted: " + previous + " >= " + current;
    // Remember the sequence for the next ordering check (assert-only bookkeeping).
    assert setPrevious(current);

    // Descend in the automaton (find matching prefix). Because the input is
    // sorted, a shared prefix can only run along the most recent transitions.
    int pos = 0, max = current.length();
    State next, state = root;
    while (pos < max && (next = state.lastChild(current.charAt(pos))) != null) {
        state = next;
        pos++;
    }

    // The branch below the divergence point will not change any more, so it
    // can be interned before the new suffix is attached.
    if (state.hasChildren())
        replaceOrRegister(state);

    addSuffix(state, current, pos);
}
/**
 * Finish construction and hand out the root of the automaton. The last
 * branch hanging off the root is interned first; afterwards the register is
 * dropped, which marks this builder as completed.
 *
 * @return Root automaton state.
 * @throws IllegalStateException if the automaton has already been completed.
 */
public State complete() {
    if (register == null) {
        throw new IllegalStateException();
    }

    final State automatonRoot = root;
    if (automatonRoot.hasChildren()) {
        replaceOrRegister(automatonRoot);
    }

    register = null;
    return automatonRoot;
}
/**
 * Internal recursive traversal for conversion.
 *
 * <p>Creates a {@code dk.brics.automaton.State} for the given builder state,
 * copies its final flag and recursively converts all outgoing transitions.
 *
 * @param s builder state to convert
 * @param visited identity-based map from already converted builder states to
 *        their converted counterparts; updated by this method
 * @return the converted state for {@code s}
 */
private static dk.brics.automaton.State convert(State s,
        IdentityHashMap<State, dk.brics.automaton.State> visited) {
    dk.brics.automaton.State converted = visited.get(s);
    if (converted != null)
        return converted;

    converted = new dk.brics.automaton.State();
    converted.setAccept(s.is_final);

    // Record the mapping before recursing so that a state reached again
    // through another path reuses the same converted instance.
    visited.put(s, converted);

    // labels and states are parallel arrays: entry i describes one transition.
    final char[] labels = s.labels;
    final State[] targets = s.states;
    for (int i = 0; i < targets.length; i++) {
        converted.addTransition(new Transition(labels[i], convert(targets[i], visited)));
    }
    return converted;
}
/**
 * Build a minimal, deterministic automaton from a sorted list of strings.
 *
 * <p>Every sequence is passed to {@link #add(CharSequence)} in array order,
 * then the builder is completed and its states are converted.
 *
 * @param input the sequences to accept, in sorted order
 * @return the converted root state
 */
public static dk.brics.automaton.State build(CharSequence[] input) {
    final StringUnionOperations builder = new StringUnionOperations();
    for (CharSequence chs : input)
        builder.add(chs);

    // Explicit type arguments instead of a raw map (no diamond: the file
    // carries JDK 1.5 compatibility helpers).
    return convert(builder.complete(),
        new IdentityHashMap<State, dk.brics.automaton.State>());
}
/**
 * Store a copy of {@code current} in the internal {@code previous} buffer,
 * creating the buffer on first use and clearing it otherwise.
 *
 * @return always {@code true}, so that the call can be placed inside an
 *         {@code assert} statement.
 */
private boolean setPrevious(CharSequence current) {
    if (previous != null) {
        previous.setLength(0);
    } else {
        previous = new StringBuilder();
    }
    previous.append(current);
    return true;
}
/**
 * Intern the last child of {@code state}: descendants along the chain of
 * last children are handled first, then the child is either swapped for an
 * equal state already present in the register or added to the register
 * itself.
 */
private void replaceOrRegister(State state) {
    final State lastChild = state.lastChild();

    // Deeper states must be interned before their parent is looked up.
    if (lastChild.hasChildren()) {
        replaceOrRegister(lastChild);
    }

    final State interned = register.get(lastChild);
    if (interned == null) {
        register.put(lastChild, lastChild);
        return;
    }
    state.replaceLastChild(interned);
}
/**
 * Append the characters of {@code current} from {@code fromIndex}
 * (inclusive) to its end as a chain of new states below {@code state}, and
 * mark the end of the chain as final.
 */
private void addSuffix(State state, CharSequence current, int fromIndex) {
    final int end = current.length();
    State tail = state;
    int index = fromIndex;
    while (index < end) {
        tail = tail.newState(current.charAt(index));
        index++;
    }
    tail.is_final = true;
}
}