org.carrot2.text.suffixtree.SuffixTree Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of carrot2-mini Show documentation
Show all versions of carrot2-mini Show documentation
Carrot2 search results clustering framework. Minimal functional subset
(core algorithms and infrastructure, no document sources).
/*
* Carrot2 project.
*
* Copyright (C) 2002-2019, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/
package org.carrot2.text.suffixtree;
import com.carrotsearch.hppc.IntArrayList;
import com.carrotsearch.hppc.LongIntScatterMap;
import com.carrotsearch.hppc.cursors.LongIntCursor;
/**
* Builds a suffix tree (or generalized suffix tree) on a sequence of any integers (or
* objects that can be represented as unique integers). A direct implementation of Esko
* Ukkonen's algorithm, but optimized for Java to use primitive data types instead of
* objects (or boxed types).
*
* @see "E. Ukkonen, On-line construction of suffix trees, Algorithmica, 1995, volume 14, number 3, pages 249-260."
*/
public final class SuffixTree
{
/** A constant to represent invalid suffix link from a state. */
private final static int NO_SUFFIX_LINK = Integer.MIN_VALUE;
/**
* Leaf state marker in {@link #states}.
*/
private final static int LEAF_STATE = -1;
/**
* Marker for the state's last edge in {@link #transitions}.
*/
public final static int NO_EDGE = -1;
/**
* Root state's identifier (constant).
*/
private final static int ROOT_STATE = 1;
/**
* The input sequence of integers.
*/
final ISequence sequence;
/**
* Cached size of {@link #sequence}.
*/
private final int inputSize;
/**
* States array indexed by state number. Values in this array are:
*
* - at build time, the suffix pointer (state pointer),
* - after the tree is built, the first edge from a given state (edge pointer).
*
*/
private IntArrayList states = new IntArrayList();
/**
* A hash map of transitions (edges) between states in the suffix tree. The map is
* keyed by a combination of state (upper 32 bits) and symbol (lower 32 bits). The
* value is an index in the transitions array.
*/
private final LongIntScatterMap transitions_map = new LongIntScatterMap();
/**
* An array of all transitions.
*
* @see #addTransition(int, int, int)
* @see #reuseTransition(int, int, int, int, int)
*/
private final IntArrayList transitions = new IntArrayList();
/**
* Variables used during tree construction. See Ukkonen's algorithm for details.
*/
private int s, k, i;
private boolean end_point;
/**
* Head state and root state.
*/
private final int head, root;
/**
* Default transition from head to the root.
*/
private final int root_transition;
/**
* Number of integers per single transition.
*/
private final int slots_per_transition;
/**
* State callback or null
.
*
* @see IStateCallback
*/
private final IStateCallback newStateCallback;
/**
 * A callback invoked when new states are added to the tree. The two initial states
 * (head and root) are created without invoking this callback.
 */
public interface IStateCallback
{
/**
 * Invoked right after a new state has been allocated.
 *
 * @param state Identifier of the newly created state.
 * @param position The 1-based input position the tree builder associates with the
 *        new state: the index of the symbol being added for states created at the
 *        end of a new branch, or the end index of the upper part of an edge for
 *        states created by splitting that edge.
 */
void newState(int state, int position);
}
/**
 * Progress callback is invoked when iterating forward through the input sequence
 * elements.
 */
public interface IProgressCallback
{
/**
 * Invoked once per input element, before that element is added to the tree.
 *
 * @param pos Zero-based index of the input element about to be processed.
 */
void next(int pos);
}
/**
 * Callbacks received during a depth-first traversal of the tree.
 *
 * @see VisitorAdapter
 */
public interface IVisitor
{
    /**
     * Invoked before {@code state} is descended into.
     *
     * @param state Identifier of the state about to be traversed.
     * @return Returning {@code false} omits the subtree of {@code state}.
     *         {@link #post(int)} is not invoked for this state if skipped.
     */
    boolean pre(int state);

    /**
     * Invoked after {@code state} is fully traversed.
     *
     * @param state Identifier of the completed state.
     */
    void post(int state);

    /**
     * Invoked when an edge is visited.
     *
     * @param fromState State the edge leaves.
     * @param toState State the edge leads to.
     * @param startIndex Zero-based index of the first symbol of the edge's label.
     * @param endIndex Zero-based index of the last symbol of the edge's label
     *        (inclusive).
     * @return Returning {@code false} skips the traversal of {@code toState}.
     */
    boolean edge(int fromState, int toState, int startIndex, int endIndex);
}
/**
* Empty implementation recursively walking the entire suffix tree.
*/
public static class VisitorAdapter implements IVisitor
{
public boolean pre(int state)
{
return true;
}
public void post(int state)
{
}
public boolean edge(int fromState, int toState, int startIndex, int endIndex)
{
return true;
}
}
/**
 * Build a suffix tree for a given input sequence of symbols.
 *
 * @param sequence The input sequence to index.
 * @param newStateCallback Notified about every state created after the initial head
 *        and root states; may be {@code null}.
 * @param progressCallback Notified with the zero-based index of each input element
 *        before it is processed; may be {@code null}.
 */
public SuffixTree(ISequence sequence, IStateCallback newStateCallback,
    final IProgressCallback progressCallback)
{
    this.sequence = sequence;
    this.newStateCallback = newStateCallback;

    // Initial conditions: head is state 0, root is state 1 and links back to head.
    head = createState();
    root = createState();
    setSuffixLink(root, head);
    assert ROOT_STATE == root;

    // The very first transition (head -> root) also tells us the slot count.
    addTransition(root, 0, 0);
    slots_per_transition = transitions.size();
    root_transition = 0;

    // Build the tree, one input symbol at a time (indices are 1-based here).
    s = root;
    inputSize = sequence.size();
    i = 1;
    k = 1;
    while (i <= inputSize)
    {
        if (progressCallback != null)
        {
            progressCallback.next(i - 1);
        }
        update();
        canonize(s, k, i);
        i++;
    }

    // Suffix links are no longer needed: reuse the states array to hold the head of
    // each state's linked list of outgoing edges (speeds up iterators).
    final int stateCount = states.size();
    for (int state = 0; state < stateCount; state++)
    {
        states.set(state, LEAF_STATE);
    }

    for (LongIntCursor cursor : transitions_map)
    {
        final int edge = cursor.value;
        final int fromState = (int) (cursor.key >>> 32);
        final int previousFirst = states.get(fromState);
        if (previousFirst != LEAF_STATE)
        {
            // Chain the edge in front of the ones already registered for this state.
            transitions.set(edge + 3, previousFirst);
        }
        states.set(fromState, edge);
    }
}
/**
 * Update subroutine of the suffix tree building algorithm.
 *
 * Starting from the active point held in the fields {@code s} and {@code k}, adds a
 * transition labeled with input positions {@code i} to {@code inputSize} to every
 * state returned by {@link #testAndSplit(int, int)}, until that method reports an
 * end point. States processed on consecutive iterations are chained with suffix
 * links. The fields {@code s} and {@code k} are updated as a side effect.
 */
private final void update()
{
// Most recently processed state; the initial value (root) means "nothing to link yet".
int oldr = root;
while (true)
{
// Obtain the state at the active point, splitting an edge when necessary.
int r = testAndSplit(i - 1, i);
// Stop as soon as the symbol at position i is already present at the active point.
if (end_point) break;
// Branch off to a freshly created state; the label runs to the end of the input.
createTransition(r, i, inputSize, createNewState(i));
if (oldr != root) setSuffixLink(oldr, r);
oldr = r;
// Move on via the suffix link; canonize stores the new active point in s and k.
canonize(getSuffixLink(s), k, i - 1);
}
// Close the chain of suffix links at the state where the loop ended.
if (oldr != root) setSuffixLink(oldr, s);
}
/**
 * Test and split subroutine of the suffix tree building algorithm.
 *
 * Checks whether the symbol at 1-based position {@code ti} is already present right
 * after the active point given by the fields {@code s} and {@code k} and the
 * argument {@code p}. The outcome is stored in {@code end_point}. When the active
 * point lies inside an edge and the symbol is not present, the edge is split and
 * the new intermediate state is returned.
 *
 * @param p End index (1-based, inclusive) of the active point's pending label.
 * @param ti Index (1-based) of the symbol to test.
 * @return The active state {@code s}, or the state created by splitting an edge.
 */
private final int testAndSplit(int p, int ti)
{
    if (k > p)
    {
        // The active point is exactly at state s: just look for a matching edge.
        end_point = findTransition(s, ti) >= 0;
        return s;
    }

    // The active point lies inside the edge leaving s with the symbol at index k.
    final int edge = findTransition(s, k);
    assert edge >= 0;
    final int edgeStart = transitions.get(edge + 1);
    final int edgeEnd = transitions.get(edge + 2);
    final int edgeTarget = transitions.get(edge);

    // 1-based index of the last label symbol covered by the active point; used as a
    // 0-based index it addresses the symbol immediately following the active point.
    final int splitPoint = edgeStart + p - k;

    if (sequence.objectAt(ti - 1) == sequence.objectAt(splitPoint))
    {
        end_point = true;
        return s;
    }

    // Split: s -> splitState keeps the upper part of the label (reusing the old
    // slot), splitState -> old target gets the remainder.
    final int splitState = createNewState(splitPoint);
    reuseTransition(removeTransition(s, k), s, edgeStart, splitPoint, splitState);
    createTransition(splitState, splitPoint + 1, edgeEnd, edgeTarget);
    end_point = false;
    return splitState;
}
/**
 * Canonization subroutine of the suffix tree building algorithm.
 *
 * Starting at state {@code s}, follows transitions for as long as a transition's
 * whole label fits within positions {@code k} to {@code p}, advancing {@code k}
 * past each consumed label. The resulting pair is stored in the fields
 * {@code this.s} and {@code this.k}; nothing is returned.
 *
 * @param s State to start from (shadows the field of the same name).
 * @param k Start index (1-based) of the pending label (shadows the field).
 * @param p End index (1-based, inclusive) of the pending label.
 */
private void canonize(int s, int k, int p)
{
if (p >= k)
{
int g = findTransition(s, k);
int d;
// d is (end - start) of the transition's label, i.e. its length minus one.
while (g >= 0 && (d = transitions.get(g + 2) - transitions.get(g + 1)) <= p - k)
{
k = k + d + 1;
s = transitions.get(g);
// When k has moved past p, g is left unchanged; p - k is then negative, so the
// loop condition fails on the next check (label lengths are expected to be
// non-negative).
if (k <= p) g = findTransition(s, k);
}
}
this.s = s;
this.k = k;
}
/**
 * Records {@code toState} as the suffix link of {@code fromState} by storing it in
 * the states array.
 */
private void setSuffixLink(int fromState, int toState)
{
    this.states.set(fromState, toState);
}
/**
 * Returns the suffix link stored for state {@code s}. With assertions enabled,
 * fails if no suffix link has been assigned to the state.
 */
private int getSuffixLink(int s)
{
    final int target = states.get(s);
    assert target != NO_SUFFIX_LINK;
    return target;
}
/**
 * Add a new state to the tree, calling external callback if requested.
 *
 * @param position Position passed on to the callback together with the new state.
 * @return Identifier of the newly created state.
 */
private final int createNewState(int position)
{
    final int state = createState();
    if (newStateCallback == null)
    {
        return state;
    }

    newStateCallback.newState(state, position);
    return state;
}
/**
 * Adds a new state to the list of {@link #states}. The new state starts without a
 * suffix link.
 *
 * @return Identifier of the new state (its index in the states array).
 */
private final int createState()
{
    states.add(NO_SUFFIX_LINK);
    return states.size() - 1;
}
/**
 * Create a transition from state {@code s} to state {@code ts}, labeled with
 * symbols between {@code k} and {@code p} (1-based, inclusive). The transition is
 * registered in the hash map under the first symbol of its label.
 */
private final void createTransition(int s, int k, int p, int ts)
{
    assert k > 0 && p > 0;

    final int slot = addTransition(ts, k, p);
    final int firstSymbol = sequence.objectAt(k - 1);
    transitions_map.put(asLong(s, firstSymbol), slot);
}
/**
 * Reuse an existing transition slot to store a transition from state {@code s} to
 * state {@code ts}, labeled with symbols between {@code k} and {@code p} (1-based,
 * inclusive). The slot's next-edge entry is left untouched.
 *
 * @param transition Index of the slot in the transitions array to overwrite.
 */
private final void reuseTransition(int transition, int s, int k, int p, int ts)
{
    assert k > 0 && p > 0;

    // Overwrite target, label start and label end in place.
    transitions.set(transition, ts);
    transitions.set(transition + 1, k);
    transitions.set(transition + 2, p);

    // Register the slot under (state, first label symbol).
    final int firstSymbol = sequence.objectAt(k - 1);
    transitions_map.put(asLong(s, firstSymbol), transition);
}
/**
 * Adds a transition to state {@code ts}, labeled with symbols between {@code k} and
 * {@code p} (1-based, inclusive), but does not add hash map entry (for internal
 * use).
 *
 * @return Index of the first slot of the new transition in the transitions array.
 */
private final int addTransition(int ts, int k, int p)
{
    final int slot = transitions.size();

    transitions.add(ts);      // slot + 0: target state
    transitions.add(k);       // slot + 1: label start
    transitions.add(p);       // slot + 2: label end
    transitions.add(NO_EDGE); // slot + 3: next edge of the same state

    return slot;
}
/**
 * Find a transition from state {@code s}, labeled with symbol at index
 * {@code k - 1} in the input sequence. The head state has a single default
 * transition (to the root) that matches any symbol.
 *
 * @return Index of the transition or {@link #NO_EDGE} if there is none.
 */
private final int findTransition(int s, int k)
{
    if (s == head)
    {
        return root_transition;
    }
    return findEdge(s, sequence.objectAt(k - 1));
}
/**
 * Remove the transition from state {@code s}, labeled with symbol at index
 * {@code k - 1} and return its slot in the transitions array. Only the hash map
 * entry is removed; the slot's contents stay in place.
 */
private int removeTransition(int s, int k)
{
    assert s != head;

    final long key = asLong(s, sequence.objectAt(k - 1));
    return transitions_map.remove(key);
}
/**
 * Make a {@code long} from two integers: {@code i1} fills the upper 32 bits and
 * {@code i2} (taken as an unsigned 32-bit value) fills the lower 32 bits.
 */
private final static long asLong(int i1, int i2)
{
    final long upper = ((long) i1) << 32;
    final long lower = i2 & 0xffffffffL;
    return upper | lower;
}
/**
 * @return Return the number of transitions (edges) in the tree. The initial
 *         transition from the head state to the root is not counted.
 */
public final int getTransitionsCount()
{
    final int allocatedTransitions = transitions.size() / slots_per_transition;
    return allocatedTransitions - 1;
}
/**
 * @return Return the number of states in the tree (one less than the number of
 *         allocated states; presumably this excludes the auxiliary head state).
 */
public final int getStatesCount()
{
    final int allocatedStates = states.size();
    return allocatedStates - 1;
}
/**
 * Checks whether the tree contains a root-to-leaf path spelling {@code seq}.
 *
 * @param seq The sequence of symbols to look up. An empty sequence matches only
 *        when the root itself is a leaf (a tree built from an empty input).
 * @return {@code true} if this suffix tree has a path from the root state to a
 *         leaf state corresponding to a given sequence of objects. This indicates
 *         the input sequence had a suffix identical to {@code seq}.
 */
public boolean containsSuffix(ISequence seq)
{
    if (seq.size() == 0)
    {
        // Nothing to match: the path ends in the root. Without this guard the loop
        // below would read seq.objectAt(0) from an empty sequence.
        return isLeaf(root);
    }

    int state = root;
    int i = 0;
    while (true)
    {
        // Find an edge leaving the current state marked with symbol sequence[i].
        final int edge = findEdge(state, seq.objectAt(i));
        if (edge < 0)
        {
            // Different characters on explicit state.
            return false;
        }

        // Follow the edge, checking symbols on the way.
        int j = getStartIndex(edge);
        final int m = getEndIndex(edge) + 1;
        for (; i < seq.size() && j < m; j++, i++)
        {
            if (seq.objectAt(i) != this.sequence.objectAt(j))
            {
                // Different characters on implicit state.
                return false;
            }
        }

        if (i == seq.size())
        {
            // End of input sequence must be aligned with the tree's leaf state.
            return j == inputSize;
        }

        // The whole edge label matched and symbols remain: follow to the child state.
        state = getToState(edge);
    }
}
/**
 * Walks the states and edges of the suffix tree, depth-first, starting at the root
 * state.
 *
 * @param visitor Receives the traversal callbacks.
 */
public final void visit(final IVisitor visitor)
{
    visitState(this.root, visitor);
}
/**
 * Start visiting from a given state. The traversal is recursive and depth-first:
 * {@code pre} is called for the state, then {@code edge} for each outgoing edge
 * (descending into the target when accepted), and finally {@code post}.
 *
 * @param state State to start from.
 * @param visitor Receives the traversal callbacks.
 */
public final void visitState(final int state, final IVisitor visitor)
{
    if (!visitor.pre(state))
    {
        // Subtree rejected: neither the edges nor post() are visited.
        return;
    }

    for (int edge = firstEdge(state); edge != NO_EDGE; edge = nextEdge(edge))
    {
        final int toState = getToState(edge);
        if (visitor.edge(state, toState, getStartIndex(edge), getEndIndex(edge)))
        {
            visitState(toState, visitor);
        }
    }

    visitor.post(state);
}
/**
 * For procedural traversals (not visitors).
 *
 * @return Identifier of the root state.
 */
public int getRootState()
{
    return this.root;
}
/**
 * Check if {@code state} is a leaf (has no outgoing edges).
 *
 * @param state Identifier of the state to check.
 * @return {@code true} if the state is marked as a leaf in the states array.
 */
public final boolean isLeaf(int state)
{
    final int firstEdgeOrMarker = states.get(state);
    return firstEdgeOrMarker == LEAF_STATE;
}
/**
 * Returns the index of the first edge from a given state or {@link #NO_EDGE} if a
 * given state has no edges. Does not perform any sanity check on the input state.
 *
 * @param state Identifier of the state.
 */
public final int firstEdge(int state)
{
    return this.states.get(state);
}
/**
 * Returns the index of the next edge (sibling) or {@link #NO_EDGE} if {@code edge}
 * is the last edge in its state.
 *
 * @param edge Index of an edge, as returned by {@link #firstEdge(int)} or by a
 *        previous call to this method.
 */
public final int nextEdge(int edge)
{
    final int nextPointerSlot = edge + 3;
    return transitions.get(nextPointerSlot);
}
/**
 * Find a transition from state {@code state}, labeled with a given symbol.
 * {@link #NO_EDGE} is returned if there is no such edge.
 *
 * @param state Identifier of the state the edge leaves.
 * @param symbol First symbol of the edge's label.
 */
public final int findEdge(int state, int symbol)
{
    final long key = asLong(state, symbol);
    return transitions_map.getOrDefault(key, NO_EDGE);
}
/**
 * Returns the target state for a given edge.
 *
 * @param edge Index of the edge in the transitions array.
 */
public int getToState(int edge)
{
    return this.transitions.get(edge);
}
/**
 * Returns the edge label's start index (inclusive), converted from the 1-based
 * value stored internally to a zero-based index.
 *
 * @param edge Index of the edge in the transitions array.
 */
public int getStartIndex(int edge)
{
    final int oneBasedStart = transitions.get(edge + 1);
    return oneBasedStart - 1;
}
/**
 * Returns the edge label's end index (inclusive), converted from the 1-based value
 * stored internally to a zero-based index.
 *
 * @param edge Index of the edge in the transitions array.
 */
public int getEndIndex(int edge)
{
    final int oneBasedEnd = transitions.get(edge + 2);
    return oneBasedEnd - 1;
}
}