
eu.crydee.ahocorasick.AhoCorasick Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of aho-corasick Show documentation
Show all versions of aho-corasick Show documentation
Aho-Corasick search with generic alphabets
The newest version!
package eu.crydee.ahocorasick;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.SetMultimap;
import java.util.Collections;
import java.util.Deque;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
/**
 * Generic Aho-Corasick search class.
 *
 * <p>The automaton is built once, in the constructor, from a list of patterns
 * and can then be run over any number of texts. Symbols are used as keys of a
 * {@link HashMap}, so {@code T} must implement {@code equals} and
 * {@code hashCode} consistently.
 *
 * @param <T> type of the alphabet to use for your searches.
 */
public class AhoCorasick<T> {

    /**
     * A node of the automaton: goto transitions, a fail transition, and the
     * indices of the patterns recognised when this node is reached.
     *
     * <p>Declared static so a node does not carry a reference to the enclosing
     * automaton.
     *
     * @param <S> type of the symbols labelling the goto transitions.
     */
    private static final class State<S> {

        private final Map<S, State<S>> goToTransitions = new HashMap<>();
        private final Set<Integer> outputs = new HashSet<>();
        private State<S> failTransition;
        /** Target used when no explicit goto exists; {@code null} if none. */
        private State<S> defaultTransition;

        /** Returns a read-only view of the explicit goto transitions. */
        Map<S, State<S>> getGoToTransitions() {
            return Collections.unmodifiableMap(goToTransitions);
        }

        /**
         * Follows the goto transition for {@code symbol}.
         *
         * @return the explicit target if there is one, otherwise the default
         * transition, otherwise {@code null}.
         */
        State<S> goTo(S symbol) {
            // Targets are never null (see addGoTo callers), so a single
            // lookup is enough to tell "absent" from "present".
            State<S> target = goToTransitions.get(symbol);
            return target != null ? target : defaultTransition;
        }

        /** Tells whether {@link #goTo} would return a state for {@code symbol}. */
        boolean canGoTo(S symbol) {
            return defaultTransition != null
                    || goToTransitions.containsKey(symbol);
        }

        void addGoTo(S symbol, State<S> target) {
            goToTransitions.put(symbol, target);
        }

        void addOutput(Integer patternIndex) {
            outputs.add(patternIndex);
        }

        void addOutputs(Set<Integer> patternIndices) {
            outputs.addAll(patternIndices);
        }

        /** Returns a read-only view of the pattern indices recognised here. */
        Set<Integer> getOutputs() {
            return Collections.unmodifiableSet(outputs);
        }

        State<S> fail() {
            return failTransition;
        }

        void setFail(State<S> target) {
            failTransition = target;
        }

        void setDefault(State<S> state) {
            defaultTransition = state;
        }
    }

    private final State<T> root = new State<>();

    /**
     * Constructor of the AhoCorasick search automaton.
     *
     * <p>NOTE(review): an empty pattern registers its index on the root state
     * and gets no special treatment during construction, so results for empty
     * patterns are probably not meaningful; confirm before relying on them.
     *
     * @param patterns The list of patterns (sequence of symbols) to use during
     * the search. A pattern is later identified by its index in this list.
     */
    public AhoCorasick(List<T[]> patterns) {
        /* First phase of the construction in the original paper.
         * Builds the goto graph and assigns some outputs.
         */
        for (int i = 0, s = patterns.size(); i < s; ++i) {
            State<T> current = root;
            for (T symbol : patterns.get(i)) {
                if (current.canGoTo(symbol)) {
                    current = current.goTo(symbol);
                } else {
                    State<T> newState = new State<>();
                    current.addGoTo(symbol, newState);
                    current = newState;
                }
            }
            current.addOutput(i);
        }
        // Only now does the root loop on itself for unknown symbols: setting
        // this earlier would make canGoTo always true at the root and prevent
        // the first phase from creating depth-1 states.
        root.setDefault(root);

        /* Second phase of the construction in the original paper.
         * Builds the fail function and assigns the rest of the outputs.
         */
        // initialization: all the depth == 1 nodes get a fail to the root.
        Deque<State<T>> pool
                = new LinkedList<>(root.getGoToTransitions().values());
        pool.forEach(depthOne -> depthOne.setFail(root));
        // Breadth-first traversal: states are taken from the head and their
        // children appended at the tail, so a state's fail link is always set
        // before its children are processed.
        while (!pool.isEmpty()) {
            State<T> state = pool.pop();
            for (Map.Entry<T, State<T>> e
                    : state.getGoToTransitions().entrySet()) {
                T symbol = e.getKey();
                State<T> next = e.getValue();
                pool.addLast(next);
                State<T> fail = state.fail();
                // Terminates at the root at the latest, which accepts any
                // symbol thanks to its default transition.
                while (!fail.canGoTo(symbol)) {
                    fail = fail.fail();
                }
                fail = fail.goTo(symbol);
                next.setFail(fail);
                next.addOutputs(fail.getOutputs());
            }
        }
    }

    /**
     * Entry point of the library.
     *
     * Performs the Aho-Corasick search on the text. The result is a multimap
     * from matched patterns indices to text indices marking the end of matches.
     * Both indices are calculated starting from 0.
     *
     * @param text the sequence of symbols to search.
     * @return a map of patterns indices to text indices.
     */
    public SetMultimap<Integer, Integer> searchPatternToPos(T[] text) {
        return search(text, true);
    }

    /**
     * Entry point of the library.
     *
     * Performs the Aho-Corasick search on the text. The result is a multimap
     * from text indices marking the end of matches to indices of matched
     * patterns. Both indices are calculated starting from 0.
     *
     * @param text the sequence of symbols to search.
     * @return a map of text indices to patterns indices.
     */
    public SetMultimap<Integer, Integer> searchPosToPattern(T[] text) {
        return search(text, false);
    }

    /**
     * Runs the automaton over the text and collects every match.
     *
     * @param text the sequence of symbols to search.
     * @param reversed if {@code true}, keys of the result are pattern indices
     * and values are the text indices at which a match ends; if {@code false},
     * keys are text indices and values are pattern indices.
     * @return a newly created multimap holding the matches, oriented as
     * requested by {@code reversed}.
     */
    public SetMultimap<Integer, Integer> search(T[] text, boolean reversed) {
        SetMultimap<Integer, Integer> result = HashMultimap.create();
        State<T> state = root;
        for (int i = 0, s = text.length; i < s; ++i) {
            T symbol = text[i];
            // Fall back along fail links until a state accepts the symbol;
            // the root always does.
            while (!state.canGoTo(symbol)) {
                state = state.fail();
            }
            state = state.goTo(symbol);
            Set<Integer> matched = state.getOutputs();
            if (reversed) {
                for (Integer patternIndex : matched) {
                    result.put(patternIndex, i);
                }
            } else {
                result.putAll(i, matched);
            }
        }
        return result;
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy