All Downloads are FREE. Search and download functionalities are using the official Maven repository.
Please wait. This can take some minutes ...
Downloading a project requires considerable server resources. Please understand that we have to cover our server costs. Thank you in advance.
Project price only 1 $
You can buy this project and download/modify it as often as you want.
com.helipy.text.ahocorasick.DatAutomaton Maven / Gradle / Ivy
package com.helipy.text.ahocorasick;
import java.io.PrintStream;
import java.util.Arrays;
import java.util.Collection;
import java.util.Deque;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
/**
* @param The related object type
* @author nuclear-sun
*/
public final class DatAutomaton implements Automaton {
private static final int MAX_KEYWORD_LENGTH = 1000;
private static final int ANCHOR_DEQUE_CLEAN_THRESHOLD = 20;
private final int[] base;
private final int[] check;
private final Tuple[] data;
private final int stateCount;
/**
* redundant data
*/
private final int reserveLength;
/**
* controls
*/
private final boolean interruptable;
private DatAutomaton(int[] base, int[] check, Tuple[] data, int stateCount, boolean interruptable) {
this.base = base;
this.check = check;
this.data = data;
this.stateCount = stateCount;
this.reserveLength = (stateCount << 1) + 1;
this.interruptable = interruptable;
}
static int calcStart(final Deque> anchorQueue, final int end, final int wordLength) {
int ignoreChars = 0;
Iterator> tupleIterator = anchorQueue.descendingIterator();
while (tupleIterator.hasNext()) {
Tuple anchor = tupleIterator.next();
if (end - anchor.first - ignoreChars >= wordLength) {
break;
}
ignoreChars += anchor.second;
}
return end - wordLength - ignoreChars + 1;
}
public static Builder builder() {
return new Builder();
}
private int nextState(int currState, char ch) {
int nextState = 0;
int index;
do {
index = base[currState] + ch;
if (index >= reserveLength && index < base.length && check[index] == currState) {
nextState = base[index];
return nextState;
}
currState = base[currState + stateCount];
if (currState == 0) {
return 1;
}
} while (nextState == 0);
// this line should never be reached
return 1;
}
/**
* @param state the state to study
* @return index for outer resource
*/
private List collectWords(final int state) {
List collector = null;
int curr = state;
while (curr > 1) {
int outerIndex = check[curr];
if (outerIndex > 0) {
if (collector == null) {
collector = new LinkedList<>();
}
collector.add(outerIndex);
}
curr = check[curr + stateCount];
}
return collector;
}
// CHECKSTYLE:ON
/**
* // CHECKSTYLE:OFF
*
* @param text text to scan
* @param handler how to handle matched keyword
*/
@Override
@SuppressWarnings("checkstyle:ReturnCount")
public void parseText(CharSequence text, MatchHandler handler) {
if (text == null || handler == null) {
return;
}
int currState = 1;
// used to calculate left edge when some special chars are ignored
final Deque> anchorDeque = new LinkedList<>();
int totalIgnoreChars = 0;
for (int i = 0, length = text.length(); i < length; i++) {
final char ch = text.charAt(i);
if (ch == 0) {
// treat '\0' as the only special case to ignore
totalIgnoreChars += 1;
Tuple last = anchorDeque.peekLast();
if (anchorDeque.isEmpty() || last.first != i - 1) {
Tuple newItem = new Tuple<>(i, 1);
anchorDeque.offerLast(newItem);
// clean anchorDeque when new item added
if (anchorDeque.size() > ANCHOR_DEQUE_CLEAN_THRESHOLD) {
Tuple firstItem = anchorDeque.peekFirst();
while (anchorDeque.size() > 1
&& i - firstItem.first - (totalIgnoreChars - firstItem.second) >= MAX_KEYWORD_LENGTH) {
totalIgnoreChars -= firstItem.second;
anchorDeque.removeFirst();
}
}
} else {
// last != null && i == last.getFirst() + 1
last.first = i;
last.second += 1;
}
continue;
}
currState = nextState(currState, ch);
List outputs = collectWords(currState);
if (outputs != null) {
for (Integer index : outputs) {
Tuple datum = this.data[index];
int start = calcStart(anchorDeque, i, datum.first.length());
boolean isContinue = handler.onMatch(start, i + 1, datum.first, datum.second);
if (!isContinue) {
return;
}
}
}
if (this.interruptable && Thread.currentThread().isInterrupted()) {
return;
}
}
}
@Override
public List> parseText(CharSequence text) {
List> results = new LinkedList<>();
MatchHandler listener = new MatchHandler() {
@Override
public boolean onMatch(int start, int end, String key, V value) {
Emit emit = new Emit<>(key, start, end, value);
results.add(emit);
return true;
}
};
parseText(text, listener);
return results;
}
// methods for statistics
public int getStateCount() {
return this.stateCount;
}
public int getArrayLength() {
return base.length;
}
public int getKeywordSize() {
return data.length;
}
public boolean getInterruptable() {
return this.interruptable;
}
private int countFreeSlots() {
int count = 0;
int ind = -check[0];
while (ind != 0) {
count += 1;
ind = -check[ind];
}
return count;
}
public void printStats(boolean detail, PrintStream out) {
out.println("words: " + data.length);
out.println("stateCount: " + stateCount);
out.println("capacity: " + base.length);
out.println("freeSlots: " + countFreeSlots());
if (detail) {
out.println();
out.println("index & base & check");
for (int i = 0; i < base.length; i++) {
out.printf("%5d %5d %5d%n", i, base[i], check[i]);
}
}
}
private static class Tuple {
private V1 first;
private V2 second;
Tuple(V1 first, V2 second) {
this.first = first;
this.second = second;
}
}
public static final class Builder {
private int[] base;
private int[] check;
// start from index 1
private Tuple[] data;
private int stateCount;
// controls
private boolean interruptable = false;
// temp
private Trie trie;
private Map dataMap;
private Map invertedIndex;
private Builder() {
this.dataMap = new HashMap<>();
this.invertedIndex = new HashMap<>();
}
public Builder put(String key, V value) {
this.dataMap.put(key, value);
return this;
}
public Builder putAll(Map data) {
this.dataMap.putAll(data);
return this;
}
public V get(String key) {
return this.dataMap.get(key);
}
public Builder add(String key) {
this.dataMap.put(key, null);
return this;
}
public Builder addAll(Collection keys) {
for (String key : keys) {
this.dataMap.put(key, null);
}
return this;
}
public Builder setInterruptable(boolean interruptable) {
this.interruptable = interruptable;
return this;
}
public DatAutomaton build() {
prepareData();
buildTrie();
this.stateCount = trie.getStateCount();
initDoubleArray();
placeAllStateInfo();
return new DatAutomaton(base, check, data, stateCount, interruptable);
}
private void prepareData() {
this.data = new Tuple[dataMap.size() + 1];
int i = 1;
for (Map.Entry mapEntry : dataMap.entrySet()) {
Tuple dataEntry = new Tuple<>(mapEntry.getKey(), mapEntry.getValue());
this.data[i] = dataEntry;
invertedIndex.put(mapEntry.getKey(), i);
i++;
}
}
private void buildTrie() {
Trie trieTmp = new Trie();
for (int i = 1; i < data.length; i++) {
trieTmp.addKeyword(data[i].first);
}
trieTmp.constructFailureAndPrevWordPointer();
this.trie = trieTmp;
}
private void initDoubleArray() {
int initCapacity = this.stateCount << 2;
this.base = new int[initCapacity];
this.check = new int[initCapacity];
// build free list
final int firstFreeIndex = (stateCount << 1) + 1;
final int lastFreeIndex = initCapacity - 1;
check[0] = -firstFreeIndex;
base[0] = -lastFreeIndex;
for (int i = firstFreeIndex; i < initCapacity; i++) {
check[i] = -(i + 1);
base[i] = -(i - 1);
}
check[lastFreeIndex] = 0;
base[firstFreeIndex] = 0;
}
private void resize(int newCapacity) {
if (newCapacity <= this.base.length) {
return;
}
int[] newBase = new int[newCapacity];
int[] newCheck = new int[newCapacity];
System.arraycopy(base, 0, newBase, 0, base.length);
System.arraycopy(check, 0, newCheck, 0, check.length);
// build free list
for (int i = check.length; i < newCapacity; i++) {
newCheck[i] = -(i + 1);
newBase[i] = -(i - 1);
}
int prevLastFreeIndex = -base[0];
newCheck[prevLastFreeIndex] = -check.length;
newCheck[newCapacity - 1] = 0;
newBase[base.length] = -prevLastFreeIndex;
newBase[0] = -(newCapacity - 1);
this.check = newCheck;
this.base = newBase;
}
/**
* find free indexes for characters of state
*
* @param state
* @return the appropriate base address value, may be negative. Use Integer.MAX_VALUE to indicate failure.
*/
private int findBaseValue(final State state) {
Map childrenMap = state.getSuccess();
if (childrenMap.size() == 0) {
return 0;
}
char[] characters = new char[childrenMap.size()];
int i = 0;
for (Character ch : childrenMap.keySet()) {
characters[i++] = ch;
}
Arrays.sort(characters);
int[] diffs = new int[characters.length];
for (i = 1; i < characters.length; i++) {
diffs[i] = characters[i] - characters[0];
}
int freeIndex = -check[0];
while (freeIndex > 0) {
int maxNeededIndex = freeIndex + diffs[characters.length - 1];
if (maxNeededIndex >= check.length) {
resize(maxNeededIndex + 1);
} else if (maxNeededIndex < 0) { // overflow
throw new OutOfMemoryError();
}
for (i = characters.length - 1; i > 0; i--) {
int detectIndex = freeIndex + diffs[i];
if (check[detectIndex] > 0) {
break;
}
}
if (i == 0) {
return freeIndex - characters[0];
}
freeIndex = -check[freeIndex];
}
return Integer.MAX_VALUE;
}
private void placeAllStateInfo() {
Queue queue = new LinkedList<>();
queue.offer(trie.getRootState());
while (!queue.isEmpty()) {
State state = queue.poll();
int ordinal = state.getOrdinal();
// 1. place outer index
final String keyword = state.getKeyword();
if (keyword != null) {
check[ordinal] = invertedIndex.get(keyword);
}
// 2. place failure pointer
State failure = state.getFailure();
if (failure != null) {
base[ordinal + stateCount] = failure.getOrdinal();
}
// 3. place previous word pointer
State prevWordState = state.getPrevWordState();
if (prevWordState != null) {
check[ordinal + stateCount] = prevWordState.getOrdinal();
}
// 4. place success table
placeSuccess(state);
for (State child : state.getSuccess().values()) {
queue.offer(child);
}
}
}
private void placeSuccess(State state) {
int baseValue = findBaseValue(state);
if (baseValue == Integer.MAX_VALUE) {
throw new RuntimeException("No base value found for state " + state.getOrdinal());
}
// place base address
base[state.getOrdinal()] = baseValue;
// place goto table
Map success = state.getSuccess();
for (Map.Entry entry : success.entrySet()) {
int freeIndex = baseValue + entry.getKey();
int prevFreeIndex = -base[freeIndex];
int nextFreeIndex = -check[freeIndex];
check[prevFreeIndex] = -nextFreeIndex;
base[nextFreeIndex] = -prevFreeIndex;
base[freeIndex] = entry.getValue().getOrdinal();
check[freeIndex] = state.getOrdinal();
}
}
}
}