// All Downloads are FREE. Search and download functionalities are using the official Maven repository.
//
// com.lahodiuk.ahocorasick.AhoCorasickOptimized Maven / Gradle / Ivy

package com.lahodiuk.ahocorasick;

// Java Collections are used only during the building of the automaton.
// The automaton itself uses only the primitive data types 
// and does not produce garbage during the matching.

import java.util.*;
import java.util.stream.Stream;

/**
 * TAKEN FROM:
 * https://github.com/lagodiuk/aho-corasick-optimized/blob/master/src/main/java/com/lahodiuk/ahocorasick/AhoCorasickOptimized.java
 * 
 * Implementation of the Aho-Corasick string matching algorithm, described in
 * the paper "Efficient String Matching: An Aid to Bibliographic Search",
 * written by Alfred V. Aho and Margaret J. Corasick, Bell Laboratories, 1975
 *
 * This implementation takes into account the specificities of the HotSpot JVM,
 * and supposed to be the Garbage Collector friendly. The automaton is based
 * only on the primitive data types in order to avoid Autoboxing and Unboxing
 * conversions.
 *
 * States are numbered from {@code 0} (the initial state); every state created
 * for a pattern character has a number greater than {@code 0}.
 *
 * @author of the implementation is Yurii Lahodiuk ([email protected])
 * @author 2017-11-11: Erik Faessler from the JULIE Lab added the method
 *         {@link #isEntryPrefix(String)}.
 */
public class AhoCorasickOptimized {

	private static final int INITIAL_STATE = 0;
	private static final int FAIL = -1;

	// the sorted array of the unique characters (alphabet)
	// every character from the alphabet is "mapped" to its own index inside
	// this array
	// mapping: "character" -> "character index"
	private char[] charToIntMapping;
	// every character, which is not inside the alphabet is mapped to this
	// special index
	private final int absentCharInt;

	// the automaton transitions table
	// mapping: "current state AND input character index" -> "new state"
	private int[][] goTo;
	// table of the outputs of every state
	// mapping: "state" -> "matched patterns"
	private List<String>[] output;
	// table of the fail transitions of the automaton
	// mapping: "state" -> "new state"
	private int[] fail;

	/**
	 * Builds the automaton for the given patterns.
	 *
	 * @param patterns
	 *            the dictionary entries to search for
	 */
	public AhoCorasickOptimized(String... patterns) {
		this(Arrays.asList(patterns));
	}

	/**
	 * Builds the automaton for the given patterns. The collection is iterated
	 * several times during construction.
	 *
	 * @param patterns
	 *            the dictionary entries to search for
	 * @throws NullPointerException
	 *             if {@code patterns} is {@code null}
	 * @throws ArithmeticException
	 *             if the summed length of all patterns does not fit into an
	 *             {@code int}
	 */
	public AhoCorasickOptimized(Collection<String> patterns) {
		Objects.requireNonNull(patterns, "patterns");

		this.initializeCharToIntMapping(patterns);
		this.absentCharInt = this.charToIntMapping.length;

		// Upper bound for the number of states; the tables are first
		// allocated with this size and trimmed afterwards.
		int maxAmountOfStates = this.getMaxPossibleAmountOfStates(patterns);

		this.initializeTransitionsTable(maxAmountOfStates);
		this.initializeOutputTable(maxAmountOfStates);
		this.initializeFailureTransitions(maxAmountOfStates);

		int actualStatesCount = this.calculateTransitionsTable(patterns);

		this.adjustTransitionsTableSize(actualStatesCount);
		this.adjustOutputTableSize(actualStatesCount);
		this.adjustFailureTransitionsSize(actualStatesCount);

		// Must happen after the trie is built (trie insertion relies on FAIL
		// entries in the initial row) and before the failure links are
		// computed (their loops terminate at the never-failing initial state).
		this.makeInitialStateNeverFail();
		this.calculateFailureTransitions();
	}

	/**
	 * Shrinks the failure table to the given number of states, keeping the
	 * first {@code actualStatesCount} entries. Does nothing if the table
	 * already has that size.
	 *
	 * @param actualStatesCount
	 *            the number of states to keep
	 */
	public void adjustFailureTransitionsSize(int actualStatesCount) {
		if (actualStatesCount == this.fail.length) {
			return;
		}
		int[] adjustedFail = new int[actualStatesCount];
		System.arraycopy(this.fail, 0, adjustedFail, 0, actualStatesCount);
		this.fail = adjustedFail;
	}

	/**
	 * Shrinks the output table to the given number of states, keeping the
	 * first {@code actualStatesCount} entries. Does nothing if the table
	 * already has that size.
	 *
	 * @param actualStatesCount
	 *            the number of states to keep
	 */
	public void adjustOutputTableSize(int actualStatesCount) {
		if (actualStatesCount == this.output.length) {
			return;
		}
		// Generic array creation is not possible; only List<String> values
		// are ever stored in this array.
		@SuppressWarnings("unchecked")
		List<String>[] adjustedOutput = new List[actualStatesCount];
		System.arraycopy(this.output, 0, adjustedOutput, 0, actualStatesCount);
		this.output = adjustedOutput;
	}

	/**
	 * Shrinks the transitions table to the given number of states, keeping the
	 * rows of the first {@code actualStatesCount} states. Does nothing if the
	 * table already has that size.
	 *
	 * @param actualStatesCount
	 *            the number of states to keep
	 */
	public void adjustTransitionsTableSize(int actualStatesCount) {
		if (actualStatesCount == this.goTo.length) {
			return;
		}
		// Only the outer array is re-allocated; the existing rows are reused.
		int[][] adjustedGoTo = new int[actualStatesCount][];
		System.arraycopy(this.goTo, 0, adjustedGoTo, 0, actualStatesCount);
		this.goTo = adjustedGoTo;
	}

	/**
	 * Scans the text and reports every occurrence of every pattern.
	 *
	 * @param text
	 *            the text to search in
	 * @param callback
	 *            invoked once per occurrence with the inclusive start index,
	 *            the inclusive end index and the matched pattern
	 */
	public final void match(final String text, MatchCallback callback) {

		int state = INITIAL_STATE;

		for (int ci = 0; ci < text.length(); ci++) {

			int chrInt = this.charIndexOf(text.charAt(ci));

			// Follow the failure links until a transition exists; the initial
			// state has a transition for every character index.
			while (this.goTo[state][chrInt] == FAIL) {
				state = this.fail[state];
			}

			state = this.goTo[state][chrInt];

			// Indexed loop on purpose: no Iterator is allocated per character.
			List<String> matched = this.output[state];
			for (int j = 0; j < matched.size(); j++) {
				String found = matched.get(j);
				callback.onMatch((ci - found.length()) + 1, ci, found);
			}
		}
	}

	/**
	 * Added by Erik Faessler, 2017-11-11: This method does not find dictionary
	 * entries in the given string but just checks if the given string is a
	 * strict prefix of at least one dictionary entry. That is, the method
	 * returns false for strings that are completely contained in the dictionary
	 * and not a prefix of another entry.
	 *
	 * Only transitions created for pattern characters are followed. The
	 * self-loops of the initial state (added so that matching never fails
	 * there) are treated as failures; otherwise a string such as "xab" would
	 * be reported as a prefix of the entry "abc".
	 * 
	 * @param text
	 *            the candidate prefix
	 * @return {@code true} if {@code text} is non-empty and some dictionary
	 *         entry starts with {@code text} and is longer than it;
	 *         {@code false} otherwise
	 */
	public final boolean isEntryPrefix(final String text) {

		int state = INITIAL_STATE;

		for (int ci = 0; ci < text.length(); ci++) {

			int next = this.goTo[state][this.charIndexOf(text.charAt(ci))];

			// No pattern-created transition ever leads to the initial state, so a
			// transition to it is one of the artificial self-loops.
			if (next == FAIL || next == INITIAL_STATE) {
				return false;
			}

			state = next;
		}

		if (!this.output[state].isEmpty()) {
			// Some entry ends in this state. The input is a strict prefix only
			// if there are further transitions for longer entries.
			return this.hasOutgoingTrieTransition(state);
		}

		// No entry ends here. The input was consumed without failing, so it is
		// a prefix unless nothing was consumed at all.
		return state != INITIAL_STATE;
	}

	// Tells whether the state has at least one transition that was created
	// for a pattern character (i.e. neither FAIL nor an initial-state
	// self-loop).
	private boolean hasOutgoingTrieTransition(int state) {
		for (int target : this.goTo[state]) {
			if (target != FAIL && target != INITIAL_STATE) {
				return true;
			}
		}
		return false;
	}

	// Maps a character to its column in the transitions table; characters
	// outside of the alphabet are mapped to the dedicated "absent" column.
	private int charIndexOf(char chr) {
		int idx = Arrays.binarySearch(this.charToIntMapping, chr);
		return idx < 0 ? this.absentCharInt : idx;
	}

	// Allocates one empty output list per possible state.
	@SuppressWarnings("unchecked")
	private void initializeOutputTable(int maxAmountOfStates) {
		this.output = new List[maxAmountOfStates];
		for (int i = 0; i < this.output.length; i++) {
			this.output[i] = new ArrayList<>();
		}
	}

	// Allocates the failure table; every state except the initial one starts
	// without a failure link.
	private void initializeFailureTransitions(int maxAmountOfStates) {
		this.fail = new int[maxAmountOfStates];
		Arrays.fill(this.fail, FAIL);
		this.fail[INITIAL_STATE] = INITIAL_STATE;
	}

	// Allocates the transitions table (one column per alphabet character plus
	// one column for absent characters) with all entries set to FAIL.
	private void initializeTransitionsTable(int maxAmountOfStates) {
		this.goTo = new int[maxAmountOfStates][this.charToIntMapping.length + 1];
		for (int[] row : this.goTo) {
			Arrays.fill(row, FAIL);
		}
	}

	// Replaces every FAIL entry of the initial state with a self-loop.
	private void makeInitialStateNeverFail() {
		int[] initialRow = this.goTo[INITIAL_STATE];
		for (int i = 0; i < initialRow.length; i++) {
			if (initialRow[i] == FAIL) {
				initialRow[i] = INITIAL_STATE;
			}
		}
	}

	// Upper bound of the number of states: the initial state plus one state
	// per pattern character. Overflow is reported instead of wrapping around.
	private int getMaxPossibleAmountOfStates(Collection<String> patterns) {
		int total = 1;
		for (String pattern : patterns) {
			total = Math.addExact(total, pattern.length());
		}
		return total;
	}

	// Collects the distinct characters of all patterns into the sorted
	// alphabet array.
	private void initializeCharToIntMapping(Collection<String> patterns) {
		Set<Character> uniqueChars = new HashSet<>();
		for (String pattern : patterns) {
			for (int i = 0; i < pattern.length(); i++) {
				uniqueChars.add(pattern.charAt(i));
			}
		}
		this.charToIntMapping = new char[uniqueChars.size()];
		int charToIntMappingIdx = 0;
		for (char c : uniqueChars) {
			this.charToIntMapping[charToIntMappingIdx++] = c;
		}
		// binarySearch is used for the lookups, hence the array must be sorted
		Arrays.sort(this.charToIntMapping);
	}

	// Calculation of the failure transitions using BFS
	private void calculateFailureTransitions() {

		Queue<Integer> queue = new ArrayDeque<>();

		// all states of depth 1 (counting from the initial state)
		// have failure transition to the initial state
		for (int stateReachableFromInitial : this.goTo[INITIAL_STATE]) {
			if (stateReachableFromInitial != INITIAL_STATE) {
				queue.add(stateReachableFromInitial);
				this.fail[stateReachableFromInitial] = INITIAL_STATE;
			}
		}

		while (!queue.isEmpty()) {
			int curr = queue.remove();

			for (int chrInt = 0; chrInt < this.goTo[curr].length; chrInt++) {

				int stateReachableFromCurr = this.goTo[curr][chrInt];

				if (stateReachableFromCurr != FAIL) {
					queue.add(stateReachableFromCurr);

					// terminates at the latest in the initial state, which
					// never fails
					int state = this.fail[curr];
					while (this.goTo[state][chrInt] == FAIL) {
						state = this.fail[state];
					}

					int failTarget = this.goTo[state][chrInt];
					this.fail[stateReachableFromCurr] = failTarget;
					// the failure target is shallower, so BFS has already
					// completed its output list
					this.output[stateReachableFromCurr].addAll(this.output[failTarget]);
				}
			}
		}
	}

	// Inserts all patterns into the transitions table (trie construction) and
	// returns the number of states which are actually used.
	private int calculateTransitionsTable(Collection<String> patterns) {

		int newState = 0;
		for (String s : patterns) {

			int state = INITIAL_STATE;

			for (int ci = 0; ci < s.length(); ci++) {
				// every pattern character is part of the alphabet, so the
				// search always yields a valid index
				int chrInt = Arrays.binarySearch(this.charToIntMapping, s.charAt(ci));

				int next = this.goTo[state][chrInt];
				if (next == FAIL) {
					// creation of the new state
					newState = newState + 1;
					next = newState;
					this.goTo[state][chrInt] = next;
				}
				state = next;
			}

			// remember current pattern as the output for the last processed
			// state
			this.output[state].add(s);
		}

		return newState + 1;
	}

	/**
	 * Renders the automaton in the Graphviz DOT format.
	 *
	 * @param displayEdgesToInitialState
	 *            whether failure transitions which lead to the initial state
	 *            are included
	 * @return the DOT source of the automaton
	 */
	public String generateGraphvizAutomatonRepresentation(boolean displayEdgesToInitialState) {
		return Util.generateGraphvizAutomatonRepresentation(this, displayEdgesToInitialState);
	}

	/**
	 * Receiver of the matches found by
	 * {@link AhoCorasickOptimized#match(String, MatchCallback)}.
	 */
	public interface MatchCallback {

		/**
		 * Called for every occurrence of a pattern.
		 *
		 * @param startPosition
		 *            index of the first character of the occurrence
		 * @param endPosition
		 *            index of the last character of the occurrence (inclusive)
		 * @param matched
		 *            the pattern that was found
		 */
		void onMatch(int startPosition, int endPosition, String matched);
	}

	/**
	 * Helpers for rendering an automaton in the Graphviz DOT format.
	 */
	public static class Util {

		private static final String STYLE_FAILURE_TRANSITION = " [style=dashed, color=gray, constraint=false];";
		private static final String STYLE_STATE_WITHOUT_OUTPUT = " [shape=circle];";
		private static final String STYLE_STATE_WITH_OUTPUT = " [shape=doublecircle];";
		private static final char TAB = '\t';
		private static final char NEW_LINE = '\n';

		/**
		 * Renders the automaton in the Graphviz DOT format: first the
		 * transitions in BFS order, then the failure transitions, then the
		 * state declarations.
		 *
		 * @param automaton
		 *            the automaton to render
		 * @param displayEdgesToInitialState
		 *            whether failure transitions which lead to the initial
		 *            state are included
		 * @return the DOT source of the automaton
		 */
		public static String generateGraphvizAutomatonRepresentation(AhoCorasickOptimized automaton,
				boolean displayEdgesToInitialState) {

			StringBuilder sb = new StringBuilder();
			sb.append("digraph automaton {").append(NEW_LINE);

			sb.append(TAB).append("graph [rankdir=LR];").append(NEW_LINE);
			Queue<Integer> queue = new ArrayDeque<>();
			queue.add(INITIAL_STATE);

			List<Integer> visitedStates = new ArrayList<>();

			// BFS traversal of the automaton
			while (!queue.isEmpty()) {
				int state = queue.remove();
				visitedStates.add(state);

				// the column of the absent character is deliberately left out
				for (int charInt = 0; charInt < automaton.charToIntMapping.length; charInt++) {

					int target = automaton.goTo[state][charInt];
					if ((target != FAIL) && (target != INITIAL_STATE)) {

						queue.add(target);

						appendAutomatonTransitionGraphviz(automaton, sb, state, charInt);
					}
				}
			}

			appendFailureTransitionsToGraphviz(automaton, displayEdgesToInitialState, sb, visitedStates);

			displayStatesInGraphviz(automaton, sb, visitedStates);

			sb.append("}");
			return sb.toString();
		}

		/**
		 * Appends one DOT edge for the transition which leaves {@code state}
		 * on the character with the index {@code charInt}.
		 *
		 * @param automaton
		 *            the automaton the transition belongs to
		 * @param sb
		 *            the builder the edge is appended to
		 * @param state
		 *            the source state of the transition
		 * @param charInt
		 *            the index of the transition character inside the alphabet
		 */
		public static void appendAutomatonTransitionGraphviz(AhoCorasickOptimized automaton, StringBuilder sb,
				int state, int charInt) {

			sb.append(TAB).append(state).append(" -> ").append(automaton.goTo[state][charInt]).append(" [label=")
					.append(automaton.charToIntMapping[charInt]).append(", weight=100, style=bold];").append(NEW_LINE);
		}

		// Declares every visited state; states with output are drawn as a
		// double circle.
		private static void displayStatesInGraphviz(AhoCorasickOptimized automaton, StringBuilder sb,
				List<Integer> visitedStates) {

			for (int state : visitedStates) {
				String style = automaton.output[state].isEmpty() ? STYLE_STATE_WITHOUT_OUTPUT
						: STYLE_STATE_WITH_OUTPUT;
				sb.append(TAB).append(state).append(style).append(NEW_LINE);
			}
		}

		// Appends the failure transitions of the given states. Links to the
		// initial state are skipped unless requested; the link of the initial
		// state itself is always drawn.
		private static void appendFailureTransitionsToGraphviz(AhoCorasickOptimized automaton,
				boolean displayEdgesToInitialState, StringBuilder sb, List<Integer> states) {

			for (int state : states) {
				if (displayEdgesToInitialState
						|| ((automaton.fail[state] != INITIAL_STATE) || (state == INITIAL_STATE))) {

					sb.append(TAB).append(state).append(" -> ").append(automaton.fail[state])
							.append(STYLE_FAILURE_TRANSITION).append(NEW_LINE);
				}
			}
		}
	}
}




// © 2015 - 2025 Weber Informatics LLC | Privacy Policy