All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.eclipse.jface.text.MultiStringMatcher Maven / Gradle / Ivy

/*******************************************************************************
 * Copyright (c) 2019 Paul Pazderski, Thomas Wolf, and others.
 *
 * This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License 2.0
 * which accompanies this distribution, and is available at
 * https://www.eclipse.org/legal/epl-2.0/
 *
 * SPDX-License-Identifier: EPL-2.0
 *
 * Contributors:
 *     Paul Pazderski; Thomas Wolf - initial API and implementation
 *******************************************************************************/
package org.eclipse.jface.text;

import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.function.Consumer;
import java.util.stream.Collectors;

/**
 * Fast matcher to find the occurrences of any of a fixed set of constant strings. Supports finding
 * all (possibly overlapping) matches, or only the leftmost longest match.
 *
 * @since 3.9
 */
public class MultiStringMatcher {

	// An implementation of the Aho-Corasick algorithm (without the DFA construction from section 6 of the
	// paper; just the failure and output links).
	//
	// See Aho, Alfred V.; Corasick, Margaret J.: "Efficient String Matching: An Aid to Bibliographic Search",
	// CACM 18(6), 1975.
	//
	// The algorithm has been modified to support reporting either all matches or only leftmost longest matches.

	/**
	 * Describes a match result of {@link MultiStringMatcher#indexOf(CharSequence, int)}, giving
	 * access to the matched string and the offset in the text it was matched at.
	 */
	public static interface Match {

		/**
		 * Obtains the matched string.
		 *
		 * @return the text matched
		 */
		String getText();

		/**
		 * Obtains the offset the {@link #getText() text} was matched at.
		 *
		 * @return the offset
		 */
		int getOffset();

	}

	/** A Builder for creating a {@link MultiStringMatcher}. */
	public static interface Builder {

		/**
		 * Adds search strings to be looked for. {@code null} and empty strings in the arguments are
		 * ignored.
		 *
		 * @param searchStrings to add to be looked for by the matcher.
		 * @return this
		 * @throws IllegalStateException if the {@link MultiStringMatcher} was already built.
		 */
		Builder add(String... searchStrings);

		/**
		 * Returns the {@link MultiStringMatcher} built by this builder.
		 * <p>
		 * Note that a {@link Builder} instance can build only one
		 * {@link MultiStringMatcher} instance. This is by design; otherwise the builder would have
		 * to store all the searchStrings somewhere, which may be rather memory intensive if a lot
		 * of search strings are added.
		 * </p>
		 *
		 * @return the {@link MultiStringMatcher}
		 * @throws IllegalStateException if the {@link MultiStringMatcher} was already built.
		 */
		MultiStringMatcher build();
	}

	/**
	 * Default {@link Builder}. It fills the trie of a single {@link MultiStringMatcher} directly
	 * and hands that instance out exactly once.
	 */
	private static class BuilderImpl implements Builder {

		/** The matcher under construction; set to {@code null} once {@link #build()} was called. */
		private MultiStringMatcher m;

		BuilderImpl() {
			m= new MultiStringMatcher();
		}

		/** Fails if {@link #build()} was already called on this builder. */
		private void check() {
			if (m == null) {
				throw new IllegalStateException("Builder.build() was already called"); //$NON-NLS-1$
			}
		}

		@Override
		public Builder add(String... searchStrings) {
			check();
			m.add(searchStrings);
			return this;
		}

		@Override
		public MultiStringMatcher build() {
			check();
			MultiStringMatcher result= m;
			m= null;
			if (!result.root.hasChildren()) {
				// no search strings were added; return a specialized "matches nothing" matcher
				return new MultiStringMatcher() {

					@Override
					public void find(CharSequence text, int offset, Consumer<Match> matches) {
						return;
					}

					@Override
					public Match indexOf(CharSequence text, int offset) {
						return null;
					}
				};
			}
			result.buildLinks();
			return result;
		}
	}

	/**
	 * Creates an initially empty {@link Builder}.
	 *
	 * @return the {@link Builder}
	 */
	public static Builder builder() {
		return new BuilderImpl();
	}

	/** Immutable {@link Match} implementation with value semantics. */
	private static class MatchResult implements Match {

		private final String match;

		private final int offset;

		public MatchResult(String match, int offset) {
			this.match= match;
			this.offset= offset;
		}

		@Override
		public String getText() {
			return match;
		}

		@Override
		public int getOffset() {
			return offset;
		}

		@Override
		public int hashCode() {
			return Objects.hashCode(match) * 31 + Integer.hashCode(offset);
		}

		@Override
		public boolean equals(Object obj) {
			if (this == obj) {
				return true;
			}
			if (obj == null || getClass() != obj.getClass()) {
				return false;
			}
			MatchResult other= (MatchResult) obj;
			return offset == other.offset && Objects.equals(match, other.match);
		}

		@Override
		public String toString() {
			return '[' + match + ", " + offset + ']'; //$NON-NLS-1$
		}
	}

	/** A node in the trie built from the search strings. */
	private static class Node {

		/** Child nodes keyed by character; lazily created, {@code null} while the node is a leaf. */
		Map<Character, Node> children;

		/** The search string ending at this node, or {@code null} if none ends here. */
		String match;

		/** Failure link, set by {@link MultiStringMatcher#buildLinks()}. */
		Node fail;

		/** Output link to a node with a non-{@code null} {@link #match}, or {@code null}. */
		Node output;

		/** Distance from the root, i.e., the length of the prefix this node represents. */
		final int depth;

		Node(int depth) {
			this.depth= depth;
		}

		/**
		 * Obtains the child for the given character.
		 *
		 * @param c character to follow
		 * @return the child node, or {@code null} if there is none
		 */
		Node next(Character c) {
			return children == null ? null : children.get(c);
		}

		/**
		 * Obtains the child for the given character, creating it if it doesn't exist yet.
		 *
		 * @param c character to add
		 * @return the existing or newly created child node
		 */
		Node add(char c) {
			if (children == null) {
				children= new HashMap<>();
			}
			return children.computeIfAbsent(Character.valueOf(c), key -> new Node(depth + 1));
		}

		boolean hasChildren() {
			return children != null;
		}

		@Override
		public String toString() {
			return "[depth=" + depth + ", match=" + match //$NON-NLS-1$ //$NON-NLS-2$
					+ ", children=" //$NON-NLS-1$
					+ (children == null
							? "" //$NON-NLS-1$
							: children.keySet().stream().map(c -> c.toString()).collect(Collectors.joining(", "))) //$NON-NLS-1$
					+ ']';
		}
	}

	/** Root node of the trie. */
	private final Node root= new Node(0) {

		@Override
		Node next(Character c) {
			// Implements the sentinel loop on the root node for all non-matching characters.
			Node child= super.next(c);
			return child == null ? this : child;
		}
	};

	private MultiStringMatcher() {
		// Always use a Builder or the static helper methods to create a MultiStringMatcher
	}

	/**
	 * Inserts the given strings into the trie. {@code null} and empty strings, as well as a
	 * {@code null} array, are ignored.
	 *
	 * @param searchStrings to insert
	 */
	private void add(String... searchStrings) {
		if (searchStrings != null) {
			for (String searchString : searchStrings) {
				if (searchString == null || searchString.isEmpty()) {
					continue;
				}
				Node node= root;
				for (char c : searchString.toCharArray()) {
					node= node.add(c);
				}
				node.match= searchString;
			}
		}
	}

	/**
	 * Computes the fail and output links of all trie nodes. Must only be called when the root has
	 * at least one child.
	 */
	private void buildLinks() {
		// Build the fail and output links. See the paper referenced at the top; this
		// is a one-to-one implementation of the original algorithm. Variable names
		// s, r, and state are kept as in the paper.
		List<Node> queue= new LinkedList<>();
		for (Node s : root.children.values()) {
			if (s.hasChildren()) {
				// No need to queue nodes without children since we don't do anything
				// with them anyway.
				queue.add(s);
			}
			s.fail= root;
		}
		while (!queue.isEmpty()) {
			Node r= queue.remove(0);
			for (Map.Entry<Character, Node> entry : r.children.entrySet()) {
				Character c= entry.getKey();
				Node s= entry.getValue();
				if (s.hasChildren()) {
					queue.add(s);
				}
				Node state= r.fail;
				Node f;
				// Terminates at the latest at the root, whose next() never returns null.
				while ((f= state.next(c)) == null) {
					state= state.fail;
				}
				s.fail= f;
				if (f.match != null) {
					s.output= f;
				} else if (f.output != null) {
					s.output= f.output;
				}
			}
		}
	}

	/**
	 * Finds all occurrences of any of the search strings of the {@link MultiStringMatcher} in the
	 * given {@code text} starting at the given {@code offset}, including overlapping occurrences.
	 *
	 * @param text to search (not {@code null})
	 * @param offset to start searching at
	 * @param matches {@link Consumer} all matches are fed to
	 *
	 * @since 3.10
	 */
	public void find(CharSequence text, int offset, Consumer<Match> matches) {
		// Main search loop of the standard Aho-Corasick algorithm.
		int textEnd= text.length();
		Node node= root;
		for (int i= offset; i < textEnd; i++) {
			Character c= Character.valueOf(text.charAt(i));
			Node next;
			while ((next= node.next(c)) == null) {
				node= node.fail;
			}
			node= next;
			if (node.match != null) {
				matches.accept(new MatchResult(node.match, i - node.depth + 1));
			}
			// Report all shorter search strings that also end at position i.
			Node out= node.output;
			while (out != null) {
				matches.accept(new MatchResult(out.match, i - out.depth + 1));
				out= out.output;
			}
		}
	}

	/**
	 * Finds all occurrences of any of the search strings of the {@link MultiStringMatcher} in the
	 * given {@code text} starting at the given {@code offset}, including overlapping occurrences.
	 *
	 * @param text to search (not {@code null})
	 * @param offset to start searching at
	 * @return a possibly empty list of matches
	 */
	public List<Match> find(CharSequence text, int offset) {
		List<Match> matches= new LinkedList<>();
		find(text, offset, matches::add);
		return matches;
	}

	/**
	 * Find the next occurrence of any of the search strings of the {@link MultiStringMatcher} in
	 * the given {@code text} starting at the given {@code offset}.
	 * <p>
	 * Performs a simultaneous search for all the strings, returning the leftmost match. If multiple
	 * search strings match at the same index, the longest match is returned.
	 * </p>
	 *
	 * @param text to search (not {@code null})
	 * @param offset to start searching at
	 * @return the leftmost longest match found, or {@code null} if no match was found.
	 */
	public Match indexOf(CharSequence text, int offset) {
		// Main search loop of the Aho-Corasick algorithm, modified to stop after
		// the leftmost longest match.
		//
		// To find a match, we pursue a primary goal (lowest offset) and a secondary goal
		// (longest match). We differentiate between primary and sub-matches. Matching starts
		// by walking down one path of the trie. Any match we find on this path is a primary
		// match and any new primary match is better than the one before. A sub-match is a
		// matching prefix of a suffix of the text currently scanned along the path. These
		// sub-matches occur on paths off the one we're currently following, and they are
		// linked in the trie via the output links. Their offset is always greater than that
		// of a primary match, and sub-matches further into the 'output' chain are shorter.
		// Therefore we are interested only in the first such sub-match. While walking down
		// a path, sub-matches off this path are not found in offset order, so we have to
		// check whether a new sub-match is better (lower offset, or longer) than a previously
		// found sub-match.
		//
		// When we can't continue matching on the current path, the algorithm uses the fail
		// links to try to find an alternate path to match (which would match a suffix of
		// what was traversed so far). Therefore, if we already had a primary match, it is
		// returned, since any other match must have a higher offset. If there is no alternate
		// path, we fall off the trie (the algorithm brings us back to root, and would start
		// again from the top). If we have any match, we may stop and return it. If we _do_
		// change to an alternate path but there's a sub-match with a lower offset, we also
		// may return that. Otherwise we continue normally on the new path.
		int textEnd= text.length();
		Match primaryMatch= null;
		Match subMatch= null;
		Node node= root;
		for (int i= offset; i < textEnd; i++) {
			Character c= Character.valueOf(text.charAt(i));
			Node next= node.next(c);
			if (next == null) {
				// Can't continue on this path.
				if (primaryMatch != null) {
					// Return primary match because any other match must have a higher offset.
					return primaryMatch;
				}
				// Search for another path to continue matching.
				do {
					node= node.fail;
				} while ((next= node.next(c)) == null);
				if (subMatch != null) {
					if (next == root) {
						// We fell off the trie and could not switch to another. Return the best
						// sub-match.
						return subMatch;
					} else if (subMatch.getOffset() < i - node.depth) {
						// The new path starts at i - node.depth == i - next.depth + 1, so if a
						// sub-match is earlier, we may return it. Any primary match on this path
						// or on any other path we might switch to later on will have a higher
						// offset, and so will any sub-matches we might discover on these paths.
						return subMatch;
					}
				}
			}
			node= next;
			if (node.match != null) {
				// Any new primary match is better because all have the same offset but any new one
				// must be longer. An existing sub-match from a previous path is checked above.
				primaryMatch= new MatchResult(node.match, i - node.depth + 1);
				if (!node.hasChildren()) {
					// We will fall off the trie on the next character, so we can return right here.
					return primaryMatch;
				}
			}
			// Check for sub matches but only if there is no primary match because only another
			// primary match can be better.
			if (primaryMatch == null) {
				Node out= node.output;
				if (out != null) {
					int newOffset= i - out.depth + 1;
					if (subMatch == null
							|| newOffset < subMatch.getOffset()
							|| (newOffset == subMatch.getOffset() && out.depth > subMatch.getText().length())) {
						subMatch= new MatchResult(out.match, newOffset);
					}
				}
			}
		}
		return primaryMatch != null ? primaryMatch : subMatch;
	}

	/**
	 * Finds the leftmost longest occurrence of any of the given {@code searchStrings} in the
	 * {@code text} starting at the given {@code offset}.
	 * <p>
	 * To match the same set of search strings repeatedly against texts it is more efficient to
	 * build and re-use a {@link MultiStringMatcher}.
	 * </p>
	 *
	 * @param text to search (not {@code null})
	 * @param offset to start searching at
	 * @param searchStrings to look for; {@code null} and empty strings are ignored
	 * @return a {@link Match} describing the match found, or {@code null} if no match was found or
	 *         there are no non-{@code null} non-empty {@code searchStrings}
	 */
	public static Match indexOf(CharSequence text, int offset, String... searchStrings) {
		return create(searchStrings).indexOf(text, offset);
	}

	/**
	 * Creates a {@link MultiStringMatcher} for the given {@code searchStrings}.
	 * <p>
	 * If there are no non-{@code null} non-empty {@code searchStrings}, the returned
	 * {@link MultiStringMatcher} will never match anything.
	 * </p>
	 *
	 * @param searchStrings to look for; {@code null} and empty strings are ignored
	 * @return the {@link MultiStringMatcher}
	 */
	public static MultiStringMatcher create(String... searchStrings) {
		return builder().add(searchStrings).build();
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy