All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.metafacture.commons.tries.SetMatcher Maven / Gradle / Ivy

The newest version!
/*
 * Copyright 2013, 2014 Deutsche Nationalbibliothek
 *
 * Licensed under the Apache License, Version 2.0 the "License";
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.metafacture.commons.tries;

import java.io.PrintStream;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Map.Entry;
import java.util.Queue;

/**
 * Implementation of the Aho-Corasick algorithm.
 *
 * @param  type of stored value
 * @author Markus Michael Geipel
 */
public final class SetMatcher {
    private final ACNode root = new ACNode<>(null, 0);
    private boolean isPrepared;

    /**
     * Creates an instance of {@link SetMatcher}.
     */
    public SetMatcher() {
    }

    /**
     * Adds a value for a key.
     *
     * @param key   the key
     * @param value the value
     */
    public void put(final String key, final T value) {
        if (isPrepared) {
            throw new IllegalStateException("keys cannot be added during matching.");
        }

        final int length = key.length();
        ACNode node = root;
        ACNode next;
        for (int i = 0; i < length - 1; ++i) {
            next = node.getNext(key.charAt(i));
            if (next == null) {
                next = node.addNext(key.charAt(i));
            }
            node = next;
        }
        next = node.getNext(key.charAt(length - 1));
        if (next == null) {
            next = node.addNext(key.charAt(length - 1), value);
        }
        else if (next.getValue() == null) {
            next.setValue(value);
        }
        else {
            throw new IllegalStateException("Key '" + key + "' already in trie");
        }
    }

    /**
     * Gets the List of Matches of a text.
     *
     * @param text the text
     * @return List of Matches
     */
    public List> match(final String text) {
        if (!isPrepared) {
            prepare();
            isPrepared = true;
        }
        final List> matches = new ArrayList>();

        ACNode node = root;
        final int length = text.length();
        int index = 0;

        while (index < length) {
            final ACNode next = node.getNext(text.charAt(index));
            if (next != null) {
                node = next;
            }
            else if (node != root) {
                node = node.getFailure();
                continue;
            }
            ++index;
            collectMatches(node, index, matches);
        }
        return matches;
    }

    private void collectMatches(final ACNode node, final int index, final List> matches) {
        //direct hit or hit in chain of failure links?
        ACNode tempNode = node;
        do {
            if (tempNode.getValue() != null) {
                matches.add(new Match(tempNode.getValue(), index - tempNode.getDepth(), tempNode.getDepth()));
            }
            tempNode = tempNode.getFailure();
        } while (tempNode != root);
    }

    private void prepare() {
        final Queue> queue = new LinkedList>();

        // prepare root
        root.setFailure(root);
        for (final ACNode child : root.getNext()) {
            child.setFailure(root);
            queue.add(child);
        }
        // prepare rest
        while (!queue.isEmpty()) {
            final ACNode parent = queue.poll();
            final ACNode parentFailure = parent.getFailure();

            for (final Entry> link : parent.getLinks()) {
                final char key = link.getKey().charValue();
                final ACNode child = link.getValue();
                ACNode node = parentFailure;

                while (node.getNext(key) == null && node != root) {
                    node = node.getFailure();
                }

                if (node.getNext(key) == null) {
                    child.setFailure(root);
                }
                else {
                    child.setFailure(node.getNext(key));
                }
                queue.add(child);
            }
        }
    }

    /**
     * Prints dot description of the automaton to the PrintStream for
     * visualization in GraphViz. Used for debugging and education.
     *
     * @param out the stream to which the description is written
     */
    public void printAutomaton(final PrintStream out) {
        out.println("digraph ahocorasick {");
        printDebug(out, root);
        out.println("}");
    }

    private void printDebug(final PrintStream out, final ACNode node) {
        if (node.getValue() == null) {
            out.println(node.hashCode() + " [shape=point label=\"\"]");
        }
        else {
            out.println(node.hashCode() + " [shape=circle style=filled label=\"\"]");
        }

        if (node.getFailure() != root) {
            out.println(node.hashCode() + " -> " + node.getFailure().hashCode() + "[color=gray]");
        }
        for (final Entry> link : node.getLinks()) {
            out.println(node.hashCode() + "  -> " + link.getValue().hashCode() + " [label=\"" + link.getKey() + "\"]");
            printDebug(out, link.getValue());
        }
    }

    /**
     * Describes a match.
     *
     * @param  type of the stored value
     */
    public static final class Match {
        private final T value;
        private final int start;
        private final int length;

        /**
         * Constructs a Match.
         *
         * @param value the value
         * @param start the position
         * @param length the length
         */
        public Match(final T value, final int start, final int length) {
            this.value = value;
            this.start = start;
            this.length = length;
        }

        /**
         * Gets the value.
         *
         * @return the value
         */
        public T getValue() {
            return value;
        }

        /**
         * Gets the start position.
         *
         * @return the start position
         */
        public int getStart() {
            return start;
        }

        /**
         * Gets the length.
         *
         * @return the length
         */
        public int getLength() {
            return length;
        }

        @Override
        public String toString() {
            return value + " " + start + "+" + length;
        }

    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy