All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.washington.cs.knowitall.regex.FiniteAutomaton Maven / Gradle / Ivy

package edu.washington.cs.knowitall.regex;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;

import com.google.common.base.Predicate;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;

import edu.washington.cs.knowitall.regex.Expression.AssertionExpression;
import edu.washington.cs.knowitall.regex.Expression.MatchingGroup;

/**
 * A finite automaton implementation.  There is support for epsilon
 * transitions (NFA) but if those are omitted then this works as an
 * implementation of a DFA.
 *
 * @author Michael Schmitz 
 */
public class FiniteAutomaton {
    /**
     * A component automaton with a single start state and a single end
     * state.
     * @author Michael Schmitz 
     *
     * @param 
     */
    public static class Automaton {
        public final StartState start;
        public final EndState end;

        public Automaton(StartState start, EndState end) {
            this.start = start;
            this.end = end;
        }

        public Automaton(Expression expr) {
            this.start = new StartState(expr);
            this.end = new EndState(expr);
        }

        public boolean apply(List tokens) {
            return this.evaluate(tokens, true) != null;
        }

        public int minMatchingLength() {
            return start.minMatchingLength();
        }

        public Match.FinalMatch lookingAt(List tokens) {
            return lookingAt(tokens, 0);
        }

        /**
         * @return null if no match, otherwise a representation of the match
         */
        public Match.FinalMatch lookingAt(List tokens, int startIndex) {
            if (tokens.size() - startIndex - this.minMatchingLength() < 0) {
                // don't try if we can't possible match
                return null;
            }
            else {
                List sublist = tokens.subList(startIndex, tokens.size());

                Step path = this.evaluate(sublist, startIndex == 0);
                if (path == null) {
                    return null;
                }

                // build list of edges
                List> edges = new ArrayList>();
                while (path.state != this.start) {
                    edges.add(path.path);
                    path = path.prev;
                }

                Match.IntermediateMatch match = new Match.IntermediateMatch();
                buildMatch(sublist.iterator(), null, new AtomicInteger(startIndex), this.start,
                           Lists.reverse(edges).iterator(), match);
                return new Match.FinalMatch(match);
            }
        }

        /**
         * Retrace the path through the NFA and produce an object that
         * represents the match.
         * @param tokenIterator an iterator over the tokens.
         * @param expression the expression to match.
         * @param index the present index.
         * @param state the present state.
         * @param edgeIterator an iterator over the edges in the solution.
         * @param match the solution.
         * @return
         */
        private State buildMatch(Iterator tokenIterator, Expression expression,
                AtomicInteger index, State state, Iterator> edgeIterator,
                Match.IntermediateMatch match) {

            Match.IntermediateMatch newMatch = new Match.IntermediateMatch();

            while (edgeIterator.hasNext() && !((state instanceof EndState)
                   && ((EndState)state).expression == expression)) {

                AbstractEdge edge = edgeIterator.next();

                // run the sub-automaton
                if (edge instanceof Edge
                    && !(((Edge) edge).expression instanceof AssertionExpression)) {
                    // consume a token, this is the base case
                    E token = tokenIterator.next();
                    newMatch.add(((Edge)edge).expression, token, index.getAndIncrement());

                    state = edge.dest;
                }
                else if (state instanceof StartState) {
                    // recurse on StartState so we have a group for that match
                    Expression expr = ((StartState)state).expression;
                    state = buildMatch(tokenIterator, expr, index, edge.dest, edgeIterator, newMatch);
                    assert(state instanceof EndState && ((EndState)state).expression == expr);
                }
                else {
                    assert(edge instanceof Epsilon);
                    state = edge.dest;
                }
            }

            // add the sub match group
            if (expression != null
                && (!newMatch.isEmpty() || expression instanceof MatchingGroup)) {
                // create a wrapper for the expressions it matched
                Match.Group pair = new Match.Group(expression);
                for (Match.Group p : newMatch.pairs()) {
                    if (p.expr instanceof Expression.BaseExpression) {
                        pair.addTokens(p);
                    }
                }

                // add it
                match.add(pair);
            }

            // add the contents of the sub match group
            match.addAll(newMatch.pairs());

            return state;
        }

        /**
         * A representation of a movement from a state to another, with a
         * backreference to the previous state.  This is used in building
         * a match object once a solution has been found.
         * @author Michael Schmitz 
         *
         * @param 
         */
        private static class Step {
            public final State state;
            public final Step prev;
            public final AbstractEdge path;

            public Step(State state) {
                this(state, null, null);
            }

            public Step(State state, Step prev, AbstractEdge path) {
                this.state = state;
                this.prev = prev;
                this.path = path;
            }

            public String toString() {
                return this.state.toString();
            }
        }

        /**
         * Expand all epsilon transitions for the supplied steps.  That is,
         * add all states available via an epsilon transition from a supplied
         * state to the list.
         * @param steps
         */
        private void expandEpsilons(List> steps) {
            int size = steps.size();
            for (int i = 0; i < size; i++) {
                Step step = steps.get(i);

                expandEpsilon(step, steps);
            }
        }

        /**
         * Expand all epsilon transitions for the specified step.  That is,
         * add all states avaiable via an epsilon transition from step.state.
         * @param step
         * @param steps
         */
        private void expandEpsilon(Step step, List> steps) {
            // loop over edges
            for (final Epsilon edge : step.state.epsilons) {

                // try free edges if they do not lead to an existing
                // step
                if (!Iterables.any(steps,
                                new Predicate>() {
                                    @Override
                                    public boolean apply(Step step) {
                                        return step.state == edge.dest;
                                    }
                                })) {
                    Step newstep = new Step(edge.dest, step, edge);
                    steps.add(newstep);
                    expandEpsilon(newstep, steps);
                }
            }
        }

        /**
         * Expand any state that has an assertion edge if the assertion passes
         * given the present state.
         * @param steps
         * @param newsteps
         * @param hasStart true iff the tokens contains the start token.
         * @param tokens
         * @param totalTokens
         */
        private void expandAssertions(List> steps, List> newsteps, boolean hasStart,
                                      List tokens, int totalTokens) {
            for (Step step : steps) {
                for (final Edge edge : step.state.edges) {
                    if (edge.expression instanceof AssertionExpression) {
                        AssertionExpression assertion = (AssertionExpression)edge.expression;

                        if (assertion.apply(hasStart, tokens, totalTokens)) {
                            newsteps.add(new Step(edge.dest, step, edge));
                        }
                    }
                }
            }
        }

        private Step evaluate(List tokens, boolean hasStart) {
            List> steps = new ArrayList>();
            steps.add(new Step(this.start));
            return evaluate(tokens, steps, hasStart);
        }

        /**
         * Evaluate the NFA against the list of tokens using the Thompson NFA
         * algorithm.
         * @param tokens the tokens to evaluate against
         * @param steps present list of accessible states.
         * @param hasStart true iff tokens contains the start token.
         * @return a Step object representing the last transition or null.
         */
        private Step evaluate(List tokens, List> steps, boolean hasStart) {
            int totalTokens = tokens.size();

            int solutionTokensLeft = totalTokens;
            Step solution = null;
            while (!steps.isEmpty()) {

                expandEpsilons(steps);

                List> intermediate = new ArrayList>(steps);
                List> newsteps = new ArrayList>(steps.size() * 2);
                do {

                    // check if at end
                    for (Step step : intermediate) {
                        if (step.state == this.end) {
                            if (tokens.size() == totalTokens) {
                                // can't succeed if no tokens are consumed
                            }
                            else {
                                // we have reached the end
                                if (tokens.size() < solutionTokensLeft) {
                                    solution = step;
                                    solutionTokensLeft = tokens.size();
                                }
                            }
                        }
                    }

                    // handle assertions
                    newsteps.clear();
                    expandAssertions(intermediate, newsteps, hasStart, tokens, totalTokens);
                    expandEpsilons(newsteps);

                    intermediate.clear();
                    intermediate.addAll(newsteps);

                    steps.addAll(newsteps);
                } while (newsteps.size() > 0);

                newsteps.clear();
                if (!tokens.isEmpty()) {
                    for (Step step : steps) {
                        for (final Edge edge : step.state.edges) {
                            // try other edges if they match the current token
                            if (edge.apply(tokens.get(0))) {
                                newsteps.add(new Step(edge.dest, step, edge));
                            }
                        }
                    }

                    // consume a token
                    tokens = tokens.subList(1, tokens.size());
                }

                steps = newsteps;
            }

            return solution;
        }
    }

    /**
     * Representation of a state in the automaton.
     * @author Michael Schmitz 
     *
     * @param 
     */
    public static class State {
        public final List> edges = new ArrayList>();
        public final List> epsilons = new ArrayList>();

        /**
         * Add an epsilon transition between this state and dest.
         * @param dest the state to connect
         */
        public void connect(State dest) {
            this.epsilons.add(new Epsilon(dest));
        }

        /**
         * Add an edge between this state and dest.
         * @param dest the state to connect
         * @param cost the expression of the edge
         */
        public void connect(State dest, Expression cost) {
            this.edges.add(new Edge(dest, cost));
        }

        public String toString() {
            return this.getClass().getSimpleName() + ":" + this.edges.size();
        }
    }

    /**
     * A start or end state.
     * @author Michael Schmitz 
     *
     * @param 
     */
    public static class TerminusState extends State {
        public final Expression expression;
        public TerminusState(Expression expression) {
            super();
            this.expression = expression;
        }

        public String toString() {
            return this.getClass().getSimpleName()
                   + "("+this.expression.toString()+"):" + this.edges.size();
        }
    }

    /**
     * A start state.
     * @author Michael Schmitz
     *
     * @param <E> the type of the tokens the automaton is evaluated against
     */
    public static class StartState<E> extends TerminusState<E> {
        /**
         * @param expression the expression this state is tagged with
         */
        public StartState(Expression<E> expression) {
            super(expression);
        }

        /**
         * @return the minimum matching length reported by this state's
         * expression
         */
        public int minMatchingLength() {
            return this.expression.minMatchingLength();
        }
    }

    /**
     * An end state.
     * @author Michael Schmitz
     *
     * @param <E> the type of the tokens the automaton is evaluated against
     */
    public static class EndState<E> extends TerminusState<E> {
        /**
         * @param expression the expression this state is tagged with
         */
        public EndState(Expression<E> expression) {
            super(expression);
        }
    }

    /**
     * An abstract representation of an edge.  An edge is a predicate over
     * tokens and leads to a destination state.
     * @author Michael Schmitz
     *
     * @param <E> the type of the tokens the edge is applied to
     */
    public static abstract class AbstractEdge<E> implements Predicate<E> {
        /** The state this edge leads to. */
        public final State<E> dest;

        /**
         * @param dest the state this edge leads to
         */
        public AbstractEdge(State<E> dest) {
            this.dest = dest;
        }
    }

    /**
     * An edge with cost {@code expression}.
     * @author Michael Schmitz
     *
     * @param <E> the type of the tokens the edge is applied to
     */
    public static class Edge<E> extends AbstractEdge<E> {
        /** The expression of this edge; {@code apply} treats null as "matches anything". */
        public final Expression<E> expression;

        /**
         * @param dest the state this edge leads to
         * @param base the expression of the edge
         */
        public Edge(State<E> dest, Expression<E> base) {
            super(dest);
            this.expression = base;
        }

        /**
         * @return the expression in parentheses followed by the destination
         */
        @Override
        public String toString() {
            // string concatenation instead of expression.toString(): apply()
            // accepts a null expression, so toString() must not throw on one
            return "(" + this.expression + ") -> " + this.dest.toString();
        }

        /**
         * @param entity the token to test
         * @return true if there is no expression, otherwise the result of
         * applying the expression to {@code entity}
         */
        @Override
        public boolean apply(E entity) {
            if (expression == null) {
                return true;
            }
            else {
                return expression.apply(entity);
            }
        }
    }

    /**
     * An edge without cost, an epsilon transition.
     * @author Michael Schmitz
     *
     * @param <E> the type of the tokens the edge is applied to
     */
    public static class Epsilon<E> extends AbstractEdge<E> {
        /**
         * @param dest the state this transition leads to
         */
        public Epsilon(State<E> dest) {
            super(dest);
        }

        /**
         * @return the literal {@code (epsilon)} followed by the destination
         */
        @Override
        public String toString() {
            return "(epsilon) -> " + dest.toString();
        }

        /**
         * @param entity ignored
         * @return always true
         */
        @Override
        public boolean apply(E entity) {
            return true;
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy