All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.squareup.trex.SequencePattern Maven / Gradle / Ivy

package com.squareup.trex;

import java.util.List;
import java.util.NoSuchElementException;
import java.util.PrimitiveIterator;
import java.util.Stack;
import java.util.function.Consumer;

/**
 * A snapshot of one decision point in the backtracking search.
 *
 * <p>Whenever the search commits to one alternative while others remain
 * open, the information needed to come back and try the remaining
 * alternatives is captured in an instance of this class.</p>
 *
 * @author Gabor Angeli
 */
class SequencePatternBranchState {
  /**
   * The remaining candidate matches at this decision point. The iterator
   * was started at {@link #index}, after {@link #matchCount} components
   * had already been matched.
   */
  final PrimitiveIterator.OfInt matches;

  /**
   * The input position reached before this decision point; the position
   * from which {@link #matches} was produced.
   */
  final int index;

  /**
   * How many components had been matched when this state was saved.
   */
  final int matchCount;

  /**
   * Capture a decision point so that it can be resumed later.
   *
   * @param index the value stored in {@link #index}
   * @param matches the value stored in {@link #matches}
   * @param matchCount the value stored in {@link #matchCount}
   */
  SequencePatternBranchState(int index, PrimitiveIterator.OfInt matches, int matchCount) {
    this.matches = matches;
    this.matchCount = matchCount;
    this.index = index;
  }
}

/**
 * The implementation of the iterator for matching an arbitrary
 * sequence of components. The meat of the work for a
 * {@link SequencePattern} is done in this class.
 *
 * @author Gabor Angeli
 */
class SequencePatternIterator implements PrimitiveIterator.OfInt {
  /**
   * The state of our iterator at the moment. This is stored in the variable
   * {@link #state} and updated as we run the various iterator functions.
   */
  enum State {
    NEVER_PRIMED,
    HAVE_MATCH,
    BRANCH_EXHAUSTED,
    ITERATOR_EXHAUSTED
  }

  /**
   * We are a sequence of either a single component repeated or a list of components.
   * If it's a single component repeated, then this stores that component. Otherwise,
   * the list of components is stored in {@link #components}. Exactly one of these two
   * fields must be non-null.
   */
  /* @Nullable */ private final Pattern component;

  /**
   * We are a sequence of either a single component repeated or a list of components.
   * If it's a single component repeated, the component is stored in {@link #component}.
   * Otherwise, this field stores the list of components we need to match. Exactly one
   * of these two fields must be non-null.
   */
  /* @Nullable */ private final List components;

  /**
   * The minimum number of components we need to match, inclusive of this number.
   * This is either the minimum repeat count, or 0 if we have a {@linkplain #components
   * components list}.
   */
  public final int minCountInclusive;

  /**
   * The maximum number of components we need to match, inclusive of this number.
   * This is either the maximum repeat count, or the length of our {@linkplain #components
   * components list}.
   */
  public final int maxCountInclusive;

  /**
   * The input we are matching with this iterator.
   */
  public final List input;

  /**
   * The matcher providing the context for our match. This is primarily used to define
   * our timeouts.
   */
  public final Matcher context;

  /**
   * If true, iterate over matches as a reluctant quantifier. That is,
   * return shorter matches before longer ones. This is the case for patterns
   * like 'token*?'.
   */
  public final boolean isReluctant;

  /**
   * The stack for our backtracking search. Each element of the stack defines a decision
   * point where we had a potentially valid partial match (or completely valid exact match)
   * and chose one path instead of the other. This is, effectively, a mildly optimized version
   * of a vanilla depth-first-search stack.
   */
  /* @Nullable */
  private Stack branchStack = null;  // avoid needless object allocation

  /**
   * The index into our input sequence when we started matching this pattern.
   */
  public final int initialIndex;

  /**
   * The pattern this iterator was made from. Used for registering capture group
   * matches with the matcher.
   */
  public final SequencePattern sourcePattern;

  /**
   * The next index to return from the iterator. This is only a valid value if the state
   * is in {@link State#HAVE_MATCH}. Otherwise, this is a stale index and should not be
   * returned.
   */
  private int nextIndex;

  /**
   * The current state of the iterator. This is updated directly in the various methods
   * in this class, and each method enforces some invariant on what the state will be after
   * the method returns.
   */
  private State state = State.NEVER_PRIMED;

  /**
   * A variable used for a reluctant quantifier to store the current index for the
   * backtracking search. This can be thought of as the {@link SequencePatternBranchState#index}
   * field of a dummy next state.
   */
  private int reluctantIndex;

  /**
   * A variable used for a reluctant quantifier to store the current match count for the
   * backtracking search. This can be thought of as the
   * {@link SequencePatternBranchState#matchCount} field of a dummy next state.
   */
  private int reluctantMatchCount;

  /** A straightforward constructor for the fields of the iterator */
  SequencePatternIterator(
      /* @Nullable */ Pattern component,
      /* @Nullable */ List components,
      int minCountInclusive, int maxCountInclusive,
      boolean isReluctant,
      List input, int initialIndex,
      SequencePattern sourcePattern,
      Matcher context) {
    assert (component == null || components == null) && !(component == null && components == null)
        : "Must have exactly one component type";
    this.component = component;
    this.components = components;
    this.minCountInclusive = minCountInclusive;
    this.maxCountInclusive = maxCountInclusive;
    this.isReluctant = isReluctant;
    this.input = input;
    this.initialIndex = initialIndex;
    this.nextIndex = initialIndex;
    this.sourcePattern = sourcePattern;
    this.context = context;
    this.reluctantIndex = initialIndex;
    this.reluctantMatchCount = 0;
  }

  /**
   * 

* Run all the way down a given branch of our backtracking search, attempting * to create the greediest match on that branch. This stores all of the branches * it did not take in {@link #branchStack} so that we can return to them later, * but is not itself responsible for running the backtracking. It simply reports * whether or not this branch was a successful match. *

* *

* This function is guaranteed to leave us in a state of either * {@link State#HAVE_MATCH} or {@link State#BRANCH_EXHAUSTED}. * The former if we have a match on this branch of the search, * and the latter if we do not have a match, because we matched * too few or too many words. *

* * @param index The start index from which we are starting our match. * This is an index into the tokens list we are matching. * @param initialComponentsConsumed The number of components we have already consumed. * This is an index into our components list, or the * number of times we've repeated our component. */ private void primeBranchEager(int index, int initialComponentsConsumed) { // Checking for timeouts is actually a moderately expensive operation, // on the order of a few dozen ns per invocation. Putting the check here // was chosen as a tradeoff between checking frequently enough to be useful, // but not so frequently as to incur a noticeable slowdown. This function is // chosen as a compromise between being called frequently enough to be useful, // but not so frequently that we slow down execution too much. if (this.context.timeoutExceeded()) { throw new RuntimeException("Timout exceeded for pattern match"); } int numComponentsConsumed; for (numComponentsConsumed = initialComponentsConsumed; // note: '<= input.size()' to allow for terminal 0-length matches numComponentsConsumed < maxCountInclusive && index <= input.size(); ++numComponentsConsumed) { PrimitiveIterator.OfInt match = (components == null ? component : components.get(numComponentsConsumed)) .consume(input, index, context); if (match.hasNext()) { // We have a match at this depth int indexBeforeMatch = index; index = match.nextInt(); if (!(match instanceof SingleValueIterator)) { // We may have more matches. 
Push the iterator to the stack if (branchStack == null) { branchStack = new Stack<>(); } branchStack.push(new SequencePatternBranchState(indexBeforeMatch, match, numComponentsConsumed)); } else if (numComponentsConsumed >= minCountInclusive) { // Save this state as a valid state we can come back to, // if we want to match fewer than our max number of matches if (branchStack == null) { branchStack = new Stack<>(); } branchStack.push(new SequencePatternBranchState(indexBeforeMatch, SingleValueIterator.EMPTY, numComponentsConsumed)); } // Update the index if appropriate // Note the '+ 1', since we just matched a token not yet // represented in |matchCount| if (numComponentsConsumed + 1 >= minCountInclusive) { // Set our next index to the most greedy match so far this.nextIndex = index; } } else { break; } } // If we didn't match enough tokens, or matched too many, // we don't have a match if (numComponentsConsumed < this.minCountInclusive || numComponentsConsumed > this.maxCountInclusive) { state = State.BRANCH_EXHAUSTED; } else { state = State.HAVE_MATCH; } } /** *

* Run our backtracking search to find the next match to return. *

* *

* This will always leave our state in one of {@link State#HAVE_MATCH} * or {@link State#ITERATOR_EXHAUSTED} *

* * @see #primeReluctant(), the reluctant variant of the backtracking search. */ private void primeEager() { // If we've never been primed, prime our depth stack if (state == State.NEVER_PRIMED) { primeBranchEager(this.nextIndex, 0); } assert state != State.NEVER_PRIMED : "We should have primed ourselves axiomatically by now"; // Run our backtracking search until we find a match while (state == State.BRANCH_EXHAUSTED && // only run while we don't have a match branchStack != null && // if we never had more options, stop searching !branchStack.isEmpty() // if we're out of options, stop searching ) { SequencePatternBranchState branchState = this.branchStack.pop(); // Get the match if (branchState.matches.hasNext()) { // There are more matches we can make at this match count. // Consume the next element from the iterator this.nextIndex = branchState.matches.nextInt(); assert this.nextIndex >= 0 : "Our iterator should never return an invalid match value"; // Push ourselves back on the stack, if this remains a valid state. // This remains a valid state if either: // 1. There are more matches in the iterator // 2. Not matching is a valid option at this state if (!(branchState.matches instanceof SingleValueIterator)) { this.branchStack.push(branchState); } // Re-prime the stack, if we're valid so far primeBranchEager(this.nextIndex, branchState.matchCount + 1); } else if (branchState.matchCount >= minCountInclusive) { // We should only get here if we're allowed to simply not match this element // That is, this is the only situation when empty iterators are pushed back // on th stack (see case 2 above) this.nextIndex = branchState.index; state = State.HAVE_MATCH; } // otherwise, continue searching } // Promote complete branch exhaustion to iterator exhaustion. // If our last branch failed to match, it means the whole // iterator has failed to match as well. if (state == State.BRANCH_EXHAUSTED) { state = State.ITERATOR_EXHAUSTED; } } /** *

* Run our backtracking search to find the next match to return. This runs the search * as a reluctant search, returning shorter matches before longer ones. * This function is also responsible for checking on the timeouts, as the backtracking * search is likely the slow part of any match. *

* *

* This will always leave our state in one of {@link State#HAVE_MATCH} * or {@link State#ITERATOR_EXHAUSTED} *

* * @see #primeEager(), the eager variant of the backtracking search. */ @SuppressWarnings("fallthrough") private void primeReluctant() { // Check for timeouts if (this.context.timeoutExceeded()) { throw new RuntimeException("Timout exceeded for pattern match"); } switch (state) { case NEVER_PRIMED: if (minCountInclusive == 0) { // Special case: we're allowed to match nothing. // note that nextIndex is already set to the correct value. state = State.HAVE_MATCH; break; } state = State.BRANCH_EXHAUSTED; // fall through case BRANCH_EXHAUSTED: // State when we need to re-prime int index = reluctantIndex; int numComponentsConsumed = reluctantMatchCount; // Base case is either we've consumed too much or we're out of tokens // This is the inverse condition in primeBranchEager for continuing the loop // If it triggers, it means we have nothing left to consume. // note: '<= input.size()' to allow for terminal 0-length matches // This branch is taken if the base case is not met if (numComponentsConsumed < maxCountInclusive && index <= input.size()) { // Recursive case: try to match PrimitiveIterator.OfInt match = (components == null ? component : components.get(numComponentsConsumed)) .consume(input, index, context); if (match.hasNext()) { // We have a match at this depth reluctantIndex = match.nextInt(); reluctantMatchCount += 1; // Save our state for backtracking, since we have more options to match here if (!(match instanceof SingleValueIterator)) { if (branchStack == null) { branchStack = new Stack<>(); } branchStack.push(new SequencePatternBranchState(index, match, numComponentsConsumed)); } // Check if we have a match if (reluctantMatchCount >= minCountInclusive) { // We've matched enough times; mark ourselves as matching this.state = State.HAVE_MATCH; this.nextIndex = reluctantIndex; } else { // We haven't matched enough times; recurse // Only recurse if we've either (1) made forward progress, or // (2) have a different token to try next. 
if (reluctantIndex > index || components != null) { primeReluctant(); } } } } break; default: case HAVE_MATCH: case ITERATOR_EXHAUSTED: break; } // Check if we can backtrack if (state == State.BRANCH_EXHAUSTED && branchStack != null && !branchStack.isEmpty()) { // We have a saved state we can continue from. // Restore ourselves to the saved state SequencePatternBranchState savedState = branchStack.pop(); if (savedState.matches.hasNext()) { reluctantIndex = savedState.matches.nextInt(); reluctantMatchCount = savedState.matchCount + 1; if (savedState.matches.hasNext()) { branchStack.push(savedState); } // Check if this is an immediate success if (reluctantMatchCount >= minCountInclusive) { this.state = State.HAVE_MATCH; this.nextIndex = reluctantIndex; } else { // If it's not an immediate success, continue primeReluctant(); } } else { // Skip over empty iterators while backtracking primeReluctant(); } } else if (state == State.BRANCH_EXHAUSTED) { // We're out of options state = State.ITERATOR_EXHAUSTED; } } /** {@inheritDoc} */ @Override public boolean hasNext() { switch (state) { case HAVE_MATCH: return true; case ITERATOR_EXHAUSTED: return false; default: if (isReluctant) { primeReluctant(); } else { primeEager(); } assert state == State.HAVE_MATCH || state == State.ITERATOR_EXHAUSTED : "By the invariant of the prime() method, we should be in one of these states"; // Note: by the invariant assertion above, we cannot infinite loop on this method return hasNext(); } } /** {@inheritDoc} */ @Override public int nextInt() { if (!hasNext()) { // note[gabor]: hasNext() primes the iterator if needed throw new NoSuchElementException(); } // Reset our priming state state = State.BRANCH_EXHAUSTED; // Register the match sourcePattern.registerMatch(initialIndex, nextIndex, context); // Return our next index return this.nextIndex; } } /** *

* A pattern that's composed of a sequence of other patterns. This is a unified * implementation of two cases of this: (1) where the sequence is a sequence of different * patterns (e.g., 'foo bar'), and (2) where this sequence is the same pattern repeated * a certain number of times (e.g., 'foo+'). *

* * @author Gabor Angeli */ class SequencePattern extends Pattern { /** * We are a sequence of either a single component repeated or a list of components. * If it's a single component repeated, then this stores that component. Otherwise, * the list of components is stored in {@link #components}. Exactly one of these two * fields must be non-null. */ /* @Nullable */ private final Pattern component; /** * We are a sequence of either a single component repeated or a list of components. * If it's a single component repeated, the component is stored in {@link #component}. * Otherwise, this field stores the list of components we need to match. Exactly one * of these two fields must be non-null. */ /* @Nullable */ private final List components; /** * The minimum number of components we need to match, inclusive of this number. * This is either the minimum repeat count, or 0 if we have a {@linkplain #components * components list}. */ public final int minCountInclusive; /** * The maximum number of components we need to match, inclusive of this number. * This is either the maximum repeat count, or the length of our {@linkplain #components * components list}. * This is set to {@link Integer#MAX_VALUE} if no upper bound is set (e.g., for the * operator). */ public final int maxCountInclusive; /** * If true, we try to return matches reluctantly -- that is, return the shortest * possible match first. By default, this is false and we match eagerly -- * returning the longest match first. */ public final boolean isReluctant; /** * Create a new sequence pattern. * * @param component See {@link #component}. Exactly one of this or |components| must be defined. * @param components See {@link #components}. Exactly one of this or |components| must be defined. * @param minCountInclusive See {@link #minCountInclusive}. * @param maxCountInclusive See {@link #maxCountInclusive}. * @param isReluctant See {@link #isReluctant}. 
*/ SequencePattern( /* @Nullable */ Pattern component, /* @Nullable */ List components, int minCountInclusive, int maxCountInclusive, boolean isReluctant) { assert (component == null || components == null) && !(component == null && components == null) : "Must have exactly one component type"; this.component = component; this.components = components; this.minCountInclusive = minCountInclusive; this.maxCountInclusive = maxCountInclusive; this.isReluctant = isReluctant; } /** {@inheritDoc} */ @Override protected PrimitiveIterator.OfInt consume(List input, int index, Matcher context) { if (components == null && component == null) { return SingleValueIterator.EMPTY; } return new SequencePatternIterator(component, components, minCountInclusive, maxCountInclusive, isReluctant, input, index, this, context); } /** {@inheritDoc} */ @Override protected void forEachComponent(Consumer fn) { if (this.component != null) { this.component.forEachComponent(fn); } if (this.components != null) { for (Pattern component : this.components) { component.forEachComponent(fn); } } fn.accept(this); } /** {@inheritDoc} */ @Override protected void populateToString(StringBuilder b) { if (components != null) { // This is a list of components for (int i = 0; i < components.size(); ++i) { if (i != 0) { b.append(' '); } b.append(components.get(i).toString()); } } else if (component == null) { // This case should be impossible b.append("???"); } else { // This is a repeated component if (minCountInclusive == 0 && maxCountInclusive == Integer.MAX_VALUE) { b.append(component.toString()).append('*'); } else if (minCountInclusive == 1 && maxCountInclusive == Integer.MAX_VALUE) { b.append(component.toString()).append('+'); } else if (minCountInclusive == 0 && maxCountInclusive == 1) { b.append(component.toString()).append('?'); } else if (maxCountInclusive == Integer.MAX_VALUE) { b.append(component.toString()) .append('{') .append(minCountInclusive) .append(',') .append('}'); } else if (minCountInclusive 
== maxCountInclusive) { b.append(component.toString()) .append('{') .append(minCountInclusive) .append('}'); } else { b.append(component.toString()) .append('{') .append(minCountInclusive) .append(',') .append(maxCountInclusive) .append('}'); } // Add the '?' to signify we're reluctant, if relevant if (isReluctant) { b.append('?'); } } } /** * A convenience function for implementing the star operator (e.g., foo*). */ static SequencePattern star(Pattern match, boolean isReluctant) { return new SequencePattern(match, null, 0, Integer.MAX_VALUE, isReluctant); } /** * A convenience function for implementing the plus operator (e.g., foo+). */ static SequencePattern plus(Pattern match, boolean isReluctant) { return new SequencePattern(match, null, 1, Integer.MAX_VALUE, isReluctant); } /** * A convenience function for implementing the question mark operator (e.g., foo?). */ static SequencePattern qmark(Pattern match, boolean isReluctant) { return new SequencePattern(match, null, 0, 1, isReluctant); } /** * A convenience function for matching a sequence of patterns (e.g., 'foo bar'). */ static SequencePattern sequence(List seq) { return new SequencePattern(null, seq, seq.size(), seq.size(), false /* is reluctant */); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy