All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.google.re2j.DFA Maven / Gradle / Ivy

The newest version!
// Copyright 2015 The RE2 Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Original RE2 source here:
// https://github.com/google/re2/blob/master/re2/dfa.cc

package com.google.re2j;

import com.google.re2j.RE2.MatchKind;

import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;

import static com.google.re2j.DFAState.DEAD_STATE;
import static com.google.re2j.Inst.Op.EMPTY_WIDTH;
import static com.google.re2j.MachineInput.EOF;
import static com.google.re2j.RE2.MatchKind.FIRST_MATCH;
import static com.google.re2j.RE2.MatchKind.LONGEST_MATCH;
import static com.google.re2j.Utils.EMPTY_BEGIN_LINE;
import static com.google.re2j.Utils.EMPTY_BEGIN_TEXT;
import static com.google.re2j.Utils.EMPTY_END_LINE;
import static com.google.re2j.Utils.EMPTY_END_TEXT;
import static com.google.re2j.Utils.EMPTY_NO_WORD_BOUNDARY;
import static com.google.re2j.Utils.EMPTY_WORD_BOUNDARY;
import static com.google.re2j.Utils.isRuneStart;
import static com.google.re2j.Utils.isWordByte;
import static java.util.Arrays.sort;

/**
 * A lazily built DFA that runs over the bytes of a {@link MachineInput}; ported from RE2's
 * {@code dfa.cc}.
 *
 * <p>States are created on demand while searching. Every state is looked up in (and, when new,
 * added to) the {@code statesCache} handed to the constructor, and each newly added state
 * consumes one unit of the {@code availableStates} budget. When the budget is exhausted a
 * {@link DFATooManyStatesException} is thrown.
 *
 * <p>NOTE(review): the work queues, the instruction stack and the start-params cache are
 * unsynchronized per-instance scratch state, so a single instance presumably must not be used by
 * several threads at once, even though the state cache itself may be shared -- confirm against
 * the callers.
 */
class DFA {

  /**
   * An exception thrown when DFA has reached a number of {@link DFAState}s limit.
   */
  static class DFATooManyStatesException extends RuntimeException {
    private DFATooManyStatesException() {
      super("DFA has reached a number of states limit");
    }
  }

  /** Value returned by {@link #search} when no match was found. */
  static final int NO_MATCH = -1;

  // Layout of a DFAState flag word:
  //   bits 0-7   empty-width flags that held before the state was entered (FLAG_EMPTY_MASK)
  //   bit  8     FLAG_MATCH
  //   bit  9     FLAG_LAST_WORD
  //   bits 16+   empty-width flags the state's instructions are waiting for (FLAG_NEED_SHIFT)
  static final int FLAG_MATCH = 0x100;      // This is a matching state
  static final int FLAG_LAST_WORD = 0x200;  // The last byte was word character
  static final int FLAG_NEED_SHIFT = 16;    // Shift applied to the needed empty-width flags
  static final int FLAG_EMPTY_MASK = 0xFF;  // Mask for the empty flag bits

  private static final int MARK = -1;       // Separates priorities of items in a WorkQueue

  // Total number of start parameters (total number of empty flag combinations plus anchored flag).
  // A cache key is the start flags OR-ed with the anchored bit shifted by START_PARAMS_CACHE_SHIFT.
  private static final int START_PARAMS_CACHE_SIZE = 1 << 13;
  private static final int START_PARAMS_CACHE_SHIFT = 12;

  // Shared result for searches that can never match (dead start state, no first bytes).
  private static final StartParams DEAD_START_PARAMS = new StartParams(DEAD_STATE, new boolean[256]);

  // Info for the search
  private final Prog prog;

  // Program instructions.
  private final Inst[] instructions;

  // Search for longest match like egrep or POSIX or for first match like Perl, PCRE
  private final MatchKind matchKind;

  // Should input bytes be read forward or backward
  private final boolean runForward;

  // some preallocated workspace.
  private WorkQueue currentWorkQ;
  private WorkQueue nextWorkQ;
  private final int[] instStack;

  // Start parameters computed so far, indexed by startParamsKey(anchored, flags).
  private final StartParams[] startParamsCache = new StartParams[START_PARAMS_CACHE_SIZE];
  // Cache of all DFA states created so far, keyed by instruction list and flag.
  private final ConcurrentHashMap<DFAStateKey, DFAState> statesCache;
  // Remaining number of states that may still be created.
  private final AtomicInteger availableStates;

  /**
   * Creates a DFA for the given program.
   *
   * @param prog the compiled program to execute
   * @param matchKind whether to look for the first or the longest match
   * @param reversed {@code true} to consume the input backward, {@code false} to consume it forward
   * @param statesCache cache that created states are looked up in and added to
   * @param availableStates budget of states that may still be created; decremented for every
   *        state this DFA adds to {@code statesCache}
   */
  public DFA(Prog prog, MatchKind matchKind, boolean reversed,
             ConcurrentHashMap<DFAStateKey, DFAState> statesCache, AtomicInteger availableStates) {
    this.prog = prog;
    this.instructions = prog.getInst();
    this.matchKind = matchKind;
    this.runForward = !reversed;
    this.statesCache = statesCache;
    this.availableStates = availableStates;

    int progSize = prog.numInst();
    // Marks are only needed to separate priorities in longest match mode.
    int nMarks = 0;
    if (matchKind == LONGEST_MATCH) {
      nMarks = progSize;
    }

    currentWorkQ = new WorkQueue(progSize, nMarks);
    nextWorkQ = new WorkQueue(progSize, nMarks);
    // addToQueue pushes at most two instructions plus maybe a mark per processed instruction.
    int stackSize = 2 * progSize + nMarks;
    instStack = new int[stackSize];
  }

  /**
   * Public entry point to run the search. Computes the StartParams, and then runs the search loop.
   *
   * @param in the input to search
   * @param startPos lower bound of the searched range
   * @param endPos upper bound of the searched range
   * @param anchored whether the match has to begin where the search begins
   * @param wantEarliestMatch whether to return as soon as the first matching state is reached
   * @return the input position at which the match was detected, or {@link #NO_MATCH}; an invalid
   *         range ({@code startPos < 0}, {@code startPos > endPos} or
   *         {@code endPos > in.endPos()}) also yields {@link #NO_MATCH}
   * @throws DFATooManyStatesException if a new state is needed but the state budget is exhausted
   */
  public int search(MachineInput in, int startPos, int endPos, boolean anchored, boolean wantEarliestMatch) {
    StartParams startParams = analyzeSearch(in, startPos, endPos, anchored);
    if (startParams.startState.isDead()) {
      return NO_MATCH;
    }

    return searchLoop(in, startPos, endPos, wantEarliestMatch, startParams);
  }

  // Converts the WorkQueue q to a state.
  // flag carries the empty-width / match / last-word bits to record in the state.
  // Returns DEAD_STATE when the queue holds no useful instruction and no flag survives.
  // TODO: sawmatch optimization from dfa.cc
  private DFAState workQueueToCachedState(WorkQueue q, int flag) {
    int[] instIndexes = new int[q.getMaxSize()];
    int nIndexes = 0; //number of indexes in instIndexes so far
    int neededFlags = 0;
    int size = q.getSize();
    for (int i = 0; i < size; i++) {
      int instIndex = q.getValueAt(i);

      if (q.isMark(instIndex)) {
        // Keep a mark only between two instructions: never first, never doubled.
        if (nIndexes > 0 && instIndexes[nIndexes - 1] != MARK) {
          instIndexes[nIndexes++] = MARK;
        }
        continue;
      }

      Inst inst = instructions[instIndex];
      switch (inst.op()) {
        case ALT_MATCH:
        case BYTE:
        case EMPTY_WIDTH:
        case MATCH:
        case ALT:
          instIndexes[nIndexes++] = instIndex;
          if (inst.op() == EMPTY_WIDTH) {
            // Remember which empty-width flags this state is still waiting for.
            neededFlags |= inst.arg;
          }
          break;
        default:
          // Every other instruction is not stored in the state.
          break;
      }
    }

    // if the last inst is a mark remove it
    if (nIndexes > 0 && instIndexes[nIndexes - 1] == MARK) {
      nIndexes--;
    }

    // If there are no empty-width instructions waiting to execute,
    // the extra flag bits will not be used. Discard them to reduce
    // number of distinct states.
    if (neededFlags == 0) {
      flag &= FLAG_MATCH;
    }

    // No match possibilities
    if (nIndexes == 0 && flag == 0) {
      return DEAD_STATE;
    }

    // If we're in the longest match mode, the state is a sequence of
    // unordered state sets separated by Marks. Sort each set to
    // canonicalize, to reduce the number of distinct sets stored.
    if (matchKind == LONGEST_MATCH) {
      int ip = 0;
      while (ip < nIndexes) {
        int markp = ip;
        while (markp < nIndexes && instIndexes[markp] != MARK) {
          markp++;
        }
        sort(instIndexes, ip, markp);
        if (markp < nIndexes) {
          // Step over the mark itself.
          markp++;
        }
        ip = markp;
      }
    }

    flag |= neededFlags << FLAG_NEED_SHIFT;
    return getCachedState(instIndexes, nIndexes, flag);
  }

  // Returns the cached state for the first nIndexes entries of instIndexes and the flag,
  // creating and caching it first when it does not exist yet.
  // Throws DFATooManyStatesException when this call added a state and the budget was already
  // used up; note that the state has been put into the cache by then.
  private DFAState getCachedState(int[] instIndexes, int nIndexes, int flag) {
    DFAStateKey key = new DFAStateKey(instIndexes, nIndexes, flag);
    DFAState state = statesCache.get(key);

    if (state == null) {
      // create new state with trimmed instruction array
      state = new DFAState(instIndexes, nIndexes, flag);
      // Re-key on the state's own array so the cache does not retain the scratch array.
      key = new DFAStateKey(state.getInstIndexes(), nIndexes, flag);

      DFAState previousState = statesCache.putIfAbsent(key, state);

      // it is possible that somebody simultaneously inserted state for the same key
      if (previousState != null) {
        return previousState;
      }

      // Only the caller that actually inserted the state pays for it.
      if (availableStates.decrementAndGet() < 0) {
        throw new DFATooManyStatesException();
      }
    }

    return state;
  }

  // Use queue to create a WorkQueue from the state
  private void stateToWorkQueue(DFAState state, WorkQueue queue) {
    queue.clear();
    int[] instIndexes = state.getInstIndexes();
    for (int index : instIndexes) {
      if (index == MARK) {
        queue.mark();
      } else {
        queue.insertNew(index);
      }
    }
  }


  // Add id and instructions that follow from it (if it's e.g. an ALT instruction)
  // to the WorkQueue q. flag contains the empty width instruction flags
  private void addToQueue(WorkQueue q, int id, int flag) {

    // Use instStack to hold the stack of instructions still to process.
    // It is sized to have room for 2 * prog.numInst() + nMarks instructions.
    // Each instruction can be processed by the switch below only once, and the processing
    // pushes at most two instructions plus maybe a mark.
    // (If we are using marks, nMarks = prog.numInst(). Otherwise nMarks = 0.)
    int stackSize = 0;
    instStack[stackSize++] = id;
    while (stackSize > 0) {
      id = instStack[--stackSize];

      if (id == MARK) {
        q.mark();
        continue;
      }

      // If id is already on the queue, there's nothing to do.  Otherwise add it.
      // We don't actually keep all the ones that get added -- not every instruction kind is
      // stored when the queue is turned into a state -- but adding all of the instructions here
      // increases the likelihood of q.contains(id), reducing the amount of duplicated work
      if (q.contains(id)) {
        continue;
      }

      q.insertNew(id);

      Inst inst = instructions[id];
      switch (inst.op()) {
        case FAIL: //nothing to do for these
        case BYTE:
        case MATCH:
          break;

        case CAPTURE:
        case NOP:
          instStack[stackSize++] = inst.out;
          break;

        case ALT:
        case ALT_MATCH:
          // Pushed first, so it is popped (and therefore explored) after inst.out.
          instStack[stackSize++] = inst.arg;

          // When marks are in use, separate the two branches of the unanchored start
          // instruction with a mark (both work queues are created with the same mark count).
          if (q.maxMark > 0 && id == prog.startUnanchored && id != prog.start) {
            instStack[stackSize++] = MARK;
          }

          instStack[stackSize++] = inst.out;
          break;

        case EMPTY_WIDTH:
          //continue if all of the empty width flags match up.
          if ((inst.arg & ~flag) != 0) {
            break;
          }
          instStack[stackSize++] = inst.out;
          break;

        default:
          break;
      }
    }
  }

  // Runs currentWorkQ on the empty string flags, and populates nextWorkQ with new insts
  private void runWorkQueueOnEmptyString(int flag) {
    nextWorkQ.clear();
    for (int i = 0; i < currentWorkQ.getSize(); i++) {
      int instIndex = currentWorkQ.getValueAt(i);
      if (currentWorkQ.isMark(instIndex)) {
        addToQueue(nextWorkQ, MARK, flag);
      } else {
        addToQueue(nextWorkQ, instIndex, flag);
      }
    }
  }

  // Runs the byte against the work in currentWorkQ.
  // Populates nextWorkQ with new instructions; flag holds the empty-width flags that are true
  // after the byte and is used when following the matched BYTE instructions.
  // Returns whether a match was found.
  private boolean runWorkQueueOnByte(byte b, int flag) {
    nextWorkQ.clear();
    boolean isMatch = false;
    for (int i = 0; i < currentWorkQ.getSize(); i++) {
      int instIndex = currentWorkQ.getValueAt(i);
      if (currentWorkQ.isMark(instIndex)) {
        // A match seen before the mark has priority over everything behind the mark.
        if (isMatch) {
          return true;
        }
        nextWorkQ.mark();
        continue;
      }

      Inst inst = prog.getInst(instIndex);
      switch (inst.op()) {
        case FAIL:    // never succeeds
        case CAPTURE: // already followed all following
        case NOP:
        case ALT:
        case ALT_MATCH:
        case EMPTY_WIDTH:
          break;
        case BYTE:
          if (inst.matchByte(b)) {
            addToQueue(nextWorkQ, inst.out, flag);
          }
          break;
        case MATCH:
          isMatch = true;
          // In first match mode the remaining, lower priority instructions are irrelevant.
          if (matchKind == FIRST_MATCH) {
            return true;
          }
          break;
      }
    }
    return isMatch;
  }

  // Run the state on the given byte.  If next state has already been found, get it directly.
  // Otherwise create a WorkQueue from the state and run it against the byte to create the next state.
  // The computed transition is stored in the state. Return the next state.
  private DFAState runStateOnByte(DFAState state, byte b) {
    if (state.isDead()) {
      throw new IllegalArgumentException("cannot run byte on DEAD STATE");
    }

    DFAState nextState = state.getNextState(b);
    if (nextState != null) {
      return nextState;
    }

    stateToWorkQueue(state, currentWorkQ);

    // Add implicit empty width flags
    int needFlag = state.getFlag() >> FLAG_NEED_SHIFT;   // flags the state is waiting for
    int beforeFlag = state.getFlag() & FLAG_EMPTY_MASK;  // flags true before the byte
    int oldBeforeFlag = beforeFlag;
    int afterFlag = 0;                                   // flags true after the byte

    if (b == '\n') {
      beforeFlag |= EMPTY_END_LINE;
      afterFlag |= EMPTY_BEGIN_LINE;
    }

    if (b == EOF) {
      beforeFlag |= EMPTY_END_LINE | EMPTY_END_TEXT;
    }

    boolean isLastWord = (state.getFlag() & FLAG_LAST_WORD) != 0;  //last byte processed was a word character
    boolean isWord = b != EOF && isWordByte(b);
    if (isWord == isLastWord) {
      beforeFlag |= EMPTY_NO_WORD_BOUNDARY;
    } else {
      beforeFlag |= EMPTY_WORD_BOUNDARY;
    }

    // The empty-string pass is only needed when a flag that just became true is one the
    // state is actually waiting for.
    if ((beforeFlag & ~oldBeforeFlag & needFlag) != 0) {
      runWorkQueueOnEmptyString(beforeFlag);
      switchWorkQueues();
    }

    boolean isMatch = runWorkQueueOnByte(b, afterFlag);

    // We're done with the currentWorkQ.  Switch it with nextWorkQ
    switchWorkQueues();

    // Save afterFlag along with isMatch and isWord in the new state.
    int flag = afterFlag;
    if (isMatch) {
      flag |= FLAG_MATCH;
    }
    if (isWord) {
      flag |= FLAG_LAST_WORD;
    }

    nextState = workQueueToCachedState(currentWorkQ, flag);
    state.setNextState(b, nextState);
    return nextState;
  }

  // Swaps the roles of currentWorkQ and nextWorkQ.
  private void switchWorkQueues() {
    WorkQueue tmpQueue = currentWorkQ;
    currentWorkQ = nextWorkQ;
    nextWorkQ = tmpQueue;
  }

  // Analyzes the search to build the StartParams.
  // Returns DEAD_START_PARAMS when [startPos, endPos] does not lie within the input.
  private StartParams analyzeSearch(MachineInput in, int startPos, int endPos, boolean anchored) {
    // Reject ranges outside the input (or inverted ones) up front: everything below and the
    // search loop read bytes with getByteUnchecked and rely on the range being valid.
    if (startPos < 0 || startPos > endPos || endPos > in.endPos()) {
      return DEAD_START_PARAMS;
    }

    // Derive the initial flags from the byte just before the point where the search begins
    // (seen in the direction of travel).
    int flags = 0;
    if (runForward) {
      if (startPos == 0) {
        flags = EMPTY_BEGIN_TEXT | EMPTY_BEGIN_LINE;
      } else if (in.getByteUnchecked(startPos - 1) == '\n') {
        flags = EMPTY_BEGIN_LINE;
      } else if (isWordByte(in.getByteUnchecked(startPos - 1))) {
        flags = FLAG_LAST_WORD;
      }
    } else {
      if (endPos == in.endPos()) {
        flags = EMPTY_BEGIN_TEXT | EMPTY_BEGIN_LINE;
      } else if (in.getByteUnchecked(endPos) == '\n') {
        flags = EMPTY_BEGIN_LINE;
      } else if (isWordByte(in.getByteUnchecked(endPos))) {
        flags = FLAG_LAST_WORD;
      }
    }

    return getCachedStartParams(anchored, flags);
  }

  // Returns the start params for the anchored/flags combination, computing them on first use.
  private StartParams getCachedStartParams(boolean anchored, int flags) {
    int key = startParamsKey(anchored, flags);
    if (startParamsCache[key] != null) {
      return startParamsCache[key];
    }
    StartParams startParams = computeStartParams(anchored, flags);
    startParamsCache[key] = startParams;
    return startParams;
  }

  // Index into startParamsCache: the start flags plus the anchored bit.
  private int startParamsKey(boolean anchored, int flags) {
    return flags | ((anchored ? 1 : 0) << START_PARAMS_CACHE_SHIFT);
  }

  // Builds the start state for the anchored/flags combination and determines which bytes
  // lead out of it.
  private StartParams computeStartParams(boolean anchored, int flags) {
    currentWorkQ.clear();
    if (anchored) {
      addToQueue(currentWorkQ, prog.start, flags);
    } else {
      addToQueue(currentWorkQ, prog.startUnanchored, flags);
    }

    DFAState startState = workQueueToCachedState(currentWorkQ, flags);
    if (startState.isDead()) {
      return DEAD_START_PARAMS;
    }

    // Compute the first bytes by running over all possible byte values and
    // recording the ones that change the state.
    boolean[] firstByte = new boolean[256];
    for (int i = 0; i < 256; i++) {
      DFAState state = runStateOnByte(startState, (byte) i);
      if (state == startState) {
        continue;
      }

      // in forward search make sure we don't start in a middle of rune
      // (backward search starts correctly because it is anchored at match end)
      if (runForward && !isRuneStart((byte) i)) {
        continue;
      }

      // This byte brought us to a new state
      firstByte[i] = true;
    }

    return new StartParams(startState, firstByte);
  }

  // The main search loop: walks the input byte by byte in the search direction, following
  // (and lazily creating) state transitions.
  // Returns the position of the last match seen, or NO_MATCH.
  private int searchLoop(MachineInput in, int startPos, int endPos, boolean wantEarliestMatch, StartParams startParams) {
    int lastMatchIndex = NO_MATCH;
    DFAState currentState = startParams.startState;

    int currentIndex;
    int endIndex;

    if (runForward) {
      currentIndex = startPos;
      endIndex = endPos;
    } else {
      currentIndex = endPos;
      endIndex = startPos;
    }

    while (currentIndex != endIndex) {
      // While in the start state, skip ahead to the next byte that can leave it.
      if (currentState == startParams.startState) {
        currentIndex = findFirstByte(in, currentIndex, endIndex, startParams.firstByte);
        if (currentIndex == endIndex) {
          break;
        }
      }

      byte b;
      if (runForward) {
        b = in.getByteUnchecked(currentIndex++);
      } else {
        b = in.getByteUnchecked(--currentIndex);
      }

      currentState = getNextState(currentState, b);
      if (currentState.isDead()) {
        return lastMatchIndex;
      }

      if (currentState.isMatch()) {
        // The DFA notices the match one byte late, so step back over the byte that was just
        // consumed before recording the match position.
        if (runForward) {
          lastMatchIndex = currentIndex - 1;
        } else {
          lastMatchIndex = currentIndex + 1;
        }
        if (wantEarliestMatch) {
          return lastMatchIndex;
        }
      }
    }

    // The byte that follows the searched range in the search direction, or EOF when the range
    // touches the end of the input on that side.
    byte lastByte;
    if (runForward) {
      if (endPos == in.endPos()) {
        lastByte = EOF;
      } else {
        lastByte = in.getByteUnchecked(endPos);
      }
    } else {
      if (startPos == 0) {
        lastByte = EOF;
      } else {
        lastByte = in.getByteUnchecked(startPos - 1);
      }
    }

    // Process one more byte to see if it triggers a match (Remember, matches are delayed one byte).
    currentState = getNextState(currentState, lastByte);
    if (currentState.isMatch()) {
      lastMatchIndex = currentIndex;
    }

    return lastMatchIndex;
  }

  // Moves currentIndex toward endIndex up to the next byte flagged in firstByte.
  // Returns endIndex when there is no such byte.
  private int findFirstByte(MachineInput in, int currentIndex, int endIndex, boolean[] firstByte) {
    if (runForward) {
      return findFirstByteForward(in, currentIndex, endIndex, firstByte);
    } else {
      return findFirstByteBackward(in, currentIndex, endIndex, firstByte);
    }
  }

  // Returns the index of the first flagged byte in [currentIndex, endIndex), or endIndex.
  private int findFirstByteForward(MachineInput in, int currentIndex, int endIndex, boolean[] firstByte) {
    for (int i = currentIndex; i < endIndex; ++i) {
      // Mask to get the unsigned byte value as array index.
      if (firstByte[in.getByteUnchecked(i) & 0xff]) {
        return i;
      }
    }

    return endIndex;
  }

  // Scans [endIndex, currentIndex) from the high end down. Returns the index just past the
  // first flagged byte found, so that the backward read (--index) consumes it next, or endIndex.
  private int findFirstByteBackward(MachineInput in, int currentIndex, int endIndex, boolean[] firstByte) {
    for (int i = currentIndex - 1; i >= endIndex; --i) {
      if (firstByte[in.getByteUnchecked(i) & 0xff]) {
        return i + 1;
      }
    }

    return endIndex;
  }

  // Returns the state reached from currentState on currentByte, computing it if necessary.
  private DFAState getNextState(DFAState currentState, byte currentByte) {
    DFAState nextState = currentState.getNextState(currentByte);

    // Null means the next state has not been found yet. Compute it.
    if (nextState == null) {
      nextState = runStateOnByte(currentState, currentByte);
    }

    return nextState;
  }

  /**
   * This is a WorkQueue created from the insts in a DFAState. A mark separates priorities for
   * LONGEST_MATCH mode. Matches found before a mark have priority, as they are farther left
   * than those after it.
   *
   * <p>Values below {@code normalSlots} are instruction indexes; values from {@code normalSlots}
   * upward are marks.
   */
  private static class WorkQueue extends SparseSet {
    final int normalSlots;  // number of slots for instruction indexes
    final int maxMark;      // number of slots for marks

    int nextMark;           // value that the next inserted mark gets
    boolean wasLastMark;    // whether the most recently inserted item was a mark

    WorkQueue(int normalSlots, int maxMark) {
      super(normalSlots + maxMark);
      this.normalSlots = normalSlots;
      this.maxMark = maxMark;
      this.nextMark = normalSlots;
      this.wasLastMark = false;
    }

    // Whether the queue value i denotes a mark rather than an instruction index.
    boolean isMark(int i) {
      return i >= normalSlots;
    }

    // Empties the queue and starts numbering marks from the first mark slot again.
    // wasLastMark is deliberately left as it is here (unchanged from the original behavior).
    void clear() {
      super.clear();
      nextMark = normalSlots;
    }

    // Appends a mark unless the previously inserted item already was one.
    void mark() {
      if (!wasLastMark) {
        wasLastMark = true;
        add(nextMark++);
      }
    }

    // Total capacity: instruction slots plus mark slots.
    int getMaxSize() {
      return normalSlots + maxMark;
    }

    //inserts a new instruction into the WorkQueue
    void insertNew(int id) {
      wasLastMark = false;
      add(id);
    }
  }

  /**
   * The state a search starts in together with the bytes that lead out of that state.
   */
  private static final class StartParams {
    final DFAState startState;
    final boolean[] firstByte; // if byte gets us out of the start state

    StartParams(DFAState startState, boolean[] firstByte) {
      this.startState = startState;
      this.firstByte = firstByte;
    }
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy