All downloads are free. The search and download features use the official Maven repository.

com.google.re2j.Machine Maven / Gradle / Ivy

The newest version!
/*
 * Copyright (c) 2020 The Go Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style
 * license that can be found in the LICENSE file.
 */
// Original Go source here:
// http://code.google.com/p/go/source/browse/src/pkg/regexp/exec.go

package com.google.re2j;

import java.util.Arrays;

// Machine runs a simple NFA simulation to match an input string of Unicode
// characters against an RE2 instance.
//
// Called by RE2.doExecute.
class Machine {

  // One logical NFA thread: the instruction it is parked on plus its own
  // capture registers.
  private static class Thread {
    int[] cap;
    Inst inst;

    Thread(int n) {
      cap = new int[n];
    }
  }

  // Sparse-set queue of pending threads, indexed by program counter.  See:
  // research.swtch.com/2008/03/using-uninitialized-memory-for-fun-and.html
  //
  // Only the first |size| slots of the dense arrays are meaningful; anything
  // at or beyond |size| may be left over from an earlier use.
  private static class Queue {

    final Thread[] denseThreads; // thread per dense slot (null if the slot has none)
    final int[] densePcs; // pc per dense slot
    final int[] sparse; // pc -> dense slot; entries may be stale but are always in bounds
    int size; // number of live dense slots

    Queue(int n) {
      denseThreads = new Thread[n];
      densePcs = new int[n];
      sparse = new int[n];
    }

    // Reports whether |pc| is currently in the queue.  A sparse entry only
    // counts when it points at a live dense slot that points back at |pc|.
    boolean contains(int pc) {
      int slot = sparse[pc];
      if (slot >= size) {
        return false;
      }
      return densePcs[slot] == pc;
    }

    boolean isEmpty() {
      return size == 0;
    }

    // Appends |pc| with no thread attached and returns the dense slot used.
    // The caller is expected to have checked contains(pc) first.
    int add(int pc) {
      int slot = size;
      size = slot + 1;
      sparse[pc] = slot;
      denseThreads[slot] = null;
      densePcs[slot] = pc;
      return slot;
    }

    // Logically empties the queue; the arrays are left untouched.
    void clear() {
      size = 0;
    }

    // Renders the queued pcs as "{a, b, c}".
    @Override
    public String toString() {
      StringBuilder sb = new StringBuilder("{");
      for (int slot = 0; slot < size; slot++) {
        if (slot > 0) {
          sb.append(", ");
        }
        sb.append(densePcs[slot]);
      }
      return sb.append('}').toString();
    }
  }

  // Corresponding compiled regexp.
  private RE2 re2;

  // Compiled program.
  private final Prog prog;

  // The two queues that alternate between the runq and nextq roles.
  private final Queue q0, q1;

  // Free list of reusable threads, used as a stack of |poolSize| entries.
  private Thread[] pool = new Thread[10];
  private int poolSize;

  // Whether a match was found.
  private boolean matched;

  // Capture information for the match, and how many entries the caller wants.
  private int[] matchcap;
  private int ncap;

  // NOTE: any field added above must also be copied in Machine(Machine).

  // Link used to chain Machines into a pooled linked stack.  Deliberately
  // not copied by the copy constructor.
  Machine next;

  /**
   * Constructs a matching Machine for the specified {@code RE2}.
   */
  Machine(RE2 re2) {
    this.re2 = re2;
    this.prog = re2.prog;
    this.q0 = new Queue(prog.numInst());
    this.q1 = new Queue(prog.numInst());
    // Always leave room for at least the two overall match bounds.
    this.matchcap = new int[Math.max(prog.numCap, 2)];
  }

  /**
   * Copy constructor, but does not include {@code next}. The queues, thread pool and capture
   * array are shared by reference with {@code copy}, not duplicated.
   */
  Machine(Machine copy) {
    // Keep this list in sync with the fields declared above.
    this.prog = copy.prog;
    this.re2 = copy.re2;
    this.q0 = copy.q0;
    this.q1 = copy.q1;
    this.pool = copy.pool;
    this.poolSize = copy.poolSize;
    this.matchcap = copy.matchcap;
    this.ncap = copy.ncap;
    this.matched = copy.matched;
  }

  // init() prepares an existing Machine for re-use on a new input that wants
  // |ncap| capture entries.
  void init(int ncap) {
    this.ncap = ncap;
    if (ncap <= matchcap.length) {
      // Existing arrays are big enough; just wipe the part that will be used.
      resetCap(ncap);
      return;
    }
    // More captures requested than the arrays can hold: allocate new ones.
    initNewCap(ncap);
  }

  // Zeroes the first |ncap| capture entries of every pooled thread.
  private void resetCap(int ncap) {
    for (int k = 0; k < poolSize; k++) {
      Arrays.fill(pool[k].cap, 0, ncap, 0);
    }
  }

  // Replaces the capture array of every pooled thread, and matchcap itself,
  // with fresh arrays of length |ncap|.
  private void initNewCap(int ncap) {
    for (int k = 0; k < poolSize; k++) {
      pool[k].cap = new int[ncap];
    }
    matchcap = new int[ncap];
  }

  // Returns a copy of the first |ncap| entries of matchcap, or the shared
  // empty array when no captures were requested.
  int[] submatches() {
    return ncap == 0 ? Utils.EMPTY_INTS : Arrays.copyOf(matchcap, ncap);
  }

  // alloc() hands out a thread parked on |inst|, popping one from the free
  // pool when available and creating a new one otherwise.
  private Thread alloc(Inst inst) {
    Thread t = poolSize > 0 ? pool[--poolSize] : new Thread(matchcap.length);
    t.inst = inst;
    return t;
  }

  // Returns every thread on |queue| to the free pool and empties the queue.
  private void free(Queue queue) {
    free(queue, 0);
  }

  // Returns the threads in dense slots [from, queue.size) to the free pool,
  // then empties the whole queue.
  private void free(Queue queue, int from) {
    // Upper bound on the pool size needed: every slot in range may hold a thread.
    int needed = poolSize + (queue.size - from);
    if (needed > pool.length) {
      pool = Arrays.copyOf(pool, Math.max(pool.length * 2, needed));
    }

    Thread[] dense = queue.denseThreads;
    int end = queue.size;
    for (int slot = from; slot < end; slot++) {
      Thread t = dense[slot];
      if (t == null) {
        continue; // slot carries no thread
      }
      pool[poolSize++] = t;
    }
    queue.clear();
  }

  // free() pushes the single thread |t| back onto the free pool, doubling the
  // pool array when it is full.
  private void free(Thread t) {
    if (poolSize >= pool.length) {
      pool = Arrays.copyOf(pool, pool.length * 2);
    }
    pool[poolSize++] = t;
  }

  // match() runs the machine over the input |in| starting at |pos| with the
  // RE2 Anchor |anchor|, and reports whether a match was found.
  // On success matchcap holds the submatch information.
  boolean match(MachineInput in, int pos, int anchor) {
    final int startCond = re2.cond;
    if (startCond == Utils.EMPTY_ALL) { // impossible
      return false;
    }
    final boolean startAnchored = anchor == RE2.ANCHOR_START || anchor == RE2.ANCHOR_BOTH;
    if (startAnchored && pos != 0) {
      return false;
    }

    matched = false;
    Arrays.fill(matchcap, 0, prog.numCap, -1);

    Queue runq = q0;
    Queue nextq = q1;

    // Each in.step() result is unpacked as rune = value >> 3, width = value & 7.
    // (rune, width) is the current character; (rune1, width1) is the lookahead.
    int packed = in.step(pos);
    int rune = packed >> 3;
    int width = packed & 7;
    int rune1 = -1;
    int width1 = 0;
    if (packed != MachineInput.EOF) {
      packed = in.step(pos + width);
      rune1 = packed >> 3;
      width1 = packed & 7;
    }

    // Bitmask of EMPTY_* flags that hold at |pos|.
    int flag = (pos == 0) ? Utils.emptyOpContext(-1, rune) : in.context(pos);

    while (true) {
      if (runq.isEmpty()) {
        if (pos != 0 && (startCond & Utils.EMPTY_BEGIN_TEXT) != 0) {
          // Anchored match, past beginning of text.
          break;
        }
        if (matched) {
          // Have match; finished exploring alternatives.
          break;
        }
        if (!re2.prefix.isEmpty() && rune1 != re2.prefixRune && in.canCheckPrefix()) {
          // Match requires literal prefix; fast search for it.
          int advance = in.index(re2, pos);
          if (advance < 0) {
            break;
          }
          pos += advance;
          packed = in.step(pos);
          rune = packed >> 3;
          width = packed & 7;
          packed = in.step(pos + width);
          rune1 = packed >> 3;
          width1 = packed & 7;
        }
      }

      if (!matched && (pos == 0 || anchor == RE2.UNANCHORED)) {
        // When anchored at the beginning, a fresh thread is only started
        // at |pos| = 0.
        if (ncap > 0) {
          matchcap[0] = pos;
        }
        add(runq, prog.start, pos, matchcap, flag, null);
      }

      final int nextPos = pos + width;
      flag = in.context(nextPos);
      step(runq, nextq, pos, nextPos, rune, flag, anchor, pos == in.endPos());

      // Stop at EOF (zero width), or as soon as any match exists when the
      // caller does not care where it is.
      if (width == 0 || (ncap == 0 && matched)) {
        break;
      }

      // Slide the one-character lookahead window forward.
      pos = nextPos;
      rune = rune1;
      width = width1;
      if (rune != -1) {
        packed = in.step(pos + width);
        rune1 = packed >> 3;
        width1 = packed & 7;
      }

      // Swap the roles of the two queues for the next character.
      Queue swap = runq;
      runq = nextq;
      nextq = swap;
    }

    free(nextq);
    return matched;
  }

  // step() executes one step of the machine: it runs each thread on |runq|
  // against the rune |c| (which may be -1 for EOF) and appends the surviving
  // threads to |nextq|.  |c| starts at |pos| and ends at |nextPos|.
  // |nextCond| gives the setting for the EMPTY_* flags after |c|, |anchor| is
  // the anchoring flag, and |atEnd| signals that we are at the end of the
  // input string.  |runq| is empty on return.
  private void step(
      Queue runq,
      Queue nextq,
      int pos,
      int nextPos,
      int c,
      int nextCond,
      int anchor,
      boolean atEnd) {
    final boolean longest = re2.longest;
    for (int j = 0; j < runq.size; ++j) {
      Thread t = runq.denseThreads[j];
      if (t == null) {
        continue;
      }
      // In longest-match mode, drop threads that started after the match
      // already recorded.
      if (longest && matched && ncap > 0 && matchcap[0] < t.cap[0]) {
        free(t);
        continue;
      }

      Inst i = t.inst;
      boolean consumed = false; // does this thread advance past |c|?
      switch (i.op) {
        case Inst.RUNE:
          consumed = i.matchRune(c);
          break;

        case Inst.RUNE1:
          consumed = (c == i.runes[0]);
          break;

        case Inst.RUNE_ANY:
          consumed = true;
          break;

        case Inst.RUNE_ANY_NOT_NL:
          consumed = (c != '\n');
          break;

        case Inst.MATCH:
          // When anchored at both ends, a match only counts at end of input.
          if (anchor != RE2.ANCHOR_BOTH || atEnd) {
            if (ncap > 0 && (!longest || !matched || matchcap[1] < pos)) {
              t.cap[1] = pos;
              System.arraycopy(t.cap, 0, matchcap, 0, ncap);
            }
            if (!longest) {
              // First match wins: discard every remaining thread.  This also
              // empties |runq|, which ends the enclosing loop.
              free(runq, j + 1);
            }
            matched = true;
          }
          break;

        default:
          throw new IllegalStateException("bad inst");
      }

      if (consumed) {
        // add() returns null when it took ownership of |t|.
        t = add(nextq, i.out, nextPos, t.cap, nextCond, t);
      }
      if (t != null) {
        free(t);
        runq.denseThreads[j] = null;
      }
    }
    runq.clear();
  }

  // add() adds an entry to |q| for |pc|, unless |q| already has such an
  // entry.  It also recursively adds an entry for every instruction reachable
  // from |pc| by following empty-width conditions satisfied by |cond|, a
  // bitmask of EMPTY_* flags.  |pos| gives the current position in the input.
  //
  // |t| is an optional spare thread to reuse.  The return value is that spare
  // if it is still unused, or null once it has been placed on the queue.
  private Thread add(Queue q, int pc, int pos, int[] cap, int cond, Thread t) {
    if (pc == 0 || q.contains(pc)) {
      return t;
    }

    int slot = q.add(pc);
    Inst inst = prog.inst[pc];
    switch (inst.op) {
      case Inst.FAIL:
        break; // nothing

      case Inst.NOP:
        t = add(q, inst.out, pos, cap, cond, t);
        break;

      case Inst.ALT:
      case Inst.ALT_MATCH:
        t = add(q, inst.out, pos, cap, cond, t);
        t = add(q, inst.arg, pos, cap, cond, t);
        break;

      case Inst.EMPTY_WIDTH:
        // Follow only if every required flag is present in |cond|.
        if ((inst.arg & ~cond) == 0) {
          t = add(q, inst.out, pos, cap, cond, t);
        }
        break;

      case Inst.CAPTURE:
        if (inst.arg >= ncap) {
          // Capture slot not requested; behaves like a NOP.
          t = add(q, inst.out, pos, cap, cond, t);
        } else {
          // Temporarily record |pos| in the slot while following the out
          // edge, then restore the previous value.  The spare thread is not
          // passed down, so |t| is returned to the caller as-is.
          int saved = cap[inst.arg];
          cap[inst.arg] = pos;
          add(q, inst.out, pos, cap, cond, null);
          cap[inst.arg] = saved;
        }
        break;

      case Inst.MATCH:
      case Inst.RUNE:
      case Inst.RUNE1:
      case Inst.RUNE_ANY:
      case Inst.RUNE_ANY_NOT_NL:
        // These instructions need a real thread: reuse the spare or allocate.
        if (t != null) {
          t.inst = inst;
        } else {
          t = alloc(inst);
        }
        if (ncap > 0 && t.cap != cap) {
          System.arraycopy(cap, 0, t.cap, 0, ncap);
        }
        q.denseThreads[slot] = t;
        t = null; // the spare has been consumed
        break;

      default:
        throw new IllegalStateException("unhandled");
    }
    return t;
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy