All downloads are free. Search and download functionality uses the official Maven repository.

com.google.re2j.MachineInput Maven / Gradle / Ivy

The newest version!
/*
 * Copyright (c) 2020 The Go Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style
 * license that can be found in the LICENSE file.
 */
// Original Go source here:
// http://code.google.com/p/go/source/browse/src/pkg/regexp/regexp.go

package com.google.re2j;

/**
 * MachineInput abstracts different representations of the input text supplied to the Machine. It
 * provides one-character lookahead.
 *
 * <p>Two implementations exist: one over UTF-8 encoded byte arrays and one over UTF-16
 * {@link CharSequence}s. Instances are obtained through the static {@code fromUTF8} and
 * {@code fromUTF16} factory methods.
 */
abstract class MachineInput {

  // Value returned by step() at end of input: rune -1 in the high bits,
  // width 0 in the low 3 bits.
  static final int EOF = (-1 << 3);

  // Creates an input over the whole of the UTF-8 encoded array |b|.
  static MachineInput fromUTF8(byte[] b) {
    return new UTF8Input(b);
  }

  // Creates an input over the UTF-8 encoded bytes b[start, end).
  // Throws ArrayIndexOutOfBoundsException if |end| exceeds b.length.
  static MachineInput fromUTF8(byte[] b, int start, int end) {
    return new UTF8Input(b, start, end);
  }

  // Creates an input over the whole of the UTF-16 sequence |s|.
  static MachineInput fromUTF16(CharSequence s) {
    return new UTF16Input(s, 0, s.length());
  }

  // Creates an input over the UTF-16 chars s[start, end).
  static MachineInput fromUTF16(CharSequence s, int start, int end) {
    return new UTF16Input(s, start, end);
  }

  //// Interface

  // Returns the rune at the specified index; the units are
  // unspecified, but could be UTF-8 byte, UTF-16 char, or rune
  // indices.  Returns the width (in the same units) of the rune in
  // the lower 3 bits, and the rune (Unicode code point) in the high
  // bits.  Never negative, except for EOF which is represented as -1
  // << 3 | 0.
  //
  // Both implementations below interpret |pos| relative to the start
  // offset the input was created with.
  abstract int step(int pos);

  // can we look ahead without losing info?
  abstract boolean canCheckPrefix();

  // Returns the index relative to |pos| at which |re2.prefix| is found
  // in this input stream, or a negative value if not found.
  abstract int index(RE2 re2, int pos);

  // Returns a bitmask of EMPTY_* flags, computed from the rune before
  // |pos| and the rune at |pos| (-1 standing in for "no rune").
  abstract int context(int pos);

  // Returns the end position in the same units as step().
  abstract int endPos();

  //// Implementations

  // An implementation of MachineInput for UTF-8 byte arrays.
  // |pos| and |width| are byte indices.
  private static final class UTF8Input extends MachineInput {

    final byte[] b;
    final int start;
    final int end;

    UTF8Input(byte[] b) {
      this.b = b;
      this.start = 0;
      this.end = b.length;
    }

    UTF8Input(byte[] b, int start, int end) {
      if (end > b.length) {
        throw new ArrayIndexOutOfBoundsException(
            "end is greater than length: " + end + " > " + b.length);
      }
      this.b = b;
      this.start = start;
      this.end = end;
    }

    // Decodes the rune whose first byte is at |i| (relative to |start|).
    // Returns EOF when |i| is at or past the end, or when the sequence
    // announced by the lead byte is cut off by |end|.  Continuation bytes
    // are not validated: only their low 6 bits are used.
    @Override
    int step(int i) {
      i += start;
      if (i >= end) {
        return EOF;
      }

      // UTF-8.  RFC 3629 in five lines:
      //
      // Unicode code points            UTF-8 encoding (binary)
      //         00-7F  (7 bits)   0tuvwxyz
      //     0080-07FF (11 bits)   110pqrst 10uvwxyz
      //     0800-FFFF (16 bits)   1110jklm 10npqrst 10uvwxyz
      // 010000-10FFFF (21 bits)   11110efg 10hijklm 10npqrst 10uvwxyz
      int x = b[i++] & 0xff; // zero extend
      if ((x & 0x80) == 0) {
        return x << 3 | 1;
      } else if ((x & 0xE0) == 0xC0) { // 110xxxxx
        x = x & 0x1F;
        if (i >= end) {
          return EOF;
        }
        x = x << 6 | (b[i++] & 0x3F);
        return x << 3 | 2;
      } else if ((x & 0xF0) == 0xE0) { // 1110xxxx
        x = x & 0x0F;
        if (i + 1 >= end) {
          return EOF;
        }
        x = x << 6 | (b[i++] & 0x3F);
        x = x << 6 | (b[i++] & 0x3F);
        return x << 3 | 3;
      } else { // 11110xxx (any other lead byte also lands here)
        x = x & 0x07;
        if (i + 2 >= end) {
          return EOF;
        }
        x = x << 6 | (b[i++] & 0x3F);
        x = x << 6 | (b[i++] & 0x3F);
        x = x << 6 | (b[i++] & 0x3F);
        return x << 3 | 4;
      }
    }

    @Override
    boolean canCheckPrefix() {
      return true;
    }

    // Searches for |re2.prefixUTF8| starting at |pos| (relative to |start|)
    // and converts a hit back to an offset relative to |pos|.
    // NOTE(review): |end| is not passed to Utils.indexOf; confirm that a hit
    // beyond |end| is harmless for callers.
    @Override
    int index(RE2 re2, int pos) {
      pos += start;
      int i = Utils.indexOf(b, re2.prefixUTF8, pos);
      return i < 0 ? i : i - pos;
    }

    // |pos| is relative to |start|.  Runes outside [start, end) are not
    // visible: they are reported as -1.
    @Override
    int context(int pos) {
      pos += this.start;
      int r1 = -1;
      if (pos > this.start && pos <= this.end) {
        int runeStart = pos - 1;
        // Zero extend.  A raw byte widens to a negative int for values
        // >= 0x80, which would make the multi-byte test below unreachable.
        r1 = b[runeStart--] & 0xff;
        if (r1 >= 0x80) { // decode UTF-8
          // Find the lead byte, up to 4 bytes earlier.
          int lim = pos - 4;
          if (lim < this.start) {
            lim = this.start;
          }
          while (runeStart >= lim && (b[runeStart] & 0xC0) == 0x80) { // 10xxxxxx
            runeStart--;
          }
          if (runeStart < this.start) {
            runeStart = this.start;
          }
          // step() adds |start| itself, so hand it a relative position.
          r1 = step(runeStart - this.start) >> 3;
        }
      }
      int r2 = pos < this.end ? (step(pos - this.start) >> 3) : -1;
      return Utils.emptyOpContext(r1, r2);
    }

    @Override
    int endPos() {
      return end;
    }
  }

  // An implementation of MachineInput for UTF-16 char sequences.
  // |pos| and |width| are in Java "char" units.
  private static final class UTF16Input extends MachineInput {
    final CharSequence str;
    final int start;
    final int end;

    UTF16Input(CharSequence str, int start, int end) {
      this.str = str;
      this.start = start;
      this.end = end;
    }

    // Decodes the code point at |pos| (relative to |start|); the width is
    // 1 or 2 chars as reported by Character.charCount.  Returns EOF at or
    // past |end|.
    @Override
    int step(int pos) {
      pos += start;
      if (pos >= end) {
        return EOF;
      }
      int rune = Character.codePointAt(str, pos);
      return rune << 3 | Character.charCount(rune);
    }

    @Override
    boolean canCheckPrefix() {
      return true;
    }

    // Searches for |re2.prefix| starting at |pos| (relative to |start|)
    // and converts a hit back to an offset relative to |pos|.  The search
    // runs to the end of |str|, not just to |end|.
    @Override
    int index(RE2 re2, int pos) {
      pos += start;
      int i = indexOf(str, re2.prefix, pos);
      return i < 0 ? i : i - pos;
    }

    // |pos| is relative to |start|.  The neighbouring code points are read
    // using the bounds of the whole sequence (0 and str.length()), not
    // [start, end).
    // NOTE(review): UTF8Input.context limits itself to [start, end); confirm
    // which of the two behaviours is intended.
    @Override
    int context(int pos) {
      pos += start;
      int length = str.length();
      int r1 = -1;
      if (pos > 0 && pos <= length) {
        r1 = Character.codePointBefore(str, pos);
      }
      int r2 = -1;
      if (pos < length) {
        r2 = Character.codePointAt(str, pos);
      }
      return Utils.emptyOpContext(r1, r2);
    }

    @Override
    int endPos() {
      return end;
    }

    // Finds |needle| in |hayStack| at or after |pos|, using the built-in
    // search for String and StringBuilder and a generic fallback otherwise.
    private int indexOf(CharSequence hayStack, String needle, int pos) {
      if (hayStack instanceof String) {
        return ((String) hayStack).indexOf(needle, pos);
      }
      if (hayStack instanceof StringBuilder) {
        return ((StringBuilder) hayStack).indexOf(needle, pos);
      }
      return indexOfFallback(hayStack, needle, pos);
    }

    // Modified version of {@link String#indexOf(String, int)} that allows a CharSequence.
    // Returns the index of the first occurrence of |needle| at or after
    // |fromIndex|, or -1 if there is none.
    private int indexOfFallback(CharSequence hayStack, String needle, int fromIndex) {
      int hayLength = hayStack.length();
      if (fromIndex >= hayLength) {
        // Like String.indexOf, an empty needle is found at the end of the
        // haystack when the search starts at or beyond it.
        return needle.isEmpty() ? hayLength : -1;
      }
      if (fromIndex < 0) {
        fromIndex = 0;
      }
      if (needle.isEmpty()) {
        return fromIndex;
      }

      char first = needle.charAt(0);
      int max = hayLength - needle.length();

      for (int i = fromIndex; i <= max; i++) {
        /* Look for first character. */
        if (hayStack.charAt(i) != first) {
          while (++i <= max && hayStack.charAt(i) != first) {}
        }

        /* Found first character, now look at the rest of the needle. */
        if (i <= max) {
          int j = i + 1;
          int matchEnd = j + needle.length() - 1;
          for (int k = 1; j < matchEnd && hayStack.charAt(j) == needle.charAt(k); j++, k++) {}

          if (j == matchEnd) {
            /* Found whole string. */
            return i;
          }
        }
      }
      return -1;
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy