// com.google.re2j.MachineInput Maven / Gradle / Ivy
/*
* Copyright (c) 2020 The Go Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style
* license that can be found in the LICENSE file.
*/
// Original Go source here:
// http://code.google.com/p/go/source/browse/src/pkg/regexp/regexp.go
package com.google.re2j;
/**
* MachineInput abstracts different representations of the input text supplied to the Machine. It
* provides one-character lookahead.
*/
abstract class MachineInput {
// Sentinel returned by step() at end of input: a rune of -1 packed with a
// width of 0, using the same (rune << 3 | width) layout as every other
// step() result. It is the only negative value step() returns.
static final int EOF = (-1 << 3);
/** Wraps the whole of {@code b} as UTF-8 encoded machine input. */
static MachineInput fromUTF8(byte[] b) {
  final MachineInput whole = new UTF8Input(b);
  return whole;
}

/**
 * Wraps {@code b} as UTF-8 encoded machine input, bounded by the byte offsets {@code start} and
 * {@code end}.
 */
static MachineInput fromUTF8(byte[] b, int start, int end) {
  final MachineInput slice = new UTF8Input(b, start, end);
  return slice;
}
/** Wraps the whole of {@code s} as UTF-16 machine input. */
static MachineInput fromUTF16(CharSequence s) {
  // Same as the bounded overload, spanning chars 0 .. s.length().
  return fromUTF16(s, 0, s.length());
}

/**
 * Wraps {@code s} as UTF-16 machine input, bounded by the char offsets {@code start} and
 * {@code end}.
 */
static MachineInput fromUTF16(CharSequence s, int start, int end) {
  final MachineInput slice = new UTF16Input(s, start, end);
  return slice;
}
//// Interface
/**
 * Returns the rune at the specified index; the units are unspecified, but could be UTF-8 byte,
 * UTF-16 char, or rune indices.
 *
 * <p>The result packs two values: the width of the rune (in the same units as {@code pos}) in
 * the lower 3 bits, and the rune (Unicode code point) in the high bits. The result is never
 * negative, except for end of input, which is represented as {@code -1 << 3 | 0} (see
 * {@link #EOF}).
 *
 * @param pos the index at which to decode a rune
 * @return {@code rune << 3 | width}, or {@link #EOF} at end of input
 */
abstract int step(int pos);
/**
 * Reports whether we can look ahead in this input without losing info. Both implementations in
 * this file return {@code true}.
 */
abstract boolean canCheckPrefix();
/**
 * Returns the index relative to |pos| at which |re2.prefix| is found in this input stream, or a
 * negative value if not found.
 *
 * @param re2 the regexp whose literal prefix is searched for
 * @param pos the position from which to start searching, in the same units as step()
 */
abstract int index(RE2 re2, int pos);
/**
 * Returns a bitmask of EMPTY_* flags for position |pos|. The implementations in this file
 * compute it by passing the rune before |pos| and the rune at |pos| (or -1 when there is none)
 * to Utils.emptyOpContext.
 *
 * @param pos the position to inspect, in the same units as step()
 */
abstract int context(int pos);
/** Returns the end position in the same units as step(). */
abstract int endPos();
//// Implementations
// An implementation of MachineInput for UTF-8 byte arrays.
// |pos| and |width| are byte indices; positions passed to step(), index()
// and context() are relative to |start|.
private static class UTF8Input extends MachineInput {
  // Backing array; stored by reference, not copied.
  final byte[] b;
  // Absolute index into |b| of the first byte of the input.
  final int start;
  // Absolute index into |b| one past the last byte of the input.
  final int end;

  // Uses the whole array as input.
  UTF8Input(byte[] b) {
    this.b = b;
    start = 0;
    end = b.length;
  }

  // Uses b[start, end) as input.
  // Throws ArrayIndexOutOfBoundsException if |end| exceeds the array length.
  UTF8Input(byte[] b, int start, int end) {
    if (end > b.length) {
      throw new ArrayIndexOutOfBoundsException(
          "end is greater than length: " + end + " > " + b.length);
    }
    this.b = b;
    this.start = start;
    this.end = end;
  }

  // Decodes the UTF-8 sequence that begins at relative byte offset |i| and
  // returns rune << 3 | width. Returns EOF when |i| is at or past the end,
  // or when the sequence is cut short by |end|. Continuation bytes are not
  // validated, and any lead byte that does not match the 1-, 2- or 3-byte
  // patterns is decoded as a 4-byte sequence.
  @Override
  int step(int i) {
    i += start;
    if (i >= end) {
      return EOF;
    }

    // UTF-8.  RFC 3629 in five lines:
    //
    // Unicode code points     UTF-8 encoding (binary)
    //         00-7F  (7 bits)   0tuvwxyz
    //     0080-07FF (11 bits)   110pqrst 10uvwxyz
    //     0800-FFFF (16 bits)   1110jklm 10npqrst 10uvwxyz
    // 010000-10FFFF (21 bits)   11110efg 10hijklm 10npqrst 10uvwxyz
    int x = b[i++] & 0xff; // zero extend
    if ((x & 0x80) == 0) {
      return x << 3 | 1;
    } else if ((x & 0xE0) == 0xC0) { // 110xxxxx
      x = x & 0x1F;
      if (i >= end) {
        return EOF;
      }
      x = x << 6 | (b[i++] & 0x3F);
      return x << 3 | 2;
    } else if ((x & 0xF0) == 0xE0) { // 1110xxxx
      x = x & 0x0F;
      if (i + 1 >= end) {
        return EOF;
      }
      x = x << 6 | (b[i++] & 0x3F);
      x = x << 6 | (b[i++] & 0x3F);
      return x << 3 | 3;
    } else { // 11110xxx
      x = x & 0x07;
      if (i + 2 >= end) {
        return EOF;
      }
      x = x << 6 | (b[i++] & 0x3F);
      x = x << 6 | (b[i++] & 0x3F);
      x = x << 6 | (b[i++] & 0x3F);
      return x << 3 | 4;
    }
  }

  @Override
  boolean canCheckPrefix() {
    return true;
  }

  // Searches for re2.prefixUTF8 via Utils.indexOf, starting at |pos|, and
  // converts a non-negative hit back into an offset relative to |pos|.
  // NOTE(review): presumably Utils.indexOf scans all of |b| and is not
  // bounded by |end| -- confirm against Utils.
  @Override
  int index(RE2 re2, int pos) {
    pos += start;
    int i = Utils.indexOf(b, re2.prefixUTF8, pos);
    return i < 0 ? i : i - pos;
  }

  // Computes the EMPTY_* flags at relative position |pos| from the rune that
  // ends just before |pos| (r1) and the rune that starts at |pos| (r2); -1
  // stands for "no rune" on either side.
  @Override
  int context(int pos) {
    pos += start; // from here on |pos| is an absolute index into |b|
    int r1 = -1;
    if (pos > start && pos <= end) {
      int runeStart = pos - 1;
      // Zero extend: a raw byte is signed, so without the mask a non-ASCII
      // byte would be negative and the multi-byte branch below could never
      // run.
      r1 = b[runeStart--] & 0xff;
      if (r1 >= 0x80) { // decode UTF-8
        // Find the lead byte, up to 4 bytes earlier.
        int lim = pos - 4;
        if (lim < start) {
          lim = start;
        }
        while (runeStart >= lim && (b[runeStart] & 0xC0) == 0x80) { // 10xxxxxx
          runeStart--;
        }
        if (runeStart < start) {
          runeStart = start;
        }
        // step() expects a position relative to |start|.
        r1 = step(runeStart - start) >> 3;
      }
    }
    // step() expects a position relative to |start|.
    int r2 = pos < end ? (step(pos - start) >> 3) : -1;
    return Utils.emptyOpContext(r1, r2);
  }

  // Returns |end|, the absolute index into |b| one past the last input byte.
  @Override
  int endPos() {
    return end;
  }
}
// An implementation of MachineInput for UTF-16 char sequences.
// |pos| and |width| are in Java "char" units; positions passed to step(),
// index() and context() are relative to |start|.
private static class UTF16Input extends MachineInput {
  final CharSequence str;
  // Absolute index into |str| of the first char of the input.
  final int start;
  // Absolute index into |str| one past the last char of the input.
  final int end;

  public UTF16Input(CharSequence str, int start, int end) {
    this.str = str;
    this.start = start;
    this.end = end;
  }

  // Returns codePoint << 3 | charCount for the code point at relative
  // position |pos|, or EOF when |pos| is at or past |end|.
  @Override
  int step(int pos) {
    pos += start;
    if (pos < end) {
      int rune = Character.codePointAt(str, pos);
      return rune << 3 | Character.charCount(rune);
    } else {
      return EOF;
    }
  }

  @Override
  boolean canCheckPrefix() {
    return true;
  }

  // Searches |str| for re2.prefix starting at |pos| and converts a
  // non-negative hit back into an offset relative to |pos|.
  @Override
  int index(RE2 re2, int pos) {
    pos += start;
    int i = indexOf(str, re2.prefix, pos);
    return i < 0 ? i : i - pos;
  }

  // Computes the EMPTY_* flags at relative position |pos| from the code
  // points immediately before and at |pos|; -1 stands for "no code point".
  // NOTE(review): the bounds used here are 0 and str.length(), not |start|
  // and |end|, so the context can see chars outside [start, end). This looks
  // deliberate, but confirm before changing it.
  @Override
  int context(int pos) {
    pos += start;
    int r1 = pos > 0 && pos <= str.length() ? Character.codePointBefore(str, pos) : -1;
    int r2 = pos < str.length() ? Character.codePointAt(str, pos) : -1;
    return Utils.emptyOpContext(r1, r2);
  }

  // Returns |end|, the absolute index into |str| one past the last input char.
  @Override
  int endPos() {
    return end;
  }

  // Dispatches to the built-in indexOf of String and StringBuilder, and
  // falls back to a generic char-by-char search for any other CharSequence.
  private int indexOf(CharSequence hayStack, String needle, int pos) {
    if (hayStack instanceof String) {
      return ((String) hayStack).indexOf(needle, pos);
    }
    if (hayStack instanceof StringBuilder) {
      return ((StringBuilder) hayStack).indexOf(needle, pos);
    }
    return indexOfFallback(hayStack, needle, pos);
  }

  // Modified version of {@link String#indexOf(String, int)} that allows a CharSequence.
  // Returns the index of the first occurrence of |needle| in |hayStack| at
  // or after |fromIndex|, or -1 if there is none.
  private int indexOfFallback(CharSequence hayStack, String needle, int fromIndex) {
    final int hayLength = hayStack.length();
    final int needleLength = needle.length();
    if (fromIndex >= hayLength) {
      // String.indexOf reports an empty needle at the end of the haystack,
      // not at index 0; mirror that so every CharSequence type agrees.
      return needle.isEmpty() ? hayLength : -1;
    }
    if (fromIndex < 0) {
      fromIndex = 0;
    }
    if (needle.isEmpty()) {
      return fromIndex;
    }

    char first = needle.charAt(0);
    int max = hayLength - needleLength;

    for (int i = fromIndex; i <= max; i++) {
      /* Look for first character. */
      if (hayStack.charAt(i) != first) {
        while (++i <= max && hayStack.charAt(i) != first) {}
      }

      /* Found first character, now look at the rest of the needle. */
      if (i <= max) {
        int j = i + 1;
        // Named matchEnd so it does not shadow the |end| field.
        int matchEnd = j + needleLength - 1;
        for (int k = 1; j < matchEnd && hayStack.charAt(j) == needle.charAt(k); j++, k++) {}

        if (j == matchEnd) {
          /* Found whole string. */
          return i;
        }
      }
    }
    return -1;
  }
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy