com.google.re2j.CharClass Maven / Gradle / Ivy
The newest version!
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Original Go source here:
// http://code.google.com/p/go/source/browse/src/pkg/regexp/syntax/parse.go
package com.google.re2j;
/**
 * A "builder"-style helper class for manipulating character classes
 * represented as an array of pairs of runes [lo, hi], each denoting an
 * inclusive interval.
 *
 * <p>The {@code append*}, {@code cleanClass} and {@code negateClass} methods
 * mutate the internal state and return {@code this}, allowing operations to
 * be chained. {@code toArray} and {@code toString} only read the state.
 */
class CharClass {

  private int[] r; // inclusive ranges, pairs of [lo,hi]. r.length is even.
  private int len; // prefix of |r| that is defined. Even.

  // Constructs a CharClass with initial ranges |r|.
  // The right to mutate |r| is passed to the callee: the array is stored
  // without copying and later operations write into it.
  CharClass(int[] r) {
    this.r = r;
    this.len = r.length;
  }

  // Constructs an empty CharClass.
  CharClass() {
    this.r = Utils.EMPTY_INTS;
    this.len = 0;
  }

  // After a call to ensureCapacity(), |r.length| is at least |newLen|.
  // Only the defined prefix r[0..len) is carried over to a new array.
  private void ensureCapacity(int newLen) {
    if (r.length >= newLen) {
      return; // already large enough
    }
    // Expand by at least doubling, except when len == 0.
    // TODO(adonovan): opt: perhaps it would be better to allocate exactly
    // newLen, since the number of expansions is typically very small?
    int capacity = Math.max(newLen, len * 2);
    int[] grown = new int[capacity];
    System.arraycopy(r, 0, grown, 0, len);
    r = grown;
  }

  // Returns the character class as an int array of exactly |len| elements.
  // When the backing array is exactly full it is returned directly (no copy),
  // so subsequent CharClass operations may mutate the returned array;
  // typically this is the last operation performed on a given CharClass
  // instance.
  int[] toArray() {
    if (len == r.length) {
      return r;
    }
    int[] copy = new int[len];
    System.arraycopy(r, 0, copy, 0, len);
    return copy;
  }

  // cleanClass() sorts the ranges (pairs of elements) of this CharClass,
  // merges them, and eliminates duplicates.
  // Classes with fewer than two ranges are returned unchanged.
  CharClass cleanClass() {
    if (len < 4) {
      return this;
    }

    // Sort by lo increasing, hi decreasing to break ties.
    qsortIntPair(r, 0, len - 2);

    // Merge abutting, overlapping. The first range stays in place; every
    // later range is either folded into the last written range or copied
    // down to the write position.
    int w = 2; // write index
    for (int i = 2; i < len; i += 2) {
      int lo = r[i];
      int hi = r[i + 1];
      if (lo <= r[w - 1] + 1) {
        // Overlaps or abuts the previous range: merge by extending its hi.
        if (hi > r[w - 1]) {
          r[w - 1] = hi;
        }
        continue;
      }
      // New disjoint range.
      r[w] = lo;
      r[w + 1] = hi;
      w += 2;
    }
    len = w;

    return this;
  }

  // appendLiteral() appends the literal |x| to this CharClass.
  // If |flags| has RE2.FOLD_CASE set, the case-folding equivalents of |x|
  // are appended as well.
  CharClass appendLiteral(int x, int flags) {
    if ((flags & RE2.FOLD_CASE) != 0) {
      return appendFoldedRange(x, x);
    }
    return appendRange(x, x);
  }

  // appendRange() appends the range [lo-hi] (inclusive) to this CharClass.
  CharClass appendRange(int lo, int hi) {
    // Expand last range or next to last range if it overlaps or abuts.
    // Checking two ranges helps when appending case-folded
    // alphabets, so that one range can be expanding A-Z and the
    // other expanding a-z.
    // i is the distance back from |len|: i=2 is the last range, i=4 the
    // next to last. Ranges that do not exist (i > len) are skipped.
    for (int i = 2; i <= 4 && i <= len; i += 2) {
      int rlo = r[len - i];
      int rhi = r[len - i + 1];
      if (lo <= rhi + 1 && rlo <= hi + 1) {
        // Widen the existing range in place to cover [lo, hi].
        if (lo < rlo) {
          r[len - i] = lo;
        }
        if (hi > rhi) {
          r[len - i + 1] = hi;
        }
        return this;
      }
    }

    // Can't coalesce; append. Expand capacity by doubling as needed.
    ensureCapacity(len + 2);
    r[len++] = lo;
    r[len++] = hi;
    return this;
  }

  // appendFoldedRange() appends the range [lo-hi] and its case
  // folding-equivalent runes to this CharClass.
  CharClass appendFoldedRange(int lo, int hi) {
    // Optimizations.
    if (lo <= Unicode.MIN_FOLD && hi >= Unicode.MAX_FOLD) {
      // Range is full: folding can't add more.
      return appendRange(lo, hi);
    }
    if (hi < Unicode.MIN_FOLD || lo > Unicode.MAX_FOLD) {
      // Range is outside folding possibilities.
      return appendRange(lo, hi);
    }
    if (lo < Unicode.MIN_FOLD) {
      // [lo, minFold-1] needs no folding.
      appendRange(lo, Unicode.MIN_FOLD - 1);
      lo = Unicode.MIN_FOLD;
    }
    if (hi > Unicode.MAX_FOLD) {
      // [maxFold+1, hi] needs no folding.
      appendRange(Unicode.MAX_FOLD + 1, hi);
      hi = Unicode.MAX_FOLD;
    }

    // Brute force. Depend on appendRange to coalesce ranges on the fly.
    // For each rune, follow the simpleFold chain until it returns to the
    // starting rune, appending every rune visited.
    for (int c = lo; c <= hi; c++) {
      appendRange(c, c);
      for (int f = Unicode.simpleFold(c); f != c; f = Unicode.simpleFold(f)) {
        appendRange(f, f);
      }
    }
    return this;
  }

  // appendClass() appends the class |x| to this CharClass.
  // It assumes |x| is clean. Does not mutate |x|.
  CharClass appendClass(int[] x) {
    for (int i = 0; i < x.length; i += 2) {
      appendRange(x[i], x[i + 1]);
    }
    return this;
  }

  // appendFoldedClass() appends the case folding of the class |x| to this
  // CharClass. Does not mutate |x|.
  CharClass appendFoldedClass(int[] x) {
    for (int i = 0; i < x.length; i += 2) {
      appendFoldedRange(x[i], x[i + 1]);
    }
    return this;
  }

  // appendNegatedClass() appends the negation of the class |x| to this
  // CharClass. It assumes |x| is clean. Does not mutate |x|.
  CharClass appendNegatedClass(int[] x) {
    int nextLo = 0; // lo end of the next gap to add
    for (int i = 0; i < x.length; i += 2) {
      int lo = x[i];
      int hi = x[i + 1];
      // Append the gap between the previous range and this one, if any.
      if (nextLo <= lo - 1) {
        appendRange(nextLo, lo - 1);
      }
      nextLo = hi + 1;
    }
    // Append the tail from the end of the last range up to MAX_RUNE.
    if (nextLo <= Unicode.MAX_RUNE) {
      appendRange(nextLo, Unicode.MAX_RUNE);
    }
    return this;
  }

  // appendTable() appends the Unicode range table |table| to this CharClass.
  // Each entry is read as a triple {lo, hi, stride}. Does not mutate |table|.
  CharClass appendTable(int[][] table) {
    for (int[] triple : table) {
      int lo = triple[0];
      int hi = triple[1];
      int stride = triple[2];
      if (stride == 1) {
        // Contiguous run: append as a single range.
        appendRange(lo, hi);
        continue;
      }
      // Strided run: append each member rune individually.
      for (int c = lo; c <= hi; c += stride) {
        appendRange(c, c);
      }
    }
    return this;
  }

  // appendNegatedTable() returns the result of appending the negation of range
  // table |table| to this CharClass. Each entry is read as a triple
  // {lo, hi, stride}. Does not mutate |table|.
  CharClass appendNegatedTable(int[][] table) {
    int nextLo = 0; // lo end of next class to add
    for (int[] triple : table) {
      int lo = triple[0];
      int hi = triple[1];
      int stride = triple[2];
      if (stride == 1) {
        // Contiguous run: the gap before it is [nextLo, lo-1].
        if (nextLo <= lo - 1) {
          appendRange(nextLo, lo - 1);
        }
        nextLo = hi + 1;
        continue;
      }
      // Strided run: each member rune is excluded individually, so the
      // gaps between members are appended.
      for (int c = lo; c <= hi; c += stride) {
        if (nextLo <= c - 1) {
          appendRange(nextLo, c - 1);
        }
        nextLo = c + 1;
      }
    }
    if (nextLo <= Unicode.MAX_RUNE) {
      appendRange(nextLo, Unicode.MAX_RUNE);
    }
    return this;
  }

  // appendTableWithSign() calls append{,Negated}Table depending on sign:
  // the negated form when |sign| is negative, the plain form otherwise.
  // Does not mutate |table|.
  CharClass appendTableWithSign(int[][] table, int sign) {
    if (sign < 0) {
      return appendNegatedTable(table);
    }
    return appendTable(table);
  }

  // negateClass() negates this CharClass, which must already be clean.
  // The negation is written in place over the existing ranges.
  CharClass negateClass() {
    int nextLo = 0; // lo end of next class to add
    int w = 0; // write index; never runs ahead of the read index i
    for (int i = 0; i < len; i += 2) {
      int lo = r[i];
      int hi = r[i + 1];
      if (nextLo <= lo - 1) {
        r[w] = nextLo;
        r[w + 1] = lo - 1;
        w += 2;
      }
      nextLo = hi + 1;
    }
    len = w;

    if (nextLo <= Unicode.MAX_RUNE) {
      // It's possible for the negation to have one more
      // range - this one - than the original class, so use append.
      ensureCapacity(len + 2);
      r[len++] = nextLo;
      r[len++] = Unicode.MAX_RUNE;
    }
    return this;
  }

  // appendClassWithSign() calls appendNegatedClass() if |sign| is negative
  // and appendClass() otherwise. Does not mutate |x|.
  CharClass appendClassWithSign(int[] x, int sign) {
    if (sign < 0) {
      return appendNegatedClass(x);
    }
    return appendClass(x);
  }

  // appendGroup() appends CharGroup |g| to this CharClass, folding iff
  // |foldCase|. Does not mutate |g|.
  CharClass appendGroup(CharGroup g, boolean foldCase) {
    int[] cls = g.cls;
    if (foldCase) {
      // Build the folded, cleaned class in a scratch CharClass so that
      // g.cls itself is left untouched.
      cls = new CharClass()
          .appendFoldedClass(cls)
          .cleanClass()
          .toArray();
    }
    return appendClassWithSign(cls, g.sign);
  }

  // cmp() returns the ordering of the pair (a[i], a[i+1]) relative to
  // (pivotFrom, pivotTo), where the first component of the pair (lo) is
  // ordered naturally and the second component (hi) is in reverse order.
  // The result is negative, zero or positive; only its sign is meaningful.
  //
  // Explicit comparisons are used instead of subtraction so that the sign
  // is correct even for operands whose difference would overflow an int.
  private static int cmp(int[] array, int i, int pivotFrom, int pivotTo) {
    int lo = array[i];
    if (lo != pivotFrom) {
      return lo < pivotFrom ? -1 : 1;
    }
    int hi = array[i + 1];
    if (hi == pivotTo) {
      return 0;
    }
    // Reverse order on hi: a larger hi sorts first.
    return hi > pivotTo ? -1 : 1;
  }

  // qsortIntPair() quicksorts the pairs of ints in |array| between the pair
  // starting at index |left| and the pair starting at index |right|
  // (both inclusive), according to cmp().
  // Precondition: |left| and |right| must be even. Every call made within
  // this class satisfies left < right.
  private static void qsortIntPair(int[] array, int left, int right) {
    // Midpoint rounded down to an even index so that it addresses the start
    // of a pair. The unsigned shift avoids overflow of left + right.
    int pivotIndex = ((left + right) >>> 1) & ~1;
    int pivotFrom = array[pivotIndex];
    int pivotTo = array[pivotIndex + 1];
    int i = left;
    int j = right;

    while (i <= j) {
      // Advance i past pairs that sort before the pivot.
      while (i < right && cmp(array, i, pivotFrom, pivotTo) < 0) {
        i += 2;
      }
      // Retreat j past pairs that sort after the pivot.
      while (j > left && cmp(array, j, pivotFrom, pivotTo) > 0) {
        j -= 2;
      }
      if (i <= j) {
        if (i != j) {
          // Swap the pairs at i and j, both components.
          int temp = array[i];
          array[i] = array[j];
          array[j] = temp;
          temp = array[i + 1];
          array[i + 1] = array[j + 1];
          array[j + 1] = temp;
        }
        i += 2;
        j -= 2;
      }
    }

    // Recurse into each partition that holds more than one pair.
    if (left < j) {
      qsortIntPair(array, left, j);
    }
    if (i < right) {
      qsortIntPair(array, i, right);
    }
  }

  // Formats the first |len| elements of |r| as space-separated hexadecimal
  // ranges in square brackets, e.g. "[0x41-0x5a 0x5f]". A range whose lo
  // and hi are equal is printed as a single value.
  // Exposed, since useful for debugging CharGroups too.
  static String charClassToString(int[] r, int len) {
    StringBuilder b = new StringBuilder();
    b.append('[');
    for (int i = 0; i < len; i += 2) {
      if (i > 0) {
        b.append(' ');
      }
      int lo = r[i];
      int hi = r[i + 1];
      // Avoid String.format (not available on GWT).
      // Cf. https://code.google.com/p/google-web-toolkit/issues/detail?id=3945
      b.append("0x");
      b.append(Integer.toHexString(lo));
      if (lo != hi) {
        b.append("-0x");
        b.append(Integer.toHexString(hi));
      }
    }
    b.append(']');
    return b.toString();
  }

  // Returns the defined ranges of this CharClass in the format produced by
  // charClassToString().
  @Override
  public String toString() {
    return charClassToString(r, len);
  }
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy