// com.basistech.tclre.Runtime -- Maven / Gradle / Ivy
// Source listing from the tcl-regex artifact (all versions and documentation are published alongside it).
/*
* Copyright 2014 Basis Technology Corp.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.basistech.tclre;
import java.util.Arrays;
import java.util.EnumSet;
import java.util.List;
import com.google.common.collect.Lists;
/**
* The internal implementation of matching.
*/
class Runtime {
/* Retry-memory states recorded per alternation node by caltdissect. */
private static final int UNTRIED = 0; /* not yet tried at all */
private static final int TRYING = 1; /* top matched, trying submatches */
private static final int TRIED = 2; /* top didn't match or submatches exhausted */

/* Internals of the pattern being executed; assigned from re.guts in exec. */
Guts g;
/* Bitwise OR of the Flags.REG_* execution bits translated from the ExecFlags given to exec. */
int eflags;
/*
 * Match results. Slot 0 receives the bounds of the whole match; slot n receives the
 * bounds of capturing subexpression n. A null slot means nothing was recorded.
 * The element type is spelled out so that reads such as match.get(n).start type-check.
 */
List<RegMatch> match;
/* The text being matched. */
CharSequence data;
int dataLength; // cache this, it gets examined _a lot_.
/* The pattern handed to the most recent exec call. */
private HsrePattern re;
private int[] mem; // backtracking: per-node retry memory, indexed by RuntimeSubexpression.retry.
/**
 * exec - match regular expression.
 *
 * Resets this runtime's per-execution state (flags, data, match list, retry memory)
 * and then runs the plain matcher or, when the pattern uses back references
 * (REG_UBACKREF), the backtracking matcher.
 *
 * @param re the compiled pattern to execute
 * @param data the text to match against
 * @param execFlags execution options; each one is translated to its Flags.REG_* bit
 * @return true if a match was found; its bounds are then available in {@code match}
 * @throws RegexException if the pattern's info carries REG_UIMPOSSIBLE
 * @throws RuntimeException if execFlags contains a value this method does not know
 */
boolean exec(HsrePattern re, CharSequence data, EnumSet<ExecFlags> execFlags) throws RegexException {
    /* setup */
    if (0 != (re.guts.info & Flags.REG_UIMPOSSIBLE)) {
        throw new RegexException("Regex marked impossible");
    }

    eflags = 0;
    for (ExecFlags ef : execFlags) {
        switch (ef) {
        case NOTBOL:
            eflags |= Flags.REG_NOTBOL;
            break;
        case NOTEOL:
            eflags |= Flags.REG_NOTEOL;
            break;
        case LOOKING_AT:
            eflags |= Flags.REG_LOOKING_AT;
            break;
        default:
            throw new RuntimeException("impossible exec flag");
        }
    }

    this.re = re;
    this.g = re.guts;
    this.data = data;
    this.dataLength = this.data.length();

    /* reuse the match list across executions when there is one */
    if (this.match != null) {
        this.match.clear();
    } else {
        this.match = Lists.newArrayList();
    }
    match.add(null); // make room for 1.
    if (0 != (g.info & Flags.REG_UBACKREF)) {
        /* back references read capture slots by index, so every slot must exist up front */
        while (match.size() < g.nsub + 1) {
            match.add(null);
        }
    }

    /* reuse the retry memory when it is big enough, otherwise allocate it */
    if (mem != null && mem.length >= g.ntree) {
        Arrays.fill(mem, 0);
    } else {
        mem = new int[g.ntree];
    }

    /* do it */
    assert g.tree != null;
    if (0 != (g.info & Flags.REG_UBACKREF)) {
        return cfind(g.tree.machine);
    } else {
        return find(g.tree.machine);
    }
}
/**
 * find - find a match for the main NFA (no-complications case)
 *
 * First, it runs the 'search machine' in non-greedy mode (shortest). The 'search machine' is the NFA with
 * .* added to the front and the back, more or less. See 'makesearch'. In many cases, running the search machine
 * 'shortest' is considerably faster than running the actual regular expression in greedy (longest). So,
 * even though this may seem like it would take extra time doing an extra scan, the alternative is much slower.
 * The search machine tells you if the expression can be found anywhere, and, if so, where the furthest possible
 * end is.
 *
 * If the search machine succeeds, this method then iterates to find the exact bounds;
 * the loop uses 'longest' or 'shortest' as appropriate to the flags. In C, there was an option to
 * _only_ run the search machine and return a simple boolean with no bounds.
 * We have no API for that via an ExecFlag.
 *
 * If the top-level API call is 'lookingAt', we never want to scan down the data looking for matches. But 'shortest'
 * can still be much faster than 'longest'. So, the code runs the original machine first. If non-greedy expressions
 * were very common, I suppose that it would be faster to omit this step in that case. Thereafter, the loop has
 * a check to bail if there is no match at the beginning of the data, which is the constraint of lookingAt.
 *
 * @param cnfa the compiled NFA of the whole expression
 * @return true if a match was found (slot 0 of {@code match} then holds its bounds) and,
 *         when the pattern has subexpressions, dissection of the match succeeded
 */
boolean find(Cnfa cnfa) {
    final boolean shorter = 0 != (g.tree.flags & Subre.SHORTER);
    final boolean lookingAt = 0 != (eflags & Flags.REG_LOOKING_AT);
    final int stop = data.length();
    final int[] coldp = new int[1];
    final Dfa d = new Dfa(this, cnfa);

    final int open; /* open and close of range of possible starts */
    final int close;
    if (lookingAt) {
        /*
         * shortest is faster than longest. So, we want to check with it.
         * However, since we aren't making a 'search re' with an extra .* on
         * the front, we don't add an extra requirement to make progress on the
         * very first arc. If the expression has something like a* at the front,
         * it can 'no-progress' consuming the a characters.
         * All of this casts doubts on the 'requireInitialProgress' feature -- at all.
         * These initial calls to shortest should be all the opportunity we need
         * to do 'lookingAt'.
         */
        close = d.shortest(0, 0, stop, coldp, null);
        open = 0;
    } else {
        /* First, a shot with the search RE. */
        Dfa s = new Dfa(this, g.search);
        close = s.shortest(0, 0, stop, coldp, null);
        open = coldp[0];
    }
    if (close == -1) { /* not found */
        return false;
    }

    /* find starting point and match */
    int begin;
    int end = -1;
    for (begin = open; begin <= close; begin++) {
        /*
         * if LOOKING_AT, we can't validly have a 'begin' after 'open'.
         * I'm not sure this test can even ever go off, since the 'shortest' test
         * up above should accomplish the same thing.
         */
        if (begin > 0 && lookingAt) {
            return false;
        }
        /* the hit-end indication is not consumed here, so none is requested */
        if (shorter) {
            end = d.shortest(begin, begin, stop, null, null);
        } else {
            end = d.longest(begin, stop, null);
        }
        if (end != -1) { /* success */
            break; /* NOTE BREAK OUT */
        }
    }
    if (end == -1) {
        return false;
    }

    /* and pin down details */
    match.set(0, new RegMatch(begin, end));
    // with no subexpressions there is no need to do the dissection work.
    return re.nsub <= 0 || dissect(g.tree, begin, end);
}
/**
 * cfind - find a match for the main NFA (with complications).
 *
 * Builds one DFA for the search machine and one for the expression itself,
 * then hands both to cfindloop.
 *
 * @param cnfa the compiled NFA of the whole expression
 * @return whatever cfindloop reports
 */
private boolean cfind(Cnfa cnfa) {
    final Dfa searchDfa = new Dfa(this, g.search);
    final Dfa exprDfa = new Dfa(this, cnfa);
    /* the one-element array receives the cold start point; it is not inspected here */
    final int[] coldOut = new int[1];
    return cfindloop(exprDfa, searchDfa, coldOut);
}
/**
 * cfindloop - the heart of cfind.
 *
 * Repeatedly asks a DFA for the end of a promising match range (the search DFA normally,
 * the expression DFA itself in lookingAt mode), then tries every start position in that
 * range, using cdissect to confirm each candidate match.
 *
 * After a range has been exhausted without success, the next search starts one position
 * past {@code close}. Re-searching from the same position would repeat the same work
 * forever whenever the DFA keeps reporting the same range. In lookingAt mode the only
 * admissible start is position 0, so an exhausted range ends the search.
 *
 * @param d DFA for the expression itself
 * @param s DFA for the search machine
 * @param coldp one-element out-parameter; receives the cold start point on the exits
 *              that set it (success, "no point in trying again", and loop exhaustion)
 * @return true if a match was found and dissected; slot 0 of {@code match} holds its bounds
 */
private boolean cfindloop(Dfa d, Dfa s, int[] coldp) {
    assert d != null && s != null;
    final boolean shorter = 0 != (g.tree.flags & Subre.SHORTER);
    final boolean lookingAt = 0 != (eflags & Flags.REG_LOOKING_AT);
    final boolean[] hitend = new boolean[1];
    final int stop = data.length();
    /* the tree does not change while matching, so size the retry memory once */
    final int memSize = getMaxSubno(g.tree, 0) + 1;

    int cold;
    int close = 0;
    for (;;) {
        int[] cold0 = new int[1];
        /*
         * Call search NFA to see if this is possible at all.
         */
        if (lookingAt) {
            // in the looking at case, we use the un-search-ified RE.
            close = d.shortest(close, close, stop, cold0, null);
            cold = 0;
        } else {
            close = s.shortest(close, close, stop, cold0, null);
            cold = cold0[0];
        }
        if (close == -1) {
            break; /* NOTE BREAK */
        }
        assert cold != -1;
        final int open = cold; /* open and close of range of possible starts */
        cold = -1;

        for (int begin = open; begin <= close; begin++) {
            if (begin > 0 && lookingAt) {
                // Is this possible given the looking-at constraint in the call to shortest above?
                return false;
            }
            int estart = begin;
            int estop = stop;
            for (;;) {
                final int end;
                if (shorter) {
                    end = d.shortest(begin, estart, estop, null, hitend);
                } else {
                    end = d.longest(begin, estop, hitend);
                }
                if (hitend[0] && cold == -1) {
                    cold = begin;
                }
                if (end == -1) {
                    break; /* NOTE BREAK OUT */
                }

                /* start each dissection attempt from a clean slate */
                for (int x = 0; x < match.size(); x++) {
                    match.set(x, null);
                }
                mem = new int[memSize];

                if (cdissect(g.tree, begin, end)) {
                    // indicate the full match bounds.
                    match.set(0, new RegMatch(begin, end));
                    coldp[0] = cold;
                    return true;
                }
                if (shorter ? end == estop : end == begin) {
                    /* no point in trying again */
                    coldp[0] = cold;
                    return false;
                }
                /* go around and try again */
                if (shorter) {
                    estart = end + 1;
                } else {
                    estop = end - 1;
                }
            }
        }

        /*
         * Nothing starting at or before close matched. A lookingAt match may only
         * start at 0, which has now been tried, and once close has reached the end
         * of the data there is nothing further to search.
         */
        if (lookingAt || close >= stop) {
            break;
        }
        /* guarantee progress: resume the search beyond the exhausted range */
        close++;
    }
    coldp[0] = cold;
    return false;
}
/**
 * subset - record the bounds of a successfully matched capturing subexpression.
 *
 * The match list is padded with nulls as needed so that the slot for the
 * subexpression's number exists before it is written.
 *
 * @param sub the capturing subexpression; its number must be positive
 * @param begin start of the matched span
 * @param end end of the matched span
 */
private void subset(RuntimeSubexpression sub, int begin, int end) {
    final int slot = sub.number;
    assert slot > 0;
    for (int missing = (slot + 1) - match.size(); missing > 0; missing--) {
        match.add(null);
    }
    match.set(slot, new RegMatch(begin, end));
}
/**
 * dissect - determine subexpression matches (uncomplicated case).
 *
 * Dispatches on the node's operator; capturing nodes record their span and
 * then recurse into their single child.
 *
 * @param t the subexpression tree node to dissect
 * @param begin start of the span matched by t
 * @param end end of the span matched by t
 * @return the result of the operator-specific dissection (true for terminal nodes)
 * @throws RuntimeException for a back-reference node or an unknown operator
 */
private boolean dissect(RuntimeSubexpression t, int begin, int end) {
    if (t.op == '=') {
        /* terminal node: no action, the parent did the work */
        assert t.left == null && t.right == null;
        return true;
    }
    if (t.op == '|') {
        /* alternation */
        assert t.left != null;
        return altdissect(t, begin, end);
    }
    if (t.op == 'b') {
        /* back ref -- the uncomplicated path must never see one */
        throw new RuntimeException("impossible backref");
    }
    if (t.op == '.') {
        /* concatenation */
        assert t.left != null && t.right != null;
        return condissect(t, begin, end);
    }
    if (t.op == '(') {
        /* capturing: note the span, then descend */
        assert t.left != null && t.right == null;
        assert t.number > 0;
        subset(t, begin, end);
        return dissect(t.left, begin, end);
    }
    throw new RuntimeException("Impossible op");
}
/**
 * condissect - determine concatenation subexpression matches (uncomplicated).
 *
 * Looks for a midpoint such that the left child matches [begin, mid) and the
 * right child's longest match from mid reaches exactly end, then dissects both halves.
 * Midpoints are tried shortest-first when the left child is flagged SHORTER,
 * longest-first otherwise.
 *
 * @param t a concatenation node
 * @param begin start of the span matched by t
 * @param end end of the span matched by t
 * @return true if both halves dissect successfully
 * @throws RuntimeException if no workable midpoint exists
 */
private boolean condissect(RuntimeSubexpression t, int begin, int end) {
    assert t.op == '.';
    assert t.left != null && t.left.machine.states.length > 0;
    assert t.right != null && t.right.machine.states.length > 0;

    final boolean preferShort = (t.left.flags & Subre.SHORTER) != 0;
    /* the last midpoint worth trying in the chosen scan direction */
    final int lastCandidate = preferShort ? end : begin;
    final Dfa leftDfa = new Dfa(this, t.left.machine);
    final Dfa rightDfa = new Dfa(this, t.right.machine);

    /* pick a tentative midpoint */
    int split = preferShort
            ? leftDfa.shortest(begin, begin, end, null, null)
            : leftDfa.longest(begin, end, null);
    if (split == -1) {
        throw new RuntimeException("Impossible mid.");
    }

    /* iterate until satisfaction or failure */
    for (;;) {
        if (rightDfa.longest(split, end, null) == end) {
            break; /* satisfaction */
        }
        /* that midpoint didn't work, find a new one */
        if (split == lastCandidate) {
            /* all possibilities exhausted! */
            throw new RuntimeException("no midpoint");
        }
        split = preferShort
                ? leftDfa.shortest(begin, split + 1, end, null, null)
                : leftDfa.longest(begin, split - 1, null);
        if (split == -1) {
            throw new RuntimeException("Failed midpoint");
        }
    }

    return dissect(t.left, begin, split) && dissect(t.right, split, end);
}
/**
 * altdissect - determine alternative subexpression matches (uncomplicated).
 *
 * Walks the chain of alternation nodes (linked through {@code right}) and dissects
 * the first branch whose longest match over the span ends exactly at end.
 *
 * @param t the first alternation node of the chain
 * @param begin start of the span matched by t
 * @param end end of the span matched by t
 * @return the dissection result of the first branch that covers the span
 * @throws RuntimeException if no branch covers the span
 */
private boolean altdissect(RuntimeSubexpression t, int begin, int end) {
    assert t != null;
    assert t.op == '|';
    RuntimeSubexpression branch = t;
    while (branch != null) {
        assert branch.left != null && branch.left.machine.states.length > 0;
        final Dfa candidate = new Dfa(this, branch.left.machine);
        if (candidate.longest(begin, end, null) == end) {
            return dissect(branch.left, begin, end);
        }
        branch = branch.right;
    }
    throw new RuntimeException("none matched");
}
/**
 * cdissect - determine subexpression matches (with complications).
 *
 * Dispatches on the node's operator to the backtracking-aware dissectors.
 * A capturing node records its span only after its child has dissected successfully.
 *
 * @param t the subexpression tree node to dissect
 * @param begin start of the candidate span
 * @param end end of the candidate span
 * @return true if the node (and everything below it) matches the span
 * @throws RuntimeException for an unknown operator
 */
private boolean cdissect(RuntimeSubexpression t, int begin, int end) {
    assert t != null;
    switch (t.op) {
    case '=': {
        /* terminal node: no action, the parent did the work */
        assert t.left == null && t.right == null;
        return true;
    }
    case '|': {
        /* alternation */
        assert t.left != null;
        return caltdissect(t, begin, end);
    }
    case 'b': {
        /* back reference */
        assert t.left == null && t.right == null;
        return cbrdissect(t, begin, end);
    }
    case '.': {
        /* concatenation */
        assert t.left != null && t.right != null;
        return ccondissect(t, begin, end);
    }
    case '(': {
        /* capturing */
        assert t.left != null && t.right == null;
        assert t.number > 0;
        if (!cdissect(t.left, begin, end)) {
            return false;
        }
        subset(t, begin, end);
        return true;
    }
    default:
        throw new RuntimeException("Impossible op");
    }
}
/**
 * ccondissect - concatenation subexpression matches (with complications).
 *
 * The retry memory stores the offset of the trial midpoint from begin,
 * plus 1 so that 0 uniquely means "clean slate". Midpoints are tried
 * longest-first; a left child flagged SHORTER is delegated to crevdissect.
 *
 * @param t a concatenation node
 * @param begin start of the candidate span
 * @param end end of the candidate span
 * @return true if some midpoint lets both children dissect successfully
 */
private boolean ccondissect(RuntimeSubexpression t, int begin, int end) {
    assert t.op == '.';
    assert t.left != null && t.left.machine.states.length > 0;
    assert t.right != null && t.right.machine.states.length > 0;

    if ((t.left.flags & Subre.SHORTER) != 0) { /* reverse scan */
        return crevdissect(t, begin, end);
    }

    final Dfa leftDfa = new Dfa(this, t.left.machine);
    final Dfa rightDfa = new Dfa(this, t.right.machine);

    /* pick a tentative midpoint: resume a remembered one, else start from the longest */
    int split;
    final int remembered = mem[t.retry];
    if (remembered != 0) {
        split = begin + (remembered - 1);
    } else {
        split = leftDfa.longest(begin, end, null);
        if (split == -1) {
            return false;
        }
        mem[t.retry] = (split - begin) + 1;
    }

    /* iterate until satisfaction or failure; each pass tries the current midpoint on for size */
    while (!(cdissect(t.left, begin, split)
            && rightDfa.longest(split, end, null) == end
            && cdissect(t.right, split, end))) {
        /* that midpoint didn't work, find a new one */
        if (split == begin) {
            /* all possibilities exhausted */
            return false;
        }
        split = leftDfa.longest(begin, split - 1, null);
        if (split == -1) {
            /* failed to find a new one */
            return false;
        }
        mem[t.retry] = (split - begin) + 1;
        zapmem(t.left);
        zapmem(t.right);
    }

    /* satisfaction */
    return true;
}
/**
 * zapmem - reset the retry memory, and any recorded capture, for a subtree.
 *
 * Called before a subtree is re-dissected with a new midpoint. Besides zeroing the
 * node's retry slot, a capturing node's match slot is cleared so that a capture
 * recorded during the abandoned attempt cannot be seen by a later back reference
 * or reported in the final result.
 *
 * @param t root of the subtree to reset; must not be null
 */
private void zapmem(RuntimeSubexpression t) {
    mem[t.retry] = 0;
    /* make sure the slot addressed by this node's number exists */
    while (match.size() < t.number + 1) {
        match.add(null);
    }
    if (t.op == '(') {
        /* forget the capture made by the abandoned attempt */
        assert t.number > 0;
        match.set(t.number, null);
    }
    if (t.left != null) {
        zapmem(t.left);
    }
    if (t.right != null) {
        zapmem(t.right);
    }
}
/**
 * crevdissect - determine backref shortest-first subexpression matches.
 *
 * The retry memory stores the offset of the trial midpoint from begin,
 * plus 1 so that 0 uniquely means "clean slate". Midpoints are tried
 * shortest-first, moving towards end.
 *
 * @param t a concatenation node whose left child is flagged SHORTER
 * @param begin start of the candidate span
 * @param end end of the candidate span
 * @return true if some midpoint lets both children dissect successfully
 */
private boolean crevdissect(RuntimeSubexpression t, int begin, int end) {
    assert t.op == '.';
    assert t.left != null && t.left.machine.states.length > 0;
    assert t.right != null && t.right.machine.states.length > 0;
    assert 0 != (t.left.flags & Subre.SHORTER);

    /* concatenation -- need to split the substring between parts */
    final Dfa leftDfa = new Dfa(this, t.left.machine);
    final Dfa rightDfa = new Dfa(this, t.right.machine);

    /* pick a tentative midpoint: resume a remembered one, else start from the shortest */
    int split;
    final int remembered = mem[t.retry];
    if (remembered != 0) {
        split = begin + (remembered - 1);
    } else {
        split = leftDfa.shortest(begin, begin, end, null, null);
        if (split == -1) {
            return false;
        }
        mem[t.retry] = (split - begin) + 1;
    }

    /* iterate until satisfaction or failure; each pass tries the current midpoint on for size */
    while (!(cdissect(t.left, begin, split)
            && rightDfa.longest(split, end, null) == end
            && cdissect(t.right, split, end))) {
        /* that midpoint didn't work, find a new one */
        if (split == end) {
            /* all possibilities exhausted */
            return false;
        }
        split = leftDfa.shortest(begin, split + 1, end, null, null);
        if (split == -1) {
            /* failed to find a new one */
            return false;
        }
        mem[t.retry] = (split - begin) + 1;
        zapmem(t.left);
        zapmem(t.right);
    }

    /* satisfaction */
    return true;
}
/**
 * cbrdissect - determine backref subexpression matches.
 *
 * Checks that the span [begin, end) consists of whole repetitions of the text
 * captured by subexpression t.number, and that the repetition count lies within
 * t.min..t.max (t.max may be Compiler.INFINITY). A back-reference node is
 * evaluated once per clean slate: if its retry slot is already set it fails.
 *
 * @param t a back-reference node
 * @param begin start of the candidate span
 * @param end end of the candidate span
 * @return true if the span is an acceptable repetition of the referenced capture
 */
private boolean cbrdissect(RuntimeSubexpression t, int begin, int end) {
    final int n = t.number;
    final int lowest = t.min;
    final int highest = t.max;
    assert t.op == 'b';
    assert n >= 0;

    //TODO: could this get be out of range?
    if (match.get(n) == null) {
        /* the referenced subexpression has no recorded capture */
        return false;
    }
    final int refStart = match.get(n).start;
    final int refLen = match.get(n).end - match.get(n).start;

    /* no room to maneuver -- retries are pointless */
    if (mem[t.retry] != 0) {
        return false;
    }
    mem[t.retry] = 1;

    /* special-case zero-length string */
    if (refLen == 0) {
        return begin == end;
    }

    /* and too-short string */
    assert end >= begin;
    if ((end - begin) < refLen) {
        return false;
    }
    final int lastStart = end - refLen;

    /* count occurrences */
    int reps = 0;
    int cursor = begin;
    while (cursor <= lastStart && (reps < highest || highest == Compiler.INFINITY)) {
        /* compare the captured text with the text at the cursor */
        if (g.compare.compare(data, refStart, cursor, refLen) != 0) {
            break;
        }
        reps++;
        cursor += refLen;
    }

    /* and sort it out */
    if (cursor != end) { /* didn't consume all of it */
        return false;
    }
    if (reps < lowest) {
        return false;
    }
    return highest == Compiler.INFINITY || reps <= highest;
}
/**
 * caltdissect - determine alternative subexpression matches (with complications).
 *
 * Walks the chain of alternation nodes linked through {@code right}. Each node's
 * retry slot records UNTRIED, TRYING or TRIED so that a re-entry skips branches
 * that are already exhausted and resumes a branch that is still in progress.
 *
 * @param t the first alternation node to consider; null means no branches are left
 * @param begin start of the candidate span
 * @param end end of the candidate span
 * @return true if some branch dissects successfully over the span
 */
private boolean caltdissect(RuntimeSubexpression t, int begin, int end) {
    for (RuntimeSubexpression alt = t; alt != null; alt = alt.right) {
        assert alt.op == '|';

        if (mem[alt.retry] == TRIED) {
            /* this branch is already exhausted */
            continue;
        }

        if (mem[alt.retry] == UNTRIED) {
            final Dfa branchDfa = new Dfa(this, alt.left.machine);
            if (branchDfa.longest(begin, end, null) != end) {
                /* the branch cannot cover the span at all */
                mem[alt.retry] = TRIED;
                continue;
            }
            mem[alt.retry] = TRYING;
        }

        if (cdissect(alt.left, begin, end)) {
            return true;
        }
        mem[alt.retry] = TRIED;
    }
    return false;
}
/**
 * getMaxSubno - find the largest retry index used anywhere in a subtree.
 *
 * Despite the name, the value compared is each node's {@code retry} field; the
 * caller uses the result to size the retry memory.
 *
 * @param tree root of the subtree to scan; must not be null
 * @param i the largest index seen so far
 * @return the larger of i and every retry index in the subtree
 */
private int getMaxSubno(RuntimeSubexpression tree, int i) {
    int highest = tree.retry > i ? tree.retry : i;
    /* each recursive call returns at least the value it was given */
    if (tree.left != null) {
        highest = getMaxSubno(tree.left, highest);
    }
    if (tree.right != null) {
        highest = getMaxSubno(tree.right, highest);
    }
    return highest;
}
}