com.basistech.tclre.Dfa Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of tcl-regex Show documentation
Show all versions of tcl-regex Show documentation
Java port of the regex engine from Tcl
The newest version!
/*
* Copyright 2014 Basis Technology Corp.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.basistech.tclre;
import it.unimi.dsi.fastutil.objects.Object2ObjectMap;
import it.unimi.dsi.fastutil.objects.Object2ObjectOpenHashMap;
import java.util.BitSet;
/**
 * Runtime DFA.
 *
 * <p>State sets are built on demand from the compact NFA ({@link Cnfa}) as input is
 * consumed: {@link #miss} computes the set of NFA states reachable on a color and
 * records it in {@link #stateSets}. {@link #longest} and {@link #shortest} are the two
 * matching engines that drive it.
 */
class Dfa {
    /** State sets created during the current match, keyed by the bit set of NFA states they represent. */
    final Object2ObjectMap<BitSet, StateSet> stateSets;
    final int nstates;
    final int ncolors; // length of outarc and inchain vectors (really?)
    final Cnfa cnfa;
    final RuntimeColorMap cm;
    final Runtime runtime;

    /**
     * Builds a DFA over the given compact NFA, taking the color map from the runtime's
     * compiled pattern ({@code runtime.g.cm}).
     *
     * @param runtime the match runtime (input data, flags, compiled pattern)
     * @param cnfa    the compact NFA to execute
     */
    Dfa(Runtime runtime, Cnfa cnfa) {
        this.runtime = runtime;
        this.cm = runtime.g.cm;
        this.cnfa = cnfa;
        /*
         * Note that this isn't a cache;
         * Benson believes that the maximum size here is proportional
         * to the complexity of the machine, not to the input.
         */
        stateSets = new Object2ObjectOpenHashMap<BitSet, StateSet>();
        nstates = cnfa.states.length;
        ncolors = cnfa.ncolors;
    }

    /**
     * Called at the start of a match.
     * Arguably we could just construct a new DFA each time.
     *
     * @param start input offset recorded as the "last seen" position of the initial state set
     * @return the initial state set, containing only the NFA's {@code pre} state
     */
    StateSet initialize(int start) {
        // Discard state sets; reuse would be faster if we kept them,
        // but then we'd need the real cache.
        stateSets.clear();
        StateSet stateSet = new StateSet(nstates, ncolors);
        stateSet.states.set(cnfa.pre);
        stateSet.noprogress = true;
        // Insert into hash table based on that one state.
        stateSets.put(stateSet.states, stateSet);
        stateSet.setLastSeen(start);
        return stateSet;
    }

    /**
     * 'miss' -- the transition out of {@code css} on color {@code co} is not yet known
     * (or at least the caller did not find it); compute it.
     *
     * <p>If {@code css.outs[co]} is already filled in, it is returned directly. Otherwise
     * the target set of NFA states is computed, closed over lookahead-constraint arcs,
     * and looked up in (or added to) {@link #stateSets}.
     *
     * @param css the current state set
     * @param co  the color of the transition to take
     * @param cp  input offset handed to {@link #lacon} when lookahead constraints are evaluated
     * @return the destination state set, or {@code null} if no NFA state in {@code css}
     *         has an arc of color {@code co}
     */
    StateSet miss(StateSet css, short co, int cp) {
        if (css.outs[co] != null) {
            return css.outs[co];
        }

        /* first, what set of states would we end up in? */
        BitSet work = new BitSet(nstates);
        boolean ispost = false;
        boolean noprogress = true;
        boolean gotstate = false;

        for (int i = 0; i < nstates; i++) {
            if (!css.states.get(i)) {
                continue;
            }
            // Arcs of state i start one past cnfa.states[i] and end at a COLORLESS arc.
            for (int ax = cnfa.states[i] + 1; ; ax++) {
                long ca = cnfa.arcs[ax];
                short caco = Cnfa.carcColor(ca);
                if (caco == Constants.COLORLESS) {
                    break;
                }
                if (caco != co) {
                    continue;
                }
                int catarget = Cnfa.carcTarget(ca);
                work.set(catarget);
                gotstate = true;
                if (catarget == cnfa.post) {
                    ispost = true;
                }
                // get target state, index arcs, get color, compare to 0.
                if (0 == Cnfa.carcColor(cnfa.arcs[cnfa.states[catarget]])) {
                    noprogress = false;
                }
            }
        }

        if (!gotstate) {
            return null;
        }

        boolean dolacons = cnfa.hasLacons;
        boolean sawlacons = false;
        while (dolacons) { /* transitive closure */
            dolacons = false;
            for (int i = 0; i < nstates; i++) {
                if (!work.get(i)) {
                    continue;
                }
                for (int ax = cnfa.states[i] + 1; ; ax++) {
                    long ca = cnfa.arcs[ax];
                    short caco = Cnfa.carcColor(ca);
                    if (caco == Constants.COLORLESS) {
                        break;
                    }
                    if (caco <= ncolors) {
                        continue; /* NOTE CONTINUE -- not a lookahead-constraint arc */
                    }
                    sawlacons = true;
                    int catarget = Cnfa.carcTarget(ca);
                    if (work.get(catarget)) {
                        continue; /* NOTE CONTINUE -- target already in the set */
                    }
                    if (!lacon(cp, caco)) {
                        continue; /* NOTE CONTINUE -- constraint not satisfied here */
                    }
                    work.set(catarget);
                    dolacons = true; // the set grew, so another pass is needed
                    if (catarget == cnfa.post) {
                        ispost = true;
                    }
                    if (0 == Cnfa.carcColor(cnfa.arcs[cnfa.states[catarget]])) {
                        noprogress = false;
                    }
                }
            }
        }

        StateSet stateSet = stateSets.get(work);
        if (stateSet == null) {
            stateSet = new StateSet(work, ncolors);
            stateSet.ins = new Arcp(null, Constants.WHITE);
            stateSet.poststate = ispost;
            stateSet.noprogress |= noprogress;
            /* lastseen to be dealt with by caller */
            stateSets.put(work, stateSet);
        }

        // The transition is only memoized when no lookahead-constraint arc was seen.
        if (!sawlacons) {
            css.outs[co] = stateSet;
            css.inchain[co] = stateSet.ins;
            stateSet.ins = new Arcp(css, co);
        }
        return stateSet;
    }

    /**
     * Evaluates a lookahead constraint at an input position by running a fresh DFA for
     * the constraint's machine from {@code cp} to the end of the data.
     *
     * @param cp input offset at which the constraint is tested
     * @param co arc color identifying the constraint; the constraint index is
     *           {@code co - cnfa.ncolors}
     * @return when {@code subex.number} is non-zero, whether the sub-machine matched;
     *         when it is zero, whether the sub-machine failed to match
     */
    boolean lacon(int cp, short co) {
        int n = co - cnfa.ncolors;
        // compare this to com.basistech.tclre.Nfa.compact(), the LACONS case.
        // that adds a.co to ncolors. So that means that you'd think that the lacons
        // indexing would be related... The 'arc' should have a 'color' which is an index
        //
        RuntimeSubexpression subex = runtime.g.lookaheadConstraintMachine(n);
        Dfa d = new Dfa(runtime, subex.machine);
        int end = d.longest(cp, runtime.data.length(), null);
        return (subex.number != 0) ? (end != -1) : (end == -1);
    }

    /**
     * longest - longest-preferred matching engine
     *
     * @param start   input offset at which matching begins
     * @param stop    upper bound for the scan; unless it equals the data length, the scan
     *                also consumes the character at {@code stop}
     * @param hitstop if non-null, element 0 is cleared on entry and set to {@code true}
     *                when the scan reaches the end of the data with {@code stop} at the end
     * @return endpoint or -1
     */
    int longest(int start, int stop, boolean[] hitstop) {
        int realstop = (stop == runtime.dataLength) ? stop : stop + 1;
        short co;

        /* initialize */
        StateSet css = initialize(start);
        int cp = start;
        if (hitstop != null) {
            hitstop[0] = false;
        }

        /* startup */
        if (cp == 0) {
            co = cnfa.bos[(0 != (runtime.eflags & Flags.REG_NOTBOL)) ? 0 : 1];
        } else {
            /* Not at bos: color of the code point that ends just before cp. */
            co = colorBefore(cp);
        }
        css = miss(css, co, cp);
        if (css == null) {
            return -1;
        }
        css.setLastSeen(cp);

        StateSet ss;
        /* main loop */
        while (cp < realstop) {
            int increment = charCountAt(cp);
            co = colorAt(cp, increment);
            ss = css.outs[co];
            if (ss == null) {
                ss = miss(css, co, cp + increment);
                if (ss == null) {
                    break; /* NOTE BREAK OUT */
                }
            }
            cp = cp + increment;
            ss.setLastSeen(cp);
            css = ss;
        }

        /* shutdown */
        if (cp == runtime.dataLength && stop == runtime.dataLength) {
            if (hitstop != null) {
                hitstop[0] = true;
            }
            co = cnfa.eos[(0 != (runtime.eflags & Flags.REG_NOTEOL)) ? 0 : 1];
            ss = miss(css, co, cp);
            /* special case: match ended at eol? */
            if (ss != null && ss.poststate) {
                return cp;
            } else if (ss != null) {
                ss.setLastSeen(cp); /* to be tidy */
            }
        }

        /* find last match, if any */
        int post = -1;
        for (StateSet thisSS : stateSets.values()) {
            if (thisSS.poststate && post != thisSS.getLastSeen()
                    && (post == -1 || post < thisSS.getLastSeen())) {
                post = thisSS.getLastSeen();
            }
        }
        if (post != -1) { /* found one */
            /* Post points after the codepoint after the last one in the match (!) */
            /* So, if that is an SMP codepoint, we need to back up 2 to get to the beginning of it,
             * and thus be just after the last character of the match. */
            return codePointStartBefore(post);
        }
        return -1;
    }

    /**
     * shortest - shortest-preferred matching engine
     *
     * @param start   where the match should start
     * @param min     match must end at or after here
     * @param max     match must end at or before here
     * @param coldp   store coldstart pointer here, if non-null. This is the _beginning_ of the match region.
     * @param hitstop record whether hit end of total input
     * @return endpoint or -1
     */
    int shortest(int start, int min, int max, int[] coldp, boolean[] hitstop) {
        int realmin = (min == runtime.dataLength) ? min : min + 1;
        int realmax = (max == runtime.dataLength) ? max : max + 1;
        short co;

        /* initialize */
        StateSet css = initialize(start);
        int cp = start;
        if (hitstop != null) {
            hitstop[0] = false;
        }

        /* startup */
        if (cp == 0) {
            /* If the NOTBOL flag is true, we take color as bos[0], else 1. bos[0] is really BOS, while [1] is supposed to be BOL. So, I guess, if it's NOTBOL, it's BOS. */
            /* The combination of NOTBOL and lookingAt is not defined. */
            co = cnfa.bos[(0 != (runtime.eflags & Flags.REG_NOTBOL)) ? 0 : 1];
        } else {
            /* Not at bos at all, set color based on prior character. */
            co = colorBefore(cp);
        }
        css = miss(css, co, cp);
        if (css == null) {
            return -1;
        }
        css.setLastSeen(cp);
        StateSet ss = css;

        /* main loop */
        while (cp < realmax) {
            int increment = charCountAt(cp);
            co = colorAt(cp, increment);
            ss = css.outs[co];
            if (ss == null) {
                ss = miss(css, co, cp + increment);
                if (ss == null) {
                    break; /* NOTE BREAK OUT */
                }
            }
            cp = cp + increment;
            ss.setLastSeen(cp);
            css = ss;
            if (ss.poststate && cp >= realmin) {
                break; /* NOTE BREAK OUT */
            }
        }
        if (ss == null) {
            return -1;
        }

        int matchStart = lastcold();
        if (coldp != null) { /* report last no-progress state set, if any */
            coldp[0] = matchStart;
        }

        if (ss.poststate && cp > min) {
            assert cp >= realmin;
            /* Back up over the one code point consumed past the end of the match. */
            cp = codePointStartBefore(cp);
        } else if (cp == runtime.dataLength && max == runtime.dataLength) {
            co = cnfa.eos[(0 != (runtime.eflags & Flags.REG_NOTEOL)) ? 0 : 1];
            ss = miss(css, co, cp);
            /* match might have ended at eol */
            if ((ss == null || !ss.poststate) && hitstop != null) {
                hitstop[0] = true;
            }
        }
        if (ss == null || !ss.poststate) {
            return -1;
        }
        return cp;
    }

    /**
     * lastcold - determine last point at which no progress had been made
     *
     * @return the largest "last seen" offset among no-progress state sets, or 0 if
     *         there is none with a positive offset
     */
    int lastcold() {
        int nopr = 0;
        for (StateSet ss : stateSets.values()) {
            if (ss.noprogress && nopr < ss.getLastSeen()) {
                nopr = ss.getLastSeen();
            }
        }
        return nopr;
    }

    /**
     * Number of chars occupied by the code point starting at {@code index}: 2 only for a
     * high surrogate immediately followed, within the data, by a low surrogate; else 1.
     */
    private int charCountAt(int index) {
        if (Character.isHighSurrogate(runtime.data.charAt(index))
                && index + 1 < runtime.data.length()
                && Character.isLowSurrogate(runtime.data.charAt(index + 1))) {
            return 2;
        }
        return 1;
    }

    /**
     * Color of the code point starting at {@code index}; {@code charCount} is 2 for a
     * surrogate pair (as reported by {@link #charCountAt}) and 1 otherwise.
     */
    private short colorAt(int index, int charCount) {
        char first = runtime.data.charAt(index);
        if (charCount == 2) {
            return cm.getcolor(Character.toCodePoint(first, runtime.data.charAt(index + 1)));
        }
        return cm.getcolor(first);
    }

    /**
     * Start index of the code point that ends just before {@code index}: {@code index - 2}
     * when the two preceding chars form a high/low surrogate pair, else {@code index - 1}.
     */
    private int codePointStartBefore(int index) {
        if (index >= 2
                && Character.isLowSurrogate(runtime.data.charAt(index - 1))
                && Character.isHighSurrogate(runtime.data.charAt(index - 2))) {
            return index - 2;
        }
        return index - 1;
    }

    /** Color of the code point that ends just before {@code cp}; requires {@code cp >= 1}. */
    private short colorBefore(int cp) {
        int begin = codePointStartBefore(cp);
        return colorAt(begin, cp - begin);
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy