All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.basistech.tclre.Runtime Maven / Gradle / Ivy

The newest version!
/*
* Copyright 2014 Basis Technology Corp.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.basistech.tclre;

import java.util.Arrays;
import java.util.EnumSet;
import java.util.List;

import com.google.common.collect.Lists;

/**
 * The internal implementation of matching.
 */
class Runtime {
    private static final int UNTRIED = 0;   /* not yet tried at all */
    private static final int TRYING = 1;    /* top matched, trying submatches */
    private static final int TRIED = 2;     /* top didn't match or submatches exhausted */

    Guts g;
    int eflags;
    List match;
    CharSequence data;
    int dataLength; // cache this, it gets examined _a lot_.
    private HsrePattern re;
    private int[] mem; // backtracking.

    /**
     * exec - match regular expression
     */
    boolean exec(HsrePattern re, CharSequence data, EnumSet execFlags) throws RegexException {
    /* sanity checks */
    /* setup */


        if (0 != (re.guts.info & Flags.REG_UIMPOSSIBLE)) {
            throw new RegexException("Regex marked impossible");
        }

        eflags = 0;
        for (ExecFlags ef : execFlags) {
            switch (ef) {
            case NOTBOL:
                eflags |= Flags.REG_NOTBOL;
                break;
            case NOTEOL:
                eflags |= Flags.REG_NOTEOL;
                break;
            case LOOKING_AT:
                eflags |= Flags.REG_LOOKING_AT;
                break;
            default:
                throw new RuntimeException("impossible exec flag");
            }
        }

        this.re = re;
        this.g = re.guts;
        this.data = data;
        this.dataLength = this.data.length();
        if (this.match != null) {
            this.match.clear();
        } else {
            this.match = Lists.newArrayList();
        }
        match.add(null); // make room for 1.
        if (0 != (g.info & Flags.REG_UBACKREF)) {
            while (match.size() < g.nsub + 1) {
                match.add(null);
            }
        }
        if (mem != null && mem.length >= g.ntree) {
            Arrays.fill(mem, 0);
        } else {
            mem = new int[g.ntree];
        }
       
    /* do it */
        assert g.tree != null;

        if (0 != (g.info & Flags.REG_UBACKREF)) {
            return cfind(g.tree.machine);
        } else {
            return find(g.tree.machine);
        }
    }

    /**
     * find - find a match for the main NFA (no-complications case)
     * 

* First, it runs the 'search machine' in non-greedy mode (shortest). The 'search machine' is the NFA with * .* added to the front and the back, more or less. See 'makesearch'. In many cases, running the search machine * 'shortest' is considerably faster than running the actual regular expression in greedy (longest). So, * even though this may seem like it would take extra time doing an extra scan, the alternative is much slower. * The search machine tells you if the expression can be found anywhere, and, if so, where is the furthest possible * end. *

*

* If the search machine succeeds, the does an iteration to find the exact bounds; * the loop uses 'longest' or 'shortest' as appropriate to the flags. In C, there was an option to * _only_ run the search machine and return a simple boolean with no bounds. * We have no API for that via an ExecFlag. *

*

* If the top-level API call is 'lookingAt', we never want to scan down the data looking for matches. But 'shortest' * can still be much faster than 'longest'. So, the code runs the original machine first. If non-greedy expressions * were very common, I suppose that it would be faster to omit this step in that case. Thereafter, the loop has * a check to bail if these is no match at the beginning of the data, which is the constraint of lookingAt. *

* */ boolean find(Cnfa cnfa) { int begin; int end = -1; int cold; int open; /* open and close of range of possible starts */ int close; boolean hitend; boolean shorter = 0 != (g.tree.flags & Subre.SHORTER); boolean lookingAt = 0 != (eflags & Flags.REG_LOOKING_AT); int[] coldp = new int[1]; Dfa d = new Dfa(this, cnfa); if (lookingAt) { /* * shortest is faster than longest. So, we want to check with it. * However, since we aren't making a 'search re' with an extra .* on * the front, we don't add an extra requirement to make progress on the * very first arc. If the expression has something like a* at the front, * it can 'no-progress' consuming the a characters. * All of this casts doubts on the 'requireInitialProgress' feature -- at all. * These initial calls to shortest should be all the opportunity we need * to do 'lookingAt'. */ close = d.shortest(0, 0, data.length(), coldp, null); cold = 0; } else { /* First, a shot with the search RE. */ Dfa s = new Dfa(this, g.search); close = s.shortest(0, 0, data.length(), coldp, null); cold = coldp[0]; } if (close == -1) { /* not found */ return false; } /* find starting point and match */ open = cold; cold = -1; for (begin = open; begin <= close; begin++) { /* * if LOOKING_AT, we can't validly have a 'begin' after 'open'. * I'm not sure this test can even ever go off, since the 'shortest' test * up above should accomplish the same thing. */ if (begin > 0 && lookingAt) { return false; } boolean[] hitendp = new boolean[1]; if (shorter) { end = d.shortest(begin, begin, data.length(), null, hitendp); } else { end = d.longest(begin, data.length(), hitendp); } hitend = hitendp[0]; if (hitend && cold == -1) { cold = begin; } if (end != -1) { /* success */ break; /* NOTE BREAK OUT */ } } if (end == -1) { return false; } /* and pin down details */ match.set(0, new RegMatch(begin, end)); // no need to do the work. return re.nsub <= 0 || dissect(g.tree, begin, end); } /** * cfind - find a match for the main NFA (with complications) */ private boolean cfind(Cnfa cnfa) { int[] cold = new int[1]; Dfa s = new Dfa(this, g.search); Dfa d = new Dfa(this, cnfa); return cfindloop(d, s, cold); } /** * cfindloop - the heart of cfind */ private boolean cfindloop(Dfa d, Dfa s, int[] coldp) { int begin; int end; int cold; int open; /* open and close of range of possible starts */ int close; int estart; int estop; boolean shorter = 0 != (g.tree.flags & Subre.SHORTER); boolean hitend[] = new boolean[1]; boolean lookingAt = 0 != (eflags & Flags.REG_LOOKING_AT); assert d != null && s != null; close = 0; do { int[] cold0 = new int[1]; /* * Call search NFA to see if this is possible at all. */ if (lookingAt) { // in the looking at case, we use the un-search-ified RE. close = d.shortest(close, close, data.length(), cold0, null); cold = 0; } else { close = s.shortest(close, close, data.length(), cold0, null); cold = cold0[0]; } if (close == -1) { break; /* NOTE BREAK */ } assert cold != -1; open = cold; cold = -1; for (begin = open; begin <= close; begin++) { if (begin > 0 && lookingAt) { // Is this possible given the looking-at constraint in the call to shortest above? return false; } estart = begin; estop = data.length(); for (;;) { if (shorter) { end = d.shortest(begin, estart, estop, null, hitend); } else { end = d.longest(begin, estop, hitend); } if (hitend[0] && cold == -1) { cold = begin; } if (end == -1) { break; /* NOTE BREAK OUT */ } for (int x = 0; x < match.size(); x++) { match.set(x, null); } int maxsubno = getMaxSubno(g.tree, 0); mem = new int[maxsubno + 1]; boolean matched = cdissect(g.tree, begin, end); if (matched) { // indicate the full match bounds. match.set(0, new RegMatch(begin, end)); coldp[0] = cold; return true; } if (shorter ? end == estop : end == begin) { /* no point in trying again */ coldp[0] = cold; return false; } /* go around and try again */ if (shorter) { estart = end + 1; } else { estop = end - 1; } } } } while (close < data.length()); coldp[0] = cold; return false; } /** * subset - set any subexpression relevant to a successful subre */ private void subset(RuntimeSubexpression sub, int begin, int end) { int n = sub.number; assert n > 0; while (match.size() < (n + 1)) { match.add(null); } match.set(n, new RegMatch(begin, end)); } /** * dissect - determine subexpression matches (uncomplicated case) */ private boolean dissect(RuntimeSubexpression t, int begin, int end) { switch (t.op) { case '=': /* terminal node */ assert t.left == null && t.right == null; return true; /* no action, parent did the work */ case '|': /* alternation */ assert t.left != null; return altdissect(t, begin, end); case 'b': /* back ref -- shouldn't be calling us! */ throw new RuntimeException("impossible backref"); case '.': /* concatenation */ assert t.left != null && t.right != null; return condissect(t, begin, end); case '(': /* capturing */ assert t.left != null && t.right == null; assert t.number > 0; subset(t, begin, end); return dissect(t.left, begin, end); default: throw new RuntimeException("Impossible op"); } } /** * condissect - determine concatenation subexpression matches (uncomplicated) */ private boolean condissect(RuntimeSubexpression t, int begin, int end) { Dfa d; Dfa d2; int mid; assert t.op == '.'; assert t.left != null && t.left.machine.states.length > 0; assert t.right != null && t.right.machine.states.length > 0; boolean shorter = (t.left.flags & Subre.SHORTER) != 0; int stop = shorter ? end : begin; d = new Dfa(this, t.left.machine); d2 = new Dfa(this, t.right.machine); /* pick a tentative midpoint */ if (shorter) { mid = d.shortest(begin, begin, end, null, null); } else { mid = d.longest(begin, end, null); } if (mid == -1) { throw new RuntimeException("Impossible mid."); } /* iterate until satisfaction or failure */ while (d2.longest(mid, end, null) != end) { /* that midpoint didn't work, find a new one */ if (mid == stop) { /* all possibilities exhausted! */ throw new RuntimeException("no midpoint"); } if (shorter) { mid = d.shortest(begin, mid + 1, end, null, null); } else { mid = d.longest(begin, mid - 1, null); } if (mid == -1) { throw new RuntimeException("Failed midpoint"); } } /* satisfaction */ boolean dissectMatch = dissect(t.left, begin, mid); if (!dissectMatch) { return false; } return dissect(t.right, mid, end); } /** * altdissect - determine alternative subexpression matches (uncomplicated) */ private boolean altdissect(RuntimeSubexpression t, int begin, int end) { Dfa d; assert t != null; assert t.op == '|'; for (; t != null; t = t.right) { assert t.left != null && t.left.machine.states.length > 0; d = new Dfa(this, t.left.machine); if (d.longest(begin, end, null) == end) { return dissect(t.left, begin, end); } } throw new RuntimeException("none matched"); } /** * cdissect - determine subexpression matches (with complications) * The retry memory stores the offset of the trial midpoint from begin, * plus 1 so that 0 uniquely means "clean slate". */ private boolean cdissect(RuntimeSubexpression t, int begin, int end) { assert t != null; switch (t.op) { case '=': /* terminal node */ assert t.left == null && t.right == null; return true; /* no action, parent did the work */ case '|': /* alternation */ assert t.left != null; return caltdissect(t, begin, end); case 'b': /* back ref -- shouldn't be calling us! */ assert t.left == null && t.right == null; return cbrdissect(t, begin, end); case '.': /* concatenation */ assert t.left != null && t.right != null; return ccondissect(t, begin, end); case '(': /* capturing */ assert t.left != null && t.right == null; assert t.number > 0; boolean cdmatch = cdissect(t.left, begin, end); if (cdmatch) { subset(t, begin, end); } return cdmatch; default: throw new RuntimeException("Impossible op"); } } /** * - ccondissect - concatenation subexpression matches (with complications) * The retry memory stores the offset of the trial midpoint from begin, * plus 1 so that 0 uniquely means "clean slate". */ private boolean ccondissect(RuntimeSubexpression t, int begin, int end) { Dfa d; Dfa d2; int mid; assert t.op == '.'; assert t.left != null && t.left.machine.states.length > 0; assert t.right != null && t.right.machine.states.length > 0; if (0 != (t.left.flags & Subre.SHORTER)) { /* reverse scan */ return crevdissect(t, begin, end); } d = new Dfa(this, t.left.machine); d2 = new Dfa(this, t.right.machine); /* pick a tentative midpoint */ if (mem[t.retry] == 0) { mid = d.longest(begin, end, null); if (mid == -1) { return false; } mem[t.retry] = (mid - begin) + 1; } else { mid = begin + (mem[t.retry] - 1); } /* iterate until satisfaction or failure */ for (;;) { /* try this midpoint on for size */ boolean cdmatch = cdissect(t.left, begin, mid); if (cdmatch && d2.longest(mid, end, null) == end && (cdissect(t.right, mid, end))) { break; /* NOTE BREAK OUT */ } /* that midpoint didn't work, find a new one */ if (mid == begin) { /* all possibilities exhausted */ return false; } mid = d.longest(begin, mid - 1, null); if (mid == -1) { /* failed to find a new one */ return false; } mem[t.retry] = (mid - begin) + 1; zapmem(t.left); zapmem(t.right); } /* satisfaction */ return true; } private void zapmem(RuntimeSubexpression t) { mem[t.retry] = 0; while (match.size() < t.number + 1) { match.add(null); } if (t.left != null) { zapmem(t.left); } if (t.right != null) { zapmem(t.right); } } /** * crevdissect - determine backref shortest-first subexpression matches * The retry memory stores the offset of the trial midpoint from begin, * plus 1 so that 0 uniquely means "clean slate". */ private boolean crevdissect(RuntimeSubexpression t, int begin, int end) { Dfa d; Dfa d2; int mid; assert t.op == '.'; assert t.left != null && t.left.machine.states.length > 0; assert t.right != null && t.right.machine.states.length > 0; assert 0 != (t.left.flags & Subre.SHORTER); /* concatenation -- need to split the substring between parts */ d = new Dfa(this, t.left.machine); d2 = new Dfa(this, t.right.machine); /* pick a tentative midpoint */ if (mem[t.retry] == 0) { mid = d.shortest(begin, begin, end, null, null); if (mid == -1) { return false; } mem[t.retry] = (mid - begin) + 1; } else { mid = begin + (mem[t.retry] - 1); } /* iterate until satisfaction or failure */ for (;;) { /* try this midpoint on for size */ boolean cdmatch = cdissect(t.left, begin, mid); if (cdmatch && d2.longest(mid, end, null) == end && (cdissect(t.right, mid, end))) { break; /* NOTE BREAK OUT */ } /* that midpoint didn't work, find a new one */ if (mid == end) { /* all possibilities exhausted */ return false; } mid = d.shortest(begin, mid + 1, end, null, null); if (mid == -1) { /* failed to find a new one */ return false; } mem[t.retry] = (mid - begin) + 1; zapmem(t.left); zapmem(t.right); } /* satisfaction */ return true; } /** * cbrdissect - determine backref subexpression matches */ private boolean cbrdissect(RuntimeSubexpression t, int begin, int end) { int i; int n = t.number; int len; int paren; int p; int stop; int min = t.min; int max = t.max; assert t.op == 'b'; assert n >= 0; //TODO: could this get be out of range? if (match.get(n) == null) { return false; } paren = match.get(n).start; len = match.get(n).end - match.get(n).start; /* no room to maneuver -- retries are pointless */ if (0 != mem[t.retry]) { return false; } mem[t.retry] = 1; /* special-case zero-length string */ if (len == 0) { return begin == end; } /* and too-short string */ assert end >= begin; if ((end - begin) < len) { return false; } stop = end - len; /* count occurrences */ i = 0; for (p = begin; p <= stop && (i < max || max == Compiler.INFINITY); p += len) { // paren is index of if (g.compare.compare(data, paren, p, len) != 0) { break; } i++; } /* and sort it out */ if (p != end) { /* didn't consume all of it */ return false; } return min <= i && (i <= max || max == Compiler.INFINITY); } /* - caltdissect - determine alternative subexpression matches (w. complications) ^ static int caltdissect(struct vars *, struct Subre , int , int ); */ private boolean caltdissect(RuntimeSubexpression t, int begin, int end) { Dfa d; if (t == null) { return false; } assert t.op == '|'; if (mem[t.retry] == TRIED) { return caltdissect(t.right, begin, end); } if (mem[t.retry] == UNTRIED) { d = new Dfa(this, t.left.machine); if (d.longest(begin, end, null) != end) { mem[t.retry] = TRIED; return caltdissect(t.right, begin, end); } mem[t.retry] = TRYING; } boolean cdmatch = cdissect(t.left, begin, end); if (cdmatch) { return true; } mem[t.retry] = TRIED; return caltdissect(t.right, begin, end); } private int getMaxSubno(RuntimeSubexpression tree, int i) { i = Math.max(i, tree.retry); if (tree.left != null) { i = Math.max(i, getMaxSubno(tree.left, i)); } if (tree.right != null) { i = Math.max(i, getMaxSubno(tree.right, i)); } return i; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy