All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.joni.Matcher Maven / Gradle / Ivy

The newest version!
/*
 * Permission is hereby granted, free of charge, to any person obtaining a copy of
 * this software and associated documentation files (the "Software"), to deal in
 * the Software without restriction, including without limitation the rights to
 * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
 * of the Software, and to permit persons to whom the Software is furnished to do
 * so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

package org.joni;

import static org.joni.Option.isFindLongest;

import org.jcodings.Encoding;
import org.jcodings.IntHolder;
import org.jcodings.constants.CharacterType;
import org.jcodings.specific.ASCIIEncoding;
import org.joni.constants.internal.AnchorType;

public abstract class Matcher extends IntHolder {
    public static final int FAILED = -1;
    public static final int INTERRUPTED = -2;

    protected final Regex regex;
    protected final Encoding enc;

    protected final byte[]bytes;
    protected final int str;
    protected final int end;

    protected int msaStart;
    protected int msaOptions;
    protected final Region msaRegion;
    protected int msaBestLen;
    protected int msaBestS;
    protected int msaGpos;

    protected int msaBegin;
    protected int msaEnd;

    Matcher(Regex regex, Region region, byte[]bytes, int p, int end) {
        this.regex = regex;
        this.enc = regex.enc;
        this.bytes = bytes;
        this.str = p;
        this.end = end;
        this.msaRegion = region;
    }

    // main matching method
    protected abstract int matchAt(int range, int sstart, int sprev, boolean interrupt) throws InterruptedException;

    protected abstract void stateCheckBuffInit(int strLength, int offset, int stateNum);
    protected abstract void stateCheckBuffClear();

    public abstract void interrupt();

    public final Region getRegion() {
        return msaRegion;
    }

    public final Region getEagerRegion() {
        return msaRegion != null ? msaRegion : new Region(msaBegin, msaEnd);
    }

    public final int getBegin() {
        return msaBegin;
    }

    public final int getEnd() {
        return msaEnd;
    }

    protected final void msaInit(int option, int start, int gpos) {
        msaOptions = option;
        msaStart = start;
        msaGpos = gpos;
        if (Config.USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE) msaBestLen = -1;
    }

    public final int match(int at, int range, int option) {
        try {
            return matchCommon(at, range, option, false);
        } catch (InterruptedException ex) {
            return INTERRUPTED;
        }
    }

    public final int matchInterruptible(int at, int range, int option) throws InterruptedException {
        return matchCommon(at, range, option, true);
    }

    private final int matchCommon(int at, int range, int option, boolean interrupt) throws InterruptedException {
        msaInit(option, at, at);

        if (Config.USE_CEC) {
            int offset = at = str;
            stateCheckBuffInit(end - str, offset, regex.numCombExpCheck); // move it to construction?
        } // USE_COMBINATION_EXPLOSION_CHECK

        int prev = enc.prevCharHead(bytes, str, at, end);

        if (Config.USE_MATCH_RANGE_MUST_BE_INSIDE_OF_SPECIFIED_RANGE) {
            return matchAt(end /*range*/, at, prev, interrupt);
        } else {
            return matchAt(range /*range*/, at, prev, interrupt);
        }
    }

    int low, high; // these are the return values
    private final boolean forwardSearchRange(byte[]bytes, int str, int end, int s, int range, IntHolder lowPrev) {
        int pprev = -1;
        int p = s;

        if (Config.DEBUG_SEARCH) debugForwardSearchRange(str, end, s, range);

        if (regex.dMin > 0) {
            if (enc.isSingleByte()) {
                p += regex.dMin;
            } else {
                int q = p + regex.dMin;
                while (p < q && p < end) p += enc.length(bytes, p, end);
            }
        }

        retry:while (true) {
            if (Config.DEBUG_SEARCH) debugSearch(regex.forward.getName(), p, end, range);
            p = regex.forward.search(this, bytes, p, end, range);

            if (p != -1 && p < range) {
                if (p - regex.dMin < s) {
                    // retry_gate:
                    pprev = p;
                    p += enc.length(bytes, p, end);
                    continue retry;
                }

                if (regex.subAnchor != 0) {
                    switch (regex.subAnchor) {
                    case AnchorType.BEGIN_LINE:
                        if (p != str) {
                            int prev = enc.prevCharHead(bytes, (pprev != -1) ? pprev : str, p, end);
                            if (!enc.isNewLine(bytes, prev, end)) {
                                // goto retry_gate;
                                pprev = p;
                                p += enc.length(bytes, p, end);
                                continue retry;
                            }
                        }
                        break;

                    case AnchorType.END_LINE:
                        if (p == end) {
                            if (!Config.USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE) {
                                int prev = enc.prevCharHead(bytes, (pprev != -1) ? pprev : str, p, end);
                                if (prev != -1 && enc.isNewLine(bytes, prev, end)) {
                                    // goto retry_gate;
                                    pprev = p;
                                    p += enc.length(bytes, p, end);
                                    continue retry;
                                }
                            }
                        } else if (!enc.isNewLine(bytes, p, end) && (!Config.USE_CRNL_AS_LINE_TERMINATOR || !enc.isMbcCrnl(bytes, p, end))) {
                            //if () break;
                            // goto retry_gate;
                            pprev = p;
                            p += enc.length(bytes, p, end);
                            continue retry;
                        }
                        break;
                    } // switch
                }

                if (regex.dMax == 0) {
                    low = p;
                    if (lowPrev != null) { // ??? // remove null checks
                        if (low > s) {
                            lowPrev.value = enc.prevCharHead(bytes, s, p, end);
                        } else {
                            lowPrev.value = enc.prevCharHead(bytes, (pprev != -1) ? pprev : str, p, end);
                        }
                    }
                } else {
                    if (regex.dMax != MinMaxLen.INFINITE_DISTANCE) {
                        low = p - regex.dMax;

                        if (low > s) {
                            low = enc.rightAdjustCharHeadWithPrev(bytes, s, low, end, lowPrev);
                            if (lowPrev != null && lowPrev.value == -1) {
                                lowPrev.value = enc.prevCharHead(bytes, (pprev != -1) ? pprev : s, low, end);
                            }
                        } else {
                            if (lowPrev != null) {
                                lowPrev.value = enc.prevCharHead(bytes, (pprev != -1) ? pprev : str, low, end);
                            }
                        }
                    }
                }
                /* no needs to adjust *high, *high is used as range check only */
                high = p - regex.dMin;
                if (Config.DEBUG_SEARCH) debugForwardSearchRangeSuccess(str, low, high);
                return true;    /* success */
            }
            return false;   /* fail */
        } //while
    }

    // low, high
    private final boolean backwardSearchRange(byte[]bytes, int str, int end, int s, int range, int adjrange) {
        range += regex.dMin;
        int p = s;

        retry:while (true) {
            p = regex.backward.search(this, bytes, range, adjrange, end, p, s, range);

            if (p != -1) {
                if (regex.subAnchor != 0) {
                    switch (regex.subAnchor) {
                    case AnchorType.BEGIN_LINE:
                        if (p != str) {
                            int prev = enc.prevCharHead(bytes, str, p, end);
                            if (!enc.isNewLine(bytes, prev, end)) {
                                p = prev;
                                continue retry;
                            }
                        }
                        break;

                    case AnchorType.END_LINE:
                        if (p == end) {
                            if (!Config.USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE) {
                                int prev = enc.prevCharHead(bytes, adjrange, p, end);
                                if (prev == -1) return false;
                                if (enc.isNewLine(bytes, prev, end)) {
                                    p = prev;
                                    continue retry;
                                }
                            }
                        } else if (!enc.isNewLine(bytes, p, end) && (!Config.USE_CRNL_AS_LINE_TERMINATOR || !enc.isMbcCrnl(bytes, p, end))) {
                            p = enc.prevCharHead(bytes, adjrange, p, end);
                            if (p == -1) return false;
                            continue retry;
                        }
                        break;
                    } // switch
                }

                /* no needs to adjust *high, *high is used as range check only */
                if (regex.dMax != MinMaxLen.INFINITE_DISTANCE) {
                    low = p - regex.dMax;
                    high = p - regex.dMin;
                    high = enc.rightAdjustCharHead(bytes, adjrange, high, end);
                }

                if (Config.DEBUG_SEARCH) debugBackwardSearchRange(str, low, high);
                return true;
            }

            if (Config.DEBUG_SEARCH) Config.log.println("backward_search_range: fail.");
            return false;
        } // while
    }

    // MATCH_AND_RETURN_CHECK
    private boolean matchCheck(int upperRange, int s, int prev, boolean interrupt) throws InterruptedException {
        if (Config.USE_MATCH_RANGE_MUST_BE_INSIDE_OF_SPECIFIED_RANGE) {
            if (Config.USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE) {
                //range = upperRange;
                if (matchAt(upperRange, s, prev, interrupt) != -1) {
                    if (!isFindLongest(regex.options)) return true;
                }
            } else {
                //range = upperRange;
                if (matchAt(upperRange, s, prev, interrupt) != -1) return true;
            }
        } else {
            if (Config.USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE) {
                if (matchAt(end, s, prev, interrupt) != -1) {
                    //range = upperRange;
                    if (!isFindLongest(regex.options)) return true;
                }
            } else {
                //range = upperRange;
                if (matchAt(end, s, prev, interrupt) != -1) return true;
            }
        }
        return false;
    }

    public final int search(int start, int range, int option) {
        try {
            return searchCommon(start, start, range, option, false);
        } catch (InterruptedException ex) {
            return INTERRUPTED;
        }
    }

    public final int search(int gpos, int start, int range, int option) {
        try {
            return searchCommon(gpos, start, range, option, false);
        } catch (InterruptedException ex) {
            return INTERRUPTED;
        }
    }

    public final int searchInterruptible(int start, int range, int option) throws InterruptedException {
        return searchCommon(start, start, range, option, true);
    }

    public final int searchInterruptible(int gpos, int start, int range, int option) throws InterruptedException {
        return searchCommon(gpos, start, range, option, true);
    }

    private final int searchCommon(int gpos, int start, int range, int option, boolean interrupt) throws InterruptedException {
        int s, prev;
        int origStart = start;
        int origRange = range;

        if (Config.DEBUG_SEARCH) debugSearch(str, end, start, range);

        if (start > end || start < str) return FAILED;

        /* anchor optimize: resume search range */
        if (regex.anchor != 0 && str < end) {
            int minSemiEnd, maxSemiEnd;

            if ((regex.anchor & AnchorType.BEGIN_POSITION) != 0) {
                /* search start-position only */
                // !begin_position:!
                if (range > start) {
                    if (gpos > start) {
                        if (gpos < range) {
                            range = gpos + 1;
                        } else {
                            range = start + 1;
                        }
                    }
                } else {
                    range = start;
                }
            } else if ((regex.anchor & AnchorType.BEGIN_BUF) != 0) {
                /* search str-position only */
                if (range > start) {
                    if (start != str) return FAILED; // mismatch_no_msa;
                    range = str + 1;
                } else {
                    if (range <= str) {
                        start = str;
                        range = str;
                    } else {
                        return FAILED; // mismatch_no_msa;
                    }
                }
            } else if ((regex.anchor & AnchorType.END_BUF) != 0) {
                minSemiEnd = maxSemiEnd = end;
                // !end_buf:!
                if (endBuf(start, range, minSemiEnd, maxSemiEnd)) return FAILED; // mismatch_no_msa;
            } else if ((regex.anchor & AnchorType.SEMI_END_BUF) != 0) {
                int preEnd = enc.stepBack(bytes, str, end, end, 1);
                maxSemiEnd = end;
                if (enc.isNewLine(bytes, preEnd, end)) {
                    minSemiEnd = preEnd;
                    if (Config.USE_CRNL_AS_LINE_TERMINATOR) {
                        preEnd = enc.stepBack(bytes, str, preEnd, end, 1);
                        if (preEnd != -1 && enc.isMbcCrnl(bytes, preEnd, end)) {
                            minSemiEnd = preEnd;
                        }
                    }
                    if (minSemiEnd > str && start <= minSemiEnd) {
                        // !goto end_buf;!
                        if (endBuf(start, range, minSemiEnd, maxSemiEnd)) return FAILED; // mismatch_no_msa;
                    }
                } else {
                    minSemiEnd = end;
                    // !goto end_buf;!
                    if (endBuf(start, range, minSemiEnd, maxSemiEnd)) return FAILED; // mismatch_no_msa;
                }
            } else if ((regex.anchor & AnchorType.ANYCHAR_STAR_ML) != 0) {
                // goto !begin_position;!
                if (range > start) {
                    if (gpos > start) {
                        if (gpos < range) {
                            range = gpos + 1;
                        } else {
                            range = start + 1;
                        }
                    }
                } else {
                    range = start;
                }
            }

        } else if (str == end) { /* empty string */
            // empty address ?
            if (Config.DEBUG_SEARCH) Config.log.println("onig_search: empty string.");

            if (regex.thresholdLength == 0) {
                s = start = str;
                prev = -1;
                msaInit(option, start, start);

                if (Config.USE_CEC) stateCheckBuffClear();

                if (matchCheck(end, s, prev, interrupt)) return match(s);
                return mismatch();
            }
            return FAILED; // goto mismatch_no_msa;
        }

        if (Config.DEBUG_SEARCH) debugSearch(str, end, start, range);

        msaInit(option, origStart, gpos);
        if (Config.USE_CEC) {
            int offset = Math.min(start, range) - str;
            stateCheckBuffInit(end - str, offset, regex.numCombExpCheck);
        }

        s = start;
        if (range > start) {    /* forward search */
            if (s > str) {
                prev = enc.prevCharHead(bytes, str, s, end);
            } else {
                prev = 0; // -1
            }

            if (regex.forward != null) {
                int schRange = range;
                if (regex.dMax != 0) {
                    if (regex.dMax == MinMaxLen.INFINITE_DISTANCE) {
                        schRange = end;
                    } else {
                        schRange += regex.dMax;
                        if (schRange > end) schRange = end;
                    }
                }
                if ((end - start) < regex.thresholdLength) return mismatch();

                if (regex.dMax != MinMaxLen.INFINITE_DISTANCE) {
                    do {
                        if (!forwardSearchRange(bytes, str, end, s, schRange, this)) return mismatch(); // low, high, lowPrev
                        if (s < low) {
                            s = low;
                            prev = value;
                        }
                        while (s <= high) {
                            if (matchCheck(origRange, s, prev, interrupt)) return match(s); // ???
                            prev = s;
                            s += enc.length(bytes, s, end);
                        }
                    } while (s < range);
                    return mismatch();

                } else { /* check only. */
                    if (!forwardSearchRange(bytes, str, end, s, schRange, null)) return mismatch();

                    if ((regex.anchor & AnchorType.ANYCHAR_STAR) != 0) {
                        do {
                            if (matchCheck(origRange, s, prev, interrupt)) return match(s);
                            prev = s;
                            s += enc.length(bytes, s, end);

                            if ((regex.anchor & (AnchorType.LOOK_BEHIND | AnchorType.PREC_READ_NOT)) == 0) {
                                while (!enc.isNewLine(bytes, prev, end) && s < range) {
                                    prev = s;
                                    s += enc.length(bytes, s, end);
                                }
                            }
                        } while (s < range);
                        return mismatch();
                    }
                }
            }

            do {
                if (matchCheck(origRange, s, prev, interrupt)) return match(s);
                prev = s;
                s += enc.length(bytes, s, end);
            } while (s < range);

            if (s == range) { /* because empty match with /$/. */
                if (matchCheck(origRange, s, prev, interrupt)) return match(s);
            }
        } else { /* backward search */
            if (Config.USE_MATCH_RANGE_MUST_BE_INSIDE_OF_SPECIFIED_RANGE) {
                if (origStart < end) {
                    origStart += enc.length(bytes, origStart, end); // /* is upper range */
                }
            }

            if (regex.backward != null) {
                int adjrange;
                if (range < end) {
                    adjrange = enc.leftAdjustCharHead(bytes, str, range, end);
                } else {
                    adjrange = end;
                }
                if (regex.dMax != MinMaxLen.INFINITE_DISTANCE && (end - range) >= regex.thresholdLength) {
                    do {
                        int schStart = s + regex.dMax;
                        if (schStart > end) schStart = end;
                        if (!backwardSearchRange(bytes, str, end, schStart, range, adjrange)) return mismatch(); // low, high
                        if (s > high) s = high;
                        while (s != -1 && s >= low) {
                            prev = enc.prevCharHead(bytes, str, s, end);
                            if (matchCheck(origStart, s, prev, interrupt)) return match(s);
                            s = prev;
                        }
                    } while (s >= range);
                    return mismatch();
                } else { /* check only. */
                    if ((end - range) < regex.thresholdLength) return mismatch();

                    int schStart = s;
                    if (regex.dMax != 0) {
                        if (regex.dMax == MinMaxLen.INFINITE_DISTANCE) {
                            schStart = end;
                        } else {
                            schStart += regex.dMax;
                            if (schStart > end) {
                                schStart = end;
                            } else {
                                schStart = enc.leftAdjustCharHead(bytes, start, schStart, end);
                            }
                        }
                    }
                    if (!backwardSearchRange(bytes, str, end, schStart, range, adjrange)) return mismatch();
                }
            }

            do {
                prev = enc.prevCharHead(bytes, str, s, end);
                if (matchCheck(origStart, s, prev, interrupt)) return match(s);
                s = prev;
            } while (s >= range);

        }
        return mismatch();
    }

    private final boolean endBuf(int start, int range, int minSemiEnd, int maxSemiEnd) {
        if ((maxSemiEnd - str) < regex.anchorDmin) return true; // mismatch_no_msa;

        if (range > start) {
            if ((minSemiEnd - start) > regex.anchorDmax) {
                start = minSemiEnd - regex.anchorDmax;
                if (start < end) {
                    start = enc.rightAdjustCharHead(bytes, str, start, end);
                } else { /* match with empty at end */
                    start = enc.prevCharHead(bytes, str, end, end);
                }
            }
            if ((maxSemiEnd - (range - 1)) < regex.anchorDmin) {
                range = maxSemiEnd - regex.anchorDmin + 1;
            }
            if (start >= range) return true; // mismatch_no_msa;
        } else {
            if ((minSemiEnd - range) > regex.anchorDmax) {
                range = minSemiEnd - regex.anchorDmax;
            }
            if ((maxSemiEnd - start) < regex.anchorDmin) {
                start = maxSemiEnd - regex.anchorDmin;
                start = enc.leftAdjustCharHead(bytes, str, start, end);
            }
            if (range > start) return true; // mismatch_no_msa;
        }
        return false;
    }

    private final int match(int s) {
        return s - str; // sstart ???
    }

    private final int mismatch() {
        if (Config.USE_FIND_LONGEST_SEARCH_ALL_OF_RANGE) {
            if (msaBestLen >= 0) {
                int s = msaBestS;
                return match(s);
            }
        }
        // falls through finish:
        return FAILED;
    }

    private byte[]icbuf;

    protected final byte[]icbuf() {
        return icbuf == null ? icbuf = new byte[Config.ENC_MBC_CASE_FOLD_MAXLEN] : icbuf;
    }

    static boolean isMbcAsciiWord(Encoding enc, byte[]bytes, int p, int end) { // ONIGENC_IS_MBC_ASCII_WORD
        return ASCIIEncoding.INSTANCE.isCodeCType(enc.mbcToCode(bytes, p, end), CharacterType.WORD);
    }

    private final void debugForwardSearchRange(int str, int end, int s, int range) {
        if (Config.DEBUG_SEARCH) {
            Config.log.println("forward_search_range: " +
                "str: " + str +
                ", end: " + end +
                ", s: " + s +
                ", range: " + range);
        }
    }

    private final void debugForwardSearchRangeSuccess(int str, int low, int high) {
        if (Config.DEBUG_SEARCH) {
            Config.log.println("forward_search_range success: " +
                "low: " + (low - str) +
                ", high: " + (high - str) +
                ", dmin: " + regex.dMin +
                ", dmax: " + regex.dMax);
        }
    }

    private final void debugSearch(int str, int end, int start, int range) {
        if (Config.DEBUG_SEARCH) {
            Config.log.println("onig_search (entry point): " +
                "str: " + str +
                ", end: " + (end - str) +
                ", start: " + (start - str) +
                ", range " + (range - str));
        }
    }

    private final void debugBackwardSearchRange(int str, int low, int high) {
        if (Config.DEBUG_SEARCH) {
            Config.log.println("backward_search_range: "+
                "low: " + (low - str) +
                ", high: " + (high - str));
        }
    }

    static void debugSearch(String name, int textP, int textEnd, int textRange) {
        Config.log.println(name + ": text: " + textP + ", text_end: " + textEnd + ", text_range: " + textRange);
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy