All downloads are free. Search and download functionality uses the official Maven repository.

io.trino.likematcher.NfaMatcher Maven / Gradle / Ivy

There is a newer version: 468
Show newest version
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.trino.likematcher;

import java.util.Arrays;
import java.util.List;

/**
 * Matches UTF-8 encoded input against a section of a parsed pattern by simulating a
 * non-deterministic finite automaton (NFA), one Unicode code point at a time.
 *
 * <p>The automaton is a linear chain of states. State {@code i} advances to state
 * {@code i + 1} when the current code point satisfies {@code match[i]}, and additionally
 * stays active on any code point when {@code loopback[i]} is set. The last state in the
 * chain is the accept state.
 */
final class NfaMatcher
        implements Matcher
{
    // Marker in match[]: the transition out of this state accepts any code point
    private static final int ANY = -1;
    // Marker in match[]: this state has no forward transition (used for the accept state)
    private static final int NONE = -2;
    // Sentinel used by the decoder when the next UTF-8 sequence cannot be decoded
    private static final int INVALID_CODEPOINT = -1;

    // When false, match() returns as soon as the accept state is reached,
    // without consuming the rest of the input
    private final boolean exact;

    // loopback[s] == true means state s remains active after consuming any code point
    private final boolean[] loopback;
    // match[s] is the code point (or ANY / NONE) required to move from state s to s + 1
    private final int[] match;
    private final int acceptState;
    private final int stateCount;

    /**
     * Builds the automaton for the elements {@code pattern[start..end]} (both inclusive).
     *
     * @param pattern the parsed pattern elements
     * @param start index of the first element to include
     * @param end index of the last element to include
     * @param exact whether the whole input must be consumed for a match; when {@code false},
     *         matching succeeds as soon as the accept state is reached
     */
    public NfaMatcher(List<Pattern> pattern, int start, int end, boolean exact)
    {
        this.exact = exact;

        stateCount = calculateStateCount(pattern, start, end);

        loopback = new boolean[stateCount];
        match = new int[stateCount];
        Arrays.fill(match, NONE);
        acceptState = stateCount - 1;

        int state = 0;
        for (int j = start; j <= end; j++) {
            Pattern element = pattern.get(j);
            if (element instanceof Pattern.Literal literal) {
                // One state per Unicode code point (not per UTF-16 char): match() compares
                // against decoded code points, so a surrogate pair must form a single transition
                CharSequence value = literal.value();
                int index = 0;
                while (index < value.length()) {
                    int codepoint = Character.codePointAt(value, index);
                    match[state++] = codepoint;
                    index += Character.charCount(codepoint);
                }
            }
            else if (element instanceof Pattern.Any any) {
                for (int i = 0; i < any.length(); i++) {
                    match[state++] = ANY;
                }
            }
            else if (element instanceof Pattern.ZeroOrMore) {
                // Does not add a state: the current state simply absorbs any number of code points
                loopback[state] = true;
            }
        }
    }

    /**
     * Computes the number of states needed for {@code pattern[start..end]}: one per literal
     * code point, one per {@code Any} position, plus the final accept state.
     */
    private static int calculateStateCount(List<Pattern> pattern, int start, int end)
    {
        int states = 1;
        for (int i = start; i <= end; i++) {
            Pattern element = pattern.get(i);
            if (element instanceof Pattern.Literal literal) {
                // Must agree with the constructor, which emits one state per code point
                CharSequence value = literal.value();
                states += Character.codePointCount(value, 0, value.length());
            }
            else if (element instanceof Pattern.Any any) {
                states += any.length();
            }
        }
        return states;
    }

    /**
     * Runs the automaton over {@code input[offset .. offset + length)}.
     *
     * <p>The input is decoded as UTF-8. A truncated multi-byte sequence or an unrecognized
     * lead byte makes the match fail. Continuation bytes are not validated; only their low
     * six bits are used.
     *
     * @param input buffer holding the UTF-8 encoded text
     * @param offset index of the first byte to examine
     * @param length number of bytes to examine
     * @return {@code true} if the input is accepted by the automaton
     */
    @Override
    public boolean match(byte[] input, int offset, int length)
    {
        // One extra slot because "state + 1" is probed even when state is the accept state
        boolean[] seen = new boolean[stateCount + 1];
        int[] currentStates = new int[stateCount];
        int[] nextStates = new int[stateCount];
        int currentStatesIndex = 0;
        int nextStatesIndex;

        currentStates[currentStatesIndex++] = 0;

        int limit = offset + length;
        int current = offset;
        // Before any input is consumed only state 0 is active, so empty input is
        // accepted exactly when state 0 is also the accept state
        boolean accept = acceptState == 0;
        while (current < limit) {
            int codepoint = INVALID_CODEPOINT;

            // decode the next UTF-8 codepoint
            int header = input[current] & 0xFF;
            if (header < 0x80) {
                // normal ASCII
                // 0xxx_xxxx
                codepoint = header;
                current++;
            }
            else if ((header & 0b1110_0000) == 0b1100_0000) {
                // 110x_xxxx 10xx_xxxx
                if (current + 1 < limit) {
                    codepoint = ((header & 0b0001_1111) << 6) | (input[current + 1] & 0b0011_1111);
                    current += 2;
                }
            }
            else if ((header & 0b1111_0000) == 0b1110_0000) {
                // 1110_xxxx 10xx_xxxx 10xx_xxxx
                if (current + 2 < limit) {
                    codepoint = ((header & 0b0000_1111) << 12) | ((input[current + 1] & 0b0011_1111) << 6) | (input[current + 2] & 0b0011_1111);
                    current += 3;
                }
            }
            else if ((header & 0b1111_1000) == 0b1111_0000) {
                // 1111_0xxx 10xx_xxxx 10xx_xxxx 10xx_xxxx
                if (current + 3 < limit) {
                    codepoint = ((header & 0b0000_0111) << 18) | ((input[current + 1] & 0b0011_1111) << 12) | ((input[current + 2] & 0b0011_1111) << 6) | (input[current + 3] & 0b0011_1111);
                    current += 4;
                }
            }

            if (codepoint == INVALID_CODEPOINT) {
                return false;
            }

            // Compute the set of states reachable after consuming this code point
            accept = false;
            nextStatesIndex = 0;
            Arrays.fill(seen, false);
            for (int i = 0; i < currentStatesIndex; i++) {
                int state = currentStates[i];
                if (!seen[state] && loopback[state]) {
                    nextStates[nextStatesIndex++] = state;
                    accept |= state == acceptState;
                    seen[state] = true;
                }
                int next = state + 1;
                if (!seen[next] && (match[state] == ANY || match[state] == codepoint)) {
                    nextStates[nextStatesIndex++] = next;
                    accept |= next == acceptState;
                    seen[next] = true;
                }
            }

            // No live states left: the input cannot match
            if (nextStatesIndex == 0) {
                return false;
            }

            // In non-exact mode reaching the accept state is enough; skip the remaining input
            if (!exact && accept) {
                return true;
            }

            int[] tmp = currentStates;
            currentStates = nextStates;
            nextStates = tmp;
            currentStatesIndex = nextStatesIndex;
        }

        return accept;
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy