org.hsqldb.lib.KMPSearchAlgorithm Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of hsqldb Show documentation
HSQLDB - Lightweight 100% Java SQL Database Engine
There is a newer version: 2.7.2
/* Copyright (c) 2001-2019, The HSQL Development Group
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 *
 * Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * Neither the name of the HSQL Development Group nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL HSQL DEVELOPMENT GROUP, HSQLDB.ORG,
 * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */


package org.hsqldb.lib;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;

/**
 * Implements the Knuth-Morris-Pratt string search algorithm for searching
 * streams or arrays of octets or characters. 
 *
 * This algorithm is a good choice for searching large, forward-only access
 * streams for repeated search using pre-processed small to medium sized
 * patterns.  

 *
 * This is because in addition to the facts that it:
 *
 * 

 * does not require pre-processing the searched data (only the pattern)
 * 
scans strictly left-to-right
 * 
does not need to perform back tracking
 * 
does not need to employ reverse scan order
 * 
does not need to perform effectively random access lookups against
 *     the searched data or pattern
 * 
 *
 * it also has:
 *
 * 
 * a very simple, highly predictable behavior
 * 
an O(n) complexity once the a search pattern is preprocessed
 * 
an O(m) complexity for preprocessing search patterns
 * 
a worst case performance characteristic of only 2n
 * 
a typical performance characteristic that is deemed to be
 *     2-3 times better than the naive search algorithm employed by
 *     {@link String#indexOf(java.lang.String,int)}.
 * 
 *
 * Note that the Boyer-Moore algorithm is generally considered to be the better
 * practical, all-round exact sub-string search algorithm, but due to its
 * reverse pattern scan order, performance considerations dictate that it
 * requires more space and that is somewhat more complex to implement
 * efficiently for searching forward-only access streams. 
 *
 * In  particular, its higher average performance is biased toward larger
 * search patterns, due to its ability to skip ahead further and with fewer
 * tests under reverse pattern scan.  But when searching forward-only access
 * streams, overall performance considerations require the use a circular buffer
 * of the same size as the search pattern to hold data from the searched stream
 * as it is being compared in reverse order to the search pattern.  Hence,
 * Boyer-Moore requires at minimum twice the memory required by Knuth-Morris-Pratt
 * to search for the same pattern and that factor has the greatest impact
 * precisely on the same class of patterns (larger) for which it is most
 * outperforms Knuth-Morris-Pratt.
 *
 * @author Campbell Burnet (campbell-burnet@users dot sourceforge.net)
 * @version 2.1
 * @since 2.1
 * @see Knuth-Morris-Pratt algorithm
 */
public class KMPSearchAlgorithm {

    /**
     * Searches the given octet stream for the given octet pattern
     * returning the zero-based offset from the initial stream position
     * at which the first match is detected. 

     *
     * Note that the signature includes a slot for the table so that
     * searches for a pattern can be performed multiple times without
     * incurring the overhead of computing the table each time.
     *
     * @param inputStream in which to search
     * @param pattern for which to search
     * @param table computed from the pattern that optimizes the search.
     *        If null, automatically computed.
     * @return zero-based offset of first match; -1 if no match found.
     * @throws IOException when an error occurs accessing the input stream.
     */
    public static long search(final InputStream inputStream,
                              final byte[] pattern,
                              int[] table) throws IOException {

        if (inputStream == null || pattern == null || pattern.length == 0) {
            return -1;
        }

        //
        final int patternLength = pattern.length;

        //
        long streamIndex = -1;
        int  currentByte;

        if (patternLength == 1) {
            final int byteToFind = pattern[0];

            while (-1 != (currentByte = inputStream.read())) {
                streamIndex++;

                if (currentByte == byteToFind) {
                    return streamIndex;
                }
            }

            return -1;
        }

        int patternIndex = 0;

        if (table == null) {
            table = computeTable(pattern);
        }

        while (-1 != (currentByte = inputStream.read())) {
            streamIndex++;

            if (currentByte == pattern[patternIndex]) {
                patternIndex++;
            } else if (patternIndex > 0) {
                patternIndex = table[patternIndex];

                patternIndex++;
            }

            if (patternIndex == patternLength) {
                return streamIndex - (patternLength - 1);
            }
        }

        return -1;
    }

    /**
     * Searches the given character stream for the given character pattern
     * returning the zero-based offset from the initial stream position
     * at which the first match is detected. 

     *
     * Note that the signature includes a slot for the table so that
     * searches for a pattern can be performed multiple times without
     * incurring the overhead of computing the table each time.
     *
     * @param reader in which to search
     * @param pattern for which to search
     * @param table computed from the pattern that optimizes the search
     *        If null, automatically computed.
     * @return zero-based offset of first match; -1 if no match found.
     * @throws IOException when an error occurs accessing the input stream.
     */
    public static long search(final Reader reader, final char[] pattern,
                              int[] table) throws IOException {

        if (reader == null || pattern == null || pattern.length == 0) {
            return -1;
        }

        //
        final int patternLength = pattern.length;

        //
        long streamIndex = -1;
        int  currentCharacter;

        if (patternLength == 1) {
            final int characterToFind = pattern[0];

            while (-1 != (currentCharacter = reader.read())) {
                streamIndex++;

                if (currentCharacter == characterToFind) {
                    return streamIndex;
                }
            }

            return -1;
        }

        int patternIndex = 0;

        if (table == null) {
            table = computeTable(pattern);
        }

        while (-1 != (currentCharacter = reader.read())) {
            streamIndex++;

            if (currentCharacter == pattern[patternIndex]) {
                patternIndex++;
            } else if (patternIndex > 0) {
                patternIndex = table[patternIndex];

                patternIndex++;
            }

            if (patternIndex == patternLength) {
                return streamIndex - (patternLength - 1);
            }
        }

        return -1;
    }

    /**
     * Searches the given octet string for the given octet pattern
     * returning the zero-based offset from given start position
     * at which the first match is detected. 
     *
     * Note that the signature includes a slot for the table so that
     * searches for a pattern can be performed multiple times without
     * incurring the overhead of computing the table each time.
     *
     * @param source array in which to search
     * @param pattern to be matched
     * @param table computed from the pattern that optimizes the search
     *        If null, automatically computed.
     * @param start position in source at which to start the search
     */
    public static int search(final byte[] source, final byte[] pattern,
                             int[] table, final int start) {

        if (source == null || pattern == null || pattern.length == 0) {
            return -1;
        }

        //
        final int sourceLength  = source.length;
        final int patternLength = pattern.length;

        //
        int sourceIndex = start;

        if (patternLength == 1) {
            final int byteToFind = pattern[0];

            for (; sourceIndex < sourceLength; sourceIndex++) {
                if (source[sourceIndex] == byteToFind) {
                    return sourceIndex;
                }
            }

            return -1;
        }

        //
        int matchStart   = start;
        int patternIndex = 0;

        //
        if (table == null) {
            table = computeTable(pattern);
        }

        //
        while ((sourceIndex < sourceLength)
                && (patternIndex < patternLength)) {
            if (source[sourceIndex] == pattern[patternIndex]) {
                patternIndex++;
            } else {
                final int tableValue = table[patternIndex];

                matchStart += (patternIndex - tableValue);

                if (patternIndex > 0) {
                    patternIndex = tableValue;
                }

                patternIndex++;
            }

            sourceIndex = (matchStart + patternIndex);
        }

        if (patternIndex == patternLength) {
            return matchStart;
        } else {
            return -1;
        }
    }

    /**
     * Searches the given character array for the given character pattern
     * returning the zero-based offset from given start position
     * at which the first match is detected.
     *
     * @param source array in which to search
     * @param pattern to be matched
     * @param table computed from the pattern that optimizes the search
     *        If null, automatically computed.
     * @param start position in source at which to start the search
     */
    public static int search(final char[] source, final char[] pattern,
                             int[] table, final int start) {

        if (source == null || pattern == null || pattern.length == 0) {
            return -1;
        }

        final int sourceLength  = source.length;
        final int patternLength = pattern.length;
        int       sourceIndex   = start;

        if (patternLength == 1) {
            final int characterToFind = pattern[0];

            for (; sourceIndex < sourceLength; sourceIndex++) {
                if (source[sourceIndex] == characterToFind) {
                    return sourceIndex;
                }
            }

            return -1;
        }

        //
        int matchStart   = start;
        int patternIndex = 0;

        //
        if (table == null) {
            table = computeTable(pattern);
        }

        //
        while ((sourceIndex < sourceLength)
                && (patternIndex < patternLength)) {
            if (source[sourceIndex] == pattern[patternIndex]) {
                patternIndex++;
            } else {
                final int tableValue = table[patternIndex];

                matchStart += (patternIndex - tableValue);

                if (patternIndex > 0) {
                    patternIndex = tableValue;
                }

                patternIndex++;
            }

            sourceIndex = (matchStart + patternIndex);
        }

        if (patternIndex == patternLength) {
            return matchStart;
        } else {
            return -1;
        }
    }

    /**
     * Searches the given String object for the given character pattern
     * returning the zero-based offset from given start position
     * at which the first match is detected.
     *
     * @param source array to be searched
     * @param pattern to be matched
     * @param table computed from the pattern that optimizes the search
     * @param start position in source at which to start the search
     */
    public static int search(final String source, final String pattern,
                             int[] table, final int start) {

        if (source == null || pattern == null || pattern.length() == 0) {
            return -1;
        }

        final int patternLength = pattern.length();

        //
        if (patternLength == 1) {
            return source.indexOf(pattern, start);
        }

        //
        final int sourceLength = source.length();

        //
        int matchStart   = start;
        int sourceIndex  = start;
        int patternIndex = 0;

        //
        if (table == null) {
            table = computeTable(pattern);
        }

        //
        while ((sourceIndex < sourceLength)
                && (patternIndex < patternLength)) {
            if (source.charAt(sourceIndex) == pattern.charAt(patternIndex)) {
                patternIndex++;
            } else {
                final int tableValue = table[patternIndex];

                matchStart += (patternIndex - tableValue);

                if (patternIndex > 0) {
                    patternIndex = tableValue;
                }

                patternIndex++;
            }

            sourceIndex = matchStart + patternIndex;
        }

        if (patternIndex == patternLength) {
            return matchStart;
        } else {
            return -1;
        }
    }

    /**
     * computes the table used to optimize octet pattern search
     *
     * @param pattern for which to compute the table.
     * @return the table computed from the octet pattern.
     */
    public static int[] computeTable(final byte[] pattern) {

        if (pattern == null) {
            throw new IllegalArgumentException("Pattern must  not be null.");
        } else if (pattern.length < 2) {
            throw new IllegalArgumentException("Pattern length must be > 1.");
        }

        //
        final int[] table = new int[pattern.length];
        int         i     = 2;
        int         j     = 0;

        //
        table[0] = -1;
        table[1] = 0;

        //
        while (i < pattern.length) {
            if (pattern[i - 1] == pattern[j]) {
                table[i] = j + 1;

                j++;
                i++;
            } else if (j > 0) {
                j = table[j];
            } else {
                table[i] = 0;

                i++;

                j = 0;
            }
        }

        //
        return table;
    }

    public static int[] computeTable(final char[] pattern) {

        if (pattern == null) {
            throw new IllegalArgumentException("Pattern must  not be null.");
        } else if (pattern.length < 2) {
            throw new IllegalArgumentException("Pattern length must be > 1.");
        }

        int[] table = new int[pattern.length];
        int   i     = 2;
        int   j     = 0;

        table[0] = -1;
        table[1] = 0;

        while (i < pattern.length) {
            if (pattern[i - 1] == pattern[j]) {
                table[i] = j + 1;

                j++;
                i++;
            } else if (j > 0) {
                j = table[j];
            } else {
                table[i] = 0;

                i++;

                j = 0;
            }
        }

        return table;
    }

    public static int[] computeTable(final String pattern) {

        if (pattern == null) {
            throw new IllegalArgumentException("Pattern must  not be null.");
        } else if (pattern.length() < 2) {
            throw new IllegalArgumentException("Pattern length must be > 1.");
        }

        final int patternLength = pattern.length();

        //
        int[] table = new int[patternLength];
        int   i     = 2;
        int   j     = 0;

        table[0] = -1;
        table[1] = 0;

        while (i < patternLength) {
            if (pattern.charAt(i - 1) == pattern.charAt(j)) {
                table[i] = j + 1;

                j++;
                i++;
            } else if (j > 0) {
                j = table[j];
            } else {
                table[i] = 0;

                i++;

                j = 0;
            }
        }

        return table;
    }
}