org.hsqldb.lib.KMPSearchAlgorithm Maven / Gradle / Ivy
/* Copyright (c) 2001-2011, The HSQL Development Group
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* Neither the name of the HSQL Development Group nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL HSQL DEVELOPMENT GROUP, HSQLDB.ORG,
* OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package org.hsqldb.lib;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
/**
* Implements the Knuth-Morris-Pratt string search algorithm for searching
* streams or arrays of octets or characters.
*
* This algorithm is a good choice for searching large, forward-only access
* streams for repeated search using pre-processed small to medium sized
* patterns.
*
* This is because in addition to the facts that it:
*
*
* - does not require pre-processing the searched data (only the pattern)
*
- scans strictly left-to-right
*
- does not need to perform back tracking
*
- does not need to employ reverse scan order
*
- does not need to perform effectively random access lookups against
* the searched data or pattern
*
*
* it also has:
*
*
* - a very simple, highly predictable behavior
*
- an O(n) complexity once the a search pattern is preprocessed
*
- an O(m) complexity for preprocessing search patterns
*
- a worst case performance characteristic of only 2n
*
- a typical performance characteristic that is deemed to be
* 2-3 times better than the naive search algorithm employed by
* {@link String#indexOf(java.lang.String,int)}.
*
*
* Note that the Boyer-Moore algorithm is generally considered to be the better
* practical, all-round exact sub-string search algorithm, but due to its
* reverse pattern scan order, performance considerations dictate that it
* requires more space and that is somewhat more complex to implement
* efficiently for searching forward-only access streams.
*
* In particular, its higher average performance is biased toward larger
* search patterns, due to its ability to skip ahead further and with fewer
* tests under reverse pattern scan. But when searching forward-only access
* streams, overall performance considerations require the use a circular buffer
* of the same size as the search pattern to hold data from the searched stream
* as it is being compared in reverse order to the search pattern. Hence,
* Boyer-Moore requires at minimum twice the memory required by Knuth-Morris-Pratt
* to search for the same pattern and that factor has the greatest impact
* precisely on the same class of patterns (larger) for which it is most
* outperforms Knuth-Morris-Pratt.
*
* @author Campbell Boucher-Burnet (boucherb@users dot sourceforge.net)
* @version 2.1
* @since 2.1
* @see Knuth-Morris-Pratt algorithm
*/
public class KMPSearchAlgorithm {
/**
* Searches the given octet stream for the given octet pattern
* returning the zero-based offset from the initial stream position
* at which the first match is detected.
*
* Note that the signature includes a slot for the table so that
* searches for a pattern can be performed multiple times without
* incurring the overhead of computing the table each time.
*
* @param inputStream in which to search
* @param pattern for which to search
* @param table computed from the pattern that optimizes the search.
* If null, automatically computed.
* @return zero-based offset of first match; -1 if no match found.
* @throws IOException when an error occurs accessing the input stream.
*/
public static long search(final InputStream inputStream,
final byte[] pattern,
int[] table) throws IOException {
if (inputStream == null || pattern == null || pattern.length == 0) {
return -1;
}
//
final int patternLength = pattern.length;
//
long streamIndex = -1;
int currentByte;
if (patternLength == 1) {
final int byteToFind = pattern[0];
while (-1 != (currentByte = inputStream.read())) {
streamIndex++;
if (currentByte == byteToFind) {
return streamIndex;
}
}
return -1;
}
int patternIndex = 0;
if (table == null) {
table = computeTable(pattern);
}
while (-1 != (currentByte = inputStream.read())) {
streamIndex++;
if (currentByte == pattern[patternIndex]) {
patternIndex++;
} else if (patternIndex > 0) {
patternIndex = table[patternIndex];
patternIndex++;
}
if (patternIndex == patternLength) {
return streamIndex - (patternLength - 1);
}
}
return -1;
}
/**
* Searches the given character stream for the given character pattern
* returning the zero-based offset from the initial stream position
* at which the first match is detected.
*
* Note that the signature includes a slot for the table so that
* searches for a pattern can be performed multiple times without
* incurring the overhead of computing the table each time.
*
* @param reader in which to search
* @param pattern for which to search
* @param table computed from the pattern that optimizes the search
* If null, automatically computed.
* @return zero-based offset of first match; -1 if no match found.
* @throws IOException when an error occurs accessing the input stream.
*/
public static long search(final Reader reader, final char[] pattern,
int[] table) throws IOException {
if (reader == null || pattern == null || pattern.length == 0) {
return -1;
}
//
final int patternLength = pattern.length;
//
long streamIndex = -1;
int currentCharacter;
if (patternLength == 1) {
final int characterToFind = pattern[0];
while (-1 != (currentCharacter = reader.read())) {
streamIndex++;
if (currentCharacter == characterToFind) {
return streamIndex;
}
}
return -1;
}
int patternIndex = 0;
if (table == null) {
table = computeTable(pattern);
}
while (-1 != (currentCharacter = reader.read())) {
streamIndex++;
if (currentCharacter == pattern[patternIndex]) {
patternIndex++;
} else if (patternIndex > 0) {
patternIndex = table[patternIndex];
patternIndex++;
}
if (patternIndex == patternLength) {
return streamIndex - (patternLength - 1);
}
}
return -1;
}
/**
* Searches the given octet string for the given octet pattern
* returning the zero-based offset from given start position
* at which the first match is detected.
*
* Note that the signature includes a slot for the table so that
* searches for a pattern can be performed multiple times without
* incurring the overhead of computing the table each time.
*
* @param source array in which to search
* @param pattern to be matched
* @param table computed from the pattern that optimizes the search
* If null, automatically computed.
* @param start position in source at which to start the search
*/
public static int search(final byte[] source, final byte[] pattern,
int[] table, final int start) {
if (source == null || pattern == null || pattern.length == 0) {
return -1;
}
//
final int sourceLength = source.length;
final int patternLength = pattern.length;
//
int sourceIndex = start;
if (patternLength == 1) {
final int byteToFind = pattern[0];
for (; sourceIndex < sourceLength; sourceIndex++) {
if (source[sourceIndex] == byteToFind) {
return sourceIndex;
}
}
return -1;
}
//
int matchStart = start;
int patternIndex = 0;
//
if (table == null) {
table = computeTable(pattern);
}
//
while ((sourceIndex < sourceLength)
&& (patternIndex < patternLength)) {
if (source[sourceIndex] == pattern[patternIndex]) {
patternIndex++;
} else {
final int tableVaue = table[patternIndex];
matchStart += (patternIndex - tableVaue);
if (patternIndex > 0) {
patternIndex = tableVaue;
}
patternIndex++;
}
sourceIndex = (matchStart + patternIndex);
}
if (patternIndex == patternLength) {
return matchStart;
} else {
return -1;
}
}
/**
* Searches the given character array for the given character pattern
* returning the zero-based offset from given start position
* at which the first match is detected.
*
* @param source array in which to search
* @param pattern to be matched
* @param table computed from the pattern that optimizes the search
* If null, automatically computed.
* @param start position in source at which to start the search
*/
public static int search(final char[] source, final char[] pattern,
int[] table, final int start) {
if (source == null || pattern == null || pattern.length == 0) {
return -1;
}
final int sourceLength = source.length;
final int patternLength = pattern.length;
int sourceIndex = start;
if (patternLength == 1) {
final int characterToFind = pattern[0];
for (; sourceIndex < sourceLength; sourceIndex++) {
if (source[sourceIndex] == characterToFind) {
return sourceIndex;
}
}
return -1;
}
//
int matchStart = start;
int patternIndex = 0;
//
if (table == null) {
table = computeTable(pattern);
}
//
while ((sourceIndex < sourceLength)
&& (patternIndex < patternLength)) {
if (source[sourceIndex] == pattern[patternIndex]) {
patternIndex++;
} else {
final int tableValue = table[patternIndex];
matchStart += (patternIndex - tableValue);
if (patternIndex > 0) {
patternIndex = tableValue;
}
patternIndex++;
}
sourceIndex = (matchStart + patternIndex);
}
if (patternIndex == patternLength) {
return matchStart;
} else {
return -1;
}
}
/**
* Searches the given String object for the given character pattern
* returning the zero-based offset from given start position
* at which the first match is detected.
*
* @param source array to be searched
* @param pattern to be matched
* @param table computed from the pattern that optimizes the search
* @param start position in source at which to start the search
*/
public static int search(final String source, final String pattern,
int[] table, final int start) {
if (source == null || pattern == null || pattern.length() == 0) {
return -1;
}
final int patternLength = pattern.length();
//
if (patternLength == 1) {
return source.indexOf(pattern, start);
}
//
final int sourceLength = source.length();
//
int matchStart = start;
int sourceIndex = start;
int patternIndex = 0;
//
if (table == null) {
table = computeTable(pattern);
}
//
while ((sourceIndex < sourceLength)
&& (patternIndex < patternLength)) {
if (source.charAt(sourceIndex) == pattern.charAt(patternIndex)) {
patternIndex++;
} else {
final int tableValue = table[patternIndex];
matchStart += (patternIndex - tableValue);
if (patternIndex > 0) {
patternIndex = tableValue;
}
patternIndex++;
}
sourceIndex = matchStart + patternIndex;
}
if (patternIndex == patternLength) {
return matchStart;
} else {
return -1;
}
}
/**
* computes the table used to optimize octet pattern search
*
* @param pattern for which to compute the table.
* @return the table computed from the octet pattern.
*/
public static int[] computeTable(final byte[] pattern) {
if (pattern == null) {
throw new IllegalArgumentException("Pattern must not be null.");
} else if (pattern.length < 2) {
throw new IllegalArgumentException("Pattern length must be > 1.");
}
//
final int[] table = new int[pattern.length];
int i = 2;
int j = 0;
//
table[0] = -1;
table[1] = 0;
//
while (i < pattern.length) {
if (pattern[i - 1] == pattern[j]) {
table[i] = j + 1;
j++;
i++;
} else if (j > 0) {
j = table[j];
} else {
table[i] = 0;
i++;
j = 0;
}
}
//
return table;
}
public static int[] computeTable(final char[] pattern) {
if (pattern == null) {
throw new IllegalArgumentException("Pattern must not be null.");
} else if (pattern.length < 2) {
throw new IllegalArgumentException("Pattern length must be > 1.");
}
int[] table = new int[pattern.length];
int i = 2;
int j = 0;
table[0] = -1;
table[1] = 0;
while (i < pattern.length) {
if (pattern[i - 1] == pattern[j]) {
table[i] = j + 1;
j++;
i++;
} else if (j > 0) {
j = table[j];
} else {
table[i] = 0;
i++;
j = 0;
}
}
return table;
}
public static int[] computeTable(final String pattern) {
if (pattern == null) {
throw new IllegalArgumentException("Pattern must not be null.");
} else if (pattern.length() < 2) {
throw new IllegalArgumentException("Pattern length must be > 1.");
}
final int patternLength = pattern.length();
//
int[] table = new int[patternLength];
int i = 2;
int j = 0;
table[0] = -1;
table[1] = 0;
while (i < patternLength) {
if (pattern.charAt(i - 1) == pattern.charAt(j)) {
table[i] = j + 1;
j++;
i++;
} else if (j > 0) {
j = table[j];
} else {
table[i] = 0;
i++;
j = 0;
}
}
return table;
}
}