io.netty.buffer.search.AbstractMultiSearchProcessorFactory Maven / Gradle / Ivy
/*
* Copyright 2020 The Netty Project
*
* The Netty Project licenses this file to you under the Apache License, version 2.0 (the
* "License"); you may not use this file except in compliance with the License. You may obtain a
* copy of the License at:
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing permissions and limitations under
* the License.
*/
package io.netty.buffer.search;
/**
* Base class for precomputed factories that create {@link MultiSearchProcessor}s.
*
* The purpose of {@link MultiSearchProcessor} is to perform efficient simultaneous search for multiple {@code needles}
* in the {@code haystack}, while scanning every byte of the input sequentially, only once. While it can also be used
* to search for just a single {@code needle}, using a {@link SearchProcessorFactory} would be more efficient for
* doing that.
*
* See the documentation of {@link AbstractSearchProcessorFactory} for a comprehensive description of common usage.
* In addition to the functionality provided by {@link SearchProcessor}, {@link MultiSearchProcessor} adds
* a method to get the index of the {@code needle} found at the current position of the {@link MultiSearchProcessor} -
* {@link MultiSearchProcessor#getFoundNeedleId()}.
*
* Note: in some cases one {@code needle} can be a suffix of another {@code needle}, eg. {@code {"BC", "ABC"}},
* and there can potentially be multiple {@code needles} found ending at the same position of the {@code haystack}.
* In such case {@link MultiSearchProcessor#getFoundNeedleId()} returns the index of the longest matching {@code needle}
* in the array of {@code needles}.
*
* Usage example (given that the {@code haystack} is a {@link io.netty.buffer.ByteBuf} containing "ABCD" and the
* {@code needles} are "AB", "BC" and "CD"):
*
* MultiSearchProcessorFactory factory = MultiSearchProcessorFactory.newAhoCorasicSearchProcessorFactory(
* "AB".getBytes(CharsetUtil.UTF_8), "BC".getBytes(CharsetUtil.UTF_8), "CD".getBytes(CharsetUtil.UTF_8));
* MultiSearchProcessor processor = factory.newSearchProcessor();
*
* int idx1 = haystack.forEachByte(processor);
* // idx1 is 1 (index of the last character of the occurrence of "AB" in the haystack)
* // processor.getFoundNeedleId() is 0 (index of "AB" in needles[])
*
* int continueFrom1 = idx1 + 1;
* // continue the search starting from the next character
*
* int idx2 = haystack.forEachByte(continueFrom1, haystack.readableBytes() - continueFrom1, processor);
* // idx2 is 2 (index of the last character of the occurrence of "BC" in the haystack)
* // processor.getFoundNeedleId() is 1 (index of "BC" in needles[])
*
* int continueFrom2 = idx2 + 1;
*
* int idx3 = haystack.forEachByte(continueFrom2, haystack.readableBytes() - continueFrom2, processor);
* // idx3 is 3 (index of the last character of the occurrence of "CD" in the haystack)
* // processor.getFoundNeedleId() is 2 (index of "CD" in needles[])
*
* int continueFrom3 = idx3 + 1;
*
* int idx4 = haystack.forEachByte(continueFrom3, haystack.readableBytes() - continueFrom3, processor);
* // idx4 is -1 (no more occurrences of any of the needles)
*
* // This search session is complete, processor should be discarded.
* // To search for the same needles again, reuse the same {@link AbstractMultiSearchProcessorFactory}
* // to get a new MultiSearchProcessor.
*
*/
public abstract class AbstractMultiSearchProcessorFactory implements MultiSearchProcessorFactory {
/**
* Creates a {@link MultiSearchProcessorFactory} based on
* Aho–Corasick
* string search algorithm.
*
* Precomputation (this method) time is linear in the size of input ({@code O(Σ|needles|)}).
*
* The factory allocates and retains an array of 256 * X ints plus another array of X ints, where X
* is the sum of lengths of each entry of {@code needles} minus the sum of lengths of repeated
* prefixes of the {@code needles}.
*
* Search (the actual application of {@link MultiSearchProcessor}) time is linear in the size of
* {@link io.netty.buffer.ByteBuf} on which the search is performed ({@code O(|haystack|)}).
* Every byte of {@link io.netty.buffer.ByteBuf} is processed only once, sequentually, regardles of
* the number of {@code needles} being searched for.
*
* @param needles a varargs array of arrays of bytes to search for
* @return a new instance of {@link AhoCorasicSearchProcessorFactory} precomputed for the given {@code needles}
*/
public static AhoCorasicSearchProcessorFactory newAhoCorasicSearchProcessorFactory(byte[] ...needles) {
return new AhoCorasicSearchProcessorFactory(needles);
}
}