All downloads are free. The search and download functionality uses the official Maven repository.

net.byteseek.matcher.multisequence.TrieMultiSequenceMatcher Maven / Gradle / Ivy

There is a newer version: 2.0.3
Show newest version
/*
 * Copyright Matt Palmer 2009-2012, All rights reserved.
 *
 * This code is licensed under a standard 3-clause BSD license:
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice, 
 *    this list of conditions and the following disclaimer.
 * 
 *  * Redistributions in binary form must reproduce the above copyright notice, 
 *    this list of conditions and the following disclaimer in the documentation 
 *    and/or other materials provided with the distribution.
 * 
 *  * The names of its contributors may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 * 
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
 * POSSIBILITY OF SUCH DAMAGE.
 */

package net.byteseek.matcher.multisequence;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;

import net.byteseek.automata.State;
import net.byteseek.automata.trie.Trie;
import net.byteseek.automata.trie.TrieFactory;
import net.byteseek.io.reader.windows.Window;
import net.byteseek.io.reader.WindowReader;
import net.byteseek.matcher.automata.SequenceMatcherTrieFactory;
import net.byteseek.matcher.sequence.SequenceMatcher;
import net.byteseek.utils.ArgUtils;

/**
 * A {@link MultiSequenceMatcher} which uses a {@link Trie} structure to match
 * with. A Trie is a deterministic automata, arranged as a prefix tree of all
 * the sequences in it.  This means that no matter how many SequenceMatchers
 * are added to it (hundreds, thousands, millions...), to match it performs no more
 * comparisons than required for the longest sequence in the Trie (and usually less).
 * <p>
 * This is highly time-efficient, but takes more space to hold the Trie automata
 * states in addition to the original list of SequenceMatchers used to construct
 * the Trie.
 * <p>
 * Note that for a very low number of SequenceMatchers, it is possible that a simpler
 * matcher, such as the {@link ListMultiSequenceMatcher} may be faster, due to lower
 * complexity and fewer objects required.
 * <p>
 * The TrieMultiSequenceMatcher is immutable, so can be safely used in multi-
 * threaded applications.
 *
 * @author Matt Palmer
 */
public final class TrieMultiSequenceMatcher implements MultiSequenceMatcher {

    private static final TrieFactory<SequenceMatcher> DEFAULT_TRIE_FACTORY = new SequenceMatcherTrieFactory();

    private final Trie<SequenceMatcher> trie;

    /**
     * Constructs an immutable TrieMultiSequenceMatcher from a collection of {@link SequenceMatcher}s,
     * using a default {@link SequenceMatcherTrieFactory} to create the Trie with.
     *
     * @param matchers The collection of sequences to construct the TrieMultiSequenceMatcher from.
     */
    public TrieMultiSequenceMatcher(final Collection<? extends SequenceMatcher> matchers) {
        this(DEFAULT_TRIE_FACTORY, matchers);
    }

    /**
     * Constructs an immutable TrieMultiSequenceMatcher from a collection of {@link SequenceMatcher}s,
     * using the {@link TrieFactory} provided to build the Trie.
     * <p>
     * Both arguments are validated with {@link ArgUtils} before the Trie is built.
     *
     * @param factory The factory to create a {@link Trie} with.
     * @param matchers The collection of sequences to construct the TrieMultiSequenceMatcher from.
     */
    public TrieMultiSequenceMatcher(final TrieFactory<SequenceMatcher> factory,
                                    final Collection<? extends SequenceMatcher> matchers) {
        ArgUtils.checkNullObject(factory, "factory");
        ArgUtils.checkNullOrEmptyCollection(matchers, "matchers");
        this.trie = factory.create(matchers);
    }

    /**
     * {@inheritDoc}
     * <p>
     * Walks the Trie forwards from matchPosition, moving on to following windows
     * of the reader until the Trie has no transition for a byte or the reader
     * supplies no further window.
     */
    @Override
    public Collection<SequenceMatcher> allMatches(final WindowReader reader, final long matchPosition)
            throws IOException {
        List<SequenceMatcher> result = Collections.emptyList();
        State<SequenceMatcher> state = trie.getInitialState();
        long currentPosition = matchPosition;
        Window window = reader.getWindow(currentPosition);
        while (window != null) {
            final int windowLength = window.length();
            final byte[] array = window.getArray();
            final int startOffset = reader.getWindowOffset(currentPosition);
            int windowPosition = startOffset;
            while (windowPosition < windowLength) {
                state = state.getNextState(array[windowPosition++]);
                if (state == null) {
                    return result;
                }
                if (state.isFinal()) {
                    result = addAssociations(result, state);
                }
            }
            final int consumed = windowPosition - startOffset;
            if (consumed <= 0) {
                // No progress was made in this window; stop rather than loop forever.
                return result;
            }
            // Advance only by the bytes actually read, then fetch the window at the
            // new position (not the original match position).
            currentPosition += consumed;
            window = reader.getWindow(currentPosition);
        }
        return result;
    }

    /**
     * {@inheritDoc}
     * <p>
     * No match is attempted unless matchPosition is non-negative and at least
     * {@link #getMinimumLength()} bytes exist from matchPosition to the end of the array.
     */
    @Override
    public Collection<SequenceMatcher> allMatches(final byte[] bytes, final int matchPosition) {
        List<SequenceMatcher> result = Collections.emptyList();
        final int noOfBytes = bytes.length;
        final int minimumLength = trie.getMinimumLength();
        // Written as a subtraction so that a large matchPosition cannot overflow.
        if (matchPosition >= 0 && matchPosition <= noOfBytes - minimumLength) {
            State<SequenceMatcher> state = trie.getInitialState();
            int currentPosition = matchPosition;
            while (state != null && currentPosition < noOfBytes) {
                state = state.getNextState(bytes[currentPosition++]);
                if (state != null && state.isFinal()) {
                    result = addAssociations(result, state);
                }
            }
        }
        return result;
    }

    /**
     * {@inheritDoc}
     * <p>
     * Walks the Trie reading bytes backwards from matchPosition, moving on to
     * preceding windows of the reader until the Trie has no transition for a byte
     * or the start of the data is passed.
     */
    @Override
    public Collection<SequenceMatcher> allMatchesBackwards(final WindowReader reader,
                                                           final long matchPosition) throws IOException {
        List<SequenceMatcher> result = Collections.emptyList();
        State<SequenceMatcher> state = trie.getInitialState();
        long currentPosition = matchPosition;
        Window window = reader.getWindow(currentPosition);
        while (window != null) {
            final byte[] array = window.getArray();
            final int startOffset = reader.getWindowOffset(currentPosition);
            int windowPosition = startOffset;
            while (windowPosition >= 0) {
                state = state.getNextState(array[windowPosition--]);
                if (state == null) {
                    return result;
                }
                if (state.isFinal()) {
                    result = addAssociations(result, state);
                }
            }
            // Step back over the bytes just read (offset 0 up to startOffset inclusive),
            // landing on the last byte of the preceding window.
            currentPosition -= startOffset + 1;
            window = currentPosition < 0 ? null : reader.getWindow(currentPosition);
        }
        return result;
    }

    /**
     * {@inheritDoc}
     * <p>
     * No match is attempted unless matchPosition lies inside the array and at least
     * {@link #getMinimumLength()} bytes exist from the start of the array up to and
     * including matchPosition.
     */
    @Override
    public Collection<SequenceMatcher> allMatchesBackwards(final byte[] bytes, final int matchPosition) {
        List<SequenceMatcher> result = Collections.emptyList();
        final int noOfBytes = bytes.length;
        final int minimumLength = trie.getMinimumLength();
        if (matchPosition >= minimumLength - 1 && matchPosition < noOfBytes) {
            State<SequenceMatcher> state = trie.getInitialState();
            int currentPosition = matchPosition;
            while (state != null && currentPosition >= 0) {
                state = state.getNextState(bytes[currentPosition--]);
                if (state != null && state.isFinal()) {
                    result = addAssociations(result, state);
                }
            }
        }
        return result;
    }

    /**
     * {@inheritDoc}
     * <p>
     * Returns as soon as the first final state of the Trie is reached.
     */
    @Override
    public SequenceMatcher firstMatch(final WindowReader reader, final long matchPosition)
            throws IOException {
        State<SequenceMatcher> state = trie.getInitialState();
        long currentPosition = matchPosition;
        Window window = reader.getWindow(currentPosition);
        while (window != null) {
            final int windowLength = window.length();
            final byte[] array = window.getArray();
            final int startOffset = reader.getWindowOffset(currentPosition);
            int windowPosition = startOffset;
            while (windowPosition < windowLength) {
                state = state.getNextState(array[windowPosition++]);
                if (state == null) {
                    return null;
                }
                if (state.isFinal()) {
                    return getFirstAssociation(state);
                }
            }
            final int consumed = windowPosition - startOffset;
            if (consumed <= 0) {
                // No progress was made in this window; stop rather than loop forever.
                return null;
            }
            // Advance only by the bytes actually read, then fetch the window at the
            // new position (not the original match position).
            currentPosition += consumed;
            window = reader.getWindow(currentPosition);
        }
        return null;
    }

    /**
     * {@inheritDoc}
     * <p>
     * Returns as soon as the first final state of the Trie is reached; a negative
     * matchPosition never matches.
     */
    @Override
    public SequenceMatcher firstMatch(final byte[] bytes, final int matchPosition) {
        if (matchPosition >= 0) {
            final int noOfBytes = bytes.length;
            State<SequenceMatcher> state = trie.getInitialState();
            int currentPosition = matchPosition;
            while (state != null && currentPosition < noOfBytes) {
                state = state.getNextState(bytes[currentPosition++]);
                if (state != null && state.isFinal()) {
                    return getFirstAssociation(state);
                }
            }
        }
        return null;
    }

    /**
     * {@inheritDoc}
     * <p>
     * Reads bytes backwards from matchPosition and returns as soon as the first
     * final state of the Trie is reached.
     */
    @Override
    public SequenceMatcher firstMatchBackwards(final WindowReader reader, final long matchPosition)
            throws IOException {
        State<SequenceMatcher> state = trie.getInitialState();
        long currentPosition = matchPosition;
        Window window = reader.getWindow(currentPosition);
        while (window != null) {
            final byte[] array = window.getArray();
            final int startOffset = reader.getWindowOffset(currentPosition);
            int windowPosition = startOffset;
            while (windowPosition >= 0) {
                state = state.getNextState(array[windowPosition--]);
                if (state == null) {
                    return null;
                }
                if (state.isFinal()) {
                    return getFirstAssociation(state);
                }
            }
            // Step back over the bytes just read (offset 0 up to startOffset inclusive),
            // landing on the last byte of the preceding window.
            currentPosition -= startOffset + 1;
            window = currentPosition < 0 ? null : reader.getWindow(currentPosition);
        }
        return null;
    }

    /**
     * {@inheritDoc}
     * <p>
     * Reads bytes backwards from matchPosition and returns as soon as the first
     * final state of the Trie is reached; a matchPosition at or past the end of
     * the array never matches.
     */
    @Override
    public SequenceMatcher firstMatchBackwards(final byte[] bytes, final int matchPosition) {
        final int noOfBytes = bytes.length;
        if (matchPosition < noOfBytes) {
            State<SequenceMatcher> state = trie.getInitialState();
            int currentPosition = matchPosition;
            while (state != null && currentPosition >= 0) {
                state = state.getNextState(bytes[currentPosition--]);
                if (state != null && state.isFinal()) {
                    return getFirstAssociation(state);
                }
            }
        }
        return null;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public boolean matches(final WindowReader reader, final long matchPosition)
            throws IOException {
        return firstMatch(reader, matchPosition) != null;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public boolean matches(final byte[] bytes, final int matchPosition) {
        return firstMatch(bytes, matchPosition) != null;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public boolean matchesBackwards(final WindowReader reader, final long matchPosition)
            throws IOException {
        return firstMatchBackwards(reader, matchPosition) != null;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public boolean matchesBackwards(final byte[] bytes, final int matchPosition) {
        return firstMatchBackwards(bytes, matchPosition) != null;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public int getMinimumLength() {
        return trie.getMinimumLength();
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public int getMaximumLength() {
        return trie.getMaximumLength();
    }

    /**
     * {@inheritDoc}
     * <p>
     * The reversed matcher is always built with the default Trie factory.
     */
    @Override
    public MultiSequenceMatcher reverse() {
        return new TrieMultiSequenceMatcher(MultiSequenceUtils.reverseMatchers(trie.getSequences()));
    }

    /**
     * {@inheritDoc}
     * <p>
     * The new instance is always built with the default Trie factory.
     */
    @Override
    public MultiSequenceMatcher newInstance(Collection<? extends SequenceMatcher> sequences) {
        return new TrieMultiSequenceMatcher(sequences);
    }

    /**
     * {@inheritDoc}
     * <p>
     * A new list is returned on each call, so callers may modify it freely.
     */
    @Override
    public List<SequenceMatcher> getSequenceMatchers() {
        return new ArrayList<SequenceMatcher>(trie.getSequences());
    }

    /**
     * Returns a string representation of this matcher.  The format is subject
     * to change, but it will generally return the name of the matching class
     * and regular expressions defining the sequences matched by the matcher.
     *
     * @return A string representing this matcher.
     */
    @Override
    public String toString() {
        return getClass().getSimpleName() + "[sequences:" + trie.getSequences() + ']';
    }

    /**
     * Appends the SequenceMatchers associated with a final State to the results so far.
     * The shared immutable empty list is used as the "no results yet" marker, so a
     * mutable list is only allocated once something has to be added.
     *
     * @param result The results collected so far.
     * @param state  The final State whose associations should be added.
     * @return The list now holding the results; this may be a different list to the one passed in.
     */
    private static List<SequenceMatcher> addAssociations(final List<SequenceMatcher> result,
                                                         final State<SequenceMatcher> state) {
        final Collection<SequenceMatcher> matching = state.getAssociations();
        final List<SequenceMatcher> target = result.isEmpty()
                ? new ArrayList<SequenceMatcher>(matching.size() * 2)
                : result;
        target.addAll(matching);
        return target;
    }

    /**
     * Returns the SequenceMatcher which happens to be the first one associated
     * with an automata State.  A State may be associated with zero to many
     * SequenceMatchers.  This is to support the firstMatch functions.
     *
     * @param state The State to get the first associated SequenceMatcher from.
     * @return The first associated SequenceMatcher, or null if there are none.
     */
    private static SequenceMatcher getFirstAssociation(final State<SequenceMatcher> state) {
        final Iterator<SequenceMatcher> associationIterator = state.associationIterator();
        return associationIterator.hasNext() ? associationIterator.next() : null;
    }

}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy