// org.apache.lucene.analysis.TokenStream (source listing from the aem-sdk-api artifact; Maven / Gradle / Ivy)
/*
* COPIED FROM APACHE LUCENE 4.7.2
*
* Git URL: git@github.com:apache/lucene.git, tag: releases/lucene-solr/4.7.2, path: lucene/core/src/java
*
* (see https://issues.apache.org/jira/browse/OAK-10786 for details)
*/
package org.apache.lucene.analysis;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Closeable;
import java.lang.reflect.Modifier;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeSource;
/**
 * A {@code TokenStream} enumerates the sequence of tokens, either from
 * {@link Field}s of a {@link Document} or from query text.
 * <p>
 * This is an abstract class; concrete subclasses are:
 * <ul>
 * <li>{@link Tokenizer}, a {@code TokenStream} whose input is a Reader; and
 * <li>{@link TokenFilter}, a {@code TokenStream} whose input is another
 * {@code TokenStream}.
 * </ul>
 * A new {@code TokenStream} API has been introduced with Lucene 2.9. This API
 * has moved from being {@link Token}-based to {@link Attribute}-based. While
 * {@link Token} still exists in 2.9 as a convenience class, the preferred way
 * to store the information of a {@link Token} is to use {@link AttributeImpl}s.
 * <p>
 * {@code TokenStream} now extends {@link AttributeSource}, which provides
 * access to all of the token {@link Attribute}s for the {@code TokenStream}.
 * Note that only one instance per {@link AttributeImpl} is created and reused
 * for every token. This approach reduces object creation and allows local
 * caching of references to the {@link AttributeImpl}s. See
 * {@link #incrementToken()} for further details.
 * <p>
 * <b>The workflow of the new {@code TokenStream} API is as follows:</b>
 * <ol>
 * <li>Instantiation of {@code TokenStream}/{@link TokenFilter}s which add/get
 * attributes to/from the {@link AttributeSource}.
 * <li>The consumer calls {@link TokenStream#reset()}.
 * <li>The consumer retrieves attributes from the stream and stores local
 * references to all attributes it wants to access.
 * <li>The consumer calls {@link #incrementToken()} until it returns false
 * consuming the attributes after each call.
 * <li>The consumer calls {@link #end()} so that any end-of-stream operations
 * can be performed.
 * <li>The consumer calls {@link #close()} to release any resource when finished
 * using the {@code TokenStream}.
 * </ol>
 * To make sure that filters and consumers know which attributes are available,
 * the attributes must be added during instantiation. Filters and consumers are
 * not required to check for availability of attributes in
 * {@link #incrementToken()}.
 * <p>
 * You can find some example code for the new API in the analysis package level
 * Javadoc.
 * <p>
 * Sometimes it is desirable to capture a current state of a {@code TokenStream},
 * e.g., for buffering purposes (see {@link CachingTokenFilter},
 * TeeSinkTokenFilter). For this usecase
 * {@link AttributeSource#captureState} and {@link AttributeSource#restoreState}
 * can be used.
 * <p>
 * The {@code TokenStream}-API in Lucene is based on the decorator pattern.
 * Therefore all non-abstract subclasses must be final or have at least a final
 * implementation of {@link #incrementToken}! This is checked when Java
 * assertions are enabled.
 */
public abstract class TokenStream extends AttributeSource implements Closeable {
/**
* A TokenStream using the default attribute factory.
*/
protected TokenStream() {
super();
assert assertFinal();
}
/**
* A TokenStream that uses the same attributes as the supplied one.
*/
protected TokenStream(AttributeSource input) {
super(input);
assert assertFinal();
}
/**
* A TokenStream using the supplied AttributeFactory for creating new {@link Attribute} instances.
*/
protected TokenStream(AttributeFactory factory) {
super(factory);
assert assertFinal();
}
private boolean assertFinal() {
try {
final Class> clazz = getClass();
if (!clazz.desiredAssertionStatus())
return true;
assert clazz.isAnonymousClass() ||
(clazz.getModifiers() & (Modifier.FINAL | Modifier.PRIVATE)) != 0 ||
Modifier.isFinal(clazz.getMethod("incrementToken").getModifiers()) :
"TokenStream implementation classes or at least their incrementToken() implementation must be final";
return true;
} catch (NoSuchMethodException nsme) {
return false;
}
}
/**
* Consumers (i.e., {@link IndexWriter}) use this method to advance the stream to
* the next token. Implementing classes must implement this method and update
* the appropriate {@link AttributeImpl}s with the attributes of the next
* token.
*
* The producer must make no assumptions about the attributes after the method
* has been returned: the caller may arbitrarily change it. If the producer
* needs to preserve the state for subsequent calls, it can use
* {@link #captureState} to create a copy of the current attribute state.
*
* This method is called for every token of a document, so an efficient
* implementation is crucial for good performance. To avoid calls to
* {@link #addAttribute(Class)} and {@link #getAttribute(Class)},
* references to all {@link AttributeImpl}s that this stream uses should be
* retrieved during instantiation.
*
* To ensure that filters and consumers know which attributes are available,
* the attributes must be added during instantiation. Filters and consumers
* are not required to check for availability of attributes in
* {@link #incrementToken()}.
*
* @return false for end of stream; true otherwise
*/
public abstract boolean incrementToken() throws IOException;
/**
* This method is called by the consumer after the last token has been
* consumed, after {@link #incrementToken()} returned false
* (using the new TokenStream
API). Streams implementing the old API
* should upgrade to use this feature.
*
* This method can be used to perform any end-of-stream operations, such as
* setting the final offset of a stream. The final offset of a stream might
* differ from the offset of the last token eg in case one or more whitespaces
* followed after the last token, but a WhitespaceTokenizer was used.
*
* Additionally any skipped positions (such as those removed by a stopfilter)
* can be applied to the position increment, or any adjustment of other
* attributes where the end-of-stream value may be important.
*
* If you override this method, always call {@code super.end()}.
*
* @throws IOException If an I/O error occurs
*/
public void end() throws IOException {
clearAttributes(); // LUCENE-3849: don't consume dirty atts
if (hasAttribute(PositionIncrementAttribute.class)) {
getAttribute(PositionIncrementAttribute.class).setPositionIncrement(0);
}
}
/**
* This method is called by a consumer before it begins consumption using
* {@link #incrementToken()}.
*
* Resets this stream to a clean state. Stateful implementations must implement
* this method so that they can be reused, just as if they had been created fresh.
*
* If you override this method, always call {@code super.reset()}, otherwise
* some internal state will not be correctly reset (e.g., {@link Tokenizer} will
* throw {@link IllegalStateException} on further usage).
*/
public void reset() throws IOException {}
/** Releases resources associated with this stream.
*
* If you override this method, always call {@code super.close()}, otherwise
* some internal state will not be correctly reset (e.g., {@link Tokenizer} will
* throw {@link IllegalStateException} on reuse).
*/
@Override
public void close() throws IOException {}
}