/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis;

import java.io.Closeable;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import java.util.function.Consumer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.util.AttributeFactory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CloseableThreadLocal;

/**
 * An Analyzer builds TokenStreams, which analyze text. It thus represents a policy for extracting
 * index terms from text.
 *
 * <p>In order to define what analysis is done, subclasses must define their {@link
 * TokenStreamComponents TokenStreamComponents} in {@link #createComponents(String)}. The
 * components are then reused in each call to {@link #tokenStream(String, Reader)}.
 *
 * <p>Simple example:
 *
 * <pre class="prettyprint">
 * Analyzer analyzer = new Analyzer() {
 *   {@literal @Override}
 *   protected TokenStreamComponents createComponents(String fieldName) {
 *     Tokenizer source = new FooTokenizer(reader);
 *     TokenStream filter = new FooFilter(source);
 *     filter = new BarFilter(filter);
 *     return new TokenStreamComponents(source, filter);
 *   }
 *   {@literal @Override}
 *   protected TokenStream normalize(String fieldName, TokenStream in) {
 *     // Assuming FooFilter is about normalization and BarFilter is about
 *     // stemming, only FooFilter should be applied
 *     return new FooFilter(in);
 *   }
 * };
 * </pre>
 *
 * For more examples, see the {@link org.apache.lucene.analysis Analysis package documentation}.
 *
 * <p>For some concrete implementations bundled with Lucene, look in the analysis modules:
 *
 * <ul>
 *   <li>Common: Analyzers for indexing content in different languages and domains.
 *   <li>ICU: Exposes functionality from ICU to Apache Lucene.
 *   <li>Kuromoji: Morphological analyzer for Japanese text.
 *   <li>Morfologik: Dictionary-driven lemmatization for the Polish language.
 *   <li>Phonetic: Analysis for indexing phonetic signatures (for sounds-alike search).
 *   <li>Smart Chinese: Analyzer for Simplified Chinese, which indexes words.
 *   <li>Stempel: Algorithmic Stemmer for the Polish Language.
 * </ul>
 *
 * @since 3.1
 */
public abstract class Analyzer implements Closeable {

  private final ReuseStrategy reuseStrategy;

  // non final as it gets nulled if closed; pkg private for access by ReuseStrategy's final helper
  // methods:
  CloseableThreadLocal<Object> storedValue = new CloseableThreadLocal<>();

  /**
   * Create a new Analyzer, reusing the same set of components per-thread across calls to {@link
   * #tokenStream(String, Reader)}.
   */
  protected Analyzer() {
    this(GLOBAL_REUSE_STRATEGY);
  }

  /**
   * Expert: create a new Analyzer with a custom {@link ReuseStrategy}.
   *
   * <p>NOTE: if you just want to reuse on a per-field basis, it's easier to use a subclass of
   * {@link AnalyzerWrapper} such as PerFieldAnalyzerWrapper instead.
   */
  protected Analyzer(ReuseStrategy reuseStrategy) {
    this.reuseStrategy = reuseStrategy;
  }
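
  // Illustrative sketch: an Analyzer wired to the per-field reuse strategy, so each field name
  // gets (and reuses) its own TokenStreamComponents. FooTokenizer and FooFilter are the same
  // placeholder classes used in the class javadoc above, not real Lucene classes.
  //
  //   Analyzer analyzer = new Analyzer(Analyzer.PER_FIELD_REUSE_STRATEGY) {
  //     @Override
  //     protected TokenStreamComponents createComponents(String fieldName) {
  //       Tokenizer source = new FooTokenizer();
  //       return new TokenStreamComponents(source, new FooFilter(source));
  //     }
  //   };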

  /**
   * Creates a new {@link TokenStreamComponents} instance for this analyzer.
   *
   * @param fieldName the name of the field whose content is passed to the {@link
   *     TokenStreamComponents} sink as a reader
   * @return the {@link TokenStreamComponents} for this analyzer.
   */
  protected abstract TokenStreamComponents createComponents(String fieldName);

  /**
   * Wrap the given {@link TokenStream} in order to apply normalization filters. The default
   * implementation returns the {@link TokenStream} as-is. This is used by {@link
   * #normalize(String, String)}.
   */
  protected TokenStream normalize(String fieldName, TokenStream in) {
    return in;
  }

  /**
   * Returns a TokenStream suitable for <code>fieldName</code>, tokenizing the contents of
   * <code>reader</code>.
   *
   * <p>This method uses {@link #createComponents(String)} to obtain an instance of {@link
   * TokenStreamComponents}. It returns the sink of the components and stores the components
   * internally. Subsequent calls to this method will reuse the previously stored components after
   * resetting them through {@link TokenStreamComponents#setReader(Reader)}.
   *
   * <p><b>NOTE:</b> After calling this method, the consumer must follow the workflow described in
   * {@link TokenStream} to properly consume its contents. See the {@link org.apache.lucene.analysis
   * Analysis package documentation} for some examples demonstrating this.
   *
   * <p><b>NOTE:</b> If your data is available as a {@code String}, use {@link #tokenStream(String,
   * String)} which reuses a {@code StringReader}-like instance internally.
   *
   * @param fieldName the name of the field the created TokenStream is used for
   * @param reader the reader the streams source reads from
   * @return TokenStream for iterating the analyzed content of <code>reader</code>
   * @throws AlreadyClosedException if the Analyzer is closed.
   * @see #tokenStream(String, String)
   */
  public final TokenStream tokenStream(final String fieldName, final Reader reader) {
    TokenStreamComponents components = reuseStrategy.getReusableComponents(this, fieldName);
    final Reader r = initReader(fieldName, reader);
    if (components == null) {
      components = createComponents(fieldName);
      reuseStrategy.setReusableComponents(this, fieldName, components);
    }
    components.setReader(r);
    return components.getTokenStream();
  }
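
  // Illustrative sketch: the consume workflow required by TokenStream for the stream returned
  // above. The analyzer variable, the field name "body" and the sample text are assumptions made
  // for the example.
  //
  //   try (TokenStream ts = analyzer.tokenStream("body", new StringReader("some text"))) {
  //     CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
  //     ts.reset();                      // mandatory before the first incrementToken()
  //     while (ts.incrementToken()) {
  //       System.out.println(term.toString());
  //     }
  //     ts.end();                        // records end-of-stream state (e.g. the final offset)
  //   }                                  // try-with-resources closes the stream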

  /**
   * Returns a TokenStream suitable for <code>fieldName</code>, tokenizing the contents of
   * <code>text</code>.
   *
   * <p>This method uses {@link #createComponents(String)} to obtain an instance of {@link
   * TokenStreamComponents}. It returns the sink of the components and stores the components
   * internally. Subsequent calls to this method will reuse the previously stored components after
   * resetting them through {@link TokenStreamComponents#setReader(Reader)}.
   *
   * <p><b>NOTE:</b> After calling this method, the consumer must follow the workflow described in
   * {@link TokenStream} to properly consume its contents. See the {@link org.apache.lucene.analysis
   * Analysis package documentation} for some examples demonstrating this.
   *
   * @param fieldName the name of the field the created TokenStream is used for
   * @param text the String the streams source reads from
   * @return TokenStream for iterating the analyzed content of <code>reader</code>
   * @throws AlreadyClosedException if the Analyzer is closed.
   * @see #tokenStream(String, Reader)
   */
  public final TokenStream tokenStream(final String fieldName, final String text) {
    TokenStreamComponents components = reuseStrategy.getReusableComponents(this, fieldName);
    @SuppressWarnings("resource")
    final ReusableStringReader strReader =
        (components == null || components.reusableStringReader == null)
            ? new ReusableStringReader()
            : components.reusableStringReader;
    strReader.setValue(text);
    final Reader r = initReader(fieldName, strReader);
    if (components == null) {
      components = createComponents(fieldName);
      reuseStrategy.setReusableComponents(this, fieldName, components);
    }
    components.setReader(r);
    components.reusableStringReader = strReader;
    return components.getTokenStream();
  }

  /**
   * Normalize a string down to the representation that it would have in the index.
   *
   * <p>This is typically used by query parsers in order to generate a query on a given term,
   * without tokenizing or stemming, which are undesirable if the string to analyze is a partial
   * word (eg. in case of a wildcard or fuzzy query).
   *
   * <p>This method uses {@link #initReaderForNormalization(String, Reader)} in order to apply
   * necessary character-level normalization and then {@link #normalize(String, TokenStream)} in
   * order to apply the normalizing token filters.
   */
  public final BytesRef normalize(final String fieldName, final String text) {
    try {
      // apply char filters
      final String filteredText;
      try (Reader reader = new StringReader(text)) {
        Reader filterReader = initReaderForNormalization(fieldName, reader);
        char[] buffer = new char[64];
        StringBuilder builder = new StringBuilder();
        for (; ; ) {
          final int read = filterReader.read(buffer, 0, buffer.length);
          if (read == -1) {
            break;
          }
          builder.append(buffer, 0, read);
        }
        filteredText = builder.toString();
      } catch (IOException e) {
        throw new IllegalStateException("Normalization threw an unexpected exception", e);
      }

      final AttributeFactory attributeFactory = attributeFactory(fieldName);
      try (TokenStream ts =
          normalize(
              fieldName, new StringTokenStream(attributeFactory, filteredText, text.length()))) {
        final TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
        ts.reset();
        if (ts.incrementToken() == false) {
          throw new IllegalStateException(
              "The normalization token stream is "
                  + "expected to produce exactly 1 token, but got 0 for analyzer "
                  + this
                  + " and input \""
                  + text
                  + "\"");
        }
        final BytesRef term = BytesRef.deepCopyOf(termAtt.getBytesRef());
        if (ts.incrementToken()) {
          throw new IllegalStateException(
              "The normalization token stream is "
                  + "expected to produce exactly 1 token, but got 2+ for analyzer "
                  + this
                  + " and input \""
                  + text
                  + "\"");
        }
        ts.end();
        return term;
      }
    } catch (IOException e) {
      throw new IllegalStateException("Normalization threw an unexpected exception", e);
    }
  }
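
  // Illustrative sketch: how a query parser might use normalize(String, String) to bring a
  // user-supplied wildcard fragment into index form without tokenizing or stemming it. The field
  // name "title" and the analyzer variable are assumptions made for the example.
  //
  //   BytesRef normalized = analyzer.normalize("title", "Wi-Fi*");
  //   // with a lowercasing normalization chain this would hold the bytes of "wi-fi*"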

  /**
   * Override this if you want to add a CharFilter chain.
   *
   * <p>The default implementation returns <code>reader</code> unchanged.
   *
   * @param fieldName IndexableField name being indexed
   * @param reader original Reader
   * @return reader, optionally decorated with CharFilter(s)
   */
  protected Reader initReader(String fieldName, Reader reader) {
    return reader;
  }
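
  // Illustrative sketch: overriding initReader to put a CharFilter in front of the tokenizer.
  // HTMLStripCharFilter is assumed to be available from the analysis-common module
  // (org.apache.lucene.analysis.charfilter.HTMLStripCharFilter); any CharFilter works the same way.
  //
  //   @Override
  //   protected Reader initReader(String fieldName, Reader reader) {
  //     return new HTMLStripCharFilter(reader);   // strip markup before tokenization
  //   }
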
  /**
   * Wrap the given {@link Reader} with {@link CharFilter}s that make sense for normalization. This
   * is typically a subset of the {@link CharFilter}s that are applied in {@link #initReader(String,
   * Reader)}. This is used by {@link #normalize(String, String)}.
   */
  protected Reader initReaderForNormalization(String fieldName, Reader reader) {
    return reader;
  }

  /**
   * Return the {@link AttributeFactory} to be used for {@link #tokenStream analysis} and {@link
   * #normalize(String, String) normalization} on the given {@code fieldName}. The default
   * implementation returns {@link TokenStream#DEFAULT_TOKEN_ATTRIBUTE_FACTORY}.
   */
  protected AttributeFactory attributeFactory(String fieldName) {
    return TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY;
  }

  /**
   * Invoked before indexing an IndexableField instance if terms have already been added to that
   * field. This allows custom analyzers to place an automatic position increment gap between
   * IndexableField instances using the same field name. The default position increment gap is 0.
   * With a 0 position increment gap and the typical default token position increment of 1, all
   * terms in a field, including across IndexableField instances, are in successive positions,
   * allowing exact PhraseQuery matches, for instance, across IndexableField instance boundaries.
   *
   * @param fieldName IndexableField name being indexed.
   * @return position increment gap, added to the next token emitted from {@link
   *     #tokenStream(String,Reader)}. This value must be {@code >= 0}.
   */
  public int getPositionIncrementGap(String fieldName) {
    return 0;
  }
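
  // Illustrative sketch: overriding getPositionIncrementGap so that phrase queries cannot match
  // across two values of the same multi-valued field. The gap of 100 is an arbitrary example value.
  //
  //   @Override
  //   public int getPositionIncrementGap(String fieldName) {
  //     return 100;   // leave a 100-position hole between consecutive field values
  //   }
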
  /**
   * Just like {@link #getPositionIncrementGap}, except for Token offsets instead. By default this
   * returns 1. This method is only called if the field produced at least one token for indexing.
   *
   * @param fieldName the field just indexed
   * @return offset gap, added to the next token emitted from {@link #tokenStream(String,Reader)}.
   *     This value must be {@code >= 0}.
   */
  public int getOffsetGap(String fieldName) {
    return 1;
  }

  /** Returns the used {@link ReuseStrategy}. */
  public final ReuseStrategy getReuseStrategy() {
    return reuseStrategy;
  }

  /** Frees persistent resources used by this Analyzer */
  @Override
  public void close() {
    if (storedValue != null) {
      storedValue.close();
      storedValue = null;
    }
  }

  /**
   * This class encapsulates the outer components of a token stream. It provides access to the
   * source (a {@link Reader} {@link Consumer}) and the outer end (sink), an instance of {@link
   * TokenFilter} which also serves as the {@link TokenStream} returned by {@link
   * Analyzer#tokenStream(String, Reader)}.
   */
  public static final class TokenStreamComponents {
    /** Original source of the tokens. */
    protected final Consumer<Reader> source;

    /**
     * Sink tokenstream, such as the outer tokenfilter decorating the chain. This can be the source
     * if there are no filters.
     */
    protected final TokenStream sink;

    /** Internal cache only used by {@link Analyzer#tokenStream(String, String)}. */
    transient ReusableStringReader reusableStringReader;

    /**
     * Creates a new {@link TokenStreamComponents} instance.
     *
     * @param source the source to set the reader on
     * @param result the analyzer's resulting token stream
     */
    public TokenStreamComponents(final Consumer<Reader> source, final TokenStream result) {
      this.source = source;
      this.sink = result;
    }

    /**
     * Creates a new {@link TokenStreamComponents} instance
     *
     * @param tokenizer the analyzer's Tokenizer
     * @param result the analyzer's resulting token stream
     */
    public TokenStreamComponents(final Tokenizer tokenizer, final TokenStream result) {
      this(tokenizer::setReader, result);
    }

    /** Creates a new {@link TokenStreamComponents} from a Tokenizer */
    public TokenStreamComponents(final Tokenizer tokenizer) {
      this(tokenizer::setReader, tokenizer);
    }

    /**
     * Resets the encapsulated components with the given reader. If the components cannot be reset,
     * an Exception should be thrown.
     *
     * @param reader a reader to reset the source component
     */
    private void setReader(final Reader reader) {
      source.accept(reader);
    }

    /**
     * Returns the sink {@link TokenStream}
     *
     * @return the sink {@link TokenStream}
     */
    public TokenStream getTokenStream() {
      return sink;
    }

    /** Returns the component's source */
    public Consumer<Reader> getSource() {
      return source;
    }
  }

  /**
   * Strategy defining how TokenStreamComponents are reused per call to {@link
   * Analyzer#tokenStream(String, java.io.Reader)}.
   */
  public abstract static class ReuseStrategy {

    /** Sole constructor. (For invocation by subclass constructors, typically implicit.) */
    // Explicitly declared so that we have non-empty javadoc
    protected ReuseStrategy() {}

    /**
     * Gets the reusable TokenStreamComponents for the field with the given name.
     *
     * @param analyzer Analyzer from which to get the reused components. Use {@link
     *     #getStoredValue(Analyzer)} and {@link #setStoredValue(Analyzer, Object)} to access the
     *     data on the Analyzer.
     * @param fieldName Name of the field whose reusable TokenStreamComponents are to be retrieved
     * @return Reusable TokenStreamComponents for the field, or {@code null} if there were no
     *     previous components for the field
     */
    public abstract TokenStreamComponents getReusableComponents(
        Analyzer analyzer, String fieldName);

    /**
     * Stores the given TokenStreamComponents as the reusable components for the field with the
     * given name.
     *
     * @param fieldName Name of the field whose TokenStreamComponents are being set
     * @param components TokenStreamComponents which are to be reused for the field
     */
    public abstract void setReusableComponents(
        Analyzer analyzer, String fieldName, TokenStreamComponents components);

    /**
     * Returns the currently stored value.
     *
     * @return Currently stored value or {@code null} if no value is stored
     * @throws AlreadyClosedException if the Analyzer is closed.
     */
    protected final Object getStoredValue(Analyzer analyzer) {
      if (analyzer.storedValue == null) {
        throw new AlreadyClosedException("this Analyzer is closed");
      }
      return analyzer.storedValue.get();
    }

    /**
     * Sets the stored value.
     *
     * @param storedValue Value to store
     * @throws AlreadyClosedException if the Analyzer is closed.
     */
    protected final void setStoredValue(Analyzer analyzer, Object storedValue) {
      if (analyzer.storedValue == null) {
        throw new AlreadyClosedException("this Analyzer is closed");
      }
      analyzer.storedValue.set(storedValue);
    }
  }

  /** A predefined {@link ReuseStrategy} that reuses the same components for every field. */
  public static final ReuseStrategy GLOBAL_REUSE_STRATEGY =
      new ReuseStrategy() {

        @Override
        public TokenStreamComponents getReusableComponents(Analyzer analyzer, String fieldName) {
          return (TokenStreamComponents) getStoredValue(analyzer);
        }

        @Override
        public void setReusableComponents(
            Analyzer analyzer, String fieldName, TokenStreamComponents components) {
          setStoredValue(analyzer, components);
        }
      };

  /**
   * A predefined {@link ReuseStrategy} that reuses components per-field by maintaining a Map of
   * TokenStreamComponents per field name.
   */
  public static final ReuseStrategy PER_FIELD_REUSE_STRATEGY =
      new ReuseStrategy() {

        @SuppressWarnings("unchecked")
        @Override
        public TokenStreamComponents getReusableComponents(Analyzer analyzer, String fieldName) {
          Map<String, TokenStreamComponents> componentsPerField =
              (Map<String, TokenStreamComponents>) getStoredValue(analyzer);
          return componentsPerField != null ? componentsPerField.get(fieldName) : null;
        }

        @SuppressWarnings("unchecked")
        @Override
        public void setReusableComponents(
            Analyzer analyzer, String fieldName, TokenStreamComponents components) {
          Map<String, TokenStreamComponents> componentsPerField =
              (Map<String, TokenStreamComponents>) getStoredValue(analyzer);
          if (componentsPerField == null) {
            componentsPerField = new HashMap<>();
            setStoredValue(analyzer, componentsPerField);
          }
          componentsPerField.put(fieldName, components);
        }
      };

  private static final class StringTokenStream extends TokenStream {

    private final String value;
    private final int length;
    private boolean used = true;
    private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);

    StringTokenStream(AttributeFactory attributeFactory, String value, int length) {
      super(attributeFactory);
      this.value = value;
      this.length = length;
    }

    @Override
    public void reset() {
      used = false;
    }

    @Override
    public boolean incrementToken() {
      if (used) {
        return false;
      }
      clearAttributes();
      termAttribute.append(value);
      offsetAttribute.setOffset(0, length);
      used = true;
      return true;
    }

    @Override
    public void end() throws IOException {
      super.end();
      offsetAttribute.setOffset(length, length);
    }
  }
}