All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.analysis.ComboAnalyzer Maven / Gradle / Ivy

Go to download

The Combo Analyzer plugin for ElasticSearch provides with a new analyzer type that combines the output of multiple analyzers into one.

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis;

import org.apache.lucene.analysis.miscellaneous.UniqueTokenFilter;
import org.apache.lucene.util.CloseableThreadLocal;
import org.apache.lucene.util.ReaderCloneFactory;
import org.apache.lucene.util.Version;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.logging.ESLoggerFactory;

import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.atomic.AtomicReference;

/**
 * An analyzer that combines multiple sub-analyzers into one.
 *
 * It internally uses {@link ReaderCloneFactory} in order to feed the multiple
 * sub-analyzers from a single input.
 * If you analyzer big inputs or have a performance critical application,
 * please see the remarks of the latter's documentation.
 *
 * The instances are thread safe with regards to the reused TokenStreams.
 * As duplicated sub-analyzer instance would break this safety,
 * there is a special case detection for it.
 * However, there can still be cases where an {@link Analyzer} is used from
 * multiple places at a time before the {@link ComboTokenStream} is fully consumed,
 * and this causes problem.
 * For a solution, see {@link #setTokenStreamCachingEnabled(boolean)}
 * and {@link #setTokenStreamCachingEnabled(boolean)}.
 */
public class ComboAnalyzer extends Analyzer {

    protected static final ESLogger logger = ESLoggerFactory.getLogger(ComboAnalyzer.class.getSimpleName());

    /**
     * Default value for the enabled state of {@link TokenStream} caching.
     */
    public static final boolean TOKENSTREAM_CACHING_ENABLED_DEFAULT = false;

    /**
     * Default value for the enabled state of token deduplication.
     */
    public static final boolean DEDUPLICATION_ENABLED_DEFAULT = false;

    private Analyzer[] subAnalyzers;

    private boolean hasDuplicatedAnalyzers;
    private Set duplicatedAnalyzers;

    private boolean cacheTokenStreams = TOKENSTREAM_CACHING_ENABLED_DEFAULT;

    private boolean deduplication = DEDUPLICATION_ENABLED_DEFAULT;

    private CloseableThreadLocal lastTokenStreams = new CloseableThreadLocal();
    private CloseableThreadLocal tempTokenStreams = new CloseableThreadLocal();
    private CloseableThreadLocal lastComboTokenStream = new CloseableThreadLocal();

    public ComboAnalyzer(Version version, Analyzer... subAnalyzers) {
        super(new GlobalReuseStrategy());

        this.subAnalyzers = subAnalyzers;

        // Detect duplicates in analyzers
        // this prevents reusable token streams
        hasDuplicatedAnalyzers = false;
        duplicatedAnalyzers = new HashSet();
        Set seen = new HashSet(subAnalyzers.length);
        for (Analyzer analyzer : subAnalyzers) {
            if (!seen.add(analyzer)) {
                hasDuplicatedAnalyzers = true;
                duplicatedAnalyzers.add(analyzer);
            }
        }
    }

    /**
     * Enable or disable the systematic caching of {@link Analyzer} {@link TokenStream}s.
     *
     * {@link TokenStream}s gotten from the {@link Analyzer}s will be cached upfront.
     * This helps with one of the {@link Analyzer}s being reused before having completely
     * consumed our {@link ComboTokenStream}.
     * Note that this can happen too, if the same {@link Analyzer} instance is given twice.
     *
     * @param value {@code true} to enable the caching of {@link TokenStream}s,
     *              {@code false} to disable it.
     *
     * @return This instance, for chainable construction.
     *
     * @see #TOKENSTREAM_CACHING_ENABLED_DEFAULT
     */
    public ComboAnalyzer setTokenStreamCachingEnabled(boolean value) {
        cacheTokenStreams = value;
        return this;
    }

    /**
     * Enable the systematic caching of {@link Analyzer} {@link TokenStream}s.
     * @return This instance, for chainable construction.
     * @see #setTokenStreamCachingEnabled(boolean)
     */
    public ComboAnalyzer enableTokenStreamCaching() {
        cacheTokenStreams = true;
        return this;
    }

    /**
     * Disable the systematic caching of {@link Analyzer} {@link TokenStream}s.
     * @return This instance, for chainable construction.
     * @see #setTokenStreamCachingEnabled(boolean)
     */
    public ComboAnalyzer disableTokenStreamCaching() {
        cacheTokenStreams = false;
        return this;
    }

    /**
     * Enable or disable deduplication of repeated tokens at the same position.
     *
     * @param value {@code true} to enable the deduplication of tokens,
     *              {@code false} to disable it.
     *
     * @return This instance, for chainable construction.
     *
     * @see #DEDUPLICATION_ENABLED_DEFAULT
     */
    public ComboAnalyzer setDeduplicationEnabled(boolean value) {
        deduplication = value;
        return this;
    }

    /**
     * Enable deduplication of repeated tokens at the same position.
     * @return This instance, for chainable construction.
     * @see #setDeduplicationEnabled(boolean)
     */
    public ComboAnalyzer enableDeduplication() {
        deduplication = true;
        return this;
    }

    /**
     * Disable deduplication of repeated tokens at the same position.
     * @return This instance, for chainable construction.
     * @see #setDeduplicationEnabled(boolean)
     */
    public ComboAnalyzer disableDeduplication() {
        deduplication = false;
        return this;
    }

    protected ReaderCloneFactory.ReaderCloner cloneReader(Reader originalReader) {
        ReaderCloneFactory.ReaderCloner rtn;

        // Duplication of the original reader, to feed all sub-analyzers
        if (subAnalyzers.length <= 1) {

            // Can reuse the only reader we have, there will be no need of duplication
            // Usage of the AtomicReference ensures that the same reader won't be duplicated.
            ReaderCloneFactory.ReaderCloner useOnceReaderCloner = new ReaderCloneFactory.ReaderCloner() {
                private AtomicReference singleUsageReference = null;
                public void init(Reader originalReader) throws IOException {
                    singleUsageReference = new AtomicReference(originalReader);
                }
                public Reader giveAClone() {
                    return singleUsageReference.getAndSet(null);
                }
            };
            try {
                useOnceReaderCloner.init(originalReader);
            } catch (Throwable fail) {
                useOnceReaderCloner = null;
            }
            rtn = useOnceReaderCloner;

        } else {

            rtn = ReaderCloneFactory.getCloner(originalReader); // internally uses the default "should always work" implementation

        }

        if (rtn == null) {
            throw new IllegalArgumentException("Could not duplicate the original reader to feed multiple sub-readers");
        }
        return rtn;
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader originalReader) {
        // Duplication of the original reader, to feed all sub-analyzers
        ReaderCloneFactory.ReaderCloner readerCloner = cloneReader(originalReader);

        // We remember last used TokenStreams because many times Analyzers can provide a reusable TokenStream
        // Detecting that all sub-TokenStreams are reusable permits to reuse our ComboTokenStream as well.
        if (tempTokenStreams.get() == null) tempTokenStreams.set(new TokenStream[subAnalyzers.length]); // each time non reusability has been detected
        if (lastTokenStreams.get() == null) lastTokenStreams.set(new TokenStream[subAnalyzers.length]); // only at first run
        TokenStream[] tempTokenStreams_local = tempTokenStreams.get();
        TokenStream[] lastTokenStreams_local = lastTokenStreams.get();
        ReusableTokenStreamComponents lastComboTokenStream_local = lastComboTokenStream.get();
        if (lastComboTokenStream_local == null)
            lastComboTokenStream_local = new ReusableTokenStreamComponents(fieldName, this);

        // Get sub-TokenStreams from sub-analyzers
        for (int i = subAnalyzers.length-1 ; i >= 0 ; --i) {

            // Feed the troll
            Reader reader = readerCloner.giveAClone();
            tempTokenStreams_local[i] = null;
            try {
                tempTokenStreams_local[i] = subAnalyzers[i].tokenStream(fieldName, reader);
            } catch (IOException ignored) {
                logger.debug("Ignoring {}th analyzer [{}]. Could not get a TokenStream.", ignored, i, subAnalyzers[i]);
            }
            // Use caching if asked or if required in case of duplicated analyzers
            if (cacheTokenStreams || hasDuplicatedAnalyzers && duplicatedAnalyzers.contains(subAnalyzers[i])) {
                CachingTokenStream cache = new CachingTokenStream(tempTokenStreams_local[i]);
                try {
                    tempTokenStreams_local[i].reset();
                    cache.fillCache();
                } catch (IOException ignored) {
                    logger.debug("Got an error when caching TokenStream from the {}th analyzer [{}]", i, subAnalyzers[i]);
                }
                try {
                    // Close original stream, all tokens are buffered
                    tempTokenStreams_local[i].close();
                } catch (IOException ignored) {
                    logger.debug("Got an error when closing TokenStream from the {}th analyzer [{}]", i, subAnalyzers[i]);
                }
                tempTokenStreams_local[i] = cache;
            }
            // Detect non reusability
            if (tempTokenStreams_local[i] != lastTokenStreams_local[i]) {
                lastComboTokenStream_local.setTokenStream(null);
            }
        }

        // If last ComboTokenStream is not available create a new one
        // This happens in the first call and in case of non reusability
        if (lastComboTokenStream_local.getTokenStream() == null) {
            // Clear old invalid references (preferred over allocating a new array)
            Arrays.fill(lastTokenStreams_local, null);
            // Swap temporary and last (non reusable) TokenStream references
            lastTokenStreams.set(tempTokenStreams_local);
            tempTokenStreams.set(lastTokenStreams_local);
            // New ComboTokenStream to use
            lastComboTokenStream_local.setTokenStream(new ComboTokenStream(tempTokenStreams_local));
            if (deduplication)
                lastComboTokenStream_local.setTokenStream(new UniqueTokenFilter(lastComboTokenStream_local.getTokenStream(), true));
            lastComboTokenStream.set(lastComboTokenStream_local);
        }
        return lastComboTokenStream_local;
    }

    @Override public void close() {
        super.close();
        lastTokenStreams.close();
        tempTokenStreams.close();
        lastComboTokenStream.close();
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy