All downloads are free. Search and download functionality uses the official Maven repository.

com.stratio.cassandra.lucene.schema.analysis.SnowballAnalyzerBuilder Maven / Gradle / Ivy

The newest version!
/*
 * Copyright (C) 2014 Stratio (http://stratio.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.stratio.cassandra.lucene.schema.analysis;

import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.stratio.cassandra.lucene.IndexException;
import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;

import java.util.ArrayList;
import java.util.List;

/**
 * {@link AnalyzerBuilder} for tartarus.org snowball {@link Analyzer}.
 *
 * <p>The supported languages are English, French, Spanish, Portuguese, Italian, Romanian, German, Dutch, Swedish,
 * Norwegian, Danish, Russian, Finnish, Hungarian and Turkish.
 *
 * <p>The builder is configured with a language and an optional comma separated list of stopwords. When no stopwords
 * are given, the defaults returned by {@link StandardStopwords#get(String)} for the language are used.
 *
 * @author Andres de la Pena
 */
public class SnowballAnalyzerBuilder extends AnalyzerBuilder {

    /** The stemming language, never blank. */
    @JsonProperty("language")
    private final String language;

    /** The comma separated stopwords list, or {@code null} to use the language defaults. */
    @JsonProperty("stopwords")
    private final String stopwords;

    /**
     * Builds a new {@link SnowballAnalyzerBuilder} for the specified language and stopwords.
     *
     * @param language The language. The supported languages are English, French, Spanish, Portuguese, Italian,
     * Romanian, German, Dutch, Swedish, Norwegian, Danish, Russian, Finnish, Hungarian, Turkish, Basque and
     * Catalan.
     * @param stopwords the comma separated stopwords list, or {@code null} to use the default stopwords of the
     * language
     * @throws IndexException if the language is {@code null}, empty or whitespace-only
     */
    @JsonCreator
    public SnowballAnalyzerBuilder(@JsonProperty("language") String language,
                                   @JsonProperty("stopwords") String stopwords) {

        // Check language
        if (StringUtils.isBlank(language)) {
            throw new IndexException("Language must be specified");
        }

        this.language = language;
        this.stopwords = stopwords;
    }

    /**
     * {@inheritDoc}
     *
     * <p>The returned analyzer uses the explicitly configured stopwords when they are not {@code null}, and the
     * default stopwords for the language otherwise.
     */
    @Override
    public Analyzer analyzer() {
        // Setup stopwords: explicit list wins over the language defaults
        CharArraySet stops = stopwords == null ? getDefaultStopwords(language) : getStopwords(stopwords);
        return buildAnalyzer(language, stops);
    }

    /**
     * Returns the snowball {@link Analyzer} for the specified language and stopwords.
     *
     * @param language The language code. The supported languages are English, French, Spanish, Portuguese, Italian,
     * Romanian, German, Dutch, Swedish, Norwegian, Danish, Russian, Finnish, Irish, Hungarian, Turkish, Armenian,
     * Basque and Catalan.
     * @param stopwords the stop words list
     * @return a new snowball analyzer
     */
    private static Analyzer buildAnalyzer(final String language, final CharArraySet stopwords) {
        return new SnowballAnalyzer(language, stopwords);
    }

    /**
     * Returns the stopwords {@link CharArraySet} for the specified comma separated stopwords {@code String}.
     *
     * <p>Each entry is trimmed, and entries that are empty after trimming (as produced by an empty input, consecutive
     * commas or surrounding whitespace) are skipped. The resulting set ignores case.
     *
     * @param stopwords a {@code String} comma separated stopwords list
     * @return the stopwords list as a case-insensitive char array set
     */
    private static CharArraySet getStopwords(String stopwords) {
        List<String> stopwordsList = new ArrayList<>();
        for (String stop : stopwords.split(",")) {
            String trimmed = stop.trim();
            // An empty entry can never match a token, so do not pollute the set with it
            if (!trimmed.isEmpty()) {
                stopwordsList.add(trimmed);
            }
        }
        return new CharArraySet(stopwordsList, true);
    }

    /**
     * Returns the default stopwords set for the specified language, as provided by
     * {@link StandardStopwords#get(String)}.
     *
     * @param language The language for which the stopwords are. The supported languages are English, French, Spanish,
     * Portuguese, Italian, Romanian, German, Dutch, Swedish, Norwegian, Danish, Russian, Finnish, Irish, Hungarian,
     * Turkish, Armenian, Basque and Catalan.
     * @return the default stopwords set used by Lucene language analyzers
     */
    private static CharArraySet getDefaultStopwords(String language) {
        return StandardStopwords.get(language);
    }

    /**
     * A tartarus.org snowball {@link Analyzer}.
     *
     * <p>Tokens are produced by a {@link StandardTokenizer} and then passed, in order, through a
     * {@link StandardFilter}, a {@link LowerCaseFilter}, a {@link StopFilter} and a {@code SnowballFilter}.
     */
    public static class SnowballAnalyzer extends Analyzer {

        /** The language handed to the {@code SnowballFilter}. */
        private final String language;

        /** The stopwords handed to the {@link StopFilter}. */
        private final CharArraySet stopwords;

        /**
         * Builds a new {@link SnowballAnalyzer} for the specified language and stopwords.
         *
         * @param language The language. The supported languages are English, French, Spanish, Portuguese, Italian,
         * Romanian, German, Dutch, Swedish, Norwegian, Danish, Russian, Finnish, Irish, Hungarian, Turkish, Armenian,
         * Basque and Catalan.
         * @param stopwords the set of stopwords to be removed from the token stream
         */
        public SnowballAnalyzer(String language, CharArraySet stopwords) {
            this.language = language;
            this.stopwords = stopwords;
        }

        /** {@inheritDoc} */
        @Override
        protected Analyzer.TokenStreamComponents createComponents(String fieldName) {
            final Tokenizer source = new StandardTokenizer();
            TokenStream result = new StandardFilter(source);
            // Lower-case before stop word removal and stemming
            result = new LowerCaseFilter(result);
            result = new StopFilter(result, stopwords);
            result = new SnowballFilter(result, language);
            return new TokenStreamComponents(source, result);
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy