All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.arangodb.entity.arangosearch.analyzer.TextAnalyzerProperties Maven / Gradle / Ivy

There is a newer version: 7.9.0
Show newest version
/*
 * DISCLAIMER
 *
 * Copyright 2016 ArangoDB GmbH, Cologne, Germany
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Copyright holder is ArangoDB GmbH, Cologne, Germany
 */

package com.arangodb.entity.arangosearch.analyzer;


import com.arangodb.shaded.fasterxml.jackson.annotation.JsonProperty;

import java.util.Collections;
import java.util.List;
import java.util.Objects;

/**
 * Mutable holder for the configuration properties of a text Analyzer: locale, accent handling, case conversion,
 * stemming, edge n-grams and stop-words.
 *
 * @author Michele Rastelli
 */
public final class TextAnalyzerProperties {

    private String locale;
    private boolean accent;
    // Serialized under the name "case"; the Java field cannot use that name because it is a reserved word.
    @JsonProperty("case")
    private SearchAnalyzerCase analyzerCase;
    private boolean stemming;
    private EdgeNgram edgeNgram;
    private List<String> stopwords;
    private String stopwordsPath;

    /**
     * Creates an instance whose stop-word list is an empty, immutable list. All other properties are left at their
     * Java default values until set explicitly.
     */
    public TextAnalyzerProperties() {
        stopwords = Collections.emptyList();
    }

    /**
     * @return a locale in the format {@code language[_COUNTRY][.encoding][@variant]} (square brackets denote
     * optional parts), e.g. {@code de.utf-8} or {@code en_US.utf-8}. Only UTF-8 encoding is meaningful in ArangoDB.
     * @see "Supported Languages (ArangoDB Analyzers documentation)"
     */
    public String getLocale() {
        return locale;
    }

    /**
     * @param locale a locale in the format described by {@link #getLocale()}
     */
    public void setLocale(String locale) {
        this.locale = locale;
    }

    /**
     * @return {@code true} to preserve accented characters (default),
     * {@code false} to convert accented characters to their base characters
     */
    public boolean isAccent() {
        return accent;
    }

    /**
     * @param accent {@code true} to preserve accented characters, {@code false} to convert them to their base
     *               characters
     */
    public void setAccent(boolean accent) {
        this.accent = accent;
    }

    /**
     * @return the case conversion setting, serialized as the {@code case} property
     */
    public SearchAnalyzerCase getAnalyzerCase() {
        return analyzerCase;
    }

    /**
     * @param analyzerCase defaults to {@link SearchAnalyzerCase#lower}
     */
    public void setAnalyzerCase(SearchAnalyzerCase analyzerCase) {
        this.analyzerCase = analyzerCase;
    }

    /**
     * @return {@code true} to apply stemming on returned words (default),
     * {@code false} to leave the tokenized words as-is
     */
    public boolean isStemming() {
        return stemming;
    }

    /**
     * @param stemming {@code true} to apply stemming on returned words, {@code false} to leave the tokenized words
     *                 as-is
     */
    public void setStemming(boolean stemming) {
        this.stemming = stemming;
    }

    /**
     * @return if present, then edge n-grams are generated for each token (word). That is, the start of the n-gram is
     * anchored to the beginning of the token, whereas the ngram Analyzer would produce all possible substrings from a
     * single input token (within the defined length restrictions). Edge n-grams can be used to cover word-based
     * auto-completion queries with an index, for which you should set the following other options:
     * <ul>
     * <li>accent: false</li>
     * <li>case: {@link SearchAnalyzerCase#lower}</li>
     * <li>stemming: false</li>
     * </ul>
     */
    public EdgeNgram getEdgeNgram() {
        return edgeNgram;
    }

    /**
     * @param edgeNgram the edge n-gram settings, see {@link #getEdgeNgram()}
     */
    public void setEdgeNgram(EdgeNgram edgeNgram) {
        this.edgeNgram = edgeNgram;
    }

    /**
     * @return an array of strings with words to omit from result. Default: load words from stopwordsPath. To disable
     * stop-word filtering provide an empty array []. If both stopwords and stopwordsPath are provided then both word
     * sources are combined.
     */
    public List<String> getStopwords() {
        return stopwords;
    }

    /**
     * @param stopwords the words to omit from result, see {@link #getStopwords()}; the list is stored as given,
     *                  without copying
     */
    public void setStopwords(List<String> stopwords) {
        this.stopwords = stopwords;
    }

    /**
     * @return path with a language sub-directory (e.g. en for a locale en_US.utf-8) containing files with words to
     * omit.
     * Each word has to be on a separate line. Everything after the first whitespace character on a line will be ignored
     * and can be used for comments. The files can be named arbitrarily and have any file extension (or none).
     * <p>
     * Default: if no path is provided then the value of the environment variable IRESEARCH_TEXT_STOPWORD_PATH is used
     * to determine the path, or if it is undefined then the current working directory is assumed. If the stopwords
     * attribute is provided then no stop-words are loaded from files, unless an explicit stopwordsPath is also
     * provided.
     * <p>
     * Note that if the stopwordsPath can not be accessed, is missing language sub-directories or has no files for a
     * language required by an Analyzer, then the creation of a new Analyzer is refused. If such an issue is discovered
     * for an existing Analyzer during startup then the server will abort with a fatal error.
     */
    public String getStopwordsPath() {
        return stopwordsPath;
    }

    /**
     * @param stopwordsPath the stop-word directory path, see {@link #getStopwordsPath()}
     */
    public void setStopwordsPath(String stopwordsPath) {
        this.stopwordsPath = stopwordsPath;
    }

    /**
     * Two instances are equal when they are of exactly the same class and all seven properties are equal.
     */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (o == null || getClass() != o.getClass()) {
            return false;
        }
        TextAnalyzerProperties that = (TextAnalyzerProperties) o;
        return accent == that.accent
                && stemming == that.stemming
                && Objects.equals(locale, that.locale)
                && analyzerCase == that.analyzerCase
                && Objects.equals(edgeNgram, that.edgeNgram)
                && Objects.equals(stopwords, that.stopwords)
                && Objects.equals(stopwordsPath, that.stopwordsPath);
    }

    /**
     * Hash code over the same seven properties compared by {@link #equals(Object)}.
     */
    @Override
    public int hashCode() {
        return Objects.hash(locale, accent, analyzerCase, stemming, edgeNgram, stopwords, stopwordsPath);
    }

}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy