All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.cassandra.index.sasi.analyzer.StandardTokenizerOptions Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cassandra.index.sasi.analyzer;

import java.util.Locale;
import java.util.Map;

/**
 * Various options for controlling tokenization and enabling
 * or disabling features
 */
public class StandardTokenizerOptions
{
    public static final String TOKENIZATION_ENABLE_STEMMING = "tokenization_enable_stemming";
    public static final String TOKENIZATION_SKIP_STOP_WORDS = "tokenization_skip_stop_words";
    public static final String TOKENIZATION_LOCALE = "tokenization_locale";
    public static final String TOKENIZATION_NORMALIZE_LOWERCASE = "tokenization_normalize_lowercase";
    public static final String TOKENIZATION_NORMALIZE_UPPERCASE = "tokenization_normalize_uppercase";

    public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
    public static final int DEFAULT_MIN_TOKEN_LENGTH = 0;

    private boolean stemTerms;
    private boolean ignoreStopTerms;
    private Locale locale;
    private boolean caseSensitive;
    private boolean allTermsToUpperCase;
    private boolean allTermsToLowerCase;
    private int minTokenLength;
    private int maxTokenLength;

    public boolean shouldStemTerms()
    {
        return stemTerms;
    }

    public void setStemTerms(boolean stemTerms)
    {
        this.stemTerms = stemTerms;
    }

    public boolean shouldIgnoreStopTerms()
    {
        return ignoreStopTerms;
    }

    public void setIgnoreStopTerms(boolean ignoreStopTerms)
    {
        this.ignoreStopTerms = ignoreStopTerms;
    }

    public Locale getLocale()
    {
        return locale;
    }

    public void setLocale(Locale locale)
    {
        this.locale = locale;
    }

    public boolean isCaseSensitive()
    {
        return caseSensitive;
    }

    public void setCaseSensitive(boolean caseSensitive)
    {
        this.caseSensitive = caseSensitive;
    }

    public boolean shouldUpperCaseTerms()
    {
        return allTermsToUpperCase;
    }

    public void setAllTermsToUpperCase(boolean allTermsToUpperCase)
    {
        this.allTermsToUpperCase = allTermsToUpperCase;
    }

    public boolean shouldLowerCaseTerms()
    {
        return allTermsToLowerCase;
    }

    public void setAllTermsToLowerCase(boolean allTermsToLowerCase)
    {
        this.allTermsToLowerCase = allTermsToLowerCase;
    }

    public int getMinTokenLength()
    {
        return minTokenLength;
    }

    public void setMinTokenLength(int minTokenLength)
    {
        this.minTokenLength = minTokenLength;
    }

    public int getMaxTokenLength()
    {
        return maxTokenLength;
    }

    public void setMaxTokenLength(int maxTokenLength)
    {
        this.maxTokenLength = maxTokenLength;
    }

    public static class OptionsBuilder 
    {
        private boolean stemTerms;
        private boolean ignoreStopTerms;
        private Locale locale;
        private boolean caseSensitive;
        private boolean allTermsToUpperCase;
        private boolean allTermsToLowerCase;
        private int minTokenLength = DEFAULT_MIN_TOKEN_LENGTH;
        private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;

        public OptionsBuilder()
        {
        }

        public OptionsBuilder stemTerms(boolean stemTerms)
        {
            this.stemTerms = stemTerms;
            return this;
        }

        public OptionsBuilder ignoreStopTerms(boolean ignoreStopTerms)
        {
            this.ignoreStopTerms = ignoreStopTerms;
            return this;
        }

        public OptionsBuilder useLocale(Locale locale)
        {
            this.locale = locale;
            return this;
        }

        public OptionsBuilder caseSensitive(boolean caseSensitive)
        {
            this.caseSensitive = caseSensitive;
            return this;
        }

        public OptionsBuilder alwaysUpperCaseTerms(boolean allTermsToUpperCase)
        {
            this.allTermsToUpperCase = allTermsToUpperCase;
            return this;
        }

        public OptionsBuilder alwaysLowerCaseTerms(boolean allTermsToLowerCase)
        {
            this.allTermsToLowerCase = allTermsToLowerCase;
            return this;
        }

        /**
         * Set the min allowed token length.  Any token shorter
         * than this is skipped.
         */
        public OptionsBuilder minTokenLength(int minTokenLength)
        {
            if (minTokenLength < 1)
                throw new IllegalArgumentException("minTokenLength must be greater than zero");
            this.minTokenLength = minTokenLength;
            return this;
        }

        /**
         * Set the max allowed token length.  Any token longer
         * than this is skipped.
         */
        public OptionsBuilder maxTokenLength(int maxTokenLength)
        {
            if (maxTokenLength < 1)
                throw new IllegalArgumentException("maxTokenLength must be greater than zero");
            this.maxTokenLength = maxTokenLength;
            return this;
        }

        public StandardTokenizerOptions build()
        {
            if(allTermsToLowerCase && allTermsToUpperCase)
                throw new IllegalArgumentException("Options to normalize terms cannot be " +
                        "both uppercase and lowercase at the same time");

            StandardTokenizerOptions options = new StandardTokenizerOptions();
            options.setIgnoreStopTerms(ignoreStopTerms);
            options.setStemTerms(stemTerms);
            options.setLocale(locale);
            options.setCaseSensitive(caseSensitive);
            options.setAllTermsToLowerCase(allTermsToLowerCase);
            options.setAllTermsToUpperCase(allTermsToUpperCase);
            options.setMinTokenLength(minTokenLength);
            options.setMaxTokenLength(maxTokenLength);
            return options;
        }
    }

    public static StandardTokenizerOptions buildFromMap(Map optionsMap)
    {
        OptionsBuilder optionsBuilder = new OptionsBuilder();

        for (Map.Entry entry : optionsMap.entrySet())
        {
            switch(entry.getKey())
            {
                case TOKENIZATION_ENABLE_STEMMING:
                {
                    boolean bool = Boolean.parseBoolean(entry.getValue());
                    optionsBuilder = optionsBuilder.stemTerms(bool);
                    break;
                }
                case TOKENIZATION_SKIP_STOP_WORDS:
                {
                    boolean bool = Boolean.parseBoolean(entry.getValue());
                    optionsBuilder = optionsBuilder.ignoreStopTerms(bool);
                    break;
                }
                case TOKENIZATION_LOCALE:
                {
                    Locale locale = new Locale(entry.getValue());
                    optionsBuilder = optionsBuilder.useLocale(locale);
                    break;
                }
                case TOKENIZATION_NORMALIZE_UPPERCASE:
                {
                    boolean bool = Boolean.parseBoolean(entry.getValue());
                    optionsBuilder = optionsBuilder.alwaysUpperCaseTerms(bool);
                    break;
                }
                case TOKENIZATION_NORMALIZE_LOWERCASE:
                {
                    boolean bool = Boolean.parseBoolean(entry.getValue());
                    optionsBuilder = optionsBuilder.alwaysLowerCaseTerms(bool);
                    break;
                }
                default:
                {
                }
            }
        }
        return optionsBuilder.build();
    }

    public static StandardTokenizerOptions getDefaultOptions()
    {
        return new OptionsBuilder()
                .ignoreStopTerms(true).alwaysLowerCaseTerms(true)
                .stemTerms(false).useLocale(Locale.ENGLISH).build();
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy