org.apache.cassandra.index.sasi.analyzer.StandardAnalyzer Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of cassandra-all Show documentation
A fork of the Apache Cassandra Project ready to embed Elasticsearch.
There is a newer version: 3.11.12.3
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cassandra.index.sasi.analyzer;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.Map;

import org.apache.cassandra.db.marshal.AbstractType;
import org.apache.cassandra.db.marshal.UTF8Type;
import org.apache.cassandra.index.sasi.analyzer.filter.BasicResultFilters;
import org.apache.cassandra.index.sasi.analyzer.filter.FilterPipelineBuilder;
import org.apache.cassandra.index.sasi.analyzer.filter.FilterPipelineExecutor;
import org.apache.cassandra.index.sasi.analyzer.filter.FilterPipelineTask;
import org.apache.cassandra.index.sasi.analyzer.filter.StemmingFilters;
import org.apache.cassandra.index.sasi.analyzer.filter.StopWordFilters;
import org.apache.cassandra.io.util.DataInputBuffer;
import org.apache.cassandra.utils.ByteBufferUtil;

import com.carrotsearch.hppc.IntObjectHashMap;
import com.carrotsearch.hppc.IntObjectMap;
import com.google.common.annotations.VisibleForTesting;

public class StandardAnalyzer extends AbstractAnalyzer
{
    public enum TokenType
    {
        EOF(-1),
        ALPHANUM(0),
        NUM(6),
        SOUTHEAST_ASIAN(9),
        IDEOGRAPHIC(10),
        HIRAGANA(11),
        KATAKANA(12),
        HANGUL(13);

        private static final IntObjectMap TOKENS = new IntObjectHashMap<>();

        static
        {
            for (TokenType type : TokenType.values())
                TOKENS.put(type.value, type);
        }

        public final int value;

        TokenType(int value)
        {
            this.value = value;
        }

        public int getValue()
        {
            return value;
        }

        public static TokenType fromValue(int val)
        {
            return TOKENS.get(val);
        }
    }

    private AbstractType validator;

    private StandardTokenizerInterface scanner;
    private StandardTokenizerOptions options;
    private FilterPipelineTask filterPipeline;

    protected Reader inputReader = null;

    public String getToken()
    {
        return scanner.getText();
    }

    public final boolean incrementToken() throws IOException
    {
        while(true)
        {
            TokenType currentTokenType = TokenType.fromValue(scanner.getNextToken());
            if (currentTokenType == TokenType.EOF)
                return false;
            if (scanner.yylength() <= options.getMaxTokenLength()
                    && scanner.yylength() >= options.getMinTokenLength())
                return true;
        }
    }

    protected String getFilteredCurrentToken() throws IOException
    {
        String token = getToken();
        Object pipelineRes;

        while (true)
        {
            pipelineRes = FilterPipelineExecutor.execute(filterPipeline, token);
            if (pipelineRes != null)
                break;

            boolean reachedEOF = incrementToken();
            if (!reachedEOF)
                break;

            token = getToken();
        }

        return (String) pipelineRes;
    }

    private FilterPipelineTask getFilterPipeline()
    {
        FilterPipelineBuilder builder = new FilterPipelineBuilder(new BasicResultFilters.NoOperation());
        if (!options.isCaseSensitive() && options.shouldLowerCaseTerms())
            builder = builder.add("to_lower", new BasicResultFilters.LowerCase());
        if (!options.isCaseSensitive() && options.shouldUpperCaseTerms())
            builder = builder.add("to_upper", new BasicResultFilters.UpperCase());
        if (options.shouldIgnoreStopTerms())
            builder = builder.add("skip_stop_words", new StopWordFilters.DefaultStopWordFilter(options.getLocale()));
        if (options.shouldStemTerms())
            builder = builder.add("term_stemming", new StemmingFilters.DefaultStemmingFilter(options.getLocale()));
        return builder.build();
    }

    public void init(Map options, AbstractType validator)
    {
        init(StandardTokenizerOptions.buildFromMap(options), validator);
    }

    @VisibleForTesting
    protected void init(StandardTokenizerOptions options)
    {
        init(options, UTF8Type.instance);
    }

    public void init(StandardTokenizerOptions tokenizerOptions, AbstractType validator)
    {
        this.validator = validator;
        this.options = tokenizerOptions;
        this.filterPipeline = getFilterPipeline();

        Reader reader = new InputStreamReader(new DataInputBuffer(ByteBufferUtil.EMPTY_BYTE_BUFFER, false), StandardCharsets.UTF_8);
        this.scanner = new StandardTokenizerImpl(reader);
        this.inputReader = reader;
    }

    public boolean hasNext()
    {
        try
        {
            if (incrementToken())
            {
                if (getFilteredCurrentToken() != null)
                {
                    this.next = validator.fromString(normalize(getFilteredCurrentToken()));
                    return true;
                }
            }
        }
        catch (IOException e)
        {}

        return false;
    }

    public void reset(ByteBuffer input)
    {
        this.next = null;
        Reader reader = new InputStreamReader(new DataInputBuffer(input, false), StandardCharsets.UTF_8);
        scanner.yyreset(reader);
        this.inputReader = reader;
    }

    @VisibleForTesting
    public void reset(InputStream input)
    {
        this.next = null;
        Reader reader = new InputStreamReader(input, StandardCharsets.UTF_8);
        scanner.yyreset(reader);
        this.inputReader = reader;
    }

    public boolean isTokenizing()
    {
        return true;
    }
}