org.apache.cassandra.index.sasi.analyzer.DelimiterAnalyzer Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of cassandra-all Show documentation
A fork of the Apache Cassandra Project ready to embed Elasticsearch.
There is a newer version: 3.11.12.3
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cassandra.index.sasi.analyzer;

import java.nio.CharBuffer;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

import com.google.common.annotations.Beta;
import com.google.common.base.Preconditions;

import org.apache.cassandra.db.marshal.AbstractType;
import org.apache.cassandra.db.marshal.AsciiType;
import org.apache.cassandra.db.marshal.UTF8Type;
import org.apache.cassandra.utils.AbstractIterator;

@Beta
public class DelimiterAnalyzer extends AbstractAnalyzer
{

    private static final Map, Charset> VALID_ANALYZABLE_TYPES = new HashMap, Charset>()
    {{
        put(UTF8Type.instance, StandardCharsets.UTF_8);
        put(AsciiType.instance, StandardCharsets.US_ASCII);
    }};

    private char delimiter;
    private Charset charset;
    private Iterator iter;

    public DelimiterAnalyzer()
    {
    }

    public ByteBuffer next()
    {
        return iter.next();
    }

    public void init(Map options, AbstractType validator)
    {
        DelimiterTokenizingOptions tokenizingOptions = DelimiterTokenizingOptions.buildFromMap(options);
        delimiter = tokenizingOptions.getDelimiter();

        if (!VALID_ANALYZABLE_TYPES.containsKey(validator))
            throw new IllegalArgumentException(String.format("Only text types supported, got %s", validator));

        charset = VALID_ANALYZABLE_TYPES.get(validator);
    }

    public boolean hasNext()
    {
        return iter.hasNext();
    }

    public void reset(ByteBuffer input)
    {
        Preconditions.checkNotNull(input);
        final CharBuffer cb = charset.decode(input);

        this.iter = new AbstractIterator() {
            protected ByteBuffer computeNext() {

                if (!cb.hasRemaining())
                    return endOfData();

                CharBuffer readahead = cb.duplicate();
                // loop until we see the next delimiter character, or reach end of data
                boolean readaheadRemaining;
                while ((readaheadRemaining = readahead.hasRemaining()) && readahead.get() != delimiter);

                char[] chars = new char[readahead.position() - cb.position() - (readaheadRemaining ? 1 : 0)];
                cb.get(chars);
                Preconditions.checkState(!cb.hasRemaining() || cb.get() == delimiter);

                return 0 < chars.length
                        ? charset.encode(CharBuffer.wrap(chars))
                        // blank partition keys not permitted, ref ConcurrentRadixTree.putIfAbsent(..)
                        : computeNext();
            }
        };
    }


    public boolean isTokenizing()
    {
        return true;
    }

    @Override
    public boolean isCompatibleWith(AbstractType validator)
    {
        return VALID_ANALYZABLE_TYPES.containsKey(validator);
    }
}