org.dizitart.no2.fulltext.TextTokenizer Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of nitrite Show documentation

There is a newer version: 4.3.0

/*
 *
 * Copyright 2017-2018 Nitrite author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *        http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package org.dizitart.no2.fulltext;

import java.io.IOException;
import java.util.Set;

/**
 * A stop-word based string tokenizer.
 *
 * <p>An implementation breaks a piece of text into a set of tokens and
 * leaves out the stop-words it exposes through {@link #stopWords()}.
 *
 * @since 1.0
 * @author Anindya Chatterjee.
 * @see TextIndexingService
 * @see EnglishTextTokenizer
 * @see org.dizitart.no2.NitriteBuilder#textTokenizer(TextTokenizer)
 */
public interface TextTokenizer {
    /**
     * Tokenizes the given {@code text} and discards all stop-words from it.
     *
     * @param text the text to tokenize
     * @return the set of tokens found in {@code text}, without stop-words.
     * @throws IOException if a low-level I/O error occurs.
     */
    Set<String> tokenize(String text) throws IOException;

    /**
     * Gets all stop-words for a language.
     *
     * @return the set of all stop-words.
     */
    Set<String> stopWords();
}