org.dizitart.no2.fulltext.BaseTextTokenizer Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of nitrite Show documentation

There is a newer version: 4.3.0

/*
 *
 * Copyright 2017-2018 Nitrite author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *        http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package org.dizitart.no2.fulltext;

import java.io.IOException;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.StringTokenizer;

/**
 * An abstract text tokenizer which tokenizes a given string.
 * It discards certain words, known as stop words, depending on
 * the language chosen.
 *
 * @since 2.1.0
 * @author Anindya Chatterjee
 */
public abstract class BaseTextTokenizer implements TextTokenizer {
    /**
     * Characters that separate one token from the next: ASCII whitespace
     * plus common punctuation and symbol characters.
     */
    private static final String TOKEN_DELIMITERS = " \t\n\r\f+\"*%&/()=?'!,.;:-_#@|^~`{}[]<>\\";

    /**
     * Splits the `text` on whitespace and punctuation, passes every piece
     * through {@link #convertWord(String)} and collects the distinct
     * non-null results.
     *
     * @param text the text to tokenize; a `null` text yields an empty set
     * @return a new, mutable set holding the distinct converted tokens;
     *         never `null`
     * @throws IOException not thrown by the code in this method; retained in
     *                     the signature so that existing callers and
     *                     overriding subclasses keep compiling
     */
    @Override
    public Set<String> tokenize(String text) throws IOException {
        Set<String> words = new HashSet<>();
        if (text == null) {
            // StringTokenizer would fail with a NullPointerException; no text means no tokens.
            return words;
        }

        StringTokenizer tokenizer = new StringTokenizer(text, TOKEN_DELIMITERS);
        while (tokenizer.hasMoreTokens()) {
            String word = convertWord(tokenizer.nextToken());
            // convertWord signals a discarded (stop) word with null.
            if (word != null) {
                words.add(word);
            }
        }
        return words;
    }

    /**
     * Converts a `word` into all lower case and checks if it
     * is a known stop word. If it is, then the `word` will be
     * discarded and will not be considered as a valid token.
     *
     * @param word the word; must not be `null`
     * @return the word in all lower case, or `null` if the lower-cased
     *         word is contained in `stopWords()`
     */
    protected String convertWord(String word) {
        // Locale.ROOT keeps the produced tokens independent of the JVM's
        // default locale, so the same text always yields the same tokens.
        String lowerCased = word.toLowerCase(Locale.ROOT);
        if (stopWords().contains(lowerCased)) {
            return null;
        }
        return lowerCased;
    }
}