All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.dizitart.no2.fulltext.UniversalTextTokenizer Maven / Gradle / Ivy

There is a newer version: 4.3.0
Show newest version
/*
 *
 * Copyright 2017-2018 Nitrite author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *        http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package org.dizitart.no2.fulltext;

import org.dizitart.no2.fulltext.languages.*;

import java.util.HashSet;
import java.util.Set;

/**
 * A {@link TextTokenizer} implementation for various languages.
 * <p>
 * A new instance starts with an empty stop-word set. Stop words are
 * accumulated by calling {@link #loadLanguage(Languages...)} or
 * {@link #loadAllLanguages()}; repeated calls add to the same set.
 *
 * @see Languages
 * @author Anindya Chatterjee
 * @since 2.1.0
 */
public class UniversalTextTokenizer extends BaseTextTokenizer {
    /** Union of the stop words of every language registered so far. */
    private final Set<String> stopWords = new HashSet<>();

    /**
     * Returns the stop words collected from all languages loaded so far.
     *
     * @return the internal stop-word set itself (not a copy); it is empty
     *         until a language has been loaded
     */
    @Override
    public Set<String> stopWords() {
        return stopWords;
    }

    /**
     * Adds every stop word of the given language to this tokenizer's set.
     *
     * @param language the language whose stop words are merged in
     */
    private void registerLanguage(Language language) {
        stopWords.addAll(language.stopWords());
    }

    /**
     * Loads the stop words of the given languages into this tokenizer.
     * <p>
     * Languages are processed in argument order and their stop words are
     * added to the ones already present.
     *
     * @param languages the languages to load
     * @throws NullPointerException if {@code languages} or one of its elements
     *         is {@code null}; languages preceding a {@code null} element have
     *         already been loaded when the exception is thrown
     */
    public void loadLanguage(Languages... languages) {
        for (Languages language : languages) {
            switch (language) {
                case Afrikaans:
                    registerLanguage(new Afrikaans());
                    break;
                case Arabic:
                    registerLanguage(new Arabic());
                    break;
                case Armenian:
                    registerLanguage(new Armenian());
                    break;
                case Basque:
                    registerLanguage(new Basque());
                    break;
                case Bengali:
                    registerLanguage(new Bengali());
                    break;
                case Breton:
                    registerLanguage(new Breton());
                    break;
                case Bulgarian:
                    registerLanguage(new Bulgarian());
                    break;
                case Catalan:
                    registerLanguage(new Catalan());
                    break;
                case Chinese:
                    registerLanguage(new Chinese());
                    break;
                case Croatian:
                    registerLanguage(new Croatian());
                    break;
                case Czech:
                    registerLanguage(new Czech());
                    break;
                case Danish:
                    registerLanguage(new Danish());
                    break;
                case Dutch:
                    registerLanguage(new Dutch());
                    break;
                case English:
                    registerLanguage(new English());
                    break;
                case Esperanto:
                    registerLanguage(new Esperanto());
                    break;
                case Estonian:
                    registerLanguage(new Estonian());
                    break;
                case Finnish:
                    registerLanguage(new Finnish());
                    break;
                case French:
                    registerLanguage(new French());
                    break;
                case Galician:
                    registerLanguage(new Galician());
                    break;
                case German:
                    registerLanguage(new German());
                    break;
                case Greek:
                    registerLanguage(new Greek());
                    break;
                case Hausa:
                    registerLanguage(new Hausa());
                    break;
                case Hebrew:
                    registerLanguage(new Hebrew());
                    break;
                case Hindi:
                    registerLanguage(new Hindi());
                    break;
                case Hungarian:
                    registerLanguage(new Hungarian());
                    break;
                case Indonesian:
                    registerLanguage(new Indonesian());
                    break;
                case Irish:
                    registerLanguage(new Irish());
                    break;
                case Italian:
                    registerLanguage(new Italian());
                    break;
                case Japanese:
                    registerLanguage(new Japanese());
                    break;
                case Korean:
                    registerLanguage(new Korean());
                    break;
                case Kurdish:
                    registerLanguage(new Kurdish());
                    break;
                case Latin:
                    registerLanguage(new Latin());
                    break;
                case Latvian:
                    registerLanguage(new Latvian());
                    break;
                case Lithuanian:
                    registerLanguage(new Lithuanian());
                    break;
                case Malay:
                    registerLanguage(new Malay());
                    break;
                case Marathi:
                    registerLanguage(new Marathi());
                    break;
                case Norwegian:
                    registerLanguage(new Norwegian());
                    break;
                case Persian:
                    registerLanguage(new Persian());
                    break;
                case Polish:
                    registerLanguage(new Polish());
                    break;
                case Portuguese:
                    registerLanguage(new Portuguese());
                    break;
                case Romanian:
                    registerLanguage(new Romanian());
                    break;
                case Russian:
                    registerLanguage(new Russian());
                    break;
                case Sesotho:
                    registerLanguage(new Sesotho());
                    break;
                case Slovak:
                    registerLanguage(new Slovak());
                    break;
                case Slovenian:
                    registerLanguage(new Slovenian());
                    break;
                case Somali:
                    registerLanguage(new Somali());
                    break;
                case Spanish:
                    registerLanguage(new Spanish());
                    break;
                case Swahili:
                    registerLanguage(new Swahili());
                    break;
                case Swedish:
                    registerLanguage(new Swedish());
                    break;
                case Tagalog:
                    registerLanguage(new Tagalog());
                    break;
                case Thai:
                    registerLanguage(new Thai());
                    break;
                case Turkish:
                    registerLanguage(new Turkish());
                    break;
                case Ukrainian:
                    registerLanguage(new Ukrainian());
                    break;
                case Urdu:
                    registerLanguage(new Urdu());
                    break;
                case Vietnamese:
                    registerLanguage(new Vietnamese());
                    break;
                case Yoruba:
                    registerLanguage(new Yoruba());
                    break;
                case Zulu:
                    registerLanguage(new Zulu());
                    break;
                default:
                    // A constant with no case above contributes no stop words.
                    break;
            }
        }
    }

    /**
     * Loads the stop words of every constant declared in {@link Languages}.
     */
    public void loadAllLanguages() {
        loadLanguage(Languages.values());
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy