// All downloads are free. Search and download functionality uses the official Maven repository.

// org.carrot2.text.linguistic.DefaultStemmerFactory (Maven / Gradle / Ivy)


/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package org.carrot2.text.linguistic;

import java.util.EnumMap;

import org.carrot2.core.LanguageCode;
import org.carrot2.text.linguistic.lucene.ArabicStemmerAdapter;
import org.carrot2.text.linguistic.lucene.HindiStemmerAdapter;
import org.carrot2.text.linguistic.lucene.SnowballStemmerFactory;
import org.carrot2.text.linguistic.morfologik.MorfologikStemmerAdapter;
import org.carrot2.util.annotations.ThreadSafe;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.factory.FallbackFactory;
import org.carrot2.util.factory.IFactory;
import org.carrot2.util.factory.NewClassInstanceFactory;
import org.carrot2.util.factory.SingletonFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.carrot2.shaded.guava.common.base.Predicate;
import org.carrot2.shaded.guava.common.collect.Maps;

@Bindable
@ThreadSafe
public class DefaultStemmerFactory implements IStemmerFactory
{
    private final static Logger logger = LoggerFactory.getLogger(DefaultStemmerFactory.class);

    private final static EnumMap> stemmerFactories;
    
    /**
     * Functional verification for {@link IStemmer}.
     */
    private final static Predicate stemmerVerifier = new Predicate()
    {
        @Override
        public boolean apply(IStemmer stemmer)
        {
            // Assume functional if there's no exception.
            stemmer.stem("verification");
            return true;
        }
    };

    /**
     * Initialize factories.
     */
    static
    {
        stemmerFactories = createDefaultStemmers();
    }


    @Override
    public IStemmer getStemmer(LanguageCode languageCode)
    {
        return stemmerFactories.get(languageCode).createInstance();
    }

    /**
     * Create default stemmer factories.
     */
    private static EnumMap> createDefaultStemmers()
    {
        final IFactory identity = new SingletonFactory(new IdentityStemmer());
        final EnumMap> map = Maps.newEnumMap(LanguageCode.class);

        // Adapters to third-party libraries.
        map.put(LanguageCode.POLISH,     new NewClassInstanceFactory(MorfologikStemmerAdapter.class));
        map.put(LanguageCode.ARABIC,     new NewClassInstanceFactory(ArabicStemmerAdapter.class));

        // Adapters to snowball.
        map.put(LanguageCode.DANISH,     new SnowballStemmerFactory("org.tartarus.snowball.ext.DanishStemmer"));
        map.put(LanguageCode.DUTCH,      new SnowballStemmerFactory("org.tartarus.snowball.ext.DutchStemmer"));
        map.put(LanguageCode.ENGLISH,    new SnowballStemmerFactory("org.tartarus.snowball.ext.EnglishStemmer"));
        map.put(LanguageCode.FINNISH,    new SnowballStemmerFactory("org.tartarus.snowball.ext.FinnishStemmer"));
        map.put(LanguageCode.FRENCH,     new SnowballStemmerFactory("org.tartarus.snowball.ext.FrenchStemmer"));
        map.put(LanguageCode.GERMAN,     new SnowballStemmerFactory("org.tartarus.snowball.ext.GermanStemmer"));
        map.put(LanguageCode.HUNGARIAN,  new SnowballStemmerFactory("org.tartarus.snowball.ext.HungarianStemmer"));
        map.put(LanguageCode.ITALIAN,    new SnowballStemmerFactory("org.tartarus.snowball.ext.ItalianStemmer"));
        map.put(LanguageCode.NORWEGIAN,  new SnowballStemmerFactory("org.tartarus.snowball.ext.NorwegianStemmer"));
        map.put(LanguageCode.PORTUGUESE, new SnowballStemmerFactory("org.tartarus.snowball.ext.PortugueseStemmer"));
        map.put(LanguageCode.ROMANIAN,   new SnowballStemmerFactory("org.tartarus.snowball.ext.RomanianStemmer"));
        map.put(LanguageCode.RUSSIAN,    new SnowballStemmerFactory("org.tartarus.snowball.ext.RussianStemmer"));
        map.put(LanguageCode.SPANISH,    new SnowballStemmerFactory("org.tartarus.snowball.ext.SpanishStemmer"));
        map.put(LanguageCode.SWEDISH,    new SnowballStemmerFactory("org.tartarus.snowball.ext.SwedishStemmer"));
        map.put(LanguageCode.TURKISH,    new SnowballStemmerFactory("org.tartarus.snowball.ext.TurkishStemmer"));

        // Identity stemming for Chinese.
        map.put(LanguageCode.CHINESE_SIMPLIFIED, identity);
        
        // Specialized stemming for Hindi (ported from Lucene)
        map.put(LanguageCode.HINDI, new NewClassInstanceFactory(HindiStemmerAdapter.class));

        // Decorate everything with a fallback identity stemmer.
        for (LanguageCode lc : LanguageCode.values())
        {
            if (map.containsKey(lc))
            {
                IFactory factory = map.get(lc);
                if (factory != identity)
                {
                    factory = new FallbackFactory(
                        factory, identity, stemmerVerifier,
                        logger, "Stemmer for "
                            + lc.toString() + " (" + lc.getIsoCode() + ") is not available."
                            + " This may degrade clustering quality of " 
                            + lc.toString() + " content. Cause: {}");

                    map.put(lc, factory);
                }
            }
            else
            {
                map.put(lc, identity);
            }
        }

        return map;
    }
}




// © 2015 - 2025 Weber Informatics LLC | Privacy Policy