All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.bigdata.search.DefaultAnalyzerFactory Maven / Gradle / Ivy

/**

Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
/*
 * Created on Dec 21, 2010
 */

package com.bigdata.search;

import java.util.HashMap;
import java.util.Locale;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.br.BrazilianAnalyzer;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.cz.CzechAnalyzer;
import org.apache.lucene.analysis.de.GermanAnalyzer;
import org.apache.lucene.analysis.el.GreekAnalyzer;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.analysis.nl.DutchAnalyzer;
import org.apache.lucene.analysis.ru.RussianAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.th.ThaiAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;

import com.bigdata.btree.keys.IKeyBuilder;
import com.bigdata.btree.keys.KeyBuilder;

/**
 * This is the default implementation but should be regarded as legacy since
 * it fails to use the correct {@link Analyzer} for almost all languages (other than
 * English). It uses the correct natural language analyzer only for literals tagged with
 * certain three letter ISO 639 codes:
 * "por", "deu", "ger", "zho", "chi", "jpn", "kor", "ces", "cze", "dut", "nld", "gre", "ell",
 * "fra", "fre", "rus" and "tha". All other tags are treated as English.
 * These codes do not work if they are used with subtagse.g. "ger-AT" is treated as English.
 * No two letter code, other than "en" works correctly: note that the W3C and 
 * IETF recommend the use of the two letter forms instead of the three letter forms.
 * 
 * @author Bryan Thompson
 * @deprecated Using {@link ConfigurableAnalyzerFactory} with 
 *    the {@link ConfigurableAnalyzerFactory.Options#NATURAL_LANGUAGE_SUPPORT} 
 *    uses the appropriate natural language analyzers for the two letter codes
 *    and for tags which include sub-tags.
 * @version $Id$
 */
public class DefaultAnalyzerFactory implements IAnalyzerFactory {

    private final FullTextIndex fullTextIndex;
    
    public DefaultAnalyzerFactory(final FullTextIndex fullTextIndex) {

        if (fullTextIndex == null)
            throw new IllegalArgumentException();
        
        this.fullTextIndex = fullTextIndex;
        
    }
    
    public Analyzer getAnalyzer(final String languageCode, final boolean filterStopwords) {

        final IKeyBuilder keyBuilder = fullTextIndex.getKeyBuilder();

        Map map = getAnalyzers();
        
        AnalyzerConstructor ctor = null;
        
        if (languageCode == null) {
        
            if (keyBuilder.isUnicodeSupported()) {

                // The configured local for the database.
                final Locale locale = ((KeyBuilder) keyBuilder)
                        .getSortKeyGenerator().getLocale();

                // The analyzer for that locale.
                Analyzer a = getAnalyzer(locale.getLanguage(), filterStopwords);

                if (a != null)
                    return a;
            
            }
            
            // fall through
            
        } else {
            
            /*
             * Check the declared analyzers. We first check the three letter
             * language code. If we do not have a match there then we check the
             * 2 letter language code.
             */
            
            String code = languageCode;

            if (code.length() > 3) {

                code = code.substring(0, 2);

                ctor = map.get(languageCode);

            }

            if (ctor == null && code.length() > 2) {

                code = code.substring(0, 1);

                ctor = map.get(languageCode);
                
            }
            
        }
        
        if (ctor == null) {

            // request the default analyzer.
            
            ctor = map.get("");
            
            if (ctor == null) {

                throw new IllegalStateException("No entry for empty string?");
                
            }
            
        }

        Analyzer a = ctor.newInstance(filterStopwords);
        
        return a;
        
    }

    abstract private static class AnalyzerConstructor {
        
        abstract public Analyzer newInstance(final boolean filterStopwords);
        
    }

    /**
     * A map containing instances of the various kinds of analyzers that we know
     * about.
     * 

* Note: There MUST be an entry under the empty string (""). This entry will * be requested when there is no entry for the specified language code. */ private Map analyzers; /** * Initializes the various kinds of analyzers that we know about. *

* Note: Each {@link Analyzer} is registered under both the 3 letter and the * 2 letter language codes. See ISO 639-2. * * @todo get some informed advice on which {@link Analyzer}s map onto which * language codes. * * @todo thread safety? Analyzers produce token processors so maybe there is * no problem here once things are initialized. If so, maybe this * could be static. * * @todo configuration. Could be configured by a file containing a class * name and a list of codes that are handled by that class. * * @todo strip language code down to 2/3 characters during lookup. * * @todo There are a lot of pidgins based on frenchenglish, and other * languages that are not being assigned here. */ synchronized private Map getAnalyzers() { if (analyzers != null) { return analyzers; } analyzers = new HashMap(); final CharArraySet emptyStopwords = CharArraySet.EMPTY_SET; { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new BrazilianAnalyzer() : new BrazilianAnalyzer(emptyStopwords); } }; analyzers.put("por", a); analyzers.put("pt", a); } /* * Claims to handle Chinese. Does single character extraction. Claims to * produce smaller indices as a result. * * Note: you can not tokenize with the Chinese analyzer and the do * search using the CJK analyzer and visa versa. * * Note: I have no idea whether this would work for Japanese and Korean * as well. I expect so, but no real clue. */ { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new SmartChineseAnalyzer() : new SmartChineseAnalyzer(emptyStopwords); } }; analyzers.put("zho", a); analyzers.put("chi", a); analyzers.put("zh", a); } /* * Claims to handle Chinese, Japanese, Korean. Does double character * extraction with overlap. 
*/ { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new CJKAnalyzer() : new CJKAnalyzer(emptyStopwords); } }; // analyzers.put("zho", a); // analyzers.put("chi", a); // analyzers.put("zh", a); analyzers.put("jpn", a); analyzers.put("ja", a); analyzers.put("jpn", a); analyzers.put("kor",a); analyzers.put("ko",a); } { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new CzechAnalyzer() : new CzechAnalyzer(emptyStopwords); } }; analyzers.put("ces",a); analyzers.put("cze",a); analyzers.put("cs",a); } { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new DutchAnalyzer() : new DutchAnalyzer(emptyStopwords); } }; analyzers.put("dut",a); analyzers.put("nld",a); analyzers.put("nl",a); } { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new FrenchAnalyzer() : new FrenchAnalyzer(emptyStopwords); } }; analyzers.put("fra",a); analyzers.put("fre",a); analyzers.put("fr",a); } /* * Note: There are a lot of language codes for German variants that * might be useful here. */ { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new GermanAnalyzer() : new GermanAnalyzer(emptyStopwords); } }; analyzers.put("deu",a); analyzers.put("ger",a); analyzers.put("de",a); } // Note: ancient greek has a different code (grc). { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new GreekAnalyzer() : new GreekAnalyzer(emptyStopwords); } }; analyzers.put("gre",a); analyzers.put("ell",a); analyzers.put("el",a); } // @todo what about other Cyrillic scripts? 
{ AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new RussianAnalyzer() : new RussianAnalyzer(emptyStopwords); } }; analyzers.put("rus",a); analyzers.put("ru",a); } { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return new ThaiAnalyzer(); } }; analyzers.put("tha",a); analyzers.put("th",a); } // English { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new StandardAnalyzer() : new StandardAnalyzer(emptyStopwords); } }; analyzers.put("eng", a); analyzers.put("en", a); /* * Note: There MUST be an entry under the empty string (""). This * entry will be requested when there is no entry for the specified * language code. */ analyzers.put("", a); } return analyzers; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy