com.bigdata.search.DefaultAnalyzerFactory Maven / Gradle / Ivy

Go to download
/**

Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
/*
 * Created on Dec 21, 2010
 */

package com.bigdata.search;

import java.util.HashMap;
import java.util.Locale;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.br.BrazilianAnalyzer;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.cz.CzechAnalyzer;
import org.apache.lucene.analysis.de.GermanAnalyzer;
import org.apache.lucene.analysis.el.GreekAnalyzer;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.analysis.nl.DutchAnalyzer;
import org.apache.lucene.analysis.ru.RussianAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.th.ThaiAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;

import com.bigdata.btree.keys.IKeyBuilder;
import com.bigdata.btree.keys.KeyBuilder;

/**
 * This is the default implementation but should be regarded as legacy since
 * it fails to use the correct {@link Analyzer} for almost all languages (other than
 * English). It uses the correct natural language analyzer only for literals tagged with
 * certain three letter ISO 639 codes:
 * "por", "deu", "ger", "zho", "chi", "jpn", "kor", "ces", "cze", "dut", "nld", "gre", "ell",
 * "fra", "fre", "rus" and "tha". All other tags are treated as English.
 * These codes do not work if they are used with subtags, e.g. "ger-AT" is treated as English.
 * No two letter code, other than "en" works correctly: note that the W3C and 
 * IETF recommend the use of the two letter forms instead of the three letter forms.
 * 
 * @author Bryan Thompson
 * @deprecated Using {@link ConfigurableAnalyzerFactory} with 
 *    the {@link ConfigurableAnalyzerFactory.Options#NATURAL_LANGUAGE_SUPPORT} 
 *    uses the appropriate natural language analyzers for the two letter codes
 *    and for tags which include sub-tags.
 * @version $Id$
 */
public class DefaultAnalyzerFactory implements IAnalyzerFactory {

    /**
     * Key under which the fallback analyzer constructor is registered. It is
     * requested whenever no constructor was selected for a language code.
     */
    private static final String DEFAULT_KEY = "";

    /**
     * The full text index. Its key builder is consulted for the configured
     * locale when {@link #getAnalyzer(String, boolean)} is called without a
     * language code.
     */
    private final FullTextIndex fullTextIndex;

    /**
     * A map containing the constructors for the various kinds of analyzers
     * that we know about, keyed by language code. It is populated lazily by
     * {@link #getAnalyzers()}, which is the only method that touches it.
     * 
     * Note: There MUST be an entry under the empty string (""). This entry will
     * be requested when there is no entry for the specified language code.
     */
    private Map<String, AnalyzerConstructor> analyzers;

    /**
     * @param fullTextIndex
     *            The full text index (required).
     * 
     * @throws IllegalArgumentException
     *             if the argument is <code>null</code>.
     */
    public DefaultAnalyzerFactory(final FullTextIndex fullTextIndex) {

        if (fullTextIndex == null)
            throw new IllegalArgumentException();

        this.fullTextIndex = fullTextIndex;

    }

    /**
     * Return a new {@link Analyzer} for the given language code.
     * <p>
     * The lookup rules are intentionally those of the legacy implementation,
     * since changing which analyzer a tag resolves to changes how text is
     * tokenized:
     * <ul>
     * <li>A <code>null</code> language code resolves to the analyzer for the
     * language of the locale configured on the index's key builder when that
     * key builder supports Unicode, and to the default analyzer otherwise.</li>
     * <li>A language code longer than two characters is looked up verbatim.
     * Only the registered three letter codes can match, so a tag carrying a
     * sub-tag (e.g. "ger-AT") does not match.</li>
     * <li>A language code of two or fewer characters is never looked up.</li>
     * <li>Whenever nothing was selected, the default analyzer (registered
     * under the empty string) is used.</li>
     * </ul>
     * 
     * @param languageCode
     *            The language code (optional).
     * @param filterStopwords
     *            When <code>true</code> the analyzer is created with its
     *            default stopword set, otherwise with an empty stopword set.
     * 
     * @return A new analyzer instance (never <code>null</code>).
     * 
     * @throws IllegalStateException
     *             if no default analyzer is registered.
     */
    public Analyzer getAnalyzer(final String languageCode, final boolean filterStopwords) {

        final IKeyBuilder keyBuilder = fullTextIndex.getKeyBuilder();

        final Map<String, AnalyzerConstructor> map = getAnalyzers();

        AnalyzerConstructor ctor = null;

        if (languageCode == null) {

            if (keyBuilder.isUnicodeSupported()) {

                // The configured locale for the database.
                final Locale locale = ((KeyBuilder) keyBuilder)
                        .getSortKeyGenerator().getLocale();

                // The analyzer for that locale.
                final Analyzer a = getAnalyzer(locale.getLanguage(), filterStopwords);

                if (a != null)
                    return a;

            }

            // fall through to the default analyzer.

        } else if (languageCode.length() > 2) {

            /*
             * Check the declared analyzers using the language code exactly as
             * given. Shorter codes are deliberately not looked up (legacy
             * behavior, see the method documentation).
             */
            ctor = map.get(languageCode);

        }

        if (ctor == null) {

            // request the default analyzer.
            ctor = map.get(DEFAULT_KEY);

            if (ctor == null) {

                throw new IllegalStateException("No entry for empty string?");

            }

        }

        return ctor.newInstance(filterStopwords);

    }

    /**
     * Creates a new {@link Analyzer} for one family of language codes.
     */
    abstract private static class AnalyzerConstructor {

        /**
         * @param filterStopwords
         *            When <code>true</code> the analyzer uses its default
         *            stopword set, otherwise an empty stopword set.
         * 
         * @return A new analyzer instance.
         */
        abstract public Analyzer newInstance(final boolean filterStopwords);

    }

    /**
     * Registers the same constructor under each of the given language codes.
     */
    private static void register(final Map<String, AnalyzerConstructor> map,
            final AnalyzerConstructor ctor, final String... codes) {

        for (String code : codes) {

            map.put(code, ctor);

        }

    }

    /**
     * Initializes (on first use) and returns the various kinds of analyzers
     * that we know about.
     * 
     * Note: Each {@link Analyzer} is registered under both the 3 letter and the
     * 2 letter language codes. See ISO 639-2. The 2 letter entries are not
     * reachable through {@link #getAnalyzer(String, boolean)}, which only looks
     * up codes longer than two characters.
     * 
     * @todo get some informed advice on which {@link Analyzer}s map onto which
     *       language codes.
     * 
     * @todo thread safety? Analyzers produce token processors so maybe there is
     *       no problem here once things are initialized. If so, maybe this
     *       could be static.
     * 
     * @todo configuration. Could be configured by a file containing a class
     *       name and a list of codes that are handled by that class.
     * 
     * @todo strip language code down to 2/3 characters during lookup.
     * 
     * @todo There are a lot of pidgins based on French, English, and other
     *       languages that are not being assigned here.
     */
    synchronized private Map<String, AnalyzerConstructor> getAnalyzers() {

        if (analyzers != null) {

            return analyzers;

        }

        final Map<String, AnalyzerConstructor> map = new HashMap<String, AnalyzerConstructor>();

        final CharArraySet emptyStopwords = CharArraySet.EMPTY_SET;

        // Portuguese (Brazilian analyzer).
        register(map, new AnalyzerConstructor() {
            @Override
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ?
                        new BrazilianAnalyzer() :
                        new BrazilianAnalyzer(emptyStopwords);
            }
        }, "por", "pt");

        /*
         * Claims to handle Chinese. Does single character extraction. Claims to
         * produce smaller indices as a result.
         * 
         * Note: you can not tokenize with the Chinese analyzer and then
         * search using the CJK analyzer and vice versa.
         * 
         * Note: I have no idea whether this would work for Japanese and Korean
         * as well. I expect so, but no real clue.
         */
        register(map, new AnalyzerConstructor() {
            @Override
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ?
                        new SmartChineseAnalyzer() :
                        new SmartChineseAnalyzer(emptyStopwords);
            }
        }, "zho", "chi", "zh");

        /*
         * Claims to handle Chinese, Japanese, Korean. Does double character
         * extraction with overlap. Chinese is deliberately NOT registered here
         * since it is handled by the analyzer registered above.
         */
        register(map, new AnalyzerConstructor() {
            @Override
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ?
                        new CJKAnalyzer() :
                        new CJKAnalyzer(emptyStopwords);
            }
        }, "jpn", "ja", "kor", "ko");

        // Czech
        register(map, new AnalyzerConstructor() {
            @Override
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ?
                        new CzechAnalyzer() :
                        new CzechAnalyzer(emptyStopwords);
            }
        }, "ces", "cze", "cs");

        // Dutch
        register(map, new AnalyzerConstructor() {
            @Override
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ?
                        new DutchAnalyzer() :
                        new DutchAnalyzer(emptyStopwords);
            }
        }, "dut", "nld", "nl");

        // French
        register(map, new AnalyzerConstructor() {
            @Override
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ?
                        new FrenchAnalyzer() :
                        new FrenchAnalyzer(emptyStopwords);
            }
        }, "fra", "fre", "fr");

        /*
         * Note: There are a lot of language codes for German variants that
         * might be useful here.
         */
        register(map, new AnalyzerConstructor() {
            @Override
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ?
                        new GermanAnalyzer() :
                        new GermanAnalyzer(emptyStopwords);
            }
        }, "deu", "ger", "de");

        // Note: ancient greek has a different code (grc).
        register(map, new AnalyzerConstructor() {
            @Override
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ?
                        new GreekAnalyzer() :
                        new GreekAnalyzer(emptyStopwords);
            }
        }, "gre", "ell", "el");

        // @todo what about other Cyrillic scripts?
        register(map, new AnalyzerConstructor() {
            @Override
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ?
                        new RussianAnalyzer() :
                        new RussianAnalyzer(emptyStopwords);
            }
        }, "rus", "ru");

        /*
         * Thai. The stopword flag is honored here in the same manner as for
         * every other language (it used to be ignored, so stopwords were
         * always filtered for Thai).
         */
        register(map, new AnalyzerConstructor() {
            @Override
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ?
                        new ThaiAnalyzer() :
                        new ThaiAnalyzer(emptyStopwords);
            }
        }, "tha", "th");

        /*
         * English.
         * 
         * Note: There MUST be an entry under the empty string (""). This entry
         * will be requested when there is no entry for the specified language
         * code.
         */
        register(map, new AnalyzerConstructor() {
            @Override
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords ?
                        new StandardAnalyzer() :
                        new StandardAnalyzer(emptyStopwords);
            }
        }, "eng", "en", DEFAULT_KEY);

        // Publish only once fully populated.
        analyzers = map;

        return analyzers;

    }

}