/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
[email protected]
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on May 6, 2014 by Jeremy J. Carroll, Syapse Inc.
*/
package com.bigdata.search;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.Version;
/**
* This class can be used with the bigdata properties file to specify
* which {@link Analyzer}s are used for which languages.
* Languages are specified by the language tag on RDF literals, which conform
* with RFC 5646.
* Within bigdata plain literals are assigned to the default locale's language.
*
* The bigdata properties are used to map language ranges, as specified by
* RFC 4647, to classes which extend {@link Analyzer}.
* Supported classes include all the natural language specific classes from Lucene, and also:
* <ul>
* <li>{@link PatternAnalyzer}</li>
* <li>{@link TermCompletionAnalyzer}</li>
* <li>{@link KeywordAnalyzer}</li>
* <li>{@link SimpleAnalyzer}</li>
* <li>{@link StopAnalyzer}</li>
* <li>{@link WhitespaceAnalyzer}</li>
* <li>{@link StandardAnalyzer}</li>
* </ul>
* More generally any subclass of {@link Analyzer} that has at least one constructor matching:
* <ul>
* <li>no arguments</li>
* <li>{@link Version}</li>
* <li>{@link Version}, {@link Set}</li>
* </ul>
* is usable. If the class has a static method named {@code getDefaultStopSet()} then this is assumed
* to do what it says on the can; some of the Lucene analyzers store their default stop words elsewhere,
* and such stopwords are usable by this class. If no stop word set can be found, and there is a constructor without
* stopwords and a constructor with stopwords, then the former is assumed to use a default stop word set.
*
* Configuration is by means of the bigdata properties file.
* All relevant properties start with {@code com.bigdata.search.ConfigurableAnalyzerFactory}, which we
* abbreviate to {@code c.b.s.C} in this documentation.
* Properties from {@link Options} apply to the factory.
*
* Other properties, from {@link AnalyzerOptions}, start with
* {@code c.b.s.C.analyzer.}<i>language-range</i>, where <i>language-range</i> conforms
* with the extended language range construct from RFC 4647, section 2.2.
* Note that bigdata does not allow '*' in property names, so the character '_' is used to
* substitute for '*' in extended language ranges in property names.
* These properties are used to specify an analyzer for the given language range.
*
* If no analyzer is specified for the language range {@code *} then the {@link StandardAnalyzer} is used.
*
* Given any specific language, the analyzer matching the longest configured language range,
* measured in number of subtags, is returned by {@link #getAnalyzer(String, boolean)}.
* In the event of a tie, the alphabetically first language range is used.
* The algorithm to find a match is "Extended Filtering" as defined in section 3.3.2 of RFC 4647.
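*
* For example, the following (purely illustrative) configuration uses the German analyzer for
* German-tagged literals and the {@link StandardAnalyzer} for everything else; note the catch-all
* range {@code *} is written {@code _} in the property name:
* <pre>
* c.b.s.C.analyzer._.analyzerClass=org.apache.lucene.analysis.standard.StandardAnalyzer
* c.b.s.C.analyzer.deu.analyzerClass=org.apache.lucene.analysis.de.GermanAnalyzer
* c.b.s.C.analyzer.de.like=deu
* </pre>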
*
* Some useful analyzers are as follows:
* <dl>
* <dt>{@link KeywordAnalyzer}</dt>
* <dd>This treats every lexical value as a single search token.</dd>
* <dt>{@link WhitespaceAnalyzer}</dt>
* <dd>This uses whitespace to tokenize.</dd>
* <dt>{@link PatternAnalyzer}</dt>
* <dd>This uses a regular expression to tokenize.</dd>
* <dt>{@link TermCompletionAnalyzer}</dt>
* <dd>This uses up to three regular expressions to specify multiple tokens for each word, to address term completion use cases.</dd>
* <dt>{@link EmptyAnalyzer}</dt>
* <dd>This suppresses the functionality, by treating every expression as a stop word.</dd>
* </dl>
* In addition, there are the language specific analyzers that are included
* by using the option {@link Options#NATURAL_LANGUAGE_SUPPORT}.
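*
* A sketch of how an analyzer is then obtained and used, assuming a factory that has already been
* constructed against a configured {@link FullTextIndex}:
* <pre>{@code
* // "en-US" is matched against the configured language ranges by extended filtering;
* // the boolean controls whether stop words are filtered out of the token stream.
* final Analyzer analyzer = analyzerFactory.getAnalyzer("en-US", true);
* final TokenStream tokens = analyzer.tokenStream("token", new StringReader("The quick brown fox"));
* }</pre>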
*
*
* @author jeremycarroll
*
*/
public class ConfigurableAnalyzerFactory implements IAnalyzerFactory {
final private static transient Logger log = Logger.getLogger(ConfigurableAnalyzerFactory.class);
/**
* Options understood by the {@link ConfigurableAnalyzerFactory}.
*/
public interface Options {
/**
* By setting this option to true, all the known Lucene Analyzers for natural
* languages are used for a range of language tags.
* These settings may then be overridden by the settings of the user.
* Specifically the following properties are loaded, prior to loading the
* user's specification (with {@code c.b.s.C} expanding to
* {@code com.bigdata.search.ConfigurableAnalyzerFactory}):
* <pre>
* c.b.s.C.analyzer._.like=eng
* c.b.s.C.analyzer.por.analyzerClass=org.apache.lucene.analysis.br.BrazilianAnalyzer
* c.b.s.C.analyzer.pt.like=por
* c.b.s.C.analyzer.zho.analyzerClass=org.apache.lucene.analysis.cn.ChineseAnalyzer
* c.b.s.C.analyzer.chi.like=zho
* c.b.s.C.analyzer.zh.like=zho
* c.b.s.C.analyzer.jpn.analyzerClass=org.apache.lucene.analysis.cjk.CJKAnalyzer
* c.b.s.C.analyzer.ja.like=jpn
* c.b.s.C.analyzer.kor.like=jpn
* c.b.s.C.analyzer.ko.like=kor
* c.b.s.C.analyzer.ces.analyzerClass=org.apache.lucene.analysis.cz.CzechAnalyzer
* c.b.s.C.analyzer.cze.like=ces
* c.b.s.C.analyzer.cs.like=ces
* c.b.s.C.analyzer.dut.analyzerClass=org.apache.lucene.analysis.nl.DutchAnalyzer
* c.b.s.C.analyzer.nld.like=dut
* c.b.s.C.analyzer.nl.like=dut
* c.b.s.C.analyzer.deu.analyzerClass=org.apache.lucene.analysis.de.GermanAnalyzer
* c.b.s.C.analyzer.ger.like=deu
* c.b.s.C.analyzer.de.like=deu
* c.b.s.C.analyzer.gre.analyzerClass=org.apache.lucene.analysis.el.GreekAnalyzer
* c.b.s.C.analyzer.ell.like=gre
* c.b.s.C.analyzer.el.like=gre
* c.b.s.C.analyzer.rus.analyzerClass=org.apache.lucene.analysis.ru.RussianAnalyzer
* c.b.s.C.analyzer.ru.like=rus
* c.b.s.C.analyzer.tha.analyzerClass=org.apache.lucene.analysis.th.ThaiAnalyzer
* c.b.s.C.analyzer.th.like=tha
* c.b.s.C.analyzer.eng.analyzerClass=org.apache.lucene.analysis.standard.StandardAnalyzer
* c.b.s.C.analyzer.en.like=eng
* </pre>
*
*
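* The same settings can also be assembled programmatically; the following is a sketch using the
* {@link Options} and {@link AnalyzerOptions} constants (the French mapping added at the end is
* purely illustrative and is not part of the built-in set):
* <pre>{@code
* final Properties props = new Properties();
* // enable the built-in natural language mappings listed above
* props.setProperty(ConfigurableAnalyzerFactory.Options.NATURAL_LANGUAGE_SUPPORT, "true");
* // add a mapping that the built-ins do not provide, using the per-language-range options
* props.setProperty(ConfigurableAnalyzerFactory.Options.ANALYZER
*         + "fra." + ConfigurableAnalyzerFactory.AnalyzerOptions.ANALYZER_CLASS,
*         "org.apache.lucene.analysis.fr.FrenchAnalyzer");
* props.setProperty(ConfigurableAnalyzerFactory.Options.ANALYZER
*         + "fr." + ConfigurableAnalyzerFactory.AnalyzerOptions.LIKE, "fra");
* }</pre>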
*/
String NATURAL_LANGUAGE_SUPPORT = ConfigurableAnalyzerFactory.class.getName() + ".naturalLanguageSupport";
/**
* This is the prefix to all properties configuring the individual analyzers.
*/
String ANALYZER = ConfigurableAnalyzerFactory.class.getName() + ".analyzer.";
String DEFAULT_NATURAL_LANGUAGE_SUPPORT = "false";
}
/**
* Options understood by analyzers created by {@link ConfigurableAnalyzerFactory}.
* These options are appended to the RFC 4647 language range
*/
public interface AnalyzerOptions {
/**
* If specified this is the fully qualified name of a subclass of {@link Analyzer}
* that has appropriate constructors.
* This is set implicitly if some of the options below are selected (for example {@link #PATTERN}).
* For each configured language range, if it is not set, either explicitly or implicitly, then
* {@link #LIKE} must be specified.
*/
String ANALYZER_CLASS = "analyzerClass";
/**
* The value of this property is a language range, for which
* an analyzer is defined.
* Treat this language range in the same way as the specified
* language range.
*
* {@link #LIKE} loops are not permitted.
*
* If this option is specified for a language range,
* then no other option is permitted.
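*
* For example (an illustrative property line, using the abbreviation from the class comment), the
* following treats literals in the {@code pt-BR} range like the separately configured {@code por} range:
* <pre>
* c.b.s.C.analyzer.pt-BR.like=por
* </pre>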
*/
String LIKE = "like";
/**
* The value of this property is one of:
* <dl>
* <dt>{@link #STOPWORDS_VALUE_NONE}</dt>
* <dd>This analyzer is used without stop words.</dd>
* <dt>{@link #STOPWORDS_VALUE_DEFAULT}</dt>
* <dd>Use the default setting for stopwords for this analyzer. It is an error
* to set this value on some analyzers such as {@link SimpleAnalyzer} that do not support stop words.</dd>
* <dt>A fully qualified class name</dt>
* <dd>... of a subclass of {@link Analyzer} which has a static method
* {@code getDefaultStopSet()}, in which case the returned set of stop words is used.</dd>
* </dl>
* If the {@link #ANALYZER_CLASS} does not support stop words then any value other than {@link #STOPWORDS_VALUE_NONE} is an error.
* If the {@link #ANALYZER_CLASS} does support stop words then the default value is {@link #STOPWORDS_VALUE_DEFAULT}.
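*
* For example (illustrative property lines, using the abbreviation from the class comment):
* <pre>
* c.b.s.C.analyzer.en.analyzerClass=org.apache.lucene.analysis.standard.StandardAnalyzer
* c.b.s.C.analyzer.en.stopwords=none
* </pre>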
*/
String STOPWORDS = "stopwords";
String STOPWORDS_VALUE_DEFAULT = "default";
String STOPWORDS_VALUE_NONE = "none";
/**
* The value of the pattern parameter to
* {@link PatternAnalyzer#PatternAnalyzer(Version, Pattern, boolean, Set)}
* (Note the {@link Pattern#UNICODE_CHARACTER_CLASS} flag is enabled).
* It is an error if a different analyzer class is specified.
*/
String PATTERN = "pattern";
/**
* The value of the wordBoundary parameter to
* {@link TermCompletionAnalyzer#TermCompletionAnalyzer(Pattern, Pattern, Pattern, boolean)}
* (Note the {@link Pattern#UNICODE_CHARACTER_CLASS} flag is enabled).
* It is an error if a different analyzer class is specified.
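*
* A term completion configuration typically sets this together with the other
* {@link TermCompletionAnalyzer} options; for example (illustrative property lines and pattern
* values, using the abbreviation from the class comment):
* <pre>
* c.b.s.C.analyzer.x-split.analyzerClass=com.bigdata.search.TermCompletionAnalyzer
* c.b.s.C.analyzer.x-split.wordBoundary=[ ]+
* c.b.s.C.analyzer.x-split.subWordBoundary=[-]
* </pre>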
*/
String WORD_BOUNDARY = "wordBoundary";
/**
* The value of the subWordBoundary parameter to
* {@link TermCompletionAnalyzer#TermCompletionAnalyzer(Pattern, Pattern, Pattern, boolean)}
* (Note the {@link Pattern#UNICODE_CHARACTER_CLASS} flag is enabled).
* It is an error if a different analyzer class is specified.
*/
String SUB_WORD_BOUNDARY = "subWordBoundary";
/**
* The value of the softHyphens parameter to
* {@link TermCompletionAnalyzer#TermCompletionAnalyzer(Pattern, Pattern, Pattern, boolean)}
* (Note the {@link Pattern#UNICODE_CHARACTER_CLASS} flag is enabled).
* It is an error if a different analyzer class is specified.
*/
String SOFT_HYPHENS = "softHyphens";
/**
* The value of the alwaysRemoveSoftHyphens parameter to
* {@link TermCompletionAnalyzer#TermCompletionAnalyzer(Pattern, Pattern, Pattern, boolean)}.
* It is an error if a different analyzer class is specified.
*/
String ALWAYS_REMOVE_SOFT_HYPHENS = "alwaysRemoveSoftHyphens";
boolean DEFAULT_ALWAYS_REMOVE_SOFT_HYPHENS = false;
/**
* The default sub-word boundary is a pattern that never matches,
* i.e. there are no sub-word boundaries.
*/
Pattern DEFAULT_SUB_WORD_BOUNDARY = Pattern.compile("(?!)");
}
/**
* Initialization is a little tricky, because on the very first
* call to the constructor with a new namespace or a new journal
* the fullTextIndex is not ready for use.
* Therefore we delegate to an unconfigured object
* which on the first call to {@link NeedsConfiguringAnalyzerFactory#getAnalyzer(String, boolean)}
* does the configuration and replaces itself here with a
* {@link ConfiguredAnalyzerFactory}
*/
IAnalyzerFactory delegate;
/**
* Builds a new ConfigurableAnalyzerFactory.
* @param fullTextIndex
*/
public ConfigurableAnalyzerFactory(final FullTextIndex<?> fullTextIndex) {
delegate = new NeedsConfiguringAnalyzerFactory(this, fullTextIndex);
}
static int loggerIdCounter = 0;
@Override
public Analyzer getAnalyzer(final String languageCode, boolean filterStopwords) {
final Analyzer unlogged = delegate.getAnalyzer(languageCode, filterStopwords);
return unlogged;
}
}