All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.bigdata.search.ConfigurableAnalyzerFactory Maven / Gradle / Ivy

/**

Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
/*
 * Created on May 6, 2014 by Jeremy J. Carroll, Syapse Inc.
 */
package com.bigdata.search;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Set;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.Version;

/**
 * This class can be used with the bigdata properties file to specify
 * which {@link Analyzer}s are used for which languages.
 * Languages are specified by the language tag on RDF literals, which conform
 * with RFC 5646.
 * Within bigdata plain literals are assigned to the default locale's language. 
 * 
 * The bigdata properties are used to map language ranges, as specified by 
 * RFC 4647 to classes which extend {@link Analyzer}.
 * Supported classes included all the natural language specific classes from Lucene, and also:
 * 
    *
  • {@link PatternAnalyzer} *
  • {@link TermCompletionAnalyzer} *
  • {@link KeywordAnalyzer} *
  • {@link SimpleAnalyzer} *
  • {@link StopAnalyzer} *
  • {@link WhitespaceAnalyzer} *
  • {@link StandardAnalyzer} *
* More generally any subclass of {@link Analyzer} that has at least one constructor matching: *
    *
  • no arguments *
  • {@link Version} *
  • {@link Version}, {@link Set} *
* is usable. If the class has a static method named getDefaultStopSet() then this is assumed * to do what it says on the can; some of the Lucene analyzers store their default stop words elsewhere, * and such stopwords are usable by this class. If no stop word set can be found, and there is a constructor without * stopwords and a constructor with stopwords, then the former is assumed to use a default stop word set. *

* Configuration is by means of the bigdata properties file. * All relevant properties start com.bigdata.search.ConfigurableAnalyzerFactory which we * abbreviate to c.b.s.C in this documentation. * Properties from {@link Options} apply to the factory. *

* Other properties, from {@link AnalyzerOptions} start with * c.b.s.C.analyzer.language-range where language-range conforms * with the extended language range construct from RFC 4647, section 2.2. * There is an issue that bigdata does not allow '*' in property names, and we use the character '_' to * substitute for '*' in extended language ranges in property names. * These are used to specify an analyzer for the given language range. *

* If no analyzer is specified for the language range * then the {@link StandardAnalyzer} is used. *

* Given any specific language, then the analyzer matching the longest configured language range, * measured in number of subtags is returned by {@link #getAnalyzer(String, boolean)} * In the event of a tie, the alphabetically first language range is used. * The algorithm to find a match is "Extended Filtering" as defined in section 3.3.2 of RFC 4647. *

* Some useful analyzers are as follows: *

*
{@link KeywordAnalyzer}
*
This treats every lexical value as a single search token
*
{@link WhitespaceAnalyzer}
*
This uses whitespace to tokenize
*
{@link PatternAnalyzer}
*
This uses a regular expression to tokenize
*
{@link TermCompletionAnalyzer}
*
This uses up to three regular expressions to specify multiple tokens for each word, to address term completion use cases.
*
{@link EmptyAnalyzer}
*
This suppresses the functionality, by treating every expression as a stop word.
*
* there are in addition the language specific analyzers that are included * by using the option {@link Options#NATURAL_LANGUAGE_SUPPORT} * * * @author jeremycarroll * */ public class ConfigurableAnalyzerFactory implements IAnalyzerFactory { final private static transient Logger log = Logger.getLogger(ConfigurableAnalyzerFactory.class); /** * Options understood by the {@link ConfigurableAnalyzerFactory}. */ public interface Options { /** * By setting this option to true, then all the known Lucene Analyzers for natural * languages are used for a range of language tags. * These settings may then be overridden by the settings of the user. * Specifically the following properties are loaded, prior to loading the * user's specification (with c.b.s.C expanding to * com.bigdata.search.ConfigurableAnalyzerFactory)
c.b.s.C.analyzer._.like=eng
c.b.s.C.analyzer.por.analyzerClass=org.apache.lucene.analysis.br.BrazilianAnalyzer
c.b.s.C.analyzer.pt.like=por
c.b.s.C.analyzer.zho.analyzerClass=org.apache.lucene.analysis.cn.ChineseAnalyzer
c.b.s.C.analyzer.chi.like=zho
c.b.s.C.analyzer.zh.like=zho
c.b.s.C.analyzer.jpn.analyzerClass=org.apache.lucene.analysis.cjk.CJKAnalyzer
c.b.s.C.analyzer.ja.like=jpn
c.b.s.C.analyzer.kor.like=jpn
c.b.s.C.analyzer.ko.like=kor
c.b.s.C.analyzer.ces.analyzerClass=org.apache.lucene.analysis.cz.CzechAnalyzer
c.b.s.C.analyzer.cze.like=ces
c.b.s.C.analyzer.cs.like=ces
c.b.s.C.analyzer.dut.analyzerClass=org.apache.lucene.analysis.nl.DutchAnalyzer
c.b.s.C.analyzer.nld.like=dut
c.b.s.C.analyzer.nl.like=dut
c.b.s.C.analyzer.deu.analyzerClass=org.apache.lucene.analysis.de.GermanAnalyzer
c.b.s.C.analyzer.ger.like=deu
c.b.s.C.analyzer.de.like=deu
c.b.s.C.analyzer.gre.analyzerClass=org.apache.lucene.analysis.el.GreekAnalyzer
c.b.s.C.analyzer.ell.like=gre
c.b.s.C.analyzer.el.like=gre
c.b.s.C.analyzer.rus.analyzerClass=org.apache.lucene.analysis.ru.RussianAnalyzer
c.b.s.C.analyzer.ru.like=rus
c.b.s.C.analyzer.tha.analyzerClass=org.apache.lucene.analysis.th.ThaiAnalyzer
c.b.s.C.analyzer.th.like=tha
c.b.s.C.analyzer.eng.analyzerClass=org.apache.lucene.analysis.standard.StandardAnalyzer
c.b.s.C.analyzer.en.like=eng
* * */ String NATURAL_LANGUAGE_SUPPORT = ConfigurableAnalyzerFactory.class.getName() + ".naturalLanguageSupport"; /** * This is the prefix to all properties configuring the individual analyzers. */ String ANALYZER = ConfigurableAnalyzerFactory.class.getName() + ".analyzer."; String DEFAULT_NATURAL_LANGUAGE_SUPPORT = "false"; } /** * Options understood by analyzers created by {@link ConfigurableAnalyzerFactory}. * These options are appended to the RFC 4647 language range */ public interface AnalyzerOptions { /** * If specified this is the fully qualified name of a subclass of {@link Analyzer} * that has appropriate constructors. * This is set implicitly if some of the options below are selected (for example {@link #PATTERN}). * For each configured language range, if it is not set, either explicitly or implicitly, then * {@link #LIKE} must be specified. */ String ANALYZER_CLASS = "analyzerClass"; /** * The value of this property is a language range, for which * an analyzer is defined. * Treat this language range in the same way as the specified * language range. * * {@link #LIKE} loops are not permitted. * * If this is option is specified for a language range, * then no other option is permitted. */ String LIKE = "like"; /** * The value of this property is one of: *
*
{@link #STOPWORDS_VALUE_NONE}
*
This analyzer is used without stop words.
*
{@link #STOPWORDS_VALUE_DEFAULT}
*
Use the default setting for stopwords for this analyzer. It is an error * to set this value on some analyzers such as {@link SimpleAnalyzer} that do not supprt stop words. *
*
A fully qualified class name
*
... of a subclass of {@link Analyzer} which * has a static method getDefaultStopSet(), in which case, the returned set of stop words is used. *
*
* If the {@link #ANALYZER_CLASS} does not support stop words then any value other than {@link #STOPWORDS_VALUE_NONE} is an error. * If the {@link #ANALYZER_CLASS} does support stop words then the default value is {@link #STOPWORDS_VALUE_DEFAULT} */ String STOPWORDS = "stopwords"; String STOPWORDS_VALUE_DEFAULT = "default"; String STOPWORDS_VALUE_NONE = "none"; /** * The value of the pattern parameter to * {@link PatternAnalyzer#PatternAnalyzer(Version, Pattern, boolean, Set)} * (Note the {@link Pattern#UNICODE_CHARACTER_CLASS} flag is enabled). * It is an error if a different analyzer class is specified. */ String PATTERN = "pattern"; /** * The value of the wordBoundary parameter to * {@link TermCompletionAnalyzer#TermCompletionAnalyzer(Pattern, Pattern, Pattern, boolean)} * (Note the {@link Pattern#UNICODE_CHARACTER_CLASS} flag is enabled). * It is an error if a different analyzer class is specified. */ String WORD_BOUNDARY = "wordBoundary"; /** * The value of the subWordBoundary parameter to * {@link TermCompletionAnalyzer#TermCompletionAnalyzer(Pattern, Pattern, Pattern, boolean)} * (Note the {@link Pattern#UNICODE_CHARACTER_CLASS} flag is enabled). * It is an error if a different analyzer class is specified. */ String SUB_WORD_BOUNDARY = "subWordBoundary"; /** * The value of the softHyphens parameter to * {@link TermCompletionAnalyzer#TermCompletionAnalyzer(Pattern, Pattern, Pattern, boolean)} * (Note the {@link Pattern#UNICODE_CHARACTER_CLASS} flag is enabled). * It is an error if a different analyzer class is specified. */ String SOFT_HYPHENS = "softHyphens"; /** * The value of the alwaysRemoveSoftHypens parameter to * {@link TermCompletionAnalyzer#TermCompletionAnalyzer(Pattern, Pattern, Pattern, boolean)} * (Note the {@link Pattern#UNICODE_CHARACTER_CLASS} flag is enabled). * It is an error if a different analyzer class is specified. 
*/ String ALWAYS_REMOVE_SOFT_HYPHENS = "alwaysRemoveSoftHyphens"; boolean DEFAULT_ALWAYS_REMOVE_SOFT_HYPHENS = false; /** * The default sub-word boundary is a pattern that never matches, * i.e. there are no sub-word boundaries. */ Pattern DEFAULT_SUB_WORD_BOUNDARY = Pattern.compile("(?!)"); } /** * Initialization is a little tricky, because on the very first * call to the constructor with a new namespace or a new journal * the fullTextIndex is not ready for use. * Therefore we delegate to an unconfigured object * which on the first call to {@link NeedsConfiguringAnalyzerFactory#getAnalyzer(String, boolean)} * does the configuration and replaces itself here with a * {@link ConfiguredAnalyzerFactory} */ IAnalyzerFactory delegate; /** * Builds a new ConfigurableAnalyzerFactory. * @param fullTextIndex */ public ConfigurableAnalyzerFactory(final FullTextIndex fullTextIndex) { delegate = new NeedsConfiguringAnalyzerFactory(this, fullTextIndex); } static int loggerIdCounter = 0; @Override public Analyzer getAnalyzer(final String languageCode, boolean filterStopwords) { final Analyzer unlogged = delegate.getAnalyzer(languageCode, filterStopwords); return unlogged; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy