com.bigdata.search.NeedsConfiguringAnalyzerFactory Maven / Gradle / Ivy
/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
[email protected]
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on May 6, 2014 by Jeremy J. Carroll, Syapse Inc.
*/
package com.bigdata.search;
import java.io.IOException;
import java.io.StringReader;
import java.lang.reflect.Constructor;
import java.util.Arrays;
import java.util.Collections;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.UUID;
import java.util.WeakHashMap;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.pattern.PatternTokenizer;
import org.apache.lucene.analysis.ru.RussianAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
import com.bigdata.btree.keys.IKeyBuilder;
import com.bigdata.btree.keys.KeyBuilder;
import com.bigdata.search.ConfigurableAnalyzerFactory.AnalyzerOptions;
import com.bigdata.search.ConfigurableAnalyzerFactory.Options;
/**
*
* The bulk of the code in this class is invoked from {@link #init()} to set up the array of
* {@link ConfiguredAnalyzerFactory.AnalyzerPair}s. For example, all of the subclasses of {@link AnalyzerPair}s,
* are simply to call the appropriate constructor in the appropriate way: the difficulty is that many subclasses
* of {@link Analyzer} have constructors with different signatures, and our code needs to navigate each sort.
* @author jeremycarroll
*
*/
class NeedsConfiguringAnalyzerFactory implements IAnalyzerFactory {
final private static transient Logger log = Logger.getLogger(NeedsConfiguringAnalyzerFactory.class);
/**
* We create only one {@link ConfiguredAnalyzerFactory} per namespace
* and store it here. The UUID is stable and allows us to side-step lifecycle
* issues such as creation and destruction of namespaces, potentially with different properties.
* We use a WeakHashMap to ensure that after the destruction of a namespace we clean up.
* We have to synchronize this for thread safety.
*/
private static final Map allConfigs =
Collections.synchronizedMap(new WeakHashMap());
private static final String ALL_LUCENE_NATURAL_LANGUAGES =
"com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.*.like=eng\n" +
"com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.por.analyzerClass=org.apache.lucene.analysis.br.BrazilianAnalyzer\n" +
"com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.pt.like=por\n" +
"com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.zho.analyzerClass=org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer\n" +
"com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.chi.like=zho\n" +
"com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.zh.like=zho\n" +
"com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.jpn.analyzerClass=org.apache.lucene.analysis.cjk.CJKAnalyzer\n" +
"com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.ja.like=jpn\n" +
"com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.kor.like=jpn\n" +
"com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.ko.like=kor\n" +
"com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.ces.analyzerClass=org.apache.lucene.analysis.cz.CzechAnalyzer\n" +
"com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.cze.like=ces\n" +
"com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.cs.like=ces\n" +
"com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.dut.analyzerClass=org.apache.lucene.analysis.nl.DutchAnalyzer\n" +
"com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.nld.like=dut\n" +
"com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.nl.like=dut\n" +
"com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.deu.analyzerClass=org.apache.lucene.analysis.de.GermanAnalyzer\n" +
"com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.ger.like=deu\n" +
"com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.de.like=deu\n" +
"com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.gre.analyzerClass=org.apache.lucene.analysis.el.GreekAnalyzer\n" +
"com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.ell.like=gre\n" +
"com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.el.like=gre\n" +
"com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.rus.analyzerClass=org.apache.lucene.analysis.ru.RussianAnalyzer\n" +
"com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.ru.like=rus\n" +
"com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.tha.analyzerClass=org.apache.lucene.analysis.th.ThaiAnalyzer\n" +
"com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.th.like=tha\n" +
"com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.eng.analyzerClass=org.apache.lucene.analysis.standard.StandardAnalyzer\n" +
"com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.en.like=eng\n";
private static final String LUCENE_STANDARD_ANALYZER =
"com.bigdata.search.ConfigurableAnalyzerFactory.analyzer.*.analyzerClass=org.apache.lucene.analysis.standard.StandardAnalyzer\n";
static int loggerIdCounter = 0;
/**
* This class and all its subclasses provide a variety of patterns
* for mapping from the various constructor patterns of subclasses
* of {@link Analyzer} to {@link ConfiguredAnalyzerFactory#AnalyzerPair}.
* @author jeremycarroll
*
*/
private static class AnalyzerPair extends ConfiguredAnalyzerFactory.AnalyzerPair {
AnalyzerPair(String range, Analyzer withStopWords, Analyzer withOutStopWords) {
super(range, withStopWords, withOutStopWords);
}
/**
* This clone constructor implements {@link AnalyzerOptions#LIKE}.
* @param range
* @param copyMe
*/
AnalyzerPair(String range, AnalyzerPair copyMe) {
super(range, copyMe);
}
/**
* If we have a constructor, with arguments including a populated
* stop word set, then we can use it to make both the withStopWords
* analyzer, and the withoutStopWords analyzer.
* @param range
* @param cons A Constructor including a {@link java.util.Set} argument
* for the stop words.
* @param params The arguments to pass to the constructor including a populated stopword set.
* @throws Exception
*/
AnalyzerPair(String range, Constructor extends Analyzer> cons, Object ... params) throws Exception {
this(range, cons.newInstance(params), cons.newInstance(useEmptyStopWordSet(params)));
}
AnalyzerPair(String range, Analyzer stopWordsNotSupported) {
this(range, stopWordsNotSupported, stopWordsNotSupported);
}
private static Object[] useEmptyStopWordSet(Object[] params) {
Object rslt[] = new Object[params.length];
for (int i=0; i cls) throws Exception {
super(lro.languageRange, getConstructor(cls, Version.class, Set.class), Version.LUCENE_CURRENT, lro.getStopWords());
}
}
/**
* Used for Analyzer classes which do not support stopwords and have a constructor with signature (Version).
* @author jeremycarroll
*
*/
private static class VersionAnalyzerPair extends AnalyzerPair {
public VersionAnalyzerPair(String range, Class extends Analyzer> cls) throws Exception {
super(range, getConstructor(cls, Version.class).newInstance(Version.LUCENE_CURRENT));
}
}
/**
* Special case code for {@link PatternTokenizer}
* @author jeremycarroll
*
*/
private static class PatternAnalyzer extends AnalyzerPair {
public PatternAnalyzer(ConfigOptionsToAnalyzer lro, Pattern pattern, CharArraySet stopWords) throws Exception {
/*
super(lro.languageRange, getConstructor(PatternAnalyzerImpl.class,Pattern.class, CharArraySet.class),
pattern, stopWords);
*/
super(lro.languageRange, new PatternAnalyzerImpl(pattern, stopWords), new PatternAnalyzerImpl(pattern, CharArraySet.EMPTY_SET));
}
public PatternAnalyzer(ConfigOptionsToAnalyzer lro, Pattern pattern) throws Exception {
super(lro.languageRange, new PatternAnalyzerImpl(pattern, CharArraySet.EMPTY_SET));
}
}
/**
* This class is initialized with the config options, using the {@link #setProperty(String, String)}
* method, for a particular language range and works out which pair of {@link Analyzer}s
* to use for that language range.
*
* Instances of this class are only alive during the execution of
* {@link NeedsConfiguringAnalyzerFactory#ConfigurableAnalyzerFactory(FullTextIndex)},
* the life-cycle is:
*
* - The relveant config properties are applied, and are used to populate the fields.
*
- The fields are validated
*
- An {@link AnalyzerPair} is constructed
*
*
* @author jeremycarroll
*
*/
private static class ConfigOptionsToAnalyzer {
String like;
String className;
String stopwords;
Pattern pattern;
final String languageRange;
AnalyzerPair result;
Pattern wordBoundary;
Pattern subWordBoundary;
Pattern softHyphens;
Boolean alwaysRemoveSoftHyphens;
public ConfigOptionsToAnalyzer(String languageRange) {
this.languageRange = languageRange;
}
/**
* This is called only when we have already identified that
* the class does support stopwords.
* @return
*/
public Set> getStopWords() {
if (doNotUseStopWords())
return CharArraySet.EMPTY_SET;
if (useDefaultStopWords()) {
return getStopWordsForClass(className);
}
return getStopWordsForClass(stopwords);
}
boolean doNotUseStopWords() {
return AnalyzerOptions.STOPWORDS_VALUE_NONE.equals(stopwords) || (stopwords == null && pattern != null);
}
protected Set> getStopWordsForClass(String clazzName) {
Class extends Analyzer> analyzerClass = getAnalyzerClass(clazzName);
try {
return (Set>) analyzerClass.getMethod("getDefaultStopSet").invoke(null);
} catch (Exception e) {
if (StandardAnalyzer.class.equals(analyzerClass)) {
return StandardAnalyzer.STOP_WORDS_SET;
}
if (StopAnalyzer.class.equals(analyzerClass)) {
return StopAnalyzer.ENGLISH_STOP_WORDS_SET;
}
throw new RuntimeException("Failed to find stop words from " + clazzName + " for language range "+languageRange);
}
}
protected boolean useDefaultStopWords() {
return ( stopwords == null && pattern == null ) || AnalyzerOptions.STOPWORDS_VALUE_DEFAULT.equals(stopwords);
}
/**
* The first step in the life-cycle, used to initialize the fields.
* @return true if the property was recognized.
*/
public boolean setProperty(String shortProperty, String value) {
if (shortProperty.equals(AnalyzerOptions.LIKE) ) {
like = value;
} else if (shortProperty.equals(AnalyzerOptions.ANALYZER_CLASS) ) {
className = value;
} else if (shortProperty.equals(AnalyzerOptions.STOPWORDS) ) {
stopwords = value;
} else if (shortProperty.equals(AnalyzerOptions.PATTERN) ) {
pattern = Pattern.compile(value,Pattern.UNICODE_CHARACTER_CLASS);
} else if (shortProperty.equals(AnalyzerOptions.WORD_BOUNDARY) ) {
wordBoundary = Pattern.compile(value,Pattern.UNICODE_CHARACTER_CLASS);
} else if (shortProperty.equals(AnalyzerOptions.SUB_WORD_BOUNDARY) ) {
subWordBoundary = Pattern.compile(value,Pattern.UNICODE_CHARACTER_CLASS);
} else if (shortProperty.equals(AnalyzerOptions.SOFT_HYPHENS) ) {
softHyphens = Pattern.compile(value,Pattern.UNICODE_CHARACTER_CLASS);
} else if (shortProperty.equals(AnalyzerOptions.ALWAYS_REMOVE_SOFT_HYPHENS) ) {
alwaysRemoveSoftHyphens = Boolean.valueOf(value);
} else {
return false;
}
return true;
}
/**
* The second phase of the life-cycle, used for sanity checking.
*/
public void validate() {
if (pattern != null ) {
if ( className != null && className != PatternTokenizer.class.getName()) {
throw new RuntimeException("Bad Option: Language range "+languageRange + " with pattern propety for class "+ className);
}
className = PatternTokenizer.class.getName();
}
if (this.wordBoundary != null ) {
if ( className != null && className != TermCompletionAnalyzer.class.getName()) {
throw new RuntimeException("Bad Option: Language range "+languageRange + " with pattern propety for class "+ className);
}
className = TermCompletionAnalyzer.class.getName();
if ( subWordBoundary == null ) {
subWordBoundary = AnalyzerOptions.DEFAULT_SUB_WORD_BOUNDARY;
}
if ( alwaysRemoveSoftHyphens != null && softHyphens == null ) {
throw new RuntimeException("Bad option: Language range "+languageRange + ": must specify softHypens when setting alwaysRemoveSoftHyphens");
}
if (softHyphens != null && alwaysRemoveSoftHyphens == null) {
alwaysRemoveSoftHyphens = AnalyzerOptions.DEFAULT_ALWAYS_REMOVE_SOFT_HYPHENS;
}
} else if ( subWordBoundary != null || softHyphens != null || alwaysRemoveSoftHyphens != null ||
TermCompletionAnalyzer.class.getName().equals(className) ) {
throw new RuntimeException("Bad option: Language range "+languageRange + ": must specify wordBoundary for TermCompletionAnalyzer");
}
if (PatternTokenizer.class.getName().equals(className) && pattern == null ) {
throw new RuntimeException("Bad Option: Language range "+languageRange + " must specify pattern for PatternTokenizer.");
}
if ( (like != null) == (className != null) ) {
throw new RuntimeException("Bad Option: Language range "+languageRange + " must specify exactly one of implementation class or like.");
}
if (stopwords != null && like != null) {
throw new RuntimeException("Bad Option: Language range "+languageRange + " must not specify stopwords with like.");
}
}
/**
* The third and final phase of the life-cyle used for identifying
* the AnalyzerPair.
*/
private AnalyzerPair construct() throws Exception {
if (className == null) {
return null;
}
if (pattern != null) {
return new PatternAnalyzer(this, pattern);
}
if (softHyphens != null) {
return new AnalyzerPair(
languageRange,
new TermCompletionAnalyzer(
wordBoundary,
subWordBoundary,
softHyphens,
alwaysRemoveSoftHyphens));
}
if (wordBoundary != null) {
return new AnalyzerPair(
languageRange,
new TermCompletionAnalyzer(
wordBoundary,
subWordBoundary));
}
final Class extends Analyzer> cls = getAnalyzerClass();
if (hasConstructor(cls, CharArraySet.class)) {
return new AnalyzerPair(languageRange,
cls.getConstructor(CharArraySet.class).newInstance(new CharArraySet(getStopWords(), false)),
cls.getConstructor(CharArraySet.class).newInstance(new CharArraySet(Collections.emptySet(), false)));
}
if (hasConstructor(cls, Version.class, Set.class)) {
// RussianAnalyzer is missing any way to access stop words.
if (RussianAnalyzer.class.equals(cls)) {
if (useDefaultStopWords()) {
return new AnalyzerPair(languageRange, new RussianAnalyzer(), new RussianAnalyzer(CharArraySet.EMPTY_SET));
}
if (doNotUseStopWords()) {
return new AnalyzerPair(languageRange, new RussianAnalyzer(CharArraySet.EMPTY_SET));
}
}
return new VersionSetAnalyzerPair(this, cls);
}
if (stopwords != null && !stopwords.equals(AnalyzerOptions.STOPWORDS_VALUE_NONE)) {
throw new RuntimeException("Bad option: language range: " + languageRange + " stopwords are not supported by " + className);
}
if (hasConstructor(cls, Version.class)) {
return new VersionAnalyzerPair(languageRange, cls);
}
if (hasConstructor(cls)) {
return new AnalyzerPair(languageRange, cls.newInstance());
}
throw new RuntimeException("Bad option: cannot find constructor for class " + className + " for language range " + languageRange);
}
/**
* Also part of the third phase of the life-cycle, following the {@link AnalyzerOptions#LIKE}
* properties.
* @param depth
* @param max
* @param analyzers
* @return
*/
AnalyzerPair followLikesToAnalyzerPair(int depth, int max,
Map analyzers) {
if (result == null) {
if (depth == max) {
throw new RuntimeException("Bad configuration: - 'like' loop for language range " + languageRange);
}
ConfigOptionsToAnalyzer next = analyzers.get(like);
if (next == null) {
throw new RuntimeException("Bad option: - 'like' not found for language range " + languageRange+ " (not found: '"+ like +"')");
}
result = new AnalyzerPair(languageRange, next.followLikesToAnalyzerPair(depth+1, max, analyzers));
}
return result;
}
protected Class extends Analyzer> getAnalyzerClass() {
return getAnalyzerClass(className);
}
@SuppressWarnings("unchecked")
protected Class extends Analyzer> getAnalyzerClass(String className2) {
final Class extends Analyzer> cls;
try {
cls = (Class extends Analyzer>) Class.forName(className2);
} catch (ClassNotFoundException e) {
throw new RuntimeException("Bad option: cannot find class " + className2 + " for language range " + languageRange, e);
}
return cls;
}
void setAnalyzerPair(AnalyzerPair ap) {
result = ap;
}
}
private final FullTextIndex> fullTextIndex;
private final ConfigurableAnalyzerFactory parent;
NeedsConfiguringAnalyzerFactory(ConfigurableAnalyzerFactory parent, final FullTextIndex> fullTextIndex) {
if (fullTextIndex == null)
throw new IllegalArgumentException();
this.fullTextIndex = fullTextIndex;
this.parent = parent;
}
private ConfiguredAnalyzerFactory init() {
UUID uuid = fullTextIndex.getIndex().getIndexMetadata().getIndexUUID();
ConfiguredAnalyzerFactory configuration = allConfigs.get(uuid);
if (configuration == null) {
// First time for this namespace - we need to analyze the properties
// and construct a ConfiguredAnalyzerFactory
final Properties properties = initProperties();
final Map analyzers = new HashMap();
properties2analyzers(properties, analyzers);
if (!analyzers.containsKey("*")) {
throw new RuntimeException("Bad config: must specify behavior on language range '*'");
}
for (ConfigOptionsToAnalyzer a: analyzers.values()) {
a.validate();
}
try {
for (ConfigOptionsToAnalyzer a: analyzers.values()) {
a.setAnalyzerPair(a.construct());
}
} catch (Exception e) {
throw new RuntimeException("Cannot construct ConfigurableAnalyzerFactory", e);
}
int sz = analyzers.size();
for (ConfigOptionsToAnalyzer a: analyzers.values()) {
a.followLikesToAnalyzerPair(0, sz, analyzers);
}
AnalyzerPair[] allPairs = new AnalyzerPair[sz];
int i = 0;
for (ConfigOptionsToAnalyzer a: analyzers.values()) {
allPairs[i++] = a.result;
}
Arrays.sort(allPairs);
if (log.isInfoEnabled()) {
StringBuilder sb = new StringBuilder();
sb.append("Installed text Analyzer's: ");
for (AnalyzerPair ap: allPairs) {
sb.append(ap.toString());
sb.append(", ");
}
log.info(sb.toString());
}
configuration = new ConfiguredAnalyzerFactory(allPairs, getDefaultLanguage());
allConfigs.put(uuid, configuration);
}
return configuration;
}
private String getDefaultLanguage() {
final IKeyBuilder keyBuilder = fullTextIndex.getKeyBuilder();
if (keyBuilder.isUnicodeSupported()) {
// The configured locale for the database.
final Locale locale = ((KeyBuilder) keyBuilder)
.getSortKeyGenerator().getLocale();
// The language for that locale.
return locale.getLanguage();
} else {
// Rule, Britannia!
return "en";
}
}
private static boolean hasConstructor(Class extends Analyzer> cls, Class> ... parameterTypes) {
return getConstructor(cls, parameterTypes) != null;
}
protected static Constructor extends Analyzer> getConstructor(Class extends Analyzer> cls,
Class>... parameterTypes) {
try {
return cls.getConstructor(parameterTypes);
} catch (NoSuchMethodException | SecurityException e) {
if(log.isDebugEnabled()) {
log.debug(e);
}
return null;
}
}
private void properties2analyzers(Properties props, Map analyzers) {
Enumeration> en = props.propertyNames();
while (en.hasMoreElements()) {
String prop = (String)en.nextElement();
if (prop.equals(Options.NATURAL_LANGUAGE_SUPPORT)) continue;
if (prop.startsWith(Options.ANALYZER)) {
String languageRangeAndProperty[] = prop.substring(Options.ANALYZER.length()).replaceAll("_","*").split("[.]");
if (languageRangeAndProperty.length == 2) {
String languageRange = languageRangeAndProperty[0].toLowerCase(Locale.US); // Turkish "I" could create a problem
String shortProperty = languageRangeAndProperty[1];
String value = props.getProperty(prop);
log.info("Setting language range: " + languageRange + "/" + shortProperty + " = " + value);
ConfigOptionsToAnalyzer cons = analyzers.get(languageRange);
if (cons == null) {
cons = new ConfigOptionsToAnalyzer(languageRange);
analyzers.put(languageRange, cons);
}
if (cons.setProperty(shortProperty, value)) {
continue;
}
}
}
log.warn("Failed to process configuration property: " + prop);
}
}
protected Properties initProperties() {
final Properties parentProperties = fullTextIndex.getProperties();
Properties myProps;
if (Boolean.valueOf(parentProperties.getProperty(
Options.NATURAL_LANGUAGE_SUPPORT,
Options.DEFAULT_NATURAL_LANGUAGE_SUPPORT))) {
myProps = loadPropertyString(ALL_LUCENE_NATURAL_LANGUAGES);
} else if (hasPropertiesForStarLanguageRange(parentProperties)){
myProps = new Properties();
} else {
myProps = loadPropertyString(LUCENE_STANDARD_ANALYZER);
}
copyRelevantProperties(fullTextIndex.getProperties(), myProps);
return myProps;
}
Properties loadPropertyString(String props) {
Properties rslt = new Properties();
try {
rslt.load(new StringReader(props));
} catch (IOException e) {
throw new RuntimeException("Impossible - well clearly not!", e);
}
return rslt;
}
private void copyRelevantProperties(Properties from, Properties to) {
Enumeration> en = from.propertyNames();
while (en.hasMoreElements()) {
String prop = (String)en.nextElement();
if (prop.startsWith(ConfigurableAnalyzerFactory.class.getName())) {
to.setProperty(prop, from.getProperty(prop));
}
}
}
private boolean hasPropertiesForStarLanguageRange(Properties from) {
Enumeration> en = from.propertyNames();
while (en.hasMoreElements()) {
String prop = (String)en.nextElement();
if (prop.startsWith(Options.ANALYZER+"_.")
|| prop.startsWith(Options.ANALYZER+"*.")) {
return true;
}
}
return false;
}
@Override
public Analyzer getAnalyzer(String languageCode, boolean filterStopwords) {
// delayed initialization of parent
IAnalyzerFactory realDelegate = init();
parent.delegate = realDelegate;
return realDelegate.getAnalyzer(languageCode, filterStopwords);
}
}