// com.bigdata.search.DefaultAnalyzerFactory Maven / Gradle / Ivy
/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Dec 21, 2010
*/
package com.bigdata.search;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.br.BrazilianAnalyzer;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.cz.CzechAnalyzer;
import org.apache.lucene.analysis.de.GermanAnalyzer;
import org.apache.lucene.analysis.el.GreekAnalyzer;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.analysis.nl.DutchAnalyzer;
import org.apache.lucene.analysis.ru.RussianAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.th.ThaiAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import com.bigdata.btree.keys.IKeyBuilder;
import com.bigdata.btree.keys.KeyBuilder;
/**
 * This is the default implementation but should be regarded as legacy since
 * it fails to use the correct {@link Analyzer} for almost all languages (other
 * than English). It uses the correct natural language analyzer only for
 * literals tagged with certain three letter ISO 639 codes:
 * "por", "deu", "ger", "zho", "chi", "jpn", "kor", "ces", "cze", "dut", "nld",
 * "gre", "ell", "fra", "fre", "rus" and "tha". All other tags are treated as
 * English.
 * <p>
 * These codes do not work if they are used with subtags, e.g. "ger-AT" is
 * treated as English. No two letter code, other than "en", works correctly:
 * note that the W3C and IETF recommend the use of the two letter forms instead
 * of the three letter forms.
 * <p>
 * Note: this lookup behavior is intentionally preserved. Changing which
 * analyzer is selected for a language tag changes how text is tokenized, so
 * it must stay the same between indexing and searching.
 *
 * @author Bryan Thompson
 * @deprecated Using {@link ConfigurableAnalyzerFactory} with
 *             the {@link ConfigurableAnalyzerFactory.Options#NATURAL_LANGUAGE_SUPPORT}
 *             uses the appropriate natural language analyzers for the two
 *             letter codes and for tags which include sub-tags.
 * @version $Id$
 */
@Deprecated
public class DefaultAnalyzerFactory implements IAnalyzerFactory {

    /** The index whose key builder supplies the default locale. */
    private final FullTextIndex fullTextIndex;

    /**
     * Lazily created registry mapping language codes onto analyzer
     * constructors. Access is guarded by {@link #getAnalyzers()}.
     *
     * Note: There MUST be an entry under the empty string (""). This entry will
     * be requested when there is no entry for the specified language code.
     */
    private Map<String, AnalyzerConstructor> analyzers;

    /**
     * @param fullTextIndex
     *            The full text index (required).
     *
     * @throws IllegalArgumentException
     *             if the argument is <code>null</code>.
     */
    public DefaultAnalyzerFactory(final FullTextIndex fullTextIndex) {
        if (fullTextIndex == null)
            throw new IllegalArgumentException();
        this.fullTextIndex = fullTextIndex;
    }

    /**
     * Return a new {@link Analyzer} for the given language code.
     *
     * @param languageCode
     *            The language code. When <code>null</code> and the key builder
     *            reports Unicode support, the language of the key builder's
     *            locale is used instead.
     * @param filterStopwords
     *            When <code>true</code> the analyzer is created with its
     *            default constructor; otherwise it is created with an empty
     *            stopword set (the Thai analyzer ignores this flag).
     *
     * @return A new analyzer instance; the English analyzer registered under
     *         the empty string is used when no entry matches.
     *
     * @throws IllegalStateException
     *             if the registry has no entry for the empty string.
     */
    public Analyzer getAnalyzer(final String languageCode, final boolean filterStopwords) {
        final IKeyBuilder keyBuilder = fullTextIndex.getKeyBuilder();
        final Map<String, AnalyzerConstructor> registry = getAnalyzers();
        AnalyzerConstructor ctor = null;
        if (languageCode == null) {
            if (keyBuilder.isUnicodeSupported()) {
                // The configured locale for the database.
                final Locale locale = ((KeyBuilder) keyBuilder)
                        .getSortKeyGenerator().getLocale();
                // The analyzer for that locale.
                final Analyzer forLocale = getAnalyzer(locale.getLanguage(), filterStopwords);
                if (forLocale != null)
                    return forLocale;
            }
            // fall through to the default analyzer.
        } else if (languageCode.length() >= 3) {
            /*
             * Legacy behavior (see the class documentation): the tag is looked
             * up verbatim, and only when it has at least three characters.
             * Consequently two letter codes and tags with sub-tags never match
             * and are handled by the default analyzer below.
             */
            ctor = registry.get(languageCode);
        }
        if (ctor == null) {
            // request the default analyzer.
            ctor = registry.get("");
            if (ctor == null) {
                throw new IllegalStateException("No entry for empty string?");
            }
        }
        return ctor.newInstance(filterStopwords);
    }

    /**
     * Factory for one kind of {@link Analyzer}.
     */
    private abstract static class AnalyzerConstructor {

        /**
         * @param filterStopwords
         *            <code>true</code> to use the analyzer's default
         *            constructor, <code>false</code> to request an empty
         *            stopword set.
         * @return A new analyzer.
         */
        public abstract Analyzer newInstance(final boolean filterStopwords);
    }

    /**
     * Returns the registry of analyzer constructors, creating it on first use.
     *
     * Note: Each {@link Analyzer} is registered under both the 3 letter and the
     * 2 letter language codes. See ISO 639-2.
     *
     * @todo get some informed advice on which {@link Analyzer}s map onto which
     *       language codes.
     *
     * @todo thread safety? Analyzers produce token processors so maybe there is
     *       no problem here once things are initialized. If so, maybe this
     *       could be static.
     *
     * @todo configuration. Could be configured by a file containing a class
     *       name and a list of codes that are handled by that class.
     *
     * @todo strip language code down to 2/3 characters during lookup.
     *
     * @todo There are a lot of pidgins based on French, English, and other
     *       languages that are not being assigned here.
     */
    private synchronized Map<String, AnalyzerConstructor> getAnalyzers() {
        if (analyzers == null) {
            // Publish only a completely built registry.
            analyzers = buildAnalyzers();
        }
        return analyzers;
    }

    /**
     * Registers <i>ctor</i> under each of the given language codes.
     */
    private static void register(final Map<String, AnalyzerConstructor> registry,
            final AnalyzerConstructor ctor, final String... codes) {
        for (String code : codes) {
            registry.put(code, ctor);
        }
    }

    /**
     * Creates and populates a new registry with the analyzers that we know
     * about, including the mandatory default entry under the empty string.
     */
    private static Map<String, AnalyzerConstructor> buildAnalyzers() {

        final Map<String, AnalyzerConstructor> registry = new HashMap<String, AnalyzerConstructor>();

        final CharArraySet emptyStopwords = CharArraySet.EMPTY_SET;

        // Portuguese
        register(registry, new AnalyzerConstructor() {
            @Override
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords
                        ? new BrazilianAnalyzer()
                        : new BrazilianAnalyzer(emptyStopwords);
            }
        }, "por", "pt");

        /*
         * Chinese.
         *
         * Note: you can not tokenize with the Chinese analyzer and then do
         * search using the CJK analyzer and vice versa.
         */
        register(registry, new AnalyzerConstructor() {
            @Override
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords
                        ? new SmartChineseAnalyzer()
                        : new SmartChineseAnalyzer(emptyStopwords);
            }
        }, "zho", "chi", "zh");

        /*
         * Claims to handle Chinese, Japanese, Korean. Does double character
         * extraction with overlap. Chinese is deliberately NOT registered here
         * since it is handled by the analyzer above.
         */
        register(registry, new AnalyzerConstructor() {
            @Override
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords
                        ? new CJKAnalyzer()
                        : new CJKAnalyzer(emptyStopwords);
            }
        }, "jpn", "ja", "kor", "ko");

        // Czech
        register(registry, new AnalyzerConstructor() {
            @Override
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords
                        ? new CzechAnalyzer()
                        : new CzechAnalyzer(emptyStopwords);
            }
        }, "ces", "cze", "cs");

        // Dutch
        register(registry, new AnalyzerConstructor() {
            @Override
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords
                        ? new DutchAnalyzer()
                        : new DutchAnalyzer(emptyStopwords);
            }
        }, "dut", "nld", "nl");

        // French
        register(registry, new AnalyzerConstructor() {
            @Override
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords
                        ? new FrenchAnalyzer()
                        : new FrenchAnalyzer(emptyStopwords);
            }
        }, "fra", "fre", "fr");

        /*
         * German.
         *
         * Note: There are a lot of language codes for German variants that
         * might be useful here.
         */
        register(registry, new AnalyzerConstructor() {
            @Override
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords
                        ? new GermanAnalyzer()
                        : new GermanAnalyzer(emptyStopwords);
            }
        }, "deu", "ger", "de");

        // Greek. Note: ancient greek has a different code (grc).
        register(registry, new AnalyzerConstructor() {
            @Override
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords
                        ? new GreekAnalyzer()
                        : new GreekAnalyzer(emptyStopwords);
            }
        }, "gre", "ell", "el");

        // Russian. @todo what about other Cyrillic scripts?
        register(registry, new AnalyzerConstructor() {
            @Override
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords
                        ? new RussianAnalyzer()
                        : new RussianAnalyzer(emptyStopwords);
            }
        }, "rus", "ru");

        /*
         * Thai.
         *
         * NOTE(review): unlike the other entries, the filterStopwords flag is
         * ignored here. Left as-is to keep tokenization stable; confirm
         * whether that is intended.
         */
        register(registry, new AnalyzerConstructor() {
            @Override
            public Analyzer newInstance(final boolean filterStopwords) {
                return new ThaiAnalyzer();
            }
        }, "tha", "th");

        /*
         * English.
         *
         * Note: There MUST be an entry under the empty string (""). This
         * entry will be requested when there is no entry for the specified
         * language code.
         */
        register(registry, new AnalyzerConstructor() {
            @Override
            public Analyzer newInstance(final boolean filterStopwords) {
                return filterStopwords
                        ? new StandardAnalyzer()
                        : new StandardAnalyzer(emptyStopwords);
            }
        }, "eng", "en", "");

        return registry;
    }
}