// com.bigdata.search.ConfiguredAnalyzerFactory Maven / Gradle / Ivy
/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on May 6, 2014 by Jeremy J. Carroll, Syapse Inc.
*/
package com.bigdata.search;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.lucene.analysis.Analyzer;
import com.bigdata.search.ConfigurableAnalyzerFactory.AnalyzerOptions;
/**
* This comment describes the implementation of {@link ConfiguredAnalyzerFactory}.
* The only method in the interface is {@link ConfiguredAnalyzerFactory#getAnalyzer(String, boolean)}.
* A map is used from language tag to {@link AnalyzerPair}, where the pair contains
* an {@link Analyzer} both with and without stopwords configured (sometimes these two analyzers are identical,
* if, for example, stop words are not supported or not required).
* <p>
* If there is no entry for the language tag in the map {@link ConfiguredAnalyzerFactory#langTag2AnalyzerPair},
* then one is selected, by walking down the array {@link ConfiguredAnalyzerFactory#config} of AnalyzerPairs
* until a matching one is found, and the result is cached in the map.
* @author jeremycarroll
*
*/
class ConfiguredAnalyzerFactory implements IAnalyzerFactory {

    /**
     * Binds a language range to a pair of {@link Analyzer}s: one returned when
     * stop-word filtering is requested and one returned when it is not. The
     * two may be the same instance.
     * <p>
     * Ordering is delegated entirely to {@link LanguageRange#compareTo}; per
     * the original author's note it is intended to sort the best (i.e.
     * longest) match first.
     *
     * @author jeremycarroll
     */
    protected static class AnalyzerPair implements Comparable<AnalyzerPair> {

        /** The language range this pair is registered for. */
        final LanguageRange range;

        /** Analyzer handed out when {@code filterStopwords} is {@code true}. */
        private final Analyzer withStopWords;

        /** Analyzer handed out when {@code filterStopwords} is {@code false}. */
        private final Analyzer withoutStopWords;

        /**
         * Selects one of the two analyzers of this pair.
         *
         * @param filterStopwords
         *            {@code true} to get the analyzer stored as
         *            {@code withStopWords} (presumably the one configured with
         *            a stop-word list), {@code false} for the other one
         * @return the selected analyzer
         */
        public Analyzer getAnalyzer(boolean filterStopwords) {
            return filterStopwords ? withStopWords : withoutStopWords;
        }

        /**
         * Tests the given language tag against this pair's range by delegating
         * to {@link LanguageRange#extendedFilterMatch}.
         *
         * @param language
         *            the language tag, already split into its subtags
         * @return whatever the range's extended filter match reports
         */
        public boolean extendedFilterMatch(String[] language) {
            return range.extendedFilterMatch(language);
        }

        /**
         * @param range
         *            textual language range, wrapped in a {@link LanguageRange}
         * @param withStopWords
         *            analyzer returned when stop-word filtering is requested
         * @param withOutStopWords
         *            analyzer returned when stop-word filtering is not requested
         */
        AnalyzerPair(String range, Analyzer withStopWords, Analyzer withOutStopWords) {
            this.range = new LanguageRange(range);
            this.withStopWords = withStopWords;
            this.withoutStopWords = withOutStopWords;
        }

        /**
         * This clone constructor implements {@link AnalyzerOptions#LIKE}: the
         * new pair shares both analyzers of {@code copyMe} but is registered
         * under a different range.
         *
         * @param range
         *            textual language range for the new pair
         * @param copyMe
         *            pair whose analyzers are reused
         */
        AnalyzerPair(String range, AnalyzerPair copyMe) {
            this(range, copyMe.withStopWords, copyMe.withoutStopWords);
        }

        /**
         * @return the full range text followed by the simple class name of the
         *         {@code withStopWords} analyzer
         */
        @Override
        public String toString() {
            return range.full + "=(" + withStopWords.getClass().getSimpleName() + ")";
        }

        /** Orders pairs by their language range. */
        @Override
        public int compareTo(AnalyzerPair o) {
            return range.compareTo(o.range);
        }
    }

    /**
     * The configured pairs, scanned front to back by
     * {@link #lookupPair(String)}; the first match wins, so the caller is
     * expected to supply them in priority order.
     */
    private final AnalyzerPair[] config;

    /**
     * This caches the result of looking up a lang tag in the
     * config of language ranges.
     */
    private final Map<String, AnalyzerPair> langTag2AnalyzerPair =
            new ConcurrentHashMap<String, AnalyzerPair>();

    /**
     * While it would be very unusual to have more than 500 different language tags in a store
     * it is possible - we use a max size to prevent a memory explosion, and a naive caching
     * strategy so the code will still work on the {@link #MAX_LANG_CACHE_SIZE}+1 th entry.
     */
    private static final int MAX_LANG_CACHE_SIZE = 500;

    /** Language code substituted when a caller passes {@code null} or {@code ""}. */
    private final String defaultLanguage;

    /**
     * Builds a new ConfiguredAnalyzerFactory.
     *
     * @param config
     *            the analyzer pairs to search, in priority order; the array is
     *            stored as given (not copied)
     * @param defaultLanguage
     *            language code used when {@link #getAnalyzer(String, boolean)}
     *            is called with a {@code null} or empty language code
     */
    public ConfiguredAnalyzerFactory(AnalyzerPair[] config, String defaultLanguage) {
        this.config = config;
        this.defaultLanguage = defaultLanguage;
    }

    private String getDefaultLanguage() {
        return defaultLanguage;
    }

    /**
     * Returns the analyzer for a language code, consulting the cache first and
     * falling back to a scan of the configured pairs.
     *
     * @param languageCode
     *            the language tag; {@code null} or {@code ""} selects the
     *            default language
     * @param filterStopwords
     *            passed through to {@link AnalyzerPair#getAnalyzer(boolean)}
     * @return the analyzer of the matching pair
     * @throws RuntimeException
     *             if no configured pair matches the language code
     */
    @Override
    public Analyzer getAnalyzer(String languageCode, boolean filterStopwords) {
        if (languageCode == null || languageCode.equals("")) {
            languageCode = getDefaultLanguage();
        }
        AnalyzerPair pair = langTag2AnalyzerPair.get(languageCode);
        if (pair == null) {
            pair = lookupPair(languageCode);
            // Naive cache - clear everything if cache is full. The test is >=
            // rather than ==: concurrent callers can each pass this check and
            // then insert, pushing the size past the limit, after which an
            // equality test would never fire again and the cache would grow
            // without bound.
            if (langTag2AnalyzerPair.size() >= MAX_LANG_CACHE_SIZE) {
                langTag2AnalyzerPair.clear();
            }
            // there is a race condition below, but we don't care who wins.
            langTag2AnalyzerPair.put(languageCode, pair);
        }
        return pair.getAnalyzer(filterStopwords);
    }

    /**
     * Splits the language code on {@code '-'} and returns the first configured
     * pair whose range matches it.
     *
     * @param languageCode
     *            the language tag to resolve
     * @return the first matching pair in {@link #config}
     * @throws RuntimeException
     *             if nothing matches; the message indicates the configuration
     *             is expected to contain a catch-all {@code '*'} range
     */
    private AnalyzerPair lookupPair(String languageCode) {
        String[] language = languageCode.split("-");
        for (AnalyzerPair p : config) {
            if (p.extendedFilterMatch(language)) {
                return p;
            }
        }
        throw new RuntimeException("Impossible - supposedly - did not match '*'");
    }
}