com.bigdata.search.ConfiguredAnalyzerFactory Maven / Gradle / Ivy

Go to download
/**

Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
/*
 * Created on May 6, 2014 by Jeremy J. Carroll, Syapse Inc.
 */
package com.bigdata.search;

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.lucene.analysis.Analyzer;

import com.bigdata.search.ConfigurableAnalyzerFactory.AnalyzerOptions;
/**
 * This comment describes the implementation of {@link ConfiguredAnalyzerFactory}.
 * The only method in the interface is {@link ConfiguredAnalyzerFactory#getAnalyzer(String, boolean)}.
 * A map is used from language tag to {@link AnalyzerPair}, where the pair contains
 * an {@link Analyzer} both with and without stopwords configured (sometimes these two analyzers are identical,
 * if, for example, stop words are not supported or not required).
 * 
 * If there is no entry for the language tag in the map {@link ConfiguredAnalyzerFactory#langTag2AnalyzerPair},
 * then one is created, by walking down the array {@link ConfiguredAnalyzerFactory#config} of AnalyzerPairs
 * until a matching one is found.
 * @author jeremycarroll
 *
 */
class ConfiguredAnalyzerFactory implements IAnalyzerFactory {

	/**
	 * Associates a language range with two {@link Analyzer}s: one to use when
	 * stop words are to be filtered and one to use when they are not. The two
	 * may be the same instance.
	 * <p>
	 * Ordering is delegated to {@link LanguageRange#compareTo}; per the original
	 * author's note this is intended to sort the best (i.e. longest) match first.
	 *
	 * @author jeremycarroll
	 */
	protected static class AnalyzerPair implements Comparable<AnalyzerPair> {
		final LanguageRange range;
		private final Analyzer withStopWords;
		private final Analyzer withoutStopWords;

		/**
		 * Selects one of the two analyzers.
		 *
		 * @param filterStopwords if true the analyzer configured with stop words
		 *            is returned, otherwise the one configured without them
		 * @return the selected analyzer
		 */
		public Analyzer getAnalyzer(boolean filterStopwords) {
			return filterStopwords ? withStopWords : withoutStopWords;
		}

		/**
		 * Tests a language tag against this pair's range.
		 *
		 * @param language the language tag, already split into its subtags
		 * @return the result of {@link LanguageRange#extendedFilterMatch} for this range
		 */
		public boolean extendedFilterMatch(String[] language) {
			return range.extendedFilterMatch(language);
		}

		/**
		 * @param range the language range, as a string, that this pair serves
		 * @param withStopWords analyzer returned when stop words are filtered
		 * @param withOutStopWords analyzer returned when stop words are not filtered
		 */
		AnalyzerPair(String range, Analyzer withStopWords, Analyzer withOutStopWords) {
			this.range = new LanguageRange(range);
			this.withStopWords = withStopWords;
			this.withoutStopWords = withOutStopWords;
		}

		/**
		 * This clone constructor implements {@link AnalyzerOptions#LIKE}: the new
		 * pair shares both analyzers of {@code copyMe} but applies to a different range.
		 *
		 * @param range the language range for the new pair
		 * @param copyMe the pair whose analyzers are reused
		 */
		AnalyzerPair(String range, AnalyzerPair copyMe) {
			this(range, copyMe.withStopWords, copyMe.withoutStopWords);
		}

		@Override
		public String toString() {
			return range.full + "=(" + withStopWords.getClass().getSimpleName() + ")";
		}

		@Override
		public int compareTo(AnalyzerPair o) {
			return range.compareTo(o.range);
		}
	}

	/**
	 * The configured pairs, searched in array order by {@link #lookupPair(String)};
	 * the first pair whose range matches wins.
	 */
	private final AnalyzerPair config[];

	/**
	 * This caches the result of looking up a lang tag in the
	 * config of language ranges.
	 */
	private final Map<String, AnalyzerPair> langTag2AnalyzerPair = new ConcurrentHashMap<String, AnalyzerPair>();

	/**
	 * While it would be very unusual to have more than 500 different language tags in a store
	 * it is possible - we use a max size to prevent a memory explosion, and a naive caching
	 * strategy so the code will still work on the {@link #MAX_LANG_CACHE_SIZE}+1 th entry.
	 */
	private static final int MAX_LANG_CACHE_SIZE = 500;

	/** Language tag substituted when a caller passes a null or empty language code. */
	private final String defaultLanguage;

	/**
	 * Builds a new ConfiguredAnalyzerFactory.
	 *
	 * @param config the language-range/analyzer pairs, consulted in array order
	 *            with the first match winning; at least one entry must match
	 *            every language tag that will be requested, otherwise
	 *            {@link #getAnalyzer(String, boolean)} throws
	 * @param defaultLanguage the language tag to use when a request supplies a
	 *            null or empty language code
	 */
	public ConfiguredAnalyzerFactory(AnalyzerPair config[], String defaultLanguage) {
		this.config = config;
		this.defaultLanguage = defaultLanguage;
	}

	private String getDefaultLanguage() {
		return defaultLanguage;
	}

	/**
	 * Returns the analyzer for a language tag, consulting the cache first and
	 * falling back to a scan of the configured language ranges.
	 *
	 * @param languageCode the language tag; null or empty selects the default language
	 * @param filterStopwords whether the returned analyzer should filter stop words
	 * @return the analyzer from the first configured pair matching the language tag
	 * @throws RuntimeException if no configured pair matches the language tag
	 */
	@Override
	public Analyzer getAnalyzer(String languageCode, boolean filterStopwords) {

		if (languageCode == null || languageCode.equals("")) {
			languageCode = getDefaultLanguage();
		}

		AnalyzerPair pair = langTag2AnalyzerPair.get(languageCode);

		if (pair == null) {
			pair = lookupPair(languageCode);

			// naive cache - clear everything if cache is full.
			// The test is '>=' rather than '==': the unsynchronized put below
			// lets concurrent callers step the size past the limit, and an
			// equality test would then never fire again, leaving the cache
			// to grow without bound.
			if (langTag2AnalyzerPair.size() >= MAX_LANG_CACHE_SIZE) {
				langTag2AnalyzerPair.clear();
			}
			// there is a race condition below, but we don't care who wins.
			langTag2AnalyzerPair.put(languageCode, pair);
		}

		return pair.getAnalyzer(filterStopwords);
	}

	/**
	 * Scans {@link #config} in order for the first pair whose range matches.
	 *
	 * @param languageCode the language tag; it is split on '-' into subtags before matching
	 * @return the first matching pair
	 * @throws RuntimeException if no pair matches
	 */
	private AnalyzerPair lookupPair(String languageCode) {
		String language[] = languageCode.split("-");
		for (AnalyzerPair p : config) {
			if (p.extendedFilterMatch(language)) {
				return p;
			}
		}
		throw new RuntimeException("Impossible - supposedly - did not match '*'");
	}
}