All downloads are free. Search and download functionality uses the official Maven repository.

io.zulia.server.analysis.ZuliaPerFieldAnalyzer Maven / Gradle / Ivy

package io.zulia.server.analysis;

import io.zulia.message.ZuliaIndex;
import io.zulia.server.analysis.analyzer.BooleanAnalyzer;
import io.zulia.server.analysis.filter.BritishUSFilter;
import io.zulia.server.analysis.filter.CaseProtectedWordsFilter;
import io.zulia.server.config.ServerIndexConfig;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.DelegatingAnalyzerWrapper;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.UpperCaseFilter;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.en.EnglishMinimalStemFilter;
import org.apache.lucene.analysis.en.EnglishPossessiveFilter;
import org.apache.lucene.analysis.en.KStemFilter;
import org.apache.lucene.analysis.minhash.MinHashFilterFactory;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;

import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.CATENATE_ALL;

/**
 * Analyzer wrapper that delegates to a per-field analyzer chosen from the index configuration.
 * <p>
 * The field-to-analyzer mapping is rebuilt by {@link #refresh()}. Fields without a mapping are
 * analyzed with a {@link KeywordAnalyzer}.
 */
public class ZuliaPerFieldAnalyzer extends DelegatingAnalyzerWrapper {
	private final ServerIndexConfig indexConfig;
	private final Analyzer defaultAnalyzer;

	// Replaced wholesale by refresh(); the map itself is never mutated after being published.
	// NOTE(review): the field is not volatile, so if refresh() can run concurrently with analysis
	// threads the new map is not guaranteed to become visible to them -- confirm the threading model.
	private Map<String, Analyzer> fieldAnalyzers;

	/**
	 * Creates the wrapper and builds the initial field-to-analyzer mapping.
	 *
	 * @param indexConfig index configuration that supplies the indexed fields, their types and analyzer settings
	 */
	public ZuliaPerFieldAnalyzer(ServerIndexConfig indexConfig) {
		super(PER_FIELD_REUSE_STRATEGY);
		this.indexConfig = indexConfig;
		this.defaultAnalyzer = new KeywordAnalyzer();
		this.fieldAnalyzers = new HashMap<>();
		refresh();
	}

	/**
	 * Rebuilds the field-to-analyzer mapping from the current index configuration and swaps it in
	 * as a single reference assignment.
	 */
	public void refresh() {
		Map<String, Analyzer> newFieldAnalyzers = new HashMap<>();
		for (ZuliaIndex.IndexAs indexAs : indexConfig.getIndexAsValues()) {
			String indexFieldName = indexAs.getIndexFieldName();

			ZuliaIndex.FieldConfig.FieldType fieldType = indexConfig.getFieldTypeForIndexField(indexFieldName);
			ZuliaIndex.AnalyzerSettings analyzerSettings = indexConfig.getAnalyzerSettingsForIndexField(indexFieldName);

			newFieldAnalyzers.put(indexFieldName, createAnalyzer(fieldType, analyzerSettings));
		}

		this.fieldAnalyzers = newFieldAnalyzers;
	}

	/**
	 * Chooses the analyzer for one indexed field.
	 * <ul>
	 * <li>STRING: the configured analyzer, or a {@link KeywordAnalyzer} when no settings exist</li>
	 * <li>BOOL: a {@link BooleanAnalyzer}</li>
	 * <li>NUMERIC_*: a {@link WhitespaceAnalyzer}</li>
	 * <li>anything else (including {@code null}): a {@link KeywordAnalyzer}</li>
	 * </ul>
	 */
	private static Analyzer createAnalyzer(ZuliaIndex.FieldConfig.FieldType fieldType, ZuliaIndex.AnalyzerSettings analyzerSettings) {
		if (ZuliaIndex.FieldConfig.FieldType.STRING.equals(fieldType)) {
			return (analyzerSettings != null) ? getAnalyzerForField(analyzerSettings) : new KeywordAnalyzer();
		}
		if (ZuliaIndex.FieldConfig.FieldType.BOOL.equals(fieldType)) {
			return new BooleanAnalyzer();
		}
		if (isNumeric(fieldType)) {
			return new WhitespaceAnalyzer();
		}
		return new KeywordAnalyzer();
	}

	/** Returns true for the four numeric field types (int, long, float, double). */
	private static boolean isNumeric(ZuliaIndex.FieldConfig.FieldType fieldType) {
		return ZuliaIndex.FieldConfig.FieldType.NUMERIC_INT.equals(fieldType)
				|| ZuliaIndex.FieldConfig.FieldType.NUMERIC_LONG.equals(fieldType)
				|| ZuliaIndex.FieldConfig.FieldType.NUMERIC_FLOAT.equals(fieldType)
				|| ZuliaIndex.FieldConfig.FieldType.NUMERIC_DOUBLE.equals(fieldType);
	}

	/**
	 * Returns the analyzer registered for {@code fieldName}, or the default keyword analyzer when
	 * the field has no mapping.
	 */
	@Override
	protected Analyzer getWrappedAnalyzer(String fieldName) {
		Analyzer analyzer = fieldAnalyzers.get(fieldName);
		return (analyzer != null) ? analyzer : defaultAnalyzer;
	}

	@Override
	public String toString() {
		return "ZuliaPerFieldAnalyzerWrapper(" + fieldAnalyzers + ", default=" + defaultAnalyzer + ")";
	}

	/**
	 * Builds an analyzer from the given settings: the configured tokenizer followed by the
	 * configured filters, applied in list order.
	 * <p>
	 * The settings are read each time components are created, not when this method is called.
	 *
	 * @param analyzerSettings tokenizer and filter configuration
	 * @return an analyzer with a position increment gap of 100
	 * @throws RuntimeException (raised lazily, when the analyzer is used) if the tokenizer or a
	 *                          filter is not one of the supported values
	 */
	public static Analyzer getAnalyzerForField(ZuliaIndex.AnalyzerSettings analyzerSettings) {

		return new Analyzer() {

			@Override
			public int getPositionIncrementGap(String fieldName) {
				return 100;
			}

			@Override
			protected TokenStreamComponents createComponents(String fieldName) {
				Tokenizer src = createTokenizer(analyzerSettings.getTokenizer());
				return new TokenStreamComponents(src, getFilteredStream(src));
			}

			@Override
			protected TokenStream normalize(String fieldName, TokenStream in) {
				// Normalization runs the same filter chain that is used for full analysis.
				return getFilteredStream(in);
			}

			/** Wraps {@code src} with the tokenizer-specific base filter and then every configured filter. */
			private TokenStream getFilteredStream(TokenStream src) {
				TokenStream tok;

				ZuliaIndex.AnalyzerSettings.Tokenizer tokenizer = analyzerSettings.getTokenizer();
				if (ZuliaIndex.AnalyzerSettings.Tokenizer.KEYWORD.equals(tokenizer)
						|| ZuliaIndex.AnalyzerSettings.Tokenizer.WHITESPACE.equals(tokenizer)) {
					tok = src;
				}
				else if (ZuliaIndex.AnalyzerSettings.Tokenizer.STANDARD.equals(tokenizer)) {
					//standard filter does nothing anymore, consider removing it
					tok = new StandardFilter(src);
				}
				else {
					throw new RuntimeException("Unknown tokenizer type <" + tokenizer + ">");
				}

				List<ZuliaIndex.AnalyzerSettings.Filter> filterList = analyzerSettings.getFilterList();
				for (ZuliaIndex.AnalyzerSettings.Filter filter : filterList) {
					// Each filter wraps the stream produced by the previous one.
					tok = applyFilter(tok, filter);
				}
				return tok;
			}

		};

	}

	/**
	 * Instantiates the Lucene tokenizer for the configured tokenizer type.
	 *
	 * @throws RuntimeException if the type is not KEYWORD, WHITESPACE or STANDARD
	 */
	private static Tokenizer createTokenizer(ZuliaIndex.AnalyzerSettings.Tokenizer tokenizer) {
		if (ZuliaIndex.AnalyzerSettings.Tokenizer.KEYWORD.equals(tokenizer)) {
			return new KeywordTokenizer();
		}
		if (ZuliaIndex.AnalyzerSettings.Tokenizer.WHITESPACE.equals(tokenizer)) {
			return new WhitespaceTokenizer();
		}
		if (ZuliaIndex.AnalyzerSettings.Tokenizer.STANDARD.equals(tokenizer)) {
			return new StandardTokenizer();
		}
		throw new RuntimeException("Unknown tokenizer type <" + tokenizer + ">");
	}

	/**
	 * Wraps {@code in} with the token filter that corresponds to {@code filter}.
	 *
	 * @throws RuntimeException if the filter type is not supported
	 */
	private static TokenStream applyFilter(TokenStream in, ZuliaIndex.AnalyzerSettings.Filter filter) {
		if (ZuliaIndex.AnalyzerSettings.Filter.LOWERCASE.equals(filter)) {
			return new LowerCaseFilter(in);
		}
		if (ZuliaIndex.AnalyzerSettings.Filter.UPPERCASE.equals(filter)) {
			return new UpperCaseFilter(in);
		}
		if (ZuliaIndex.AnalyzerSettings.Filter.ASCII_FOLDING.equals(filter)) {
			return new ASCIIFoldingFilter(in);
		}
		if (ZuliaIndex.AnalyzerSettings.Filter.TWO_TWO_SHINGLE.equals(filter)) {
			return newShingleFilter(in, 2);
		}
		if (ZuliaIndex.AnalyzerSettings.Filter.THREE_THREE_SHINGLE.equals(filter)) {
			return newShingleFilter(in, 3);
		}
		if (ZuliaIndex.AnalyzerSettings.Filter.FOUR_FOUR_SHINGLE.equals(filter)) {
			return newShingleFilter(in, 4);
		}
		if (ZuliaIndex.AnalyzerSettings.Filter.FIVE_FIVE_SHINGLE.equals(filter)) {
			return newShingleFilter(in, 5);
		}
		if (ZuliaIndex.AnalyzerSettings.Filter.KSTEM.equals(filter)) {
			return new KStemFilter(in);
		}
		if (ZuliaIndex.AnalyzerSettings.Filter.STOPWORDS.equals(filter)) {
			return new StopFilter(in, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
		}
		if (ZuliaIndex.AnalyzerSettings.Filter.ENGLISH_MIN_STEM.equals(filter)) {
			return new EnglishMinimalStemFilter(in);
		}
		if (ZuliaIndex.AnalyzerSettings.Filter.SNOWBALL_STEM.equals(filter)) {
			return new SnowballFilter(in, "English");
		}
		if (ZuliaIndex.AnalyzerSettings.Filter.ENGLISH_POSSESSIVE.equals(filter)) {
			return new EnglishPossessiveFilter(in);
		}
		if (ZuliaIndex.AnalyzerSettings.Filter.MINHASH.equals(filter)) {
			return new MinHashFilterFactory(Collections.emptyMap()).create(in);
		}
		if (ZuliaIndex.AnalyzerSettings.Filter.BRITISH_US.equals(filter)) {
			return new BritishUSFilter(in);
		}
		if (ZuliaIndex.AnalyzerSettings.Filter.CONCAT_ALL.equals(filter)) {
			return new WordDelimiterGraphFilter(in, CATENATE_ALL, null);
		}
		if (ZuliaIndex.AnalyzerSettings.Filter.CASE_PROTECTED_WORDS.equals(filter)) {
			return new CaseProtectedWordsFilter(in);
		}
		throw new RuntimeException("Unknown filter type <" + filter + ">");
	}

	/** Creates a shingle filter that emits only shingles of exactly {@code size} tokens (no unigrams). */
	private static ShingleFilter newShingleFilter(TokenStream in, int size) {
		ShingleFilter shingleFilter = new ShingleFilter(in, size, size);
		shingleFilter.setOutputUnigrams(false);
		return shingleFilter;
	}

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy