// Source listing of com.github.healthonnet.search.SynonymExpandingExtendedDismaxQParserPlugin
// from the hon-lucene-synonyms artifact (Maven / Gradle / Ivy).
// hon-lucene-synonyms enables proper query-time synonym expansion, with no reindexing required.
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.github.healthonnet.search;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import com.github.healthonnet.synonyms.AlternateQuery;
import com.github.healthonnet.synonyms.ReasonForNotExpandingSynonyms;
import com.github.healthonnet.synonyms.TextInQuery;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.queries.function.BoostedQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.Version;
import org.apache.solr.analysis.TokenizerChain;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrResourceLoader;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.search.ExtendedDismaxQParser;
import org.apache.solr.search.QParser;
import org.apache.solr.search.QParserPlugin;
import com.github.healthonnet.search.SynonymExpandingExtendedDismaxQParserPlugin.Const;
import com.github.healthonnet.search.SynonymExpandingExtendedDismaxQParserPlugin.Params;
import org.apache.solr.search.SyntaxError;
import com.github.healthonnet.synonyms.NoBoostSolrParams;
import com.google.common.collect.Ordering;
import com.google.common.collect.SortedSetMultimap;
import com.google.common.collect.TreeMultimap;
/**
*
* Main implementation of the synonym-expanding ExtendedDismaxQParser plugin for Solr.
*
* This parser was originally derived from ExtendedDismaxQParser, which itself was derived from the
* DismaxQParser from Solr.
*
* @see https://github.com/healthonnet/hon-lucene-synonyms
*/
public class SynonymExpandingExtendedDismaxQParserPlugin extends QParserPlugin implements
ResourceLoaderAware {
public static final String name = "synonym_edismax";
/**
* Convenience class for parameters
*/
public static class Params {
/**
* @see The Extended DisMax Query Parser
*/
public static String MULT_BOOST = "boost";
public static final String SYNONYMS = "synonyms";
public static final String SYNONYMS_ANALYZER = "synonyms.analyzer";
public static final String SYNONYMS_ORIGINAL_BOOST = "synonyms.originalBoost";
public static final String SYNONYMS_SYNONYM_BOOST = "synonyms.synonymBoost";
public static final String SYNONYMS_DISABLE_PHRASE_QUERIES = "synonyms.disablePhraseQueries";
public static final String SYNONYMS_CONSTRUCT_PHRASES = "synonyms.constructPhrases";
public static final String SYNONYMS_IGNORE_QUERY_OPERATORS = "synonyms.ignoreQueryOperators";
/**
* instead of splicing synonyms into the original query string, ie
* dog bite
* canine familiaris bite
* dog chomp
* canine familiaris chomp
* do this:
* dog bite
* "canine familiaris" chomp
* with phrases off:
* dog bite canine familiaris chomp
*/
public static final String SYNONYMS_BAG = "synonyms.bag";
/**
* if true, ignore mm param for the synonym query and use it only for the main query
*
* @see org.apache.solr.common.params.DisMaxParams#MM
*/
public static final String SYNONYMS_IGNORE_MM = "synonyms.ignoreMM";
}
/**
* Convenience class for calling constants.
* @author nolan
*
*/
public static class Const {
/**
* A field we can't ever find in any schema, so we can safely tell
* DisjunctionMaxQueryParser to use it as our defaultField, and map aliases
* from it to any field in our schema.
*/
static final String IMPOSSIBLE_FIELD_NAME = "\uFFFC\uFFFC\uFFFC";
static final Pattern COMPLEX_QUERY_OPERATORS_PATTERN = Pattern.compile("(?:\\*|\\s-\\b|\\b(?:OR|AND|\\+)\\b)");
}
private NamedList> args;
private Map synonymAnalyzers;
private Version luceneMatchVersion = null;
private SolrResourceLoader loader;
@SuppressWarnings("rawtypes")
// TODO it would be nice if the user didn't have to encode tokenizers/filters
// as a NamedList. But for now this is the hack I'm using
public void init(NamedList args) {
this.args = (NamedList>)args;
}
@Override
public QParser createParser(String qstr, SolrParams localParams, SolrParams params, SolrQueryRequest req) {
if (luceneMatchVersion == null) {
this.luceneMatchVersion = req.getCore().getSolrConfig().luceneMatchVersion;
parseConfig();
}
return new SynonymExpandingExtendedDismaxQParser(qstr, localParams, params, req, synonymAnalyzers);
}
private Map convertNamedListToMap(NamedList> namedList) {
Map result = new HashMap<>();
for (Entry entry : namedList) {
if (entry.getValue() instanceof String) {
result.put(entry.getKey(), (String)entry.getValue());
}
}
return result;
}
public void inform(ResourceLoader loader) throws IOException {
// TODO: Can we assume that loader always is a sub type of SolrResourceLoader?
this.loader = (SolrResourceLoader) loader;
}
/*
* Expected call pattern:
* init(), inform(loader), createParser(), so we should now have
* config, loader and luceneMatchVersion needed for creating analyzer components
*/
private void parseConfig() {
try {
synonymAnalyzers = new HashMap<>();
Object xmlSynonymAnalyzers = args.get("synonymAnalyzers");
if (xmlSynonymAnalyzers != null && xmlSynonymAnalyzers instanceof NamedList) {
NamedList> synonymAnalyzersList = (NamedList>) xmlSynonymAnalyzers;
for (Entry entry : synonymAnalyzersList) {
String analyzerName = entry.getKey();
if (!(entry.getValue() instanceof NamedList)) {
continue;
}
NamedList> analyzerAsNamedList = (NamedList>) entry.getValue();
TokenizerFactory tokenizerFactory = null;
TokenFilterFactory filterFactory;
List filterFactories = new LinkedList<>();
for (Entry analyzerEntry : analyzerAsNamedList) {
String key = analyzerEntry.getKey();
if (!(entry.getValue() instanceof NamedList)) {
continue;
}
Map params = convertNamedListToMap((NamedList>)analyzerEntry.getValue());
String className = params.get("class");
if (className == null) {
continue;
}
params.put("luceneMatchVersion", luceneMatchVersion.toString());
if (key.equals("tokenizer")) {
try {
tokenizerFactory = TokenizerFactory.forName(className, params);
} catch (IllegalArgumentException iae) {
if (!className.contains(".")) {
iae.printStackTrace();
}
// Now try by classname instead of SPI keyword
tokenizerFactory = loader.newInstance(className, TokenizerFactory.class, new String[]{}, new Class[] { Map.class }, new Object[] { params });
}
if (tokenizerFactory instanceof ResourceLoaderAware) {
((ResourceLoaderAware)tokenizerFactory).inform(loader);
}
} else if (key.equals("filter")) {
try {
filterFactory = TokenFilterFactory.forName(className, params);
} catch (IllegalArgumentException iae) {
if (!className.contains(".")) {
iae.printStackTrace();
}
// Now try by classname instead of SPI keyword
filterFactory = loader.newInstance(className, TokenFilterFactory.class, new String[]{}, new Class[] { Map.class }, new Object[] { params });
}
if (filterFactory instanceof ResourceLoaderAware) {
((ResourceLoaderAware)filterFactory).inform(loader);
}
filterFactories.add(filterFactory);
}
}
if (tokenizerFactory == null) {
throw new SolrException(ErrorCode.SERVER_ERROR,
"tokenizer must not be null for synonym analyzer: " + analyzerName);
} else if (filterFactories.isEmpty()) {
throw new SolrException(ErrorCode.SERVER_ERROR,
"filter factories must be defined for synonym analyzer: " + analyzerName);
}
TokenizerChain analyzer = new TokenizerChain(tokenizerFactory,
filterFactories.toArray(new TokenFilterFactory[filterFactories.size()]));
synonymAnalyzers.put(analyzerName, analyzer);
}
}
} catch (IOException e) {
throw new SolrException(ErrorCode.SERVER_ERROR, "Failed to create parser. Check your config.", e);
}
}
}
class SynonymExpandingExtendedDismaxQParser extends QParser {
// delegate all our parsing to these two parsers - one for the "synonym" query and the other for the main query
// (the constructor builds synonymQueryParser from NoBoostSolrParams-wrapped params, mainQueryParser from the raw ones)
private ExtendedDismaxQParser synonymQueryParser;
private ExtendedDismaxQParser mainQueryParser;
// analyzers handed in by the plugin through the constructor
// NOTE(review): raw type - the type arguments look lost in extraction; presumably Map<String, Analyzer> - confirm
private Map synonymAnalyzers;
// when non-null, returned by getHighlightQuery() instead of the main parser's highlight query
private Query queryToHighlight;
/**
* variables used purely for debugging
*/
// NOTE(review): raw type - element type is not visible from this part of the file
private List expandedSynonyms;
private ReasonForNotExpandingSynonyms reasonForNotExpandingSynonyms;
/**
 * Creates the parser together with the two delegate parsers it relies on.
 *
 * @param qstr             the query string, passed unchanged to both delegates
 * @param localParams      local params; wrapped with NoBoostSolrParams for the synonym delegate
 * @param params           request params; wrapped with NoBoostSolrParams for the synonym delegate
 * @param req              the current request
 * @param synonymAnalyzers analyzers supplied by the plugin, keyed by analyzer name
 */
public SynonymExpandingExtendedDismaxQParser(String qstr, SolrParams localParams, SolrParams params,
        SolrQueryRequest req, Map<String, Analyzer> synonymAnalyzers) {
    super(qstr, localParams, params, req);
    mainQueryParser = new ExtendedDismaxQParser(qstr, localParams, params, req);
    // ensure the synonyms aren't artificially boosted
    synonymQueryParser = new ExtendedDismaxQParser(qstr, NoBoostSolrParams.wrap(localParams),
            NoBoostSolrParams.wrap(params), req);
    this.synonymAnalyzers = synonymAnalyzers;
}
/**
 * Returns the default highlight fields reported by the main query parser.
 */
@Override
public String[] getDefaultHighlightFields() {
    final String[] fieldsFromMainParser = mainQueryParser.getDefaultHighlightFields();
    return fieldsFromMainParser;
}
/**
 * Returns the query to highlight: this parser's own one when it has been set,
 * otherwise whatever the main query parser reports.
 */
@Override
public Query getHighlightQuery() throws SyntaxError {
    if (queryToHighlight == null) {
        return mainQueryParser.getHighlightQuery();
    }
    return queryToHighlight;
}
@Override
public void addDebugInfo(NamedList