com.github.healthonnet.search.SynonymExpandingExtendedDismaxQParserPlugin Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of hon-lucene-synonyms Show documentation
Enables proper query-time synonym expansion, with no reindexing required.
There is a newer version: 5.0.5
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.github.healthonnet.search;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import com.github.healthonnet.synonyms.AlternateQuery;
import com.github.healthonnet.synonyms.ReasonForNotExpandingSynonyms;
import com.github.healthonnet.synonyms.TextInQuery;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.queries.function.BoostedQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.Version;
import org.apache.solr.analysis.TokenizerChain;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrResourceLoader;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.search.ExtendedDismaxQParser;
import org.apache.solr.search.QParser;
import org.apache.solr.search.QParserPlugin;
import com.github.healthonnet.search.SynonymExpandingExtendedDismaxQParserPlugin.Const;
import com.github.healthonnet.search.SynonymExpandingExtendedDismaxQParserPlugin.Params;
import org.apache.solr.search.SyntaxError;
import com.github.healthonnet.synonyms.NoBoostSolrParams;

import com.google.common.collect.Ordering;
import com.google.common.collect.SortedSetMultimap;
import com.google.common.collect.TreeMultimap;

/**
 * 
 * Main implementation of the synonym-expanding ExtendedDismaxQParser plugin for Solr.
 * 
 * This parser was originally derived from ExtendedDismaxQParser, which itself was derived from the 
 * DismaxQParser from Solr.
 * 
 * @see https://github.com/healthonnet/hon-lucene-synonyms
 */
public class SynonymExpandingExtendedDismaxQParserPlugin extends QParserPlugin implements
        ResourceLoaderAware {
    public static final String name = "synonym_edismax";

    /**
     * Convenience class for parameters
     */
    public static class Params {
        
        /**
         * @see The Extended DisMax Query Parser
         */
        public static String MULT_BOOST = "boost";
        
        public static final String SYNONYMS = "synonyms";
        public static final String SYNONYMS_ANALYZER = "synonyms.analyzer";
        public static final String SYNONYMS_ORIGINAL_BOOST = "synonyms.originalBoost";
        public static final String SYNONYMS_SYNONYM_BOOST = "synonyms.synonymBoost";
        public static final String SYNONYMS_DISABLE_PHRASE_QUERIES = "synonyms.disablePhraseQueries";
        public static final String SYNONYMS_CONSTRUCT_PHRASES = "synonyms.constructPhrases";
        public static final String SYNONYMS_IGNORE_QUERY_OPERATORS = "synonyms.ignoreQueryOperators";
        /** 
         * instead of splicing synonyms into the original query string, ie
         *    dog bite
         *    canine familiaris bite
         *    dog chomp
         *    canine familiaris chomp
         * do this: 
         *    dog bite 
         *    "canine familiaris" chomp
         * with phrases off:
         *    dog bite canine familiaris chomp
         */
        public static final String SYNONYMS_BAG = "synonyms.bag";

        /**
         * if true, ignore mm param for the synonym query and use it only for the main query
         * 
         * @see org.apache.solr.common.params.DisMaxParams#MM
         */
        public static final String SYNONYMS_IGNORE_MM = "synonyms.ignoreMM";
    }

    /**
     * Convenience class for calling constants.
     * @author nolan
     *
     */
    public static class Const {
        /**
         * A field we can't ever find in any schema, so we can safely tell
         * DisjunctionMaxQueryParser to use it as our defaultField, and map aliases
         * from it to any field in our schema.
         */
        static final String IMPOSSIBLE_FIELD_NAME = "\uFFFC\uFFFC\uFFFC";
        
        static final Pattern COMPLEX_QUERY_OPERATORS_PATTERN = Pattern.compile("(?:\\*|\\s-\\b|\\b(?:OR|AND|\\+)\\b)"); 
    }    
    
    private NamedList args;
    private Map synonymAnalyzers;
    private Version luceneMatchVersion = null;
    private SolrResourceLoader loader;

    @SuppressWarnings("rawtypes")
    // TODO it would be nice if the user didn't have to encode tokenizers/filters
    // as a NamedList.  But for now this is the hack I'm using
    public void init(NamedList args) {
        this.args = (NamedList)args;
    }

    @Override
    public QParser createParser(String qstr, SolrParams localParams, SolrParams params, SolrQueryRequest req) {
        if (luceneMatchVersion == null) {
            this.luceneMatchVersion = req.getCore().getSolrConfig().luceneMatchVersion;
            parseConfig();
        }
        return new SynonymExpandingExtendedDismaxQParser(qstr, localParams, params, req, synonymAnalyzers);
    }
    
    private Map convertNamedListToMap(NamedList namedList) {
        Map result = new HashMap<>();
        
        for (Entry entry : namedList) {
            if (entry.getValue() instanceof String) {
                result.put(entry.getKey(), (String)entry.getValue());
            }
        }
        
        return result;
    }

    public void inform(ResourceLoader loader) throws IOException {
        // TODO: Can we assume that loader always is a sub type of SolrResourceLoader?
        this.loader = (SolrResourceLoader) loader;
    }

    /*
     * Expected call pattern:
     * init(), inform(loader), createParser(), so we should now have
     * config, loader and luceneMatchVersion needed for creating analyzer components
     */
    private void parseConfig() {
        try {
            synonymAnalyzers = new HashMap<>();

            Object xmlSynonymAnalyzers = args.get("synonymAnalyzers");

            if (xmlSynonymAnalyzers != null && xmlSynonymAnalyzers instanceof NamedList) {
                NamedList synonymAnalyzersList = (NamedList) xmlSynonymAnalyzers;
                for (Entry entry : synonymAnalyzersList) {
                    String analyzerName = entry.getKey();
                    if (!(entry.getValue() instanceof NamedList)) {
                        continue;
                    }
                    NamedList analyzerAsNamedList = (NamedList) entry.getValue();

                    TokenizerFactory tokenizerFactory = null;
                    TokenFilterFactory filterFactory;
                    List filterFactories = new LinkedList<>();

                    for (Entry analyzerEntry : analyzerAsNamedList) {
                        String key = analyzerEntry.getKey();
                        if (!(entry.getValue() instanceof NamedList)) {
                            continue;
                        }
                        Map params = convertNamedListToMap((NamedList)analyzerEntry.getValue());

                        String className = params.get("class");
                        if (className == null) {
                            continue;
                        }

                        params.put("luceneMatchVersion", luceneMatchVersion.toString());

                        if (key.equals("tokenizer")) {
                            try {
                                tokenizerFactory = TokenizerFactory.forName(className, params);
                            } catch (IllegalArgumentException iae) {
                                if (!className.contains(".")) {
                                    iae.printStackTrace();
                                }
                                // Now try by classname instead of SPI keyword
                                tokenizerFactory = loader.newInstance(className, TokenizerFactory.class, new String[]{}, new Class[] { Map.class }, new Object[] { params });
                            }
                            if (tokenizerFactory instanceof ResourceLoaderAware) {
                                ((ResourceLoaderAware)tokenizerFactory).inform(loader);
                            }
                        } else if (key.equals("filter")) {
                            try {
                                filterFactory = TokenFilterFactory.forName(className, params);
                            } catch (IllegalArgumentException iae) {
                                if (!className.contains(".")) {
                                    iae.printStackTrace();
                                }
                                // Now try by classname instead of SPI keyword
                                filterFactory = loader.newInstance(className, TokenFilterFactory.class, new String[]{}, new Class[] { Map.class }, new Object[] { params });
                            }
                            if (filterFactory instanceof ResourceLoaderAware) {
                                ((ResourceLoaderAware)filterFactory).inform(loader);
                            }
                            filterFactories.add(filterFactory);
                        }
                    }
                    if (tokenizerFactory == null) {
                        throw new SolrException(ErrorCode.SERVER_ERROR,
                                "tokenizer must not be null for synonym analyzer: " + analyzerName);
                    } else if (filterFactories.isEmpty()) {
                        throw new SolrException(ErrorCode.SERVER_ERROR,
                                "filter factories must be defined for synonym analyzer: " + analyzerName);
                    }

                    TokenizerChain analyzer = new TokenizerChain(tokenizerFactory,
                            filterFactories.toArray(new TokenFilterFactory[filterFactories.size()]));

                    synonymAnalyzers.put(analyzerName, analyzer);
                }
            }
        } catch (IOException e) {
            throw new SolrException(ErrorCode.SERVER_ERROR, "Failed to create parser. Check your config.", e);
        }
    }
}

class SynonymExpandingExtendedDismaxQParser extends QParser {

    // delegate all our parsing to these two parsers - one for the "synonym" query and the other for the main query
    private ExtendedDismaxQParser synonymQueryParser;
    private ExtendedDismaxQParser mainQueryParser;

    private Map synonymAnalyzers;
    private Query queryToHighlight;
    
    /**
     * variables used purely for debugging
     */
    private List expandedSynonyms;
    private ReasonForNotExpandingSynonyms reasonForNotExpandingSynonyms;
    
    public SynonymExpandingExtendedDismaxQParser(String qstr, SolrParams localParams, SolrParams params,
            SolrQueryRequest req, Map synonymAnalyzers) {
        super(qstr, localParams, params, req);
        mainQueryParser = new ExtendedDismaxQParser(qstr, localParams, params, req);
        
        // ensure the synonyms aren't artificially boosted
        synonymQueryParser = new ExtendedDismaxQParser(qstr, NoBoostSolrParams.wrap(localParams),
                NoBoostSolrParams.wrap(params), req);
        this.synonymAnalyzers = synonymAnalyzers;
    }

    @Override
    public String[] getDefaultHighlightFields() {
      return mainQueryParser.getDefaultHighlightFields();
    }
    
    @Override
    public Query getHighlightQuery() throws SyntaxError {
        return queryToHighlight != null ? queryToHighlight : mainQueryParser.getHighlightQuery();
    }
    
    @Override
    public void addDebugInfo(NamedList