All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.solr.highlight.UnifiedSolrHighlighter Maven / Gradle / Ivy

There is a newer version: 9.7.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.highlight;

import java.io.IOException;
import java.text.BreakIterator;
import java.util.Collection;
import java.util.Collections;
import java.util.EnumSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.function.Predicate;
import java.util.function.Supplier;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.uhighlight.CustomSeparatorBreakIterator;
import org.apache.lucene.search.uhighlight.DefaultPassageFormatter;
import org.apache.lucene.search.uhighlight.LengthGoalBreakIterator;
import org.apache.lucene.search.uhighlight.PassageFormatter;
import org.apache.lucene.search.uhighlight.PassageScorer;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
import org.apache.lucene.search.uhighlight.WholeBreakIterator;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.HighlightParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.core.PluginInfo;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocList;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.search.SolrReturnFields;
import org.apache.solr.util.RTimerTree;
import org.apache.solr.util.plugin.PluginInfoInitialized;

/**
 * Highlighter impl that uses {@link UnifiedHighlighter}
 *
 * 

Example configuration with default values: * *

 * <requestHandler name="/select" class="solr.SearchHandler">
 * <lst name="defaults">
 * <str name="hl.method">unified</str>
 * <int name="hl.snippets">1</int>
 * <str name="hl.tag.pre">&lt;em&gt;</str>
 * <str name="hl.tag.post">&lt;/em&gt;</str>
 * <str name="hl.simple.pre">&lt;em&gt;</str>
 * <str name="hl.simple.post">&lt;/em&gt;</str>
 * <str name="hl.tag.ellipsis">(internal/unspecified)</str>
 * <bool name="hl.defaultSummary">false</bool>
 * <str name="hl.encoder">simple</str>
 * <float name="hl.score.k1">1.2</float>
 * <float name="hl.score.b">0.75</float>
 * <float name="hl.score.pivot">87</float>
 * <str name="hl.bs.language"></str>
 * <str name="hl.bs.country"></str>
 * <str name="hl.bs.variant"></str>
 * <str name="hl.bs.type">SENTENCE</str>
 * <int name="hl.maxAnalyzedChars">51200</int>
 * <bool name="hl.highlightMultiTerm">true</bool>
 * <bool name="hl.usePhraseHighlighter">true</bool>
 * <int name="hl.cacheFieldValCharsThreshold">524288</int>
 * <str name="hl.offsetSource"></str>
 * <bool name="hl.weightMatches">true</bool>
 * </lst>
 * </requestHandler>
 * 
* *

Notes: * *

    *
  • hl.q (string) can specify the query *
  • hl.fl (string) specifies the field list. *
  • hl.snippets (int) specifies how many snippets to return. *
  • hl.tag.pre (string) specifies text which appears before a highlighted term. *
  • hl.tag.post (string) specifies text which appears after a highlighted term. *
  • hl.simple.pre (string) specifies text which appears before a highlighted term. (prefer * hl.tag.pre) *
  • hl.simple.post (string) specifies text which appears after a highlighted term. (prefer hl.tag.post) *
  • hl.tag.ellipsis (string) specifies text which joins non-adjacent passages. The default is * to retain each value in a list without joining them. *
  • hl.defaultSummary (bool) specifies if a field should have a default summary of the leading * text. *
  • hl.encoder (string) can be 'html' (html escapes content) or 'simple' (no escaping). *
  • hl.score.k1 (float) specifies bm25 scoring parameter 'k1' *
  • hl.score.b (float) specifies bm25 scoring parameter 'b' *
  • hl.score.pivot (float) specifies bm25 scoring parameter 'avgdl' *
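  • hl.bs.type (string) specifies how to divide text into passages: [SENTENCE, LINE, WORD, CHARACTER, WHOLE, SEPARATOR]. SEPARATOR additionally requires a single-character separator parameter (HighlightParams.BS_SEP). *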
  • hl.bs.language (string) specifies language code for BreakIterator. default is empty string * (root locale) *
  • hl.bs.country (string) specifies country code for BreakIterator. default is empty string * (root locale) *
  • hl.bs.variant (string) specifies variant code for BreakIterator. default is empty string (root locale) *
  • hl.maxAnalyzedChars (int) specifies how many characters at most will be processed in a * document for any one field. *
  • hl.highlightMultiTerm (bool) enables highlighting for range/wildcard/fuzzy/prefix queries * at some cost. default is true *
  • hl.usePhraseHighlighter (bool) enables phrase highlighting. default is true *
  • hl.cacheFieldValCharsThreshold (int) controls how many characters from a field are cached. * default is 524288 (1MB in 2 byte chars) *
  • hl.offsetSource (string) specifies which offset source to use, prefers postings, but will * use what's available if not specified *
  • hl.weightMatches (bool) enables Lucene Weight Matches mode *
* @lucene.experimental
*/
public class UnifiedSolrHighlighter extends SolrHighlighter implements PluginInfoInitialized {

  /**
   * Passage separator used when no {@code hl.tag.ellipsis} is supplied (a single NUL character).
   * {@link #encodeSnippets} splits each field's snippet on it so every passage becomes its own
   * array element.
   */
  protected static final String SNIPPET_SEPARATOR = "\u0000";

  /** No plugin configuration is consumed; all settings are read from request parameters. */
  @Override
  public void init(PluginInfo info) {}

  /**
   * Highlights the given documents for the given query.
   *
   * @param docs the documents to highlight
   * @param query the query whose matches are highlighted
   * @param req the current request; supplies the searcher and the highlighting parameters
   * @param defaultFields fields to fall back on when resolving the fields to highlight
   * @return a list keyed by document unique key, each value being a list of field name to
   *     passages; {@code null} if highlighting is not enabled for this request
   * @throws IOException if reading from the index fails
   */
  @Override
  public NamedList<Object> doHighlighting(
      DocList docs, Query query, SolrQueryRequest req, String[] defaultFields) throws IOException {
    final SolrParams params = req.getParams();

    // if highlighting isn't enabled, then why call doHighlighting?
    if (!isHighlightingEnabled(params)) {
      return null;
    }

    int[] docIDs = toDocIDs(docs);

    // fetch the unique keys
    String[] keys = getUniqueKeys(req.getSearcher(), docIDs);

    // query-time parameters
    String[] fieldNames = getHighlightFields(query, req, defaultFields);

    // hl.snippets may be overridden per field, so resolve it field by field
    int[] maxPassages = new int[fieldNames.length];
    for (int i = 0; i < fieldNames.length; i++) {
      maxPassages[i] = params.getFieldInt(fieldNames[i], HighlightParams.SNIPPETS, 1);
    }

    UnifiedHighlighter highlighter = getHighlighter(req);
    // Skip the highlighter entirely when there is nothing to highlight.
    Map<String, String[]> snippets =
        fieldNames.length == 0
            ? Collections.emptyMap()
            : highlighter.highlightFields(fieldNames, query, docIDs, maxPassages);
    return encodeSnippets(keys, fieldNames, snippets);
  }

  /**
   * Creates an instance of the Lucene {@link UnifiedHighlighter}. Provided for subclass extension
   * so that a subclass can return a subclass of {@link SolrExtendedUnifiedHighlighter}.
   */
  protected UnifiedHighlighter getHighlighter(SolrQueryRequest req) {
    return new SolrExtendedUnifiedHighlighter(req);
  }

  /**
   * Encodes the resulting snippets into a namedlist
   *
   * @param keys the document unique keys
   * @param fieldNames field names to highlight in the order
   * @param snippets map from field name to snippet array for the docs
   * @return encoded namedlist of summaries
   */
  protected NamedList<Object> encodeSnippets(
      String[] keys, String[] fieldNames, Map<String, String[]> snippets) {
    NamedList<Object> list = new SimpleOrderedMap<>();
    for (int i = 0; i < keys.length; i++) {
      NamedList<Object> summary = new SimpleOrderedMap<>();
      for (String field : fieldNames) {
        // snippets.get(field) is indexed by document position, parallel to keys
        String snippet = snippets.get(field)[i];
        if (snippet == null) {
          // TODO reuse logic of DefaultSolrHighlighter.alternateField
        } else {
          // we used a special snippet separator char and we can now split on it.
          summary.add(field, snippet.split(SNIPPET_SEPARATOR));
        }
      }
      list.add(keys[i], summary);
    }
    return list;
  }

  /** Converts solr's DocList to the int[] docIDs */
  protected int[] toDocIDs(DocList docs) {
    int[] docIDs = new int[docs.size()];
    DocIterator iterator = docs.iterator();
    for (int i = 0; i < docIDs.length; i++) {
      // the iterator must yield exactly docs.size() entries
      if (!iterator.hasNext()) {
        throw new AssertionError();
      }
      docIDs[i] = iterator.nextDoc();
    }
    if (iterator.hasNext()) {
      throw new AssertionError();
    }
    return docIDs;
  }

  /**
   * Retrieves the unique keys for the topdocs to key the results
   *
   * @param searcher the searcher used to fetch the stored unique key of each document
   * @param docIDs the internal document ids, in result order
   * @return the printable unique key per document, parallel to {@code docIDs}; an array of {@code
   *     null}s when the schema declares no unique key field
   * @throws IOException if a document cannot be loaded
   */
  protected String[] getUniqueKeys(SolrIndexSearcher searcher, int[] docIDs) throws IOException {
    IndexSchema schema = searcher.getSchema();
    SchemaField keyField = schema.getUniqueKeyField();
    if (keyField != null) {
      // only fetch the unique key field of each document
      SolrReturnFields returnFields = new SolrReturnFields(keyField.getName(), null);
      String[] uniqueKeys = new String[docIDs.length];
      for (int i = 0; i < docIDs.length; i++) {
        int docid = docIDs[i];
        SolrDocument solrDoc = searcher.getDocFetcher().solrDoc(docid, returnFields);
        uniqueKeys[i] = schema.printableUniqueKey(solrDoc);
      }
      return uniqueKeys;
    } else {
      return new String[docIDs.length];
    }
  }

  /** From {@link #getHighlighter(org.apache.solr.request.SolrQueryRequest)}. */
  protected static class SolrExtendedUnifiedHighlighter extends UnifiedHighlighter {
    /** Field matcher that accepts every field, i.e. no field match is required. */
    protected static final Predicate<String> NOT_REQUIRED_FIELD_MATCH_PREDICATE = s -> true;

    private final SolrIndexSearcher solrIndexSearcher;
    protected final SolrParams params;

    protected final IndexSchema schema;
    protected final RTimerTree loadFieldValuesTimer;

    /**
     * Configures the highlighter from the request's searcher, schema and parameters.
     *
     * @param req the current request
     */
    public SolrExtendedUnifiedHighlighter(SolrQueryRequest req) {
      super(req.getSearcher(), req.getSchema().getIndexAnalyzer());
      this.solrIndexSearcher = req.getSearcher();
      this.params = req.getParams();
      this.schema = req.getSchema();
      this.setMaxLength(params.getInt(HighlightParams.MAX_CHARS, DEFAULT_MAX_CHARS));
      this.setCacheFieldValCharsThreshold(
          params.getInt(
              HighlightParams.CACHE_FIELD_VAL_CHARS_THRESHOLD, DEFAULT_CACHE_CHARS_THRESHOLD));

      final RTimerTree timerTree;
      if (req.getRequestTimer() != null) { // It may be null if not used in a search context.
        timerTree = req.getRequestTimer();
      } else {
        timerTree = new RTimerTree(); // since null checks are annoying
      }
      // we assume a new timer, state of STARTED
      loadFieldValuesTimer = timerTree.sub("loadFieldValues");
      // ensure state is STARTED (some obscure test / use-case)
      loadFieldValuesTimer.resume();
      // state of PAUSED now with about zero time. Will fail if state isn't STARTED.
      loadFieldValuesTimer.pause();
    }

    /**
     * Resolves the offset source from the per-field {@code hl.offsetSource} parameter
     * (case-insensitive), deferring to the superclass when it is not set.
     *
     * @throws IllegalArgumentException if the parameter does not name an {@link OffsetSource}
     */
    @Override
    protected OffsetSource getOffsetSource(String field) {
      String sourceStr = params.getFieldParam(field, HighlightParams.OFFSET_SOURCE);
      if (sourceStr != null) {
        return OffsetSource.valueOf(sourceStr.toUpperCase(Locale.ROOT));
      } else {
        return super.getOffsetSource(field);
      }
    }

    // optimization for Solr which keeps a FieldInfos on-hand
    @Override
    protected FieldInfo getFieldInfo(String field) {
      return ((SolrIndexSearcher) searcher).getFieldInfos().fieldInfo(field);
    }

    /** Maps the per-field {@code hl.defaultSummary} flag onto the no-highlight passage count. */
    @Override
    public int getMaxNoHighlightPassages(String field) {
      boolean defaultSummary = params.getFieldBool(field, HighlightParams.DEFAULT_SUMMARY, false);
      if (defaultSummary) {
        return -1; // signifies return first hl.snippets passages worth of the content
      } else {
        return 0; // will return null
      }
    }

    /**
     * Builds the passage formatter from the per-field tag, ellipsis and encoder parameters. The
     * {@code hl.tag.*} parameters take precedence over their {@code hl.simple.*} equivalents.
     */
    @Override
    protected PassageFormatter getFormatter(String fieldName) {
      String preTag =
          params.getFieldParam(
              fieldName,
              HighlightParams.TAG_PRE,
              params.getFieldParam(fieldName, HighlightParams.SIMPLE_PRE, "<em>"));

      String postTag =
          params.getFieldParam(
              fieldName,
              HighlightParams.TAG_POST,
              params.getFieldParam(fieldName, HighlightParams.SIMPLE_POST, "</em>"));
      // Without an explicit ellipsis, passages are joined with the internal separator so that
      // the enclosing highlighter can split them apart again.
      String ellipsis =
          params.getFieldParam(fieldName, HighlightParams.TAG_ELLIPSIS, SNIPPET_SEPARATOR);
      String encoder = params.getFieldParam(fieldName, HighlightParams.ENCODER, "simple");
      return new DefaultPassageFormatter(preTag, postTag, ellipsis, "html".equals(encoder));
    }

    /** Builds the passage scorer from the per-field {@code hl.score.*} parameters. */
    @Override
    protected PassageScorer getScorer(String fieldName) {
      float k1 = params.getFieldFloat(fieldName, HighlightParams.SCORE_K1, 1.2f);
      float b = params.getFieldFloat(fieldName, HighlightParams.SCORE_B, 0.75f);
      float pivot = params.getFieldFloat(fieldName, HighlightParams.SCORE_PIVOT, 87f);
      return new PassageScorer(k1, b, pivot);
    }

    /**
     * Chooses the break iterator for a field from the fragment size and {@code hl.bs.*}
     * parameters, wrapping it in a {@link LengthGoalBreakIterator} when a fragment size greater
     * than 1 is in effect.
     */
    @Override
    protected BreakIterator getBreakIterator(String field) {
      // Use a default fragsize the same as the regex Fragmenter (original Highlighter) since we're
      // both likely shooting for sentence-like patterns.
      int fragsize =
          params.getFieldInt(
              field, HighlightParams.FRAGSIZE, LuceneRegexFragmenter.DEFAULT_FRAGMENT_SIZE);
      String type = params.getFieldParam(field, HighlightParams.BS_TYPE);
      if (fragsize == 0 || "WHOLE".equals(type)) { // 0 is special value; no fragmenting
        return new WholeBreakIterator();
      }

      BreakIterator baseBI;
      if ("SEPARATOR".equals(type)) {
        char customSep = parseBiSepChar(params.getFieldParam(field, HighlightParams.BS_SEP));
        baseBI = new CustomSeparatorBreakIterator(customSep);
      } else {
        String language = params.getFieldParam(field, HighlightParams.BS_LANGUAGE);
        String country = params.getFieldParam(field, HighlightParams.BS_COUNTRY);
        String variant = params.getFieldParam(field, HighlightParams.BS_VARIANT);
        Locale locale = parseLocale(language, country, variant);
        baseBI = parseBreakIterator(type, locale);
      }

      if (fragsize <= 1) { // no real minimum size
        return baseBI;
      }

      float fragalign = params.getFieldFloat(field, HighlightParams.FRAGALIGNRATIO, 0.33f);
      if (params.getFieldBool(field, HighlightParams.FRAGSIZEISMINIMUM, true)) {
        return LengthGoalBreakIterator.createMinLength(baseBI, fragsize, fragalign);
      }
      return LengthGoalBreakIterator.createClosestToLength(baseBI, fragsize, fragalign);
    }

    /**
     * parse custom separator char for {@link CustomSeparatorBreakIterator}
     *
     * @throws SolrException (BAD_REQUEST) if the value is missing or not exactly one char long
     */
    protected char parseBiSepChar(String sepChar) {
      if (sepChar == null) {
        throw new SolrException(
            SolrException.ErrorCode.BAD_REQUEST, HighlightParams.BS_SEP + " not passed");
      }
      if (sepChar.length() != 1) {
        throw new SolrException(
            SolrException.ErrorCode.BAD_REQUEST,
            HighlightParams.BS_SEP + " must be a single char but got: '" + sepChar + "'");
      }
      return sepChar.charAt(0);
    }

    /**
     * parse a break iterator type for the specified locale
     *
     * @param type one of SENTENCE (also used when {@code null}), LINE, WORD or CHARACTER
     * @throws IllegalArgumentException if the type is not recognized
     */
    protected BreakIterator parseBreakIterator(String type, Locale locale) {
      if (type == null || "SENTENCE".equals(type)) {
        return BreakIterator.getSentenceInstance(locale);
      } else if ("LINE".equals(type)) {
        return BreakIterator.getLineInstance(locale);
      } else if ("WORD".equals(type)) {
        return BreakIterator.getWordInstance(locale);
      } else if ("CHARACTER".equals(type)) {
        return BreakIterator.getCharacterInstance(locale);
      } else {
        throw new IllegalArgumentException("Unknown " + HighlightParams.BS_TYPE + ": " + type);
      }
    }

    /**
     * parse a locale from a language+country+variant spec
     *
     * @return {@link Locale#ROOT} when all three parts are {@code null}
     * @throws IllegalArgumentException if country or variant is given without language, or variant
     *     is given without country
     */
    protected Locale parseLocale(String language, String country, String variant) {
      if (language == null && country == null && variant == null) {
        return Locale.ROOT;
      } else if (language == null) {
        throw new IllegalArgumentException(
            "language is required if country or variant is specified");
      } else if (country == null && variant != null) {
        throw new IllegalArgumentException("To specify variant, country is required");
      } else if (country != null && variant != null) {
        return new Locale(language, country, variant);
      } else if (country != null) {
        return new Locale(language, country);
      } else {
        return new Locale(language);
      }
    }

    /** Delegates to the superclass while accounting the elapsed time to the request timer. */
    @Override
    protected List<CharSequence[]> loadFieldValues(
        String[] fields, DocIdSetIterator docIter, int cacheCharsThreshold) throws IOException {
      // Time loading field values. It can be an expensive part of highlighting.
      loadFieldValuesTimer.resume();
      try {
        return super.loadFieldValues(fields, docIter, cacheCharsThreshold);
      } finally {
        loadFieldValuesTimer.pause(); // note: doesn't need to be "stopped"; pause is fine.
      }
    }

    /**
     * Translates the per-field boolean parameters into highlighter flags. WEIGHT_MATCHES is only
     * enabled when both PHRASES and MULTI_TERM_QUERY are enabled as well.
     */
    @Override
    protected Set<HighlightFlag> getFlags(String field) {
      Set<HighlightFlag> flags = EnumSet.noneOf(HighlightFlag.class);
      if (params.getFieldBool(field, HighlightParams.HIGHLIGHT_MULTI_TERM, true)) {
        flags.add(HighlightFlag.MULTI_TERM_QUERY);
      }
      if (params.getFieldBool(field, HighlightParams.USE_PHRASE_HIGHLIGHTER, true)) {
        flags.add(HighlightFlag.PHRASES);
      }
      flags.add(HighlightFlag.PASSAGE_RELEVANCY_OVER_SPEED);

      if (params.getFieldBool(field, HighlightParams.WEIGHT_MATCHES, true)
          && flags.contains(HighlightFlag.PHRASES)
          && flags.contains(HighlightFlag.MULTI_TERM_QUERY)) {
        flags.add(HighlightFlag.WEIGHT_MATCHES);
      }
      return flags;
    }

    /**
     * Decides which query fields may contribute matches when highlighting {@code field}: only the
     * field itself when a field match is required, else the fields selected by the query field
     * pattern parameter (wildcards expanded against the indexed field names), else every field.
     */
    @Override
    protected Predicate<String> getFieldMatcher(String field) {
      // note that the UH at Lucene level default to effectively "true"
      if (params.getFieldBool(field, HighlightParams.FIELD_MATCH, false)) {
        return field::equals; // requireFieldMatch
      }

      String[] queryFieldPattern =
          params.getFieldParams(field, HighlightParams.QUERY_FIELD_PATTERN);
      if (queryFieldPattern != null && queryFieldPattern.length != 0) {
        // passed as a supplier so the expansion helper decides if and when to fetch the names
        Supplier<Collection<String>> indexedFieldsSupplier =
            () -> solrIndexSearcher.getDocFetcher().getIndexedFieldNames();

        Set<String> fields =
            Set.of(expandWildcardsInFields(indexedFieldsSupplier, queryFieldPattern));

        return fields::contains;
      }

      return NOT_REQUIRED_FIELD_MATCH_PREDICATE;
    }
  }
}