// org.apache.solr.highlight.DefaultSolrHighlighter Maven / Gradle / Ivy
//
// Show more of this group Show more artifacts with this name
// Show all versions of solr-core Show documentation
// Apache Solr (module: core)
// There is a newer version: 9.7.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.highlight;

import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.FilterLeafReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Encoder;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.OffsetLimitTokenFilter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.QueryTermScorer;
import org.apache.lucene.search.highlight.Scorer;
import org.apache.lucene.search.highlight.TextFragment;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.search.highlight.WeightedSpanTerm;
import org.apache.lucene.search.highlight.WeightedSpanTermExtractor;
import org.apache.lucene.search.join.ToChildBlockJoinQuery;
import org.apache.lucene.search.join.ToParentBlockJoinQuery;
import org.apache.lucene.search.vectorhighlight.BoundaryScanner;
import org.apache.lucene.search.vectorhighlight.FastVectorHighlighter;
import org.apache.lucene.search.vectorhighlight.FieldQuery;
import org.apache.lucene.search.vectorhighlight.FragListBuilder;
import org.apache.lucene.search.vectorhighlight.FragmentsBuilder;
import org.apache.lucene.util.AttributeSource.State;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.HighlightParams;
import org.apache.solr.common.params.MapSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.CollectionUtil;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.core.PluginInfo;
import org.apache.solr.core.SolrCore;
import org.apache.solr.handler.component.HighlightComponent;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocList;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.search.SolrReturnFields;
import org.apache.solr.util.plugin.PluginInfoInitialized;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * @since solr 1.3
 */
public class DefaultSolrHighlighter extends SolrHighlighter implements PluginInfoInitialized {

  /**
   * This constant was formerly part of HighlightParams. After deprecation it was removed so clients
   * would no longer use it, but we still support it server side.
   *
   * <p>The parameter name is built from {@code HighlightParams.HIGHLIGHT} followed by {@code
   * ".useFastVectorHighlighter"}.
   */
  private static final String USE_FVH = HighlightParams.HIGHLIGHT + ".useFastVectorHighlighter";

  /** Class logger; the owning class is resolved through {@link MethodHandles#lookup()}. */
  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  /** The core handed to the constructor; {@code init} uses it to initialise plugins. */
  protected final SolrCore solrCore;

  /**
   * Creates a highlighter bound to the given core. Will be invoked via reflection.
   *
   * @param solrCore the core this highlighter keeps a reference to
   */
  public DefaultSolrHighlighter(SolrCore solrCore) {
    this.solrCore = solrCore;
  }

  // Plugin registries, keyed by the configured plugin name. init(PluginInfo) additionally stores
  // the default plugin of each kind under both "" and null, so the map implementation must accept
  // a null key (HashMap does).
  // NOTE(review): HashMap is not synchronized. These registries are only written by
  // init(PluginInfo) in the code visible here; concurrent lookups presumably rely on init having
  // completed before any request is served — confirm.

  /** Formatters by name; looked up by {@code getFormatter}. */
  protected final Map<String, SolrFormatter> formatters = new HashMap<>();

  /** Encoders by name; looked up by {@code getEncoder}. */
  protected final Map<String, SolrEncoder> encoders = new HashMap<>();

  /** Fragmenters by name; looked up by {@code getFragmenter}. */
  protected final Map<String, SolrFragmenter> fragmenters = new HashMap<>();

  /** FragListBuilders by name; looked up by {@code getFragListBuilder}. */
  protected final Map<String, SolrFragListBuilder> fragListBuilders = new HashMap<>();

  /** FragmentsBuilders by name; looked up by {@code getSolrFragmentsBuilder}. */
  protected final Map<String, SolrFragmentsBuilder> fragmentsBuilders = new HashMap<>();

  /** BoundaryScanners by name; looked up by {@code getBoundaryScanner}. */
  protected final Map<String, SolrBoundaryScanner> boundaryScanners = new HashMap<>();

  /**
   * Rebuilds every plugin registry from the supplied plugin configuration.
   *
   * <p>All six registries are emptied first. Then, for each plugin kind, the matching children of
   * {@code info} are loaded through {@code solrCore.initPlugins}; when that call yields {@code
   * null} a built-in implementation is created and initialised via {@code
   * solrCore.initDefaultPlugin}. The resulting plugin is registered under both the empty string and
   * the {@code null} key.
   *
   * @param info plugin configuration whose children describe the individual highlighting plugins
   */
  @Override
  public void init(PluginInfo info) {
    // Discard whatever was registered before loading from the configuration.
    formatters.clear();
    encoders.clear();
    fragmenters.clear();
    fragListBuilders.clear();
    fragmentsBuilders.clear();
    boundaryScanners.clear();

    // --- fragmenters (built-in fallback: GapFragmenter) ---
    SolrFragmenter defaultFragmenter =
        solrCore.initPlugins(
            info.getChildren("fragmenter"), fragmenters, SolrFragmenter.class, null);
    if (defaultFragmenter == null) {
      defaultFragmenter = new GapFragmenter();
      solrCore.initDefaultPlugin(defaultFragmenter, SolrFragmenter.class);
    }
    fragmenters.put("", defaultFragmenter);
    fragmenters.put(null, defaultFragmenter);

    // --- formatters (built-in fallback: HtmlFormatter) ---
    SolrFormatter defaultFormatter =
        solrCore.initPlugins(info.getChildren("formatter"), formatters, SolrFormatter.class, null);
    if (defaultFormatter == null) {
      defaultFormatter = new HtmlFormatter();
      solrCore.initDefaultPlugin(defaultFormatter, SolrFormatter.class);
    }
    formatters.put("", defaultFormatter);
    formatters.put(null, defaultFormatter);

    // --- encoders (built-in fallback: DefaultEncoder) ---
    SolrEncoder defaultEncoder =
        solrCore.initPlugins(info.getChildren("encoder"), encoders, SolrEncoder.class, null);
    if (defaultEncoder == null) {
      defaultEncoder = new DefaultEncoder();
      solrCore.initDefaultPlugin(defaultEncoder, SolrEncoder.class);
    }
    encoders.put("", defaultEncoder);
    encoders.put(null, defaultEncoder);

    // --- fragListBuilders (built-in fallback: SimpleFragListBuilder) ---
    SolrFragListBuilder defaultFragListBuilder =
        solrCore.initPlugins(
            info.getChildren("fragListBuilder"), fragListBuilders, SolrFragListBuilder.class, null);
    if (defaultFragListBuilder == null) {
      defaultFragListBuilder = new SimpleFragListBuilder();
      solrCore.initDefaultPlugin(defaultFragListBuilder, SolrFragListBuilder.class);
    }
    fragListBuilders.put("", defaultFragListBuilder);
    fragListBuilders.put(null, defaultFragListBuilder);

    // --- fragmentsBuilders (built-in fallback: ScoreOrderFragmentsBuilder) ---
    SolrFragmentsBuilder defaultFragmentsBuilder =
        solrCore.initPlugins(
            info.getChildren("fragmentsBuilder"),
            fragmentsBuilders,
            SolrFragmentsBuilder.class,
            null);
    if (defaultFragmentsBuilder == null) {
      defaultFragmentsBuilder = new ScoreOrderFragmentsBuilder();
      solrCore.initDefaultPlugin(defaultFragmentsBuilder, SolrFragmentsBuilder.class);
    }
    fragmentsBuilders.put("", defaultFragmentsBuilder);
    fragmentsBuilders.put(null, defaultFragmentsBuilder);

    // --- boundaryScanners (built-in fallback: SimpleBoundaryScanner) ---
    SolrBoundaryScanner defaultBoundaryScanner =
        solrCore.initPlugins(
            info.getChildren("boundaryScanner"), boundaryScanners, SolrBoundaryScanner.class, null);
    if (defaultBoundaryScanner == null) {
      defaultBoundaryScanner = new SimpleBoundaryScanner();
      solrCore.initDefaultPlugin(defaultBoundaryScanner, SolrBoundaryScanner.class);
    }
    boundaryScanners.put("", defaultBoundaryScanner);
    boundaryScanners.put(null, defaultBoundaryScanner);
  }

  /**
   * Builds a phrase-aware {@link org.apache.lucene.search.highlight.Highlighter} for one field.
   *
   * <p>The highlighter is assembled from the field's formatter, encoder and span query scorer, and
   * its text fragmenter is set from the field's configured fragmenter.
   *
   * @param query The current Query
   * @param fieldName The name of the field
   * @param request The current SolrQueryRequest
   * @param tokenStream document text tokenStream that implements reset() efficiently (e.g.
   *     CachingTokenFilter). If it's used, call reset() first.
   * @return the configured highlighter
   * @throws IOException If there is a low-level I/O error.
   */
  protected Highlighter getPhraseHighlighter(
      Query query, String fieldName, SolrQueryRequest request, TokenStream tokenStream)
      throws IOException {
    final SolrParams reqParams = request.getParams();

    // Resolved in the same order the constructor arguments are evaluated.
    final Formatter fieldFormatter = getFormatter(fieldName, reqParams);
    final Encoder fieldEncoder = getEncoder(fieldName, reqParams);
    final QueryScorer spanScorer = getSpanQueryScorer(query, fieldName, tokenStream, request);

    final Highlighter phraseHighlighter = new Highlighter(fieldFormatter, fieldEncoder, spanScorer);
    phraseHighlighter.setTextFragmenter(getFragmenter(fieldName, reqParams));
    return phraseHighlighter;
  }

  /**
   * Builds a term-based {@link org.apache.lucene.search.highlight.Highlighter} for one field.
   *
   * <p>The highlighter is assembled from the field's formatter, encoder and query scorer, and its
   * text fragmenter is set from the field's configured fragmenter.
   *
   * @param query The current Query
   * @param fieldName The name of the field
   * @param request The current SolrQueryRequest
   * @return the configured highlighter
   */
  protected Highlighter getHighlighter(Query query, String fieldName, SolrQueryRequest request) {
    final SolrParams reqParams = request.getParams();

    // Resolved in the same order the constructor arguments are evaluated.
    final Formatter fieldFormatter = getFormatter(fieldName, reqParams);
    final Encoder fieldEncoder = getEncoder(fieldName, reqParams);
    final Scorer termScorer = getQueryScorer(query, fieldName, request);

    final Highlighter termHighlighter = new Highlighter(fieldFormatter, fieldEncoder, termScorer);
    termHighlighter.setTextFragmenter(getFragmenter(fieldName, reqParams));
    return termHighlighter;
  }

  /**
   * Creates a {@link org.apache.lucene.search.highlight.QueryScorer} for this query and field.
   *
   * <p>The scorer is restricted to {@code fieldName} only when the per-field {@code
   * HighlightParams.FIELD_MATCH} param is true; otherwise it is created with a {@code null} field.
   * Its term extractor is replaced by {@code CustomSpanTermExtractor}. Multi-term expansion follows
   * {@code HighlightParams.HIGHLIGHT_MULTI_TERM} (default true). Payload usage follows the
   * per-field {@code HighlightParams.PAYLOADS} param, whose default is whether the indexed terms
   * of the field report payloads (or true when that cannot be determined).
   *
   * @param query The current query
   * @param fieldName The name of the field
   * @param tokenStream document text tokenStream that implements reset() efficiently (e.g.
   *     CachingTokenFilter). If it's used, call reset() first. This implementation does not read
   *     it.
   * @param request The SolrQueryRequest
   * @return the configured scorer
   */
  protected QueryScorer getSpanQueryScorer(
      Query query, String fieldName, TokenStream tokenStream, SolrQueryRequest request) {
    final boolean requireFieldMatch =
        request.getParams().getFieldBool(fieldName, HighlightParams.FIELD_MATCH, false);
    final String scorerField = requireFieldMatch ? fieldName : null;

    final QueryScorer spanScorer =
        new QueryScorer(query, scorerField) {
          @Override
          protected WeightedSpanTermExtractor newTermExtractor(String defaultField) {
            return new CustomSpanTermExtractor(defaultField);
          }
        };

    final boolean expandMultiTerm =
        request.getParams().getBool(HighlightParams.HIGHLIGHT_MULTI_TERM, true);
    spanScorer.setExpandMultiTermQuery(expandMultiTerm);

    // Default for payload usage: stays true unless the field's terms say otherwise.
    boolean payloadsByDefault = true;
    try {
      // It'd be nice to know if payloads are on the tokenStream but the presence of the attribute
      // isn't a good indicator.
      final Terms fieldTerms = request.getSearcher().getSlowAtomicReader().terms(fieldName);
      if (fieldTerms != null) {
        payloadsByDefault = fieldTerms.hasPayloads();
      }
    } catch (IOException e) {
      // Best effort: keep the default and carry on.
      log.error("Couldn't check for existence of payloads", e);
    }

    final boolean usePayloads =
        request.getParams().getFieldBool(fieldName, HighlightParams.PAYLOADS, payloadsByDefault);
    spanScorer.setUsePayloads(usePayloads);
    return spanScorer;
  }

  /**
   * Span-term extractor that unwraps block-join queries before delegating to the stock extraction
   * logic: a {@link ToParentBlockJoinQuery} is replaced by its child query and a {@link
   * ToChildBlockJoinQuery} by its parent query. Every other query is passed to the superclass
   * unchanged.
   */
  private static class CustomSpanTermExtractor extends WeightedSpanTermExtractor {
    /**
     * @param defaultField forwarded unchanged to the {@link WeightedSpanTermExtractor} constructor
     */
    public CustomSpanTermExtractor(String defaultField) {
      super(defaultField);
    }

    /**
     * Extracts weighted terms from {@code query} into {@code terms}, unwrapping block-join queries
     * first.
     *
     * @param query the query to extract from
     * @param boost boost passed through to the nested extraction
     * @param terms accumulator of extracted terms, keyed by term text
     * @throws IOException propagated from the underlying extraction
     */
    @Override
    protected void extract(Query query, float boost, Map<String, WeightedSpanTerm> terms)
        throws IOException {
      // these queries are not supported in lucene highlighting out of the box since 8.0
      if (query instanceof ToParentBlockJoinQuery) {
        // Recurse so that nested joins are unwrapped too.
        extract(((ToParentBlockJoinQuery) query).getChildQuery(), boost, terms);
      } else if (query instanceof ToChildBlockJoinQuery) {
        extract(((ToChildBlockJoinQuery) query).getParentQuery(), boost, terms);
      } else {
        super.extract(query, boost, terms);
      }
    }
  }

  /**
   * Creates a {@link org.apache.lucene.search.highlight.Scorer} for this query and field.
   *
   * <p>When the per-field {@code HighlightParams.FIELD_MATCH} param is true the scorer is built
   * with the searcher's index reader and the field name; otherwise it is built from the query
   * alone.
   *
   * @param query The current query
   * @param fieldName The name of the field
   * @param request The SolrQueryRequest
   * @return a {@link QueryTermScorer} for the query
   */
  protected Scorer getQueryScorer(Query query, String fieldName, SolrQueryRequest request) {
    final SolrParams reqParams = request.getParams();
    if (!reqParams.getFieldBool(fieldName, HighlightParams.FIELD_MATCH, false)) {
      // No field restriction requested.
      return new QueryTermScorer(query);
    }
    return new QueryTermScorer(query, request.getSearcher().getIndexReader(), fieldName);
  }

  /**
   * Returns the maximum number of snippets for this field, read from the per-field {@code
   * HighlightParams.SNIPPETS} param with a fallback of 1.
   *
   * @param fieldName The name of the field
   * @param params The params controlling Highlighting
   * @return the snippet limit for the field
   */
  protected int getMaxSnippets(String fieldName, SolrParams params) {
    final int fallbackSnippets = 1;
    return params.getFieldInt(fieldName, HighlightParams.SNIPPETS, fallbackSnippets);
  }

  /**
   * Tells whether adjacent fragments should be merged, read from the per-field {@code
   * HighlightParams.MERGE_CONTIGUOUS_FRAGMENTS} param with a fallback of false.
   *
   * @param fieldName The name of the field
   * @param params The params controlling Highlighting
   * @return true when contiguous fragments should be merged
   */
  protected boolean isMergeContiguousFragments(String fieldName, SolrParams params) {
    final boolean mergeByDefault = false;
    return params.getFieldBool(
        fieldName, HighlightParams.MERGE_CONTIGUOUS_FRAGMENTS, mergeByDefault);
  }

  /**
   * Resolves the {@link org.apache.lucene.search.highlight.Formatter} for this field.
   *
   * <p>The formatter name comes from the per-field {@code HighlightParams.FORMATTER} param and is
   * looked up in the {@code formatters} registry; the matching {@link SolrFormatter} then produces
   * the formatter.
   *
   * @param fieldName The name of the field
   * @param params The params controlling Highlighting
   * @return An appropriate {@link org.apache.lucene.search.highlight.Formatter}.
   * @throws SolrException with {@code BAD_REQUEST} when no formatter is registered under the name
   */
  protected Formatter getFormatter(String fieldName, SolrParams params) {
    final String formatterName = params.getFieldParam(fieldName, HighlightParams.FORMATTER);
    final SolrFormatter registered = formatters.get(formatterName);
    if (registered != null) {
      return registered.getFormatter(fieldName, params);
    }
    throw new SolrException(
        SolrException.ErrorCode.BAD_REQUEST, "Unknown formatter: " + formatterName);
  }

  /**
   * Resolves the {@link org.apache.lucene.search.highlight.Encoder} for this field.
   *
   * <p>The encoder name comes from the per-field {@code HighlightParams.ENCODER} param and is
   * looked up in the {@code encoders} registry; the matching {@link SolrEncoder} then produces the
   * encoder.
   *
   * @param fieldName The name of the field
   * @param params The params controlling Highlighting
   * @return An appropriate {@link org.apache.lucene.search.highlight.Encoder}.
   * @throws SolrException with {@code BAD_REQUEST} when no encoder is registered under the name
   */
  protected Encoder getEncoder(String fieldName, SolrParams params) {
    final String encoderName = params.getFieldParam(fieldName, HighlightParams.ENCODER);
    final SolrEncoder registered = encoders.get(encoderName);
    if (registered != null) {
      return registered.getEncoder(fieldName, params);
    }
    throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Unknown encoder: " + encoderName);
  }

  /**
   * Resolves the {@link org.apache.lucene.search.highlight.Fragmenter} for this field.
   *
   * <p>The fragmenter name comes from the per-field {@code HighlightParams.FRAGMENTER} param and is
   * looked up in the {@code fragmenters} registry; the matching {@link SolrFragmenter} then
   * produces the fragmenter.
   *
   * @param fieldName The name of the field
   * @param params The params controlling Highlighting
   * @return An appropriate {@link org.apache.lucene.search.highlight.Fragmenter}.
   * @throws SolrException with {@code BAD_REQUEST} when no fragmenter is registered under the name
   */
  protected Fragmenter getFragmenter(String fieldName, SolrParams params) {
    final String fragmenterName = params.getFieldParam(fieldName, HighlightParams.FRAGMENTER);
    final SolrFragmenter registered = fragmenters.get(fragmenterName);
    if (registered != null) {
      return registered.getFragmenter(fieldName, params);
    }
    throw new SolrException(
        SolrException.ErrorCode.BAD_REQUEST, "Unknown fragmenter: " + fragmenterName);
  }

  /**
   * Resolves the {@link FragListBuilder} for this field.
   *
   * <p>The builder name comes from the per-field {@code HighlightParams.FRAG_LIST_BUILDER} param
   * and is looked up in the {@code fragListBuilders} registry.
   *
   * @param fieldName The name of the field
   * @param params The params controlling Highlighting
   * @return the builder produced by the registered {@link SolrFragListBuilder}
   * @throws SolrException with {@code BAD_REQUEST} when no builder is registered under the name
   */
  protected FragListBuilder getFragListBuilder(String fieldName, SolrParams params) {
    final String builderName = params.getFieldParam(fieldName, HighlightParams.FRAG_LIST_BUILDER);
    final SolrFragListBuilder registered = fragListBuilders.get(builderName);
    if (registered != null) {
      return registered.getFragListBuilder(params);
    }
    throw new SolrException(
        SolrException.ErrorCode.BAD_REQUEST, "Unknown fragListBuilder: " + builderName);
  }

  /**
   * Resolves the {@link FragmentsBuilder} for this field, wiring in the field's boundary scanner.
   *
   * @param fieldName The name of the field
   * @param params The params controlling Highlighting
   * @return the builder produced by the field's {@link SolrFragmentsBuilder}
   */
  protected FragmentsBuilder getFragmentsBuilder(String fieldName, SolrParams params) {
    // The scanner is resolved first, then the builder that consumes it.
    final BoundaryScanner scanner = getBoundaryScanner(fieldName, params);
    final SolrFragmentsBuilder solrBuilder = getSolrFragmentsBuilder(fieldName, params);
    return solrBuilder.getFragmentsBuilder(params, scanner);
  }

  /**
   * Looks up the {@link SolrFragmentsBuilder} registered for this field.
   *
   * <p>The builder name comes from the per-field {@code HighlightParams.FRAGMENTS_BUILDER} param
   * and is looked up in the {@code fragmentsBuilders} registry.
   *
   * @param fieldName The name of the field
   * @param params The params controlling Highlighting
   * @return the registered builder
   * @throws SolrException with {@code BAD_REQUEST} when no builder is registered under the name
   */
  protected SolrFragmentsBuilder getSolrFragmentsBuilder(String fieldName, SolrParams params) {
    final String builderName = params.getFieldParam(fieldName, HighlightParams.FRAGMENTS_BUILDER);
    final SolrFragmentsBuilder registered = fragmentsBuilders.get(builderName);
    if (registered != null) {
      return registered;
    }
    throw new SolrException(
        SolrException.ErrorCode.BAD_REQUEST, "Unknown fragmentsBuilder: " + builderName);
  }

  /**
   * Resolves the {@link BoundaryScanner} for this field.
   *
   * <p>The scanner name comes from the per-field {@code HighlightParams.BOUNDARY_SCANNER} param and
   * is looked up in the {@code boundaryScanners} registry.
   *
   * @param fieldName The name of the field
   * @param params The params controlling Highlighting
   * @return the scanner produced by the registered {@link SolrBoundaryScanner}
   * @throws SolrException with {@code BAD_REQUEST} when no scanner is registered under the name
   */
  protected BoundaryScanner getBoundaryScanner(String fieldName, SolrParams params) {
    final String scannerName = params.getFieldParam(fieldName, HighlightParams.BOUNDARY_SCANNER);
    final SolrBoundaryScanner registered = boundaryScanners.get(scannerName);
    if (registered != null) {
      return registered.getBoundaryScanner(fieldName, params);
    }
    throw new SolrException(
        SolrException.ErrorCode.BAD_REQUEST, "Unknown boundaryScanner: " + scannerName);
  }

  /**
   * Generates a list of Highlighted query fragments for each item in a list of documents, or
   * returns null if highlighting is disabled.
   *
   * @param docs query results
   * @param query the query
   * @param req the current request
   * @param defaultFields default list of fields to summarize
   * @return NamedList containing a NamedList for each document, which in turns contains sets
   *     (field, summary) pairs.
   */
  @Override
  public NamedList