/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.matchhighlight;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.PrimitiveIterator;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.Callable;
import java.util.concurrent.ForkJoinPool;
import java.util.function.Predicate;
import java.util.function.ToIntFunction;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.index.StoredFields;
import org.apache.lucene.search.FilterMatchesIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Matches;
import org.apache.lucene.search.MatchesIterator;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.TaskExecutor;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.Weight;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.IOConsumer;
/**
* Utility class to compute a list of "match regions" for a given query, searcher and document(s)
 * using the {@link Matches} API.
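 *
 * <p>A minimal usage sketch (hypothetical index, field and term names; assumes the highlighted
 * field was indexed and stored):
 *
 * <pre>{@code
 * IndexSearcher searcher = new IndexSearcher(indexReader);
 * Query query = new TermQuery(new Term("body", "lucene"));
 * MatchRegionRetriever retriever =
 *     new MatchRegionRetriever(
 *         searcher,
 *         searcher.rewrite(query),
 *         new StandardAnalyzer(),
 *         field -> false, // no unconditionally preloaded fields
 *         field -> true); // highlight any field the query touches
 * retriever.highlightDocuments(
 *     searcher.search(query, 10),
 *     (docId, leafReader, leafDocId, doc, hits) -> {
 *       // hits maps each field name to the sorted offset ranges of query matches.
 *     });
 * }</pre>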
*/
public class MatchRegionRetriever {
  private final List<LeafReaderContext> leaves;
  private final Weight weight;
  private final Map<String, OffsetsRetrievalStrategy> offsetStrategies;
  private final TreeSet<String> queryAffectedHighlightedFields;
  private final Predicate<String> shouldLoadStoredField;
private final IndexSearcher searcher;
/**
* A callback invoked for each document selected by the query. The callback receives a list of hit
* ranges, document field value accessor, the leaf reader and document ID of the document.
*/
@FunctionalInterface
public interface MatchOffsetsConsumer {
/**
* @param docId Document id (global).
* @param leafReader Document's {@link LeafReader}.
* @param leafDocId Document id (within the {@link LeafReader}).
* @param fieldValueProvider Access to preloaded document fields. See the {@link
* MatchRegionRetriever#MatchRegionRetriever(IndexSearcher, Query,
* OffsetsRetrievalStrategySupplier, Predicate, Predicate)} constructor's documentation for
* guidelines on which fields are available through this interface.
* @param hits A map of field names and offset ranges with query hits.
*/
void accept(
int docId,
LeafReader leafReader,
int leafDocId,
FieldValueProvider fieldValueProvider,
        Map<String, List<OffsetRange>> hits)
throws IOException;
}
/**
* Access to field values of the highlighted document. See the {@link
* MatchRegionRetriever#MatchRegionRetriever(IndexSearcher, Query,
* OffsetsRetrievalStrategySupplier, Predicate, Predicate)} constructor's documentation for
* guidelines on which fields are available through this interface.
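   *
   * <p>A sketch of typical access from within a {@link MatchOffsetsConsumer}:
   *
   * <pre>{@code
   * (docId, leafReader, leafDocId, doc, hits) -> {
   *   for (String field : doc) { // iterates over the names of loaded fields
   *     List<String> values = doc.getValues(field);
   *   }
   * }
   * }</pre>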
*/
  public interface FieldValueProvider extends Iterable<String> {
/**
     * @return A list of values for the provided field name, or {@code null} if the field was not
     *     loaded or does not exist for the document.
*/
    List<String> getValues(String field);
}
/**
* This constructor uses the default offset strategy supplier from {@link
* #computeOffsetRetrievalStrategies(IndexReader, Analyzer)}.
*
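   * <p>For example (the {@code "id"} field name is hypothetical), to preload an {@code "id"}
   * field unconditionally and highlight every other field the query touches:
   *
   * <pre>{@code
   * new MatchRegionRetriever(searcher, query, analyzer,
   *     field -> field.equals("id"), // always loaded, available via FieldValueProvider
   *     field -> true);              // highlight any field with query hits
   * }</pre>
   *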
* @param searcher The {@link IndexSearcher} used to execute the query. The index searcher's
* {@linkplain IndexSearcher#getTaskExecutor() task executor} is also used for computing
* highlights concurrently.
* @param query The query for which highlights should be returned.
* @param analyzer An analyzer that may be used to reprocess (retokenize) document fields in the
* absence of position offsets in the index. Note that the analyzer must return tokens
* (positions and offsets) identical to the ones stored in the index.
* @param fieldsToLoadUnconditionally A custom predicate that should return {@code true} for any
* field that should be preloaded and made available through {@link FieldValueProvider},
* regardless of whether the query affected the field or not. This predicate can be used to
* load additional fields during field highlighting, making them available to {@link
* MatchOffsetsConsumer}s.
* @param fieldsToLoadIfWithHits A custom predicate that should return {@code true} for fields
* that should be highlighted. Typically, this would always return {@code true} indicating any
* field affected by the query should be highlighted. However, sometimes highlights may not be
* needed: for example, if they affect fields that are only used for filtering purposes.
* Returning {@code false} for such fields saves the costs of loading those fields into memory
* and scanning through field matches.
*/
public MatchRegionRetriever(
IndexSearcher searcher,
Query query,
Analyzer analyzer,
      Predicate<String> fieldsToLoadUnconditionally,
      Predicate<String> fieldsToLoadIfWithHits)
throws IOException {
this(
searcher,
query,
computeOffsetRetrievalStrategies(searcher.getIndexReader(), analyzer),
fieldsToLoadUnconditionally,
fieldsToLoadIfWithHits);
}
/**
* @param searcher The {@link IndexSearcher} used to execute the query. The index searcher's
* {@linkplain IndexSearcher#getTaskExecutor() task executor} is also used for computing
* highlights concurrently.
* @param query The query for which matches should be retrieved. The query should be rewritten
* against the provided searcher.
* @param fieldOffsetStrategySupplier A custom supplier of per-field {@link
* OffsetsRetrievalStrategy} instances.
* @param fieldsToLoadUnconditionally A custom predicate that should return {@code true} for any
* field that should be preloaded and made available through {@link FieldValueProvider},
* regardless of whether the query affected the field or not. This predicate can be used to
* load additional fields during field highlighting, making them available to {@link
* MatchOffsetsConsumer}s.
* @param fieldsToLoadIfWithHits A custom predicate that should return {@code true} for fields
* that should be highlighted. Typically, this would always return {@code true} indicating any
* field affected by the query should be highlighted. However, sometimes highlights may not be
* needed: for example, if they affect fields that are only used for filtering purposes.
* Returning {@code false} for such fields saves the costs of loading those fields into memory
* and scanning through field matches.
*/
public MatchRegionRetriever(
IndexSearcher searcher,
Query query,
OffsetsRetrievalStrategySupplier fieldOffsetStrategySupplier,
      Predicate<String> fieldsToLoadUnconditionally,
      Predicate<String> fieldsToLoadIfWithHits)
throws IOException {
this.searcher = searcher;
leaves = searcher.getIndexReader().leaves();
assert checkOrderConsistency(leaves);
// We need full scoring mode so that we can receive matches from all sub-clauses
// (no optimizations in Boolean queries take place).
weight = searcher.createWeight(query, ScoreMode.COMPLETE, 0);
// Compute a subset of fields affected by this query and for which highlights should be
// returned, so that we don't load or scan fields that are irrelevant.
queryAffectedHighlightedFields = new TreeSet<>();
query.visit(
new QueryVisitor() {
@Override
public boolean acceptField(String field) {
if (fieldsToLoadIfWithHits.test(field)) {
queryAffectedHighlightedFields.add(field);
}
return false;
}
});
// Compute value offset retrieval strategy for all affected fields.
offsetStrategies = new HashMap<>();
for (String field : queryAffectedHighlightedFields) {
offsetStrategies.put(field, fieldOffsetStrategySupplier.apply(field));
}
shouldLoadStoredField =
(field) -> {
return fieldsToLoadUnconditionally.test(field)
|| queryAffectedHighlightedFields.contains(field);
};
}
/**
* Processes {@link TopDocs} with reasonable defaults. See variants of this method for low-level
* tuning parameters.
*
* @see #highlightDocuments(PrimitiveIterator.OfInt, MatchOffsetsConsumer, ToIntFunction, int,
* int)
* @param topDocs Search results.
* @param consumer A streaming consumer for document-hits pairs.
*/
public void highlightDocuments(TopDocs topDocs, MatchOffsetsConsumer consumer)
throws IOException {
highlightDocuments(
Arrays.stream(topDocs.scoreDocs).mapToInt(scoreDoc -> scoreDoc.doc).sorted().iterator(),
consumer,
field -> Integer.MAX_VALUE);
}
/**
* Low-level, high-efficiency method for highlighting large numbers of documents at once.
*
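   * <p>For example (the {@code "title"} field name is hypothetical), to cap scanning at 5 hit
   * regions per field while keeping all hits in {@code "title"}:
   *
   * <pre>{@code
   * retriever.highlightDocuments(docIds, consumer,
   *     field -> "title".equals(field) ? Integer.MAX_VALUE : 5);
   * }</pre>
   *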
* @param docIds A stream of sorted document identifiers for which hit ranges should be
* returned.
* @param consumer A streaming consumer for document-hits pairs.
   * @param maxHitsPerField A function that should, for the provided field, return the maximum
   *     number of hit regions to consider when scoring passages. The function should return {@link
* Integer#MAX_VALUE} for all hits to be considered, although typically 3-10 hits are
* sufficient and lead to performance savings in long fields with large numbers of hit ranges.
* @see #highlightDocuments(PrimitiveIterator.OfInt, MatchOffsetsConsumer, ToIntFunction, int,
* int)
*/
public void highlightDocuments(
PrimitiveIterator.OfInt docIds,
MatchOffsetsConsumer consumer,
      ToIntFunction<String> maxHitsPerField)
throws IOException {
// Typically enough to saturate a single processing thread and be large enough to
// compensate for overhead of concurrency.
final int DEFAULT_MAX_BLOCK_SIZE = 50;
highlightDocuments(
docIds,
consumer,
maxHitsPerField,
DEFAULT_MAX_BLOCK_SIZE,
ForkJoinPool.getCommonPoolParallelism());
}
/**
* Low-level, high-efficiency method for highlighting large numbers of documents at once.
*
   * <p>Document IDs are grouped into sequential "blocks". For each block, highlights are computed
   * (this can use parallel threads, if the {@link IndexSearcher#getTaskExecutor()} can execute
   * tasks in parallel). Finally, processed highlights are passed to the {@code consumer}.
*
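   * <p>For example, to prepare blocks of up to 16 documents and process them strictly
   * sequentially:
   *
   * <pre>{@code
   * retriever.highlightDocuments(
   *     docIds,
   *     consumer,
   *     field -> Integer.MAX_VALUE, // no per-field hit cap
   *     16, // maxBlockSize
   *     1); // maxBlocksProcessedInParallel: drain each block on the calling thread
   * }</pre>
   *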
* @param docIds A stream of sorted document identifiers for which hit ranges should be
* returned.
* @param consumer A streaming consumer for document-query hits pairs. This consumer will be
* called sequentially, with document ordering corresponding to that of the query results.
   * @param maxHitsPerField A function that should, for the provided field, return the maximum
   *     number of hit regions to consider when scoring passages. The function should return {@link
* Integer#MAX_VALUE} for all hits to be considered, although typically 3-10 hits are
* sufficient and lead to performance savings in long fields with large numbers of hit ranges.
* @param maxBlockSize The maximum size of a single contiguous "block" of documents. Each block
* can be processed in parallel, using the index searcher's task executor.
* @param maxBlocksProcessedInParallel Maximum number of queued document "blocks"; when reached,
* the queue is processed (possibly concurrently) and then passed to the {@code consumer}. Set
* this value to {@code 1} to process blocks sequentially.
*/
public void highlightDocuments(
PrimitiveIterator.OfInt docIds,
MatchOffsetsConsumer consumer,
      ToIntFunction<String> maxHitsPerField,
int maxBlockSize,
int maxBlocksProcessedInParallel)
throws IOException {
if (leaves.isEmpty()) {
return;
}
    ArrayList<Callable<DocHighlightData[]>> blockQueue = new ArrayList<>();
    TaskExecutor taskExecutor = searcher.getTaskExecutor();
    IOConsumer<List<Callable<DocHighlightData[]>>> drainQueue;
if (maxBlocksProcessedInParallel == 1) {
// Sequential, own-thread processing.
drainQueue =
(queue) -> {
for (var callable : queue) {
try {
processBlock(callable.call(), consumer);
} catch (Exception e) {
throw new IOException(e);
}
}
queue.clear();
};
} else {
// Potentially concurrent processing via IndexSearcher's TaskExecutor.
drainQueue =
(queue) -> {
for (var highlightData : taskExecutor.invokeAll(queue)) {
processBlock(highlightData, consumer);
}
queue.clear();
};
}
// Collect blocks.
int previousDocId = -1;
int[] block = new int[maxBlockSize];
int blockPos = 0;
while (docIds.hasNext()) {
int docId = docIds.nextInt();
if (docId < previousDocId) {
throw new RuntimeException("Input document IDs must be sorted (increasing).");
}
previousDocId = docId;
block[blockPos++] = docId;
if (blockPos >= maxBlockSize || !docIds.hasNext()) {
final int[] idBlock = ArrayUtil.copyOfSubArray(block, 0, blockPos);
blockQueue.add(() -> prepareBlock(idBlock, maxHitsPerField));
blockPos = 0;
if (blockQueue.size() >= maxBlocksProcessedInParallel) {
drainQueue.accept(blockQueue);
}
}
}
// Finalize any remaining blocks.
if (!blockQueue.isEmpty()) {
drainQueue.accept(blockQueue);
}
}
private record DocHighlightData(
int docId,
LeafReader leafReader,
int leafDocId,
FieldValueProvider fieldValueProvider,
      Map<String, List<OffsetRange>> hits) {}
  private DocHighlightData[] prepareBlock(int[] idBlock, ToIntFunction<String> maxHitsPerField)
throws IOException {
DocHighlightData[] docData = new DocHighlightData[idBlock.length];
    Iterator<LeafReaderContext> ctx = leaves.iterator();
LeafReaderContext currentContext = ctx.next();
LeafReader reader = currentContext.reader();
for (int i = 0; i < idBlock.length; i++) {
final int docId = idBlock[i];
while (docId >= currentContext.docBase + reader.maxDoc()) {
currentContext = ctx.next();
reader = currentContext.reader();
}
int contextRelativeDocId = docId - currentContext.docBase;
var fieldVisitor = new StoredFieldsVisitor(shouldLoadStoredField);
StoredFields storedFields = reader.storedFields();
storedFields.document(contextRelativeDocId, fieldVisitor);
      Map<String, List<OffsetRange>> highlights = new TreeMap<>();
highlightDocument(
currentContext, contextRelativeDocId, fieldVisitor, maxHitsPerField, highlights);
docData[i] =
new DocHighlightData(docId, reader, contextRelativeDocId, fieldVisitor, highlights);
}
return docData;
}
private void processBlock(DocHighlightData[] docHighlightData, MatchOffsetsConsumer consumer)
throws IOException {
for (var data : docHighlightData) {
      consumer.accept(
          data.docId(), data.leafReader(), data.leafDocId(), data.fieldValueProvider(), data.hits());
}
}
/**
* Low-level method for retrieving hit ranges for a single document. This method can be used with
* custom document {@link FieldValueProvider}.
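   *
   * <p>A sketch with a custom, in-memory {@link FieldValueProvider} (field name and value are
   * hypothetical; assumes a retriever and a leaf reader context are in scope):
   *
   * <pre>{@code
   * Map<String, List<String>> values = Map.of("body", List.of("full text of the field"));
   * FieldValueProvider doc =
   *     new FieldValueProvider() {
   *       public List<String> getValues(String field) {
   *         return values.get(field);
   *       }
   *
   *       public Iterator<String> iterator() {
   *         return values.keySet().iterator();
   *       }
   *     };
   * Map<String, List<OffsetRange>> hits = new TreeMap<>();
   * retriever.highlightDocument(leafContext, leafDocId, doc, field -> Integer.MAX_VALUE, hits);
   * }</pre>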
*/
public void highlightDocument(
LeafReaderContext leafReaderContext,
int contextDocId,
FieldValueProvider doc,
      ToIntFunction<String> maxHitsPerField,
      Map<String, List<OffsetRange>> outputHighlights)
throws IOException {
Matches matches = weight.matches(leafReaderContext, contextDocId);
if (matches == null) {
return;
}
for (String field : queryAffectedHighlightedFields) {
MatchesIterator matchesIterator = matches.getMatches(field);
if (matchesIterator == null) {
        // No matches on this field, even though the field was part of the query. This may be
        // possible with complex queries that source non-text fields (which have no "hit regions"
        // in any textual representation). Skip.
} else {
OffsetsRetrievalStrategy offsetStrategy = offsetStrategies.get(field);
if (offsetStrategy == null) {
throw new IOException(
"Non-empty matches but no offset retrieval strategy for field: " + field);
}
var delegate = offsetStrategy;
// Limit the number of hits so that we're not extracting dozens just to trim them to a few
// in the end.
final int maxHits = maxHitsPerField.applyAsInt(field);
if (maxHits != Integer.MAX_VALUE) {
offsetStrategy =
(matchesIterator1, doc1) ->
delegate.get(new MatchesIteratorWithLimit(matchesIterator1, maxHits), doc1);
}
        List<OffsetRange> ranges = offsetStrategy.get(matchesIterator, doc);
if (!ranges.isEmpty()) {
outputHighlights.put(field, ranges);
}
}
}
}
private static class MatchesIteratorWithLimit extends FilterMatchesIterator {
private int limit;
public MatchesIteratorWithLimit(MatchesIterator matchesIterator, int limit) {
super(matchesIterator);
if (limit < 0) {
        throw new IllegalArgumentException("limit must be non-negative: " + limit);
}
this.limit = limit;
}
@Override
public boolean next() throws IOException {
if (limit == 0) {
return false;
}
limit--;
return super.next();
}
}
  private boolean checkOrderConsistency(List<LeafReaderContext> leaves) {
for (int i = 1; i < leaves.size(); i++) {
LeafReaderContext prev = leaves.get(i - 1);
LeafReaderContext next = leaves.get(i);
assert prev.docBase <= next.docBase;
assert prev.docBase + prev.reader().maxDoc() == next.docBase;
}
return true;
}
/**
* Compute default strategies for retrieving offsets from {@link MatchesIterator} instances for a
* set of given fields.
*/
public static OffsetsRetrievalStrategySupplier computeOffsetRetrievalStrategies(
IndexReader reader, Analyzer analyzer) {
FieldInfos fieldInfos = FieldInfos.getMergedFieldInfos(reader);
return (field) -> {
FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
if (fieldInfo == null) {
return (mi, doc) -> {
throw new IOException("FieldInfo is null for field: " + field);
};
}
switch (fieldInfo.getIndexOptions()) {
case DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS:
return new OffsetsFromMatchIterator(field, new OffsetsFromPositions(field, analyzer));
case DOCS_AND_FREQS_AND_POSITIONS:
return new OffsetsFromPositions(field, analyzer);
case DOCS_AND_FREQS:
case DOCS:
        // By default retrieve offsets from individual tokens
        // retrieved by the analyzer (possibly narrowed down to
        // only those terms that the query hinted at when passed
        // a QueryVisitor).
//
// Alternative strategies are also possible and may make sense
// depending on the use case (OffsetsFromValues, for example).
return new OffsetsFromTokens(field, analyzer);
case NONE:
default:
return (matchesIterator, doc) -> {
throw new IOException(
"Field is indexed without positions and/or offsets: "
+ field
+ ", "
+ fieldInfo.getIndexOptions());
};
}
};
}
private static class StoredFieldsVisitor extends StoredFieldVisitor
implements FieldValueProvider {
    private final Predicate<String> needsField;
    private final LinkedHashMap<String, List<String>> fieldValues = new LinkedHashMap<>();
    public StoredFieldsVisitor(Predicate<String> shouldLoadStoredField) {
this.needsField = shouldLoadStoredField;
}
@Override
public Status needsField(FieldInfo fieldInfo) throws IOException {
return needsField.test(fieldInfo.name) ? Status.YES : Status.NO;
}
@Override
    public List<String> getValues(String field) {
      List<String> values = fieldValues.get(field);
return values == null ? List.of() : values;
}
@Override
public void stringField(FieldInfo fieldInfo, String value) throws IOException {
addField(fieldInfo, value);
}
@Override
public void intField(FieldInfo fieldInfo, int value) throws IOException {
addField(fieldInfo, Integer.toString(value));
}
@Override
public void longField(FieldInfo fieldInfo, long value) throws IOException {
addField(fieldInfo, Long.toString(value));
}
@Override
public void floatField(FieldInfo fieldInfo, float value) throws IOException {
addField(fieldInfo, Float.toString(value));
}
@Override
public void doubleField(FieldInfo fieldInfo, double value) throws IOException {
addField(fieldInfo, Double.toString(value));
}
private void addField(FieldInfo field, String value) {
fieldValues.computeIfAbsent(field.name, v -> new ArrayList<>()).add(value);
}
@Override
    public Iterator<String> iterator() {
return fieldValues.keySet().iterator();
}
}
}