org.apache.lucene.search.matchhighlight.MatchRegionRetriever Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-highlighter Show documentation
Apache Lucene (module: highlighter)
There is a newer version: 10.0.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.matchhighlight;

import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.PrimitiveIterator;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.function.Predicate;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Matches;
import org.apache.lucene.search.MatchesIterator;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.Weight;
import org.apache.lucene.util.IOSupplier;

/**
 * Utility class to compute a list of "match regions" for a given query, searcher and document(s)
 * using {@link Matches} API.
 */
public class MatchRegionRetriever {
  private final List leaves;
  private final Weight weight;
  private final TreeSet affectedFields;
  private final Map offsetStrategies;
  private final Set preloadFields;

  /**
   * A callback for accepting a single document (and its associated leaf reader, leaf document ID)
   * and its match offset ranges, as indicated by the {@link Matches} interface retrieved for the
   * query.
   */
  @FunctionalInterface
  public interface MatchOffsetsConsumer {
    void accept(
        int docId, LeafReader leafReader, int leafDocId, Map> hits)
        throws IOException;
  }

  /**
   * An abstraction that provides document values for a given field. The default implementation just
   * reaches to a preloaded {@link Document}. It is possible to write a more efficient
   * implementation on top of a reusable character buffer (that reuses the buffer while retrieving
   * hit regions for documents).
   */
  @FunctionalInterface
  public interface FieldValueProvider {
    List getValues(String field) throws IOException;
  }

  /**
   * A constructor with the default offset strategy supplier.
   *
   * @param analyzer An analyzer that may be used to reprocess (retokenize) document fields in the
   *     absence of position offsets in the index. Note that the analyzer must return tokens
   *     (positions and offsets) identical to the ones stored in the index.
   */
  public MatchRegionRetriever(IndexSearcher searcher, Query query, Analyzer analyzer)
      throws IOException {
    this(searcher, query, computeOffsetRetrievalStrategies(searcher.getIndexReader(), analyzer));
  }

  /**
   * @param searcher Index searcher to be used for retrieving matches.
   * @param query The query for which matches should be retrieved. The query should be rewritten
   *     against the provided searcher.
   * @param fieldOffsetStrategySupplier A custom supplier of per-field {@link
   *     OffsetsRetrievalStrategy} instances.
   */
  public MatchRegionRetriever(
      IndexSearcher searcher,
      Query query,
      OffsetsRetrievalStrategySupplier fieldOffsetStrategySupplier)
      throws IOException {
    leaves = searcher.getIndexReader().leaves();
    assert checkOrderConsistency(leaves);

    // We need full scoring mode so that we can receive matches from all sub-clauses
    // (no optimizations in Boolean queries take place).
    weight = searcher.createWeight(query, ScoreMode.COMPLETE, 0);

    // Compute the subset of fields affected by this query so that we don't load or scan
    // fields that are irrelevant.
    affectedFields = new TreeSet<>();
    query.visit(
        new QueryVisitor() {
          @Override
          public boolean acceptField(String field) {
            affectedFields.add(field);
            return false;
          }
        });

    // Compute value offset retrieval strategy for all affected fields.
    offsetStrategies = new HashMap<>();
    for (String field : affectedFields) {
      offsetStrategies.put(field, fieldOffsetStrategySupplier.apply(field));
    }

    // Ask offset strategies if they'll need field values.
    preloadFields = new HashSet<>();
    offsetStrategies.forEach(
        (field, strategy) -> {
          preloadFields.add(field);
        });

    // Only preload those field values that can be affected by the query and are required
    // by strategies.
    preloadFields.retainAll(affectedFields);
  }

  public void highlightDocuments(TopDocs topDocs, MatchOffsetsConsumer consumer)
      throws IOException {
    highlightDocuments(
        Arrays.stream(topDocs.scoreDocs).mapToInt(scoreDoc -> scoreDoc.doc).sorted().iterator(),
        consumer);
  }

  /**
   * Low-level, high-efficiency method for highlighting large numbers of documents at once in a
   * streaming fashion.
   *
   * @param docIds A stream of sorted document identifiers for which hit ranges should be
   *     returned.
   * @param consumer A streaming consumer for document-hits pairs.
   */
  public void highlightDocuments(PrimitiveIterator.OfInt docIds, MatchOffsetsConsumer consumer)
      throws IOException {
    if (leaves.isEmpty()) {
      return;
    }

    Iterator ctx = leaves.iterator();
    LeafReaderContext currentContext = ctx.next();
    int previousDocId = -1;
    Map> highlights = new TreeMap<>();
    while (docIds.hasNext()) {
      int docId = docIds.nextInt();

      if (docId < previousDocId) {
        throw new RuntimeException("Input document IDs must be sorted (increasing).");
      }
      previousDocId = docId;

      while (docId >= currentContext.docBase + currentContext.reader().maxDoc()) {
        currentContext = ctx.next();
      }

      int contextRelativeDocId = docId - currentContext.docBase;

      // Only preload fields we may potentially need.
      FieldValueProvider docFieldsSupplier =
          new DocumentFieldValueProvider(currentContext, contextRelativeDocId, preloadFields);

      highlights.clear();
      highlightDocument(
          currentContext, contextRelativeDocId, docFieldsSupplier, (field) -> true, highlights);
      consumer.accept(docId, currentContext.reader(), contextRelativeDocId, highlights);
    }
  }

  /**
   * Low-level method for retrieving hit ranges for a single document. This method can be used with
   * custom document {@link FieldValueProvider}.
   */
  public void highlightDocument(
      LeafReaderContext leafReaderContext,
      int contextDocId,
      FieldValueProvider doc,
      Predicate acceptField,
      Map> outputHighlights)
      throws IOException {
    Matches matches = weight.matches(leafReaderContext, contextDocId);
    if (matches == null) {
      return;
    }

    for (String field : affectedFields) {
      if (acceptField.test(field)) {
        MatchesIterator matchesIterator = matches.getMatches(field);
        if (matchesIterator == null) {
          // No matches on this field, even though the field was part of the query. This may be
          // possible
          // with complex queries that source non-text fields (have no "hit regions" in any textual
          // representation). Skip.
        } else {
          OffsetsRetrievalStrategy offsetStrategy = offsetStrategies.get(field);
          if (offsetStrategy == null) {
            throw new IOException(
                "Non-empty matches but no offset retrieval strategy for field: " + field);
          }
          List ranges = offsetStrategy.get(matchesIterator, doc);
          if (!ranges.isEmpty()) {
            outputHighlights.put(field, ranges);
          }
        }
      }
    }
  }

  private boolean checkOrderConsistency(List leaves) {
    for (int i = 1; i < leaves.size(); i++) {
      LeafReaderContext prev = leaves.get(i - 1);
      LeafReaderContext next = leaves.get(i);
      assert prev.docBase <= next.docBase;
      assert prev.docBase + prev.reader().maxDoc() == next.docBase;
    }
    return true;
  }

  /**
   * Compute default strategies for retrieving offsets from {@link MatchesIterator} instances for a
   * set of given fields.
   */
  public static OffsetsRetrievalStrategySupplier computeOffsetRetrievalStrategies(
      IndexReader reader, Analyzer analyzer) {
    FieldInfos fieldInfos = FieldInfos.getMergedFieldInfos(reader);
    return (field) -> {
      FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
      if (fieldInfo == null) {
        return (mi, doc) -> {
          throw new IOException("FieldInfo is null for field: " + field);
        };
      }

      switch (fieldInfo.getIndexOptions()) {
        case DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS:
          return new OffsetsFromMatchIterator(field, new OffsetsFromPositions(field, analyzer));

        case DOCS_AND_FREQS_AND_POSITIONS:
          return new OffsetsFromPositions(field, analyzer);

        case DOCS_AND_FREQS:
        case DOCS:
          // By default retrieve offsets from individual tokens
          // retrieved by the analyzer (possibly narrowed down to
          // only those terms that the query hinted at when passed
          // a QueryVisitor.
          //
          // Alternative strategies are also possible and may make sense
          // depending on the use case (OffsetsFromValues, for example).
          return new OffsetsFromTokens(field, analyzer);

        case NONE:
        default:
          return (matchesIterator, doc) -> {
            throw new IOException(
                "Field is indexed without positions and/or offsets: "
                    + field
                    + ", "
                    + fieldInfo.getIndexOptions());
          };
      }
    };
  }

  /** Implements {@link FieldValueProvider} wrapping a preloaded {@link Document}. */
  private static final class DocumentFieldValueProvider implements FieldValueProvider {
    private final IOSupplier docSupplier;
    private Document doc;

    public DocumentFieldValueProvider(
        LeafReaderContext currentContext, int docId, Set preloadFields) {
      docSupplier = () -> currentContext.reader().storedFields().document(docId, preloadFields);
    }

    @Override
    public List getValues(String field) throws IOException {
      if (doc == null) {
        doc = Objects.requireNonNull(docSupplier.get());
      }

      return Arrays.asList(doc.getValues(field));
    }
  }
}