// org.apache.lucene.search.matchhighlight.MatchHighlighter
// Artifact: lucene-highlighter -- Apache Lucene (module: highlighter).
// Note: a newer version of this artifact is available (10.0.0).
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.matchhighlight;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.function.Predicate;
import java.util.stream.Stream;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.DocumentStoredFieldVisitor;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;

/**
 * An example highlighter that combines several lower-level highlighting utilities in this package
 * into a fully featured, ready-to-use component.
 *
 * Note that if you need to customize or tweak the details of highlighting, it is better to
 * assemble your own highlighter using those low-level building blocks, rather than extend or modify
 * this one.
 */
public class MatchHighlighter {
  private final IndexSearcher searcher;
  private final OffsetsRetrievalStrategySupplier offsetsRetrievalStrategies;
  private final Analyzer analyzer;

  private final HashSet fieldsAlwaysReturned = new HashSet<>();
  private final List fieldHighlighters = new ArrayList<>();

  /**
   * Actual per-field highlighter. Field highlighters are probed whether they are applicable to a
   * particular combination of (field, hasMatches) pair. If a highlighter declares it is applicable,
   * its {@link #format} method is invoked and the result is returned as the field's value.
   *
   * @see FieldValueHighlighters
   */
  public interface FieldValueHighlighter {
    /**
     * Check if this highlighter can be applied to a given field.
     *
     * @param field Field name
     * @param hasMatches {@code true} if the field has a non-empty set of match regions.
     */
    boolean isApplicable(String field, boolean hasMatches);

    /** Do format field values appropriately. */
    List format(
        String field,
        String[] values,
        String contiguousValue,
        List valueRanges,
        List matchOffsets);

    /**
     * @return Returns a set of fields that must be fetched for each document, regardless of whether
     *     they had matches or not. This is useful to load and return certain fields that should
     *     always be included (identifiers, document titles, etc.).
     */
    default Collection alwaysFetchedFields() {
      return Collections.emptyList();
    }

    /** Returns a new field value highlighter that is a combination of this one and another one. */
    default FieldValueHighlighter or(FieldValueHighlighter other) {
      FieldValueHighlighter first = this;
      FieldValueHighlighter second = other;

      HashSet fieldUnion = new HashSet<>();
      fieldUnion.addAll(first.alwaysFetchedFields());
      fieldUnion.addAll(second.alwaysFetchedFields());

      return new FieldValueHighlighter() {
        @Override
        public boolean isApplicable(String field, boolean hasMatches) {
          return first.isApplicable(field, hasMatches) || second.isApplicable(field, hasMatches);
        }

        @Override
        public List format(
            String field,
            String[] values,
            String contiguousValue,
            List valueRanges,
            List matchOffsets) {
          FieldValueHighlighter delegate =
              first.isApplicable(field, matchOffsets != null && !matchOffsets.isEmpty())
                  ? first
                  : second;
          return delegate.format(field, values, contiguousValue, valueRanges, matchOffsets);
        }

        @Override
        public Collection alwaysFetchedFields() {
          return fieldUnion;
        }
      };
    }
  }

  /**
   * Append a new highlighter to field highlighters chain. The order of field highlighters is
   * important (first-matching wins).
   */
  public MatchHighlighter appendFieldHighlighter(FieldValueHighlighter highlighter) {
    fieldHighlighters.add(highlighter);
    fieldsAlwaysReturned.addAll(highlighter.alwaysFetchedFields());
    return this;
  }

  /** Always fetch the given set of fields for all input documents. */
  public void alwaysFetchFields(String... fields) {
    for (String fld : fields) {
      fieldsAlwaysReturned.add(Objects.requireNonNull(fld));
    }
  }

  /** Single document's highlights. */
  public static class DocHighlights {
    public final int docId;
    public final Map> fields = new LinkedHashMap<>();

    public DocHighlights(int docId) {
      this.docId = docId;
    }
  }

  /** An {@link OffsetRange} of a match, together with the source query that caused it. */
  public static class QueryOffsetRange extends OffsetRange {
    public final Query query;

    QueryOffsetRange(Query query, int from, int to) {
      super(from, to);
      this.query = query;
    }

    @Override
    public QueryOffsetRange slice(int from, int to) {
      return new QueryOffsetRange(query, from, to);
    }
  }

  private static class DocHit {
    final int docId;
    private final LeafReader leafReader;
    private final int leafDocId;
    private final LinkedHashMap> matchRanges = new LinkedHashMap<>();

    DocHit(int docId, LeafReader leafReader, int leafDocId) {
      this.docId = docId;
      this.leafReader = leafReader;
      this.leafDocId = leafDocId;
    }

    void addMatches(Query query, Map> hits) {
      hits.forEach(
          (field, offsets) -> {
            List target =
                matchRanges.computeIfAbsent(field, (fld) -> new ArrayList<>());
            offsets.forEach(o -> target.add(new QueryOffsetRange(query, o.from, o.to)));
          });
    }

    Document document(Predicate needsField) throws IOException {
      // Only load the fields that have a chance to be highlighted.
      DocumentStoredFieldVisitor visitor =
          new DocumentStoredFieldVisitor() {
            @Override
            public Status needsField(FieldInfo fieldInfo) {
              return (matchRanges.containsKey(fieldInfo.name) || needsField.test(fieldInfo.name))
                  ? Status.YES
                  : Status.NO;
            }
          };

      leafReader.storedFields().document(leafDocId, visitor);
      return visitor.getDocument();
    }
  }

  public MatchHighlighter(IndexSearcher searcher, Analyzer analyzer) {
    this(
        searcher,
        analyzer,
        MatchRegionRetriever.computeOffsetRetrievalStrategies(searcher.getIndexReader(), analyzer));
  }

  public MatchHighlighter(
      IndexSearcher searcher,
      Analyzer analyzer,
      OffsetsRetrievalStrategySupplier offsetsRetrievalStrategies) {
    this.searcher = searcher;
    this.offsetsRetrievalStrategies = offsetsRetrievalStrategies;
    this.analyzer = analyzer;
  }

  public Stream highlight(TopDocs topDocs, Query... queries) throws IOException {
    // We want to preserve topDocs document ordering and MatchRegionRetriever is optimized
    // for streaming, so we'll just prepopulate the map in proper order.
    LinkedHashMap docHits = new LinkedHashMap<>();
    for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
      docHits.put(scoreDoc.doc, null);
    }

    // Collect match ranges for each query and associate each range to the origin query.
    for (Query q : queries) {
      MatchRegionRetriever highlighter =
          new MatchRegionRetriever(searcher, searcher.rewrite(q), offsetsRetrievalStrategies);
      highlighter.highlightDocuments(
          topDocs,
          (int docId,
              LeafReader leafReader,
              int leafDocId,
              Map> hits) -> {
            DocHit docHit = docHits.get(docId);
            if (docHit == null) {
              docHit = new DocHit(docId, leafReader, leafDocId);
              docHits.put(docId, docHit);
            }
            docHit.addMatches(q, hits);
          });
    }

    return docHits.values().stream()
        .filter(Objects::nonNull) // This should always the case?
        .map(this::computeDocFieldValues);
  }

  private DocHighlights computeDocFieldValues(DocHit docHit) {
    Document doc;
    try {
      doc = docHit.document(fieldsAlwaysReturned::contains);
    } catch (IOException e) {
      throw new UncheckedIOException(e);
    }

    DocHighlights docHighlights = new DocHighlights(docHit.docId);

    HashSet unique = new HashSet<>();
    for (IndexableField indexableField : doc) {
      String field = indexableField.name();
      if (!unique.add(field)) {
        continue;
      }

      String[] values = doc.getValues(field);
      String contiguousValue = contiguousFieldValue(field, values);
      List valueRanges = computeValueRanges(field, values);
      List offsets = docHit.matchRanges.get(field);

      List formattedValues =
          fieldValueHighlighter(field, offsets != null)
              .format(field, values, contiguousValue, valueRanges, offsets);

      if (formattedValues != null) {
        docHighlights.fields.put(field, formattedValues);
      }
    }

    return docHighlights;
  }

  private List computeValueRanges(String field, String[] values) {
    ArrayList valueRanges = new ArrayList<>();
    int offset = 0;
    for (CharSequence v : values) {
      valueRanges.add(new OffsetRange(offset, offset + v.length()));
      offset += v.length();
      offset += analyzer.getOffsetGap(field);
    }
    return valueRanges;
  }

  private String contiguousFieldValue(String field, String[] values) {
    String value;
    if (values.length == 1) {
      value = values[0];
    } else {
      // TODO: This can be inefficient if offset gap is large but the logic
      // of applying offsets would get much more complicated so leaving for now
      // (would have to recalculate all offsets to omit gaps).
      String fieldGapPadding = " ".repeat(analyzer.getOffsetGap(field));
      value = String.join(fieldGapPadding, values);
    }
    return value;
  }

  private FieldValueHighlighter fieldValueHighlighter(String field, boolean hasMatches) {
    for (FieldValueHighlighter highlighter : fieldHighlighters) {
      if (highlighter.isApplicable(field, hasMatches)) {
        return highlighter;
      }
    }
    throw new RuntimeException("No field highlighter could be matched to field: " + field);
  }
}