All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.search.matchhighlight.MatchHighlighter Maven / Gradle / Ivy

There is a newer version: 10.0.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.matchhighlight;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.function.Predicate;
import java.util.stream.Stream;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.DocumentStoredFieldVisitor;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;

/**
 * An example highlighter that combines several lower-level highlighting utilities in this package
 * into a fully featured, ready-to-use component.
 *
 * 

Note that if you need to customize or tweak the details of highlighting, it is better to * assemble your own highlighter using those low-level building blocks, rather than extend or modify * this one. */ public class MatchHighlighter { private final IndexSearcher searcher; private final OffsetsRetrievalStrategySupplier offsetsRetrievalStrategies; private final Analyzer analyzer; private final HashSet fieldsAlwaysReturned = new HashSet<>(); private final List fieldHighlighters = new ArrayList<>(); /** * Actual per-field highlighter. Field highlighters are probed whether they are applicable to a * particular combination of (field, hasMatches) pair. If a highlighter declares it is applicable, * its {@link #format} method is invoked and the result is returned as the field's value. * * @see FieldValueHighlighters */ public interface FieldValueHighlighter { /** * Check if this highlighter can be applied to a given field. * * @param field Field name * @param hasMatches {@code true} if the field has a non-empty set of match regions. */ boolean isApplicable(String field, boolean hasMatches); /** Do format field values appropriately. */ List format( String field, String[] values, String contiguousValue, List valueRanges, List matchOffsets); /** * @return Returns a set of fields that must be fetched for each document, regardless of whether * they had matches or not. This is useful to load and return certain fields that should * always be included (identifiers, document titles, etc.). */ default Collection alwaysFetchedFields() { return Collections.emptyList(); } /** Returns a new field value highlighter that is a combination of this one and another one. */ default FieldValueHighlighter or(FieldValueHighlighter other) { FieldValueHighlighter first = this; FieldValueHighlighter second = other; HashSet fieldUnion = new HashSet<>(); fieldUnion.addAll(first.alwaysFetchedFields()); fieldUnion.addAll(second.alwaysFetchedFields()); return new FieldValueHighlighter() { @Override public boolean isApplicable(String field, boolean hasMatches) { return first.isApplicable(field, hasMatches) || second.isApplicable(field, hasMatches); } @Override public List format( String field, String[] values, String contiguousValue, List valueRanges, List matchOffsets) { FieldValueHighlighter delegate = first.isApplicable(field, matchOffsets != null && !matchOffsets.isEmpty()) ? first : second; return delegate.format(field, values, contiguousValue, valueRanges, matchOffsets); } @Override public Collection alwaysFetchedFields() { return fieldUnion; } }; } } /** * Append a new highlighter to field highlighters chain. The order of field highlighters is * important (first-matching wins). */ public MatchHighlighter appendFieldHighlighter(FieldValueHighlighter highlighter) { fieldHighlighters.add(highlighter); fieldsAlwaysReturned.addAll(highlighter.alwaysFetchedFields()); return this; } /** Always fetch the given set of fields for all input documents. */ public void alwaysFetchFields(String... fields) { for (String fld : fields) { fieldsAlwaysReturned.add(Objects.requireNonNull(fld)); } } /** Single document's highlights. */ public static class DocHighlights { public final int docId; public final Map> fields = new LinkedHashMap<>(); public DocHighlights(int docId) { this.docId = docId; } } /** An {@link OffsetRange} of a match, together with the source query that caused it. */ public static class QueryOffsetRange extends OffsetRange { public final Query query; QueryOffsetRange(Query query, int from, int to) { super(from, to); this.query = query; } @Override public QueryOffsetRange slice(int from, int to) { return new QueryOffsetRange(query, from, to); } } private static class DocHit { final int docId; private final LeafReader leafReader; private final int leafDocId; private final LinkedHashMap> matchRanges = new LinkedHashMap<>(); DocHit(int docId, LeafReader leafReader, int leafDocId) { this.docId = docId; this.leafReader = leafReader; this.leafDocId = leafDocId; } void addMatches(Query query, Map> hits) { hits.forEach( (field, offsets) -> { List target = matchRanges.computeIfAbsent(field, (fld) -> new ArrayList<>()); offsets.forEach(o -> target.add(new QueryOffsetRange(query, o.from, o.to))); }); } Document document(Predicate needsField) throws IOException { // Only load the fields that have a chance to be highlighted. DocumentStoredFieldVisitor visitor = new DocumentStoredFieldVisitor() { @Override public Status needsField(FieldInfo fieldInfo) { return (matchRanges.containsKey(fieldInfo.name) || needsField.test(fieldInfo.name)) ? Status.YES : Status.NO; } }; leafReader.storedFields().document(leafDocId, visitor); return visitor.getDocument(); } } public MatchHighlighter(IndexSearcher searcher, Analyzer analyzer) { this( searcher, analyzer, MatchRegionRetriever.computeOffsetRetrievalStrategies(searcher.getIndexReader(), analyzer)); } public MatchHighlighter( IndexSearcher searcher, Analyzer analyzer, OffsetsRetrievalStrategySupplier offsetsRetrievalStrategies) { this.searcher = searcher; this.offsetsRetrievalStrategies = offsetsRetrievalStrategies; this.analyzer = analyzer; } public Stream highlight(TopDocs topDocs, Query... queries) throws IOException { // We want to preserve topDocs document ordering and MatchRegionRetriever is optimized // for streaming, so we'll just prepopulate the map in proper order. LinkedHashMap docHits = new LinkedHashMap<>(); for (ScoreDoc scoreDoc : topDocs.scoreDocs) { docHits.put(scoreDoc.doc, null); } // Collect match ranges for each query and associate each range to the origin query. for (Query q : queries) { MatchRegionRetriever highlighter = new MatchRegionRetriever(searcher, searcher.rewrite(q), offsetsRetrievalStrategies); highlighter.highlightDocuments( topDocs, (int docId, LeafReader leafReader, int leafDocId, Map> hits) -> { DocHit docHit = docHits.get(docId); if (docHit == null) { docHit = new DocHit(docId, leafReader, leafDocId); docHits.put(docId, docHit); } docHit.addMatches(q, hits); }); } return docHits.values().stream() .filter(Objects::nonNull) // This should always the case? .map(this::computeDocFieldValues); } private DocHighlights computeDocFieldValues(DocHit docHit) { Document doc; try { doc = docHit.document(fieldsAlwaysReturned::contains); } catch (IOException e) { throw new UncheckedIOException(e); } DocHighlights docHighlights = new DocHighlights(docHit.docId); HashSet unique = new HashSet<>(); for (IndexableField indexableField : doc) { String field = indexableField.name(); if (!unique.add(field)) { continue; } String[] values = doc.getValues(field); String contiguousValue = contiguousFieldValue(field, values); List valueRanges = computeValueRanges(field, values); List offsets = docHit.matchRanges.get(field); List formattedValues = fieldValueHighlighter(field, offsets != null) .format(field, values, contiguousValue, valueRanges, offsets); if (formattedValues != null) { docHighlights.fields.put(field, formattedValues); } } return docHighlights; } private List computeValueRanges(String field, String[] values) { ArrayList valueRanges = new ArrayList<>(); int offset = 0; for (CharSequence v : values) { valueRanges.add(new OffsetRange(offset, offset + v.length())); offset += v.length(); offset += analyzer.getOffsetGap(field); } return valueRanges; } private String contiguousFieldValue(String field, String[] values) { String value; if (values.length == 1) { value = values[0]; } else { // TODO: This can be inefficient if offset gap is large but the logic // of applying offsets would get much more complicated so leaving for now // (would have to recalculate all offsets to omit gaps). String fieldGapPadding = " ".repeat(analyzer.getOffsetGap(field)); value = String.join(fieldGapPadding, values); } return value; } private FieldValueHighlighter fieldValueHighlighter(String field, boolean hasMatches) { for (FieldValueHighlighter highlighter : fieldHighlighters) { if (highlighter.isApplicable(field, hasMatches)) { return highlighter; } } throw new RuntimeException("No field highlighter could be matched to field: " + field); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy