org.elasticsearch.search.fetch.subphase.highlight.FastVectorHighlighter
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0 and the Server Side Public License, v 1; you may not use this file except
 * in compliance with, at your election, the Elastic License 2.0 or the Server
 * Side Public License, v 1.
 */
package org.elasticsearch.search.fetch.subphase.highlight;

import org.apache.lucene.search.highlight.Encoder;
import org.apache.lucene.search.vectorhighlight.BaseFragmentsBuilder;
import org.apache.lucene.search.vectorhighlight.BoundaryScanner;
import org.apache.lucene.search.vectorhighlight.BreakIteratorBoundaryScanner;
import org.apache.lucene.search.vectorhighlight.CustomFieldQuery;
import org.apache.lucene.search.vectorhighlight.FieldFragList;
import org.apache.lucene.search.vectorhighlight.FieldQuery;
import org.apache.lucene.search.vectorhighlight.FragListBuilder;
import org.apache.lucene.search.vectorhighlight.FragmentsBuilder;
import org.apache.lucene.search.vectorhighlight.ScoreOrderFragmentsBuilder;
import org.apache.lucene.search.vectorhighlight.SimpleBoundaryScanner;
import org.apache.lucene.search.vectorhighlight.SimpleFieldFragList;
import org.apache.lucene.search.vectorhighlight.SimpleFragListBuilder;
import org.apache.lucene.search.vectorhighlight.SingleFragListBuilder;
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.text.Text;
import org.elasticsearch.common.util.CollectionUtils;
import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.TextSearchInfo;
import org.elasticsearch.search.fetch.FetchSubPhase;
import org.elasticsearch.search.fetch.subphase.highlight.SearchHighlightContext.Field;
import org.elasticsearch.search.fetch.subphase.highlight.SearchHighlightContext.FieldOptions;
import org.elasticsearch.search.lookup.SourceLookup;

import java.io.IOException;
import java.text.BreakIterator;
import java.util.Collections;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.function.Function;

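/**
 * Highlighter implementation backed by Lucene's
 * {@link org.apache.lucene.search.vectorhighlight.FastVectorHighlighter}. It relies on term vectors with
 * positions and offsets being indexed for the highlighted field, which allows fragments to be built
 * without re-analyzing field values at fetch time.
 */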
public class FastVectorHighlighter implements Highlighter {
    private static final BoundaryScanner DEFAULT_SIMPLE_BOUNDARY_SCANNER = new SimpleBoundaryScanner();
    private static final BoundaryScanner DEFAULT_SENTENCE_BOUNDARY_SCANNER = new BreakIteratorBoundaryScanner(
        BreakIterator.getSentenceInstance(Locale.ROOT)
    );
    private static final BoundaryScanner DEFAULT_WORD_BOUNDARY_SCANNER = new BreakIteratorBoundaryScanner(
        BreakIterator.getWordInstance(Locale.ROOT)
    );

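    /**
     * Node-level setting that controls whether multi-valued fields are highlighted as discrete values
     * (passed to {@link BaseFragmentsBuilder#setDiscreteMultiValueHighlighting(boolean)}).
     */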
    public static final Setting<Boolean> SETTING_TV_HIGHLIGHT_MULTI_VALUE = Setting.boolSetting(
        "search.highlight.term_vector_multi_value",
        true,
        Setting.Property.NodeScope
    );

    private static final String CACHE_KEY = "highlight-fsv";
    private final Boolean termVectorMultiValue;

    public FastVectorHighlighter(Settings settings) {
        this.termVectorMultiValue = SETTING_TV_HIGHLIGHT_MULTI_VALUE.get(settings);
    }

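    /**
     * Builds highlighted fragments for a single hit. Rewritten {@link FieldQuery}s, frag list builders and
     * fragments builder suppliers are cached per field type in {@code fieldContext.cache} so they can be
     * reused across hits.
     */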
    @Override
    public HighlightField highlight(FieldHighlightContext fieldContext) throws IOException {
        SearchHighlightContext.Field field = fieldContext.field;
        FetchSubPhase.HitContext hitContext = fieldContext.hitContext;
        MappedFieldType fieldType = fieldContext.fieldType;
        boolean forceSource = fieldContext.forceSource;
        boolean fixBrokenAnalysis = fieldContext.context.containsBrokenAnalysis(fieldContext.fieldName);

        if (canHighlight(fieldType) == false) {
            throw new IllegalArgumentException(
                "the field ["
                    + fieldContext.fieldName
                    + "] should be indexed with term vector with position offsets to be used with fast vector highlighter"
            );
        }

        Encoder encoder = field.fieldOptions().encoder().equals("html") ? HighlightUtils.Encoders.HTML : HighlightUtils.Encoders.DEFAULT;

        if (fieldContext.cache.containsKey(CACHE_KEY) == false) {
            fieldContext.cache.put(CACHE_KEY, new HighlighterEntry());
        }
        HighlighterEntry cache = (HighlighterEntry) fieldContext.cache.get(CACHE_KEY);
        FieldHighlightEntry entry = cache.fields.get(fieldType);
        if (entry == null) {
            FragListBuilder fragListBuilder;
            if (field.fieldOptions().numberOfFragments() == 0) {
                fragListBuilder = new SingleFragListBuilder();
            } else {
                fragListBuilder = field.fieldOptions().fragmentOffset() == -1
                    ? new SimpleFragListBuilder()
                    : new SimpleFragListBuilder(field.fieldOptions().fragmentOffset());
            }

            Function<SourceLookup, FragmentsBuilder> fragmentsBuilderSupplier = fragmentsBuilderSupplier(
                field,
                fieldType,
                forceSource,
                fixBrokenAnalysis
            );

            entry = new FieldHighlightEntry();
            if (field.fieldOptions().requireFieldMatch()) {
                /*
                 * We use the top level reader to rewrite the query against all readers,
                 * which lets us cache it across hits (and across readers...).
                 */
                entry.fieldMatchFieldQuery = new CustomFieldQuery(
                    fieldContext.query,
                    hitContext.topLevelReader(),
                    true,
                    field.fieldOptions().requireFieldMatch()
                );
            } else {
                /*
                 * We use the top level reader to rewrite the query against all readers,
                 * which lets us cache it across hits (and across readers...).
                 */
                entry.noFieldMatchFieldQuery = new CustomFieldQuery(
                    fieldContext.query,
                    hitContext.topLevelReader(),
                    true,
                    field.fieldOptions().requireFieldMatch()
                );
            }
            entry.fragListBuilder = fragListBuilder;
            entry.fragmentsBuilderSupplier = fragmentsBuilderSupplier;
            if (cache.fvh == null) {
                // constructor parameters to the FVH are not required since:
                // - the first two booleans (phrase and fieldMatch) are irrelevant because they are set on the CustomFieldQuery
                // - fragments builders are passed in explicitly
                cache.fvh = new org.apache.lucene.search.vectorhighlight.FastVectorHighlighter();
            }
            CustomFieldQuery.highlightFilters.set(field.fieldOptions().highlightFilter());
            cache.fields.put(fieldType, entry);
        }
        final FieldQuery fieldQuery;
        if (field.fieldOptions().requireFieldMatch()) {
            fieldQuery = entry.fieldMatchFieldQuery;
        } else {
            fieldQuery = entry.noFieldMatchFieldQuery;
        }
        cache.fvh.setPhraseLimit(field.fieldOptions().phraseLimit());

        String[] fragments;
        FragmentsBuilder fragmentsBuilder = entry.fragmentsBuilderSupplier.apply(hitContext.sourceLookup());

        // a HACK to make the highlighter do highlighting, even though it's using the single frag list builder
        int numberOfFragments = field.fieldOptions().numberOfFragments() == 0
            ? Integer.MAX_VALUE
            : field.fieldOptions().numberOfFragments();
        int fragmentCharSize = field.fieldOptions().numberOfFragments() == 0 ? Integer.MAX_VALUE : field.fieldOptions().fragmentCharSize();
        // we highlight against the low level reader and docId, because if we load source, we want to reuse it if possible
        // Only send matched fields if they were requested, to save time.
        if (field.fieldOptions().matchedFields() != null && field.fieldOptions().matchedFields().isEmpty() == false) {
            fragments = cache.fvh.getBestFragments(
                fieldQuery,
                hitContext.reader(),
                hitContext.docId(),
                fieldType.name(),
                field.fieldOptions().matchedFields(),
                fragmentCharSize,
                numberOfFragments,
                entry.fragListBuilder,
                fragmentsBuilder,
                field.fieldOptions().preTags(),
                field.fieldOptions().postTags(),
                encoder
            );
        } else {
            fragments = cache.fvh.getBestFragments(
                fieldQuery,
                hitContext.reader(),
                hitContext.docId(),
                fieldType.name(),
                fragmentCharSize,
                numberOfFragments,
                entry.fragListBuilder,
                fragmentsBuilder,
                field.fieldOptions().preTags(),
                field.fieldOptions().postTags(),
                encoder
            );
        }

        if (CollectionUtils.isEmpty(fragments) == false) {
            return new HighlightField(fieldContext.fieldName, Text.convertFromStringArray(fragments));
        }

        int noMatchSize = fieldContext.field.fieldOptions().noMatchSize();
        if (noMatchSize > 0) {
            // Essentially we just request that a fragment is built from 0 to noMatchSize using
            // the normal fragmentsBuilder
            FieldFragList fieldFragList = new SimpleFieldFragList(-1 /*ignored*/);
            fieldFragList.add(0, noMatchSize, Collections.emptyList());
            fragments = fragmentsBuilder.createFragments(
                hitContext.reader(),
                hitContext.docId(),
                fieldType.name(),
                fieldFragList,
                1,
                field.fieldOptions().preTags(),
                field.fieldOptions().postTags(),
                encoder
            );
            if (CollectionUtils.isEmpty(fragments) == false) {
                return new HighlightField(fieldContext.fieldName, Text.convertFromStringArray(fragments));
            }
        }

        return null;
    }

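    /**
     * Returns a factory for the fragments builder: fragments are rebuilt from stored field values when the
     * field is stored and source is not forced, otherwise from the document source via a {@link SourceLookup}.
     * Score-ordered builders are used when fragments are sorted by score.
     */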
    private Function<SourceLookup, FragmentsBuilder> fragmentsBuilderSupplier(
        SearchHighlightContext.Field field,
        MappedFieldType fieldType,
        boolean forceSource,
        boolean fixBrokenAnalysis
    ) {
        BoundaryScanner boundaryScanner = getBoundaryScanner(field);
        FieldOptions options = field.fieldOptions();
        Function<SourceLookup, BaseFragmentsBuilder> supplier;
        if (forceSource == false && fieldType.isStored()) {
            if (options.numberOfFragments() != 0 && options.scoreOrdered()) {
                supplier = ignored -> new ScoreOrderFragmentsBuilder(options.preTags(), options.postTags(), boundaryScanner);
            } else {
                supplier = ignored -> new SimpleFragmentsBuilder(
                    fieldType,
                    fixBrokenAnalysis,
                    options.preTags(),
                    options.postTags(),
                    boundaryScanner
                );
            }
        } else {
            if (options.numberOfFragments() != 0 && options.scoreOrdered()) {
                supplier = lookup -> new SourceScoreOrderFragmentsBuilder(
                    fieldType,
                    fixBrokenAnalysis,
                    lookup,
                    options.preTags(),
                    options.postTags(),
                    boundaryScanner
                );
            } else {
                supplier = lookup -> new SourceSimpleFragmentsBuilder(
                    fieldType,
                    fixBrokenAnalysis,
                    lookup,
                    options.preTags(),
                    options.postTags(),
                    boundaryScanner
                );
            }
        }

        return lookup -> {
            BaseFragmentsBuilder builder = supplier.apply(lookup);
            builder.setDiscreteMultiValueHighlighting(termVectorMultiValue);
            return builder;
        };
    }

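    /**
     * The fast vector highlighter can only be used on fields indexed with term vectors that include both
     * positions and offsets.
     */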
    @Override
    public boolean canHighlight(MappedFieldType ft) {
        return ft.getTextSearchInfo().termVectors() == TextSearchInfo.TermVector.OFFSETS;
    }

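    /**
     * Selects the boundary scanner for the field: a {@link BreakIteratorBoundaryScanner} for sentence or word
     * boundaries (honoring the configured locale), or a {@link SimpleBoundaryScanner} for character boundaries.
     */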
    private static BoundaryScanner getBoundaryScanner(Field field) {
        final FieldOptions fieldOptions = field.fieldOptions();
        final Locale boundaryScannerLocale = fieldOptions.boundaryScannerLocale() != null
            ? fieldOptions.boundaryScannerLocale()
            : Locale.ROOT;
        final HighlightBuilder.BoundaryScannerType type = fieldOptions.boundaryScannerType() != null
            ? fieldOptions.boundaryScannerType()
            : HighlightBuilder.BoundaryScannerType.CHARS;
        switch (type) {
            case SENTENCE:
                if (boundaryScannerLocale != null) {
                    return new BreakIteratorBoundaryScanner(BreakIterator.getSentenceInstance(boundaryScannerLocale));
                }
                return DEFAULT_SENTENCE_BOUNDARY_SCANNER;
            case WORD:
                if (boundaryScannerLocale != null) {
                    return new BreakIteratorBoundaryScanner(BreakIterator.getWordInstance(boundaryScannerLocale));
                }
                return DEFAULT_WORD_BOUNDARY_SCANNER;
            case CHARS:
                if (fieldOptions.boundaryMaxScan() != SimpleBoundaryScanner.DEFAULT_MAX_SCAN
                    || fieldOptions.boundaryChars() != SimpleBoundaryScanner.DEFAULT_BOUNDARY_CHARS) {
                    return new SimpleBoundaryScanner(fieldOptions.boundaryMaxScan(), fieldOptions.boundaryChars());
                }
                return DEFAULT_SIMPLE_BOUNDARY_SCANNER;
            default:
                throw new IllegalArgumentException("Invalid boundary scanner type: " + type.toString());
        }
    }

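    /** Per-field cached state: rewritten field queries plus frag list and fragments builder factories. */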
    private static class FieldHighlightEntry {
        public FragListBuilder fragListBuilder;
        public Function<SourceLookup, FragmentsBuilder> fragmentsBuilderSupplier;
        public FieldQuery noFieldMatchFieldQuery;
        public FieldQuery fieldMatchFieldQuery;
    }

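    /** State cached across hits: a single Lucene FVH instance plus an entry per highlighted field type. */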
    private static class HighlighterEntry {
        public org.apache.lucene.search.vectorhighlight.FastVectorHighlighter fvh;
        public Map<MappedFieldType, FieldHighlightEntry> fields = new HashMap<>();
    }
}