package org.wikimedia.highlighter.cirrus.opensearch;

import static java.util.stream.Collectors.toCollection;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.BytesRef;
import org.opensearch.common.util.BigArrays;
import org.opensearch.index.mapper.MappedFieldType;
import org.opensearch.index.mapper.TextSearchInfo;
import org.opensearch.search.fetch.subphase.highlight.FieldHighlightContext;
import org.opensearch.search.fetch.subphase.highlight.HighlightUtils;
import org.opensearch.search.fetch.subphase.highlight.SearchHighlightContext;
import org.wikimedia.highlighter.cirrus.opensearch.CirrusHighlighter.HighlightExecutionContext;
import org.wikimedia.highlighter.cirrus.lucene.hit.PostingsHitEnum;
import org.wikimedia.highlighter.cirrus.lucene.hit.TokenStreamHitEnum;
import org.wikimedia.highlighter.cirrus.lucene.hit.weight.BasicQueryWeigher;
import org.wikimedia.highlighter.cirrus.lucene.hit.weight.DefaultSimilarityTermWeigher;
import org.wikimedia.search.highlighter.cirrus.HitEnum;
import org.wikimedia.search.highlighter.cirrus.Segmenter;
import org.wikimedia.search.highlighter.cirrus.SourceExtracter;
import org.wikimedia.search.highlighter.cirrus.hit.ConcatHitEnum;
import org.wikimedia.search.highlighter.cirrus.hit.EmptyHitEnum;
import org.wikimedia.search.highlighter.cirrus.hit.PositionBoostingHitEnumWrapper;
import org.wikimedia.search.highlighter.cirrus.hit.ReplayingHitEnum.HitEnumAndLength;
import org.wikimedia.search.highlighter.cirrus.hit.TermWeigher;
import org.wikimedia.search.highlighter.cirrus.hit.WeightFilteredHitEnumWrapper;
import org.wikimedia.search.highlighter.cirrus.hit.weight.CachingTermWeigher;
import org.wikimedia.search.highlighter.cirrus.hit.weight.ConstantTermWeigher;
import org.wikimedia.search.highlighter.cirrus.snippet.MultiSegmenter;
import org.wikimedia.search.highlighter.cirrus.source.NonMergingMultiSourceExtracter;
import org.wikimedia.search.highlighter.cirrus.source.StringSourceExtracter;

@SuppressWarnings("checkstyle:classfanoutcomplexity") // to improve if we ever touch that code again
public class FieldWrapper {
    private final HighlightExecutionContext executionContext;
    private final FieldHighlightContext context;
    private final BasicQueryWeigher weigher;
    /**
     * Field values as strings, loaded lazily by {@link #getFieldValues()}.
     */
    private List<String> values;
    /**
     * Any TokenStream still open during highlighting, kept here so it can be
     * closed in {@link #cleanup()}.
     */
    private TokenStream tokenStream;

    /**
     * Position gap for the field.  Only looked up if needed.  {@code < 0} means not looked up.
     */
    private static final int POSITION_GAP_INIT = -1;
    private int positionGap = POSITION_GAP_INIT;

    /**
     * Build a wrapper around the default field in the context.
     */
    public FieldWrapper(HighlightExecutionContext executionContext, FieldHighlightContext context,
            BasicQueryWeigher weigher) {
        this.executionContext = executionContext;
        this.context = context;
        this.weigher = weigher;
    }

    /**
     * Build a wrapper around fieldName which is not the default field in the
     * context.
     */
    public FieldWrapper(HighlightExecutionContext executionContext, FieldHighlightContext context,
            BasicQueryWeigher weigher, String fieldName) {
        assert !context.fieldName.equals(fieldName);
        MappedFieldType fieldType = context.context.mapperService().fieldType(fieldName);
        this.executionContext = executionContext;

        this.context = new FieldHighlightContext(fieldName, context.field, fieldType, context.context, context.hitContext,
                context.query, false, context.cache);
        this.weigher = weigher;
    }

    /**
     * Name of the wrapped field.
     */
    public String fieldName() {
        return context.fieldName;
    }

    @Override
    public String toString() {
        return context.fieldName;
    }

    /**
     * Cleanup any resources we still have open.
     */
    public void cleanup() throws IOException {
        if (tokenStream != null) {
            try {
                tokenStream.end();
            } finally {
                tokenStream.close();
            }
        }
    }

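    /**
     * Load the values of the wrapped field, converting each to a String. The
     * values are loaded once and cached for later calls.
     */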
    public List<String> getFieldValues() throws IOException {
        if (values == null) {
            boolean forceSource = context.forceSource;
            List<Object> objs = HighlightUtils.loadFieldValues(context.fieldType, context.hitContext, forceSource);
            values = objs.stream().map(Object::toString).collect(toCollection(() -> new ArrayList<>(objs.size())));
        }
        return values;
    }

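    /**
     * Build a SourceExtracter over the field values: a plain string extracter
     * for zero or one value, a non-merging multi-value extracter otherwise.
     */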
    public SourceExtracter<String> buildSourceExtracter() throws IOException {
        List<String> fieldValues = getFieldValues();
        switch (fieldValues.size()) {
            case 0:
                return new StringSourceExtracter("");
            case 1:
                return new StringSourceExtracter(fieldValues.get(0));
            default:
                // Elasticsearch uses a string offset gap of 1, the default on the
                // builder.
                NonMergingMultiSourceExtracter.Builder<String> builder = NonMergingMultiSourceExtracter
                        .builder();
                for (String s : fieldValues) {
                    builder.add(new StringSourceExtracter(s), s.length());
                }
                return builder.build();
        }
    }

    /**
     * Does this field have more than one value?
     */
    public boolean isMultValued() throws IOException {
        return getFieldValues().size() > 1;
    }

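    /**
     * Build a Segmenter over the field values using the segmenter factory
     * from the execution context, concatenating multiple values with a
     * MultiSegmenter.
     */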
    public Segmenter buildSegmenter() throws IOException {
        List<String> fieldValues = getFieldValues();
        SegmenterFactory segmenterFactory = executionContext.getSegmenterFactory();
        switch (fieldValues.size()) {
            case 0:
                return segmenterFactory.build("");
            case 1:
                return segmenterFactory.build(fieldValues.get(0));
            default:
                // Elasticsearch uses a string offset gap of 1, the default on the
                // builder.
                MultiSegmenter.Builder builder = MultiSegmenter.builder();
                for (String s : fieldValues) {
                    builder.add(segmenterFactory.build(s), s.length());
                }
                return builder.build();
        }
    }

    /**
     * Can this field produce hits? This returns false when none of the terms
     * outside of phrases are worth anything AND there aren't any phrases on
     * this field.
     */
    public boolean canProduceHits() {
        return weigher.maxTermWeight() > 0 || weigher.areTherePhrasesOnField(context.fieldName);
    }

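    /**
     * Build the HitEnum for this field: pick a hit source, wrap it for phrase
     * support, optionally apply boost_before position boosts, and filter out
     * zero weight hits.
     */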
    public HitEnum buildHitEnum() throws IOException {
        HitEnum e = buildHitEnumForSource();

        // Support phrase matches. Note that this must be done here rather than
        // after merging HitEnums because each hit could map offsets to
        // different positions. Since they are merged based on _offset_ the
        // phrase wrapper will see jumbled positions, causing it to break
        // horribly. Don't do it. I've tried.
        e = weigher.wrap(context.fieldName, e);

        SearchHighlightContext.FieldOptions options = context.field.fieldOptions();
        if (!options.scoreOrdered()) {
            Boolean topScoring = (Boolean)executionContext.getOption("top_scoring");
            if (topScoring == null || !topScoring) {
                // If we don't pay attention to scoring then there is no point
                // in messing with the weights.

                // Filter 0 weight hits which pop out from the TokenStreamHitEnum,
                // phrase match misses.
                return new WeightFilteredHitEnumWrapper(e, 0f);
            }
        }
        // TODO move this up so we don't have to redo it per matched_field
        @SuppressWarnings("unchecked")
        Map<String, Object> boostBefore = (Map<String, Object>)executionContext.getOption("boost_before");
        if (boostBefore != null) {
            TreeMap<Integer, Float> ordered = new TreeMap<>();
            for (Map.Entry<String, Object> entry : boostBefore.entrySet()) {
                if (!(entry.getValue() instanceof Number)) {
                    throw new IllegalArgumentException("boost_before must be a flat object whose values are numbers.");
                }
                ordered.put(Integer.valueOf(entry.getKey()), ((Number)entry.getValue()).floatValue());
            }
            PositionBoostingHitEnumWrapper boosting = new PositionBoostingHitEnumWrapper(e);
            e = boosting;
            for (Map.Entry<Integer, Float> entry : ordered.entrySet()) {
                boosting.add(entry.getKey(), entry.getValue());
            }
        }

        // Filter 0 weight hits which pop out from the TokenStreamHitEnum,
        // phrase match misses, and boost_before being used as a filter.
        return new WeightFilteredHitEnumWrapper(e, 0f);
    }

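    /**
     * Pick the hit source. Honors an explicit hit_source option (postings,
     * vectors, or analyze) and otherwise falls back from postings to term
     * vectors to reanalyzing the field values.
     */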
    private HitEnum buildHitEnumForSource() throws IOException {
        if (context.field.fieldOptions().options() != null) {
            String hitSource = (String) context.field.fieldOptions().options().get("hit_source");
            if (hitSource != null) {
                switch (hitSource) {
                    case "postings":
                        if (!canUsePostingsHitEnum()) {
                            throw new IllegalArgumentException(
                                    "Can't use postings as a hit source without setting index_options to postings");
                        }
                        return buildPostingsHitEnum();
                    case "vectors":
                        if (!canUseVectorsHitEnum()) {
                            throw new IllegalArgumentException(
                                    "Can't use vectors as a hit source without setting term_vector to with_positions_offsets");
                        }
                        return buildTermVectorsHitEnum();
                    case "analyze":
                        return buildTokenStreamHitEnum();
                    default:
                        throw new IllegalArgumentException("Unknown hit source:  " + hitSource);
                }
            }
        }
        if (canUsePostingsHitEnum()) {
            return buildPostingsHitEnum();
        }
        if (canUseVectorsHitEnum()) {
            return buildTermVectorsHitEnum();
        }
        return buildTokenStreamHitEnum();
    }

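    /**
     * True when the field was indexed with positions and offsets so hits can
     * be read straight from the postings.
     */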
    private boolean canUsePostingsHitEnum() {
        return context.fieldType.getTextSearchInfo().hasPositions()
                && context.fieldType.getTextSearchInfo().hasOffsets();
    }

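    /**
     * True when the field stores term vectors with positions and offsets.
     */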
    private boolean canUseVectorsHitEnum() {
        return context.fieldType.getTextSearchInfo().termVectors() == TextSearchInfo.TermVector.OFFSETS;
    }

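    /**
     * Build a HitEnum backed by the postings of this document and field.
     */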
    private HitEnum buildPostingsHitEnum() throws IOException {
        return PostingsHitEnum.fromPostings(context.hitContext.reader(),
                context.hitContext.docId(), context.fieldType.name(),
                weigher.acceptableTerms(), getQueryWeigher(), getCorpusWeigher(false), weigher);
    }

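    /**
     * Build a HitEnum backed by the term vectors of this document and field.
     */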
    private HitEnum buildTermVectorsHitEnum() throws IOException {
        return PostingsHitEnum.fromTermVectors(context.hitContext.reader(),
                context.hitContext.docId(), context.fieldType.name(),
                weigher.acceptableTerms(), getQueryWeigher(), getCorpusWeigher(false), weigher);
    }

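    /**
     * Build a HitEnum by reanalyzing the field values with the field's index
     * analyzer, falling back to the mapper service's analyzer.
     */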
    private HitEnum buildTokenStreamHitEnum() throws IOException {
        Analyzer analyzer = context.fieldType.indexAnalyzer();

        if (analyzer == null) {
            analyzer = context.context.mapperService().indexAnalyzer();
        }
        return buildTokenStreamHitEnum(analyzer);
    }

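    /**
     * Build a HitEnum by reanalyzing the field values with the given
     * analyzer. Multiple values are concatenated lazily because only one
     * TokenStream may be open per analyzer at a time.
     */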
    private HitEnum buildTokenStreamHitEnum(final Analyzer analyzer) throws IOException {
        List<String> fieldValues = getFieldValues();
        switch (fieldValues.size()) {
            case 0:
                // If there isn't any data then we assume there can't be any hits.
                // This is more right than building the token stream hit enum
                // against empty string.
                return EmptyHitEnum.INSTANCE;
            case 1:
                return buildTokenStreamHitEnum(analyzer, fieldValues.get(0));
            default:
                /*
                 * Note that it is super important that this process is _lazy_
                 * because we can't have multiple TokenStreams open per analyzer.
                 */
                Iterator<HitEnumAndLength> hitEnumsFromStreams = fieldValues
                    .stream()
                    .map(fieldValue -> buildTokenStreamHitEnumAndLength(fieldValue, analyzer))
                    .iterator();
                return new ConcatHitEnum(hitEnumsFromStreams, getPositionGap(), 1);
        }
    }

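    /**
     * Build the analyzed HitEnum for a single value along with that value's
     * length, closing any previously opened TokenStream first.
     */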
    private HitEnumAndLength buildTokenStreamHitEnumAndLength(String fieldValue, Analyzer analyzer) {
        try {
            if (tokenStream != null) {
                tokenStream.close();
            }
            return new HitEnumAndLength(buildTokenStreamHitEnum(analyzer,
                    fieldValue), fieldValue.length());
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

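    /**
     * Open a TokenStream over the source text and wrap it in a
     * TokenStreamHitEnum. The stream is stored so {@link #cleanup()} can
     * close it.
     */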
    private HitEnum buildTokenStreamHitEnum(Analyzer analyzer, String source) {
        TokenStream tokenStream;
        try {
            tokenStream = analyzer.tokenStream(context.fieldName, source);
        } catch (IllegalStateException e) {
            // Ugh, I wish we didn't have this limitation but it isn't really
            // very common and shouldn't be too big of a problem.
            throw new UnsupportedOperationException(
                    "If analyzing to find hits each matched field must have a unique analyzer.", e);
        }
        this.tokenStream = tokenStream;
        return new TokenStreamHitEnum(tokenStream, getQueryWeigher(), getCorpusWeigher(true), weigher);
    }

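    /**
     * Weigher for how much the query cares about each term.
     */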
    private TermWeigher<BytesRef> getQueryWeigher() {
        return weigher;
    }

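    /**
     * Weigher for how much the corpus cares about each term: a constant
     * weigher when there is a single term or score order doesn't matter,
     * otherwise a DefaultSimilarityTermWeigher over the top level reader,
     * optionally cached when terms may be weighed more than once.
     */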
    private TermWeigher<BytesRef> getCorpusWeigher(boolean mightWeighTermsMultipleTimes) {
        // No need to add fancy term weights if there is only one term or we
        // aren't using score order.
        if (weigher.singleTerm() || !executionContext.scoreMatters()) {
            return new ConstantTermWeigher<>();
        }
        Boolean useDefaultSimilarity = (Boolean) executionContext.getOption("default_similarity");
        if (useDefaultSimilarity == null || useDefaultSimilarity) {
            // Use a top level reader to fetch the frequency information
            TermWeigher<BytesRef> corpusWeigher = new DefaultSimilarityTermWeigher(context.hitContext.topLevelReader(),
                    context.fieldName);
            // TODO maybe switch to a recycling instance on the off chance that
            // we find a ton of terms in the document. That'd require more work
            // to make sure everything is properly Releasable.
            if (mightWeighTermsMultipleTimes) {
                corpusWeigher = new CachingTermWeigher<>(new BytesRefTermWeigherCache(
                        BigArrays.NON_RECYCLING_INSTANCE), corpusWeigher);
            }
            return corpusWeigher;
        }
        return new ConstantTermWeigher<>();
    }

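    /**
     * Position increment gap between values of this field, looked up lazily
     * from the index analyzer and cached.
     */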
    public int getPositionGap() {
        if (this.positionGap == POSITION_GAP_INIT) {
            this.positionGap = context.fieldType.indexAnalyzer().getPositionIncrementGap(context.fieldType.name());
        }
        return this.positionGap;
    }

    /**
     * Does this field exist?
     */
    public boolean exists() {
        return context.fieldType != null;
    }
}