org.apache.lucene.search.uhighlight.PhraseHelper Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-highlighter Show documentation
Apache Lucene (module: highlighter)
There is a newer version: 10.0.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.uhighlight;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.Function;
import java.util.function.Predicate;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.FilterLeafReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.queries.spans.SpanCollector;
import org.apache.lucene.queries.spans.SpanMultiTermQueryWrapper;
import org.apache.lucene.queries.spans.SpanQuery;
import org.apache.lucene.queries.spans.SpanScorer;
import org.apache.lucene.queries.spans.Spans;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.highlight.WeightedSpanTerm;
import org.apache.lucene.search.highlight.WeightedSpanTermExtractor;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue;

/**
 * Helps the {@link FieldOffsetStrategy} with position sensitive queries (e.g. highlight phrases
 * correctly). This is a stateful class holding information about the query, but it can (and is)
 * re-used across highlighting documents. Despite this state, it's immutable after construction.
 *
 * @lucene.internal
 */
// TODO rename to SpanHighlighting ?
public class PhraseHelper {

  public static final PhraseHelper NONE =
      new PhraseHelper(
          new MatchAllDocsQuery(),
          "_ignored_",
          (s) -> false,
          spanQuery -> null,
          query -> null,
          true);

  private final String fieldName;
  private final Set positionInsensitiveTerms; // (TermQuery terms)
  private final Set spanQueries;
  private final boolean willRewrite;
  private final Predicate fieldMatcher;

  /**
   * Constructor. {@code rewriteQueryPred} is an extension hook to override the default choice of
   * {@link WeightedSpanTermExtractor#mustRewriteQuery(SpanQuery)}. By default unknown query types
   * are rewritten, so use this to return {@link Boolean#FALSE} if you know the query doesn't need
   * to be rewritten. Similarly, {@code preExtractRewriteFunction} is also an extension hook for
   * extract to allow different queries to be set before the {@link WeightedSpanTermExtractor}'s
   * extraction is invoked. {@code ignoreQueriesNeedingRewrite} effectively ignores any query clause
   * that needs to be "rewritten", which is usually limited to just a {@link
   * SpanMultiTermQueryWrapper} but could be other custom ones. {@code fieldMatcher} The field name
   * predicate to use for extracting the query part that must be highlighted.
   */
  public PhraseHelper(
      Query query,
      String field,
      Predicate fieldMatcher,
      Function rewriteQueryPred,
      Function> preExtractRewriteFunction,
      boolean ignoreQueriesNeedingRewrite) {
    this.fieldName = field;
    this.fieldMatcher = fieldMatcher;
    // filter terms to those we want
    positionInsensitiveTerms = new HashSet<>();
    spanQueries = new HashSet<>();

    // TODO Have toSpanQuery(query) Function as an extension point for those with custom Query impls

    // boolean wrapped in 1-ary array so it's mutable from inner class
    boolean[] mustRewriteHolder = {false};

    // For TermQueries or other position insensitive queries, collect the Terms.
    // For other Query types, WSTE will convert to an equivalent SpanQuery.  NOT extracting position
    // spans here.
    new WeightedSpanTermExtractor(field) {
      // anonymous constructor
      {
        setExpandMultiTermQuery(true); // necessary for mustRewriteQuery(spanQuery) to work.

        try {
          // null because we won't actually extract right now; we're not collecting
          extract(query, 1f, null);
        } catch (Exception e) {
          throw new RuntimeException(e);
        }
      }

      @Override
      protected void extract(Query query, float boost, Map terms)
          throws IOException {
        Collection newQueriesToExtract = preExtractRewriteFunction.apply(query);
        if (newQueriesToExtract != null) {
          for (Query newQuery : newQueriesToExtract) {
            extract(newQuery, boost, terms);
          }
        } else {
          super.extract(query, boost, terms);
        }
      }

      @Override
      protected boolean isQueryUnsupported(Class clazz) {
        if (clazz.isAssignableFrom(MultiTermQuery.class)) {
          return true; // We do MTQ processing separately in MultiTermHighlighting.java
        }
        return true; // TODO set to false and provide a hook to customize certain queries.
      }

      // called on Query types that are NOT position sensitive, e.g. TermQuery
      @Override
      protected void extractWeightedTerms(
          Map terms, Query query, float boost) {
        query.visit(
            new QueryVisitor() {
              @Override
              public boolean acceptField(String field) {
                return fieldMatcher.test(field);
              }

              @Override
              public void consumeTerms(Query query, Term... terms) {
                for (Term term : terms) {
                  positionInsensitiveTerms.add(term.bytes());
                }
              }
            });
      }

      // called on SpanQueries. Some other position-sensitive queries like PhraseQuery are converted
      // beforehand
      @Override
      protected void extractWeightedSpanTerms(
          Map terms, SpanQuery spanQuery, float boost)
          throws IOException {
        // if this span query isn't for this field, skip it.
        Set fieldNameSet = new HashSet<>(); // TODO reuse.  note: almost always size 1
        collectSpanQueryFields(spanQuery, fieldNameSet);
        for (String spanField : fieldNameSet) {
          if (!fieldMatcher.test(spanField)) {
            return;
          }
        }

        boolean mustRewriteQuery = mustRewriteQuery(spanQuery);
        if (ignoreQueriesNeedingRewrite && mustRewriteQuery) {
          return; // ignore this query
        }
        mustRewriteHolder[0] |= mustRewriteQuery;

        spanQueries.add(spanQuery);
      }

      @Override
      protected boolean mustRewriteQuery(SpanQuery spanQuery) {
        Boolean rewriteQ = rewriteQueryPred.apply(spanQuery); // allow to override
        return rewriteQ != null ? rewriteQ : super.mustRewriteQuery(spanQuery);
      }
    }; // calling the constructor triggered the extraction/visiting we want.  Hacky; yes.

    willRewrite = mustRewriteHolder[0];
  }

  public Set getSpanQueries() {
    return spanQueries;
  }

  /** If there is no position sensitivity then use of the instance of this class can be ignored. */
  public boolean hasPositionSensitivity() {
    return spanQueries.isEmpty() == false;
  }

  /**
   * Rewrite is needed for handling a {@link SpanMultiTermQueryWrapper} (MTQ / wildcards) or some
   * custom things. When true, the resulting term list will probably be different than what it was
   * known to be initially.
   */
  public boolean willRewrite() {
    return willRewrite;
  }

  /** Returns the terms that are position-insensitive (sorted). */
  public BytesRef[] getAllPositionInsensitiveTerms() {
    BytesRef[] result =
        positionInsensitiveTerms.toArray(new BytesRef[positionInsensitiveTerms.size()]);
    Arrays.sort(result);
    return result;
  }

  /**
   * Given the internal SpanQueries, produce a number of OffsetsEnum into the {@code results} param.
   */
  public void createOffsetsEnumsForSpans(
      LeafReader leafReader, int docId, List results) throws IOException {
    leafReader = new SingleFieldWithOffsetsFilterLeafReader(leafReader, fieldName);
    // TODO avoid searcher and do what it does to rewrite & get weight?
    IndexSearcher searcher = new IndexSearcher(leafReader);
    searcher.setQueryCache(null);

    // for each SpanQuery, grab it's Spans and put it into a PriorityQueue
    PriorityQueue spansPriorityQueue =
        new PriorityQueue(spanQueries.size()) {
          @Override
          protected boolean lessThan(Spans a, Spans b) {
            return a.startPosition() <= b.startPosition();
          }
        };
    for (Query query : spanQueries) {
      Weight weight =
          searcher.createWeight(searcher.rewrite(query), ScoreMode.COMPLETE_NO_SCORES, 1);
      Scorer scorer = weight.scorer(leafReader.getContext());
      if (scorer == null) {
        continue;
      }
      TwoPhaseIterator twoPhaseIterator = scorer.twoPhaseIterator();
      if (twoPhaseIterator != null) {
        if (twoPhaseIterator.approximation().advance(docId) != docId
            || !twoPhaseIterator.matches()) {
          continue;
        }
      } else if (scorer.iterator().advance(docId)
          != docId) { // preposition, and return doing nothing if find none
        continue;
      }

      Spans spans = ((SpanScorer) scorer).getSpans();
      assert spans.docID() == docId;
      if (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) {
        spansPriorityQueue.add(spans);
      }
    }

    // Iterate the Spans in the PriorityQueue, collecting as we go.  By using a PriorityQueue
    // ordered by position,
    //   the underlying offsets in our collector will be mostly appended to the end of arrays
    // (efficient).
    // note: alternatively it'd interesting if we produced one OffsetsEnum that internally advanced
    //   this PriorityQueue when nextPosition is called; it would cap what we have to cache for
    // large docs and
    //   exiting early (due to maxLen) is easy.
    //   But at least we have an accurate "freq" and it shouldn't be too much data to collect.  Even
    // SpanScorer
    //   navigates the spans fully to compute a good freq (and thus score)!
    OffsetSpanCollector spanCollector = new OffsetSpanCollector();
    while (spansPriorityQueue.size() > 0) {
      Spans spans = spansPriorityQueue.top();
      // TODO limit to a capped endOffset length somehow so we can break this loop early
      spans.collect(spanCollector);

      if (spans.nextStartPosition() == Spans.NO_MORE_POSITIONS) {
        spansPriorityQueue.pop();
      } else {
        spansPriorityQueue.updateTop();
      }
    }
    results.addAll(spanCollector.termToOffsetsEnums.values());
  }

  /**
   * Needed to support the ability to highlight a query irrespective of the field a query refers to
   * (aka requireFieldMatch=false). This reader will just delegate every call to a single field in
   * the wrapped LeafReader. This way we ensure that all queries going through this reader target
   * the same field.
   */
  private static final class SingleFieldWithOffsetsFilterLeafReader extends FilterLeafReader {
    final String fieldName;

    SingleFieldWithOffsetsFilterLeafReader(LeafReader in, String fieldName) {
      super(in);
      this.fieldName = fieldName;
    }

    @Override
    public FieldInfos getFieldInfos() {
      throw new UnsupportedOperationException(); // TODO merge them
    }

    @Override
    public Terms terms(String field) throws IOException {
      // ensure the underlying PostingsEnum returns offsets.  It's sad we have to do this to use the
      // SpanCollector.
      return new FilterTerms(super.terms(fieldName)) {
        @Override
        public TermsEnum iterator() throws IOException {
          return new FilterTermsEnum(in.iterator()) {
            @Override
            public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
              return super.postings(reuse, flags | PostingsEnum.OFFSETS);
            }
          };
        }
      };
    }

    @Override
    public NumericDocValues getNormValues(String field) throws IOException {
      return super.getNormValues(fieldName);
    }

    @Override
    public CacheHelper getCoreCacheHelper() {
      return null;
    }

    @Override
    public CacheHelper getReaderCacheHelper() {
      return null;
    }
  }

  private class OffsetSpanCollector implements SpanCollector {
    Map termToOffsetsEnums = new HashMap<>();

    @Override
    public void collectLeaf(PostingsEnum postings, int position, Term term) throws IOException {
      if (!fieldMatcher.test(term.field())) {
        return;
      }

      SpanCollectedOffsetsEnum offsetsEnum = termToOffsetsEnums.get(term.bytes());
      if (offsetsEnum == null) {
        // If it's pos insensitive we handle it outside of PhraseHelper.  term.field() is from the
        // Query.
        if (positionInsensitiveTerms.contains(term.bytes())) {
          return;
        }
        offsetsEnum = new SpanCollectedOffsetsEnum(term.bytes(), postings.freq());
        termToOffsetsEnums.put(term.bytes(), offsetsEnum);
      }
      offsetsEnum.add(postings.startOffset(), postings.endOffset());
    }

    @Override
    public void reset() { // called when at a new position.  We don't care.
    }
  }

  private static class SpanCollectedOffsetsEnum extends OffsetsEnum {
    // TODO perhaps optionally collect (and expose) payloads?
    private final BytesRef term;
    private final int[] startOffsets;
    private final int[] endOffsets;
    private int numPairs = 0;
    private int enumIdx = -1;

    private SpanCollectedOffsetsEnum(BytesRef term, int postingsFreq) {
      this.term = term;
      // hopefully not wasteful?  At least we needn't resize it.
      this.startOffsets = new int[postingsFreq];
      this.endOffsets = new int[postingsFreq];
    }

    // called from collector before it's navigated
    void add(int startOffset, int endOffset) {
      assert enumIdx == -1 : "bad state";

      // loop backwards since we expect a match at the end or close to it.  We expect O(1) not O(N).
      int pairIdx = numPairs - 1;
      for (; pairIdx >= 0; pairIdx--) {
        int iStartOffset = startOffsets[pairIdx];
        int iEndOffset = endOffsets[pairIdx];
        int cmp = Integer.compare(iStartOffset, startOffset);
        if (cmp == 0) {
          cmp = Integer.compare(iEndOffset, endOffset);
        }
        if (cmp == 0) {
          return; // we already have this offset-pair for this term
        } else if (cmp < 0) {
          break; // we will insert offsetPair to the right of pairIdx
        }
      }
      // pairIdx is now one position to the left of where we insert the new pair
      // shift right any pairs by one to make room
      final int shiftLen = numPairs - (pairIdx + 1);
      if (shiftLen > 0) {
        System.arraycopy(startOffsets, pairIdx + 1, startOffsets, pairIdx + 2, shiftLen);
        System.arraycopy(endOffsets, pairIdx + 1, endOffsets, pairIdx + 2, shiftLen);
      }
      // now we can place the offset pair
      startOffsets[pairIdx + 1] = startOffset;
      endOffsets[pairIdx + 1] = endOffset;
      numPairs++;
    }

    @Override
    public boolean nextPosition() throws IOException {
      return ++enumIdx < numPairs;
    }

    @Override
    public int freq() throws IOException {
      return numPairs;
    }

    @Override
    public BytesRef getTerm() throws IOException {
      return term;
    }

    @Override
    public int startOffset() throws IOException {
      return startOffsets[enumIdx];
    }

    @Override
    public int endOffset() throws IOException {
      return endOffsets[enumIdx];
    }
  }
}