/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.uhighlight;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.Function;
import java.util.function.Predicate;

import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.FilterLeafReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.highlight.WeightedSpanTerm;
import org.apache.lucene.search.highlight.WeightedSpanTermExtractor;
import org.apache.lucene.search.spans.SpanCollector;
import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanScorer;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue;

/**
 * Helps the {@link FieldOffsetStrategy} with position-sensitive queries (e.g. to highlight phrases correctly).
 * This is a stateful class holding information about the query, but it can be (and is) re-used across
 * highlighting documents.  Despite this state, it's immutable after construction.
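 * <p>A minimal construction sketch; the field name and query here are hypothetical, and the two
 * {@code null}-returning hooks simply defer to the defaults:
 * <pre>{@code
 * Query query = new PhraseQuery("body", "quick", "fox");
 * PhraseHelper helper = new PhraseHelper(query, "body",
 *     "body"::equals,     // fieldMatcher: only highlight the "body" field
 *     spanQuery -> null,  // rewriteQueryPred: null defers to the default mustRewriteQuery
 *     q -> null,          // preExtractRewriteFunction: no substitute queries
 *     true);              // ignoreQueriesNeedingRewrite
 * if (helper.hasPositionSensitivity()) {
 *   // hand the helper to a FieldOffsetStrategy to drive span highlighting
 * }
 * }</pre>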
 *
 * @lucene.internal
 */
// TODO rename to SpanHighlighting ?
public class PhraseHelper {

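  // Sentinel for "no position-sensitive handling": its field matcher rejects every field, so the
  // extraction below collects no span queries and no terms.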
  public static final PhraseHelper NONE = new PhraseHelper(new MatchAllDocsQuery(), "_ignored_",
      (s) -> false, spanQuery -> null, query -> null, true);

  private final String fieldName;
  private final Set<BytesRef> positionInsensitiveTerms; // (TermQuery terms)
  private final Set<SpanQuery> spanQueries;
  private final boolean willRewrite;
  private final Predicate<String> fieldMatcher;

  /**
   * Constructor.
   * {@code rewriteQueryPred} is an extension hook to override the default choice of
   * {@link WeightedSpanTermExtractor#mustRewriteQuery(SpanQuery)}. By default, unknown query types are
   * rewritten, so use this to return {@link Boolean#FALSE} if you know the query doesn't need to be rewritten.
   * Similarly, {@code preExtractRewriteFunction} is an extension hook allowing substitute queries to be
   * supplied for a given query before the {@link WeightedSpanTermExtractor}'s extraction is invoked.
   * {@code ignoreQueriesNeedingRewrite} effectively ignores any query clause that needs to be "rewritten",
   * which is usually limited to just a {@link SpanMultiTermQueryWrapper} but could be other custom ones.
   * {@code fieldMatcher} is the field name predicate to use for extracting the query part that must be
   * highlighted.
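   * <p>For example, a hedged sketch of a {@code rewriteQueryPred} for a caller who knows its own span
   * query type never needs rewriting ({@code MyCustomSpanQuery} is hypothetical; returning {@code null}
   * defers to the default):
   * <pre>{@code
   * Function<SpanQuery, Boolean> rewriteQueryPred =
   *     sq -> sq instanceof MyCustomSpanQuery ? Boolean.FALSE : null;
   * }</pre>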
   */
  public PhraseHelper(Query query, String field, Predicate<String> fieldMatcher,
                      Function<SpanQuery, Boolean> rewriteQueryPred,
                      Function<Query, Collection<Query>> preExtractRewriteFunction,
                      boolean ignoreQueriesNeedingRewrite) {
    this.fieldName = field;
    this.fieldMatcher = fieldMatcher;
    // filter terms to those we want
    positionInsensitiveTerms = new HashSet<>();
    spanQueries = new HashSet<>();

    // TODO Have toSpanQuery(query) Function as an extension point for those with custom Query impls

    boolean[] mustRewriteHolder = {false}; // boolean wrapped in 1-ary array so it's mutable from inner class

    // For TermQueries or other position insensitive queries, collect the Terms.
    // For other Query types, WSTE will convert to an equivalent SpanQuery.  NOT extracting position spans here.
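    // Illustrative example: for a BooleanQuery of TermQuery(body:quick) plus a PhraseQuery on the
    //   same field, "quick" lands in positionInsensitiveTerms, while the phrase is collected into
    //   spanQueries (via WSTE's conversion to a span-near equivalent).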
    new WeightedSpanTermExtractor(field) {
      // instance initializer block (anonymous classes can't declare named constructors)
      {
        setExpandMultiTermQuery(true); //necessary for mustRewriteQuery(spanQuery) to work.

        try {
          extract(query, 1f, null); // null because we won't actually extract right now; we're not collecting
        } catch (Exception e) {
          throw new RuntimeException(e);
        }
      }

      @Override
      protected void extract(Query query, float boost, Map<String, WeightedSpanTerm> terms) throws IOException {
        Collection newQueriesToExtract = preExtractRewriteFunction.apply(query);
        if (newQueriesToExtract != null) {
          for (Query newQuery : newQueriesToExtract) {
            extract(newQuery, boost, terms);
          }
        } else {
          super.extract(query, boost, terms);
        }
      }

      @Override
      protected boolean isQueryUnsupported(Class<? extends Query> clazz) {
        if (clazz.isAssignableFrom(MultiTermQuery.class)) {
          return true; //We do MTQ processing separately in MultiTermHighlighting.java
        }
        return true; //TODO set to false and provide a hook to customize certain queries.
      }

      // called on Query types that are NOT position sensitive, e.g. TermQuery
      @Override
      protected void extractWeightedTerms(Map<String, WeightedSpanTerm> terms, Query query, float boost) {
        query.visit(new QueryVisitor() {
          @Override
          public boolean acceptField(String field) {
            return fieldMatcher.test(field);
          }
          @Override
          public void consumeTerms(Query query, Term... terms) {
            for (Term term : terms) {
              positionInsensitiveTerms.add(term.bytes());
            }
          }
        });
      }

      // called on SpanQueries. Some other position-sensitive queries like PhraseQuery are converted beforehand
      @Override
      protected void extractWeightedSpanTerms(Map<String, WeightedSpanTerm> terms, SpanQuery spanQuery,
                                              float boost) throws IOException {
        // if this span query isn't for this field, skip it.
        Set<String> fieldNameSet = new HashSet<>();//TODO reuse.  note: almost always size 1
        collectSpanQueryFields(spanQuery, fieldNameSet);
        for (String spanField : fieldNameSet) {
          if (!fieldMatcher.test(spanField)) {
            return;
          }
        }

        boolean mustRewriteQuery = mustRewriteQuery(spanQuery);
        if (ignoreQueriesNeedingRewrite && mustRewriteQuery) {
          return;// ignore this query
        }
        mustRewriteHolder[0] |= mustRewriteQuery;

        spanQueries.add(spanQuery);
      }

      @Override
      protected boolean mustRewriteQuery(SpanQuery spanQuery) {
        Boolean rewriteQ = rewriteQueryPred.apply(spanQuery);// allow to override
        return rewriteQ != null ? rewriteQ : super.mustRewriteQuery(spanQuery);
      }
    }; // calling the constructor triggered the extraction/visiting we want.  Hacky; yes.

    willRewrite = mustRewriteHolder[0];
  }

  public Set<SpanQuery> getSpanQueries() {
    return spanQueries;
  }

  /**
   * If there is no position sensitivity then this instance of the class can be ignored.
   */
  public boolean hasPositionSensitivity() {
    return spanQueries.isEmpty() == false;
  }

  /**
   * Rewrite is needed to handle a {@link SpanMultiTermQueryWrapper} (MTQ / wildcards) or some
   * custom query types.  When true, the resulting term list will probably be different from what it
   * was initially known to be.
   */
  public boolean willRewrite() {
    return willRewrite;
  }

  /** Returns the terms that are position-insensitive (sorted). */
  public BytesRef[] getAllPositionInsensitiveTerms() {
    BytesRef[] result = positionInsensitiveTerms.toArray(new BytesRef[positionInsensitiveTerms.size()]);
    Arrays.sort(result);
    return result;
  }

  /**
   * Given the internal SpanQueries, produces OffsetsEnums into the {@code results} param.
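   * <p>A hedged usage sketch ({@code leafReader} and {@code docId} are assumed to come from the
   * caller's per-segment context):
   * <pre>{@code
   * List<OffsetsEnum> results = new ArrayList<>();
   * phraseHelper.createOffsetsEnumsForSpans(leafReader, docId, results);
   * // results now holds one OffsetsEnum per collected span term, offsets sorted ascending
   * }</pre>
   */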
  public void createOffsetsEnumsForSpans(LeafReader leafReader, int docId, List<OffsetsEnum> results) throws IOException {
    leafReader = new SingleFieldWithOffsetsFilterLeafReader(leafReader, fieldName);
    //TODO avoid searcher and do what it does to rewrite & get weight?
    IndexSearcher searcher = new IndexSearcher(leafReader);
    searcher.setQueryCache(null);

    // for each SpanQuery, grab its Spans and put them into a PriorityQueue
    PriorityQueue<Spans> spansPriorityQueue = new PriorityQueue<Spans>(spanQueries.size()) {
      @Override
      protected boolean lessThan(Spans a, Spans b) {
        return a.startPosition() <= b.startPosition();
      }
    };
    for (Query query : spanQueries) {
      Weight weight = searcher.createWeight(searcher.rewrite(query), ScoreMode.COMPLETE_NO_SCORES, 1);
      Scorer scorer = weight.scorer(leafReader.getContext());
      if (scorer == null) {
        continue;
      }
      TwoPhaseIterator twoPhaseIterator = scorer.twoPhaseIterator();
      if (twoPhaseIterator != null) {
        if (twoPhaseIterator.approximation().advance(docId) != docId || !twoPhaseIterator.matches()) {
          continue;
        }
      } else if (scorer.iterator().advance(docId) != docId) { // pre-position; skip this query if the doc doesn't match
        continue;
      }

      Spans spans = ((SpanScorer) scorer).getSpans();
      assert spans.docID() == docId;
      if (spans.nextStartPosition() != Spans.NO_MORE_POSITIONS) {
        spansPriorityQueue.add(spans);
      }
    }

    // Iterate the Spans in the PriorityQueue, collecting as we go.  By using a PriorityQueue ordered by position,
    //   the underlying offsets in our collector will be mostly appended to the end of arrays (efficient).
    // note: alternatively it'd be interesting if we produced one OffsetsEnum that internally advanced
    //   this PriorityQueue when nextPosition is called; it would cap what we have to cache for large docs and
    //   exiting early (due to maxLen) is easy.
    //   But at least we have an accurate "freq" and it shouldn't be too much data to collect.  Even SpanScorer
    //   navigates the spans fully to compute a good freq (and thus score)!
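    // Illustrative trace (assumed positions): spans A matching at start positions {3, 9} and B at {5}:
    //   top()=A@3 -> collect, advance A to 9, updateTop; top()=B@5 -> collect, B is exhausted, pop;
    //   top()=A@9 -> collect, pop.  Offsets are therefore visited in ascending start-position order.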
    OffsetSpanCollector spanCollector = new OffsetSpanCollector();
    while (spansPriorityQueue.size() > 0) {
      Spans spans = spansPriorityQueue.top();
      //TODO limit to a capped endOffset length somehow so we can break this loop early
      spans.collect(spanCollector);

      if (spans.nextStartPosition() == Spans.NO_MORE_POSITIONS) {
        spansPriorityQueue.pop();
      } else {
        spansPriorityQueue.updateTop();
      }
    }
    results.addAll(spanCollector.termToOffsetsEnums.values());
  }

  /**
   * Needed to support highlighting a query irrespective of the field the query refers to
   * (aka requireFieldMatch=false).
   * This reader will just delegate every call to a single field in the wrapped
   * LeafReader. This way we ensure that all queries going through this reader target the same field.
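   * For instance, if {@code fieldName} were "body" (a hypothetical value), both {@code terms("body")}
   * and {@code terms("title")} would return the wrapped reader's "body" terms, with offsets forced on.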
   */
  private static final class SingleFieldWithOffsetsFilterLeafReader extends FilterLeafReader {
    final String fieldName;

    SingleFieldWithOffsetsFilterLeafReader(LeafReader in, String fieldName) {
      super(in);
      this.fieldName = fieldName;
    }

    @Override
    public FieldInfos getFieldInfos() {
      throw new UnsupportedOperationException();//TODO merge them
    }

    @Override
    public Terms terms(String field) throws IOException {
      // ensure the underlying PostingsEnum returns offsets.  It's sad we have to do this to use the SpanCollector.
      return new FilterTerms(super.terms(fieldName)) {
        @Override
        public TermsEnum iterator() throws IOException {
          return new FilterTermsEnum(in.iterator()) {
            @Override
            public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
              return super.postings(reuse, flags | PostingsEnum.OFFSETS);
            }
          };
        }
      };
    }

    @Override
    public NumericDocValues getNormValues(String field) throws IOException {
      return super.getNormValues(fieldName);
    }

    @Override
    public CacheHelper getCoreCacheHelper() {
      return null;
    }

    @Override
    public CacheHelper getReaderCacheHelper() {
      return null;
    }
  }

  private class OffsetSpanCollector implements SpanCollector {
    Map<BytesRef, SpanCollectedOffsetsEnum> termToOffsetsEnums = new HashMap<>();

    @Override
    public void collectLeaf(PostingsEnum postings, int position, Term term) throws IOException {
      if (!fieldMatcher.test(term.field())) {
        return;
      }

      SpanCollectedOffsetsEnum offsetsEnum = termToOffsetsEnums.get(term.bytes());
      if (offsetsEnum == null) {
        // If it's position-insensitive we handle it outside of PhraseHelper.  term.field() is from the Query.
        if (positionInsensitiveTerms.contains(term.bytes())) {
          return;
        }
        offsetsEnum = new SpanCollectedOffsetsEnum(term.bytes(), postings.freq());
        termToOffsetsEnums.put(term.bytes(), offsetsEnum);
      }
      offsetsEnum.add(postings.startOffset(), postings.endOffset());
    }

    @Override
    public void reset() { // called when at a new position.  We don't care.
    }
  }

  private static class SpanCollectedOffsetsEnum extends OffsetsEnum {
    // TODO perhaps optionally collect (and expose) payloads?
    private final BytesRef term;
    private final int[] startOffsets;
    private final int[] endOffsets;
    private int numPairs = 0;
    private int enumIdx = -1;

    private SpanCollectedOffsetsEnum(BytesRef term, int postingsFreq) {
      this.term = term;
      this.startOffsets = new int[postingsFreq]; // hopefully not wasteful?  At least we needn't resize it.
      this.endOffsets = new int[postingsFreq];
    }

    // called from collector before it's navigated
    void add(int startOffset, int endOffset) {
      assert enumIdx == -1 : "bad state";

      // loop backwards since we expect a match at the end or close to it.  We expect O(1) not O(N).
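      // Illustrative values: pairs [(2,5), (8,11), (8,14)] and add(8,12): pairIdx=2 compares
      //   (8,14) > (8,12), keep scanning; pairIdx=1 compares (8,11) < (8,12), break; insert at
      //   index 2 after shifting (8,14) right -> [(2,5), (8,11), (8,12), (8,14)].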
      int pairIdx = numPairs - 1;
      for (; pairIdx >= 0; pairIdx--) {
        int iStartOffset = startOffsets[pairIdx];
        int iEndOffset = endOffsets[pairIdx];
        int cmp = Integer.compare(iStartOffset, startOffset);
        if (cmp == 0) {
          cmp = Integer.compare(iEndOffset, endOffset);
        }
        if (cmp == 0) {
          return; // we already have this offset-pair for this term
        } else if (cmp < 0) {
          break; //we will insert offsetPair to the right of pairIdx
        }
      }
      // pairIdx is now one position to the left of where we insert the new pair
      // shift right any pairs by one to make room
      final int shiftLen = numPairs - (pairIdx + 1);
      if (shiftLen > 0) {
        System.arraycopy(startOffsets, pairIdx + 1, startOffsets, pairIdx + 2, shiftLen);
        System.arraycopy(endOffsets, pairIdx + 1, endOffsets, pairIdx + 2, shiftLen);
      }
      // now we can place the offset pair
      startOffsets[pairIdx + 1] = startOffset;
      endOffsets[pairIdx + 1] = endOffset;
      numPairs++;
    }

    @Override
    public boolean nextPosition() throws IOException {
      return ++enumIdx < numPairs;
    }

    @Override
    public int freq() throws IOException {
      return numPairs;
    }

    @Override
    public BytesRef getTerm() throws IOException {
      return term;
    }

    @Override
    public int startOffset() throws IOException {
      return startOffsets[enumIdx];
    }

    @Override
    public int endOffset() throws IOException {
      return endOffsets[enumIdx];
    }
  }

}