org.apache.lucene.queries.spans.SpanTermQuery Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-queries Show documentation
Lucene Queries Module
There is a newer version: 10.0.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.queries.spans;

import java.io.IOException;
import java.util.Collections;
import java.util.Map;
import java.util.Objects;
import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.ScoreMode;

/**
 * Matches spans containing a term. This should not be used for terms that are indexed at position
 * Integer.MAX_VALUE.
 */
public class SpanTermQuery extends SpanQuery {

  /** The term whose occurrences produce the spans. Never null. */
  protected final Term term;

  /** Optional pre-built term states supplied by the caller; null when none were provided. */
  protected final TermStates termStates;

  /**
   * Construct a SpanTermQuery matching the named term's spans.
   *
   * @param term the term to match; must not be null
   * @throws NullPointerException if {@code term} is null
   */
  public SpanTermQuery(Term term) {
    this.term = Objects.requireNonNull(term);
    this.termStates = null;
  }

  /**
   * Expert: Construct a SpanTermQuery matching the named term's spans, using the provided
   * TermStates
   *
   * @param term the term to match; must not be null
   * @param termStates pre-built term states, or null; they are only reused by {@link
   *     #createWeight} when they were built for the searcher's top-level reader context
   * @throws NullPointerException if {@code term} is null
   */
  public SpanTermQuery(Term term, TermStates termStates) {
    this.term = Objects.requireNonNull(term);
    this.termStates = termStates;
  }

  /** Return the term whose spans are matched. */
  public Term getTerm() {
    return term;
  }

  /**
   * Returns the {@link TermStates} passed to the constructor, or null if it was not passed.
   *
   * @lucene.experimental
   */
  public TermStates getTermStates() {
    return termStates;
  }

  /** Returns the field of the wrapped term. */
  @Override
  public String getField() {
    return term.field();
  }

  /**
   * Creates the weight for this query.
   *
   * <p>The {@link TermStates} given to the constructor are reused only when they were built for
   * the searcher's top-level reader context; otherwise fresh states are built from the searcher.
   * The term-to-states map handed to the weight is a singleton map when scores are needed and
   * null otherwise.
   */
  @Override
  public SpanWeight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
      throws IOException {
    final TermStates context;
    final IndexReaderContext topContext = searcher.getTopReaderContext();
    if (termStates == null || termStates.wasBuiltFor(topContext) == false) {
      context = TermStates.build(searcher, term, scoreMode.needsScores());
    } else {
      context = termStates;
    }
    return new SpanTermWeight(
        context,
        searcher,
        scoreMode.needsScores() ? Collections.singletonMap(term, context) : null,
        boost);
  }

  /** Reports this query's term to the visitor if the visitor accepts the term's field. */
  @Override
  public void visit(QueryVisitor visitor) {
    if (visitor.acceptField(term.field())) {
      visitor.consumeTerms(this, term);
    }
  }

  /**
   * Creates SpanTermQuery scorer instances
   *
   * @lucene.internal
   */
  public class SpanTermWeight extends SpanWeight {

    /** Term states used to position the terms enum in each leaf; asserted non-null. */
    final TermStates termStates;

    /**
     * @param termStates the term states to read per-leaf term state from; must not be null
     * @param searcher the searcher the weight is created for
     * @param terms map from term to its states, forwarded to {@link SpanWeight}; may be null
     * @param boost the boost forwarded to {@link SpanWeight}
     */
    public SpanTermWeight(
        TermStates termStates, IndexSearcher searcher, Map<Term, TermStates> terms, float boost)
        throws IOException {
      super(SpanTermQuery.this, searcher, terms, boost);
      this.termStates = termStates;
      assert termStates != null : "TermStates must not be null";
    }

    /** Always reports the weight as cacheable, regardless of the leaf. */
    @Override
    public boolean isCacheable(LeafReaderContext ctx) {
      return true;
    }

    /** Registers this weight's term states under the query's term. */
    @Override
    public void extractTermStates(Map<Term, TermStates> contexts) {
      contexts.put(term, termStates);
    }

    /**
     * Returns the spans of the term in the given leaf.
     *
     * @return the term's spans, or null when the leaf has no state for the term or no terms for
     *     the field
     * @throws IllegalStateException if the field was indexed without positions
     */
    @Override
    public Spans getSpans(final LeafReaderContext context, Postings requiredPostings)
        throws IOException {

      assert termStates.wasBuiltFor(ReaderUtil.getTopLevelContext(context))
          : "The top-reader used to create Weight is not the same as the current reader's top-reader ("
              + ReaderUtil.getTopLevelContext(context)
              + ")";

      final TermState state = termStates.get(context);
      if (state == null) { // term is not present in that reader
        assert context.reader().docFreq(term) == 0
            : "no termstate found but term exists in reader term=" + term;
        return null;
      }

      final Terms terms = context.reader().terms(term.field());
      if (terms == null) {
        return null;
      }
      if (terms.hasPositions() == false) {
        throw new IllegalStateException(
            "field \""
                + term.field()
                + "\" was indexed without position data; cannot run SpanTermQuery (term="
                + term.text()
                + ")");
      }

      // Position the enum using the already-resolved term state for this leaf.
      final TermsEnum termsEnum = terms.iterator();
      termsEnum.seekExact(term.bytes(), state);

      final PostingsEnum postings =
          termsEnum.postings(null, requiredPostings.getRequiredPostings());
      float positionsCost = termPositionsCost(termsEnum) * PHRASE_TO_SPAN_TERM_POSITIONS_COST;
      return new TermSpans(getSimScorer(context), postings, term, positionsCost);
    }
  }

  /**
   * A guess of the relative cost of dealing with the term positions when using a SpanNearQuery
   * instead of a PhraseQuery.
   */
  private static final float PHRASE_TO_SPAN_TERM_POSITIONS_COST = 4.0f;

  /** Fixed per-document term added to the cost returned by {@link #termPositionsCost}. */
  private static final int TERM_POSNS_SEEK_OPS_PER_DOC = 128;

  /** Per-occurrence multiplier used by {@link #termPositionsCost}. */
  private static final int TERM_OPS_PER_POS = 7;

  /**
   * Returns an expected cost in simple operations of processing the occurrences of a term in a
   * document that contains the term.
   *
   * <p>This is a copy of org.apache.lucene.search.PhraseQuery.termPositionsCost().
   *
   * <p>TODO: keep only a single copy of this method and the constants used in it when
   * SpanTermQuery moves to the o.a.l.search package.
   *
   * @param termsEnum The term is the term at which this TermsEnum is positioned.
   * @return {@code TERM_POSNS_SEEK_OPS_PER_DOC} plus the average number of occurrences per
   *     matching document times {@code TERM_OPS_PER_POS}
   */
  static float termPositionsCost(TermsEnum termsEnum) throws IOException {
    int docFreq = termsEnum.docFreq();
    assert docFreq > 0;
    long totalTermFreq = termsEnum.totalTermFreq();
    assert totalTermFreq > 0;
    float expOccurrencesInMatchingDoc = totalTermFreq / (float) docFreq;
    return TERM_POSNS_SEEK_OPS_PER_DOC + expOccurrencesInMatchingDoc * TERM_OPS_PER_POS;
  }

  /**
   * Returns only the term text when the term's field equals {@code field}; otherwise returns the
   * term's full string form.
   */
  @Override
  public String toString(String field) {
    return term.field().equals(field) ? term.text() : term.toString();
  }

  /** Combines the class hash with the term's hash; the term states are not included. */
  @Override
  public int hashCode() {
    return classHash() ^ term.hashCode();
  }

  /** Two queries are equal when they have the same class and equal terms. */
  @Override
  public boolean equals(Object other) {
    return sameClassAs(other) && term.equals(((SpanTermQuery) other).term);
  }
}