org.apache.lucene.queries.intervals.TermIntervalsSource Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.queries.intervals;

import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Objects;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.MatchesIterator;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.util.BytesRef;

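/**
 * An {@link IntervalsSource} that returns each position of a single term as a width-one
 * interval. Instances are typically obtained through the {@code Intervals.term(...)} factory
 * methods rather than constructed directly.
 */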
class TermIntervalsSource extends IntervalsSource {

  final BytesRef term;

  TermIntervalsSource(BytesRef term) {
    this.term = term;
  }

  @Override
  public IntervalIterator intervals(String field, LeafReaderContext ctx) throws IOException {
    Terms terms = ctx.reader().terms(field);
    if (terms == null) {
      return null;
    }
    if (terms.hasPositions() == false) {
      throw new IllegalArgumentException(
          "Cannot create an IntervalIterator over field "
              + field
              + " because it has no indexed positions");
    }
    TermsEnum te = terms.iterator();
    if (te.seekExact(term) == false) {
      return null;
    }
    return intervals(term, te);
  }

  static IntervalIterator intervals(BytesRef term, TermsEnum te) throws IOException {
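    // Positions alone are enough for interval iteration; matches(...) below asks for offsets too.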
    PostingsEnum pe = te.postings(null, PostingsEnum.POSITIONS);
    float cost = termPositionsCost(te);
    return new IntervalIterator() {

      @Override
      public int docID() {
        return pe.docID();
      }

      @Override
      public int nextDoc() throws IOException {
        int doc = pe.nextDoc();
        reset();
        return doc;
      }

      @Override
      public int advance(int target) throws IOException {
        int doc = pe.advance(target);
        reset();
        return doc;
      }

      @Override
      public long cost() {
        return pe.cost();
      }

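      // Per-document iteration state: `upto` counts the positions not yet consumed in the
      // current document, `pos` holds the last position returned by nextInterval().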
      int pos = -1, upto;

      @Override
      public int start() {
        return pos;
      }

      @Override
      public int end() {
        return pos;
      }

      @Override
      public int gaps() {
        return 0;
      }

      @Override
      public int nextInterval() throws IOException {
        if (upto <= 0) {
          return pos = NO_MORE_INTERVALS;
        }
        upto--;
        return pos = pe.nextPosition();
      }

      @Override
      public float matchCost() {
        return cost;
      }

      private void reset() throws IOException {
        if (pe.docID() == NO_MORE_DOCS) {
          upto = -1;
          pos = NO_MORE_INTERVALS;
        } else {
          upto = pe.freq();
          pos = -1;
        }
      }

      @Override
      public String toString() {
        return term.utf8ToString() + ":" + super.toString();
      }
    };
  }
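  // Usage sketch: callers typically drive the returned iterator document by document,
  // then pull positions until exhaustion, e.g.
  //
  //   for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
  //     while (it.nextInterval() != IntervalIterator.NO_MORE_INTERVALS) {
  //       int start = it.start(), end = it.end(); // equal for a single term
  //     }
  //   }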

  @Override
  public IntervalMatchesIterator matches(String field, LeafReaderContext ctx, int doc)
      throws IOException {
    Terms terms = ctx.reader().terms(field);
    if (terms == null) {
      return null;
    }
    if (terms.hasPositions() == false) {
      throw new IllegalArgumentException(
          "Cannot create an IntervalIterator over field "
              + field
              + " because it has no indexed positions");
    }
    TermsEnum te = terms.iterator();
    if (te.seekExact(term) == false) {
      return null;
    }
    return matches(te, doc, field);
  }

  static IntervalMatchesIterator matches(TermsEnum te, int doc, String field) throws IOException {
    TermQuery query = new TermQuery(new Term(field, te.term()));
    PostingsEnum pe = te.postings(null, PostingsEnum.OFFSETS);
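    // The target document may not contain the term at all; in that case there are no matches.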
    if (pe.advance(doc) != doc) {
      return null;
    }
    return new IntervalMatchesIterator() {

      @Override
      public int gaps() {
        return 0;
      }

      @Override
      public int width() {
        return 1;
      }

      int upto = pe.freq();
      int pos = -1;

      @Override
      public boolean next() throws IOException {
        if (upto <= 0) {
          pos = IntervalIterator.NO_MORE_INTERVALS;
          return false;
        }
        upto--;
        pos = pe.nextPosition();
        return true;
      }

      @Override
      public int startPosition() {
        return pos;
      }

      @Override
      public int endPosition() {
        return pos;
      }

      @Override
      public int startOffset() throws IOException {
        return pe.startOffset();
      }

      @Override
      public int endOffset() throws IOException {
        return pe.endOffset();
      }

      @Override
      public MatchesIterator getSubMatches() {
        return null;
      }

      @Override
      public Query getQuery() {
        return query;
      }
    };
  }

  @Override
  public int minExtent() {
    return 1;
  }

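  // A single term contains no disjunctions to flatten, so the source pulls up as itself.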
  @Override
  public Collection<IntervalsSource> pullUpDisjunctions() {
    return Collections.singleton(this);
  }

  @Override
  public int hashCode() {
    return Objects.hash(term);
  }

  @Override
  public boolean equals(Object o) {
    if (this == o) return true;
    if (o == null || getClass() != o.getClass()) return false;
    TermIntervalsSource that = (TermIntervalsSource) o;
    return Objects.equals(term, that.term);
  }

  @Override
  public String toString() {
    return term.utf8ToString();
  }

  @Override
  public void visit(String field, QueryVisitor visitor) {
    visitor.consumeTerms(new IntervalQuery(field, this), new Term(field, term));
  }

  /**
   * A guess of the average number of simple operations for the initial seek and buffer refill per
   * document for the positions of a term. See also {@link
   * Lucene912PostingsReader.EverythingEnum#nextPosition()}.
   *
   * <p>Aside: Instead of being constant this could depend among others on {@link
   * Lucene912PostingsFormat#BLOCK_SIZE}, {@link TermsEnum#docFreq()}, {@link
   * TermsEnum#totalTermFreq()}, {@link DocIdSetIterator#cost()} (expected number of matching docs),
   * {@link LeafReader#maxDoc()} (total number of docs in the segment), and the seek time and block
   * size of the device storing the index.
   */
  private static final int TERM_POSNS_SEEK_OPS_PER_DOC = 128;

  /**
   * Number of simple operations in {@link Lucene912PostingsReader.EverythingEnum#nextPosition()}
   * when no seek or buffer refill is done.
   */
  private static final int TERM_OPS_PER_POS = 7;

  /**
   * Returns an expected cost in simple operations of processing the occurrences of a term in a
   * document that contains the term. This is for use by {@link TwoPhaseIterator#matchCost}
   * implementations.
   *
   * @param termsEnum The term is the term at which this TermsEnum is positioned.
   */
  static float termPositionsCost(TermsEnum termsEnum) throws IOException {
    // TODO: When intervals move to core, refactor to use the copy of this in PhraseQuery
    int docFreq = termsEnum.docFreq();
    assert docFreq > 0;
    long totalTermFreq = termsEnum.totalTermFreq();
    float expOccurrencesInMatchingDoc = totalTermFreq / (float) docFreq;
    return TERM_POSNS_SEEK_OPS_PER_DOC + expOccurrencesInMatchingDoc * TERM_OPS_PER_POS;
  }
}
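As a worked example of the cost estimate above: a term with docFreq = 100 and totalTermFreq = 300 averages 3 occurrences per matching document, giving termPositionsCost = 128 + 3 * 7 = 149 simple operations.

Below is a minimal, self-contained sketch of how this package-private class is normally reached through the public API, via Intervals.term(...) and IntervalQuery (assuming the lucene-core and lucene-queries modules of the same 9.x release are on the classpath). The field name, document text, and in-memory directory are illustrative only.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queries.intervals.IntervalQuery;
import org.apache.lucene.queries.intervals.Intervals;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;

public class TermIntervalsExample {
  public static void main(String[] args) throws Exception {
    try (Directory dir = new ByteBuffersDirectory()) {
      try (IndexWriter writer =
          new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
        Document doc = new Document();
        // TextField indexes positions, which TermIntervalsSource requires.
        doc.add(new TextField("body", "searching with lucene intervals", Field.Store.NO));
        writer.addDocument(doc);
      }
      try (DirectoryReader reader = DirectoryReader.open(dir)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        // Intervals.term(...) produces a TermIntervalsSource like the class above;
        // IntervalQuery runs it against the "body" field.
        long hits =
            searcher.search(new IntervalQuery("body", Intervals.term("lucene")), 10).totalHits.value;
        System.out.println(hits); // expect 1
      }
    }
  }
}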