org.apache.lucene.queries.intervals.TermIntervalsSource Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.queries.intervals;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Objects;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsFormat;
import org.apache.lucene.codecs.lucene912.Lucene912PostingsReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.MatchesIterator;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.util.BytesRef;
class TermIntervalsSource extends IntervalsSource {
final BytesRef term;
TermIntervalsSource(BytesRef term) {
this.term = term;
}
@Override
public IntervalIterator intervals(String field, LeafReaderContext ctx) throws IOException {
Terms terms = ctx.reader().terms(field);
if (terms == null) {
return null;
}
if (terms.hasPositions() == false) {
throw new IllegalArgumentException(
"Cannot create an IntervalIterator over field "
+ field
+ " because it has no indexed positions");
}
TermsEnum te = terms.iterator();
if (te.seekExact(term) == false) {
return null;
}
return intervals(term, te);
}
static IntervalIterator intervals(BytesRef term, TermsEnum te) throws IOException {
PostingsEnum pe = te.postings(null, PostingsEnum.POSITIONS);
float cost = termPositionsCost(te);
return new IntervalIterator() {
@Override
public int docID() {
return pe.docID();
}
@Override
public int nextDoc() throws IOException {
int doc = pe.nextDoc();
reset();
return doc;
}
@Override
public int advance(int target) throws IOException {
int doc = pe.advance(target);
reset();
return doc;
}
@Override
public long cost() {
return pe.cost();
}
int pos = -1, upto;
@Override
public int start() {
return pos;
}
@Override
public int end() {
return pos;
}
@Override
public int gaps() {
return 0;
}
@Override
public int nextInterval() throws IOException {
if (upto <= 0) {
return pos = NO_MORE_INTERVALS;
}
upto--;
return pos = pe.nextPosition();
}
@Override
public float matchCost() {
return cost;
}
private void reset() throws IOException {
if (pe.docID() == NO_MORE_DOCS) {
upto = -1;
pos = NO_MORE_INTERVALS;
} else {
upto = pe.freq();
pos = -1;
}
}
@Override
public String toString() {
return term.utf8ToString() + ":" + super.toString();
}
};
}
@Override
public IntervalMatchesIterator matches(String field, LeafReaderContext ctx, int doc)
throws IOException {
Terms terms = ctx.reader().terms(field);
if (terms == null) {
return null;
}
if (terms.hasPositions() == false) {
throw new IllegalArgumentException(
"Cannot create an IntervalIterator over field "
+ field
+ " because it has no indexed positions");
}
TermsEnum te = terms.iterator();
if (te.seekExact(term) == false) {
return null;
}
return matches(te, doc, field);
}
static IntervalMatchesIterator matches(TermsEnum te, int doc, String field) throws IOException {
TermQuery query = new TermQuery(new Term(field, te.term()));
PostingsEnum pe = te.postings(null, PostingsEnum.OFFSETS);
if (pe.advance(doc) != doc) {
return null;
}
return new IntervalMatchesIterator() {
@Override
public int gaps() {
return 0;
}
@Override
public int width() {
return 1;
}
int upto = pe.freq();
int pos = -1;
@Override
public boolean next() throws IOException {
if (upto <= 0) {
pos = IntervalIterator.NO_MORE_INTERVALS;
return false;
}
upto--;
pos = pe.nextPosition();
return true;
}
@Override
public int startPosition() {
return pos;
}
@Override
public int endPosition() {
return pos;
}
@Override
public int startOffset() throws IOException {
return pe.startOffset();
}
@Override
public int endOffset() throws IOException {
return pe.endOffset();
}
@Override
public MatchesIterator getSubMatches() {
return null;
}
@Override
public Query getQuery() {
return query;
}
};
}
@Override
public int minExtent() {
return 1;
}
@Override
public Collection pullUpDisjunctions() {
return Collections.singleton(this);
}
@Override
public int hashCode() {
return Objects.hash(term);
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
TermIntervalsSource that = (TermIntervalsSource) o;
return Objects.equals(term, that.term);
}
@Override
public String toString() {
return term.utf8ToString();
}
@Override
public void visit(String field, QueryVisitor visitor) {
visitor.consumeTerms(new IntervalQuery(field, this), new Term(field, term));
}
/**
* A guess of the average number of simple operations for the initial seek and buffer refill per
* document for the positions of a term. See also {@link
* Lucene912PostingsReader.EverythingEnum#nextPosition()}.
*
* Aside: Instead of being constant this could depend among others on {@link
* Lucene912PostingsFormat#BLOCK_SIZE}, {@link TermsEnum#docFreq()}, {@link
* TermsEnum#totalTermFreq()}, {@link DocIdSetIterator#cost()} (expected number of matching docs),
* {@link LeafReader#maxDoc()} (total number of docs in the segment), and the seek time and block
* size of the device storing the index.
*/
private static final int TERM_POSNS_SEEK_OPS_PER_DOC = 128;
/**
* Number of simple operations in {@link Lucene912PostingsReader.EverythingEnum#nextPosition()}
* when no seek or buffer refill is done.
*/
private static final int TERM_OPS_PER_POS = 7;
/**
* Returns an expected cost in simple operations of processing the occurrences of a term in a
* document that contains the term. This is for use by {@link TwoPhaseIterator#matchCost}
* implementations.
*
* @param termsEnum The term is the term at which this TermsEnum is positioned.
*/
static float termPositionsCost(TermsEnum termsEnum) throws IOException {
// TODO: When intervals move to core, refactor to use the copy of this in PhraseQuery
int docFreq = termsEnum.docFreq();
assert docFreq > 0;
long totalTermFreq = termsEnum.totalTermFreq();
float expOccurrencesInMatchingDoc = totalTermFreq / (float) docFreq;
return TERM_POSNS_SEEK_OPS_PER_DOC + expOccurrencesInMatchingDoc * TERM_OPS_PER_POS;
}
}