org.apache.lucene.spatial.prefix.PrefixTreeFacetCounter Maven / Gradle / Ivy
Show all versions of lucene-spatial-extras Show documentation
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.spatial.prefix;
import java.io.IOException;
import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.spatial.prefix.tree.Cell;
import org.apache.lucene.spatial.prefix.tree.SpatialPrefixTree;
import org.apache.lucene.util.Bits;
import org.locationtech.spatial4j.shape.Shape;
/**
* Computes facets on cells for {@link org.apache.lucene.spatial.prefix.PrefixTreeStrategy}.
*
* NOTE: If for a given document and a given field using {@link
* org.apache.lucene.spatial.prefix.RecursivePrefixTreeStrategy} multiple values are indexed (i.e.
* multi-valued) and at least one of them is a non-point, then there is a possibility of
* double-counting the document in the facet results. Since each shape is independently turned into
* grid cells at a resolution chosen by the shape's size, it's possible they will be indexed at
* different resolutions. This means the document could be present in the postings for BOTH the
* prefix and leaf variants of a cell. To avoid this, use a single-valued field with a {@link
* org.locationtech.spatial4j.shape.ShapeCollection} (or WKT equivalent). Or calculate a suitable
* level/distErr to index both and call {@link
* org.apache.lucene.spatial.prefix.PrefixTreeStrategy#createIndexableFields(org.locationtech.spatial4j.shape.Shape,
* int)} with the same value for all shapes for a given document/field.
*
* @lucene.experimental
*/
public class PrefixTreeFacetCounter {

  /** Visitor (callback) that receives facet counts as they are produced. */
  public abstract static class FacetVisitor {

    /** Invoked at the start of a segment, provided that segment has indexed data. */
    public void startOfSegment() {}

    /**
     * Invoked for cells with a leaf, or for cells at the target facet level. {@code count} is
     * always greater than zero. When an ancestor cell is reported with a non-zero count, that
     * count can be considered to be added to all cells below it. A cell at level {@code
     * facetLevel} is not guaranteed to be reported if the indexed data is coarser (bigger).
     *
     * @param cell the cell being reported
     * @param count the number of accepted documents found for the cell; never zero
     */
    public abstract void visit(Cell cell, int count);
  }

  /** Not instantiable: this class only exposes static entry points. */
  private PrefixTreeFacetCounter() {}

  /**
   * Computes facets with a callback/visitor design, leaving it to the caller to decide what to do
   * with each underlying count. Every leaf (segment) of {@code context} is processed in turn by
   * the per-leaf overload.
   *
   * @param strategy the prefix tree strategy (contains the field reference, grid, max levels)
   * @param context the IndexReader's context
   * @param topAcceptDocs a Bits to limit counted docs, indexed by top-level doc id. If null, live
   *     docs are counted.
   * @param queryShape the shape to limit the range of facet counts to
   * @param facetLevel the maximum depth (detail) of faceted cells
   * @param facetVisitor the visitor/callback to receive the counts
   * @throws IOException if reading the index fails
   */
  public static void compute(
      PrefixTreeStrategy strategy,
      IndexReaderContext context,
      Bits topAcceptDocs,
      Shape queryShape,
      int facetLevel,
      FacetVisitor facetVisitor)
      throws IOException {
    // Facets are collected one leaf at a time.
    for (final LeafReaderContext leaf : context.leaves()) {
      // No caller-supplied filter: fall back to the leaf's live docs (filters deleted docs).
      // Otherwise translate the top-level filter into this leaf's doc id space.
      final Bits leafAcceptDocs =
          topAcceptDocs == null
              ? leaf.reader().getLiveDocs()
              : rebaseToLeaf(topAcceptDocs, leaf);
      compute(strategy, leaf, leafAcceptDocs, queryShape, facetLevel, facetVisitor);
    }
  }

  /**
   * Wraps top-level accept-docs so they can be addressed with leaf-relative doc ids: each lookup
   * is shifted by the leaf's {@code docBase}, and the reported length is the leaf reader's {@code
   * maxDoc()}.
   */
  private static Bits rebaseToLeaf(final Bits topAcceptDocs, final LeafReaderContext leaf) {
    return new Bits() {
      @Override
      public boolean get(int leafDocId) {
        return topAcceptDocs.get(leaf.docBase + leafDocId);
      }

      @Override
      public int length() {
        return leaf.reader().maxDoc();
      }
    };
  }

  /**
   * Lower-level per-leaf segment method.
   *
   * @param strategy the prefix tree strategy supplying the grid and the field name
   * @param context the leaf (segment) to compute facets for
   * @param acceptDocs leaf-relative Bits limiting the counted docs. If null, no filtering is
   *     applied and a term's {@code docFreq} is used as its count.
   * @param queryShape the shape to limit the range of facet counts to
   * @param facetLevel the maximum depth (detail) of faceted cells
   * @param facetVisitor the visitor/callback to receive the counts
   * @throws IllegalArgumentException if {@code acceptDocs} is non-null and its length differs from
   *     the leaf reader's {@code maxDoc()}
   * @throws IOException if reading the index fails
   */
  public static void compute(
      final PrefixTreeStrategy strategy,
      final LeafReaderContext context,
      final Bits acceptDocs,
      final Shape queryShape,
      final int facetLevel,
      final FacetVisitor facetVisitor)
      throws IOException {
    if (acceptDocs != null) {
      final int bitsLength = acceptDocs.length();
      final int leafMaxDoc = context.reader().maxDoc();
      if (bitsLength != leafMaxDoc) {
        throw new IllegalArgumentException(
            "acceptDocs bits length " + bitsLength + " != leaf maxdoc " + leafMaxDoc);
      }
    }

    final SpatialPrefixTree grid = strategy.getGrid();

    // scanLevel is an optimization knob of AbstractVisitingPrefixTreeQuery. It's unlikely that
    // another scanLevel would be much faster, and it tends to be a risky knob (can help a little,
    // can hurt a ton).
    // TODO use RPT's configured scan level? Do we know better here? Hard to say.
    final int scanLevel = grid.getMaxLevels();

    // AbstractVisitingPrefixTreeQuery is instantiated here only to reuse its prefix-tree
    // traversal code; the resulting DocIdSet is never used.
    // TODO consider refactoring if/when it makes sense (more use cases than this)
    new AbstractVisitingPrefixTreeQuery(
        queryShape, strategy.getFieldName(), grid, facetLevel, scanLevel) {

      @Override
      public String toString(String field) {
        return "anonPrefixTreeQuery"; // un-used
      }

      @Override
      public DocIdSet getDocIdSet(LeafReaderContext segmentContext) throws IOException {
        assert facetLevel == super.detailLevel; // same thing, FYI. (constant)

        // segmentContext is the enclosing method's context: the only invocation is the one at
        // the bottom of this method.
        return new VisitorTemplate(segmentContext) {

          @Override
          protected void start() throws IOException {
            facetVisitor.startOfSegment();
          }

          @Override
          protected DocIdSet finish() throws IOException {
            return null; // unused
          }

          @Override
          protected boolean visitPrefix(Cell cell) throws IOException {
            final int level = cell.getLevel();
            if (level == facetLevel) {
              // Not a leaf, but at the facet level it is counted as if it were one; this is
              // enough detail, so don't descend any further.
              visitLeaf(cell);
              return false;
            }

            // We optimize for discriminating filters (reflected in acceptDocs) and short-circuit
            // when there are no matching docs. This could be done at all levels or never, but
            // the closer we get to the facet level, the higher the probability it is worthwhile.
            // It is also done when docFreq == 1 because that is a cheap check, especially due to
            // "pulsing" in the codec.
            // TODO this opt should move to VisitorTemplate (which contains an optimization TODO
            // to this effect)
            final boolean probeForMatches =
                level == facetLevel - 1 || termsEnum.docFreq() == 1;
            return !probeForMatches || hasDocsAtThisTerm();
          }

          @Override
          protected void visitLeaf(Cell cell) throws IOException {
            final int matches = countDocsAtThisTerm();
            if (matches <= 0) {
              return; // the visitor is only told about non-zero counts
            }
            facetVisitor.visit(cell, matches);
          }

          /** Counts the docs of the current term that pass {@code acceptDocs}. */
          private int countDocsAtThisTerm() throws IOException {
            if (acceptDocs == null) {
              return termsEnum.docFreq(); // nothing to filter
            }
            postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
            int matches = 0;
            for (int doc = postingsEnum.nextDoc();
                doc != DocIdSetIterator.NO_MORE_DOCS;
                doc = postingsEnum.nextDoc()) {
              if (acceptDocs.get(doc)) {
                matches++;
              }
            }
            return matches;
          }

          /** Tells whether at least one doc of the current term passes {@code acceptDocs}. */
          private boolean hasDocsAtThisTerm() throws IOException {
            if (acceptDocs == null) {
              return true;
            }
            postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
            for (int doc = postingsEnum.nextDoc();
                doc != DocIdSetIterator.NO_MORE_DOCS;
                doc = postingsEnum.nextDoc()) {
              if (acceptDocs.get(doc)) {
                return true; // first accepted doc is enough
              }
            }
            return false;
          }
        }.getDocIdSet();
      }
    }.getDocIdSet(context);
  }
}