All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.spatial.prefix.PrefixTreeFacetCounter Maven / Gradle / Ivy

There is a newer version: 10.1.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.spatial.prefix;

import java.io.IOException;
import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.spatial.prefix.tree.Cell;
import org.apache.lucene.spatial.prefix.tree.SpatialPrefixTree;
import org.apache.lucene.util.Bits;
import org.locationtech.spatial4j.shape.Shape;

/**
 * Computes facets on cells for {@link org.apache.lucene.spatial.prefix.PrefixTreeStrategy}.
 *
 * 

NOTE: If for a given document and a given field using {@link * org.apache.lucene.spatial.prefix.RecursivePrefixTreeStrategy} multiple values are indexed (i.e. * multi-valued) and at least one of them is a non-point, then there is a possibility of * double-counting the document in the facet results. Since each shape is independently turned into * grid cells at a resolution chosen by the shape's size, it's possible they will be indexed at * different resolutions. This means the document could be present in BOTH the postings for a cell * in both its prefix and leaf variants. To avoid this, use a single valued field with a {@link * org.locationtech.spatial4j.shape.ShapeCollection} (or WKT equivalent). Or calculate a suitable * level/distErr to index both and call {@link * org.apache.lucene.spatial.prefix.PrefixTreeStrategy#createIndexableFields(org.locationtech.spatial4j.shape.Shape, * int)} with the same value for all shapes for a given document/field. * * @lucene.experimental */ public class PrefixTreeFacetCounter { /** A callback/visitor of facet counts. */ public abstract static class FacetVisitor { /** Called at the start of the segment, if there is indexed data. */ public void startOfSegment() {} /** * Called for cells with a leaf, or cells at the target facet level. {@code count} is greater * than zero. When an ancestor cell is given with non-zero count, the count can be considered to * be added to all cells below. You won't necessarily get a cell at level {@code facetLevel} if * the indexed data is courser (bigger). */ public abstract void visit(Cell cell, int count); } private PrefixTreeFacetCounter() {} /** * Computes facets using a callback/visitor style design, allowing flexibility for the caller to * determine what to do with each underlying count. * * @param strategy the prefix tree strategy (contains the field reference, grid, max levels) * @param context the IndexReader's context * @param topAcceptDocs a Bits to limit counted docs. If null, live docs are counted. * @param queryShape the shape to limit the range of facet counts to * @param facetLevel the maximum depth (detail) of faceted cells * @param facetVisitor the visitor/callback to receive the counts */ public static void compute( PrefixTreeStrategy strategy, IndexReaderContext context, Bits topAcceptDocs, Shape queryShape, int facetLevel, FacetVisitor facetVisitor) throws IOException { // We collect per-leaf for (final LeafReaderContext leafCtx : context.leaves()) { // determine leaf acceptDocs Bits Bits leafAcceptDocs; if (topAcceptDocs == null) { leafAcceptDocs = leafCtx.reader().getLiveDocs(); // filter deleted } else { leafAcceptDocs = new Bits() { @Override public boolean get(int index) { return topAcceptDocs.get(leafCtx.docBase + index); } @Override public int length() { return leafCtx.reader().maxDoc(); } }; } compute(strategy, leafCtx, leafAcceptDocs, queryShape, facetLevel, facetVisitor); } } /** Lower-level per-leaf segment method. */ public static void compute( final PrefixTreeStrategy strategy, final LeafReaderContext context, final Bits acceptDocs, final Shape queryShape, final int facetLevel, final FacetVisitor facetVisitor) throws IOException { if (acceptDocs != null && acceptDocs.length() != context.reader().maxDoc()) { throw new IllegalArgumentException( "acceptDocs bits length " + acceptDocs.length() + " != leaf maxdoc " + context.reader().maxDoc()); } final SpatialPrefixTree tree = strategy.getGrid(); // scanLevel is an optimization knob of AbstractVisitingPrefixTreeFilter. It's unlikely // another scanLevel would be much faster and it tends to be a risky knob (can help a little, // can hurt a ton). // TODO use RPT's configured scan level? Do we know better here? Hard to say. final int scanLevel = tree.getMaxLevels(); // AbstractVisitingPrefixTreeFilter is a Lucene Filter. We don't need a filter; we use it for // its great prefix-tree // traversal code. TODO consider refactoring if/when it makes sense (more use cases than this) new AbstractVisitingPrefixTreeQuery( queryShape, strategy.getFieldName(), tree, facetLevel, scanLevel) { @Override public String toString(String field) { return "anonPrefixTreeQuery"; // un-used } @Override public DocIdSet getDocIdSet(LeafReaderContext contexts) throws IOException { assert facetLevel == super.detailLevel; // same thing, FYI. (constant) return new VisitorTemplate(context) { @Override protected void start() throws IOException { facetVisitor.startOfSegment(); } @Override protected DocIdSet finish() throws IOException { return null; // unused; } @Override protected boolean visitPrefix(Cell cell) throws IOException { // At facetLevel... if (cell.getLevel() == facetLevel) { // Count docs visitLeaf(cell); // we're not a leaf but we treat it as such at facet level return false; // don't descend further; this is enough detail } // We optimize for discriminating filters (reflected in acceptDocs) and short-circuit if // no // matching docs. We could do this at all levels or never but the closer we get to the // facet level, the // higher the probability this is worthwhile. We do when docFreq == 1 because it's a // cheap check, especially // due to "pulsing" in the codec. // TODO this opt should move to VisitorTemplate (which contains an optimization TODO to // this effect) if (cell.getLevel() == facetLevel - 1 || termsEnum.docFreq() == 1) { if (!hasDocsAtThisTerm()) { return false; } } return true; } @Override protected void visitLeaf(Cell cell) throws IOException { final int count = countDocsAtThisTerm(); if (count > 0) { facetVisitor.visit(cell, count); } } private int countDocsAtThisTerm() throws IOException { if (acceptDocs == null) { return termsEnum.docFreq(); } int count = 0; postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE); while (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { if (acceptDocs.get(postingsEnum.docID()) == false) { continue; } count++; } return count; } private boolean hasDocsAtThisTerm() throws IOException { if (acceptDocs == null) { return true; } postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE); int nextDoc = postingsEnum.nextDoc(); while (nextDoc != DocIdSetIterator.NO_MORE_DOCS && acceptDocs.get(nextDoc) == false) { nextDoc = postingsEnum.nextDoc(); } return nextDoc != DocIdSetIterator.NO_MORE_DOCS; } }.getDocIdSet(); } }.getDocIdSet(context); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy