org.apache.lucene.spatial.prefix.PrefixTreeFacetCounter Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-spatial-extras Show documentation
Apache Lucene (module: spatial-extras)
There is a newer version: 9.11.1
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.spatial.prefix;

import java.io.IOException;

import org.locationtech.spatial4j.shape.Shape;
import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.spatial.prefix.tree.Cell;
import org.apache.lucene.spatial.prefix.tree.SpatialPrefixTree;
import org.apache.lucene.util.Bits;

/**
 * Computes facets on cells for {@link org.apache.lucene.spatial.prefix.PrefixTreeStrategy}.
 * 
 * NOTE: If for a given document and a given field using
 * {@link org.apache.lucene.spatial.prefix.RecursivePrefixTreeStrategy}
 * multiple values are indexed (i.e. multi-valued) and at least one of them is a non-point, then there is a possibility
 * of double-counting the document in the facet results.  Since each shape is independently turned into grid cells at
 * a resolution chosen by the shape's size, it's possible they will be indexed at different resolutions.  This means
 * the document could be present in the postings for a cell in BOTH its prefix and leaf variants.  To avoid this,
 * use a single valued field with a {@link org.locationtech.spatial4j.shape.ShapeCollection} (or WKT equivalent).  Or
 * calculate a suitable level/distErr to index both and call
 * {@link org.apache.lucene.spatial.prefix.PrefixTreeStrategy#createIndexableFields(org.locationtech.spatial4j.shape.Shape, int)}
 * with the same value for all shapes for a given document/field.
 *
 * @lucene.experimental
 */
public class PrefixTreeFacetCounter {

  /** Receives facet counts as the traversal discovers them; the implementation decides how to aggregate. */
  public abstract static class FacetVisitor {
    /** Called at the start of the segment, if there is indexed data. */
    public void startOfSegment() {}

    /**
     * Called for cells with a leaf, or cells at the target facet level.  {@code count} is always greater than zero.
     * A non-zero count reported for an ancestor cell can be considered to apply to every cell beneath it.  A cell
     * at level {@code facetLevel} is not guaranteed: the indexed data may be coarser (bigger) than that.
     */
    public abstract void visit(Cell cell, int count);
  }

  /** Holder of static methods only; not instantiable. */
  private PrefixTreeFacetCounter() {
  }

  /**
   * Computes facets in a callback/visitor style, leaving it to the caller to decide what to do with each
   * underlying count.  Every leaf of {@code context} is processed in turn by the per-leaf overload.
   * @param strategy the prefix tree strategy (contains the field reference, grid, max levels)
   * @param context the IndexReader's context
   * @param topAcceptDocs a Bits to limit counted docs. If null, live docs are counted.
   * @param queryShape the shape to limit the range of facet counts to
   * @param facetLevel the maximum depth (detail) of faceted cells
   * @param facetVisitor the visitor/callback to receive the counts
   */
  public static void compute(PrefixTreeStrategy strategy, IndexReaderContext context, Bits topAcceptDocs,
                             Shape queryShape, int facetLevel, FacetVisitor facetVisitor)
      throws IOException {
    // Counts are gathered one leaf (segment) at a time.
    for (final LeafReaderContext leafCtx : context.leaves()) {
      final Bits leafAcceptDocs = (topAcceptDocs == null)
          ? leafCtx.reader().getLiveDocs() // no caller-supplied filter: just filter deleted docs
          : offsetByDocBase(topAcceptDocs, leafCtx);
      compute(strategy, leafCtx, leafAcceptDocs, queryShape, facetLevel, facetVisitor);
    }
  }

  /**
   * Wraps {@code topAcceptDocs} so that it can be queried with doc ids local to {@code leafCtx}: each index is
   * shifted by the leaf's {@code docBase} before delegating, and the reported length is the leaf's maxDoc.
   */
  private static Bits offsetByDocBase(final Bits topAcceptDocs, final LeafReaderContext leafCtx) {
    return new Bits() {
      @Override
      public boolean get(int index) {
        return topAcceptDocs.get(leafCtx.docBase + index);
      }

      @Override
      public int length() {
        return leafCtx.reader().maxDoc();
      }
    };
  }

  /**
   * Lower-level per-leaf segment method.
   * @param strategy the prefix tree strategy supplying the grid and the field name
   * @param context the leaf (segment) to count in
   * @param acceptDocs limits which docs are counted; if null, every doc of a term counts (its docFreq is used)
   * @param queryShape the shape to limit the range of facet counts to
   * @param facetLevel the maximum depth (detail) of faceted cells
   * @param facetVisitor the visitor/callback to receive the counts
   * @throws IllegalArgumentException if {@code acceptDocs} is non-null and its length differs from the leaf's maxDoc
   */
  public static void compute(final PrefixTreeStrategy strategy, final LeafReaderContext context, final Bits acceptDocs,
                             final Shape queryShape, final int facetLevel, final FacetVisitor facetVisitor)
      throws IOException {
    if (acceptDocs != null) {
      final int bitsLength = acceptDocs.length();
      final int leafMaxDoc = context.reader().maxDoc();
      if (bitsLength != leafMaxDoc) {
        throw new IllegalArgumentException(
            "acceptDocs bits length " + bitsLength + " != leaf maxdoc " + leafMaxDoc);
      }
    }
    final SpatialPrefixTree grid = strategy.getGrid();

    // scanLevel is an optimization knob of AbstractVisitingPrefixTreeQuery.  A different value is unlikely to be
    // much faster and the knob is risky (can help a little, can hurt a ton), so the grid's max levels is used.
    // TODO use RPT's configured scan level?  Do we know better here?  Hard to say.
    final int scanLevel = grid.getMaxLevels();

    // The query is not used as a query: the DocIdSet it produces is discarded.  It is borrowed purely for its
    // prefix-tree traversal code.  TODO consider refactoring if/when it makes sense (more use cases than this)
    new AbstractVisitingPrefixTreeQuery(queryShape, strategy.getFieldName(), grid, facetLevel, scanLevel) {

      @Override
      public String toString(String field) {
        return "anonPrefixTreeQuery";//un-used
      }

      @Override
      public DocIdSet getDocIdSet(LeafReaderContext ignoredCtx) throws IOException {
        // ignoredCtx is the enclosing method's 'context' (see the single call below); the outer one is used.
        assert super.detailLevel == facetLevel;//same thing, FYI. (constant)

        return new VisitorTemplate(context) {

          @Override
          protected void start() throws IOException {
            facetVisitor.startOfSegment();
          }

          @Override
          protected DocIdSet finish() throws IOException {
            return null;//unused;
          }

          @Override
          protected void visitLeaf(Cell cell) throws IOException {
            final int accepted = countAcceptedDocsAtThisTerm();
            if (accepted <= 0) {
              return;// nothing to report; visitors only ever see positive counts
            }
            facetVisitor.visit(cell, accepted);
          }

          @Override
          protected boolean visitPrefix(Cell cell) throws IOException {
            final int level = cell.getLevel();
            if (level == facetLevel) {
              // Not actually a leaf, but at the facet level it is counted like one, and there is no need
              // to descend any further: this is enough detail.
              visitLeaf(cell);
              return false;
            }

            // Optimization for discriminating filters (reflected in acceptDocs): stop descending when no doc of
            // this term is accepted.  It could be done at every level or never; the closer to the facet level,
            // the more likely it pays off.  It is also done when docFreq == 1 because that is a cheap check,
            // especially due to "pulsing" in the codec.
            //TODO this opt should move to VisitorTemplate (which contains an optimization TODO to this effect)
            final boolean checkForAcceptedDocs = (level == facetLevel - 1) || termsEnum.docFreq() == 1;
            return !checkForAcceptedDocs || anyAcceptedDocAtThisTerm();
          }

          /** Number of docs at the current term that pass {@code acceptDocs}; the term's docFreq if it is null. */
          private int countAcceptedDocsAtThisTerm() throws IOException {
            if (acceptDocs == null) {
              return termsEnum.docFreq();
            }
            postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
            int accepted = 0;
            for (int doc = postingsEnum.nextDoc();
                 doc != DocIdSetIterator.NO_MORE_DOCS;
                 doc = postingsEnum.nextDoc()) {
              if (acceptDocs.get(doc)) {
                accepted++;
              }
            }
            return accepted;
          }

          /** Whether at least one doc at the current term passes {@code acceptDocs}; always true if it is null. */
          private boolean anyAcceptedDocAtThisTerm() throws IOException {
            if (acceptDocs == null) {
              return true;
            }
            postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
            for (int doc = postingsEnum.nextDoc();
                 doc != DocIdSetIterator.NO_MORE_DOCS;
                 doc = postingsEnum.nextDoc()) {
              if (acceptDocs.get(doc)) {
                return true;// stop at the first accepted doc
              }
            }
            return false;
          }

        }.getDocIdSet();
      }
    }.getDocIdSet(context);
  }
}