// org.apache.lucene.index.PointValues Maven / Gradle / Ivy
//
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of org.apache.servicemix.bundles.lucene
// This OSGi bundle wraps ${pkgArtifactId} ${pkgVersion} jar file.
// There is a newer version: 6.4.2_1
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.index;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.math.BigInteger;
import java.net.InetAddress;
import org.apache.lucene.document.BinaryPoint;
import org.apache.lucene.document.DoublePoint;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FloatPoint;
import org.apache.lucene.document.InetAddressPoint;
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.document.LatLonPoint;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.ArrayUtil.ByteArrayComparator;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.bkd.BKDConfig;

/**
 * Access to indexed numeric values.
 *
 * <p>Points represent numeric values and are indexed differently than ordinary text. Instead of an
 * inverted index, points are indexed with data structures such as KD-trees. These structures are
 * optimized for operations such as range, distance, nearest-neighbor, and point-in-polygon
 * queries.
 *
 * 
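 * <h2>Basic Point Types</h2>
 *
 * <table>
 *   <caption>Basic point types in Java and Lucene</caption>
 *   <tr><th>Java type</th><th>Lucene class</th></tr>
 *   <tr><td>{@code int}</td><td>{@link IntPoint}</td></tr>
 *   <tr><td>{@code long}</td><td>{@link LongPoint}</td></tr>
 *   <tr><td>{@code float}</td><td>{@link FloatPoint}</td></tr>
 *   <tr><td>{@code double}</td><td>{@link DoublePoint}</td></tr>
 *   <tr><td>{@code byte[]}</td><td>{@link BinaryPoint}</td></tr>
 *   <tr><td>{@link InetAddress}</td><td>{@link InetAddressPoint}</td></tr>
 *   <tr><td>{@link BigInteger}</td><td>BigIntegerPoint*</td></tr>
 * </table>
 *
 * <p>* in the lucene-sandbox jar
 *
 * <p>Basic Lucene point types behave like their java peers: for example {@link IntPoint} represents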
 * a signed 32-bit {@link Integer}, supporting values ranging from {@link Integer#MIN_VALUE} to
 * {@link Integer#MAX_VALUE}, ordered consistent with {@link Integer#compareTo(Integer)}. In
 * addition to indexing support, point classes also contain static methods (such as {@link
 * IntPoint#newRangeQuery(String, int, int)}) for creating common queries. For example:
 *
 * 
 * <pre>
 *   // add year 1970 to document
 *   document.add(new IntPoint("year", 1970));
 *   // index document
 *   writer.addDocument(document);
 *   ...
 *   // issue range query of 1960-1980
 *   Query query = IntPoint.newRangeQuery("year", 1960, 1980);
 *   TopDocs docs = searcher.search(query, ...);
 * </pre>
 *
 * <h2>Geospatial Point Types</h2>
 *
 * Although basic point types such as {@link DoublePoint} support points in multi-dimensional space
 * too, Lucene has specialized classes for location data. These classes are optimized for location
 * data: they are more space-efficient and support special operations such as distance and
 * polygon queries. There are currently two implementations:
 *
 * <ol>
 *   <li>{@link LatLonPoint}: indexes {@code (latitude,longitude)} as {@code (x,y)} in
 *       two-dimensional space.
 *   <li>Geo3DPoint* in lucene-spatial3d: indexes {@code (latitude,longitude)} as {@code (x,y,z)}
 *       in three-dimensional space.
 * </ol>
 *
 * <p>* does not support altitude, 3D here means "uses three dimensions under-the-hood"
 *
 * <h2>Advanced usage</h2>
 *
 * Custom structures can be created on top of single- or multi- dimensional basic types, on top of
 * {@link BinaryPoint} for more flexibility, or via custom {@link Field} subclasses.
 *
 * @lucene.experimental
 */
public abstract class PointValues {

  /** Maximum number of bytes for each dimension of a packed point value. */
  public static final int MAX_NUM_BYTES = 16;

  /** Maximum number of dimensions; mirrors {@link BKDConfig#MAX_DIMS}. */
  public static final int MAX_DIMENSIONS = BKDConfig.MAX_DIMS;

  /** Maximum number of index dimensions; mirrors {@link BKDConfig#MAX_INDEX_DIMS}. */
  public static final int MAX_INDEX_DIMENSIONS = BKDConfig.MAX_INDEX_DIMS;

  /**
   * Sum the point counts of every leaf of the given {@link IndexReader}. A leaf that has no points
   * for {@code field} contributes nothing to the total.
   *
   * @param reader the reader whose leaves are inspected
   * @param field the name of the point field
   * @return the total number of points over all leaves that have points for the field
   * @throws IOException if a leaf fails to provide its point values
   * @see PointValues#size()
   */
  public static long size(IndexReader reader, String field) throws IOException {
    long total = 0L;
    for (LeafReaderContext leaf : reader.leaves()) {
      final PointValues leafValues = leaf.reader().getPointValues(field);
      if (leafValues == null) {
        // This leaf has no points for the field.
        continue;
      }
      total += leafValues.size();
    }
    return total;
  }

  /**
   * Sum, over every leaf of the given {@link IndexReader}, the number of docs that have points. A
   * leaf that has no points for {@code field} contributes nothing to the total.
   *
   * @param reader the reader whose leaves are inspected
   * @param field the name of the point field
   * @return the total number of docs with points over all leaves that have points for the field
   * @throws IOException if a leaf fails to provide its point values
   * @see PointValues#getDocCount()
   */
  public static int getDocCount(IndexReader reader, String field) throws IOException {
    int total = 0;
    for (LeafReaderContext leaf : reader.leaves()) {
      final PointValues leafValues = leaf.reader().getPointValues(field);
      if (leafValues == null) {
        // This leaf has no points for the field.
        continue;
      }
      total += leafValues.getDocCount();
    }
    return total;
  }

  /**
   * Merge the minimum packed values of every leaf of the given {@link IndexReader}. Leaves that
   * have no points for {@code field}, or whose own minimum is {@code null}, are skipped.
   *
   * <p>The result starts as a copy of the first contributing leaf's minimum. For each later leaf,
   * every index dimension is compared as unsigned bytes and the smaller value is kept.
   *
   * @param reader the reader whose leaves are inspected
   * @param field the name of the point field
   * @return the merged minimum packed value, or {@code null} if no leaf contributed one
   * @throws IOException if a leaf fails to provide its point values
   * @see PointValues#getMinPackedValue()
   */
  public static byte[] getMinPackedValue(IndexReader reader, String field) throws IOException {
    byte[] merged = null;
    for (LeafReaderContext leaf : reader.leaves()) {
      final PointValues leafValues = leaf.reader().getPointValues(field);
      final byte[] candidate = leafValues == null ? null : leafValues.getMinPackedValue();
      if (candidate == null) {
        continue;
      }
      if (merged == null) {
        // First contributing leaf: copy, so the in-place updates below never touch its array.
        merged = candidate.clone();
        continue;
      }
      final int indexDims = leafValues.getNumIndexDimensions();
      final int bytesPerDim = leafValues.getBytesPerDimension();
      final ByteArrayComparator unsignedOrder = ArrayUtil.getUnsignedComparator(bytesPerDim);
      for (int dim = 0, offset = 0; dim < indexDims; ++dim, offset += bytesPerDim) {
        if (unsignedOrder.compare(candidate, offset, merged, offset) < 0) {
          System.arraycopy(candidate, offset, merged, offset, bytesPerDim);
        }
      }
    }
    return merged;
  }

  /**
   * Merge the maximum packed values of every leaf of the given {@link IndexReader}. Leaves that
   * have no points for {@code field}, or whose own maximum is {@code null}, are skipped.
   *
   * <p>The result starts as a copy of the first contributing leaf's maximum. For each later leaf,
   * every index dimension is compared as unsigned bytes and the larger value is kept.
   *
   * @param reader the reader whose leaves are inspected
   * @param field the name of the point field
   * @return the merged maximum packed value, or {@code null} if no leaf contributed one
   * @throws IOException if a leaf fails to provide its point values
   * @see PointValues#getMaxPackedValue()
   */
  public static byte[] getMaxPackedValue(IndexReader reader, String field) throws IOException {
    byte[] merged = null;
    for (LeafReaderContext leaf : reader.leaves()) {
      final PointValues leafValues = leaf.reader().getPointValues(field);
      final byte[] candidate = leafValues == null ? null : leafValues.getMaxPackedValue();
      if (candidate == null) {
        continue;
      }
      if (merged == null) {
        // First contributing leaf: copy, so the in-place updates below never touch its array.
        merged = candidate.clone();
        continue;
      }
      final int indexDims = leafValues.getNumIndexDimensions();
      final int bytesPerDim = leafValues.getBytesPerDimension();
      final ByteArrayComparator unsignedOrder = ArrayUtil.getUnsignedComparator(bytesPerDim);
      for (int dim = 0, offset = 0; dim < indexDims; ++dim, offset += bytesPerDim) {
        if (unsignedOrder.compare(candidate, offset, merged, offset) > 0) {
          System.arraycopy(candidate, offset, merged, offset, bytesPerDim);
        }
      }
    }
    return merged;
  }

  /** Default constructor; protected, so it is only reachable from subclasses and this package. */
  protected PointValues() {}

  /**
   * Used by {@link #intersect} to check how each recursive cell corresponds to the query. Values of
   * this type are returned by {@link IntersectVisitor#compare(byte[], byte[])}.
   */
  public enum Relation {
    /** Return this if the cell is fully contained by the query. */
    CELL_INSIDE_QUERY,
    /** Return this if the cell and query do not overlap. */
    CELL_OUTSIDE_QUERY,
    /** Return this if the cell partially overlaps the query. */
    CELL_CROSSES_QUERY
  }

  /**
   * Create a new {@link PointTree} to navigate the index.
   *
   * @return a new tree for navigating the index
   * @throws IOException if the tree cannot be created
   */
  public abstract PointTree getPointTree() throws IOException;

  /**
   * Basic operations to read the KD-tree.
   *
   * <p>The {@code moveTo*} methods change which node is the current one; the remaining methods
   * describe or visit the current node.
   *
   * @lucene.experimental
   */
  public interface PointTree extends Cloneable {

    /**
     * Clone, the current node becomes the root of the new tree.
     *
     * @return the new tree
     */
    PointTree clone();

    /**
     * Move to the first child node and return {@code true} upon success. Returns {@code false} for
     * leaf nodes and {@code true} otherwise.
     *
     * @return {@code true} if the current node is now the first child
     * @throws IOException if the move fails
     */
    boolean moveToChild() throws IOException;

    /**
     * Move to the next sibling node and return {@code true} upon success. Returns {@code false} if
     * the current node has no more siblings.
     *
     * @return {@code true} if the current node is now the next sibling
     * @throws IOException if the move fails
     */
    boolean moveToSibling() throws IOException;

    /**
     * Move to the parent node and return {@code true} upon success. Returns {@code false} for the
     * root node and {@code true} otherwise.
     *
     * @return {@code true} if the current node is now the parent
     * @throws IOException if the move fails
     */
    boolean moveToParent() throws IOException;

    /** Return the minimum packed value of the current node. */
    byte[] getMinPackedValue();

    /** Return the maximum packed value of the current node. */
    byte[] getMaxPackedValue();

    /** Return the number of points below the current node. */
    long size();

    /**
     * Visit all the docs below the current node.
     *
     * @param visitor receives the visited docs
     * @throws IOException if visiting fails
     */
    void visitDocIDs(IntersectVisitor visitor) throws IOException;

    /**
     * Visit all the docs and values below the current node.
     *
     * @param visitor receives the visited docs and values
     * @throws IOException if visiting fails
     */
    void visitDocValues(IntersectVisitor visitor) throws IOException;
  }

  /**
   * Callback that guides the recursion over a {@link PointTree} and receives the visited docs and
   * values.
   *
   * @lucene.experimental
   */
  public interface IntersectVisitor {
    /**
     * Called for all documents in a leaf cell that's fully contained by the query. The consumer
     * should blindly accept the docID.
     */
    void visit(int docID) throws IOException;

    /**
     * Bulk variant of {@link IntersectVisitor#visit(int)}; implementations may override it with
     * their own optimizations. This default forwards each doc of the iterator to {@link
     * IntersectVisitor#visit(int)}.
     *
     * <p>It is guaranteed that the given iterator is not positioned.
     */
    default void visit(DocIdSetIterator iterator) throws IOException {
      for (int docID = iterator.nextDoc();
          docID != DocIdSetIterator.NO_MORE_DOCS;
          docID = iterator.nextDoc()) {
        visit(docID);
      }
    }

    /**
     * Bulk variant of {@link IntersectVisitor#visit(int)}; implementations may override it with
     * their own optimizations. Even if an implementation does the same thing as this method,
     * overriding may be a speed improvement due to fewer virtual calls.
     */
    default void visit(IntsRef ref) throws IOException {
      int i = ref.offset;
      while (i < ref.length + ref.offset) {
        visit(ref.ints[i]);
        i++;
      }
    }

    /**
     * Called for all documents in a leaf cell that crosses the query. The consumer should
     * scrutinize the packedValue to decide whether to accept it. In the 1D case, values are visited
     * in increasing order, and in the case of ties, in increasing docID order.
     */
    void visit(int docID, byte[] packedValue) throws IOException;

    /**
     * Similar to {@link IntersectVisitor#visit(int, byte[])}, but here the packedValue can have
     * more than one docID associated with it. The provided iterator should not escape the scope of
     * this method, so that implementations of PointValues are free to reuse it.
     */
    default void visit(DocIdSetIterator iterator, byte[] packedValue) throws IOException {
      for (int docID = iterator.nextDoc();
          docID != DocIdSetIterator.NO_MORE_DOCS;
          docID = iterator.nextDoc()) {
        visit(docID, packedValue);
      }
    }

    /**
     * Called for non-leaf cells to test how the cell relates to the query, to determine how to
     * further recurse down the tree.
     */
    Relation compare(byte[] minPackedValue, byte[] maxPackedValue);

    /** Notifies the caller that this many documents are about to be visited. Does nothing here. */
    default void grow(int count) {}
  }

  /**
   * Finds all documents and points matching the provided visitor. Live documents are not enforced
   * here, so it's up to the caller to test whether each document is deleted, if necessary.
   *
   * @param visitor guides the traversal and receives the matching docs and values
   * @throws IOException if reading the tree fails
   */
  public final void intersect(IntersectVisitor visitor) throws IOException {
    final PointTree tree = getPointTree();
    intersect(visitor, tree);
    // Checked only when assertions are enabled: the traversal must finish back on the root.
    assert tree.moveToParent() == false;
  }

  /**
   * Iteratively walks {@code pointTree} from its current node, asking {@code visitor} how each
   * cell relates to the query. Cells inside the query have all their docs visited, crossing leaf
   * cells have their docs and values visited for filtering, and any other cell is skipped.
   */
  private static void intersect(IntersectVisitor visitor, PointTree pointTree) throws IOException {
    for (; ; ) {
      final Relation relation =
          visitor.compare(pointTree.getMinPackedValue(), pointTree.getMaxPackedValue());
      if (relation == Relation.CELL_CROSSES_QUERY) {
        // The cell crosses the shape boundary, or the cell fully contains the query, so we
        // descend and do full filtering:
        if (pointTree.moveToChild()) {
          continue;
        }
        // TODO: we can assert that the first value here in fact matches what the pointTree
        // claimed?
        // Leaf node; scan and filter all points in this block:
        pointTree.visitDocValues(visitor);
      } else if (relation == Relation.CELL_INSIDE_QUERY) {
        // This cell is fully inside the query shape: add all points in this cell without
        // filtering
        pointTree.visitDocIDs(visitor);
      }
      // Advance to the next sibling, climbing towards the root until one exists; the walk is
      // over once there is no parent left to climb to.
      boolean advanced = pointTree.moveToSibling();
      while (advanced == false) {
        if (pointTree.moveToParent() == false) {
          return;
        }
        advanced = pointTree.moveToSibling();
      }
    }
  }

  /**
   * Estimate the number of points that would be visited by {@link #intersect} with the given {@link
   * IntersectVisitor}. This should run many times faster than {@link #intersect(IntersectVisitor)}.
   *
   * @param visitor the visitor whose {@code compare} method drives the estimate
   * @return the estimated number of points
   * @throws UncheckedIOException wrapping any {@link IOException} raised while reading the tree
   */
  public final long estimatePointCount(IntersectVisitor visitor) {
    try {
      final PointTree tree = getPointTree();
      // Long.MAX_VALUE as the upper bound means the estimate is never cut short.
      final long estimate = estimatePointCount(visitor, tree, Long.MAX_VALUE);
      assert tree.moveToParent() == false;
      return estimate;
    } catch (IOException e) {
      throw new UncheckedIOException(e);
    }
  }

  /**
   * Estimate whether the point count that would be matched by {@link #intersect} with the given
   * {@link IntersectVisitor} is greater than or equal to the upperBound.
   *
   * @param visitor the visitor whose {@code compare} method drives the estimate
   * @param pointTree the tree to estimate over, starting from its current node
   * @param upperBound the threshold the estimate is compared against
   * @return {@code true} if the estimate is at least {@code upperBound}
   * @throws IOException if reading the tree fails
   * @lucene.internal
   */
  public static boolean isEstimatedPointCountGreaterThanOrEqualTo(
      IntersectVisitor visitor, PointTree pointTree, long upperBound) throws IOException {
    final long estimate = estimatePointCount(visitor, pointTree, upperBound);
    return estimate >= upperBound;
  }

  /**
   * Estimate the number of points that would be matched by {@link #intersect} with the given
   * {@link IntersectVisitor}, starting from the current node of {@code pointTree}. The estimation
   * stops visiting further siblings once the point count gets greater than or equal to the upper
   * bound.
   *
   * <p>TODO: will breadth-first help estimation terminate earlier?
   */
  private static long estimatePointCount(
      IntersectVisitor visitor, PointTree pointTree, long upperBound) throws IOException {
    final Relation relation =
        visitor.compare(pointTree.getMinPackedValue(), pointTree.getMaxPackedValue());
    switch (relation) {
      case CELL_OUTSIDE_QUERY:
        // This cell is fully outside the query shape: no points added
        return 0L;
      case CELL_INSIDE_QUERY:
        // This cell is fully inside the query shape: add all points
        return pointTree.size();
      case CELL_CROSSES_QUERY:
        {
          if (pointTree.moveToChild() == false) {
            // Leaf cell crossing the boundary: assume half the points matched
            return (pointTree.size() + 1) / 2;
          }
          // The cell crosses the shape boundary: recurse into each child, handing down only the
          // part of the bound that is still unused.
          long total = 0;
          boolean keepGoing;
          do {
            total += estimatePointCount(visitor, pointTree, upperBound - total);
            keepGoing = total < upperBound && pointTree.moveToSibling();
          } while (keepGoing);
          // Go back up so the tree is on the node it was on when this call started.
          pointTree.moveToParent();
          return total;
        }
      default:
        throw new IllegalArgumentException("Unreachable code");
    }
  }

  /**
   * Estimate the number of documents that would be matched by {@link #intersect} with the given
   * {@link IntersectVisitor}. This should run many times faster than {@link
   * #intersect(IntersectVisitor)}.
   *
   * @param visitor the visitor whose {@code compare} method drives the estimate
   * @return the estimated number of matching documents
   * @see DocIdSetIterator#cost
   */
  public final long estimateDocCount(IntersectVisitor visitor) {
    final long pointEstimate = estimatePointCount(visitor);
    final int totalDocs = getDocCount();
    final double totalPoints = size();
    if (pointEstimate >= totalPoints) {
      // match all docs
      return totalDocs;
    }
    if (totalPoints == totalDocs || pointEstimate == 0L) {
      // if the point count estimate is 0 or we have only single values
      // return this estimate
      return pointEstimate;
    }
    // in case of multi values estimate the number of docs using the solution provided in
    // https://math.stackexchange.com/questions/1175295/urn-problem-probability-of-drawing-balls-of-k-unique-colors
    // then approximate the solution for points per doc << size() which results in the expression
    // D * (1 - ((N - n) / N)^(N/D))
    // where D is the total number of docs, N the total number of points and n the estimated point
    // count
    final double unmatchedFraction = (totalPoints - pointEstimate) / totalPoints;
    final double pointsPerDoc = totalPoints / totalDocs;
    final long docEstimate =
        (long) (totalDocs * (1d - Math.pow(unmatchedFraction, pointsPerDoc)));
    // The point estimate is non-zero here, so report at least one doc.
    if (docEstimate == 0L) {
      return 1L;
    }
    return docEstimate;
  }

  /**
   * Returns minimum value for each dimension, packed, or {@code null} if {@link #size} is 0.
   *
   * @throws IOException if the value cannot be read
   */
  public abstract byte[] getMinPackedValue() throws IOException;

  /**
   * Returns maximum value for each dimension, packed, or {@code null} if {@link #size} is 0.
   *
   * @throws IOException if the value cannot be read
   */
  public abstract byte[] getMaxPackedValue() throws IOException;

  /** Returns how many dimensions are represented in the values. */
  public abstract int getNumDimensions() throws IOException;

  /** Returns how many dimensions are used for the index. */
  public abstract int getNumIndexDimensions() throws IOException;

  /** Returns the number of bytes per dimension. */
  public abstract int getBytesPerDimension() throws IOException;

  /** Returns the total number of indexed points across all documents. */
  public abstract long size();

  /** Returns the total number of documents that have indexed at least one point. */
  public abstract int getDocCount();
}
// Java type	Lucene class
// {@code int}	{@link IntPoint}
// {@code long}	{@link LongPoint}
// {@code float}	{@link FloatPoint}
// {@code double}	{@link DoublePoint}
// {@code byte[]}	{@link BinaryPoint}
// {@link InetAddress}	{@link InetAddressPoint}
// {@link BigInteger}	BigIntegerPoint*