All downloads are free. The search and download features use the official Maven repository.

org.apache.lucene.sandbox.search.NearestNeighbor Maven / Gradle / Ivy

There is a newer version: 9.11.1
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.sandbox.search;

import static org.apache.lucene.geo.GeoEncodingUtils.decodeLatitude;
import static org.apache.lucene.geo.GeoEncodingUtils.decodeLongitude;

import java.io.IOException;
import java.util.List;
import java.util.PriorityQueue;
import org.apache.lucene.geo.Rectangle;
import org.apache.lucene.index.PointValues;
import org.apache.lucene.index.PointValues.IntersectVisitor;
import org.apache.lucene.index.PointValues.PointTree;
import org.apache.lucene.index.PointValues.Relation;
import org.apache.lucene.internal.hppc.IntArrayList;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.SloppyMath;

/**
 * KNN search on top of 2D lat/lon indexed points.
 *
 * @lucene.experimental
 */
class NearestNeighbor {

  /**
   * One BKD cell waiting to be explored: a position in a reader's point tree together with the
   * cell's packed bounds. Cells order themselves by {@link #distanceSortKey}, so a queue of cells
   * hands back the one estimated to be closest to the query point first.
   */
  static class Cell implements Comparable<Cell> {
    /** Index of the reader this cell came from. */
    final int readerIndex;

    /** Private copy of the packed minimum (lat, lon) corner of the cell. */
    final byte[] minPacked;

    /** Private copy of the packed maximum (lat, lon) corner of the cell. */
    final byte[] maxPacked;

    /** Tree cursor associated with this cell. */
    final PointTree index;

    /**
     * The closest distance from a point in this cell to the query point, computed as a sort key
     * through {@link SloppyMath#haversinSortKey}. Note that this is an approximation to the closest
     * distance, and there could be a point in the cell that is closer.
     */
    final double distanceSortKey;

    /**
     * Creates a cell.
     *
     * @param index tree cursor for the cell
     * @param readerIndex index of the owning reader
     * @param minPacked packed minimum corner; cloned, so the caller's array is not retained
     * @param maxPacked packed maximum corner; cloned, so the caller's array is not retained
     * @param distanceSortKey approximate best distance from the cell to the query point
     */
    public Cell(
        PointTree index,
        int readerIndex,
        byte[] minPacked,
        byte[] maxPacked,
        double distanceSortKey) {
      this.index = index;
      this.readerIndex = readerIndex;
      this.minPacked = minPacked.clone();
      this.maxPacked = maxPacked.clone();
      this.distanceSortKey = distanceSortKey;
    }

    /** Orders cells by ascending {@link #distanceSortKey}. */
    @Override
    public int compareTo(Cell other) {
      return Double.compare(distanceSortKey, other.distanceSortKey);
    }

    /** Renders the reader index, tree cursor, decoded lat/lon bounds and sort key. */
    @Override
    public String toString() {
      double minLat = decodeLatitude(minPacked, 0);
      double minLon = decodeLongitude(minPacked, Integer.BYTES);
      double maxLat = decodeLatitude(maxPacked, 0);
      double maxLon = decodeLongitude(maxPacked, Integer.BYTES);
      return "Cell(readerIndex="
          + readerIndex
          + " "
          + index.toString()
          + " lat="
          + minLat
          + " TO "
          + maxLat
          + ", lon="
          + minLon
          + " TO "
          + maxLon
          + "; distanceSortKey="
          + distanceSortKey
          + ")";
    }
  }

  /**
   * Visitor that keeps the {@code topN} closest points seen so far in {@code hitQueue} and rejects
   * points and cells lying outside a bounding box derived from the hit at the head of that queue.
   *
   * <p>The bounding box starts out unbounded and is only tightened once the queue is full and a
   * hit has been replaced.
   */
  private static class NearestVisitor implements IntersectVisitor {

    /** Added to every visited docID to form the docID stored in a hit. */
    public int curDocBase;

    /** Live docs for the reader being visited; {@code null} skips the live-docs check. */
    public Bits curLiveDocs;

    final int topN;
    final PriorityQueue<NearestHit> hitQueue;
    final double pointLat;
    final double pointLon;

    /** Counts replacements in a full queue; used to throttle bounding-box recomputation. */
    private int setBottomCounter;

    private double minLon = Double.NEGATIVE_INFINITY;
    private double maxLon = Double.POSITIVE_INFINITY;
    private double minLat = Double.NEGATIVE_INFINITY;
    private double maxLat = Double.POSITIVE_INFINITY;

    // second set of longitude ranges to check (for cross-dateline case); only a lower bound is
    // needed, and POSITIVE_INFINITY means "disabled"
    private double minLon2 = Double.POSITIVE_INFINITY;

    /**
     * @param hitQueue queue receiving the hits; its head is used as the current worst hit
     * @param topN maximum number of hits to keep
     * @param pointLat latitude of the query point
     * @param pointLon longitude of the query point
     */
    public NearestVisitor(
        PriorityQueue<NearestHit> hitQueue, int topN, double pointLat, double pointLon) {
      this.hitQueue = hitQueue;
      this.topN = topN;
      this.pointLat = pointLat;
      this.pointLon = pointLon;
    }

    /** Always throws: only the overload that receives the packed value is implemented. */
    @Override
    public void visit(int docID) {
      throw new AssertionError();
    }

    /**
     * Shrinks the search bounding box to the circle around the query point whose radius is the
     * distance of the hit at the head of {@code hitQueue}.
     *
     * <p>The box is recomputed on each of the first 1024 calls; after that only on calls where the
     * low six bits of the counter are all set, i.e. once every 64 calls.
     */
    private void maybeUpdateBBox() {
      if (setBottomCounter < 1024 || (setBottomCounter & 0x3F) == 0x3F) {
        NearestHit hit = hitQueue.peek();
        Rectangle box =
            Rectangle.fromPointDistance(
                pointLat, pointLon, SloppyMath.haversinMeters(hit.distanceSortKey));
        minLat = box.minLat;
        maxLat = box.maxLat;
        if (box.crossesDateline()) {
          // box1: everything up to box.maxLon
          minLon = Double.NEGATIVE_INFINITY;
          maxLon = box.maxLon;
          // box2: everything from box.minLon upwards
          minLon2 = box.minLon;
        } else {
          minLon = box.minLon;
          maxLon = box.maxLon;
          // disable box2
          minLon2 = Double.POSITIVE_INFINITY;
        }
      }
      setBottomCounter++;
    }

    /**
     * Considers one point: skips it when it is not live or lies outside the bounding box, otherwise
     * adds it to the queue, replacing the current head when the queue is already full and the new
     * point is closer (or equally close with a lower docID).
     */
    @Override
    public void visit(int docID, byte[] packedValue) {
      if (curLiveDocs != null && curLiveDocs.get(docID) == false) {
        return;
      }

      double docLatitude = decodeLatitude(packedValue, 0);
      double docLongitude = decodeLongitude(packedValue, Integer.BYTES);

      // test bounding box
      if (docLatitude < minLat || docLatitude > maxLat) {
        return;
      }
      if ((docLongitude < minLon || docLongitude > maxLon) && (docLongitude < minLon2)) {
        return;
      }

      // Use the haversin sort key when comparing hits, as it is faster to compute than the true
      // distance.
      double distanceSortKey =
          SloppyMath.haversinSortKey(pointLat, pointLon, docLatitude, docLongitude);

      int fullDocID = curDocBase + docID;

      if (hitQueue.size() == topN) {
        // queue already full
        NearestHit hit = hitQueue.peek();
        // we don't collect docs in order here, so we must also test the tie-break case ourselves:
        if (distanceSortKey < hit.distanceSortKey
            || (distanceSortKey == hit.distanceSortKey && fullDocID < hit.docID)) {
          // reuse the evicted hit object instead of allocating a new one
          hitQueue.poll();
          hit.docID = fullDocID;
          hit.distanceSortKey = distanceSortKey;
          hitQueue.offer(hit);
          maybeUpdateBBox();
        }

      } else {
        NearestHit hit = new NearestHit();
        hit.docID = fullDocID;
        hit.distanceSortKey = distanceSortKey;
        hitQueue.offer(hit);
      }
    }

    /**
     * Reports {@link Relation#CELL_OUTSIDE_QUERY} when the cell cannot overlap the current bounding
     * box (including the second longitude range used across the dateline), and {@link
     * Relation#CELL_CROSSES_QUERY} otherwise.
     */
    @Override
    public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
      double cellMinLat = decodeLatitude(minPackedValue, 0);
      double cellMinLon = decodeLongitude(minPackedValue, Integer.BYTES);
      double cellMaxLat = decodeLatitude(maxPackedValue, 0);
      double cellMaxLon = decodeLongitude(maxPackedValue, Integer.BYTES);

      if (cellMaxLat < minLat
          || maxLat < cellMinLat
          || ((cellMaxLon < minLon || maxLon < cellMinLon) && cellMaxLon < minLon2)) {
        // this cell is outside our search bbox; don't bother exploring any more
        return Relation.CELL_OUTSIDE_QUERY;
      }
      return Relation.CELL_CROSSES_QUERY;
    }
  }

  /** A single result returned by {@link NearestNeighbor#nearest}. */
  static class NearestHit {
    /** Document id of the hit. */
    public int docID;

    /**
     * Sort key, as produced by {@link SloppyMath#haversinSortKey}, for the distance between the hit
     * and the query point.
     */
    public double distanceSortKey;

    /** Renders the docID and the sort key of this hit. */
    @Override
    public String toString() {
      StringBuilder text = new StringBuilder("NearestHit(docID=");
      text.append(docID);
      text.append(" distanceSortKey=");
      text.append(distanceSortKey);
      text.append(")");
      return text.toString();
    }
  }

  // TODO: can we somehow share more with, or simply directly use, the
  // LatLonPointDistanceComparator?  It's really doing the same thing as
  // our hitQueue...

  /**
   * Finds up to {@code n} indexed points closest to the query point.
   *
   * <p>{@code readers}, {@code liveDocs} and {@code docBases} are parallel: entry {@code i} of each
   * describes the same reader. Cells of all readers are explored in order of their approximate
   * distance to the query point, and cells outside the visitor's current bounding box are skipped.
   *
   * @param pointLat latitude of the query point
   * @param pointLon longitude of the query point
   * @param readers point values to search, one per reader
   * @param liveDocs live docs per reader; an entry may be {@code null} to skip the live-docs check
   * @param docBases value added to each reader's docIDs to form the docIDs in the result
   * @param n maximum number of hits to return; must be at least 1
   * @return the collected hits, nearest first; hits with equal sort keys are ordered by ascending
   *     docID
   * @throws IllegalArgumentException if {@code n} is less than 1
   * @throws IOException if reading the point trees fails
   */
  public static NearestHit[] nearest(
      double pointLat,
      double pointLon,
      List<PointValues> readers,
      List<Bits> liveDocs,
      IntArrayList docBases,
      final int n)
      throws IOException {

    // PriorityQueue rejects an initial capacity below 1 with the same exception type; failing here
    // gives the caller a message that names the offending argument.
    if (n < 1) {
      throw new IllegalArgumentException("n must be at least 1; got " + n);
    }

    // Holds closest collected points seen so far:
    // TODO: if we used lucene's PQ we could just updateTop instead of poll/offer:
    final PriorityQueue<NearestHit> hitQueue =
        new PriorityQueue<>(
            n,
            (a, b) -> {
              // sort by opposite distanceSortKey natural order, so the head is the farthest hit
              int cmp = Double.compare(a.distanceSortKey, b.distanceSortKey);
              if (cmp != 0) {
                return -cmp;
              }

              // tie-break by higher docID:
              return Integer.compare(b.docID, a.docID);
            });

    // Holds all cells, sorted by closest to the point:
    PriorityQueue<Cell> cellQueue = new PriorityQueue<>();

    NearestVisitor visitor = new NearestVisitor(hitQueue, n, pointLat, pointLon);

    // Add root cell for each reader into the queue:
    for (int i = 0; i < readers.size(); i++) {
      PointValues reader = readers.get(i);
      byte[] minPackedValue = reader.getMinPackedValue();
      byte[] maxPackedValue = reader.getMaxPackedValue();
      PointTree indexTree = reader.getPointTree();

      cellQueue.offer(
          new Cell(
              indexTree,
              i,
              minPackedValue,
              maxPackedValue,
              approxBestDistance(minPackedValue, maxPackedValue, pointLat, pointLon)));
    }

    while (cellQueue.size() > 0) {
      Cell cell = cellQueue.poll();
      if (visitor.compare(cell.minPacked, cell.maxPacked) == Relation.CELL_OUTSIDE_QUERY) {
        continue;
      }

      // TODO: if we replace approxBestDistance with actualBestDistance, we can put an opto here to
      // break once this "best" cell is fully outside of the hitQueue bottom's radius:

      if (cell.index.moveToChild() == false) {
        // Leaf block: visit all points and possibly collect them:
        visitor.curDocBase = docBases.get(cell.readerIndex);
        visitor.curLiveDocs = liveDocs.get(cell.readerIndex);
        cell.index.visitDocValues(visitor);
      } else {
        // Non-leaf block: split into two cells and put them back into the queue:

        // we must clone the index so that we can recurse left and right "concurrently":
        PointTree newIndex = cell.index.clone();
        byte[] childMin = newIndex.getMinPackedValue();
        byte[] childMax = newIndex.getMaxPackedValue();

        cellQueue.offer(
            new Cell(
                newIndex,
                cell.readerIndex,
                childMin,
                childMax,
                approxBestDistance(childMin, childMax, pointLat, pointLon)));

        // TODO: we are assuming a binary tree
        if (cell.index.moveToSibling()) {
          byte[] siblingMin = cell.index.getMinPackedValue();
          byte[] siblingMax = cell.index.getMaxPackedValue();
          cellQueue.offer(
              new Cell(
                  cell.index,
                  cell.readerIndex,
                  siblingMin,
                  siblingMax,
                  approxBestDistance(siblingMin, siblingMax, pointLat, pointLon)));
        }
      }
    }

    // The queue yields the farthest hit first, so fill the array from the back:
    NearestHit[] hits = new NearestHit[hitQueue.size()];
    int downTo = hitQueue.size() - 1;
    while (hitQueue.size() != 0) {
      hits[downTo] = hitQueue.poll();
      downTo--;
    }

    return hits;
  }

  /**
   * Decodes the packed corners of a cell and delegates to the lat/lon overload of this method.
   *
   * <p>NOTE: incoming args never cross the dateline, since they are a BKD cell.
   */
  private static double approxBestDistance(
      byte[] minPackedValue, byte[] maxPackedValue, double pointLat, double pointLon) {
    // Each packed value holds the latitude first, then the longitude Integer.BYTES later.
    final double cellMinLat = decodeLatitude(minPackedValue, 0);
    final double cellMinLon = decodeLongitude(minPackedValue, Integer.BYTES);
    final double cellMaxLat = decodeLatitude(maxPackedValue, 0);
    final double cellMaxLon = decodeLongitude(maxPackedValue, Integer.BYTES);
    return approxBestDistance(cellMinLat, cellMaxLat, cellMinLon, cellMaxLon, pointLat, pointLon);
  }

  /**
   * Estimates how close a cell can be to the query point, as a {@link SloppyMath#haversinSortKey}
   * value: {@code 0.0} when the point lies inside the cell (bounds inclusive), otherwise the
   * smallest sort key from the point to any of the four cell corners.
   *
   * <p>NOTE: incoming args never cross the dateline, since they are a BKD cell.
   */
  private static double approxBestDistance(
      double minLat,
      double maxLat,
      double minLon,
      double maxLon,
      double pointLat,
      double pointLon) {

    // TODO: can we make this the trueBestDistance?  I.e., minimum distance between the point and
    // ANY point on the box?  we can speed things
    // up if so, but not enrolling any BKD cell whose true best distance is > bottom of the current
    // hit queue

    boolean latWithin = pointLat >= minLat && pointLat <= maxLat;
    boolean lonWithin = pointLon >= minLon && pointLon <= maxLon;
    if (latWithin && lonWithin) {
      // point is inside the cell!
      return 0.0;
    }

    // Keep a running minimum over the four corners of the cell.
    double nearestCorner = SloppyMath.haversinSortKey(pointLat, pointLon, minLat, minLon);
    nearestCorner =
        Math.min(nearestCorner, SloppyMath.haversinSortKey(pointLat, pointLon, minLat, maxLon));
    nearestCorner =
        Math.min(nearestCorner, SloppyMath.haversinSortKey(pointLat, pointLon, maxLat, maxLon));
    nearestCorner =
        Math.min(nearestCorner, SloppyMath.haversinSortKey(pointLat, pointLon, maxLat, minLon));
    return nearestCorner;
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy