org.apache.lucene.sandbox.search.NearestNeighbor Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-sandbox Show documentation
Show all versions of lucene-sandbox Show documentation
Apache Lucene (module: sandbox)
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.sandbox.search;
import static org.apache.lucene.geo.GeoEncodingUtils.decodeLatitude;
import static org.apache.lucene.geo.GeoEncodingUtils.decodeLongitude;
import java.io.IOException;
import java.util.Comparator;
import java.util.List;
import java.util.PriorityQueue;
import org.apache.lucene.geo.Rectangle;
import org.apache.lucene.index.PointValues;
import org.apache.lucene.index.PointValues.IntersectVisitor;
import org.apache.lucene.index.PointValues.PointTree;
import org.apache.lucene.index.PointValues.Relation;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.SloppyMath;
/**
* KNN search on top of 2D lat/lon indexed points.
*
* @lucene.experimental
*/
class NearestNeighbor {
static class Cell implements Comparable {
final int readerIndex;
final byte[] minPacked;
final byte[] maxPacked;
final PointTree index;
/**
* The closest distance from a point in this cell to the query point, computed as a sort key
* through {@link SloppyMath#haversinSortKey}. Note that this is an approximation to the closest
* distance, and there could be a point in the cell that is closer.
*/
final double distanceSortKey;
public Cell(
PointTree index,
int readerIndex,
byte[] minPacked,
byte[] maxPacked,
double distanceSortKey) {
this.index = index;
this.readerIndex = readerIndex;
this.minPacked = minPacked.clone();
this.maxPacked = maxPacked.clone();
this.distanceSortKey = distanceSortKey;
}
@Override
public int compareTo(Cell other) {
return Double.compare(distanceSortKey, other.distanceSortKey);
}
@Override
public String toString() {
double minLat = decodeLatitude(minPacked, 0);
double minLon = decodeLongitude(minPacked, Integer.BYTES);
double maxLat = decodeLatitude(maxPacked, 0);
double maxLon = decodeLongitude(maxPacked, Integer.BYTES);
return "Cell(readerIndex="
+ readerIndex
+ " "
+ index.toString()
+ " lat="
+ minLat
+ " TO "
+ maxLat
+ ", lon="
+ minLon
+ " TO "
+ maxLon
+ "; distanceSortKey="
+ distanceSortKey
+ ")";
}
}
private static class NearestVisitor implements IntersectVisitor {
public int curDocBase;
public Bits curLiveDocs;
final int topN;
final PriorityQueue hitQueue;
final double pointLat;
final double pointLon;
private int setBottomCounter;
private double minLon = Double.NEGATIVE_INFINITY;
private double maxLon = Double.POSITIVE_INFINITY;
private double minLat = Double.NEGATIVE_INFINITY;
private double maxLat = Double.POSITIVE_INFINITY;
// second set of longitude ranges to check (for cross-dateline case)
private double minLon2 = Double.POSITIVE_INFINITY;
public NearestVisitor(
PriorityQueue hitQueue, int topN, double pointLat, double pointLon) {
this.hitQueue = hitQueue;
this.topN = topN;
this.pointLat = pointLat;
this.pointLon = pointLon;
}
@Override
public void visit(int docID) {
throw new AssertionError();
}
private void maybeUpdateBBox() {
if (setBottomCounter < 1024 || (setBottomCounter & 0x3F) == 0x3F) {
NearestHit hit = hitQueue.peek();
Rectangle box =
Rectangle.fromPointDistance(
pointLat, pointLon, SloppyMath.haversinMeters(hit.distanceSortKey));
// System.out.println(" update bbox to " + box);
minLat = box.minLat;
maxLat = box.maxLat;
if (box.crossesDateline()) {
// box1
minLon = Double.NEGATIVE_INFINITY;
maxLon = box.maxLon;
// box2
minLon2 = box.minLon;
} else {
minLon = box.minLon;
maxLon = box.maxLon;
// disable box2
minLon2 = Double.POSITIVE_INFINITY;
}
}
setBottomCounter++;
}
@Override
public void visit(int docID, byte[] packedValue) {
// System.out.println("visit docID=" + docID + " liveDocs=" + curLiveDocs);
if (curLiveDocs != null && curLiveDocs.get(docID) == false) {
return;
}
double docLatitude = decodeLatitude(packedValue, 0);
double docLongitude = decodeLongitude(packedValue, Integer.BYTES);
// test bounding box
if (docLatitude < minLat || docLatitude > maxLat) {
return;
}
if ((docLongitude < minLon || docLongitude > maxLon) && (docLongitude < minLon2)) {
return;
}
// Use the haversin sort key when comparing hits, as it is faster to compute than the true
// distance.
double distanceSortKey =
SloppyMath.haversinSortKey(pointLat, pointLon, docLatitude, docLongitude);
// System.out.println(" visit docID=" + docID + " distanceSortKey=" + distanceSortKey + "
// docLat=" + docLatitude + " docLon=" + docLongitude);
int fullDocID = curDocBase + docID;
if (hitQueue.size() == topN) {
// queue already full
NearestHit hit = hitQueue.peek();
// System.out.println(" bottom distanceSortKey=" + hit.distanceSortKey);
// we don't collect docs in order here, so we must also test the tie-break case ourselves:
if (distanceSortKey < hit.distanceSortKey
|| (distanceSortKey == hit.distanceSortKey && fullDocID < hit.docID)) {
hitQueue.poll();
hit.docID = fullDocID;
hit.distanceSortKey = distanceSortKey;
hitQueue.offer(hit);
// System.out.println(" ** keep2, now bottom=" + hit);
maybeUpdateBBox();
}
} else {
NearestHit hit = new NearestHit();
hit.docID = fullDocID;
hit.distanceSortKey = distanceSortKey;
hitQueue.offer(hit);
// System.out.println(" ** keep1, now bottom=" + hit);
}
}
@Override
public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
double cellMinLat = decodeLatitude(minPackedValue, 0);
double cellMinLon = decodeLongitude(minPackedValue, Integer.BYTES);
double cellMaxLat = decodeLatitude(maxPackedValue, 0);
double cellMaxLon = decodeLongitude(maxPackedValue, Integer.BYTES);
if (cellMaxLat < minLat
|| maxLat < cellMinLat
|| ((cellMaxLon < minLon || maxLon < cellMinLon) && cellMaxLon < minLon2)) {
// this cell is outside our search bbox; don't bother exploring any more
return Relation.CELL_OUTSIDE_QUERY;
}
return Relation.CELL_CROSSES_QUERY;
}
}
/** Holds one hit from {@link NearestNeighbor#nearest} */
static class NearestHit {
public int docID;
/**
* The distance from the hit to the query point, computed as a sort key through {@link
* SloppyMath#haversinSortKey}.
*/
public double distanceSortKey;
@Override
public String toString() {
return "NearestHit(docID=" + docID + " distanceSortKey=" + distanceSortKey + ")";
}
}
// TODO: can we somehow share more with, or simply directly use, the
// LatLonPointDistanceComparator? It's really doing the same thing as
// our hitQueue...
public static NearestHit[] nearest(
double pointLat,
double pointLon,
List readers,
List liveDocs,
List docBases,
final int n)
throws IOException {
// System.out.println("NEAREST: readers=" + readers + " liveDocs=" + liveDocs + " pointLat=" +
// pointLat + " pointLon=" + pointLon);
// Holds closest collected points seen so far:
// TODO: if we used lucene's PQ we could just updateTop instead of poll/offer:
final PriorityQueue hitQueue =
new PriorityQueue<>(
n,
new Comparator() {
@Override
public int compare(NearestHit a, NearestHit b) {
// sort by opposite distanceSortKey natural order
int cmp = Double.compare(a.distanceSortKey, b.distanceSortKey);
if (cmp != 0) {
return -cmp;
}
// tie-break by higher docID:
return b.docID - a.docID;
}
});
// Holds all cells, sorted by closest to the point:
PriorityQueue cellQueue = new PriorityQueue<>();
NearestVisitor visitor = new NearestVisitor(hitQueue, n, pointLat, pointLon);
// Add root cell for each reader into the queue:
for (int i = 0; i < readers.size(); i++) {
PointValues reader = readers.get(i);
byte[] minPackedValue = reader.getMinPackedValue();
byte[] maxPackedValue = reader.getMaxPackedValue();
PointTree indexTree = reader.getPointTree();
cellQueue.offer(
new Cell(
indexTree,
i,
reader.getMinPackedValue(),
reader.getMaxPackedValue(),
approxBestDistance(minPackedValue, maxPackedValue, pointLat, pointLon)));
}
while (cellQueue.size() > 0) {
Cell cell = cellQueue.poll();
// System.out.println(" visit " + cell);
if (visitor.compare(cell.minPacked, cell.maxPacked) == Relation.CELL_OUTSIDE_QUERY) {
continue;
}
// TODO: if we replace approxBestDistance with actualBestDistance, we can put an opto here to
// break once this "best" cell is fully outside of the hitQueue bottom's radius:
if (cell.index.moveToChild() == false) {
// System.out.println(" leaf");
// Leaf block: visit all points and possibly collect them:
visitor.curDocBase = docBases.get(cell.readerIndex);
visitor.curLiveDocs = liveDocs.get(cell.readerIndex);
cell.index.visitDocValues(visitor);
// System.out.println(" now " + hitQueue.size() + " hits");
} else {
// System.out.println(" non-leaf");
// Non-leaf block: split into two cells and put them back into the queue:
// we must clone the index so that we can recurse left and right "concurrently":
PointTree newIndex = cell.index.clone();
cellQueue.offer(
new Cell(
newIndex,
cell.readerIndex,
newIndex.getMinPackedValue(),
newIndex.getMaxPackedValue(),
approxBestDistance(
newIndex.getMinPackedValue(),
newIndex.getMaxPackedValue(),
pointLat,
pointLon)));
// TODO: we are assuming a binary tree
if (cell.index.moveToSibling()) {
cellQueue.offer(
new Cell(
cell.index,
cell.readerIndex,
cell.index.getMinPackedValue(),
cell.index.getMaxPackedValue(),
approxBestDistance(
cell.index.getMinPackedValue(),
cell.index.getMaxPackedValue(),
pointLat,
pointLon)));
}
}
}
NearestHit[] hits = new NearestHit[hitQueue.size()];
int downTo = hitQueue.size() - 1;
while (hitQueue.size() != 0) {
hits[downTo] = hitQueue.poll();
downTo--;
}
return hits;
}
// NOTE: incoming args never cross the dateline, since they are a BKD cell
private static double approxBestDistance(
byte[] minPackedValue, byte[] maxPackedValue, double pointLat, double pointLon) {
double minLat = decodeLatitude(minPackedValue, 0);
double minLon = decodeLongitude(minPackedValue, Integer.BYTES);
double maxLat = decodeLatitude(maxPackedValue, 0);
double maxLon = decodeLongitude(maxPackedValue, Integer.BYTES);
return approxBestDistance(minLat, maxLat, minLon, maxLon, pointLat, pointLon);
}
// NOTE: incoming args never cross the dateline, since they are a BKD cell
private static double approxBestDistance(
double minLat,
double maxLat,
double minLon,
double maxLon,
double pointLat,
double pointLon) {
// TODO: can we make this the trueBestDistance? I.e., minimum distance between the point and
// ANY point on the box? we can speed things
// up if so, but not enrolling any BKD cell whose true best distance is > bottom of the current
// hit queue
if (pointLat >= minLat && pointLat <= maxLat && pointLon >= minLon && pointLon <= maxLon) {
// point is inside the cell!
return 0.0;
}
double d1 = SloppyMath.haversinSortKey(pointLat, pointLon, minLat, minLon);
double d2 = SloppyMath.haversinSortKey(pointLat, pointLon, minLat, maxLon);
double d3 = SloppyMath.haversinSortKey(pointLat, pointLon, maxLat, maxLon);
double d4 = SloppyMath.haversinSortKey(pointLat, pointLon, maxLat, minLon);
return Math.min(Math.min(d1, d2), Math.min(d3, d4));
}
}
| |