org.apache.lucene.search.NearestNeighbor Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-sandbox Show documentation
Show all versions of lucene-sandbox Show documentation
Apache Lucene (module: sandbox)
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.PriorityQueue;
import org.apache.lucene.geo.Rectangle;
import org.apache.lucene.index.PointValues.IntersectVisitor;
import org.apache.lucene.index.PointValues.Relation;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.SloppyMath;
import org.apache.lucene.util.bkd.BKDReader;
import org.apache.lucene.util.bkd.BKDReader.IndexTree;
import org.apache.lucene.util.bkd.BKDReader.IntersectState;
import static org.apache.lucene.geo.GeoEncodingUtils.decodeLatitude;
import static org.apache.lucene.geo.GeoEncodingUtils.decodeLongitude;
/**
* KNN search on top of 2D lat/lon indexed points.
*
* @lucene.experimental
*/
class NearestNeighbor {
static class Cell implements Comparable {
final int readerIndex;
final byte[] minPacked;
final byte[] maxPacked;
final IndexTree index;
/**
* The closest distance from a point in this cell to the query point, computed as a sort key through
* {@link SloppyMath#haversinSortKey}. Note that this is an approximation to the closest distance,
* and there could be a point in the cell that is closer.
*/
final double distanceSortKey;
public Cell(IndexTree index, int readerIndex, byte[] minPacked, byte[] maxPacked, double distanceSortKey) {
this.index = index;
this.readerIndex = readerIndex;
this.minPacked = minPacked.clone();
this.maxPacked = maxPacked.clone();
this.distanceSortKey = distanceSortKey;
}
public int compareTo(Cell other) {
return Double.compare(distanceSortKey, other.distanceSortKey);
}
@Override
public String toString() {
double minLat = decodeLatitude(minPacked, 0);
double minLon = decodeLongitude(minPacked, Integer.BYTES);
double maxLat = decodeLatitude(maxPacked, 0);
double maxLon = decodeLongitude(maxPacked, Integer.BYTES);
return "Cell(readerIndex=" + readerIndex + " nodeID=" + index.getNodeID() + " isLeaf=" + index.isLeafNode() + " lat=" + minLat + " TO " + maxLat + ", lon=" + minLon + " TO " + maxLon + "; distanceSortKey=" + distanceSortKey + ")";
}
}
private static class NearestVisitor implements IntersectVisitor {
public int curDocBase;
public Bits curLiveDocs;
final int topN;
final PriorityQueue hitQueue;
final double pointLat;
final double pointLon;
private int setBottomCounter;
private double minLon = Double.NEGATIVE_INFINITY;
private double maxLon = Double.POSITIVE_INFINITY;
private double minLat = Double.NEGATIVE_INFINITY;
private double maxLat = Double.POSITIVE_INFINITY;
// second set of longitude ranges to check (for cross-dateline case)
private double minLon2 = Double.POSITIVE_INFINITY;
public NearestVisitor(PriorityQueue hitQueue, int topN, double pointLat, double pointLon) {
this.hitQueue = hitQueue;
this.topN = topN;
this.pointLat = pointLat;
this.pointLon = pointLon;
}
@Override
public void visit(int docID) {
throw new AssertionError();
}
private void maybeUpdateBBox() {
if (setBottomCounter < 1024 || (setBottomCounter & 0x3F) == 0x3F) {
NearestHit hit = hitQueue.peek();
Rectangle box = Rectangle.fromPointDistance(pointLat, pointLon,
SloppyMath.haversinMeters(hit.distanceSortKey));
//System.out.println(" update bbox to " + box);
minLat = box.minLat;
maxLat = box.maxLat;
if (box.crossesDateline()) {
// box1
minLon = Double.NEGATIVE_INFINITY;
maxLon = box.maxLon;
// box2
minLon2 = box.minLon;
} else {
minLon = box.minLon;
maxLon = box.maxLon;
// disable box2
minLon2 = Double.POSITIVE_INFINITY;
}
}
setBottomCounter++;
}
@Override
public void visit(int docID, byte[] packedValue) {
//System.out.println("visit docID=" + docID + " liveDocs=" + curLiveDocs);
if (curLiveDocs != null && curLiveDocs.get(docID) == false) {
return;
}
double docLatitude = decodeLatitude(packedValue, 0);
double docLongitude = decodeLongitude(packedValue, Integer.BYTES);
// test bounding box
if (docLatitude < minLat || docLatitude > maxLat) {
return;
}
if ((docLongitude < minLon || docLongitude > maxLon) && (docLongitude < minLon2)) {
return;
}
// Use the haversin sort key when comparing hits, as it is faster to compute than the true distance.
double distanceSortKey = SloppyMath.haversinSortKey(pointLat, pointLon, docLatitude, docLongitude);
//System.out.println(" visit docID=" + docID + " distanceSortKey=" + distanceSortKey + " docLat=" + docLatitude + " docLon=" + docLongitude);
int fullDocID = curDocBase + docID;
if (hitQueue.size() == topN) {
// queue already full
NearestHit hit = hitQueue.peek();
//System.out.println(" bottom distanceSortKey=" + hit.distanceSortKey);
// we don't collect docs in order here, so we must also test the tie-break case ourselves:
if (distanceSortKey < hit.distanceSortKey || (distanceSortKey == hit.distanceSortKey && fullDocID < hit.docID)) {
hitQueue.poll();
hit.docID = fullDocID;
hit.distanceSortKey = distanceSortKey;
hitQueue.offer(hit);
//System.out.println(" ** keep2, now bottom=" + hit);
maybeUpdateBBox();
}
} else {
NearestHit hit = new NearestHit();
hit.docID = fullDocID;
hit.distanceSortKey = distanceSortKey;
hitQueue.offer(hit);
//System.out.println(" ** keep1, now bottom=" + hit);
}
}
@Override
public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
double cellMinLat = decodeLatitude(minPackedValue, 0);
double cellMinLon = decodeLongitude(minPackedValue, Integer.BYTES);
double cellMaxLat = decodeLatitude(maxPackedValue, 0);
double cellMaxLon = decodeLongitude(maxPackedValue, Integer.BYTES);
if (cellMaxLat < minLat || maxLat < cellMinLat || ((cellMaxLon < minLon || maxLon < cellMinLon) && cellMaxLon < minLon2)) {
// this cell is outside our search bbox; don't bother exploring any more
return Relation.CELL_OUTSIDE_QUERY;
}
return Relation.CELL_CROSSES_QUERY;
}
}
/** Holds one hit from {@link NearestNeighbor#nearest} */
static class NearestHit {
public int docID;
/**
* The distance from the hit to the query point, computed as a sort key through {@link SloppyMath#haversinSortKey}.
*/
public double distanceSortKey;
@Override
public String toString() {
return "NearestHit(docID=" + docID + " distanceSortKey=" + distanceSortKey + ")";
}
}
// TODO: can we somehow share more with, or simply directly use, the LatLonPointDistanceComparator? It's really doing the same thing as
// our hitQueue...
public static NearestHit[] nearest(double pointLat, double pointLon, List readers, List liveDocs, List docBases, final int n) throws IOException {
//System.out.println("NEAREST: readers=" + readers + " liveDocs=" + liveDocs + " pointLat=" + pointLat + " pointLon=" + pointLon);
// Holds closest collected points seen so far:
// TODO: if we used lucene's PQ we could just updateTop instead of poll/offer:
final PriorityQueue hitQueue = new PriorityQueue<>(n, new Comparator() {
@Override
public int compare(NearestHit a, NearestHit b) {
// sort by opposite distanceSortKey natural order
int cmp = Double.compare(a.distanceSortKey, b.distanceSortKey);
if (cmp != 0) {
return -cmp;
}
// tie-break by higher docID:
return b.docID - a.docID;
}
});
// Holds all cells, sorted by closest to the point:
PriorityQueue cellQueue = new PriorityQueue<>();
NearestVisitor visitor = new NearestVisitor(hitQueue, n, pointLat, pointLon);
List states = new ArrayList<>();
// Add root cell for each reader into the queue:
int bytesPerDim = -1;
for(int i=0;i 0) {
Cell cell = cellQueue.poll();
//System.out.println(" visit " + cell);
// TODO: if we replace approxBestDistance with actualBestDistance, we can put an opto here to break once this "best" cell is fully outside of the hitQueue bottom's radius:
BKDReader reader = readers.get(cell.readerIndex);
if (cell.index.isLeafNode()) {
//System.out.println(" leaf");
// Leaf block: visit all points and possibly collect them:
visitor.curDocBase = docBases.get(cell.readerIndex);
visitor.curLiveDocs = liveDocs.get(cell.readerIndex);
reader.visitLeafBlockValues(cell.index, states.get(cell.readerIndex));
//System.out.println(" now " + hitQueue.size() + " hits");
} else {
//System.out.println(" non-leaf");
// Non-leaf block: split into two cells and put them back into the queue:
if (visitor.compare(cell.minPacked, cell.maxPacked) == Relation.CELL_OUTSIDE_QUERY) {
continue;
}
BytesRef splitValue = BytesRef.deepCopyOf(cell.index.getSplitDimValue());
int splitDim = cell.index.getSplitDim();
// we must clone the index so that we we can recurse left and right "concurrently":
IndexTree newIndex = cell.index.clone();
byte[] splitPackedValue = cell.maxPacked.clone();
System.arraycopy(splitValue.bytes, splitValue.offset, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
cell.index.pushLeft();
cellQueue.offer(new Cell(cell.index, cell.readerIndex, cell.minPacked, splitPackedValue,
approxBestDistance(cell.minPacked, splitPackedValue, pointLat, pointLon)));
splitPackedValue = cell.minPacked.clone();
System.arraycopy(splitValue.bytes, splitValue.offset, splitPackedValue, splitDim*bytesPerDim, bytesPerDim);
newIndex.pushRight();
cellQueue.offer(new Cell(newIndex, cell.readerIndex, splitPackedValue, cell.maxPacked,
approxBestDistance(splitPackedValue, cell.maxPacked, pointLat, pointLon)));
}
}
NearestHit[] hits = new NearestHit[hitQueue.size()];
int downTo = hitQueue.size()-1;
while (hitQueue.size() != 0) {
hits[downTo] = hitQueue.poll();
downTo--;
}
return hits;
}
// NOTE: incoming args never cross the dateline, since they are a BKD cell
private static double approxBestDistance(byte[] minPackedValue, byte[] maxPackedValue, double pointLat, double pointLon) {
double minLat = decodeLatitude(minPackedValue, 0);
double minLon = decodeLongitude(minPackedValue, Integer.BYTES);
double maxLat = decodeLatitude(maxPackedValue, 0);
double maxLon = decodeLongitude(maxPackedValue, Integer.BYTES);
return approxBestDistance(minLat, maxLat, minLon, maxLon, pointLat, pointLon);
}
// NOTE: incoming args never cross the dateline, since they are a BKD cell
private static double approxBestDistance(double minLat, double maxLat, double minLon, double maxLon, double pointLat, double pointLon) {
// TODO: can we make this the trueBestDistance? I.e., minimum distance between the point and ANY point on the box? we can speed things
// up if so, but not enrolling any BKD cell whose true best distance is > bottom of the current hit queue
if (pointLat >= minLat && pointLat <= maxLat && pointLon >= minLon && pointLon <= maxLon) {
// point is inside the cell!
return 0.0;
}
double d1 = SloppyMath.haversinSortKey(pointLat, pointLon, minLat, minLon);
double d2 = SloppyMath.haversinSortKey(pointLat, pointLon, minLat, maxLon);
double d3 = SloppyMath.haversinSortKey(pointLat, pointLon, maxLat, maxLon);
double d4 = SloppyMath.haversinSortKey(pointLat, pointLon, maxLat, minLon);
return Math.min(Math.min(d1, d2), Math.min(d3, d4));
}
}
| |