// org.apache.lucene.facet.range.LongRangeCounter — Maven / Gradle / Ivy
// Show all versions of lucene-facet · Show documentation
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.facet.range;
import java.util.Arrays;
import java.util.Comparator;
/**
 * Tallies, per requested numeric range, how many documents have a value inside that range. Both
 * single-valued and multi-valued document fields are supported, as long as the calling protocol
 * below is respected.
 *
 * <p>Calling protocol:
 *
 * <ul>
 *   <li>Field with exactly one value per document: call {@link #addSingleValued(long)} with that
 *       value.
 *   <li>Field with several values per document: call {@link #startMultiValuedDoc()} first, then
 *       {@link #addMultiValued(long)} once per value, and finally {@link #endMultiValuedDoc()}.
 *       The boolean returned by {@code endMultiValuedDoc()} tells whether the document matched at
 *       least one of the counted ranges.
 *   <li>Once every document has been processed: call {@link #finish()}. Only after this call is
 *       the user-provided {@code countBuffer} guaranteed to hold accurate counts, where each
 *       index corresponds to the {@code LongRange} at the same position in {@code ranges}.
 * </ul>
 *
 * <p>{@code finish()} additionally reports how many further documents matched no range. It never
 * re-reports documents already flagged as non-matching by {@code endMultiValuedDoc()}, so the
 * total number of missing documents is the number of {@code false} responses from {@code
 * endMultiValuedDoc()} plus the value returned by {@code finish()}.
 */
abstract class LongRangeCounter {

  /** Caller-supplied buffer that accumulates the count of every requested range. */
  private final int[] countBuffer;

  /**
   * Index of the most recent elementary interval counted for the multi-valued document currently
   * being processed, or {@code -1} when none has been counted yet. Because the values of a single
   * document arrive sorted, later values of the same document only need to be searched for above
   * this interval.
   */
  protected int multiValuedDocLastSeenElementaryInterval;

  /**
   * Builds the counter implementation suited to {@code ranges}: the overlapping-aware variant when
   * any two ranges overlap, the exclusive variant otherwise.
   *
   * @param ranges the ranges to count against
   * @param countBuffer buffer the resulting counter writes its per-range counts into
   * @return a counter for the given ranges
   */
  static LongRangeCounter create(LongRange[] ranges, int[] countBuffer) {
    return hasOverlappingRanges(ranges)
        ? new OverlappingLongRangeCounter(ranges, countBuffer)
        : new ExclusiveLongRangeCounter(ranges, countBuffer);
  }

  /**
   * @param countBuffer user-provided buffer; range counts are written directly into it (no copy
   *     is made)
   */
  protected LongRangeCounter(int[] countBuffer) {
    this.countBuffer = countBuffer;
  }

  /**
   * Marks the beginning of a multi-valued document by resetting the last-seen elementary interval.
   * Not needed when counting single-valued documents.
   */
  void startMultiValuedDoc() {
    multiValuedDocLastSeenElementaryInterval = -1;
  }

  /**
   * Marks the end of a multi-valued document. Not needed when counting single-valued documents.
   *
   * @return whether the document contributed a count to at least one range
   */
  abstract boolean endMultiValuedDoc();

  /**
   * Counts the value of a single-valued document.
   *
   * @param v the document's value
   */
  void addSingleValued(long v) {
    // NOTE: locating the interval with Arrays.binarySearch(boundaries, v) (mapping a negative
    // result r to -r-1) works too, but it was measured ~6% slower on a simple test with a
    // high-freq TermQuery w/ range faceting on wikimediumall, hence the hand-written search.

    // The search below always terminates with a hit because the last boundary is
    // Long.MAX_VALUE, so every value is <= some boundary.
    final long[] bounds = boundaries();
    int low = 0;
    int high = bounds.length - 1;
    while (true) {
      final int probe = (low + high) >>> 1;
      if (v <= bounds[probe]) {
        if (probe == 0) {
          // Nothing lies below the first interval, so this is the match.
          processSingleValuedHit(0);
          return;
        }
        high = probe - 1;
      } else {
        // v is above interval "probe"; it belongs to the next one if it fits under its bound.
        final int next = probe + 1;
        if (v <= bounds[next]) {
          processSingleValuedHit(next);
          return;
        }
        low = next;
      }
    }
  }

  /**
   * Counts one value of a multi-valued document. Values of the same document are expected in
   * sorted order; a value that does not advance past the last counted elementary interval is
   * ignored.
   *
   * @param v one of the document's values
   */
  void addMultiValued(long v) {
    if (rangeCount() == 0) {
      // No ranges were requested, so there is nothing to count.
      return;
    }

    final long[] bounds = boundaries();
    final int lastSeen = multiValuedDocLastSeenElementaryInterval;

    // A value still inside (or below) the last counted interval adds no new information.
    if (lastSeen != -1 && v <= bounds[lastSeen]) {
      return;
    }

    // If the final elementary interval was already counted, this doc cannot hit anything else.
    final int firstCandidate = lastSeen + 1;
    if (firstCandidate == bounds.length) {
      return;
    }

    // Binary search restricted to [firstCandidate, last interval].
    int low = firstCandidate;
    int high = bounds.length - 1;
    while (true) {
      final int probe = (low + high) >>> 1;
      if (v <= bounds[probe]) {
        if (probe == firstCandidate) {
          // Cannot go lower than the first candidate, so this is the match.
          recordMultiValuedHit(probe);
          return;
        }
        high = probe - 1;
      } else {
        final int next = probe + 1;
        if (v <= bounds[next]) {
          recordMultiValuedHit(next);
          return;
        }
        low = next;
      }
    }
  }

  /**
   * Reports a multi-valued hit to the subclass and afterwards remembers the interval as the last
   * one seen for the current document.
   */
  private void recordMultiValuedHit(int elementaryIntervalNum) {
    processMultiValuedHit(elementaryIntervalNum);
    multiValuedDocLastSeenElementaryInterval = elementaryIntervalNum;
  }

  /**
   * Completes counting after all documents have been processed.
   *
   * @return the number of documents that did not contribute to any range, excluding those already
   *     reported through {@link #endMultiValuedDoc()}
   */
  abstract int finish();

  /**
   * Supplies the elementary interval boundaries.
   *
   * @return for each elementary interval, its maximum (inclusive) value
   */
  protected abstract long[] boundaries();

  /**
   * Handles a single-valued document landing in the given elementary interval.
   *
   * @param elementaryIntervalNum index of the matched elementary interval
   */
  protected abstract void processSingleValuedHit(int elementaryIntervalNum);

  /**
   * Handles a value of a multi-valued document landing in the given elementary interval.
   *
   * @param elementaryIntervalNum index of the matched elementary interval
   */
  protected abstract void processMultiValuedHit(int elementaryIntervalNum);

  /**
   * Adds one to the count of the given range.
   *
   * @param rangeNum index of the range in the count buffer
   */
  protected final void increment(int rangeNum) {
    countBuffer[rangeNum] += 1;
  }

  /**
   * Adds {@code count} to the count of the given range.
   *
   * @param rangeNum index of the range in the count buffer
   * @param count amount to add
   */
  protected final void increment(int rangeNum, int count) {
    countBuffer[rangeNum] += count;
  }

  /**
   * @return the number of ranges requested by the caller, i.e. the length of the count buffer
   */
  protected final int rangeCount() {
    return countBuffer.length;
  }

  /**
   * Tells whether any two of the requested ranges overlap.
   *
   * @param ranges the requested ranges; left unmodified
   * @return {@code true} if at least two ranges share a value, {@code false} otherwise (including
   *     when {@code ranges} is empty)
   */
  private static boolean hasOverlappingRanges(LongRange[] ranges) {
    if (ranges.length == 0) {
      return false;
    }

    // Sort a private copy so the caller's array keeps its original order.
    final LongRange[] byMin = ranges.clone();
    Arrays.sort(byMin, Comparator.comparingLong(r -> r.min));

    // With the ranges ordered by min, it is enough to compare each range with its predecessor.
    // LongRange models closed ranges, so a min equal to the preceding max already overlaps.
    long precedingMax = byMin[0].max;
    for (int i = 1; i < byMin.length; i++) {
      final LongRange current = byMin[i];
      if (current.min <= precedingMax) {
        return true;
      }
      precedingMax = current.max;
    }
    return false;
  }

  /** Immutable pair of bounds describing a range where both ends are included. */
  protected static final class InclusiveRange {
    final long start;
    final long end;

    /**
     * @param start lower bound (inclusive)
     * @param end upper bound (inclusive); asserted to be no smaller than {@code start}
     */
    InclusiveRange(long start, long end) {
      assert end >= start;
      this.start = start;
      this.end = end;
    }

    @Override
    public String toString() {
      return start + " to " + end;
    }
  }
}