org.apache.lucene.facet.range.OverlappingLongRangeCounter Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-facet Show documentation
Apache Lucene (module: facet)
The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.facet.range;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.lucene.internal.hppc.IntArrayList;
import org.apache.lucene.internal.hppc.IntCursor;
import org.apache.lucene.internal.hppc.LongArrayList;
import org.apache.lucene.internal.hppc.LongIntHashMap;
import org.apache.lucene.util.FixedBitSet;
/**
 * This implementation supports requested ranges that overlap. Because of this, we use a
 * segment-tree to more efficiently aggregate counts into ranges at the end of processing. We also
 * need to worry about double-counting issues since it's possible that multiple elementary
 * intervals, although mutually-exclusive, can roll-up to the same requested range. This creates
 * some complexity with how we need to handle multi-valued documents.
 */
class OverlappingLongRangeCounter extends LongRangeCounter {

  /** segment tree root node */
  private final LongRangeNode root;

  /** elementary interval boundaries used for efficient counting (bsearch to find interval) */
  private final long[] boundaries;

  /**
   * whether-or-not there are elementary interval counts that still need to be rolled up at the end
   */
  private boolean hasUnflushedCounts;

  // Needed only for counting single-valued docs:
  /** counts seen in each elementary interval */
  private int[] singleValuedElementaryIntervalCounts;

  // Needed only for counting multi-valued docs:
  /** whether-or-not an elementary interval has seen at least one match for a single doc */
  private FixedBitSet multiValuedDocElementaryIntervalHits;

  /** whether-or-not a requested range has seen at least one match for a single doc */
  private FixedBitSet multiValuedDocRangeHits;

  // Used during rollup
  private int elementaryIntervalUpto;

  /** number of counted documents that haven't matched any requested ranges */
  private int missingCount;

  OverlappingLongRangeCounter(LongRange[] ranges, int[] countBuffer) {
    super(countBuffer);

    // Build elementary intervals:
    List<InclusiveRange> elementaryIntervals = buildElementaryIntervals(ranges);

    // Build binary tree on top of intervals:
    root = split(0, elementaryIntervals.size(), elementaryIntervals);

    // Set outputs, so we know which range to output for each node in the tree:
    for (int i = 0; i < ranges.length; i++) {
      root.addOutputs(i, ranges[i]);
    }

    // Keep track of elementary interval max boundaries for bsearch:
    boundaries = new long[elementaryIntervals.size()];
    for (int i = 0; i < boundaries.length; i++) {
      boundaries[i] = elementaryIntervals.get(i).end;
    }
  }

  @Override
  void startMultiValuedDoc() {
    super.startMultiValuedDoc();
    // Lazy init a bitset to track the elementary intervals we see of a multi-valued doc:
    if (multiValuedDocElementaryIntervalHits == null) {
      multiValuedDocElementaryIntervalHits = new FixedBitSet(boundaries.length);
    } else {
      multiValuedDocElementaryIntervalHits.clear();
    }
  }

  @Override
  boolean endMultiValuedDoc() {
    assert multiValuedDocElementaryIntervalHits != null : "must call startDoc() first";

    // Short-circuit if the caller didn't specify any ranges to count:
    if (rangeCount() == 0) {
      return false;
    }

    // Do the rollup for this doc:
    // Lazy init a bitset to track the requested ranges seen for this multi-valued doc:
    if (multiValuedDocRangeHits == null) {
      multiValuedDocRangeHits = new FixedBitSet(rangeCount());
    } else {
      multiValuedDocRangeHits.clear();
    }
    elementaryIntervalUpto = 0;
    rollupMultiValued(root);

    // Actually increment the count for each matching range, and see if the doc contributed to
    // at least one:
    boolean docContributedToAtLeastOneRange = false;
    for (int i = multiValuedDocRangeHits.nextSetBit(0); i < multiValuedDocRangeHits.length(); ) {
      increment(i);
      docContributedToAtLeastOneRange = true;
      if (++i < multiValuedDocRangeHits.length()) {
        i = multiValuedDocRangeHits.nextSetBit(i);
      }
    }

    return docContributedToAtLeastOneRange;
  }

  @Override
  int finish() {
    if (hasUnflushedCounts) {
      // Rollup any outstanding counts from single-valued cases:
      missingCount = 0;
      elementaryIntervalUpto = 0;
      rollupSingleValued(root, false);
      return missingCount;
    } else {
      return 0;
    }
  }

  @Override
  protected long[] boundaries() {
    return boundaries;
  }

  @Override
  protected void processSingleValuedHit(int elementaryIntervalNum) {
    // Lazy init:
    if (singleValuedElementaryIntervalCounts == null) {
      singleValuedElementaryIntervalCounts = new int[boundaries.length];
    }

    singleValuedElementaryIntervalCounts[elementaryIntervalNum]++;
    hasUnflushedCounts = true;
  }

  @Override
  protected void processMultiValuedHit(int elementaryIntervalNum) {
    assert multiValuedDocElementaryIntervalHits != null : "must call startDoc() first";
    multiValuedDocElementaryIntervalHits.set(elementaryIntervalNum);
  }

  /**
   * Builds the segment tree recursively: leaves wrap a single elementary interval, inner nodes
   * cover the union of their children. Uses {@code >>> 1} for an overflow-safe midpoint.
   */
  private static LongRangeNode split(int start, int end, List<InclusiveRange> elementaryIntervals) {
    if (start == end - 1) {
      // leaf
      InclusiveRange range = elementaryIntervals.get(start);
      return new LongRangeNode(range.start, range.end, null, null, start);
    } else {
      int mid = (start + end) >>> 1;
      LongRangeNode left = split(start, mid, elementaryIntervals);
      LongRangeNode right = split(mid, end, elementaryIntervals);
      return new LongRangeNode(left.start, right.end, left, right, -1);
    }
  }

  /**
   * Rolls up all the single-valued doc counts. Note that this is done once at the end of processing
   * all documents (as part of {@link #finish()}). This is done in bulk at the end for efficiency
   * purposes (vs. after every document). This works only for cases where documents have a
   * single-value. Multi-valued docs need to get rolled up after each document to ensure there's no
   * double-counting (see {@link #rollupMultiValued(LongRangeNode)})
   */
  private int rollupSingleValued(LongRangeNode node, boolean sawOutputs) {
    int count;
    sawOutputs |= node.outputs != null;
    if (node.left != null) {
      count = rollupSingleValued(node.left, sawOutputs);
      count += rollupSingleValued(node.right, sawOutputs);
    } else {
      // Leaf:
      count = singleValuedElementaryIntervalCounts[elementaryIntervalUpto];
      elementaryIntervalUpto++;
      if (sawOutputs == false) {
        // This is a missing count (no output ranges were seen "above" us):
        missingCount += count;
      }
    }
    if (node.outputs != null) {
      for (IntCursor rangeIndex : node.outputs) {
        increment(rangeIndex.value, count);
      }
    }

    return count;
  }

  /**
   * Rolls up all the multi-valued doc counts. Note that this is done at the end of each document
   * (as part of {@link #endMultiValuedDoc()}). All of the counts contributed by a single document
   * get rolled up into the appropriate ranges in this step. It must be done after each document so
   * that counts don't get double-counted, and so we know whether-or-not an individual doc actually
   * contributed to any of the user-requested ranges.
   */
  private boolean rollupMultiValued(LongRangeNode node) {
    boolean containedHit;
    if (node.left != null) {
      containedHit = rollupMultiValued(node.left);
      containedHit |= rollupMultiValued(node.right);
    } else {
      // Leaf:
      containedHit = multiValuedDocElementaryIntervalHits.get(elementaryIntervalUpto);
      elementaryIntervalUpto++;
    }
    if (containedHit && node.outputs != null) {
      for (IntCursor rangeIndex : node.outputs) {
        multiValuedDocRangeHits.set(rangeIndex.value);
      }
    }

    return containedHit;
  }

  /**
   * Computes the mutually-exclusive elementary intervals (a 1D Venn diagram) induced by the
   * requested ranges' endpoints, covering all of {@code long} space from {@link Long#MIN_VALUE} to
   * {@link Long#MAX_VALUE}.
   */
  private static List<InclusiveRange> buildElementaryIntervals(LongRange[] ranges) {
    // Maps all range inclusive endpoints to int flags; 1
    // = start of interval, 2 = end of interval. We need to
    // track the start vs end case separately because if a
    // given point is both, then it must be its own
    // elementary interval:
    LongIntHashMap endsMap = new LongIntHashMap();

    endsMap.put(Long.MIN_VALUE, 1);
    endsMap.put(Long.MAX_VALUE, 2);

    for (LongRange range : ranges) {
      int index = endsMap.indexOf(range.min);
      if (index < 0) {
        endsMap.indexInsert(index, range.min, 1);
      } else {
        endsMap.indexReplace(index, endsMap.indexGet(index) | 1);
      }
      index = endsMap.indexOf(range.max);
      if (index < 0) {
        endsMap.indexInsert(index, range.max, 2);
      } else {
        endsMap.indexReplace(index, endsMap.indexGet(index) | 2);
      }
    }

    LongArrayList endsList = new LongArrayList(endsMap.size());
    endsList.addAll(endsMap.keys());
    Arrays.sort(endsList.buffer, 0, endsList.size());

    // Build elementaryIntervals (a 1D Venn diagram):
    List<InclusiveRange> elementaryIntervals = new ArrayList<>();
    int upto = 1;
    long v = endsList.get(0);
    long prev;
    if (endsMap.get(v) == 3) {
      // This point is both a start and an end, so it gets its own singleton interval:
      elementaryIntervals.add(new InclusiveRange(v, v));
      prev = v + 1;
    } else {
      prev = v;
    }

    while (upto < endsList.size()) {
      v = endsList.get(upto);
      int flags = endsMap.get(v);
      if (flags == 3) {
        // This point is both an end and a start; we need to
        // separate it:
        if (v > prev) {
          elementaryIntervals.add(new InclusiveRange(prev, v - 1));
        }
        elementaryIntervals.add(new InclusiveRange(v, v));
        prev = v + 1;
      } else if (flags == 1) {
        // This point is only the start of an interval;
        // attach it to next interval:
        if (v > prev) {
          elementaryIntervals.add(new InclusiveRange(prev, v - 1));
        }
        prev = v;
      } else {
        assert flags == 2;
        // This point is only the end of an interval; attach
        // it to last interval:
        elementaryIntervals.add(new InclusiveRange(prev, v));
        prev = v + 1;
      }
      upto++;
    }

    return elementaryIntervals;
  }

  /** Holds one node of the segment tree. */
  public static final class LongRangeNode {
    final LongRangeNode left;
    final LongRangeNode right;

    // Our range, inclusive:
    final long start;
    final long end;

    // If we are a leaf, the index into elementary ranges that we point to:
    final int elementaryIntervalIndex;

    // Which range indices to output when a query goes
    // through this node:
    IntArrayList outputs;

    public LongRangeNode(
        long start,
        long end,
        LongRangeNode left,
        LongRangeNode right,
        int elementaryIntervalIndex) {
      this.start = start;
      this.end = end;
      this.left = left;
      this.right = right;
      this.elementaryIntervalIndex = elementaryIntervalIndex;
    }

    @Override
    public String toString() {
      StringBuilder sb = new StringBuilder();
      toString(sb, 0);
      return sb.toString();
    }

    static void indent(StringBuilder sb, int depth) {
      for (int i = 0; i < depth; i++) {
        sb.append(" ");
      }
    }

    /** Recursively assigns range outputs to each node. */
    void addOutputs(int index, LongRange range) {
      if (start >= range.min && end <= range.max) {
        // Our range is fully included in the incoming
        // range; add to our output list:
        if (outputs == null) {
          outputs = new IntArrayList();
        }
        outputs.add(index);
      } else if (left != null) {
        assert right != null;
        // Recurse:
        left.addOutputs(index, range);
        right.addOutputs(index, range);
      }
    }

    void toString(StringBuilder sb, int depth) {
      indent(sb, depth);
      if (left == null) {
        assert right == null;
        sb.append("leaf: ").append(start).append(" to ").append(end);
      } else {
        sb.append("node: ").append(start).append(" to ").append(end);
      }
      if (outputs != null) {
        sb.append(" outputs=");
        sb.append(outputs);
      }
      sb.append('\n');

      if (left != null) {
        assert right != null;
        left.toString(sb, depth + 1);
        right.toString(sb, depth + 1);
      }
    }
  }
}