/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.facet;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import org.apache.lucene.facet.FacetsConfig.DimConfig;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.CollectorManager;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.util.BitDocIdSet;
import org.apache.lucene.util.FixedBitSet;
/**
* Collects hits for subsequent faceting, using sampling if needed. Once you've run a search and
* collected hits into this, instantiate one of the {@link Facets} subclasses to do the facet
* counting. Note that this collector does not collect the scores of matching docs; i.e. {@link
* FacetsCollector.MatchingDocs#scores} is {@code null}.
*
* <p>If you require the original set of hits, you can call {@link #getOriginalMatchingDocs()}.
* Also, since the counts of the top facets are based on the sampled set, you can amortize the
* counts by calling {@link #amortizeFacetCounts}.
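*
* <p>A minimal usage sketch (assuming {@code indexSearcher}, {@code taxoReader}, {@code query}
* and {@code config} already exist; {@code FastTaxonomyFacetCounts} is one of the counting
* {@link Facets} implementations):
*
* <pre>{@code
* RandomSamplingFacetsCollector fc = new RandomSamplingFacetsCollector(1000);
* FacetsCollector.search(indexSearcher, query, 10, fc);
* Facets facets = new FastTaxonomyFacetCounts(taxoReader, config, fc);
* FacetResult sampled = facets.getTopChildren(10, "Author");
* FacetResult amortized = fc.amortizeFacetCounts(sampled, config, indexSearcher);
* }</pre>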
*/
public class RandomSamplingFacetsCollector extends FacetsCollector {
/**
* Faster alternative for java.util.Random, inspired by
* http://dmurphy747.wordpress.com/2011/03/23/xorshift-vs-random-performance-in-java/
*
* <p>Has a period of 2^64-1
*/
private static class XORShift64Random {
private long x;
/** Creates a xorshift random generator using the provided seed */
public XORShift64Random(long seed) {
x = seed == 0 ? 0xdeadbeef : seed;
}
/** Get the next random long value */
public long randomLong() {
x ^= (x << 21);
x ^= (x >>> 35);
x ^= (x << 4);
return x;
}
/** Get the next random int, between 0 (inclusive) and n (exclusive) */
public int nextInt(int n) {
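// Map the raw long onto [0, n) by taking the remainder and reflecting negative values.
// The reflection makes 0 roughly half as likely as the other values; this slight bias
// is acceptable for sampling purposes.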
int res = (int) (randomLong() % n);
return (res < 0) ? -res : res;
}
}
private static final int NOT_CALCULATED = -1;
private final int sampleSize;
private final XORShift64Random random;
private double samplingRate;
private List<MatchingDocs> sampledDocs;
private int totalHits = NOT_CALCULATED;
private int leftoverBin = NOT_CALCULATED;
private int leftoverIndex = NOT_CALCULATED;
/**
* Constructor with the given sample size and default seed.
*
* @see #RandomSamplingFacetsCollector(int, long)
*/
public RandomSamplingFacetsCollector(int sampleSize) {
this(sampleSize, 0);
}
/**
* Constructor with the given sample size and seed.
*
* @param sampleSize The preferred sample size. If the number of hits is greater than the size,
* sampling will be done using a sampling ratio of {@code sampleSize / totalHits}. For example:
* 1000 hits with sample size = 10 results in a sampling ratio of 0.01. If the number of hits is
* lower, no sampling is done at all.
* @param seed The random seed. If {@code 0} then a seed will be chosen for you.
*/
public RandomSamplingFacetsCollector(int sampleSize, long seed) {
super(false);
this.sampleSize = sampleSize;
this.random = new XORShift64Random(seed);
this.sampledDocs = null;
}
/**
* Returns the sampled list of the matching documents. Note that a {@link
* FacetsCollector.MatchingDocs} instance is returned per segment, even if no hits from that
* segment are included in the sampled set.
*
* <p>Note: One or more of the MatchingDocs might be empty (not containing any hits) as a result
* of sampling.
*
* <p>Note: {@code MatchingDocs.totalHits} is copied from the original MatchingDocs; scores are
* set to {@code null}.
*/
@Override
public List<MatchingDocs> getMatchingDocs() {
List matchingDocs = super.getMatchingDocs();
if (totalHits == NOT_CALCULATED) {
totalHits = 0;
for (MatchingDocs md : matchingDocs) {
totalHits += md.totalHits;
}
}
if (totalHits <= sampleSize) {
return matchingDocs;
}
if (sampledDocs == null) {
samplingRate = (1.0 * sampleSize) / totalHits;
sampledDocs = createSampledDocs(matchingDocs);
}
return sampledDocs;
}
/** Returns the original matching documents. */
public List<MatchingDocs> getOriginalMatchingDocs() {
return super.getMatchingDocs();
}
/** Create a sampled copy of the matching documents list. */
private List<MatchingDocs> createSampledDocs(List<MatchingDocs> matchingDocsList) {
List<MatchingDocs> sampledDocsList = new ArrayList<>(matchingDocsList.size());
for (MatchingDocs docs : matchingDocsList) {
sampledDocsList.add(createSample(docs));
}
return sampledDocsList;
}
/** Create a sampled subset of the given hits. */
private MatchingDocs createSample(MatchingDocs docs) {
int maxdoc = docs.context.reader().maxDoc();
// TODO: we could try the WAH8DocIdSet here as well, as the results will be sparse
FixedBitSet sampleDocs = new FixedBitSet(maxdoc);
int binSize = (int) (1.0 / samplingRate);
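// Binned sampling: conceptually split the hits into consecutive bins of ~binSize documents
// and select one document at a uniformly random offset within each bin. leftoverBin and
// leftoverIndex carry a partially consumed bin over from the previous segment.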
try {
int counter = 0;
int limit, randomIndex;
if (leftoverBin != NOT_CALCULATED) {
limit = leftoverBin;
// either NOT_CALCULATED, which means we already sampled from that bin,
// or the next document to sample
randomIndex = leftoverIndex;
} else {
limit = binSize;
randomIndex = random.nextInt(binSize);
}
final DocIdSetIterator it = docs.bits.iterator();
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
if (counter == randomIndex) {
sampleDocs.set(doc);
}
counter++;
if (counter >= limit) {
counter = 0;
limit = binSize;
randomIndex = random.nextInt(binSize);
}
}
if (counter == 0) {
// we either exhausted the bin and the iterator at the same time, or
// this segment had no results. in the latter case we might want to
// carry leftover to the next segment as is, but that complicates the
// code and doesn't seem so important.
leftoverBin = leftoverIndex = NOT_CALCULATED;
} else {
leftoverBin = limit - counter;
if (randomIndex > counter) {
// the document to sample is in the next bin
leftoverIndex = randomIndex - counter;
} else if (randomIndex < counter) {
// we sampled a document from the bin, so just skip over remaining
// documents in the bin in the next segment.
leftoverIndex = NOT_CALCULATED;
}
}
return new MatchingDocs(docs.context, new BitDocIdSet(sampleDocs), docs.totalHits, null);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
/**
* Note: if you use a counting {@link Facets} implementation, you can amortize the sampled counts
* by calling this method. Uses the {@link FacetsConfig} and the {@link IndexSearcher} to
* determine the upper bound for each facet value.
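*
* <p>For example (hypothetical numbers): with a sampling rate of 0.01, a sampled count of 7 is
* scaled up to 700, then capped by the document frequency of the corresponding facet term, since
* the true count can never exceed that upper bound.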
*/
public FacetResult amortizeFacetCounts(
FacetResult res, FacetsConfig config, IndexSearcher searcher) throws IOException {
if (res == null || totalHits <= sampleSize) {
return res;
}
LabelAndValue[] fixedLabelValues = new LabelAndValue[res.labelValues.length];
IndexReader reader = searcher.getIndexReader();
DimConfig dimConfig = config.getDimConfig(res.dim);
// +2 to prepend dimension, append child label
String[] childPath = new String[res.path.length + 2];
childPath[0] = res.dim;
System.arraycopy(res.path, 0, childPath, 1, res.path.length); // reuse
for (int i = 0; i < res.labelValues.length; i++) {
childPath[res.path.length + 1] = res.labelValues[i].label;
String fullPath = FacetsConfig.pathToString(childPath, childPath.length);
int max = reader.docFreq(new Term(dimConfig.indexFieldName, fullPath));
int correctedCount = (int) (res.labelValues[i].value.doubleValue() / samplingRate);
correctedCount = Math.min(max, correctedCount);
fixedLabelValues[i] = new LabelAndValue(res.labelValues[i].label, correctedCount);
}
// cap the total count on the total number of non-deleted documents in the reader
int correctedTotalCount = res.value.intValue();
if (correctedTotalCount > 0) {
correctedTotalCount =
Math.min(reader.numDocs(), (int) (res.value.doubleValue() / samplingRate));
}
return new FacetResult(
res.dim, res.path, correctedTotalCount, fixedLabelValues, res.childCount);
}
/** Returns the sampling rate that was used. */
public double getSamplingRate() {
return samplingRate;
}
/**
* Creates a {@link CollectorManager} for concurrent random sampling through {@link
* RandomSamplingFacetsCollector}
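*
* <p>A minimal usage sketch (assuming {@code indexSearcher} and {@code query} already exist):
*
* <pre>{@code
* CollectorManager<RandomSamplingFacetsCollector, RandomSamplingFacetsCollector> manager =
*     RandomSamplingFacetsCollector.createManager(1000, 42L);
* RandomSamplingFacetsCollector fc = indexSearcher.search(query, manager);
* }</pre>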
*/
public static CollectorManager<RandomSamplingFacetsCollector, RandomSamplingFacetsCollector>
createManager(int sampleSize, long seed) {
return new CollectorManager<>() {
@Override
public RandomSamplingFacetsCollector newCollector() {
return new RandomSamplingFacetsCollector(sampleSize, seed);
}
@Override
public RandomSamplingFacetsCollector reduce(
Collection<RandomSamplingFacetsCollector> collectors) {
if (collectors == null || collectors.size() == 0) {
return new RandomSamplingFacetsCollector(sampleSize, seed);
}
if (collectors.size() == 1) {
return collectors.iterator().next();
}
return new ReducedRandomSamplingFacetsCollector(sampleSize, seed, collectors);
}
};
}
private static class ReducedRandomSamplingFacetsCollector extends RandomSamplingFacetsCollector {
ReducedRandomSamplingFacetsCollector(
int sampleSize, long seed, Collection<RandomSamplingFacetsCollector> facetsCollectors) {
super(sampleSize, seed);
facetsCollectors.forEach(
facetsCollector ->
getOriginalMatchingDocs().addAll(facetsCollector.getOriginalMatchingDocs()));
}
}
}