All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.facet.RandomSamplingFacetsCollector Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.facet;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import org.apache.lucene.facet.FacetsConfig.DimConfig;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.CollectorManager;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.util.BitDocIdSet;
import org.apache.lucene.util.FixedBitSet;

/**
 * Collects hits for subsequent faceting, using sampling if needed. Once you've run a search and
 * collect hits into this, instantiate one of the {@link Facets} subclasses to do the facet
 * counting. Note that this collector does not collect the scores of matching docs (i.e. {@link
 * FacetsCollector.MatchingDocs#scores()}) is {@code null}.
 *
 * 

If you require the original set of hits, you can call {@link #getOriginalMatchingDocs()}. * Also, since the counts of the top-facets is based on the sampled set, you can amortize the counts * by calling {@link #amortizeFacetCounts}. */ public class RandomSamplingFacetsCollector extends FacetsCollector { /** * Faster alternative for java.util.Random, inspired by * http://dmurphy747.wordpress.com/2011/03/23/xorshift-vs-random- performance-in-java/ * *

Has a period of 2^64-1 */ private static class XORShift64Random { private long x; /** Creates a xorshift random generator using the provided seed */ public XORShift64Random(long seed) { x = seed == 0 ? 0xdeadbeef : seed; } /** Get the next random long value */ public long randomLong() { x ^= (x << 21); x ^= (x >>> 35); x ^= (x << 4); return x; } /** Get the next random int, between 0 (inclusive) and n (exclusive) */ public int nextInt(int n) { int res = (int) (randomLong() % n); return (res < 0) ? -res : res; } } private static final int NOT_CALCULATED = -1; private final int sampleSize; private final XORShift64Random random; private double samplingRate; private List sampledDocs; private int totalHits = NOT_CALCULATED; private int leftoverBin = NOT_CALCULATED; private int leftoverIndex = NOT_CALCULATED; /** * Constructor with the given sample size and default seed. * * @see #RandomSamplingFacetsCollector(int, long) */ public RandomSamplingFacetsCollector(int sampleSize) { this(sampleSize, 0); } /** * Constructor with the given sample size and seed. * * @param sampleSize The preferred sample size. If the number of hits is greater than the size, * sampling will be done using a sample ratio of sampling size / totalN. For example: 1000 * hits, sample size = 10 results in samplingRatio of 0.01. If the number of hits is lower, no * sampling is done at all * @param seed The random seed. If {@code 0} then a seed will be chosen for you. */ public RandomSamplingFacetsCollector(int sampleSize, long seed) { super(false); this.sampleSize = sampleSize; this.random = new XORShift64Random(seed); this.sampledDocs = null; } /** * Returns the sampled list of the matching documents. Note that a {@link * FacetsCollector.MatchingDocs} instance is returned per segment, even if no hits from that * segment are included in the sampled set. * *

Note: One or more of the MatchingDocs might be empty (not containing any hits) as result of * sampling. * *

Note: {@code MatchingDocs.totalHits} is copied from the original MatchingDocs, scores is set * to {@code null} */ @Override public List getMatchingDocs() { List matchingDocs = super.getMatchingDocs(); if (totalHits == NOT_CALCULATED) { totalHits = 0; for (MatchingDocs md : matchingDocs) { totalHits += md.totalHits(); } } if (totalHits <= sampleSize) { return matchingDocs; } if (sampledDocs == null) { samplingRate = (1.0 * sampleSize) / totalHits; sampledDocs = createSampledDocs(matchingDocs); } return sampledDocs; } /** Returns the original matching documents. */ public List getOriginalMatchingDocs() { return super.getMatchingDocs(); } /** Create a sampled copy of the matching documents list. */ private List createSampledDocs(List matchingDocsList) { List sampledDocsList = new ArrayList<>(matchingDocsList.size()); for (MatchingDocs docs : matchingDocsList) { sampledDocsList.add(createSample(docs)); } return sampledDocsList; } /** Create a sampled of the given hits. */ private MatchingDocs createSample(MatchingDocs docs) { int maxdoc = docs.context().reader().maxDoc(); // TODO: we could try the WAH8DocIdSet here as well, as the results will be sparse FixedBitSet sampleDocs = new FixedBitSet(maxdoc); int binSize = (int) (1.0 / samplingRate); try { int counter = 0; int limit, randomIndex; if (leftoverBin != NOT_CALCULATED) { limit = leftoverBin; // either NOT_CALCULATED, which means we already sampled from that bin, // or the next document to sample randomIndex = leftoverIndex; } else { limit = binSize; randomIndex = random.nextInt(binSize); } final DocIdSetIterator it = docs.bits().iterator(); for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) { if (counter == randomIndex) { sampleDocs.set(doc); } counter++; if (counter >= limit) { counter = 0; limit = binSize; randomIndex = random.nextInt(binSize); } } if (counter == 0) { // we either exhausted the bin and the iterator at the same time, or // this segment had no results. in the latter case we might want to // carry leftover to the next segment as is, but that complicates the // code and doesn't seem so important. leftoverBin = leftoverIndex = NOT_CALCULATED; } else { leftoverBin = limit - counter; if (randomIndex > counter) { // the document to sample is in the next bin leftoverIndex = randomIndex - counter; } else if (randomIndex < counter) { // we sampled a document from the bin, so just skip over remaining // documents in the bin in the next segment. leftoverIndex = NOT_CALCULATED; } } return new MatchingDocs(docs.context(), new BitDocIdSet(sampleDocs), docs.totalHits(), null); } catch (IOException e) { throw new RuntimeException(e); } } /** * Note: if you use a counting {@link Facets} implementation, you can amortize the sampled counts * by calling this method. Uses the {@link FacetsConfig} and the {@link IndexSearcher} to * determine the upper bound for each facet value. */ public FacetResult amortizeFacetCounts( FacetResult res, FacetsConfig config, IndexSearcher searcher) throws IOException { if (res == null || totalHits <= sampleSize) { return res; } LabelAndValue[] fixedLabelValues = new LabelAndValue[res.labelValues.length]; IndexReader reader = searcher.getIndexReader(); DimConfig dimConfig = config.getDimConfig(res.dim); // +2 to prepend dimension, append child label String[] childPath = new String[res.path.length + 2]; childPath[0] = res.dim; System.arraycopy(res.path, 0, childPath, 1, res.path.length); // reuse for (int i = 0; i < res.labelValues.length; i++) { childPath[res.path.length + 1] = res.labelValues[i].label; String fullPath = FacetsConfig.pathToString(childPath, childPath.length); int max = reader.docFreq(new Term(dimConfig.indexFieldName, fullPath)); int correctedCount = (int) (res.labelValues[i].value.doubleValue() / samplingRate); correctedCount = Math.min(max, correctedCount); fixedLabelValues[i] = new LabelAndValue(res.labelValues[i].label, correctedCount); } // cap the total count on the total number of non-deleted documents in the reader int correctedTotalCount = res.value.intValue(); if (correctedTotalCount > 0) { correctedTotalCount = Math.min(reader.numDocs(), (int) (res.value.doubleValue() / samplingRate)); } return new FacetResult( res.dim, res.path, correctedTotalCount, fixedLabelValues, res.childCount); } /** Returns the sampling rate that was used. */ public double getSamplingRate() { return samplingRate; } /** * Creates a {@link CollectorManager} for concurrent random sampling through {@link * RandomSamplingFacetsCollector} */ public static CollectorManager createManager(int sampleSize, long seed) { return new CollectorManager<>() { @Override public RandomSamplingFacetsCollector newCollector() { return new RandomSamplingFacetsCollector(sampleSize, seed); } @Override public RandomSamplingFacetsCollector reduce( Collection collectors) { if (collectors == null || collectors.size() == 0) { return new RandomSamplingFacetsCollector(sampleSize, seed); } if (collectors.size() == 1) { return collectors.iterator().next(); } return new ReducedRandomSamplingFacetsCollector(sampleSize, seed, collectors); } }; } private static class ReducedRandomSamplingFacetsCollector extends RandomSamplingFacetsCollector { ReducedRandomSamplingFacetsCollector( int sampleSize, long seed, Collection facetsCollectors) { super(sampleSize, seed); this.getOriginalMatchingDocs() .addAll(FacetsCollectorManager.reduceMatchingDocs(facetsCollectors)); } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy