org.elasticsearch.search.aggregations.bucket.BestDocsDeferringCollector Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of elasticsearch Show documentation
Elasticsearch subproject :server
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.search.aggregations.bucket;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.LeafCollector;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopDocsCollector;
import org.apache.lucene.search.TopScoreDocCollector;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.common.lease.Releasable;
import org.elasticsearch.common.lease.Releasables;
import org.elasticsearch.common.util.BigArrays;
import org.elasticsearch.common.util.ObjectArray;
import org.elasticsearch.search.aggregations.BucketCollector;
import org.elasticsearch.search.aggregations.LeafBucketCollector;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
/**
* A specialization of {@link DeferringBucketCollector} that collects all
* matches and then replays only the top scoring documents to child
* aggregations. The method
* {@link BestDocsDeferringCollector#createTopDocsCollector(int)} is designed to
* be overridden and allows subclasses to choose a custom collector
* implementation for determining the top N matches.
*
*/
public class BestDocsDeferringCollector extends DeferringBucketCollector implements Releasable {
final List entries = new ArrayList<>();
BucketCollector deferred;
ObjectArray perBucketSamples;
private int shardSize;
private PerSegmentCollects perSegCollector;
private final BigArrays bigArrays;
/**
* Sole constructor.
*
* @param shardSize
* The number of top-scoring docs to collect for each bucket
*/
public BestDocsDeferringCollector(int shardSize, BigArrays bigArrays) {
this.shardSize = shardSize;
this.bigArrays = bigArrays;
perBucketSamples = bigArrays.newObjectArray(1);
}
@Override
public boolean needsScores() {
return true;
}
/** Set the deferred collectors. */
@Override
public void setDeferredCollector(Iterable deferredCollectors) {
this.deferred = BucketCollector.wrap(deferredCollectors);
}
@Override
public LeafBucketCollector getLeafCollector(LeafReaderContext ctx) throws IOException {
perSegCollector = new PerSegmentCollects(ctx);
entries.add(perSegCollector);
// Deferring collector
return new LeafBucketCollector() {
@Override
public void setScorer(Scorer scorer) throws IOException {
perSegCollector.setScorer(scorer);
}
@Override
public void collect(int doc, long bucket) throws IOException {
perSegCollector.collect(doc, bucket);
}
};
}
// Designed to be overridden by subclasses that may score docs by criteria
// other than Lucene score
protected TopDocsCollector extends ScoreDoc> createTopDocsCollector(int size) throws IOException {
return TopScoreDocCollector.create(size);
}
@Override
public void preCollection() throws IOException {
}
@Override
public void postCollection() throws IOException {
runDeferredAggs();
}
@Override
public void prepareSelectedBuckets(long... selectedBuckets) throws IOException {
// no-op - deferred aggs processed in postCollection call
}
private void runDeferredAggs() throws IOException {
deferred.preCollection();
List allDocs = new ArrayList<>(shardSize);
for (int i = 0; i < perBucketSamples.size(); i++) {
PerParentBucketSamples perBucketSample = perBucketSamples.get(i);
if (perBucketSample == null) {
continue;
}
perBucketSample.getMatches(allDocs);
}
// Sort the top matches by docID for the benefit of deferred collector
ScoreDoc[] docsArr = allDocs.toArray(new ScoreDoc[allDocs.size()]);
Arrays.sort(docsArr, new Comparator() {
@Override
public int compare(ScoreDoc o1, ScoreDoc o2) {
if(o1.doc == o2.doc){
return o1.shardIndex - o2.shardIndex;
}
return o1.doc - o2.doc;
}
});
try {
for (PerSegmentCollects perSegDocs : entries) {
perSegDocs.replayRelatedMatches(docsArr);
}
} catch (IOException e) {
throw new ElasticsearchException("IOException collecting best scoring results", e);
}
deferred.postCollection();
}
class PerParentBucketSamples {
private LeafCollector currentLeafCollector;
private TopDocsCollector extends ScoreDoc> tdc;
private long parentBucket;
private int matchedDocs;
public PerParentBucketSamples(long parentBucket, Scorer scorer, LeafReaderContext readerContext) {
try {
this.parentBucket = parentBucket;
tdc = createTopDocsCollector(shardSize);
currentLeafCollector = tdc.getLeafCollector(readerContext);
setScorer(scorer);
} catch (IOException e) {
throw new ElasticsearchException("IO error creating collector", e);
}
}
public void getMatches(List allDocs) {
TopDocs topDocs = tdc.topDocs();
ScoreDoc[] sd = topDocs.scoreDocs;
matchedDocs = sd.length;
for (ScoreDoc scoreDoc : sd) {
// A bit of a hack to (ab)use shardIndex property here to
// hold a bucket ID but avoids allocating extra data structures
// and users should have bigger concerns if bucket IDs
// exceed int capacity..
scoreDoc.shardIndex = (int) parentBucket;
}
allDocs.addAll(Arrays.asList(sd));
}
public void collect(int doc) throws IOException {
currentLeafCollector.collect(doc);
}
public void setScorer(Scorer scorer) throws IOException {
currentLeafCollector.setScorer(scorer);
}
public void changeSegment(LeafReaderContext readerContext) throws IOException {
currentLeafCollector = tdc.getLeafCollector(readerContext);
}
public int getDocCount() {
return matchedDocs;
}
}
class PerSegmentCollects extends Scorer {
private LeafReaderContext readerContext;
int maxDocId = Integer.MIN_VALUE;
private float currentScore;
private int currentDocId = -1;
private Scorer currentScorer;
PerSegmentCollects(LeafReaderContext readerContext) throws IOException {
// The publisher behaviour for Reader/Scorer listeners triggers a
// call to this constructor with a null scorer so we can't call
// scorer.getWeight() and pass the Weight to our base class.
// However, passing null seems to have no adverse effects here...
super(null);
this.readerContext = readerContext;
for (int i = 0; i < perBucketSamples.size(); i++) {
PerParentBucketSamples perBucketSample = perBucketSamples.get(i);
if (perBucketSample == null) {
continue;
}
perBucketSample.changeSegment(readerContext);
}
}
public void setScorer(Scorer scorer) throws IOException {
this.currentScorer = scorer;
for (int i = 0; i < perBucketSamples.size(); i++) {
PerParentBucketSamples perBucketSample = perBucketSamples.get(i);
if (perBucketSample == null) {
continue;
}
perBucketSample.setScorer(scorer);
}
}
public void replayRelatedMatches(ScoreDoc[] sd) throws IOException {
final LeafBucketCollector leafCollector = deferred.getLeafCollector(readerContext);
leafCollector.setScorer(this);
currentScore = 0;
currentDocId = -1;
if (maxDocId < 0) {
return;
}
for (ScoreDoc scoreDoc : sd) {
// Doc ids from TopDocCollector are root-level Reader so
// need rebasing
int rebased = scoreDoc.doc - readerContext.docBase;
if ((rebased >= 0) && (rebased <= maxDocId)) {
currentScore = scoreDoc.score;
currentDocId = rebased;
// We stored the bucket ID in Lucene's shardIndex property
// for convenience.
leafCollector.collect(rebased, scoreDoc.shardIndex);
}
}
}
@Override
public float score() throws IOException {
return currentScore;
}
@Override
public int freq() throws IOException {
throw new ElasticsearchException("This caching scorer implementation only implements score() and docID()");
}
@Override
public int docID() {
return currentDocId;
}
@Override
public DocIdSetIterator iterator() {
throw new ElasticsearchException("This caching scorer implementation only implements score() and docID()");
}
public void collect(int docId, long parentBucket) throws IOException {
perBucketSamples = bigArrays.grow(perBucketSamples, parentBucket + 1);
PerParentBucketSamples sampler = perBucketSamples.get((int) parentBucket);
if (sampler == null) {
sampler = new PerParentBucketSamples(parentBucket, currentScorer, readerContext);
perBucketSamples.set((int) parentBucket, sampler);
}
sampler.collect(docId);
maxDocId = Math.max(maxDocId, docId);
}
}
public int getDocCount(long parentBucket) {
PerParentBucketSamples sampler = perBucketSamples.get((int) parentBucket);
if (sampler == null) {
// There are conditions where no docs are collected and the aggs
// framework still asks for doc count.
return 0;
}
return sampler.getDocCount();
}
@Override
public void close() throws ElasticsearchException {
Releasables.close(perBucketSamples);
}
}