// org.apache.lucene.search.grouping.BlockGroupingCollector Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.grouping;
import java.io.IOException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FieldComparator;
import org.apache.lucene.search.LeafCollector;
import org.apache.lucene.search.LeafFieldComparator;
import org.apache.lucene.search.Pruning;
import org.apache.lucene.search.Scorable;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.SimpleCollector;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopDocsCollector;
import org.apache.lucene.search.TopFieldCollector;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.TotalHits;
import org.apache.lucene.search.Weight;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.PriorityQueue;
// TODO: this sentence is too long for the class summary.
/**
* BlockGroupingCollector performs grouping with a single pass collector, as long as you are
* grouping by a doc block field, ie all documents sharing a given group value were indexed as a doc
* block using the atomic {@link IndexWriter#addDocuments IndexWriter.addDocuments()} or {@link
* IndexWriter#updateDocuments IndexWriter.updateDocuments()} API.
*
* <p>This results in faster performance (~25% faster QPS) than the two-pass grouping collectors,
* with the tradeoff being that the documents in each group must always be indexed as a block. This
* collector also fills in TopGroups.totalGroupCount without requiring the separate {@link
* org.apache.lucene.search.grouping.AllGroupsCollector}. However, this collector does not fill in
* the groupValue of each group; this field will always be null.
*
* <p><b>NOTE</b>: this collector makes no effort to verify the docs were in fact indexed as a
* block, so it's up to you to ensure this was the case.
*
* <p>See {@link org.apache.lucene.search.grouping} for more details including a full code example.
*
* @lucene.experimental
*/
// TODO: TopGroups.merge() won't work with TopGroups returned by this collector, because
// each block will be on a different shard. Add a specialized merge() static method
// to this collector?
public class BlockGroupingCollector extends SimpleCollector {
private int[] pendingSubDocs;
private float[] pendingSubScores;
private int subDocUpto;
private final Sort groupSort;
private final int topNGroups;
private final Weight lastDocPerGroup;
// TODO: specialize into 2 classes, static "create" method:
private final boolean needsScores;
private final FieldComparator>[] comparators;
private final LeafFieldComparator[] leafComparators;
private final int[] reversed;
private final int compIDXEnd;
private int bottomSlot;
private boolean queueFull;
private LeafReaderContext currentReaderContext;
private int topGroupDoc;
private int totalHitCount;
private int totalGroupCount;
private int docBase;
private int groupEndDocID;
private DocIdSetIterator lastDocPerGroupBits;
private Scorable scorer;
private final GroupQueue groupQueue;
private boolean groupCompetes;
/** Mutable record of one competitive group held in the group queue. */
private static final class OneGroup {
  // Reader context that was current when the group was processed.
  LeafReaderContext readerContext;

  // Doc ID of the group's top document, with the segment's docBase already added.
  int topGroupDoc;
  // Slot this group occupies in the group-sort comparators.
  int comparatorSlot;

  // Buffered doc IDs of the group; only the first `count` entries are meaningful.
  int[] docs;
  // Scores parallel to `docs`; only assigned when the collector needs scores.
  float[] scores;
  // Number of buffered documents in this group.
  int count;
}
/**
 * Priority queue of the competitive groups, ordered by groupSort. Deliberately not static: it
 * reads the enclosing collector's {@code comparators} and {@code reversed} arrays.
 *
 * <p>The element type must be {@code OneGroup}: {@link #lessThan} overrides the typed hook of
 * {@link PriorityQueue}, and callers assign the results of {@code add()} / {@code top()} directly
 * to {@code OneGroup} variables.
 */
private final class GroupQueue extends PriorityQueue<OneGroup> {

  /**
   * @param size maximum number of groups the queue may hold
   */
  public GroupQueue(int size) {
    super(size);
  }

  /**
   * Returns true when {@code group1} is less competitive than {@code group2}, i.e. when it sorts
   * after {@code group2} according to groupSort.
   */
  @Override
  protected boolean lessThan(final OneGroup group1, final OneGroup group2) {
    assert group1 != group2;
    assert group1.comparatorSlot != group2.comparatorSlot;

    final int numComparators = comparators.length;
    for (int compIDX = 0; compIDX < numComparators; compIDX++) {
      // reversed[] holds -1 or 1, flipping the comparison for reversed sort fields.
      final int c =
          reversed[compIDX]
              * comparators[compIDX].compare(group1.comparatorSlot, group2.comparatorSlot);
      if (c != 0) {
        // Short circuit: the first sort field that differs decides.
        return c > 0;
      }
    }

    // Break ties by docID; lower docID is always sorted first
    return group1.topGroupDoc > group2.topGroupDoc;
  }
}
/**
 * Finishes the group that was just collected. Invoked on every transition to another group: the
 * group is always counted, and if it is competitive it is stored in the group queue, either as a
 * brand-new entry (while the queue is still filling) or by recycling the current bottom entry.
 * The pending-doc counter is reset in all cases.
 *
 * @throws IOException if a leaf comparator fails while its bottom slot is being updated
 */
private void processGroup() throws IOException {
  totalGroupCount++;

  if (groupCompetes) {
    // Doc ID of the group's top doc, made index-wide by adding the segment base.
    final int absoluteTopDoc = docBase + topGroupDoc;

    if (queueFull) {
      // Steady state: overwrite the bottom entry of the queue, then re-heapify.
      final OneGroup recycled = groupQueue.top();
      assert recycled != null;
      recycled.count = subDocUpto;
      recycled.topGroupDoc = absoluteTopDoc;

      // Trade buffers with the evicted group so its arrays get reused for the next group.
      final int[] spareDocs = recycled.docs;
      recycled.docs = pendingSubDocs;
      pendingSubDocs = spareDocs;
      if (needsScores) {
        final float[] spareScores = recycled.scores;
        recycled.scores = pendingSubScores;
        pendingSubScores = spareScores;
      }
      recycled.readerContext = currentReaderContext;

      bottomSlot = groupQueue.updateTop().comparatorSlot;
      for (final LeafFieldComparator leafComparator : leafComparators) {
        leafComparator.setBottom(bottomSlot);
      }
    } else {
      // Startup transient: the queue still has room, so every competitive group gets a new entry.
      final OneGroup fresh = new OneGroup();
      fresh.readerContext = currentReaderContext;
      fresh.comparatorSlot = bottomSlot;
      fresh.topGroupDoc = absoluteTopDoc;
      fresh.count = subDocUpto;

      // The entry takes ownership of the pending buffers; allocate new ones for the next group.
      fresh.docs = pendingSubDocs;
      pendingSubDocs = new int[10];
      if (needsScores) {
        fresh.scores = pendingSubScores;
        pendingSubScores = new float[10];
      }

      final OneGroup leastCompetitive = groupQueue.add(fresh);
      queueFull = groupQueue.size() == topNGroups;
      if (queueFull) {
        // Queue just became full; now set the real bottom in the comparators.
        bottomSlot = leastCompetitive.comparatorSlot;
        for (final LeafFieldComparator leafComparator : leafComparators) {
          leafComparator.setBottom(bottomSlot);
        }
      } else {
        // Still filling up: the next free comparator slot equals the current queue size.
        bottomSlot = groupQueue.size();
      }
    }
  }

  subDocUpto = 0;
}
/**
 * Create the single pass collector.
 *
 * @param groupSort The {@link Sort} used to sort the groups. The top sorted document within each
 *     group according to groupSort, determines how that group sorts against other groups. This
 *     must be non-null, ie, if you want to groupSort by relevance use Sort.RELEVANCE.
 * @param topNGroups How many top groups to keep.
 * @param needsScores true if the collected documents require scores, for example because
 *     relevance is included in the withinGroupSort later passed to {@link #getTopGroups}. When
 *     true, a score is buffered for every collected document.
 * @param lastDocPerGroup a {@link Weight} that marks the last document in each group.
 * @throws IllegalArgumentException if {@code topNGroups} is less than 1
 */
public BlockGroupingCollector(
    Sort groupSort, int topNGroups, boolean needsScores, Weight lastDocPerGroup) {
  if (topNGroups < 1) {
    throw new IllegalArgumentException("topNGroups must be >= 1 (got " + topNGroups + ")");
  }

  groupQueue = new GroupQueue(topNGroups);
  pendingSubDocs = new int[10];
  if (needsScores) {
    // The score buffer is only allocated when scores are actually tracked.
    pendingSubScores = new float[10];
  }

  this.needsScores = needsScores;
  this.lastDocPerGroup = lastDocPerGroup;
  this.groupSort = groupSort;
  this.topNGroups = topNGroups;

  final SortField[] sortFields = groupSort.getSort();
  // Arrays of wildcard-parameterized types may be created directly, so no unchecked cast is needed.
  comparators = new FieldComparator<?>[sortFields.length];
  leafComparators = new LeafFieldComparator[sortFields.length];
  compIDXEnd = comparators.length - 1;
  reversed = new int[sortFields.length];
  for (int i = 0; i < sortFields.length; i++) {
    final SortField sortField = sortFields[i];
    // One comparator slot per group that can be held in the queue.
    comparators[i] = sortField.getComparator(topNGroups, Pruning.NONE);
    reversed[i] = sortField.getReverse() ? -1 : 1;
  }
}
// TODO: maybe allow no sort on retrieving groups? app
// may want to simply process docs in the group itself?
// typically they will be presented as a "single" result
// in the UI?
/**
* Returns the grouped results. Returns null if the number of groups collected is &lt;=
* groupOffset.
*
* <p><b>NOTE</b>: This collector is unable to compute the groupValue per group so it will always
* be null. This is normally not a problem, as you can obtain the value just like you obtain other
* values for each matching document (eg, via stored fields, via DocValues, etc.)
*
* @param withinGroupSort The {@link Sort} used to sort documents within each group.
* @param groupOffset Which group to start from
* @param withinGroupOffset Which document to start from within each group
* @param maxDocsPerGroup How many top documents to keep within each group.
*/
public TopGroups> getTopGroups(
Sort withinGroupSort, int groupOffset, int withinGroupOffset, int maxDocsPerGroup)
throws IOException {
// if (queueFull) {
// System.out.println("getTopGroups groupOffset=" + groupOffset + " topNGroups=" + topNGroups);
// }
if (groupOffset >= groupQueue.size()) {
return null;
}
int totalGroupedHitCount = 0;
final ScoreAndDoc fakeScorer = new ScoreAndDoc();
float maxScore = Float.MIN_VALUE;
@SuppressWarnings({"unchecked", "rawtypes"})
final GroupDocs