All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.search.grouping.BlockGroupingCollector Maven / Gradle / Ivy

There is a newer version: 9.11.1
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.grouping;

import java.io.IOException;

import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FieldComparator;
import org.apache.lucene.search.LeafCollector;
import org.apache.lucene.search.LeafFieldComparator;
import org.apache.lucene.search.Scorable;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.SimpleCollector;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopDocsCollector;
import org.apache.lucene.search.TopFieldCollector;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.TotalHits;
import org.apache.lucene.search.Weight;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.PriorityQueue;

// TODO: this sentence is too long for the class summary.
/** BlockGroupingCollector performs grouping with a
 *  single pass collector, as long as you are grouping by a
 *  doc block field, ie all documents sharing a given group
 *  value were indexed as a doc block using the atomic
 *  {@link IndexWriter#addDocuments IndexWriter.addDocuments()} 
 *  or {@link IndexWriter#updateDocuments IndexWriter.updateDocuments()} 
 *  API.
 *
 *  

This results in faster performance (~25% faster QPS) * than the two-pass grouping collectors, with the tradeoff * being that the documents in each group must always be * indexed as a block. This collector also fills in * TopGroups.totalGroupCount without requiring the separate * {@link org.apache.lucene.search.grouping.AllGroupsCollector}. However, this collector does * not fill in the groupValue of each group; this field * will always be null. * *

NOTE: this collector makes no effort to verify * the docs were in fact indexed as a block, so it's up to * you to ensure this was the case. * *

See {@link org.apache.lucene.search.grouping} for more * details including a full code example.

* * @lucene.experimental */ // TODO: TopGroups.merge() won't work with TopGroups returned by this collector, because // each block will be on a different shard. Add a specialized merge() static method // to this collector? public class BlockGroupingCollector extends SimpleCollector { private int[] pendingSubDocs; private float[] pendingSubScores; private int subDocUpto; private final Sort groupSort; private final int topNGroups; private final Weight lastDocPerGroup; // TODO: specialize into 2 classes, static "create" method: private final boolean needsScores; private final FieldComparator[] comparators; private final LeafFieldComparator[] leafComparators; private final int[] reversed; private final int compIDXEnd; private int bottomSlot; private boolean queueFull; private LeafReaderContext currentReaderContext; private int topGroupDoc; private int totalHitCount; private int totalGroupCount; private int docBase; private int groupEndDocID; private DocIdSetIterator lastDocPerGroupBits; private Scorable scorer; private final GroupQueue groupQueue; private boolean groupCompetes; private static final class OneGroup { LeafReaderContext readerContext; //int groupOrd; int topGroupDoc; int[] docs; float[] scores; int count; int comparatorSlot; } // Sorts by groupSort. Not static -- uses comparators, reversed private final class GroupQueue extends PriorityQueue { public GroupQueue(int size) { super(size); } @Override protected boolean lessThan(final OneGroup group1, final OneGroup group2) { //System.out.println(" ltcheck"); assert group1 != group2; assert group1.comparatorSlot != group2.comparatorSlot; final int numComparators = comparators.length; for (int compIDX=0;compIDX < numComparators; compIDX++) { final int c = reversed[compIDX] * comparators[compIDX].compare(group1.comparatorSlot, group2.comparatorSlot); if (c != 0) { // Short circuit return c > 0; } } // Break ties by docID; lower docID is always sorted first return group1.topGroupDoc > group2.topGroupDoc; } } // Called when we transition to another group; if the // group is competitive we insert into the group queue private void processGroup() throws IOException { totalGroupCount++; //System.out.println(" processGroup ord=" + lastGroupOrd + " competes=" + groupCompetes + " count=" + subDocUpto + " groupDoc=" + topGroupDoc); if (groupCompetes) { if (!queueFull) { // Startup transient: always add a new OneGroup final OneGroup og = new OneGroup(); og.count = subDocUpto; og.topGroupDoc = docBase + topGroupDoc; og.docs = pendingSubDocs; pendingSubDocs = new int[10]; if (needsScores) { og.scores = pendingSubScores; pendingSubScores = new float[10]; } og.readerContext = currentReaderContext; //og.groupOrd = lastGroupOrd; og.comparatorSlot = bottomSlot; final OneGroup bottomGroup = groupQueue.add(og); //System.out.println(" ADD group=" + getGroupString(lastGroupOrd) + " newBottom=" + getGroupString(bottomGroup.groupOrd)); queueFull = groupQueue.size() == topNGroups; if (queueFull) { // Queue just became full; now set the real bottom // in the comparators: bottomSlot = bottomGroup.comparatorSlot; //System.out.println(" set bottom=" + bottomSlot); for (int i = 0; i < comparators.length; i++) { leafComparators[i].setBottom(bottomSlot); } //System.out.println(" QUEUE FULL"); } else { // Queue not full yet -- just advance bottomSlot: bottomSlot = groupQueue.size(); } } else { // Replace bottom element in PQ and then updateTop final OneGroup og = groupQueue.top(); assert og != null; og.count = subDocUpto; og.topGroupDoc = docBase + topGroupDoc; // Swap pending docs final int[] savDocs = og.docs; og.docs = pendingSubDocs; pendingSubDocs = savDocs; if (needsScores) { // Swap pending scores final float[] savScores = og.scores; og.scores = pendingSubScores; pendingSubScores = savScores; } og.readerContext = currentReaderContext; //og.groupOrd = lastGroupOrd; bottomSlot = groupQueue.updateTop().comparatorSlot; //System.out.println(" set bottom=" + bottomSlot); for (int i = 0; i < comparators.length; i++) { leafComparators[i].setBottom(bottomSlot); } } } subDocUpto = 0; } /** * Create the single pass collector. * * @param groupSort The {@link Sort} used to sort the * groups. The top sorted document within each group * according to groupSort, determines how that group * sorts against other groups. This must be non-null, * ie, if you want to groupSort by relevance use * Sort.RELEVANCE. * @param topNGroups How many top groups to keep. * @param needsScores true if the collected documents * require scores, either because relevance is included * in the withinGroupSort or because you plan to pass true * for either getSscores or getMaxScores to {@link * #getTopGroups} * @param lastDocPerGroup a {@link Weight} that marks the * last document in each group. */ public BlockGroupingCollector(Sort groupSort, int topNGroups, boolean needsScores, Weight lastDocPerGroup) { if (topNGroups < 1) { throw new IllegalArgumentException("topNGroups must be >= 1 (got " + topNGroups + ")"); } groupQueue = new GroupQueue(topNGroups); pendingSubDocs = new int[10]; if (needsScores) { pendingSubScores = new float[10]; } this.needsScores = needsScores; this.lastDocPerGroup = lastDocPerGroup; this.groupSort = groupSort; this.topNGroups = topNGroups; final SortField[] sortFields = groupSort.getSort(); comparators = new FieldComparator[sortFields.length]; leafComparators = new LeafFieldComparator[sortFields.length]; compIDXEnd = comparators.length - 1; reversed = new int[sortFields.length]; for (int i = 0; i < sortFields.length; i++) { final SortField sortField = sortFields[i]; comparators[i] = sortField.getComparator(topNGroups, i); reversed[i] = sortField.getReverse() ? -1 : 1; } } // TODO: maybe allow no sort on retrieving groups? app // may want to simply process docs in the group itself? // typically they will be presented as a "single" result // in the UI? /** Returns the grouped results. Returns null if the * number of groups collected is <= groupOffset. * *

NOTE: This collector is unable to compute * the groupValue per group so it will always be null. * This is normally not a problem, as you can obtain the * value just like you obtain other values for each * matching document (eg, via stored fields, via * DocValues, etc.) * * @param withinGroupSort The {@link Sort} used to sort * documents within each group. * @param groupOffset Which group to start from * @param withinGroupOffset Which document to start from * within each group * @param maxDocsPerGroup How many top documents to keep * within each group. */ public TopGroups getTopGroups(Sort withinGroupSort, int groupOffset, int withinGroupOffset, int maxDocsPerGroup) throws IOException { //if (queueFull) { //System.out.println("getTopGroups groupOffset=" + groupOffset + " topNGroups=" + topNGroups); //} if (subDocUpto != 0) { processGroup(); } if (groupOffset >= groupQueue.size()) { return null; } int totalGroupedHitCount = 0; final ScoreAndDoc fakeScorer = new ScoreAndDoc(); float maxScore = Float.MIN_VALUE; @SuppressWarnings({"unchecked","rawtypes"}) final GroupDocs[] groups = new GroupDocs[groupQueue.size() - groupOffset]; for(int downTo=groupQueue.size()-groupOffset-1;downTo>=0;downTo--) { final OneGroup og = groupQueue.pop(); // At this point we hold all docs w/ in each group, // unsorted; we now sort them: final TopDocsCollector collector; if (withinGroupSort.equals(Sort.RELEVANCE)) { // Sort by score if (!needsScores) { throw new IllegalArgumentException("cannot sort by relevance within group: needsScores=false"); } collector = TopScoreDocCollector.create(maxDocsPerGroup, Integer.MAX_VALUE); } else { // Sort by fields collector = TopFieldCollector.create(withinGroupSort, maxDocsPerGroup, Integer.MAX_VALUE); // TODO: disable exact counts? } float groupMaxScore = needsScores ? Float.NEGATIVE_INFINITY : Float.NaN; LeafCollector leafCollector = collector.getLeafCollector(og.readerContext); leafCollector.setScorer(fakeScorer); for(int docIDX=0;docIDX[comparators.length]; for(int sortFieldIDX=0;sortFieldIDX(Float.NaN, groupMaxScore, new TotalHits(og.count, TotalHits.Relation.EQUAL_TO), topDocs.scoreDocs, null, groupSortValues); maxScore = Math.max(maxScore, groupMaxScore); } /* while (groupQueue.size() != 0) { final OneGroup og = groupQueue.pop(); //System.out.println(" leftover: og ord=" + og.groupOrd + " count=" + og.count); totalGroupedHitCount += og.count; } */ return new TopGroups<>(new TopGroups<>(groupSort.getSort(), withinGroupSort.getSort(), totalHitCount, totalGroupedHitCount, groups, maxScore), totalGroupCount); } @Override public void setScorer(Scorable scorer) throws IOException { this.scorer = scorer; for (LeafFieldComparator comparator : leafComparators) { comparator.setScorer(scorer); } } @Override public void collect(int doc) throws IOException { // System.out.println("C " + doc); if (doc > groupEndDocID) { // Group changed if (subDocUpto != 0) { processGroup(); } groupEndDocID = lastDocPerGroupBits.advance(doc); //System.out.println(" adv " + groupEndDocID + " " + lastDocPerGroupBits); subDocUpto = 0; groupCompetes = !queueFull; } totalHitCount++; // Always cache doc/score within this group: if (subDocUpto == pendingSubDocs.length) { pendingSubDocs = ArrayUtil.grow(pendingSubDocs); } pendingSubDocs[subDocUpto] = doc; if (needsScores) { if (subDocUpto == pendingSubScores.length) { pendingSubScores = ArrayUtil.grow(pendingSubScores); } pendingSubScores[subDocUpto] = scorer.score(); } subDocUpto++; if (groupCompetes) { if (subDocUpto == 1) { assert !queueFull; //System.out.println(" init copy to bottomSlot=" + bottomSlot); for (LeafFieldComparator fc : leafComparators) { fc.copy(bottomSlot, doc); fc.setBottom(bottomSlot); } topGroupDoc = doc; } else { // Compare to bottomSlot for (int compIDX = 0;; compIDX++) { final int c = reversed[compIDX] * leafComparators[compIDX].compareBottom(doc); if (c < 0) { // Definitely not competitive -- done return; } else if (c > 0) { // Definitely competitive. break; } else if (compIDX == compIDXEnd) { // Ties with bottom, except we know this docID is // > docID in the queue (docs are visited in // order), so not competitive: return; } } //System.out.println(" best w/in group!"); for (LeafFieldComparator fc : leafComparators) { fc.copy(bottomSlot, doc); // Necessary because some comparators cache // details of bottom slot; this forces them to // re-cache: fc.setBottom(bottomSlot); } topGroupDoc = doc; } } else { // We're not sure this group will make it into the // queue yet for (int compIDX = 0;; compIDX++) { final int c = reversed[compIDX] * leafComparators[compIDX].compareBottom(doc); if (c < 0) { // Definitely not competitive -- done //System.out.println(" doc doesn't compete w/ top groups"); return; } else if (c > 0) { // Definitely competitive. break; } else if (compIDX == compIDXEnd) { // Ties with bottom, except we know this docID is // > docID in the queue (docs are visited in // order), so not competitive: //System.out.println(" doc doesn't compete w/ top groups"); return; } } groupCompetes = true; for (LeafFieldComparator fc : leafComparators) { fc.copy(bottomSlot, doc); // Necessary because some comparators cache // details of bottom slot; this forces them to // re-cache: fc.setBottom(bottomSlot); } topGroupDoc = doc; //System.out.println(" doc competes w/ top groups"); } } @Override protected void doSetNextReader(LeafReaderContext readerContext) throws IOException { if (subDocUpto != 0) { processGroup(); } subDocUpto = 0; docBase = readerContext.docBase; //System.out.println("setNextReader base=" + docBase + " r=" + readerContext.reader); Scorer s = lastDocPerGroup.scorer(readerContext); if (s == null) { lastDocPerGroupBits = null; } else { lastDocPerGroupBits = s.iterator(); } groupEndDocID = -1; currentReaderContext = readerContext; for (int i=0; i