org.apache.lucene.search.grouping.BlockGroupingCollector Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-grouping Show documentation
Lucene Grouping Module
There is a newer version: 9.11.1
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.grouping;

import java.io.IOException;

import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FieldComparator;
import org.apache.lucene.search.LeafCollector;
import org.apache.lucene.search.LeafFieldComparator;
import org.apache.lucene.search.Scorable;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.SimpleCollector;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopDocsCollector;
import org.apache.lucene.search.TopFieldCollector;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.TotalHits;
import org.apache.lucene.search.Weight;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.PriorityQueue;

// TODO: this sentence is too long for the class summary.
/** BlockGroupingCollector performs grouping with a
 *  single pass collector, as long as you are grouping by a
 *  doc block field, ie all documents sharing a given group
 *  value were indexed as a doc block using the atomic
 *  {@link IndexWriter#addDocuments IndexWriter.addDocuments()} 
 *  or {@link IndexWriter#updateDocuments IndexWriter.updateDocuments()} 
 *  API.
 *
 *  This results in faster performance (~25% faster QPS)
 *  than the two-pass grouping collectors, with the tradeoff
 *  being that the documents in each group must always be
 *  indexed as a block.  This collector also fills in
 *  TopGroups.totalGroupCount without requiring the separate
 *  {@link org.apache.lucene.search.grouping.AllGroupsCollector}.  However, this collector does
 *  not fill in the groupValue of each group; this field
 *  will always be null.
 *
 *  
NOTE: this collector makes no effort to verify
 *  the docs were in fact indexed as a block, so it's up to
 *  you to ensure this was the case.
 *
 *  
See {@link org.apache.lucene.search.grouping} for more
 *  details including a full code example.
 *
 * @lucene.experimental
 */

// TODO: TopGroups.merge() won't work with TopGroups returned by this collector, because
// each block will be on a different shard.  Add a specialized merge() static method
// to this collector?

public class BlockGroupingCollector extends SimpleCollector {

  private int[] pendingSubDocs;
  private float[] pendingSubScores;
  private int subDocUpto;

  private final Sort groupSort;
  private final int topNGroups;
  private final Weight lastDocPerGroup;

  // TODO: specialize into 2 classes, static "create" method:
  private final boolean needsScores;

  private final FieldComparator[] comparators;
  private final LeafFieldComparator[] leafComparators;
  private final int[] reversed;
  private final int compIDXEnd;
  private int bottomSlot;
  private boolean queueFull;
  private LeafReaderContext currentReaderContext;

  private int topGroupDoc;
  private int totalHitCount;
  private int totalGroupCount;
  private int docBase;
  private int groupEndDocID;
  private DocIdSetIterator lastDocPerGroupBits;
  private Scorable scorer;
  private final GroupQueue groupQueue;
  private boolean groupCompetes;

  private static final class OneGroup {
    LeafReaderContext readerContext;
    //int groupOrd;
    int topGroupDoc;
    int[] docs;
    float[] scores;
    int count;
    int comparatorSlot;
  }
  
  // Sorts by groupSort.  Not static -- uses comparators, reversed
  private final class GroupQueue extends PriorityQueue {

    public GroupQueue(int size) {
      super(size);
    }

    @Override
    protected boolean lessThan(final OneGroup group1, final OneGroup group2) {

      //System.out.println("    ltcheck");
      assert group1 != group2;
      assert group1.comparatorSlot != group2.comparatorSlot;

      final int numComparators = comparators.length;
      for (int compIDX=0;compIDX < numComparators; compIDX++) {
        final int c = reversed[compIDX] * comparators[compIDX].compare(group1.comparatorSlot, group2.comparatorSlot);
        if (c != 0) {
          // Short circuit
          return c > 0;
        }
      }

      // Break ties by docID; lower docID is always sorted first
      return group1.topGroupDoc > group2.topGroupDoc;
    }
  }

  // Called when we transition to another group; if the
  // group is competitive we insert into the group queue
  private void processGroup() throws IOException {
    totalGroupCount++;
    //System.out.println("    processGroup ord=" + lastGroupOrd + " competes=" + groupCompetes + " count=" + subDocUpto + " groupDoc=" + topGroupDoc);
    if (groupCompetes) {
      if (!queueFull) {
        // Startup transient: always add a new OneGroup
        final OneGroup og = new OneGroup();
        og.count = subDocUpto;
        og.topGroupDoc = docBase + topGroupDoc;
        og.docs = pendingSubDocs;
        pendingSubDocs = new int[10];
        if (needsScores) {
          og.scores = pendingSubScores;
          pendingSubScores = new float[10];
        }
        og.readerContext = currentReaderContext;
        //og.groupOrd = lastGroupOrd;
        og.comparatorSlot = bottomSlot;
        final OneGroup bottomGroup = groupQueue.add(og);
        //System.out.println("      ADD group=" + getGroupString(lastGroupOrd) + " newBottom=" + getGroupString(bottomGroup.groupOrd));
        queueFull = groupQueue.size() == topNGroups;
        if (queueFull) {
          // Queue just became full; now set the real bottom
          // in the comparators:
          bottomSlot = bottomGroup.comparatorSlot;
          //System.out.println("    set bottom=" + bottomSlot);
          for (int i = 0; i < comparators.length; i++) {
            leafComparators[i].setBottom(bottomSlot);
          }
          //System.out.println("     QUEUE FULL");
        } else {
          // Queue not full yet -- just advance bottomSlot:
          bottomSlot = groupQueue.size();
        }
      } else {
        // Replace bottom element in PQ and then updateTop
        final OneGroup og = groupQueue.top();
        assert og != null;
        og.count = subDocUpto;
        og.topGroupDoc = docBase + topGroupDoc;
        // Swap pending docs
        final int[] savDocs = og.docs;
        og.docs = pendingSubDocs;
        pendingSubDocs = savDocs;
        if (needsScores) {
          // Swap pending scores
          final float[] savScores = og.scores;
          og.scores = pendingSubScores;
          pendingSubScores = savScores;
        }
        og.readerContext = currentReaderContext;
        //og.groupOrd = lastGroupOrd;
        bottomSlot = groupQueue.updateTop().comparatorSlot;

        //System.out.println("    set bottom=" + bottomSlot);
        for (int i = 0; i < comparators.length; i++) {
          leafComparators[i].setBottom(bottomSlot);
        }
      }
    }
    subDocUpto = 0;
  }

  /**
   * Create the single pass collector.
   *
   *  @param groupSort The {@link Sort} used to sort the
   *    groups.  The top sorted document within each group
   *    according to groupSort, determines how that group
   *    sorts against other groups.  This must be non-null,
   *    ie, if you want to groupSort by relevance use
   *    Sort.RELEVANCE.
   *  @param topNGroups How many top groups to keep.
   *  @param needsScores true if the collected documents
   *    require scores, either because relevance is included
   *    in the withinGroupSort or because you plan to pass true
   *    for either getSscores or getMaxScores to {@link
   *    #getTopGroups}
   *  @param lastDocPerGroup a {@link Weight} that marks the
   *    last document in each group.
   */
  public BlockGroupingCollector(Sort groupSort, int topNGroups, boolean needsScores, Weight lastDocPerGroup) {

    if (topNGroups < 1) {
      throw new IllegalArgumentException("topNGroups must be >= 1 (got " + topNGroups + ")");
    }

    groupQueue = new GroupQueue(topNGroups);
    pendingSubDocs = new int[10];
    if (needsScores) {
      pendingSubScores = new float[10];
    }

    this.needsScores = needsScores;
    this.lastDocPerGroup = lastDocPerGroup;

    this.groupSort = groupSort;
    
    this.topNGroups = topNGroups;

    final SortField[] sortFields = groupSort.getSort();
    comparators = new FieldComparator[sortFields.length];
    leafComparators = new LeafFieldComparator[sortFields.length];
    compIDXEnd = comparators.length - 1;
    reversed = new int[sortFields.length];
    for (int i = 0; i < sortFields.length; i++) {
      final SortField sortField = sortFields[i];
      comparators[i] = sortField.getComparator(topNGroups, i);
      reversed[i] = sortField.getReverse() ? -1 : 1;
    }
  }

  // TODO: maybe allow no sort on retrieving groups?  app
  // may want to simply process docs in the group itself?
  // typically they will be presented as a "single" result
  // in the UI?

  /** Returns the grouped results.  Returns null if the
   *  number of groups collected is <= groupOffset.
   *
   *  NOTE: This collector is unable to compute
   *  the groupValue per group so it will always be null.
   *  This is normally not a problem, as you can obtain the
   *  value just like you obtain other values for each
   *  matching document (eg, via stored fields, via
   *  DocValues, etc.)
   *
   *  @param withinGroupSort The {@link Sort} used to sort
   *    documents within each group.
   *  @param groupOffset Which group to start from
   *  @param withinGroupOffset Which document to start from
   *    within each group
   *  @param maxDocsPerGroup How many top documents to keep
   *     within each group.
   */
  public TopGroups getTopGroups(Sort withinGroupSort, int groupOffset, int withinGroupOffset, int maxDocsPerGroup) throws IOException {

    //if (queueFull) {
    //System.out.println("getTopGroups groupOffset=" + groupOffset + " topNGroups=" + topNGroups);
    //}
    if (subDocUpto != 0) {
      processGroup();
    }
    if (groupOffset >= groupQueue.size()) {
      return null;
    }
    int totalGroupedHitCount = 0;

    final ScoreAndDoc fakeScorer = new ScoreAndDoc();

    float maxScore = Float.MIN_VALUE;

    @SuppressWarnings({"unchecked","rawtypes"})
    final GroupDocs