All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.lucene.search.grouping.GroupingSearch Maven / Gradle / Ivy

There is a newer version: 9.11.1
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.grouping;

import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Map;
import org.apache.lucene.queries.function.ValueSource;
import org.apache.lucene.search.CachingCollector;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiCollector;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.Weight;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.mutable.MutableValue;

/**
 * Convenience class to perform grouping in a non distributed environment.
 *
 * @lucene.experimental
 */
public class GroupingSearch {

  private final GroupSelector grouper;
  private final Query groupEndDocs;

  private Sort groupSort = Sort.RELEVANCE;
  private Sort sortWithinGroup = Sort.RELEVANCE;

  private int groupDocsOffset;
  private int groupDocsLimit = 1;
  private boolean includeMaxScore = true;

  private Double maxCacheRAMMB;
  private Integer maxDocsToCache;
  private boolean cacheScores;
  private boolean allGroups;
  private boolean allGroupHeads;

  private Collection matchingGroups;
  private Bits matchingGroupHeads;

  /**
   * Constructs a GroupingSearch instance that groups documents by index terms using
   * DocValues. The group field can only have one token per document. This means that the field must
   * not be analysed.
   *
   * @param groupField The name of the field to group by.
   */
  public GroupingSearch(String groupField) {
    this(new TermGroupSelector(groupField), null);
  }

  /**
   * Constructs a GroupingSearch instance that groups documents using a {@link
   * GroupSelector}
   *
   * @param groupSelector a {@link GroupSelector} that defines groups for this GroupingSearch
   */
  public GroupingSearch(GroupSelector groupSelector) {
    this(groupSelector, null);
  }

  /**
   * Constructs a GroupingSearch instance that groups documents by function using a
   * {@link ValueSource} instance.
   *
   * @param groupFunction The function to group by specified as {@link ValueSource}
   * @param valueSourceContext The context of the specified groupFunction
   */
  public GroupingSearch(ValueSource groupFunction, Map valueSourceContext) {
    this(new ValueSourceGroupSelector(groupFunction, valueSourceContext), null);
  }

  /**
   * Constructor for grouping documents by doc block. This constructor can only be used when
   * documents belonging in a group are indexed in one block.
   *
   * @param groupEndDocs The query that marks the last document in all doc blocks
   */
  public GroupingSearch(Query groupEndDocs) {
    this(null, groupEndDocs);
  }

  private GroupingSearch(GroupSelector grouper, Query groupEndDocs) {
    this.grouper = grouper;
    this.groupEndDocs = groupEndDocs;
  }

  /**
   * Executes a grouped search. Both the first pass and second pass are executed on the specified
   * searcher.
   *
   * @param searcher The {@link org.apache.lucene.search.IndexSearcher} instance to execute the
   *     grouped search on.
   * @param query The query to execute with the grouping
   * @param groupOffset The group offset
   * @param groupLimit The number of groups to return from the specified group offset
   * @return the grouped result as a {@link TopGroups} instance
   * @throws IOException If any I/O related errors occur
   */
  @SuppressWarnings("unchecked")
  public  TopGroups search(
      IndexSearcher searcher, Query query, int groupOffset, int groupLimit) throws IOException {
    if (grouper != null) {
      return groupByFieldOrFunction(searcher, query, groupOffset, groupLimit);
    } else if (groupEndDocs != null) {
      return (TopGroups) groupByDocBlock(searcher, query, groupOffset, groupLimit);
    } else {
      throw new IllegalStateException(
          "Either groupField, groupFunction or groupEndDocs must be set."); // This can't happen...
    }
  }

  @SuppressWarnings({"unchecked", "rawtypes"})
  protected TopGroups groupByFieldOrFunction(
      IndexSearcher searcher, Query query, int groupOffset, int groupLimit) throws IOException {
    int topN = groupOffset + groupLimit;

    final FirstPassGroupingCollector firstPassCollector =
        new FirstPassGroupingCollector(grouper, groupSort, topN);
    final AllGroupsCollector allGroupsCollector =
        allGroups ? new AllGroupsCollector(grouper) : null;
    final AllGroupHeadsCollector allGroupHeadsCollector =
        allGroupHeads ? AllGroupHeadsCollector.newCollector(grouper, sortWithinGroup) : null;

    final Collector firstRound =
        MultiCollector.wrap(firstPassCollector, allGroupsCollector, allGroupHeadsCollector);

    CachingCollector cachedCollector = null;
    if (maxCacheRAMMB != null || maxDocsToCache != null) {
      if (maxCacheRAMMB != null) {
        cachedCollector = CachingCollector.create(firstRound, cacheScores, maxCacheRAMMB);
      } else {
        cachedCollector = CachingCollector.create(firstRound, cacheScores, maxDocsToCache);
      }
      searcher.search(query, cachedCollector);
    } else {
      searcher.search(query, firstRound);
    }

    matchingGroups = allGroups ? allGroupsCollector.getGroups() : Collections.emptyList();
    matchingGroupHeads =
        allGroupHeads
            ? allGroupHeadsCollector.retrieveGroupHeads(searcher.getIndexReader().maxDoc())
            : new Bits.MatchNoBits(searcher.getIndexReader().maxDoc());

    Collection topSearchGroups = firstPassCollector.getTopGroups(groupOffset);
    if (topSearchGroups == null) {
      return new TopGroups(new SortField[0], new SortField[0], 0, 0, new GroupDocs[0], Float.NaN);
    }

    int topNInsideGroup = groupDocsOffset + groupDocsLimit;
    TopGroupsCollector secondPassCollector =
        new TopGroupsCollector(
            grouper, topSearchGroups, groupSort, sortWithinGroup, topNInsideGroup, includeMaxScore);

    if (cachedCollector != null && cachedCollector.isCached()) {
      cachedCollector.replay(secondPassCollector);
    } else {
      searcher.search(query, secondPassCollector);
    }

    if (allGroups) {
      return new TopGroups(
          secondPassCollector.getTopGroups(groupDocsOffset), matchingGroups.size());
    } else {
      return secondPassCollector.getTopGroups(groupDocsOffset);
    }
  }

  protected TopGroups groupByDocBlock(
      IndexSearcher searcher, Query query, int groupOffset, int groupLimit) throws IOException {
    int topN = groupOffset + groupLimit;
    final Query endDocsQuery = searcher.rewrite(this.groupEndDocs);
    final Weight groupEndDocs =
        searcher.createWeight(endDocsQuery, ScoreMode.COMPLETE_NO_SCORES, 1);
    BlockGroupingCollector c =
        new BlockGroupingCollector(
            groupSort,
            topN,
            groupSort.needsScores() || sortWithinGroup.needsScores(),
            groupEndDocs);
    searcher.search(query, c);
    int topNInsideGroup = groupDocsOffset + groupDocsLimit;
    return c.getTopGroups(sortWithinGroup, groupOffset, groupDocsOffset, topNInsideGroup);
  }

  /**
   * Enables caching for the second pass search. The cache will not grow over a specified limit in
   * MB. The cache is filled during the first pass searched and then replayed during the second pass
   * searched. If the cache grows beyond the specified limit, then the cache is purged and not used
   * in the second pass search.
   *
   * @param maxCacheRAMMB The maximum amount in MB the cache is allowed to hold
   * @param cacheScores Whether to cache the scores
   * @return this
   */
  public GroupingSearch setCachingInMB(double maxCacheRAMMB, boolean cacheScores) {
    this.maxCacheRAMMB = maxCacheRAMMB;
    this.maxDocsToCache = null;
    this.cacheScores = cacheScores;
    return this;
  }

  /**
   * Enables caching for the second pass search. The cache will not contain more than the maximum
   * specified documents. The cache is filled during the first pass searched and then replayed
   * during the second pass searched. If the cache grows beyond the specified limit, then the cache
   * is purged and not used in the second pass search.
   *
   * @param maxDocsToCache The maximum number of documents the cache is allowed to hold
   * @param cacheScores Whether to cache the scores
   * @return this
   */
  public GroupingSearch setCaching(int maxDocsToCache, boolean cacheScores) {
    this.maxDocsToCache = maxDocsToCache;
    this.maxCacheRAMMB = null;
    this.cacheScores = cacheScores;
    return this;
  }

  /**
   * Disables any enabled cache.
   *
   * @return this
   */
  public GroupingSearch disableCaching() {
    this.maxCacheRAMMB = null;
    this.maxDocsToCache = null;
    return this;
  }

  /**
   * Specifies how groups are sorted. Defaults to {@link Sort#RELEVANCE}.
   *
   * @param groupSort The sort for the groups.
   * @return this
   */
  public GroupingSearch setGroupSort(Sort groupSort) {
    this.groupSort = groupSort;
    return this;
  }

  /**
   * Specified how documents inside a group are sorted. Defaults to {@link Sort#RELEVANCE}.
   *
   * @param sortWithinGroup The sort for documents inside a group
   * @return this
   */
  public GroupingSearch setSortWithinGroup(Sort sortWithinGroup) {
    this.sortWithinGroup = sortWithinGroup;
    return this;
  }

  /**
   * Specifies the offset for documents inside a group.
   *
   * @param groupDocsOffset The offset for documents inside a
   * @return this
   */
  public GroupingSearch setGroupDocsOffset(int groupDocsOffset) {
    this.groupDocsOffset = groupDocsOffset;
    return this;
  }

  /**
   * Specifies the number of documents to return inside a group from the specified groupDocsOffset.
   *
   * @param groupDocsLimit The number of documents to return inside a group
   * @return this
   */
  public GroupingSearch setGroupDocsLimit(int groupDocsLimit) {
    this.groupDocsLimit = groupDocsLimit;
    return this;
  }

  /**
   * Whether to include the score of the most relevant document per group.
   *
   * @param includeMaxScore Whether to include the score of the most relevant document per group
   * @return this
   */
  public GroupingSearch setIncludeMaxScore(boolean includeMaxScore) {
    this.includeMaxScore = includeMaxScore;
    return this;
  }

  /**
   * Whether to also compute all groups matching the query. This can be used to determine the number
   * of groups, which can be used for accurate pagination.
   *
   * 

When grouping by doc block the number of groups are automatically included in the {@link * TopGroups} and this option doesn't have any influence. * * @param allGroups to also compute all groups matching the query * @return this */ public GroupingSearch setAllGroups(boolean allGroups) { this.allGroups = allGroups; return this; } /** * If {@link #setAllGroups(boolean)} was set to true then all matching groups are * returned, otherwise an empty collection is returned. * * @param The group value type. This can be a {@link BytesRef} or a {@link MutableValue} * instance. If grouping by doc block this the group value is always null. * @return all matching groups are returned, or an empty collection */ @SuppressWarnings({"unchecked", "rawtypes"}) public Collection getAllMatchingGroups() { return (Collection) matchingGroups; } /** * Whether to compute all group heads (most relevant document per group) matching the query. * *

This feature isn't enabled when grouping by doc block. * * @param allGroupHeads Whether to compute all group heads (most relevant document per group) * matching the query * @return this */ public GroupingSearch setAllGroupHeads(boolean allGroupHeads) { this.allGroupHeads = allGroupHeads; return this; } /** * Returns the matching group heads if {@link #setAllGroupHeads(boolean)} was set to true or an * empty bit set. * * @return The matching group heads if {@link #setAllGroupHeads(boolean)} was set to true or an * empty bit set */ public Bits getAllGroupHeads() { return matchingGroupHeads; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy