All Downloads are FREE. Search and download functionality uses the official Maven repository.

org.apache.lucene.sandbox.search.LargeNumHitsTopDocsCollector Maven / Gradle / Ivy

There is a newer version: 10.0.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.sandbox.search;

import static org.apache.lucene.search.TopDocsCollector.EMPTY_TOPDOCS;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.HitQueue;
import org.apache.lucene.search.LeafCollector;
import org.apache.lucene.search.Scorable;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.TotalHits;

/**
 * Optimized collector for large number of hits. The collector maintains an ArrayList of hits until
 * it accumulates the requested number of hits. Post that, it builds a Priority Queue and starts
 * filtering further hits based on the minimum competitive score.
 *
 * <p>At most {@code requestedHitCount} hits are retained, while {@code totalHits} counts every
 * collected document. Once the priority queue has been built, retrieving results via {@link
 * #topDocs(int)} pops entries from that queue, so results should be retrieved only once per
 * collector instance.
 */
public final class LargeNumHitsTopDocsCollector implements Collector {
  private final int requestedHitCount;
  // Holds the hits while fewer than requestedHitCount + 1 documents have been collected; set to
  // null once the hits have been moved into the priority queue.
  private List<ScoreDoc> hits = new ArrayList<>();
  // package private for testing
  HitQueue pq;
  ScoreDoc pqTop;
  int totalHits;

  /**
   * Creates a collector that retains up to {@code requestedHitCount} top-scoring hits.
   *
   * @param requestedHitCount maximum number of hits to retain
   */
  public LargeNumHitsTopDocsCollector(int requestedHitCount) {
    this.requestedHitCount = requestedHitCount;
    this.totalHits = 0;
  }

  // We always return COMPLETE since this collector should ideally
  // be used only with large number of hits case
  @Override
  public ScoreMode scoreMode() {
    return ScoreMode.COMPLETE;
  }

  @Override
  public LeafCollector getLeafCollector(LeafReaderContext context) {
    final int docBase = context.docBase;
    return new TopScoreDocCollector.ScorerLeafCollector() {

      @Override
      public void collect(int doc) throws IOException {
        float score = scorer.score();

        // This collector relies on the fact that scorers produce positive values:
        assert score >= 0; // NOTE: false for NaN

        if (totalHits < requestedHitCount) {
          // Cheap path: simply buffer the hit until the requested count is exceeded.
          hits.add(new ScoreDoc(doc + docBase, score));
          totalHits++;
          return;
        } else if (totalHits == requestedHitCount) {
          // Convert the list to a priority queue

          // We should get here only when priority queue
          // has not been built
          assert pq == null;
          assert pqTop == null;
          pq = new HitQueue(requestedHitCount, false);

          for (ScoreDoc scoreDoc : hits) {
            pq.add(scoreDoc);
          }

          pqTop = pq.top();
          hits = null;
        }

        // Only hits that beat the current minimum competitive score replace the queue's top.
        if (score > pqTop.score) {
          pqTop.doc = doc + docBase;
          pqTop.score = score;
          pqTop = pq.updateTop();
        }
        ++totalHits;
      }
    };
  }

  /**
   * Returns the top {@code howMany} docs that were collected by this collector, ordered by
   * descending score.
   *
   * @param howMany number of hits to return; must be positive and no larger than the number of
   *     retained hits, i.e. {@code Math.min(totalHits, requestedHitCount)}
   * @return the top {@code howMany} hits together with the exact total hit count
   * @throws IllegalArgumentException if {@code howMany} is not positive or exceeds the number of
   *     retained hits (this includes the case where no hit was collected at all)
   */
  public TopDocs topDocs(int howMany) {

    // Only requestedHitCount hits are ever retained, so asking for more than that (even if
    // totalHits is larger) could only be answered with null entries.
    if (howMany <= 0 || howMany > Math.min(totalHits, requestedHitCount)) {
      throw new IllegalArgumentException("Incorrect number of hits requested");
    }

    ScoreDoc[] results = new ScoreDoc[howMany];

    // Get the requested results from either hits list or PQ
    populateResults(results, howMany);

    return newTopDocs(results);
  }

  /**
   * Populates the results array with the ScoreDoc instances. This can be overridden in case a
   * different ScoreDoc type should be returned.
   *
   * @param results destination array; the first {@code howMany} slots are filled in descending
   *     score order
   * @param howMany number of hits to copy into {@code results}
   */
  protected void populateResults(ScoreDoc[] results, int howMany) {
    if (pq != null) {
      assert totalHits >= requestedHitCount;
      // The queue pops its least competitive entry first. When fewer hits are requested than the
      // queue holds, discard the surplus low-scoring entries so that the remaining pops yield
      // the top howMany hits rather than the bottom ones.
      for (int surplus = pq.size() - howMany; surplus > 0; surplus--) {
        pq.pop();
      }
      // Fill back-to-front so that the best hit ends up at index 0.
      for (int i = howMany - 1; i >= 0; i--) {
        results[i] = pq.pop();
      }
      return;
    }

    // Total number of hits collected did not exceed requestedHitCount, so the queue was never
    // built (it is only created when hit number requestedHitCount + 1 arrives).
    assert totalHits <= requestedHitCount;
    // Highest score first; ties are broken by ascending doc id.
    Collections.sort(
        hits,
        Comparator.comparing((ScoreDoc scoreDoc) -> scoreDoc.score)
            .reversed()
            .thenComparing(scoreDoc -> scoreDoc.doc));

    for (int i = 0; i < howMany; i++) {
      results[i] = hits.get(i);
    }
  }

  /**
   * Returns a {@link TopDocs} instance containing the given results. If {@code results} is null it
   * means there are no results to return, and {@code EMPTY_TOPDOCS} is returned instead.
   *
   * @param results the hits to wrap, or null
   * @return a {@link TopDocs} whose total hit count is the exact number of collected documents
   */
  protected TopDocs newTopDocs(ScoreDoc[] results) {
    return results == null
        ? EMPTY_TOPDOCS
        : new TopDocs(new TotalHits(totalHits, TotalHits.Relation.EQUAL_TO), results);
  }

  /**
   * Returns the top docs that were collected by this collector, i.e. all retained hits ({@code
   * Math.min(totalHits, requestedHitCount)} of them).
   *
   * @throws IllegalArgumentException if no hit was collected
   */
  public TopDocs topDocs() {
    return topDocs(Math.min(totalHits, requestedHitCount));
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy