org.apache.lucene.facet.search.sampling.TakmiSampleFixer

package org.apache.lucene.facet.search.sampling;

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;

import org.apache.lucene.facet.search.DrillDown;
import org.apache.lucene.facet.search.ScoredDocIDs;
import org.apache.lucene.facet.search.ScoredDocIDsIterator;
import org.apache.lucene.facet.search.params.FacetSearchParams;
import org.apache.lucene.facet.search.results.FacetResult;
import org.apache.lucene.facet.search.results.FacetResultNode;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * Fixes sampling results by counting the intersection between two lists: a
 * TermDocs (the documents in a certain category) and a ScoredDocIDsIterator
 * (the documents matching the query).
 * 
 * @lucene.experimental
 */
// TODO (Facet): implement also an estimated fixing by ratio (taking into
// account "translation" of counts!)
class TakmiSampleFixer implements SampleFixer {
  
  // Readers and search parameters needed to build per-category drill-down
  // terms and re-count them against the index.
  private TaxonomyReader taxonomyReader;
  private IndexReader indexReader;
  private FacetSearchParams searchParams;
  
  public TakmiSampleFixer(IndexReader indexReader,
      TaxonomyReader taxonomyReader, FacetSearchParams searchParams) {
    this.indexReader = indexReader;
    this.taxonomyReader = taxonomyReader;
    this.searchParams = searchParams;
  }

  /**
   * Fixes the counts of the given facet result by recounting its root node
   * and, recursively, all of its descendants against the full document set.
   */
  public void fixResult(ScoredDocIDs origDocIds, FacetResult fres)
      throws IOException {
    FacetResultNode topRes = fres.getFacetResultNode();
    fixResultNode(topRes, origDocIds);
  }
  
  /**
   * Fixes a result node's count and, recursively, the counts of all its
   * children.
   * 
   * @param facetResNode
   *          result node to be fixed
   * @param docIds
   *          full set of matching document IDs in effect
   * @throws IOException
   *           if a low-level I/O error occurs
   */
  private void fixResultNode(FacetResultNode facetResNode, ScoredDocIDs docIds)
      throws IOException {
    recount(facetResNode, docIds);
    for (FacetResultNode frn : facetResNode.getSubResults()) {
      fixResultNode(frn, docIds);
    }
  }

  /**
   * Internal utility: recount for a facet result node
   * 
   * @param fresNode
   *          result node to be recounted
   * @param docIds
   *          full set of matching documents.
   * @throws IOException
   */
  private void recount(FacetResultNode fresNode, ScoredDocIDs docIds)
      throws IOException {
    // TODO (Facet): change from void to return the new, smaller docSet, and use
    // that for the children, as this will make their intersection ops faster.
    // can do this only when the new set is "sufficiently" smaller.
    
    /* We need the category's path name in order to do its recounting.
     * If it is missing because the option to label only part of the
     * facet results was exercised, we need to compute it anyway; so,
     * in essence, sampling with recounting spends some extra cycles
     * labeling results whose labels were not requested. */
    CategoryPath catPath = fresNode.getLabel(taxonomyReader); // force labeling

    // Build the drill-down term for this category and intersect its posting
    // list with the full matching doc set to obtain an exact count.
    Term drillDownTerm = DrillDown.term(searchParams, catPath);
    int updatedCount = countIntersection(indexReader.termDocs(drillDownTerm),
        docIds.iterator());

    fresNode.setValue(updatedCount);
  }

  /**
   * Counts the size of the intersection between two lists: a TermDocs (the
   * documents in a certain category) and a ScoredDocIDsIterator (the
   * documents matching the query).
   */
  private static int countIntersection(TermDocs p1, ScoredDocIDsIterator p2)
      throws IOException {
    // The documentation of both TermDocs and ScoredDocIDsIterator requires
    // calling next() before reading the current doc. We do so, and if either
    // list is empty we simply return 0.
    if (!p1.next()) {
      return 0;
    }
    if (!p2.next()) {
      return 0;
    }
    
    int d1 = p1.doc();
    int d2 = p2.getDocID();

    int count = 0;
    for (;;) {
      if (d1 == d2) {
        ++count;
        if (!p1.next()) {
          break; // end of list 1, nothing more in intersection
        }
        d1 = p1.doc();
        if (!advance(p2, d1)) {
          break; // end of list 2, nothing more in intersection
        }
        d2 = p2.getDocID();
      } else if (d1 < d2) {
        if (!p1.skipTo(d2)) {
          break; // end of list 1, nothing more in intersection
        }
        d1 = p1.doc();
      } else /* d1>d2 */ {
        if (!advance(p2, d1)) {
          break; // end of list 2, nothing more in intersection
        }
        d2 = p2.getDocID();
      }
    }
    return count;
  }

  /**
   * Utility: advances the iterator until it reaches (or exceeds) the target
   * document.
   * 
   * @param iterator
   *          iterator being advanced
   * @param targetDoc
   *          document ID to advance to
   * @return false if the iterator is exhausted, true otherwise.
   */
  private static boolean advance(ScoredDocIDsIterator iterator, int targetDoc) {
    while (iterator.next()) {
      if (iterator.getDocID() >= targetDoc) {
        return true; // target reached
      }
    }
    return false; // exhausted
  }
}
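
For context, below is a minimal caller sketch (not part of the original file). It assumes an already-open IndexReader and TaxonomyReader, the FacetSearchParams used for the query, and the full ScoredDocIDs of the matching documents, and shows how this package-private fixer could be driven over a batch of sampled FacetResults; the helper class and method names are hypothetical.

package org.apache.lucene.facet.search.sampling;

import java.io.IOException;

import org.apache.lucene.index.IndexReader;

import org.apache.lucene.facet.search.ScoredDocIDs;
import org.apache.lucene.facet.search.params.FacetSearchParams;
import org.apache.lucene.facet.search.results.FacetResult;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;

/** Hypothetical helper: applies TakmiSampleFixer to a batch of sampled facet results. */
class TakmiSampleFixerUsageSketch {

  static void fixAll(IndexReader indexReader, TaxonomyReader taxonomyReader,
      FacetSearchParams searchParams, ScoredDocIDs allMatchingDocs,
      Iterable<FacetResult> sampledResults) throws IOException {
    // One fixer instance can serve any number of results for the same query.
    TakmiSampleFixer fixer = new TakmiSampleFixer(indexReader, taxonomyReader, searchParams);
    for (FacetResult fres : sampledResults) {
      // Replaces each node's sampled (scaled) count with an exact count, obtained by
      // intersecting the category's posting list with the full set of matching documents.
      fixer.fixResult(allMatchingDocs, fres);
    }
  }
}

The trade-off is accuracy versus cost: each result node triggers one posting-list/doc-set intersection, so recounting is exact but can be noticeably slower than the ratio-based estimation mentioned in the TODO above.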



