org.apache.lucene.facet.taxonomy.IntTaxonomyFacets Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene-facet Show documentation
Apache Lucene (module: facet)
There is a newer version: 10.0.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.facet.taxonomy;

import com.carrotsearch.hppc.IntArrayList;
import com.carrotsearch.hppc.IntIntHashMap;
import com.carrotsearch.hppc.cursors.IntIntCursor;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.facet.FacetResult;
import org.apache.lucene.facet.FacetsCollector;
import org.apache.lucene.facet.FacetsCollector.MatchingDocs;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.FacetsConfig.DimConfig;
import org.apache.lucene.facet.LabelAndValue;
import org.apache.lucene.facet.TopOrdAndIntQueue;
import org.apache.lucene.util.PriorityQueue;

/**
 * Base class for all taxonomy-based facets that aggregate to a per-ords int[].
 *
 * @deprecated Visibility of this class will be reduced to pkg-private in a future version. This
 *     class is meant to host common code as an internal implementation detail to {@link
 *     FastTaxonomyFacetCounts} and {@link TaxonomyFacetIntAssociations}, and is not intended as an
 *     extension point for user-created {@code Facets} implementations. If your code is relying on
 *     this, please migrate necessary functionality down into your own class.
 */
@Deprecated
public abstract class IntTaxonomyFacets extends TaxonomyFacets {

  /** Aggregation function used for combining values. */
  protected final AssociationAggregationFunction aggregationFunction;

  /**
   * Dense ordinal values, indexed by taxonomy ordinal. This is {@code null} when the sparse
   * representation ({@link #sparseValues}) was selected at construction time; exactly one of the
   * two stores is non-null.
   *
   * <p>We are making this and {@link #sparseValues} protected for some expert usage. e.g. It can
   * be checked which is being used before a loop instead of calling {@link #increment} for each
   * iteration.
   */
  protected final int[] values;

  /**
   * Sparse ordinal values, keyed by taxonomy ordinal. This is {@code null} when the dense
   * representation ({@link #values}) was selected at construction time.
   *
   * @see #values for why protected.
   */
  protected final IntIntHashMap sparseValues;

  /**
   * Constructor that defaults the aggregation function to {@link
   * AssociationAggregationFunction#SUM}.
   *
   * <p>Delegates to the constructor that takes an explicit aggregation function, so the choice
   * between the dense and the sparse value store is implemented in a single place.
   *
   * @param indexFieldName name of the index field, forwarded to the superclass
   * @param taxoReader taxonomy reader, forwarded to the superclass and used to size the dense store
   * @param config facets configuration, forwarded to the superclass
   * @param fc collector consulted by {@link #useHashTable}; may be {@code null}, in which case the
   *     dense store is used
   * @throws IOException if initialization fails with an I/O error
   */
  protected IntTaxonomyFacets(
      String indexFieldName, TaxonomyReader taxoReader, FacetsConfig config, FacetsCollector fc)
      throws IOException {
    this(indexFieldName, taxoReader, config, AssociationAggregationFunction.SUM, fc);
  }

  /** Constructor that uses the provided aggregation function. */
  protected IntTaxonomyFacets(
      String indexFieldName,
      TaxonomyReader taxoReader,
      FacetsConfig config,
      AssociationAggregationFunction aggregationFunction,
      FacetsCollector fc)
      throws IOException {
    super(indexFieldName, taxoReader, config);
    this.aggregationFunction = aggregationFunction;

    if (useHashTable(fc, taxoReader)) {
      sparseValues = new IntIntHashMap();
      values = null;
    } else {
      sparseValues = null;
      values = new int[taxoReader.getSize()];
    }
  }

  /**
   * Return true if a sparse hash table should be used for counting, instead of a dense int[].
   *
   * <p>The dense array is chosen when the taxonomy has fewer than 1024 ordinals or when no
   * collector is supplied (counting all docs). Otherwise the sparse table is chosen only if the
   * total number of hits is below one tenth of the summed {@code maxDoc} of the matched segments.
   *
   * @param fc collector holding the matching docs; may be {@code null}
   * @param taxoReader taxonomy reader whose size is inspected
   * @return {@code true} to collect sparsely, {@code false} to use the dense array
   */
  protected boolean useHashTable(FacetsCollector fc, TaxonomyReader taxoReader) {
    // small number of unique values, or counting all docs: use an array
    if (taxoReader.getSize() < 1024 || fc == null) {
      return false;
    }

    int hitCount = 0;
    int docCount = 0;
    for (MatchingDocs segment : fc.getMatchingDocs()) {
      docCount += segment.context.reader().maxDoc();
      hitCount += segment.totalHits;
    }

    // a result set below 10% of the index is collected sparsely (hash map):
    final int sparseThreshold = docCount / 10;
    return hitCount < sparseThreshold;
  }

  /**
   * Increment the count for this ordinal by 1.
   *
   * <p>Delegates to {@link #increment(int, int)} with an amount of 1.
   *
   * @param ordinal taxonomy ordinal whose value is incremented
   */
  protected void increment(int ordinal) {
    increment(ordinal, 1);
  }

  /**
   * Increment the count for this ordinal by {@code amount}.
   *
   * <p>Writes to whichever backing store is active: the dense array when {@link #sparseValues} is
   * {@code null}, the sparse map otherwise.
   *
   * @param ordinal taxonomy ordinal whose value is incremented
   * @param amount value added to the current value of {@code ordinal}
   */
  protected void increment(int ordinal, int amount) {
    if (sparseValues == null) {
      values[ordinal] += amount;
      return;
    }
    sparseValues.addTo(ordinal, amount);
  }

  /**
   * Set the count for this ordinal to {@code newValue}, overwriting any previous value in the
   * active backing store.
   *
   * @param ordinal taxonomy ordinal whose value is replaced
   * @param newValue value to store
   */
  void setValue(int ordinal, int newValue) {
    if (sparseValues == null) {
      values[ordinal] = newValue;
      return;
    }
    sparseValues.put(ordinal, newValue);
  }

  /**
   * Get the count for this ordinal from the active backing store.
   *
   * @param ordinal taxonomy ordinal to look up
   * @return the value currently stored for {@code ordinal}
   */
  protected int getValue(int ordinal) {
    return sparseValues == null ? values[ordinal] : sparseValues.get(ordinal);
  }

  /**
   * Rolls up any single-valued hierarchical dimensions.
   *
   * <p>For every configured dimension that is hierarchical and not multi-valued, the values of all
   * descendants are aggregated (using {@link #aggregationFunction}) into the dimension's root
   * ordinal. Dimensions that are flat or multi-valued are left untouched.
   *
   * @throws IOException if reading the taxonomy fails
   */
  protected void rollup() throws IOException {
    // Rollup any necessary dims:
    int[] children = null;
    // The entry type must be parameterized: with a raw Map.Entry, getKey()/getValue() return
    // Object and the assignments below do not compile.
    for (Map.Entry<String, DimConfig> ent : config.getDimConfigs().entrySet()) {
      String dim = ent.getKey();
      DimConfig ft = ent.getValue();
      if (ft.hierarchical && ft.multiValued == false) {
        int dimRootOrd = taxoReader.getOrdinal(new FacetLabel(dim));
        // It can be -1 if this field was declared in the
        // config but never indexed:
        // NOTE(review): the check also skips ordinal 0, presumably the taxonomy root, which is
        // never a dimension -- confirm against TaxonomyReader.ROOT_ORDINAL.
        if (dimRootOrd > 0) {
          if (children == null) {
            // lazy init
            children = getChildren();
          }
          int currentValue = getValue(dimRootOrd);
          int newValue = aggregationFunction.aggregate(currentValue, rollup(children[dimRootOrd]));
          setValue(dimRootOrd, newValue);
        }
      }
    }
  }

  /**
   * Recursively rolls up the sibling chain starting at {@code ord}: each visited ordinal gets its
   * own value combined with the aggregate of its children, and the combined values of the whole
   * chain are aggregated into the result.
   *
   * @param ord first ordinal of the sibling chain, or {@link TaxonomyReader#INVALID_ORDINAL} for
   *     an empty chain
   * @return the aggregate over the (updated) values of all ordinals in the chain; 0 for an empty
   *     chain
   * @throws IOException if reading the taxonomy fails
   */
  private int rollup(int ord) throws IOException {
    final int[] firstChild = getChildren();
    final int[] nextSibling = getSiblings();
    int chainTotal = 0;
    for (int cur = ord; cur != TaxonomyReader.INVALID_ORDINAL; cur = nextSibling[cur]) {
      final int ownValue = getValue(cur);
      final int withDescendants =
          aggregationFunction.aggregate(ownValue, rollup(firstChild[cur]));
      setValue(cur, withDescendants);
      chainTotal = aggregationFunction.aggregate(chainTotal, getValue(cur));
    }
    return chainTotal;
  }

  /**
   * Returns the value stored for the exact {@code dim} + {@code path} label.
   *
   * @param dim dimension name; validated through {@code verifyDim}
   * @param path path components below the dimension; may be empty to ask for the dimension itself
   * @return the stored value, or -1 if the label has no ordinal in the taxonomy
   * @throws IllegalArgumentException if {@code path} is empty and the dimension is neither
   *     single-valued hierarchical nor multi-valued with {@code requireDimCount}
   * @throws IOException if reading the taxonomy fails
   */
  @Override
  public Number getSpecificValue(String dim, String... path) throws IOException {
    final DimConfig dimConfig = verifyDim(dim);
    if (path.length == 0) {
      // A dimension-level value exists only if it was rolled up at search time ...
      final boolean rolledUpAtSearchTime = dimConfig.hierarchical && !dimConfig.multiValued;
      // ... or if all ords were indexed at index time.
      final boolean indexedAtIndexTime = dimConfig.requireDimCount && dimConfig.multiValued;
      if (!rolledUpAtSearchTime && !indexedAtIndexTime) {
        throw new IllegalArgumentException(
            "cannot return dimension-level value alone; use getTopChildren instead");
      }
    }
    final int ord = taxoReader.getOrdinal(new FacetLabel(dim, path));
    return ord < 0 ? -1 : getValue(ord);
  }

  /**
   * Returns every direct child of {@code dim} + {@code path} that has a value greater than 0, in
   * the iteration order of the active backing store (no sorting is applied).
   *
   * @param dim dimension name; validated through {@code verifyDim}
   * @param path path components below the dimension
   * @return the children and the aggregated path value, or {@code null} if the path has no
   *     ordinal or the aggregate over its children is 0
   * @throws IOException if reading the taxonomy fails
   */
  @Override
  public FacetResult getAllChildren(String dim, String... path) throws IOException {
    final DimConfig dimConfig = verifyDim(dim);
    final FacetLabel parentLabel = new FacetLabel(dim, path);
    final int parentOrd = taxoReader.getOrdinal(parentLabel);
    if (parentOrd == -1) {
      return null;
    }

    int pathValue = 0;
    final IntArrayList childOrds = new IntArrayList();
    final IntArrayList childValues = new IntArrayList();

    if (sparseValues == null) {
      // Dense store: walk the sibling chain of the parent's first child.
      final int[] firstChild = getChildren();
      final int[] nextSibling = getSiblings();
      for (int child = firstChild[parentOrd];
          child != TaxonomyReader.INVALID_ORDINAL;
          child = nextSibling[child]) {
        final int childValue = values[child];
        if (childValue <= 0) {
          continue;
        }
        pathValue = aggregationFunction.aggregate(pathValue, childValue);
        childOrds.add(child);
        childValues.add(childValue);
      }
    } else {
      // Sparse store: scan all collected entries and keep those whose parent is our path.
      for (IntIntCursor cursor : sparseValues) {
        final int childValue = cursor.value;
        final int child = cursor.key;
        if (parents[child] != parentOrd || childValue <= 0) {
          continue;
        }
        pathValue = aggregationFunction.aggregate(pathValue, childValue);
        childOrds.add(child);
        childValues.add(childValue);
      }
    }

    if (pathValue == 0) {
      return null;
    }

    // For single-valued dims the aggregate computed above is accurate and kept as is. For
    // multi-valued dims it is not correct in general: use the indexed dim count if available,
    // otherwise report -1.
    if (dimConfig.multiValued) {
      pathValue = dimConfig.requireDimCount ? getValue(parentOrd) : -1;
    }

    // TODO: It would be nice if TaxonomyReader let us pass in a buffer + size so we didn't have to
    // do an array copy here:
    final FacetLabel[] childPaths = taxoReader.getBulkPath(childOrds.toArray());

    final int resultCount = childValues.size();
    final LabelAndValue[] labelValues = new LabelAndValue[resultCount];
    for (int i = 0; i < resultCount; i++) {
      final String childName = childPaths[i].components[parentLabel.length];
      labelValues[i] = new LabelAndValue(childName, childValues.get(i));
    }
    return new FacetResult(dim, path, pathValue, labelValues, childOrds.size());
  }

  /**
   * Returns the top {@code topN} children of {@code dim} + {@code path}.
   *
   * @param topN maximum number of children to return; validated through {@code validateTopN}
   * @param dim dimension name; validated through {@code verifyDim}
   * @param path path components below the dimension
   * @return the top children, or {@code null} if the path has no ordinal in the taxonomy or has
   *     no children with a value
   * @throws IOException if reading the taxonomy fails
   */
  @Override
  public FacetResult getTopChildren(int topN, String dim, String... path) throws IOException {
    validateTopN(topN);
    final DimConfig dimConfig = verifyDim(dim);
    final int pathOrd = taxoReader.getOrdinal(new FacetLabel(dim, path));
    if (pathOrd == -1) {
      return null;
    }
    return createFacetResult(getTopChildrenForPath(dimConfig, pathOrd, topN), dim, path);
  }

  /**
   * Determine the top-n children for a specified dimension + path. Results are in an intermediate
   * form.
   *
   * <p>Only direct children of {@code pathOrd} with a value greater than 0 are considered. Every
   * such child contributes to the aggregated path value and to the child count, whether or not it
   * ends up in the queue. The sparse and the dense branch apply the same selection logic and
   * differ only in how the children are enumerated.
   *
   * @param dimConfig configuration of the dimension the path belongs to
   * @param pathOrd ordinal of the path whose children are ranked
   * @param topN maximum number of children to keep
   * @return the aggregated path value (or -1 for multi-valued dims without {@code
   *     requireDimCount}), the number of children with a value, and the queue of kept children
   * @throws IOException if reading the taxonomy fails
   */
  private TopChildrenForPath getTopChildrenForPath(DimConfig dimConfig, int pathOrd, int topN)
      throws IOException {
    // The queue never needs more slots than there are ordinals in the taxonomy.
    TopOrdAndIntQueue q = new TopOrdAndIntQueue(Math.min(taxoReader.getSize(), topN));
    // Value/ordinal of q.top() once the queue holds topN entries; candidates that do not beat it
    // are rejected without touching the queue. Until then every positive value passes.
    int bottomValue = 0;
    int bottomOrd = Integer.MAX_VALUE;

    int aggregatedValue = 0;
    int childCount = 0;
    // Entry instance recycled between insertions.
    // NOTE(review): presumably insertWithOverflow returns the displaced entry (or null while the
    // queue is not full), as in Lucene's PriorityQueue -- confirm.
    TopOrdAndIntQueue.OrdAndValue reuse = null;

    // TODO: would be faster if we had a "get the following children" API?  then we
    // can make a single pass over the hashmap
    if (sparseValues != null) {
      // Sparse store: scan all collected entries, keeping direct children of pathOrd.
      for (IntIntCursor c : sparseValues) {
        int value = c.value;
        int ord = c.key;
        if (parents[ord] == pathOrd && value > 0) {
          aggregatedValue = aggregationFunction.aggregate(aggregatedValue, value);
          childCount++;
          // Larger value wins; on equal values the smaller ordinal wins.
          if (value > bottomValue || (value == bottomValue && ord < bottomOrd)) {
            if (reuse == null) {
              reuse = new TopOrdAndIntQueue.OrdAndValue();
            }
            reuse.ord = ord;
            reuse.value = value;
            reuse = q.insertWithOverflow(reuse);
            if (q.size() == topN) {
              bottomValue = q.top().value;
              bottomOrd = q.top().ord;
            }
          }
        }
      }
    } else {
      // Dense store: walk the sibling chain of the path's first child.
      int[] children = getChildren();
      int[] siblings = getSiblings();
      int ord = children[pathOrd];
      while (ord != TaxonomyReader.INVALID_ORDINAL) {
        int value = values[ord];
        if (value > 0) {
          aggregatedValue = aggregationFunction.aggregate(aggregatedValue, value);
          childCount++;
          // Larger value wins; on equal values the smaller ordinal wins.
          if (value > bottomValue || (value == bottomValue && ord < bottomOrd)) {
            if (reuse == null) {
              reuse = new TopOrdAndIntQueue.OrdAndValue();
            }
            reuse.ord = ord;
            reuse.value = value;
            reuse = q.insertWithOverflow(reuse);
            if (q.size() == topN) {
              bottomValue = q.top().value;
              bottomOrd = q.top().ord;
            }
          }
        }
        ord = siblings[ord];
      }
    }

    // For single-valued dims the aggregate over the children is kept as the path value.
    if (dimConfig.multiValued) {
      if (dimConfig.requireDimCount) {
        // Use the value stored for the path itself instead of the aggregate over its children.
        aggregatedValue = getValue(pathOrd);
      } else {
        // Our sum'd value is not correct, in general:
        aggregatedValue = -1;
      }
    }

    return new TopChildrenForPath(aggregatedValue, childCount, q);
  }

  /**
   * Returns the top {@code topNDims} dimensions indexed under this instance's index field, each
   * with its top {@code topNChildren} children.
   *
   * <p>Dimensions are ranked by their dimension-level value (higher first); ties are broken by
   * dimension name (lexicographically smaller first). Dimensions whose value is 0 are skipped.
   * Multi-valued dimensions without {@code requireDimCount} carry the place-holder value -1.
   *
   * @param topNDims maximum number of dimensions to return; must be &gt; 0
   * @param topNChildren maximum number of children to return per dimension; must be &gt; 0
   * @return the selected dimensions, best first
   * @throws IllegalArgumentException if either argument is not positive
   * @throws IOException if reading the taxonomy fails
   */
  @Override
  public List<FacetResult> getTopDims(int topNDims, int topNChildren) throws IOException {
    if (topNDims <= 0 || topNChildren <= 0) {
      throw new IllegalArgumentException("topN must be > 0");
    }

    // get children and siblings ordinal array from TaxonomyFacets
    int[] children = getChildren();
    int[] siblings = getSiblings();

    // Create priority queue to store top dimensions and sort by their aggregated values/hits and
    // string values. The weakest dimension sits at the top of the queue.
    PriorityQueue<DimValue> pq =
        new PriorityQueue<>(topNDims) {
          @Override
          protected boolean lessThan(DimValue a, DimValue b) {
            if (a.value > b.value) {
              return false;
            } else if (a.value < b.value) {
              return true;
            } else {
              return a.dim.compareTo(b.dim) > 0;
            }
          }
        };

    // Keep track of intermediate results, if we compute them, so we can reuse them later:
    Map<String, TopChildrenForPath> intermediateResults = null;

    // iterate over children and siblings ordinals for all dims
    int ord = children[TaxonomyReader.ROOT_ORDINAL];
    while (ord != TaxonomyReader.INVALID_ORDINAL) {
      String dim = taxoReader.getPath(ord).components[0];
      FacetsConfig.DimConfig dimConfig = config.getDimConfig(dim);
      if (dimConfig.indexFieldName.equals(indexFieldName)) {
        FacetLabel cp = new FacetLabel(dim);
        int dimOrd = taxoReader.getOrdinal(cp);
        if (dimOrd != -1) {
          int dimValue;
          if (dimConfig.multiValued) {
            if (dimConfig.requireDimCount) {
              // If the dim is configured as multi-valued and requires dim counts, we can access
              // an accurate count for the dim computed at indexing time:
              dimValue = getValue(dimOrd);
            } else {
              // If the dim is configured as multi-valued but not requiring dim counts, we cannot
              // compute an accurate dim count, and use -1 as a place-holder:
              dimValue = -1;
            }
          } else {
            // Single-valued dims require aggregating descendant paths to get accurate dim counts
            // since we don't directly access ancestry paths:
            // TODO: We could consider indexing dim counts directly if getTopDims is a common
            // use-case.
            TopChildrenForPath topChildrenForPath =
                getTopChildrenForPath(dimConfig, dimOrd, topNChildren);
            if (intermediateResults == null) {
              intermediateResults = new HashMap<>();
            }
            intermediateResults.put(dim, topChildrenForPath);
            dimValue = topChildrenForPath.pathValue;
          }
          if (dimValue != 0) {
            if (pq.size() < topNDims) {
              pq.add(new DimValue(dim, dimOrd, dimValue));
            } else {
              if (dimValue > pq.top().value
                  || (dimValue == pq.top().value && dim.compareTo(pq.top().dim) < 0)) {
                // Recycle the weakest entry for the better dimension. All three fields must be
                // overwritten: dimOrd is used below to fetch the children of dims that have no
                // cached intermediate result, and a stale ordinal would return the children of
                // the evicted dimension.
                DimValue bottomDim = pq.top();
                bottomDim.dim = dim;
                bottomDim.dimOrd = dimOrd;
                bottomDim.value = dimValue;
                pq.updateTop();
              }
            }
          }
        }
      }
      ord = siblings[ord];
    }

    FacetResult[] results = new FacetResult[pq.size()];

    // The queue pops the weakest dimension first, so fill the result array back to front.
    while (pq.size() > 0) {
      DimValue dimValue = pq.pop();
      assert dimValue != null;
      String dim = dimValue.dim;
      TopChildrenForPath topChildrenForPath = null;
      if (intermediateResults != null) {
        topChildrenForPath = intermediateResults.get(dim);
      }
      if (topChildrenForPath == null) {
        // Multi-valued dims were ranked without computing their children; do it now.
        FacetsConfig.DimConfig dimConfig = config.getDimConfig(dim);
        topChildrenForPath = getTopChildrenForPath(dimConfig, dimValue.dimOrd, topNChildren);
      }
      FacetResult facetResult = createFacetResult(topChildrenForPath, dim);
      // NOTE(review): createFacetResult returns null when the dim has no children with a value,
      // which looks reachable for multi-valued dims ranked with the -1 place-holder -- confirm.
      assert facetResult != null;
      results[pq.size()] = facetResult;
    }
    return Arrays.asList(results);
  }

  /**
   * Create a FacetResult for the provided dim + path and intermediate results. Does the extra work
   * of resolving ordinals -> labels, etc. Will return null if there are no children.
   *
   * <p>The child queue of {@code topChildrenForPath} is drained by this call.
   *
   * @param topChildrenForPath intermediate result to convert; may be {@code null}
   * @param dim dimension name reported in the result
   * @param path path components below the dimension reported in the result
   * @return the resolved result, or {@code null} if the intermediate result is {@code null} or
   *     has a child count of 0
   * @throws IOException if resolving ordinals to labels fails
   */
  FacetResult createFacetResult(TopChildrenForPath topChildrenForPath, String dim, String... path)
      throws IOException {
    // Nothing to report without an intermediate result or without children:
    if (topChildrenForPath == null || topChildrenForPath.childCount == 0) {
      return null;
    }

    final TopOrdAndIntQueue queue = topChildrenForPath.childQueue;
    assert queue != null;

    final int resultCount = queue.size();
    final int[] childOrds = new int[resultCount];
    final int[] childValues = new int[resultCount];

    // Entries are popped into the arrays back to front, reversing the pop order.
    int slot = resultCount;
    while (slot > 0) {
      slot--;
      final TopOrdAndIntQueue.OrdAndValue entry = queue.pop();
      assert entry != null;
      childOrds[slot] = entry.ord;
      childValues[slot] = entry.value;
    }

    final FacetLabel[] childPaths = taxoReader.getBulkPath(childOrds);
    // The label of a child is the component right after the provided path; the extra 1 accounts
    // for the dim, which is the first component:
    final int labelIdx = path.length + 1;
    final LabelAndValue[] labelValues = new LabelAndValue[resultCount];
    for (int i = 0; i < resultCount; i++) {
      labelValues[i] = new LabelAndValue(childPaths[i].components[labelIdx], childValues[i]);
    }

    return new FacetResult(
        dim, path, topChildrenForPath.pathValue, labelValues, topChildrenForPath.childCount);
  }

  /**
   * Mutable holder for a dimension and its dimension-level value, used as the entry type of the
   * priority queue in {@code getTopDims}. Fields are not final because a queue entry is
   * overwritten in place when a better dimension replaces the current bottom one.
   */
  private static class DimValue {
    // Dimension name; also the tie-breaker when two dimensions have the same value.
    String dim;
    // Taxonomy ordinal of the dimension; used to look up its children.
    int dimOrd;
    // Dimension-level value the queue is ordered by.
    int value;

    DimValue(String dim, int dimOrd, int value) {
      this.dim = dim;
      this.dimOrd = dimOrd;
      this.value = value;
    }
  }

  /**
   * Intermediate result to store top children for a given path before resolving labels, etc.
   *
   * <p>Instances are produced by {@code getTopChildrenForPath} and turned into a {@code
   * FacetResult} by {@code createFacetResult}.
   */
  private static class TopChildrenForPath {
    // Value reported for the path itself (-1 when it cannot be computed accurately).
    private final int pathValue;
    // Number of direct children with a value greater than 0, including those not kept in the
    // queue.
    private final int childCount;
    // Queue holding the retained top children as ordinal/value pairs.
    private final TopOrdAndIntQueue childQueue;

    TopChildrenForPath(int pathValue, int childCount, TopOrdAndIntQueue childQueue) {
      this.pathValue = pathValue;
      this.childCount = childCount;
      this.childQueue = childQueue;
    }
  }
}