All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.lucene.facet.taxonomy.IntTaxonomyFacets Maven / Gradle / Ivy

There is a newer version: 10.0.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.facet.taxonomy;

import com.carrotsearch.hppc.IntArrayList;
import com.carrotsearch.hppc.IntIntHashMap;
import com.carrotsearch.hppc.cursors.IntIntCursor;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.facet.FacetResult;
import org.apache.lucene.facet.FacetsCollector;
import org.apache.lucene.facet.FacetsCollector.MatchingDocs;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.FacetsConfig.DimConfig;
import org.apache.lucene.facet.LabelAndValue;
import org.apache.lucene.facet.TopOrdAndIntQueue;
import org.apache.lucene.util.PriorityQueue;

/**
 * Base class for all taxonomy-based facets that aggregate to a per-ords int[].
 *
 * @deprecated Visibility of this class will be reduced to pkg-private in a future version. This
 *     class is meant to host common code as an internal implementation detail to {@link
 *     FastTaxonomyFacetCounts} and {@link TaxonomyFacetIntAssociations},and is not intended as an
 *     extension point for user-created {@code Facets} implementations. If your code is relying on
 *     this, please migrate necessary functionality down into your own class.
 */
@Deprecated
public abstract class IntTaxonomyFacets extends TaxonomyFacets {

  /** Aggregation function used for combining values. */
  protected final AssociationAggregationFunction aggregationFunction;

  /**
   * Dense ordinal values.
   *
   * 

We are making this and {@link #sparseValues} protected for some expert usage. e.g. It can be * checked which is being used before a loop instead of calling {@link #increment} for each * iteration. */ protected final int[] values; /** * Sparse ordinal values. * * @see #values for why protected. */ protected final IntIntHashMap sparseValues; /** * Constructor that defaults the aggregation function to {@link * AssociationAggregationFunction#SUM}. */ protected IntTaxonomyFacets( String indexFieldName, TaxonomyReader taxoReader, FacetsConfig config, FacetsCollector fc) throws IOException { super(indexFieldName, taxoReader, config); this.aggregationFunction = AssociationAggregationFunction.SUM; if (useHashTable(fc, taxoReader)) { sparseValues = new IntIntHashMap(); values = null; } else { sparseValues = null; values = new int[taxoReader.getSize()]; } } /** Constructor that uses the provided aggregation function. */ protected IntTaxonomyFacets( String indexFieldName, TaxonomyReader taxoReader, FacetsConfig config, AssociationAggregationFunction aggregationFunction, FacetsCollector fc) throws IOException { super(indexFieldName, taxoReader, config); this.aggregationFunction = aggregationFunction; if (useHashTable(fc, taxoReader)) { sparseValues = new IntIntHashMap(); values = null; } else { sparseValues = null; values = new int[taxoReader.getSize()]; } } /** Return true if a sparse hash table should be used for counting, instead of a dense int[]. 
*/ protected boolean useHashTable(FacetsCollector fc, TaxonomyReader taxoReader) { if (taxoReader.getSize() < 1024) { // small number of unique values: use an array return false; } if (fc == null) { // counting all docs: use an array return false; } int maxDoc = 0; int sumTotalHits = 0; for (MatchingDocs docs : fc.getMatchingDocs()) { sumTotalHits += docs.totalHits; maxDoc += docs.context.reader().maxDoc(); } // if our result set is < 10% of the index, we collect sparsely (use hash map): return sumTotalHits < maxDoc / 10; } /** Increment the count for this ordinal by 1. */ protected void increment(int ordinal) { increment(ordinal, 1); } /** Increment the count for this ordinal by {@code amount}.. */ protected void increment(int ordinal, int amount) { if (sparseValues != null) { sparseValues.addTo(ordinal, amount); } else { values[ordinal] += amount; } } /** Set the count for this ordinal to {@code newValue}. */ void setValue(int ordinal, int newValue) { if (sparseValues != null) { sparseValues.put(ordinal, newValue); } else { values[ordinal] = newValue; } } /** Get the count for this ordinal. */ protected int getValue(int ordinal) { if (sparseValues != null) { return sparseValues.get(ordinal); } else { return values[ordinal]; } } /** Rolls up any single-valued hierarchical dimensions. 
*/ protected void rollup() throws IOException { // Rollup any necessary dims: int[] children = null; for (Map.Entry ent : config.getDimConfigs().entrySet()) { String dim = ent.getKey(); DimConfig ft = ent.getValue(); if (ft.hierarchical && ft.multiValued == false) { int dimRootOrd = taxoReader.getOrdinal(new FacetLabel(dim)); // It can be -1 if this field was declared in the // config but never indexed: if (dimRootOrd > 0) { if (children == null) { // lazy init children = getChildren(); } int currentValue = getValue(dimRootOrd); int newValue = aggregationFunction.aggregate(currentValue, rollup(children[dimRootOrd])); setValue(dimRootOrd, newValue); } } } } private int rollup(int ord) throws IOException { int[] children = getChildren(); int[] siblings = getSiblings(); int aggregatedValue = 0; while (ord != TaxonomyReader.INVALID_ORDINAL) { int currentValue = getValue(ord); int newValue = aggregationFunction.aggregate(currentValue, rollup(children[ord])); setValue(ord, newValue); aggregatedValue = aggregationFunction.aggregate(aggregatedValue, getValue(ord)); ord = siblings[ord]; } return aggregatedValue; } @Override public Number getSpecificValue(String dim, String... path) throws IOException { DimConfig dimConfig = verifyDim(dim); if (path.length == 0) { if (dimConfig.hierarchical && dimConfig.multiValued == false) { // ok: rolled up at search time } else if (dimConfig.requireDimCount && dimConfig.multiValued) { // ok: we indexed all ords at index time } else { throw new IllegalArgumentException( "cannot return dimension-level value alone; use getTopChildren instead"); } } int ord = taxoReader.getOrdinal(new FacetLabel(dim, path)); if (ord < 0) { return -1; } return getValue(ord); } @Override public FacetResult getAllChildren(String dim, String... 
path) throws IOException { DimConfig dimConfig = verifyDim(dim); FacetLabel cp = new FacetLabel(dim, path); int dimOrd = taxoReader.getOrdinal(cp); if (dimOrd == -1) { return null; } int aggregatedValue = 0; IntArrayList ordinals = new IntArrayList(); IntArrayList ordValues = new IntArrayList(); if (sparseValues != null) { for (IntIntCursor c : sparseValues) { int value = c.value; int ord = c.key; if (parents[ord] == dimOrd && value > 0) { aggregatedValue = aggregationFunction.aggregate(aggregatedValue, value); ordinals.add(ord); ordValues.add(value); } } } else { int[] children = getChildren(); int[] siblings = getSiblings(); int ord = children[dimOrd]; while (ord != TaxonomyReader.INVALID_ORDINAL) { int value = values[ord]; if (value > 0) { aggregatedValue = aggregationFunction.aggregate(aggregatedValue, value); ordinals.add(ord); ordValues.add(value); } ord = siblings[ord]; } } if (aggregatedValue == 0) { return null; } if (dimConfig.multiValued) { if (dimConfig.requireDimCount) { aggregatedValue = getValue(dimOrd); } else { // Our sum'd value is not correct, in general: aggregatedValue = -1; } } else { // Our sum'd dim value is accurate, so we keep it } // TODO: It would be nice if TaxonomyReader let us pass in a buffer + size so we didn't have to // do an array copy here: FacetLabel[] bulkPath = taxoReader.getBulkPath(ordinals.toArray()); LabelAndValue[] labelValues = new LabelAndValue[ordValues.size()]; for (int i = 0; i < ordValues.size(); i++) { labelValues[i] = new LabelAndValue(bulkPath[i].components[cp.length], ordValues.get(i)); } return new FacetResult(dim, path, aggregatedValue, labelValues, ordinals.size()); } @Override public FacetResult getTopChildren(int topN, String dim, String... 
path) throws IOException { validateTopN(topN); DimConfig dimConfig = verifyDim(dim); FacetLabel cp = new FacetLabel(dim, path); int dimOrd = taxoReader.getOrdinal(cp); if (dimOrd == -1) { return null; } TopChildrenForPath topChildrenForPath = getTopChildrenForPath(dimConfig, dimOrd, topN); return createFacetResult(topChildrenForPath, dim, path); } /** * Determine the top-n children for a specified dimension + path. Results are in an intermediate * form. */ private TopChildrenForPath getTopChildrenForPath(DimConfig dimConfig, int pathOrd, int topN) throws IOException { TopOrdAndIntQueue q = new TopOrdAndIntQueue(Math.min(taxoReader.getSize(), topN)); int bottomValue = 0; int bottomOrd = Integer.MAX_VALUE; int aggregatedValue = 0; int childCount = 0; TopOrdAndIntQueue.OrdAndValue reuse = null; // TODO: would be faster if we had a "get the following children" API? then we // can make a single pass over the hashmap if (sparseValues != null) { for (IntIntCursor c : sparseValues) { int value = c.value; int ord = c.key; if (parents[ord] == pathOrd && value > 0) { aggregatedValue = aggregationFunction.aggregate(aggregatedValue, value); childCount++; if (value > bottomValue || (value == bottomValue && ord < bottomOrd)) { if (reuse == null) { reuse = new TopOrdAndIntQueue.OrdAndValue(); } reuse.ord = ord; reuse.value = value; reuse = q.insertWithOverflow(reuse); if (q.size() == topN) { bottomValue = q.top().value; bottomOrd = q.top().ord; } } } } } else { int[] children = getChildren(); int[] siblings = getSiblings(); int ord = children[pathOrd]; while (ord != TaxonomyReader.INVALID_ORDINAL) { int value = values[ord]; if (value > 0) { aggregatedValue = aggregationFunction.aggregate(aggregatedValue, value); childCount++; if (value > bottomValue || (value == bottomValue && ord < bottomOrd)) { if (reuse == null) { reuse = new TopOrdAndIntQueue.OrdAndValue(); } reuse.ord = ord; reuse.value = value; reuse = q.insertWithOverflow(reuse); if (q.size() == topN) { bottomValue = 
q.top().value; bottomOrd = q.top().ord; } } } ord = siblings[ord]; } } if (dimConfig.multiValued) { if (dimConfig.requireDimCount) { aggregatedValue = getValue(pathOrd); } else { // Our sum'd value is not correct, in general: aggregatedValue = -1; } } return new TopChildrenForPath(aggregatedValue, childCount, q); } @Override public List getTopDims(int topNDims, int topNChildren) throws IOException { if (topNDims <= 0 || topNChildren <= 0) { throw new IllegalArgumentException("topN must be > 0"); } // get children and siblings ordinal array from TaxonomyFacets int[] children = getChildren(); int[] siblings = getSiblings(); // Create priority queue to store top dimensions and sort by their aggregated values/hits and // string values. PriorityQueue pq = new PriorityQueue<>(topNDims) { @Override protected boolean lessThan(DimValue a, DimValue b) { if (a.value > b.value) { return false; } else if (a.value < b.value) { return true; } else { return a.dim.compareTo(b.dim) > 0; } } }; // Keep track of intermediate results, if we compute them, so we can reuse them later: Map intermediateResults = null; // iterate over children and siblings ordinals for all dims int ord = children[TaxonomyReader.ROOT_ORDINAL]; while (ord != TaxonomyReader.INVALID_ORDINAL) { String dim = taxoReader.getPath(ord).components[0]; FacetsConfig.DimConfig dimConfig = config.getDimConfig(dim); if (dimConfig.indexFieldName.equals(indexFieldName)) { FacetLabel cp = new FacetLabel(dim); int dimOrd = taxoReader.getOrdinal(cp); if (dimOrd != -1) { int dimValue; if (dimConfig.multiValued) { if (dimConfig.requireDimCount) { // If the dim is configured as multi-valued and requires dim counts, we can access // an accurate count for the dim computed at indexing time: dimValue = getValue(dimOrd); } else { // If the dim is configured as multi-valued but not requiring dim counts, we cannot // compute an accurate dim count, and use -1 as a place-holder: dimValue = -1; } } else { // Single-valued dims require 
aggregating descendant paths to get accurate dim counts // since we don't directly access ancestry paths: // TODO: We could consider indexing dim counts directly if getTopDims is a common // use-case. TopChildrenForPath topChildrenForPath = getTopChildrenForPath(dimConfig, dimOrd, topNChildren); if (intermediateResults == null) { intermediateResults = new HashMap<>(); } intermediateResults.put(dim, topChildrenForPath); dimValue = topChildrenForPath.pathValue; } if (dimValue != 0) { if (pq.size() < topNDims) { pq.add(new DimValue(dim, dimOrd, dimValue)); } else { if (dimValue > pq.top().value || (dimValue == pq.top().value && dim.compareTo(pq.top().dim) < 0)) { DimValue bottomDim = pq.top(); bottomDim.dim = dim; bottomDim.value = dimValue; pq.updateTop(); } } } } } ord = siblings[ord]; } FacetResult[] results = new FacetResult[pq.size()]; while (pq.size() > 0) { DimValue dimValue = pq.pop(); assert dimValue != null; String dim = dimValue.dim; TopChildrenForPath topChildrenForPath = null; if (intermediateResults != null) { topChildrenForPath = intermediateResults.get(dim); } if (topChildrenForPath == null) { FacetsConfig.DimConfig dimConfig = config.getDimConfig(dim); topChildrenForPath = getTopChildrenForPath(dimConfig, dimValue.dimOrd, topNChildren); } FacetResult facetResult = createFacetResult(topChildrenForPath, dim); assert facetResult != null; results[pq.size()] = facetResult; } return Arrays.asList(results); } /** * Create a FacetResult for the provided dim + path and intermediate results. Does the extra work * of resolving ordinals -> labels, etc. Will return null if there are no children. */ FacetResult createFacetResult(TopChildrenForPath topChildrenForPath, String dim, String... 
path) throws IOException { // If the intermediate result is null or there are no children, we return null: if (topChildrenForPath == null || topChildrenForPath.childCount == 0) { return null; } TopOrdAndIntQueue q = topChildrenForPath.childQueue; assert q != null; LabelAndValue[] labelValues = new LabelAndValue[q.size()]; int[] ordinals = new int[labelValues.length]; int[] values = new int[labelValues.length]; for (int i = labelValues.length - 1; i >= 0; i--) { TopOrdAndIntQueue.OrdAndValue ordAndValue = q.pop(); assert ordAndValue != null; ordinals[i] = ordAndValue.ord; values[i] = ordAndValue.value; } FacetLabel[] bulkPath = taxoReader.getBulkPath(ordinals); // The path component we're interested in is the one immediately after the provided path. We // add 1 here to also account for the dim: int childComponentIdx = path.length + 1; for (int i = 0; i < labelValues.length; i++) { labelValues[i] = new LabelAndValue(bulkPath[i].components[childComponentIdx], values[i]); } return new FacetResult( dim, path, topChildrenForPath.pathValue, labelValues, topChildrenForPath.childCount); } private static class DimValue { String dim; int dimOrd; int value; DimValue(String dim, int dimOrd, int value) { this.dim = dim; this.dimOrd = dimOrd; this.value = value; } } /** Intermediate result to store top children for a given path before resolving labels, etc. */ private static class TopChildrenForPath { private final int pathValue; private final int childCount; private final TopOrdAndIntQueue childQueue; TopChildrenForPath(int pathValue, int childCount, TopOrdAndIntQueue childQueue) { this.pathValue = pathValue; this.childCount = childCount; this.childQueue = childQueue; } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy