/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.facet;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiDocValues;
import org.apache.lucene.index.OrdinalMap;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.internal.hppc.IntIntHashMap;
import org.apache.lucene.search.ConjunctionUtils;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LongValues;
/**
* Compute facet counts from a previously indexed {@link SortedSetDocValues} or {@link
* org.apache.lucene.index.SortedDocValues} field. This approach will execute facet counting against
* the string values found in the specified field, with no assumptions on their format. Unlike
* {@link org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetCounts}, no assumption is made
* about a "dimension" path component being indexed. Because of this, the field itself is
* effectively treated as the "dimension", and counts for all unique string values are produced.
* This approach is meant to complement {@link LongValueFacetCounts} in that they both provide facet
* counting on a doc value field with no assumptions of content.
*
 * <p>This implementation is useful if you want to dynamically count against any string doc value
* field without relying on {@link FacetField} and {@link FacetsConfig}. The disadvantage is that a
* separate field is required for each "dimension". If you want to pack multiple dimensions into the
* same doc values field, you probably want one of {@link
* org.apache.lucene.facet.taxonomy.FastTaxonomyFacetCounts} or {@link
* org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetCounts}.
 *
 * <p>Note that there is an added cost on every {@link IndexReader} open to create a new {@link
* StringDocValuesReaderState}. Also note that this class should be instantiated and used from a
* single thread, because it holds a thread-private instance of {@link SortedSetDocValues}.
*
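 * <p>A minimal usage sketch (the {@code reader}, {@code searcher} and {@code query} variables
 * and the field name {@code "author"} are illustrative; this assumes the field was indexed
 * with {@code SortedSetDocValuesField}):
 *
 * <pre>{@code
 * // Create once per IndexReader open (this is the added cost mentioned above):
 * StringDocValuesReaderState state = new StringDocValuesReaderState(reader, "author");
 *
 * // Count all documents:
 * Facets all = new StringValueFacetCounts(state);
 *
 * // Or count only the hits of a query:
 * FacetsCollector fc = searcher.search(query, new FacetsCollectorManager());
 * Facets counts = new StringValueFacetCounts(state, fc);
 * FacetResult top = counts.getTopChildren(10, "author");
 * }</pre>
 *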
* @lucene.experimental
*/
// TODO: Add a concurrent version much like ConcurrentSortedSetDocValuesFacetCounts?
public class StringValueFacetCounts extends Facets {
private final IndexReader reader;
private final String field;
private final OrdinalMap ordinalMap;
private final SortedSetDocValues docValues;
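  // At most one of these two counting structures is used: a dense array indexed by global
  // ordinal when many ordinals are expected to be hit, or a sparse hash map when the hit set
  // is small relative to the index (see the constructor for the heuristic).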
private int[] denseCounts;
private final IntIntHashMap sparseCounts;
private boolean initialized;
private final int cardinality;
private int totalDocCount;
/**
* Returns all facet counts for the field, same result as searching on {@link MatchAllDocsQuery}
* but faster.
*/
public StringValueFacetCounts(StringDocValuesReaderState state) throws IOException {
this(state, null);
}
/** Counts facets across the provided hits. */
public StringValueFacetCounts(StringDocValuesReaderState state, FacetsCollector facetsCollector)
throws IOException {
reader = state.reader;
field = state.field;
ordinalMap = state.ordinalMap;
docValues = getDocValues();
long valueCount = docValues.getValueCount();
if (valueCount > Integer.MAX_VALUE) {
throw new IllegalArgumentException(
"can only handle valueCount < Integer.MAX_VALUE; got " + valueCount);
}
cardinality = (int) valueCount;
if (facetsCollector != null) {
if (cardinality < 1024) { // count densely for low cardinality
sparseCounts = null;
denseCounts = null;
initialized = false;
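        // Defer allocation: countOneSegment will lazily create the dense array, so nothing is
        // allocated at all if there turn out to be no hits.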
count(facetsCollector);
} else {
int totalHits = 0;
int totalDocs = 0;
for (FacetsCollector.MatchingDocs matchingDocs : facetsCollector.getMatchingDocs()) {
totalHits += matchingDocs.totalHits();
totalDocs += matchingDocs.context().reader().maxDoc();
}
// No counting needed if there are no hits:
if (totalHits == 0) {
sparseCounts = null;
denseCounts = null;
initialized = true;
} else {
// If our result set is < 10% of the index, we collect sparsely (use hash map). This
// heuristic is borrowed from IntTaxonomyFacetCounts:
if (totalHits < totalDocs / 10) {
sparseCounts = new IntIntHashMap();
denseCounts = null;
initialized = true;
} else {
sparseCounts = null;
denseCounts = new int[cardinality];
initialized = true;
}
count(facetsCollector);
}
}
} else {
// Since we're counting all ordinals, count densely:
sparseCounts = null;
denseCounts = new int[cardinality];
initialized = true;
countAll();
}
}
@Override
public FacetResult getAllChildren(String dim, String... path) throws IOException {
validateDimAndPathForGetChildren(dim, path);
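    // Gather a label/count pair for every ordinal with a non-zero count. The dense case visits
    // ordinals in (term-sorted) order; the sparse case iterates in arbitrary hash order: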
    List<LabelAndValue> labelValues = new ArrayList<>();
if (sparseCounts != null) {
for (IntIntHashMap.IntIntCursor sparseCount : sparseCounts) {
int count = sparseCount.value;
final BytesRef term = docValues.lookupOrd(sparseCount.key);
labelValues.add(new LabelAndValue(term.utf8ToString(), count));
}
} else if (denseCounts != null) {
for (int i = 0; i < denseCounts.length; i++) {
int count = denseCounts[i];
if (count != 0) {
final BytesRef term = docValues.lookupOrd(i);
labelValues.add(new LabelAndValue(term.utf8ToString(), count));
}
}
}
return new FacetResult(
field,
new String[0],
totalDocCount,
labelValues.toArray(new LabelAndValue[0]),
labelValues.size());
}
@Override
public FacetResult getTopChildren(int topN, String dim, String... path) throws IOException {
validateTopN(topN);
validateDimAndPathForGetChildren(dim, path);
topN = Math.min(topN, cardinality);
TopOrdAndIntQueue q = null;
TopOrdAndIntQueue.OrdAndInt reuse = null;
int childCount = 0; // total number of labels with non-zero count
if (sparseCounts != null) {
for (IntIntHashMap.IntIntCursor sparseCount : sparseCounts) {
        childCount++; // every count in sparseCounts should be non-zero
int ord = sparseCount.key;
int count = sparseCount.value;
if (q == null) {
// Lazy init for sparse case:
q = new TopOrdAndIntQueue(topN);
}
if (reuse == null) {
reuse = (TopOrdAndIntQueue.OrdAndInt) q.newOrdAndValue();
}
reuse.ord = ord;
reuse.value = count;
reuse = (TopOrdAndIntQueue.OrdAndInt) q.insertWithOverflow(reuse);
}
} else if (denseCounts != null) {
for (int i = 0; i < denseCounts.length; i++) {
int count = denseCounts[i];
if (count != 0) {
childCount++;
if (q == null) {
            // Lazy init:
q = new TopOrdAndIntQueue(topN);
}
if (reuse == null) {
reuse = (TopOrdAndIntQueue.OrdAndInt) q.newOrdAndValue();
}
reuse.ord = i;
reuse.value = count;
reuse = (TopOrdAndIntQueue.OrdAndInt) q.insertWithOverflow(reuse);
}
}
}
int resultCount = q == null ? 0 : q.size();
LabelAndValue[] labelValues = new LabelAndValue[resultCount];
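    // The queue pops its smallest entry first, so fill the result array back-to-front to
    // produce labels in descending-count order: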
for (int i = labelValues.length - 1; i >= 0; i--) {
TopOrdAndIntQueue.OrdAndInt ordAndValue = (TopOrdAndIntQueue.OrdAndInt) q.pop();
final BytesRef term = docValues.lookupOrd(ordAndValue.ord);
labelValues[i] = new LabelAndValue(term.utf8ToString(), ordAndValue.value);
}
return new FacetResult(field, new String[0], totalDocCount, labelValues, childCount);
}
@Override
public Number getSpecificValue(String dim, String... path) throws IOException {
if (dim.equals(field) == false) {
throw new IllegalArgumentException(
"invalid dim \"" + dim + "\"; should be \"" + field + "\"");
}
if (path.length != 1) {
throw new IllegalArgumentException("path must be length=1");
}
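    // lookupTerm returns a negative ordinal if the term is not present in the field: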
int ord = (int) docValues.lookupTerm(new BytesRef(path[0]));
if (ord < 0) {
return -1;
}
if (sparseCounts != null) {
return sparseCounts.get(ord);
}
if (denseCounts != null) {
return denseCounts[ord];
}
return 0;
}
@Override
  public List<FacetResult> getAllDims(int topN) throws IOException {
validateTopN(topN);
return Collections.singletonList(getTopChildren(topN, field));
}
private SortedSetDocValues getDocValues() throws IOException {
    List<LeafReaderContext> leaves = reader.leaves();
int leafCount = leaves.size();
if (leafCount == 0) {
return DocValues.emptySortedSet();
}
if (leafCount == 1) {
return DocValues.getSortedSet(leaves.get(0).reader(), field);
}
// A good bit of this logic is forked from MultiDocValues so we can re-use an ordinal map
SortedSetDocValues[] docValues = new SortedSetDocValues[leafCount];
int[] starts = new int[leafCount + 1];
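    // starts[i] records the docBase of leaf i so the merged view can map a global doc ID back
    // to the right per-segment values: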
long cost = 0;
for (int i = 0; i < leafCount; i++) {
LeafReaderContext context = leaves.get(i);
SortedSetDocValues dv = DocValues.getSortedSet(context.reader(), field);
docValues[i] = dv;
starts[i] = context.docBase;
cost += dv.cost();
}
starts[leafCount] = reader.maxDoc();
return new MultiDocValues.MultiSortedSetDocValues(docValues, starts, ordinalMap, cost);
}
private void count(FacetsCollector facetsCollector) throws IOException {
    List<FacetsCollector.MatchingDocs> matchingDocs = facetsCollector.getMatchingDocs();
if (matchingDocs.isEmpty()) {
return;
}
if (matchingDocs.size() == 1) {
FacetsCollector.MatchingDocs hits = matchingDocs.get(0);
if (hits.totalHits() == 0) {
return;
}
// Validate state before doing anything else:
validateState(hits.context());
// Assuming the state is valid, ordinalMap should be null since we have one segment:
assert ordinalMap == null;
countOneSegment(docValues, hits.context().ord, hits, null);
} else {
// Validate state before doing anything else. We only check the first segment since they
// should all ladder up to the same top-level reader:
validateState(matchingDocs.get(0).context());
for (FacetsCollector.MatchingDocs hits : matchingDocs) {
// Assuming the state is valid, ordinalMap should be non-null and docValues should be
// a MultiSortedSetDocValues since we have more than one segment:
assert ordinalMap != null;
assert docValues instanceof MultiDocValues.MultiSortedSetDocValues;
if (hits.totalHits() == 0) {
continue;
}
MultiDocValues.MultiSortedSetDocValues multiValues =
(MultiDocValues.MultiSortedSetDocValues) docValues;
countOneSegment(multiValues.values[hits.context().ord], hits.context().ord, hits, null);
}
}
}
private void countAll() throws IOException {
    List<LeafReaderContext> leaves = reader.leaves();
int numLeaves = leaves.size();
if (numLeaves == 0) {
return;
}
if (numLeaves == 1) {
// Since we have a single segment, ordinalMap should be null:
assert ordinalMap == null;
LeafReaderContext context = leaves.get(0);
Bits liveDocs = context.reader().getLiveDocs();
if (liveDocs == null) {
countOneSegmentNHLD(docValues, context.ord);
} else {
countOneSegment(docValues, context.ord, null, liveDocs);
}
} else {
// Since we have more than one segment, we should have a non-null ordinalMap and docValues
// should be a MultiSortedSetDocValues instance:
assert ordinalMap != null;
assert docValues instanceof MultiDocValues.MultiSortedSetDocValues;
MultiDocValues.MultiSortedSetDocValues multiValues =
(MultiDocValues.MultiSortedSetDocValues) docValues;
for (int i = 0; i < numLeaves; i++) {
LeafReaderContext context = leaves.get(i);
Bits liveDocs = context.reader().getLiveDocs();
if (liveDocs == null) {
countOneSegmentNHLD(multiValues.values[i], context.ord);
} else {
countOneSegment(multiValues.values[i], context.ord, null, liveDocs);
}
}
}
}
private void countOneSegment(
SortedSetDocValues multiValues,
int segmentOrd,
FacetsCollector.MatchingDocs hits,
Bits liveDocs)
throws IOException {
if (initialized == false) {
assert denseCounts == null && sparseCounts == null;
// If the counters weren't initialized, we can assume the cardinality is low enough that
      // dense counting will be preferable:
denseCounts = new int[cardinality];
initialized = true;
}
// It's slightly more efficient to work against SortedDocValues if the field is actually
// single-valued (see: LUCENE-5309)
SortedDocValues singleValues = DocValues.unwrapSingleton(multiValues);
DocIdSetIterator valuesIt = singleValues != null ? singleValues : multiValues;
// Intersect hits with doc values unless we're "counting all," in which case we'll iterate
// all doc values for this segment:
DocIdSetIterator it;
if (hits == null) {
assert liveDocs != null;
it = FacetUtils.liveDocsDISI(valuesIt, liveDocs);
} else {
it = ConjunctionUtils.intersectIterators(Arrays.asList(hits.bits().iterator(), valuesIt));
}
// TODO: yet another option is to count all segs
// first, only in seg-ord space, and then do a
// merge-sort-PQ in the end to only "resolve to
// global" those seg ords that can compete, if we know
// we just want top K? ie, this is the same algo
// that'd be used for merging facets across shards
// (distributed faceting). but this has much higher
// temp ram req'ts (sum of number of ords across all
// segs)
if (ordinalMap == null) {
// If there's no ordinal map we don't need to map segment ordinals to globals, so counting
      // is very straightforward:
if (singleValues != null) {
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
increment(singleValues.ordValue());
totalDocCount++;
}
} else {
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
boolean countedDocInTotal = false;
for (int i = 0; i < multiValues.docValueCount(); i++) {
int term = (int) multiValues.nextOrd();
increment(term);
if (countedDocInTotal == false) {
totalDocCount++;
countedDocInTotal = true;
}
}
}
}
} else {
// We need to map segment ordinals to globals. We have two different approaches to this
// depending on how many hits we have to count relative to how many unique doc val ordinals
// there are in this segment:
final LongValues ordMap = ordinalMap.getGlobalOrds(segmentOrd);
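      // For example, if ord 3 in this segment corresponds to global ord 17, then ordMap.get(3)
      // returns 17 and the count is applied to global slot 17.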
int segmentCardinality = (int) multiValues.getValueCount();
if (hits != null && hits.totalHits() < segmentCardinality / 10) {
// Remap every ord to global ord as we iterate:
if (singleValues != null) {
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
increment((int) ordMap.get(singleValues.ordValue()));
totalDocCount++;
}
} else {
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
boolean countedDocInTotal = false;
for (int i = 0; i < multiValues.docValueCount(); i++) {
int term = (int) multiValues.nextOrd();
increment((int) ordMap.get(term));
if (countedDocInTotal == false) {
totalDocCount++;
countedDocInTotal = true;
}
}
}
}
} else {
// First count in seg-ord space.
// At this point, we're either counting all ordinals or our heuristic suggests that
// we expect to visit a large percentage of the unique ordinals (lots of hits relative
// to the segment cardinality), so we count the segment densely:
final int[] segCounts = new int[segmentCardinality];
if (singleValues != null) {
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
segCounts[singleValues.ordValue()]++;
totalDocCount++;
}
} else {
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
boolean countedDocInTotal = false;
for (int i = 0; i < multiValues.docValueCount(); i++) {
int term = (int) multiValues.nextOrd();
segCounts[term]++;
if (countedDocInTotal == false) {
totalDocCount++;
countedDocInTotal = true;
}
}
}
}
// Then, migrate to global ords:
for (int ord = 0; ord < segmentCardinality; ord++) {
int count = segCounts[ord];
if (count != 0) {
increment((int) ordMap.get(ord), count);
}
}
}
}
}
  // Variant of countOneSegment for the "count all" case: no hits to intersect and no live docs
  // (NHLD), i.e. the segment has no deletions.
private void countOneSegmentNHLD(SortedSetDocValues multiValues, int segmentOrd)
throws IOException {
// It's slightly more efficient to work against SortedDocValues if the field is actually
// single-valued (see: LUCENE-5309)
SortedDocValues singleValues = DocValues.unwrapSingleton(multiValues);
if (ordinalMap == null) {
// If there's no ordinal map we don't need to map segment ordinals to globals, so counting
      // is very straightforward:
if (singleValues != null) {
for (int doc = singleValues.nextDoc();
doc != DocIdSetIterator.NO_MORE_DOCS;
doc = singleValues.nextDoc()) {
increment(singleValues.ordValue());
totalDocCount++;
}
} else {
for (int doc = multiValues.nextDoc();
doc != DocIdSetIterator.NO_MORE_DOCS;
doc = multiValues.nextDoc()) {
boolean countedDocInTotal = false;
for (int i = 0; i < multiValues.docValueCount(); i++) {
int term = (int) multiValues.nextOrd();
increment(term);
if (countedDocInTotal == false) {
totalDocCount++;
countedDocInTotal = true;
}
}
}
}
} else {
// We need to map segment ordinals to globals. We have two different approaches to this
// depending on how many hits we have to count relative to how many unique doc val ordinals
// there are in this segment:
final LongValues ordMap = ordinalMap.getGlobalOrds(segmentOrd);
int segmentCardinality = (int) multiValues.getValueCount();
// First count in seg-ord space.
// At this point, we're either counting all ordinals or our heuristic suggests that
// we expect to visit a large percentage of the unique ordinals (lots of hits relative
// to the segment cardinality), so we count the segment densely:
final int[] segCounts = new int[segmentCardinality];
if (singleValues != null) {
for (int doc = singleValues.nextDoc();
doc != DocIdSetIterator.NO_MORE_DOCS;
doc = singleValues.nextDoc()) {
segCounts[singleValues.ordValue()]++;
totalDocCount++;
}
} else {
for (int doc = multiValues.nextDoc();
doc != DocIdSetIterator.NO_MORE_DOCS;
doc = multiValues.nextDoc()) {
boolean countedDocInTotal = false;
for (int i = 0; i < multiValues.docValueCount(); i++) {
int term = (int) multiValues.nextOrd();
segCounts[term]++;
if (countedDocInTotal == false) {
totalDocCount++;
countedDocInTotal = true;
}
}
}
}
// Then, migrate to global ords:
for (int ord = 0; ord < segmentCardinality; ord++) {
int count = segCounts[ord];
if (count != 0) {
increment((int) ordMap.get(ord), count);
}
}
}
}
private void increment(int ordinal) {
increment(ordinal, 1);
}
private void increment(int ordinal, int amount) {
if (sparseCounts != null) {
sparseCounts.addTo(ordinal, amount);
} else {
denseCounts[ordinal] += amount;
}
}
private void validateState(LeafReaderContext context) {
// LUCENE-5090: make sure the provided reader context "matches"
// the top-level reader passed to the
// SortedSetDocValuesReaderState, else cryptic
// AIOOBE can happen:
if (ReaderUtil.getTopLevelContext(context).reader() != reader) {
throw new IllegalStateException(
"the SortedSetDocValuesReaderState provided to this class does not match the reader being searched; you must create a new SortedSetDocValuesReaderState every time you open a new IndexReader");
}
}
private void validateDimAndPathForGetChildren(String dim, String... path) {
if (dim.equals(field) == false) {
throw new IllegalArgumentException(
"invalid dim \"" + dim + "\"; should be \"" + field + "\"");
}
if (path.length != 0) {
throw new IllegalArgumentException("path.length should be 0");
}
}
}