/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.search.facet;

import java.io.IOException;
import java.text.ParseException;
import java.util.function.IntFunction;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiDocValues;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.SimpleCollector;
import org.apache.lucene.util.BitUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LongValues;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocSetUtil;
import org.apache.solr.search.facet.SlotAcc.SlotContext;

/**
 * Facets numbers into a hash table. The number is either a raw numeric DocValues value, or a term
 * global ordinal integer. Limitations:
 *
 * <ul>
 *   <li>doesn't handle prefix, but could easily be added
 *   <li>doesn't handle mincount==0 -- you're better off with an array alg
 * </ul>
 */
class FacetFieldProcessorByHashDV extends FacetFieldProcessor {

  // must be a power of two, non-final to support setting by tests
  static int MAXIMUM_STARTING_TABLE_SIZE = 1024;

  /** a hash table with long keys (what we're counting) and integer values (counts) */
  private static class LongCounts {

    static final float LOAD_FACTOR = 0.7f;

    long[] vals;
    // maintain the counts here since we need them to tell if there was actually a value anyway
    long[] counts;
    long[] oldToNewMapping;

    int cardinality;
    int threshold;

    /** sz must be a power of two */
    LongCounts(int sz) {
      vals = new long[sz];
      counts = new long[sz];
      threshold = (int) (sz * LOAD_FACTOR);
    }

    /** Current number of slots in the hash table */
    int numSlots() {
      return vals.length;
    }

    private int hash(long val) {
      // For floats: exponent bits start at bit 23 for single precision, and bit 52 for double
      // precision. Many values will only have significant bits just to the right of that, and the
      // leftmost bits will all be zero.
      // For now, lets just settle to get first 8 significant mantissa bits of double or float in
      // the lowest bits of our hash. The upper bits of our hash will be irrelevant.
      int h = (int) (val + (val >>> 44) + (val >>> 15));
      return h;
    }

    /** returns the slot */
    int add(long val) {
      if (cardinality >= threshold) {
        rehash();
      }

      int h = hash(val);
      for (int slot = h & (vals.length - 1); ; slot = (slot + ((h >> 7) | 1)) & (vals.length - 1)) {
        long count = counts[slot];
        if (count == 0) {
          counts[slot] = 1;
          vals[slot] = val;
          cardinality++;
          return slot;
        } else if (vals[slot] == val) {
          // val is already in the set
          counts[slot] = count + 1;
          return slot;
        }
      }
    }

    protected void rehash() {
      long[] oldVals = vals;
      // after retrieving the count, this array is reused as a mapping to new array
      long[] oldCounts = counts;
      int newCapacity = vals.length << 1;
      vals = new long[newCapacity];
      counts = new long[newCapacity];
      threshold = (int) (newCapacity * LOAD_FACTOR);

      for (int i = 0; i < oldVals.length; i++) {
        long count = oldCounts[i];
        if (count == 0) {
          oldCounts[i] = -1;
          continue;
        }

        long val = oldVals[i];

        int h = hash(val);
        int slot = h & (vals.length - 1);
        while (counts[slot] != 0) {
          slot = (slot + ((h >> 7) | 1)) & (vals.length - 1);
        }
        counts[slot] = count;
        vals[slot] = val;
        oldCounts[i] = slot;
      }

      oldToNewMapping = oldCounts;
    }

    int cardinality() {
      return cardinality;
    }
  }
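
  // For illustration: LongCounts is an open-addressing table. Because the capacity is always a
  // power of two and the probe step ((h >> 7) | 1) is always odd, the step is coprime with the
  // capacity, so a collision chain eventually visits every slot. A probe looks roughly like:
  //
  //   int mask = vals.length - 1;   // capacity is a power of two
  //   int slot = h & mask;          // initial bucket from the low hash bits
  //   int step = (h >> 7) | 1;      // odd increment derived from higher hash bits
  //   slot = (slot + step) & mask;  // next bucket after a collision, wrapping around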

  /** A hack instance of Calc for Term ordinals in DocValues. */
  // TODO consider making FacetRangeProcessor.Calc facet top level; then less of a hack?
  private class TermOrdCalc extends FacetRangeProcessor.Calc {

    IntFunction<BytesRef> lookupOrdFunction; // set in collectDocs()!

    TermOrdCalc() throws IOException {
      super(sf);
    }

    @Override
    public long bitsToSortableBits(long globalOrd) {
      return globalOrd;
    }

    /** To be returned in "buckets"/"val" */
    @Override
    @SuppressWarnings({"rawtypes"})
    public Comparable bitsToValue(long globalOrd) {
      BytesRef bytesRef = lookupOrdFunction.apply((int) globalOrd);
      // note FacetFieldProcessorByArray.findTopSlots also calls SchemaFieldType.toObject
      return sf.getType().toObject(sf, bytesRef).toString();
    }

    @Override
    @SuppressWarnings({"rawtypes"})
    public String formatValue(Comparable val) {
      return (String) val;
    }

    @Override
    @SuppressWarnings({"rawtypes"})
    protected Comparable parseStr(String rawval) throws ParseException {
      throw new UnsupportedOperationException();
    }

    @Override
    @SuppressWarnings({"rawtypes"})
    protected Comparable parseAndAddGap(Comparable value, String gap) throws ParseException {
      throw new UnsupportedOperationException();
    }
  }

  FacetRangeProcessor.Calc calc;
  LongCounts table;
  int allBucketsSlot = -1;

  FacetFieldProcessorByHashDV(FacetContext fcontext, FacetField freq, SchemaField sf) {
    super(fcontext, freq, sf);
    if (freq.mincount == 0) {
      throw new SolrException(
          SolrException.ErrorCode.BAD_REQUEST, getClass() + " doesn't support mincount=0");
    }
    if (freq.prefix != null) {
      throw new SolrException(
          SolrException.ErrorCode.BAD_REQUEST,
          getClass() + " doesn't support prefix"); // yet, but it could
    }
    FieldInfo fieldInfo = fcontext.searcher.getFieldInfos().fieldInfo(sf.getName());
    if (fieldInfo != null
        && fieldInfo.getDocValuesType() != DocValuesType.NUMERIC
        && fieldInfo.getDocValuesType() != DocValuesType.SORTED
        && fieldInfo.getDocValuesType() != DocValuesType.SORTED_NUMERIC) {
      throw new SolrException(
          SolrException.ErrorCode.BAD_REQUEST,
          getClass() + " only support single valued number/string with docValues");
    }
  }

  @Override
  public void process() throws IOException {
    super.process();
    response = calcFacets();
    table = null; // gc
  }

  private SimpleOrderedMap<Object> calcFacets() throws IOException {
    if (sf.getType().getNumberType() != null) {
      calc = FacetRangeProcessor.getNumericCalc(sf);
    } else {
      calc = new TermOrdCalc(); // kind of a hack
    }

    // TODO: Use the number of indexed terms, if present, as an estimate!
    // Even for NumericDocValues, we could check for a terms index for an estimate.
    // Our estimation should aim high to avoid expensive rehashes.
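    // For example (illustrative numbers): with a base DocSet of 1000 documents,
    // 1000 * (1 / 0.7) + 1 is roughly 1429, the next power of two is 2048, and the result is
    // then capped at MAXIMUM_STARTING_TABLE_SIZE (1024 by default).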
    int possibleValues = fcontext.base.size();
    // size smaller tables so that no resize will be necessary
    int currHashSize =
        BitUtil.nextHighestPowerOfTwo((int) (possibleValues * (1 / LongCounts.LOAD_FACTOR) + 1));
    currHashSize = Math.min(currHashSize, MAXIMUM_STARTING_TABLE_SIZE);
    table =
        new LongCounts(currHashSize) {
          @Override
          protected void rehash() {
            super.rehash();
            doRehash(this);
            oldToNewMapping = null; // allow for gc
          }
        };

    // note: these methods/phases align with FacetFieldProcessorByArray's
    createCollectAcc();

    collectDocs();

    return super.findTopSlots(
        table.numSlots(),
        table.cardinality(),
        slotNum -> calc.bitsToValue(table.vals[slotNum]), // getBucketValFromSlotNum
        val -> calc.formatValue(val)); // getFieldQueryVal
  }

  private void createCollectAcc() throws IOException {
    int numSlots = table.numSlots();

    if (freq.allBuckets) {
      allBucketsSlot = numSlots++;
    }

    indexOrderAcc =
        new SlotAcc(fcontext) {
          @Override
          public void collect(int doc, int slot, IntFunction<SlotContext> slotContext)
              throws IOException {}

          @Override
          public int compare(int slotA, int slotB) {
            long s1 = calc.bitsToSortableBits(table.vals[slotA]);
            long s2 = calc.bitsToSortableBits(table.vals[slotB]);
            return Long.compare(s1, s2);
          }

          @Override
          public Object getValue(int slotNum) throws IOException {
            return null;
          }

          @Override
          public void reset() {}

          @Override
          public void resize(Resizer resizer) {}
        };

    countAcc =
        new SlotAcc.CountSlotAcc(fcontext) {
          @Override
          public void incrementCount(int slot, long count) {
            throw new UnsupportedOperationException();
          }

          @Override
          public long getCount(int slot) {
            return table.counts[slot];
          }

          @Override
          public Object getValue(int slotNum) {
            return getCount(slotNum);
          }

          @Override
          public void reset() {
            throw new UnsupportedOperationException();
          }

          @Override
          public void collect(int doc, int slot, IntFunction<SlotContext> slotContext)
              throws IOException {
            throw new UnsupportedOperationException();
          }

          @Override
          public int compare(int slotA, int slotB) {
            return Long.compare(table.counts[slotA], table.counts[slotB]);
          }

          @Override
          public void resize(Resizer resizer) {
            throw new UnsupportedOperationException();
          }
        };

    // we set the countAcc & indexAcc first so generic ones won't be created for us.
    super.createCollectAcc(fcontext.base.size(), numSlots);

    if (freq.allBuckets) {
      allBucketsAcc = new SpecialSlotAcc(fcontext, collectAcc, allBucketsSlot, otherAccs, 0);
    }
  }
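
  // Background note: for string fields the hash keys are global ordinals. When the index has
  // multiple segments, the per-segment ordinal of each collected document is translated to a
  // global ordinal through the MultiSortedDocValues ordinal map before being added to the
  // table, roughly:
  //
  //   long globalOrd = toGlobal.get(docValues.ordValue());
  //   collectValFirstPhase(segDoc, globalOrd);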

  private void collectDocs() throws IOException {
    if (calc instanceof TermOrdCalc) { // Strings

      // TODO support SortedSetDocValues
      SortedDocValues globalDocValues = FieldUtil.getSortedDocValues(fcontext.qcontext, sf, null);
      ((TermOrdCalc) calc).lookupOrdFunction =
          ord -> {
            try {
              return globalDocValues.lookupOrd(ord);
            } catch (IOException e) {
              throw new RuntimeException(e);
            }
          };

      DocSetUtil.collectSortedDocSet(
          fcontext.base,
          fcontext.searcher.getIndexReader(),
          new SimpleCollector() {
            SortedDocValues docValues = globalDocValues; // this segment/leaf. NN
            LongValues toGlobal = LongValues.IDENTITY; // this segment to global ordinal. NN

            @Override
            public ScoreMode scoreMode() {
              return ScoreMode.COMPLETE_NO_SCORES;
            }

            @Override
            protected void doSetNextReader(LeafReaderContext ctx) throws IOException {
              setNextReaderFirstPhase(ctx);
              if (globalDocValues instanceof MultiDocValues.MultiSortedDocValues) {
                MultiDocValues.MultiSortedDocValues multiDocValues =
                    (MultiDocValues.MultiSortedDocValues) globalDocValues;
                docValues = multiDocValues.values[ctx.ord];
                toGlobal = multiDocValues.mapping.getGlobalOrds(ctx.ord);
              }
            }

            @Override
            public void collect(int segDoc) throws IOException {
              if (docValues.advanceExact(segDoc)) {
                long val = toGlobal.get(docValues.ordValue());
                collectValFirstPhase(segDoc, val);
              }
            }
          });

    } else { // Numeric:

      if (sf.multiValued()) {
        DocSetUtil.collectSortedDocSet(
            fcontext.base,
            fcontext.searcher.getIndexReader(),
            new SimpleCollector() {
              SortedNumericDocValues values = null; // NN

              @Override
              public ScoreMode scoreMode() {
                return ScoreMode.COMPLETE_NO_SCORES;
              }

              @Override
              protected void doSetNextReader(LeafReaderContext ctx) throws IOException {
                setNextReaderFirstPhase(ctx);
                values = DocValues.getSortedNumeric(ctx.reader(), sf.getName());
              }

              @Override
              public void collect(int segDoc) throws IOException {
                if (values.advanceExact(segDoc)) {
                  long l = values.nextValue(); // This document must have at least one value
                  collectValFirstPhase(segDoc, l);
                  for (int i = 1, count = values.docValueCount(); i < count; i++) {
                    long lnew = values.nextValue();
                    // Skip the value if it's equal to the last one, we don't want to double-count
                    // it
                    if (lnew != l) {
                      collectValFirstPhase(segDoc, lnew);
                    }
                    l = lnew;
                  }
                }
              }
            });
      } else {
        DocSetUtil.collectSortedDocSet(
            fcontext.base,
            fcontext.searcher.getIndexReader(),
            new SimpleCollector() {
              NumericDocValues values = null; // NN

              @Override
              public ScoreMode scoreMode() {
                return ScoreMode.COMPLETE_NO_SCORES;
              }

              @Override
              protected void doSetNextReader(LeafReaderContext ctx) throws IOException {
                setNextReaderFirstPhase(ctx);
                values = DocValues.getNumeric(ctx.reader(), sf.getName());
              }

              @Override
              public void collect(int segDoc) throws IOException {
                if (values.advanceExact(segDoc)) {
                  collectValFirstPhase(segDoc, values.longValue());
                }
              }
            });
      }
    }
  }
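
  // Note on the multi-valued numeric case above: SortedNumericDocValues returns a document's
  // values in non-decreasing order, so comparing each value only with the previous one is enough
  // to skip duplicates, e.g. the per-document sequence 3, 3, 7 contributes the keys 3 and 7
  // exactly once each.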

  private void collectValFirstPhase(int segDoc, long val) throws IOException {
    int slot = table.add(val); // this can trigger a rehash

    // Our countAcc is virtual, so this is not needed:
    // countAcc.incrementCount(slot, 1);

    super.collectFirstPhase(segDoc, slot, slotContext);
  }

  /**
   * SlotContext to use during all {@link SlotAcc} collection.
   *
   * <p>This avoids a memory allocation for each invocation of collectValFirstPhase.
   */
  private IntFunction<SlotContext> slotContext =
      (slotNum) -> {
        long val = table.vals[slotNum];
        @SuppressWarnings({"rawtypes"})
        Comparable value = calc.bitsToValue(val);
        return new SlotContext(sf.getType().getFieldTermQuery(null, sf, calc.formatValue(value)));
      };

  private void doRehash(LongCounts table) {
    if (collectAcc == null && allBucketsAcc == null) return;

    // Our "count" acc is backed by the hash table and will already be rehashed
    // otherAccs don't need to be rehashed

    int newTableSize = table.numSlots();
    int numSlots = newTableSize;
    final int oldAllBucketsSlot = allBucketsSlot;
    if (oldAllBucketsSlot >= 0) {
      allBucketsSlot = numSlots++;
    }

    final int finalNumSlots = numSlots;
    final long[] mapping = table.oldToNewMapping;

    SlotAcc.Resizer resizer =
        new SlotAcc.Resizer() {
          @Override
          public int getNewSize() {
            return finalNumSlots;
          }

          @Override
          public int getNewSlot(int oldSlot) {
            if (oldSlot < mapping.length) {
              return (int) mapping[oldSlot];
            }
            if (oldSlot == oldAllBucketsSlot) {
              return allBucketsSlot;
            }
            return -1;
          }
        };

    // NOTE: resizing isn't strictly necessary for missing/allBuckets... we could just set the new
    // slot directly
    if (collectAcc != null) {
      collectAcc.resize(resizer);
    }
    if (allBucketsAcc != null) {
      allBucketsAcc.resize(resizer);
    }
  }
}
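
// Usage sketch (illustrative, not part of the class above): this processor is reached through
// the JSON Facet API when a terms facet on a docValues field asks for the hash-based method,
// e.g. a request body along the lines of the following, where "price_d" is a hypothetical
// single-valued numeric docValues field:
//
//   {
//     "query": "*:*",
//     "facet": {
//       "prices": { "type": "terms", "field": "price_d", "method": "dvhash" }
//     }
//   }
//
// As the constructor above enforces, requests with mincount=0 or a prefix are rejected and fall
// back to other processors.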