org.apache.solr.handler.component.StatsField Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of solr-core Show documentation
Apache Solr (module: core)
There is a newer version: 9.7.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.component;

import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.EnumSet;
import java.util.IdentityHashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.queries.function.FunctionQuery;
import org.apache.lucene.queries.function.ValueSource;
import org.apache.lucene.queries.function.valuesource.FieldCacheSource;
import org.apache.lucene.queries.function.valuesource.QueryValueSource;
import org.apache.lucene.search.Query;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.params.StatsParams;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.request.DocValuesStats;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.NumberType;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.QParser;
import org.apache.solr.search.QParserPlugin;
import org.apache.solr.search.QueryParsing;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.search.SyntaxError;
import org.apache.solr.util.hll.HLL;
import org.apache.solr.util.hll.HLLType;

/**
 * Models all of the information associated with a single {@link StatsParams#STATS_FIELD} instance.
 *
 * @see StatsComponent
 */
public class StatsField {

  /**
   * An enumeration representing the super set of all possible stat values that can be computed.
   * Each of these enum values can be specified as a local param in a <code>stats.field</code> (eg:
   * <code>stats.field={!min=true mean=true}my_field_name</code>) but not all enum values are valid
   * for all field types (eg: <code>mean</code> is meaningless for String fields)
   *
   * @lucene.internal
   * @lucene.experimental
   */
  @SuppressWarnings("ImmutableEnumChecker")
  public enum Stat {
    min(true),
    max(true),
    missing(true),
    sum(true),
    count(true),
    mean(false, sum, count),
    sumOfSquares(true),
    stddev(false, sum, count, sumOfSquares),
    distinctValues(true),
    countDistinct(false, distinctValues),
    percentiles(true) {
      /**
       * Special handling for percentiles: the local param value is a comma separated list of
       * numbers rather than a boolean. Every parsed value is appended to the percentiles list of
       * the {@link StatsField}, and the optional <code>tdigestCompression</code> local param is
       * read as well.
       *
       * @return true if at least one percentile was specified, else false
       * @throws SolrException (BAD_REQUEST) if any of the values is not a parsable number
       */
      @Override
      boolean parseParams(StatsField sf) {
        String percentileParas = sf.localParams.get(this.name());
        if (percentileParas != null) {
          List<Double> percentiles = new ArrayList<>();
          try {
            for (String percentile : StrUtils.splitSmart(percentileParas, ',')) {
              percentiles.add(Double.parseDouble(percentile));
            }
            if (!percentiles.isEmpty()) {
              sf.percentilesList.addAll(percentiles);
              sf.tdigestCompression =
                  sf.localParams.getDouble("tdigestCompression", sf.tdigestCompression);
              return true;
            }
          } catch (NumberFormatException e) {
            throw new SolrException(
                ErrorCode.BAD_REQUEST,
                "Unable to parse "
                    + StatsParams.STATS_FIELD
                    + " local params: "
                    + sf.localParams
                    + " due to: "
                    + e.getMessage(),
                e);
          }
        }
        return false;
      }
    },
    cardinality(true) {
      /**
       * Special handling for cardinality: the local params are parsed into an {@link HllOptions}
       * instance which is stored on the {@link StatsField}.
       *
       * @return true if the parsed options are non-null (ie: an HLL should be computed)
       * @throws SolrException (BAD_REQUEST) wrapping any failure to parse the options
       */
      @Override
      boolean parseParams(StatsField sf) {
        try {
          sf.hllOpts = HllOptions.parseHllOptions(sf.localParams, sf.schemaField);
          return (null != sf.hllOpts);
        } catch (Exception e) {
          throw new SolrException(
              ErrorCode.BAD_REQUEST,
              "Unable to parse "
                  + StatsParams.STATS_FIELD
                  + " local params: "
                  + sf.localParams
                  + " due to: "
                  + e.getMessage(),
              e);
        }
      }
    };

    /** The stats each shard must compute so that this stat can be computed over all shards. */
    private final List<Stat> distribDeps;

    /**
     * Sole constructor for Stat enum values
     *
     * @param selfDep indicates that when computing this stat across a distributed result set, each
     *     shard must compute this stat <i>in addition to</i> any other distributed dependencies.
     * @param deps the set of stat values, other than this one, which are a distributed dependency
     *     and must be computed and returned by each individual shard in order to compute
     *     <i>this</i> stat over the entire distributed result set.
     * @see #getDistribDeps
     */
    Stat(boolean selfDep, Stat... deps) {
      distribDeps = new ArrayList<>(deps.length + 1);
      distribDeps.addAll(Arrays.asList(deps));
      if (selfDep) {
        distribDeps.add(this);
      }
    }

    /** Given a String, returns the corresponding Stat enum value if any, otherwise returns null. */
    public static Stat forName(String paramKey) {
      try {
        return Stat.valueOf(paramKey);
      } catch (IllegalArgumentException e) {
        // not the name of a stat
        return null;
      }
    }

    /**
     * The stats that must be computed and returned by each shard involved in a distributed request
     * in order to compute the overall value for this stat across the entire distributed result set.
     * A Stat instance may include itself in the <code>getDistribDeps()</code> result, but that is
     * not always the case.
     *
     * @return a new set on every call; changes to it do not affect this Stat
     */
    public EnumSet<Stat> getDistribDeps() {
      return EnumSet.copyOf(this.distribDeps);
    }

    /**
     * Called when the name of a stat is found as a local param on this {@link StatsField}
     *
     * @return true if the user is requesting this stat, else false
     */
    boolean parseParams(StatsField sf) {
      return sf.localParams.getBool(this.name(), false);
    }
  }

  /**
   * The equivalent stats if "calcdistinct" is specified
   *
   * @see Stat#countDistinct
   * @see Stat#distinctValues
   */
  private static final EnumSet<Stat> CALCDISTINCT_PSEUDO_STAT =
      EnumSet.of(Stat.countDistinct, Stat.distinctValues);

  /** The set of stats computed by default when no localparams are used to specify explicit stats */
  public static final Set<Stat> DEFAULT_STATS =
      Collections.unmodifiableSet(
          EnumSet.of(
              Stat.min,
              Stat.max,
              Stat.missing,
              Stat.sum,
              Stat.count,
              Stat.mean,
              Stat.sumOfSquares,
              Stat.stddev));

  private final SolrIndexSearcher searcher;
  private final ResponseBuilder rb;
  private final String originalParam; // for error messages
  private final SolrParams localParams;
  private final ValueSource valueSource; // may be null if simple field stats
  private final SchemaField schemaField; // may be null if function/query stats
  private final String key; // name of this StatsField in the response
  private final boolean topLevelCalcDistinct;
  private final String[] facets; // never null, may be empty
  private final List<String> tagList;
  private final List<String> excludeTagList;
  // everything that has to be computed locally (includes distributed dependencies)
  private final EnumSet<Stat> statsToCalculate = EnumSet.noneOf(Stat.class);
  // the stats that were actually requested
  private final EnumSet<Stat> statsInResponse = EnumSet.noneOf(Stat.class);
  private final List<Double> percentilesList = new ArrayList<>();
  private final boolean isShard;

  private double tdigestCompression = 100.0D;
  private HllOptions hllOpts; // set while parsing the cardinality stat, may stay null

  /**
   * Parses a single {@link StatsParams#STATS_FIELD} param. The local params (if any) are
   * extracted, the param is resolved to either a {@link SchemaField} or a {@link ValueSource}
   * (never both), and the output key, requested stats, facets, tags and excluded tags are
   * initialized.
   *
   * @param rb the current request/response
   * @param statsParam the raw {@link StatsParams#STATS_FIELD} string
   * @throws SolrException (BAD_REQUEST) if the local params name an unknown query parser, or the
   *     param can not be parsed
   */
  public StatsField(ResponseBuilder rb, String statsParam) {
    this.rb = rb;
    this.searcher = rb.req.getSearcher();
    this.originalParam = statsParam;

    SolrParams params = rb.req.getParams();
    try {
      isShard = params.getBool("isShard", false);
      SolrParams localParams = QueryParsing.getLocalParams(originalParam, params);
      if (null == localParams) {
        // simplest possible input: bare string (field name)
        ModifiableSolrParams customParams = new ModifiableSolrParams();
        customParams.add(QueryParsing.V, originalParam);
        localParams = customParams;
      }

      this.localParams = localParams;

      String parserName = localParams.get(QueryParsing.TYPE);
      SchemaField sf = null;
      ValueSource vs = null;

      if (StrUtils.isBlank(parserName)) {

        // basic request for field stats
        sf = searcher.getSchema().getField(localParams.get(QueryParsing.V));

      } else {
        // we have a non trivial request to compute stats over a query (or function)

        // NOTE we could use QParser.getParser(...) here, but that would redundantly
        // reparse everything.  ( TODO: refactor a common method in QParser ?)
        QParserPlugin qplug = rb.req.getCore().getQueryPlugin(parserName);
        if (qplug == null) {
          throw new SolrException(
              SolrException.ErrorCode.BAD_REQUEST,
              "invalid query parser '"
                  + parserName
                  + (originalParam == null ? "'" : "' for query '" + originalParam + "'"));
        }
        QParser qp =
            qplug.createParser(localParams.get(QueryParsing.V), localParams, params, rb.req);

        // figure out what type of query we are dealing, get the most direct ValueSource
        vs = extractValueSource(qp.parse());

        // if this ValueSource directly corresponds to a SchemaField, act as if
        // we were asked to compute stats on it directly
        // ie:  "stats.field={!func key=foo}field(foo)" == "stats.field=foo"
        sf = extractSchemaField(vs, searcher.getSchema());
        if (null != sf) {
          vs = null;
        }
      }

      assert ((null == vs) ^ (null == sf)) : "exactly one of vs & sf must be null";

      this.schemaField = sf;
      this.valueSource = vs;

    } catch (SyntaxError e) {
      throw new SolrException(
          ErrorCode.BAD_REQUEST,
          "Unable to parse "
              + StatsParams.STATS_FIELD
              + ": "
              + originalParam
              + " due to: "
              + e.getMessage(),
          e);
    }

    // NOTE: from here on "localParams" refers to the field assigned above, the local variable
    // of the same name went out of scope with the try block

    // allow explicit setting of the response key via localparams...
    this.key =
        localParams.get(
            CommonParams.OUTPUT_KEY,
            // default to the main param value...
            localParams.get(
                CommonParams.VALUE,
                // default to entire original param str.
                originalParam));

    // the top level calcdistinct param can only be overridden per field when we have a field
    this.topLevelCalcDistinct =
        null == schemaField
            ? params.getBool(StatsParams.STATS_CALC_DISTINCT, false)
            : params.getFieldBool(schemaField.getName(), StatsParams.STATS_CALC_DISTINCT, false);

    // must run after localParams, schemaField and topLevelCalcDistinct are assigned: all three
    // are read while the stat sets are populated
    populateStatsSets();

    String[] facets = params.getFieldParams(key, StatsParams.STATS_FACET);
    this.facets = (null == facets) ? new String[0] : facets;
    String tagStr = localParams.get(CommonParams.TAG);
    this.tagList =
        (null == tagStr) ? Collections.emptyList() : StrUtils.splitSmart(tagStr, ',');

    // figure out if we need a special base DocSet
    String excludeStr = localParams.get(CommonParams.EXCLUDE);
    this.excludeTagList =
        (null == excludeStr)
            ? Collections.emptyList()
            : StrUtils.splitSmart(excludeStr, ',');

    assert ((null == this.valueSource) ^ (null == this.schemaField))
        : "exactly one of valueSource & schemaField must be null";
  }

  /**
   * Produces the {@link ValueSource} to compute stats over for the given {@link Query}: the
   * query's own ValueSource if it is a {@link FunctionQuery}, otherwise a {@link
   * QueryValueSource} wrapped around the query.
   *
   * @param q Query whose scores we have been asked to compute stats of
   * @return a ValueSource to use for computing the stats
   */
  private static ValueSource extractValueSource(Query q) {
    if (q instanceof FunctionQuery) {
      // Common case: we're wrapping a func, so we can directly pull out ValueSource
      final FunctionQuery funcQuery = (FunctionQuery) q;
      return funcQuery.getValueSource();
    }
    // asked to compute stats over a query, wrap it up as a ValueSource
    return new QueryValueSource(q, 0.0F);
  }

  /**
   * Looks up the {@link SchemaField} behind a {@link ValueSource}, provided the ValueSource is a
   * {@link FieldCacheSource}.
   *
   * @param vs ValueSource we've been asked to compute stats of
   * @param schema The Schema to use
   * @return Corresponding {@link SchemaField} or null if the ValueSource is more complex
   * @see FieldCacheSource
   */
  private static SchemaField extractSchemaField(ValueSource vs, IndexSchema schema) {
    if (!(vs instanceof FieldCacheSource)) {
      // not a plain field reference
      return null;
    }
    final FieldCacheSource fieldSource = (FieldCacheSource) vs;
    return schema.getField(fieldSource.getField());
  }

  /**
   * The key to be used when referring to this {@link StatsField} instance in the response to
   * clients.
   *
   * @return the output key assigned when this instance was constructed
   */
  public String getOutputKey() {
    return this.key;
  }

  /**
   * Computes a base {@link DocSet} for the current request to be used when computing global stats
   * for the local index.
   *
   * <p>This is typically the same as the main DocSet for the {@link ResponseBuilder} unless {@link
   * CommonParams#TAG tag}ged filter queries have been excluded using the {@link
   * CommonParams#EXCLUDE ex} local param
   *
   * @return the main DocSet of the request if nothing is excluded, otherwise the DocSet matching
   *     the main query and the filters that are not excluded
   * @throws SolrException (BAD_REQUEST) if one of the excluded queries can not be parsed
   */
  public DocSet computeBaseDocSet() throws IOException {

    DocSet docs = rb.getResults().docSet;
    Map<?, ?> tagMap = (Map<?, ?>) rb.req.getContext().get("tags");

    if (excludeTagList.isEmpty() || null == tagMap) {
      // either the exclude list is empty, or there
      // aren't any tagged filters to exclude anyway.
      return docs;
    }

    // identity semantics: only the exact Query instances produced by the tagged parsers are
    // excluded, not queries that merely happen to be equal to them
    IdentityHashMap<Query, Boolean> excludeSet = new IdentityHashMap<>();
    for (String excludeTag : excludeTagList) {
      Object olst = tagMap.get(excludeTag);
      // tagMap is expected to map a tag name to a collection of QParser instances, but that is
      // subject to change in the future -- so anything else is skipped
      if (!(olst instanceof Collection)) continue;
      for (Object o : (Collection<?>) olst) {
        if (!(o instanceof QParser)) continue;
        QParser qp = (QParser) o;
        try {
          excludeSet.put(qp.getQuery(), Boolean.TRUE);
        } catch (SyntaxError e) {
          // this shouldn't be possible since the request should have already
          // failed when attempting to execute the query, but just in case...
          throw new SolrException(
              ErrorCode.BAD_REQUEST,
              "Excluded query can't be parsed: " + originalParam + " due to: " + e.getMessage(),
              e);
        }
      }
    }
    if (excludeSet.isEmpty()) return docs;

    List<Query> qlist = new ArrayList<>();

    // add the base query
    if (!excludeSet.containsKey(rb.getQuery())) {
      qlist.add(rb.getQuery());
    }

    // add the filters
    if (rb.getFilters() != null) {
      for (Query q : rb.getFilters()) {
        if (!excludeSet.containsKey(q)) {
          qlist.add(q);
        }
      }
    }

    // get the new base docset for this facet
    return searcher.getDocSet(qlist);
  }

  /**
   * Computes the {@link StatsValues} for this {@link StatsField} relative to the specified {@link
   * DocSet}
   *
   * @param base the documents to compute the stats over
   * @return the computed stats; an untouched instance if no stat has to be calculated
   * @see #computeBaseDocSet
   */
  public StatsValues computeLocalStatsValues(DocSet base) throws IOException {

    if (statsToCalculate.isEmpty()) {
      // perf optimization for the case where we compute nothing
      // ie: stats.field={!min=$domin}myfield&domin=false
      return StatsValuesFactory.createStatsValues(this);
    }

    if (null == schemaField || schemaField.getType().isPointField()) {
      // an explicit function ValueSource, or a point field
      return computeLocalValueSourceStats(base);
    }

    final boolean multiValuedSource =
        schemaField.multiValued() || schemaField.getType().multiValuedFieldCache();

    // TODO: should DocValuesStats also be used for single-valued string fields? (should work fine)
    return multiValuedSource
        ? DocValuesStats.getCounts(searcher, this, base, facets)
        // a single valued field we pull from FieldCache
        : computeLocalValueSourceStats(base);
  }

  /**
   * Computes the stats (and the stats of every configured facet field) by visiting each document
   * of the given {@link DocSet} and accumulating it in the index segment it belongs to.
   *
   * @param base the documents to compute the stats over
   * @return the accumulated stats, with one facet entry per configured facet field
   * @throws SolrException (BAD_REQUEST) if one of the facet fields is multi-valued
   */
  private StatsValues computeLocalValueSourceStats(DocSet base) throws IOException {

    IndexSchema schema = searcher.getSchema();

    final StatsValues allstats = StatsValuesFactory.createStatsValues(this);

    List<FieldFacetStats> facetStats = new ArrayList<>();
    for (String facetField : facets) {
      SchemaField fsf = schema.getField(facetField);

      if (fsf.multiValued()) {
        throw new SolrException(
            SolrException.ErrorCode.BAD_REQUEST,
            "Stats can only facet on single-valued fields, not: " + facetField);
      }

      facetStats.add(new FieldFacetStats(searcher, fsf, this));
    }

    final Iterator<LeafReaderContext> ctxIt = searcher.getIndexReader().leaves().iterator();
    LeafReaderContext ctx = null;
    for (DocIterator docsIt = base.iterator(); docsIt.hasNext(); ) {
      final int doc = docsIt.nextDoc();
      if (ctx == null || doc >= ctx.docBase + ctx.reader().maxDoc()) {
        // the doc is beyond the current segment: advance to the segment that contains it
        do {
          ctx = ctxIt.next();
        } while (ctx == null || doc >= ctx.docBase + ctx.reader().maxDoc());
        assert doc >= ctx.docBase;

        // propagate the context among accumulators.
        allstats.setNextReader(ctx);
        for (FieldFacetStats f : facetStats) {
          f.setNextReader(ctx);
        }
      }

      // accumulate, using the doc id relative to the current segment
      allstats.accumulate(doc - ctx.docBase);
      for (FieldFacetStats f : facetStats) {
        f.facet(doc - ctx.docBase);
      }
    }

    for (FieldFacetStats f : facetStats) {
      allstats.addFacet(f.name, f.facetStatsValues);
    }
    return allstats;
  }

  /**
   * The searcher that should be used for processing local stats
   *
   * @return the searcher obtained from the request when this instance was constructed
   * @see SolrQueryRequest#getSearcher
   */
  public SolrIndexSearcher getSearcher() {
    // see AbstractStatsValues.setNextReader
    return this.searcher;
  }

  /**
   * The {@link SchemaField} whose results these stats are computed over, may be null if the stats
   * are computed over the results of a function or query
   *
   * @return the field, or null
   * @see #getValueSource
   */
  public SchemaField getSchemaField() {
    return this.schemaField;
  }

  /**
   * The {@link ValueSource} of a function or query whose results these stats are computed over, may
   * be null if the stats are directly over a {@link SchemaField}
   *
   * @return the value source, or null
   * @see #getSchemaField
   */
  public ValueSource getValueSource() {
    return this.valueSource;
  }

  /**
   * The tags specified via the {@link CommonParams#TAG tag} local param of this stats field.
   *
   * @return the tags, empty (never null) if the local param was not specified
   */
  public List<String> getTagList() {
    return tagList;
  }

  /** Renders the original, unparsed param string this instance was created from. */
  @Override
  public String toString() {
    final StringBuilder rendered = new StringBuilder("StatsField<");
    rendered.append(originalParam).append(">");
    return rendered.toString();
  }

  /**
   * A helper method which inspects the {@link #localParams} associated with this StatsField, and
   * uses them to populate the {@link #statsInResponse} and {@link #statsToCalculate} data
   * structures
   */
  private void populateStatsSets() {
    boolean statSpecifiedByLocalParam = false;
    // local individual stat
    Iterator<String> itParams = localParams.getParameterNamesIterator();

    while (itParams.hasNext()) {
      String paramKey = itParams.next();
      Stat stat = Stat.forName(paramKey);
      if (stat != null) {
        // even a stat that is explicitly disabled (eg: min=false) suppresses the default set
        statSpecifiedByLocalParam = true;
        if (stat.parseParams(this)) {
          statsInResponse.add(stat);
        }
      }
    }

    // if no individual stat setting use the default set
    if (!(statSpecifiedByLocalParam
        // calcdistinct (as a local param) is a pseudo-stat, prevents default set
        || localParams.getBool("calcdistinct", false))) {
      statsInResponse.addAll(DEFAULT_STATS);
    }

    // calcDistinct is a pseudo-stat with optional top level param default behavior
    // if not overridden by the specific individual stats
    if (localParams.getBool("calcdistinct", topLevelCalcDistinct)) {
      for (Stat stat : CALCDISTINCT_PSEUDO_STAT) {
        // assume true, but don't include if specific stat overrides
        if (localParams.getBool(stat.name(), true)) {
          statsInResponse.add(stat);
        }
      }
    }

    // everything in the response has to be calculated, as does everything it depends on
    for (Stat stat : statsInResponse) {
      statsToCalculate.addAll(stat.getDistribDeps());
    }
  }

  /**
   * Whether the given stat has to be computed, either because it was requested or because a
   * requested stat depends on it.
   *
   * @param stat the stat to check
   * @return true if the stat is in the set of stats to calculate
   */
  public boolean calculateStats(Stat stat) {
    final boolean mustCalculate = statsToCalculate.contains(stat);
    return mustCalculate;
  }

  /**
   * Whether the given stat should be part of the response. For a shard request that is every stat
   * that is calculated, otherwise only the stats that were requested.
   *
   * @param stat the stat to check
   * @return true if the stat should be included in the response
   */
  public boolean includeInResponse(Stat stat) {
    return isShard ? statsToCalculate.contains(stat) : statsInResponse.contains(stat);
  }

  /**
   * The percentiles requested via the <code>percentiles</code> local param.
   *
   * @return the internal (mutable) list, empty if no percentiles were requested
   */
  public List<Double> getPercentilesList() {
    return percentilesList;
  }

  /**
   * @return the value of the <code>isShard</code> request param, false if it was not specified
   */
  public boolean getIsShard() {
    return this.isShard;
  }

  /**
   * @return the <code>tdigestCompression</code> to use for percentiles; 100 unless overridden by
   *     the local param of the same name while parsing the percentiles stat
   */
  public double getTdigestCompression() {
    return this.tdigestCompression;
  }

  /**
   * @return the options parsed for the cardinality stat, may be null
   * @see Stat#cardinality
   */
  public HllOptions getHllOptions() {
    return this.hllOpts;
  }

  /**
   * Helper Struct for parsing and encapsulating all of the options related to building a {@link
   * HLL}
   *
   * @see Stat#cardinality
   * @lucene.internal
   */
  public static final class HllOptions {
    final HashFunction hasher;

    // NOTE: this explanation linked to from the java-hll jdocs...
    // https://github.com/aggregateknowledge/postgresql-hll/blob/master/README.markdown#explanation-of-parameters-and-tuning
    // ..if i'm understanding the regwidth chart correctly, a value of 6 should be enough
    // to support any max cardinality given that we're always dealing with hashes and
    // the cardinality of the set of all long values is 2**64 == 1.9e19
    //
    // But i guess that assumes a *perfect* hash and high log2m? ... if the hash algo is imperfect
    // and/or log2m is low (ie: user is less concerned about accuracy), then many diff hash values
    // might fall in the same register (ie: bucket) and having a wider register to count more of
    // them may be useful

    final int log2m;
    final int regwidth;

    static final String ERR =
        "cardinality must be specified as 'true' (for default tunning) or decimal number between 0 and 1 to adjust accuracy vs memory usage (large number is more memory and more accuracy)";

    private HllOptions(int log2m, int regwidth, HashFunction hasher) {
      this.log2m = log2m;
      this.regwidth = regwidth;
      this.hasher = hasher;
    }

    /**
     * Creates an HllOptions based on the (local) params specified (if appropriate).
     *
     * <p>The <code>cardinality</code> local param may either be a boolean, or a number between 0
     * and 1 (inclusive) expressing the desired accuracy. The <code>hllLog2m</code> and <code>
     * hllRegwidth</code> local params override whatever was derived from it.
     *
     * @param localParams the LocalParams for this {@link StatsField}
     * @param field the field corresponding to this {@link StatsField}, may be null if these stats
     *     are over a value source
     * @return the {@link HllOptions} to use based on the params, or null if no {@link HLL} should
     *     be computed
     * @throws SolrException if there are invalid options
     */
    public static HllOptions parseHllOptions(SolrParams localParams, SchemaField field)
        throws SolrException {

      String cardinalityOpt = localParams.get(Stat.cardinality.name());
      if (StrUtils.isBlank(cardinalityOpt)) {
        return null;
      }

      final NumberType hashableNumType = getHashableNumericType(field);

      // some sane defaults
      int log2m = 13; // roughly equivalent to "cardinality='0.33'"
      int regwidth = 6; // with decent hash, this is plenty for all valid long hashes

      if (NumberType.FLOAT.equals(hashableNumType) || NumberType.INTEGER.equals(hashableNumType)) {
        // for 32bit values, we can adjust our default regwidth down a bit
        regwidth--;

        // NOTE: EnumField uses LegacyNumericType.INT, and in theory we could be super conservative
        // with it, but there's no point - just let the EXPLICIT HLL handle it
      }

      // TODO: we could attempt additional reductions in the default regwidth based on index
      // statistics -- but that doesn't seem worth the effort.  for tiny indexes, the
      // EXPLICIT and SPARSE HLL representations have us nicely covered, and in general we don't
      // want to be too aggressive about lowering regwidth or we could get really poor results if
      // log2m is also low and there is heavy hashkey collision

      try {
        // NFE will short out here if it's not a number
        final double accuracyOpt = Double.parseDouble(cardinalityOpt);

        // if a float between 0 and 1 is specified, treat it as a preference of accuracy
        // - 0 means accuracy is not a concern, save RAM
        // - 1 means be as accurate as possible, using as much RAM as needed.

        // NOTE: deliberately written as a negated "in range" test -- every comparison with NaN
        // is false, so "x < 0 || 1 < x" would let NaN through and silently treat it as 0
        if (!(0D <= accuracyOpt && accuracyOpt <= 1.0D)) {
          throw new SolrException(ErrorCode.BAD_REQUEST, ERR);
        }

        // use accuracyOpt as a scaling factor between min & max legal log2m values
        log2m =
            HLL.MINIMUM_LOG2M_PARAM
                + (int)
                    Math.round(accuracyOpt * (HLL.MAXIMUM_LOG2M_PARAM - HLL.MINIMUM_LOG2M_PARAM));

        // use accuracyOpt as a scaling factor for regwidth as well, BUT...
        // be more conservative -- HLL.MIN_REGWIDTH_PARAM is too absurdly low to be useful
        // use previously computed (hashableNumType) default regwidth -1 as lower bound for scaling
        final int minHeuristicRegwidth = regwidth - 1;
        regwidth =
            minHeuristicRegwidth
                + (int)
                    Math.round(accuracyOpt * (HLL.MAXIMUM_REGWIDTH_PARAM - minHeuristicRegwidth));

      } catch (NumberFormatException nfe) {
        // param value isn't a number -- let's check for simple true/false
        if (!localParams.getBool(Stat.cardinality.name(), false)) {
          return null;
        }
      }

      // let explicit params override both the default and/or any accuracy specification
      log2m = localParams.getInt("hllLog2m", log2m);
      regwidth = localParams.getInt("hllRegwidth", regwidth);

      // validate legal values
      if (log2m < HLL.MINIMUM_LOG2M_PARAM || HLL.MAXIMUM_LOG2M_PARAM < log2m) {
        throw new SolrException(
            ErrorCode.BAD_REQUEST,
            "hllLog2m must be at least "
                + HLL.MINIMUM_LOG2M_PARAM
                + " and at most "
                + HLL.MAXIMUM_LOG2M_PARAM
                + " ("
                + log2m
                + ")");
      }
      if (regwidth < HLL.MINIMUM_REGWIDTH_PARAM || HLL.MAXIMUM_REGWIDTH_PARAM < regwidth) {
        throw new SolrException(
            ErrorCode.BAD_REQUEST,
            "hllRegwidth must be at least "
                + HLL.MINIMUM_REGWIDTH_PARAM
                + " and at most "
                + HLL.MAXIMUM_REGWIDTH_PARAM);
      }

      // a null hasher signals that the values are used as their own hash
      HashFunction hasher =
          localParams.getBool("hllPreHashed", false) ? null : Hashing.murmur3_128();

      if (null == hasher) {
        // if this is a function, or a non Long field, pre-hashed is invalid
        // NOTE: we ignore hashableNumType - it's LONG for non numerics like Strings
        if (null == field
            || !(NumberType.LONG.equals(field.getType().getNumberType())
                || NumberType.DATE.equals(field.getType().getNumberType()))) {
          throw new SolrException(
              ErrorCode.BAD_REQUEST, "hllPreHashed is only supported with Long based fields");
        }
      }

      // if we're still here, then we need an HLL...
      return new HllOptions(log2m, regwidth, hasher);
    }

    /**
     * @see HLL
     */
    public int getLog2m() {
      return log2m;
    }

    /**
     * @see HLL
     */
    public int getRegwidth() {
      return regwidth;
    }

    /** May be null if user has indicated that field values are pre-hashed */
    public HashFunction getHasher() {
      return hasher;
    }

    /**
     * Creates a new, empty {@link HLL} configured with the log2m and regwidth of these options.
     */
    public HLL newHLL() {
      // Although it (in theory) saves memory for "medium" size sets, the SPARSE type seems to have
      // some nasty impacts on response time as it gets larger - particularly in distrib requests.
      // Merging large SPARSE HLLs is much much slower than merging FULL HLLs with the same num docs
      //
      // TODO: add more tuning options for this.
      return new HLL(
          getLog2m(),
          getRegwidth(),
          -1 /* auto explicit threshold */,
          false /* no sparse representation */,
          HLLType.EMPTY);
    }
  }

  /**
   * Returns the effective {@link NumberType} for the field for the purposes of hash values. ie: If
   * the field has an explicit NumberType that is returned; If the field has no explicit NumberType
   * then {@link NumberType#LONG} is returned; If field is null, then {@link NumberType#FLOAT} is
   * assumed for ValueSource.
   *
   * @param field the field to inspect, may be null
   * @return the type to assume when hashing values, never null
   */
  private static NumberType getHashableNumericType(SchemaField field) {
    if (field != null) {
      final NumberType explicitType = field.getType().getNumberType();
      return (explicitType != null) ? explicitType : NumberType.LONG;
    }
    // no field at all: the stats are over a ValueSource
    return NumberType.FLOAT;
  }
}