All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.solr.handler.component.StatsField Maven / Gradle / Ivy

There is a newer version: 9.7.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.component;

import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.EnumSet;
import java.util.IdentityHashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.queries.function.FunctionQuery;
import org.apache.lucene.queries.function.ValueSource;
import org.apache.lucene.queries.function.valuesource.FieldCacheSource;
import org.apache.lucene.queries.function.valuesource.QueryValueSource;
import org.apache.lucene.search.Query;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.params.StatsParams;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.request.DocValuesStats;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.NumberType;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.QParser;
import org.apache.solr.search.QParserPlugin;
import org.apache.solr.search.QueryParsing;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.search.SyntaxError;
import org.apache.solr.util.hll.HLL;
import org.apache.solr.util.hll.HLLType;

/**
 * Models all of the information associated with a single {@link StatsParams#STATS_FIELD} instance.
 *
 * @see StatsComponent
 */
public class StatsField {

  /**
   * An enumeration representing the sumer set of all possible stat values that can be computed.
   * Each of these enum values can be specified as a local param in a stats.field (eg:
   * stats.field={!min=true mean=true}my_field_name) but not all enum values are valid
   * for all field types (eg: mean is meaningless for String fields)
   *
   * @lucene.internal
   * @lucene.experimental
   */
  @SuppressWarnings("ImmutableEnumChecker")
  public enum Stat {
    min(true),
    max(true),
    missing(true),
    sum(true),
    count(true),
    mean(false, sum, count),
    sumOfSquares(true),
    stddev(false, sum, count, sumOfSquares),
    distinctValues(true),
    countDistinct(false, distinctValues),
    percentiles(true) {
      /** special for percentiles * */
      @Override
      boolean parseParams(StatsField sf) {
        String percentileParas = sf.localParams.get(this.name());
        if (percentileParas != null) {
          List percentiles = new ArrayList<>();
          try {
            for (String percentile : StrUtils.splitSmart(percentileParas, ',')) {
              percentiles.add(Double.parseDouble(percentile));
            }
            if (!percentiles.isEmpty()) {
              sf.percentilesList.addAll(percentiles);
              sf.tdigestCompression =
                  sf.localParams.getDouble("tdigestCompression", sf.tdigestCompression);
              return true;
            }
          } catch (NumberFormatException e) {
            throw new SolrException(
                ErrorCode.BAD_REQUEST,
                "Unable to parse "
                    + StatsParams.STATS_FIELD
                    + " local params: "
                    + sf.localParams
                    + " due to: "
                    + e.getMessage(),
                e);
          }
        }
        return false;
      }
    },
    cardinality(true) {
      /** special for percentiles * */
      @Override
      boolean parseParams(StatsField sf) {
        try {
          sf.hllOpts = HllOptions.parseHllOptions(sf.localParams, sf.schemaField);
          return (null != sf.hllOpts);
        } catch (Exception e) {
          throw new SolrException(
              ErrorCode.BAD_REQUEST,
              "Unable to parse "
                  + StatsParams.STATS_FIELD
                  + " local params: "
                  + sf.localParams
                  + " due to: "
                  + e.getMessage(),
              e);
        }
      }
    };

    private final List distribDeps;

    /**
     * Sole constructor for Stat enum values
     *
     * @param deps the set of stat values, other then this one, which are a distributed dependency
     *     and must be computed and returned by each individual shards in order to compute
     *     this stat over the entire distributed result set.
     * @param selfDep indicates that when computing this stat across a distributed result set, each
     *     shard must compute this stat in addition to any other distributed dependencies.
     * @see #getDistribDeps
     */
    Stat(boolean selfDep, Stat... deps) {
      distribDeps = new ArrayList<>(deps.length + 1);
      distribDeps.addAll(Arrays.asList(deps));
      if (selfDep) {
        distribDeps.add(this);
      }
    }

    /** Given a String, returns the corresponding Stat enum value if any, otherwise returns null. */
    public static Stat forName(String paramKey) {
      try {
        return Stat.valueOf(paramKey);
      } catch (IllegalArgumentException e) {
        return null;
      }
    }

    /**
     * The stats that must be computed and returned by each shard involved in a distributed request
     * in order to compute the overall value for this stat across the entire distributed result set.
     * A Stat instance may include itself in the getDistribDeps() result, but that is
     * not always the case.
     */
    public EnumSet getDistribDeps() {
      return EnumSet.copyOf(this.distribDeps);
    }

    /**
     * Called when the name of a stat is found as a local param on this {@link StatsField}
     *
     * @return true if the user is requesting this stat, else false
     */
    boolean parseParams(StatsField sf) {
      return sf.localParams.getBool(this.name(), false);
    }
  }

  /**
   * the equivalent stats if "calcdistinct" is specified
   *
   * @see Stat#countDistinct
   * @see Stat#distinctValues
   */
  private static final EnumSet<Stat> CALCDISTINCT_PSEUDO_STAT =
      EnumSet.of(Stat.countDistinct, Stat.distinctValues);

  /** The set of stats computed by default when no localparams are used to specify explicit stats */
  public static final Set<Stat> DEFAULT_STATS =
      Collections.unmodifiableSet(
          EnumSet.of(
              Stat.min,
              Stat.max,
              Stat.missing,
              Stat.sum,
              Stat.count,
              Stat.mean,
              Stat.sumOfSquares,
              Stat.stddev));

  private final SolrIndexSearcher searcher;
  private final ResponseBuilder rb;
  private final String originalParam; // for error messages
  private final SolrParams localParams;
  private final ValueSource valueSource; // may be null if simple field stats
  private final SchemaField schemaField; // may be null if function/query stats
  private final String key;
  private final boolean topLevelCalcDistinct;
  private final String[] facets;
  private final List<String> tagList;
  private final List<String> excludeTagList;
  private final EnumSet<Stat> statsToCalculate = EnumSet.noneOf(Stat.class);
  private final EnumSet<Stat> statsInResponse = EnumSet.noneOf(Stat.class);
  private final List<Double> percentilesList = new ArrayList<>();
  private final boolean isShard;

  private double tdigestCompression = 100.0D;
  private HllOptions hllOpts;

  /**
   * Parses the raw stats.field param (including any local params) and initializes this instance
   * so that exactly one of {@link #schemaField} or {@link #valueSource} is non-null.
   *
   * @param rb the current request/response
   * @param statsParam the raw {@link StatsParams#STATS_FIELD} string
   */
  public StatsField(ResponseBuilder rb, String statsParam) {
    this.rb = rb;
    this.searcher = rb.req.getSearcher();
    this.originalParam = statsParam;

    SolrParams params = rb.req.getParams();
    try {
      // true when this request is a single shard's portion of a distributed request
      isShard = params.getBool("isShard", false);
      SolrParams localParams = QueryParsing.getLocalParams(originalParam, params);
      if (null == localParams) {
        // simplest possible input: bare string (field name)
        ModifiableSolrParams customParams = new ModifiableSolrParams();
        customParams.add(QueryParsing.V, originalParam);
        localParams = customParams;
      }

      this.localParams = localParams;

      String parserName = localParams.get(QueryParsing.TYPE);
      SchemaField sf = null;
      ValueSource vs = null;

      if (StrUtils.isBlank(parserName)) {

        // basic request for field stats
        sf = searcher.getSchema().getField(localParams.get(QueryParsing.V));

      } else {
        // we have a non trivial request to compute stats over a query (or function)

        // NOTE we could use QParser.getParser(...) here, but that would redundantly
        // reparse everything.  ( TODO: refactor a common method in QParser ?)
        QParserPlugin qplug = rb.req.getCore().getQueryPlugin(parserName);
        if (qplug == null) {
          throw new SolrException(
              SolrException.ErrorCode.BAD_REQUEST,
              "invalid query parser '"
                  + parserName
                  + (originalParam == null ? "'" : "' for query '" + originalParam + "'"));
        }
        QParser qp =
            qplug.createParser(localParams.get(QueryParsing.V), localParams, params, rb.req);

        // figure out what type of query we are dealing, get the most direct ValueSource
        vs = extractValueSource(qp.parse());

        // if this ValueSource directly corresponds to a SchemaField, act as if
        // we were asked to compute stats on it directly
        // ie:  "stats.field={!func key=foo}field(foo)" == "stats.field=foo"
        sf = extractSchemaField(vs, searcher.getSchema());
        if (null != sf) {
          vs = null;
        }
      }

      assert ((null == vs) ^ (null == sf)) : "exactly one of vs & sf must be null";

      this.schemaField = sf;
      this.valueSource = vs;

    } catch (SyntaxError e) {
      throw new SolrException(
          ErrorCode.BAD_REQUEST,
          "Unable to parse "
              + StatsParams.STATS_FIELD
              + ": "
              + originalParam
              + " due to: "
              + e.getMessage(),
          e);
    }

    // allow explicit setting of the response key via localparams...
    this.key =
        localParams.get(
            CommonParams.OUTPUT_KEY,
            // default to the main param value...
            localParams.get(
                CommonParams.VALUE,
                // default to entire original param str.
                originalParam));

    this.topLevelCalcDistinct =
        null == schemaField
            ? params.getBool(StatsParams.STATS_CALC_DISTINCT, false)
            : params.getFieldBool(schemaField.getName(), StatsParams.STATS_CALC_DISTINCT, false);

    // decide which stats must be computed and which are returned, based on local params
    populateStatsSets();

    String[] facets = params.getFieldParams(key, StatsParams.STATS_FACET);
    this.facets = (null == facets) ? new String[0] : facets;
    String tagStr = localParams.get(CommonParams.TAG);
    this.tagList =
        (null == tagStr) ? Collections.emptyList() : StrUtils.splitSmart(tagStr, ',');

    // figure out if we need a special base DocSet
    String excludeStr = localParams.get(CommonParams.EXCLUDE);
    this.excludeTagList =
        (null == excludeStr)
            ? Collections.emptyList()
            : StrUtils.splitSmart(excludeStr, ',');

    assert ((null == this.valueSource) ^ (null == this.schemaField))
        : "exactly one of valueSource & schemaField must be null";
  }

  /**
   * Inspects a {@link Query} to see if it directly maps to a {@link ValueSource}, and if so returns
   * it -- otherwise wraps it as needed.
   *
   * @param q Query whose scores we have been asked to compute stats of
   * @return a ValueSource to use for computing the stats
   */
  private static ValueSource extractValueSource(Query q) {
    if (q instanceof FunctionQuery) {
      // Common case: we're wrapping a func, so we can directly pull out ValueSource
      return ((FunctionQuery) q).getValueSource();
    }
    // asked to compute stats over a query, wrap it up as a ValueSource
    return new QueryValueSource(q, 0.0F);
  }

  /**
   * Inspects a {@link ValueSource} to see if it directly maps to a {@link SchemaField}, and if so
   * returns it.
   *
   * @param vs ValueSource we've been asked to compute stats of
   * @param schema The Schema to use
   * @return Corresponding {@link SchemaField} or null if the ValueSource is more complex
   * @see FieldCacheSource
   */
  private static SchemaField extractSchemaField(ValueSource vs, IndexSchema schema) {
    if (vs instanceof FieldCacheSource) {
      String fieldName = ((FieldCacheSource) vs).getField();
      return schema.getField(fieldName);
    }
    return null;
  }

  /**
   * The key to be used when referring to this {@link StatsField} instance in the response to
   * clients.
   */
  public String getOutputKey() {
    return key;
  }

  /**
   * Computes a base {@link DocSet} for the current request to be used when computing global stats
   * for the local index.
   *
   * <p>This is typically the same as the main DocSet for the {@link ResponseBuilder} unless {@link
   * CommonParams#TAG tag}ged filter queries have been excluded using the {@link
   * CommonParams#EXCLUDE ex} local param
   */
  public DocSet computeBaseDocSet() throws IOException {

    DocSet docs = rb.getResults().docSet;
    Map<?, ?> tagMap = (Map<?, ?>) rb.req.getContext().get("tags");

    if (excludeTagList.isEmpty() || null == tagMap) {
      // either the exclude list is empty, or there
      // aren't any tagged filters to exclude anyway.
      return docs;
    }

    IdentityHashMap<Query, Boolean> excludeSet = new IdentityHashMap<>();
    for (String excludeTag : excludeTagList) {
      Object olst = tagMap.get(excludeTag);
      // tagMap has entries of List<List<QParser>>, but subject to change in the future
      if (!(olst instanceof Collection)) continue;
      for (Object o : (Collection<?>) olst) {
        if (!(o instanceof QParser)) continue;
        QParser qp = (QParser) o;
        try {
          excludeSet.put(qp.getQuery(), Boolean.TRUE);
        } catch (SyntaxError e) {
          // this shouldn't be possible since the request should have already
          // failed when attempting to execute the query, but just in case...
          throw new SolrException(
              ErrorCode.BAD_REQUEST,
              "Excluded query can't be parsed: " + originalParam + " due to: " + e.getMessage(),
              e);
        }
      }
    }
    if (excludeSet.size() == 0) return docs;

    List<Query> qlist = new ArrayList<>();

    // add the base query
    if (!excludeSet.containsKey(rb.getQuery())) {
      qlist.add(rb.getQuery());
    }

    // add the filters
    if (rb.getFilters() != null) {
      for (Query q : rb.getFilters()) {
        if (!excludeSet.containsKey(q)) {
          qlist.add(q);
        }
      }
    }

    // get the new base docset for this facet
    return searcher.getDocSet(qlist);
  }

  /**
   * Computes the {@link StatsValues} for this {@link StatsField} relative to the specified {@link
   * DocSet}
   *
   * @see #computeBaseDocSet
   */
  public StatsValues computeLocalStatsValues(DocSet base) throws IOException {

    if (statsToCalculate.isEmpty()) {
      // perf optimization for the case where we compute nothing
      // ie: stats.field={!min=$domin}myfield&domin=false
      return StatsValuesFactory.createStatsValues(this);
    }

    if (null != schemaField
        && !schemaField.getType().isPointField()
        && (schemaField.multiValued() || schemaField.getType().multiValuedFieldCache())) {

      // TODO: should this also be used for single-valued string fields? (should work fine)
      return DocValuesStats.getCounts(searcher, this, base, facets);
    } else {
      // either a single valued field we pull from FieldCache, or an explicit
      // function ValueSource
      return computeLocalValueSourceStats(base);
    }
  }

  /** Accumulates stats (and optional facet stats) by walking every doc in the base set. */
  private StatsValues computeLocalValueSourceStats(DocSet base) throws IOException {

    IndexSchema schema = searcher.getSchema();

    final StatsValues allstats = StatsValuesFactory.createStatsValues(this);

    List<FieldFacetStats> facetStats = new ArrayList<>();
    for (String facetField : facets) {
      SchemaField fsf = schema.getField(facetField);

      if (fsf.multiValued()) {
        throw new SolrException(
            SolrException.ErrorCode.BAD_REQUEST,
            "Stats can only facet on single-valued fields, not: " + facetField);
      }

      facetStats.add(new FieldFacetStats(searcher, fsf, this));
    }

    final Iterator<LeafReaderContext> ctxIt = searcher.getIndexReader().leaves().iterator();
    LeafReaderContext ctx = null;
    for (DocIterator docsIt = base.iterator(); docsIt.hasNext(); ) {
      final int doc = docsIt.nextDoc();
      if (ctx == null || doc >= ctx.docBase + ctx.reader().maxDoc()) {
        // advance to the leaf reader segment containing this doc
        do {
          ctx = ctxIt.next();
        } while (ctx == null || doc >= ctx.docBase + ctx.reader().maxDoc());
        assert doc >= ctx.docBase;

        // propagate the context among accumulators.
        allstats.setNextReader(ctx);
        for (FieldFacetStats f : facetStats) {
          f.setNextReader(ctx);
        }
      }

      // accumulate (using segment-local doc id)
      allstats.accumulate(doc - ctx.docBase);
      for (FieldFacetStats f : facetStats) {
        f.facet(doc - ctx.docBase);
      }
    }

    for (FieldFacetStats f : facetStats) {
      allstats.addFacet(f.name, f.facetStatsValues);
    }
    return allstats;
  }

  /**
   * The searcher that should be used for processing local stats
   *
   * @see SolrQueryRequest#getSearcher
   */
  public SolrIndexSearcher getSearcher() {
    // see AbstractStatsValues.setNextReader
    return searcher;
  }

  /**
   * The {@link SchemaField} whose results these stats are computed over, may be null if the stats
   * are computed over the results of a function or query
   *
   * @see #getValueSource
   */
  public SchemaField getSchemaField() {
    return schemaField;
  }

  /**
   * The {@link ValueSource} of a function or query whose results these stats are computed over, may
   * be null if the stats are directly over a {@link SchemaField}
   *
   * @see #getSchemaField
   */
  public ValueSource getValueSource() {
    return valueSource;
  }

  public List<String> getTagList() {
    return tagList;
  }

  @Override
  public String toString() {
    return "StatsField<" + originalParam + ">";
  }

  /**
   * A helper method which inspects the {@link #localParams} associated with this StatsField, and
   * uses them to populate the {@link #statsInResponse} and {@link #statsToCalculate} data
   * structures
   */
  private void populateStatsSets() {
    boolean statSpecifiedByLocalParam = false;
    // local individual stat
    Iterator<String> itParams = localParams.getParameterNamesIterator();

    while (itParams.hasNext()) {
      String paramKey = itParams.next();
      Stat stat = Stat.forName(paramKey);
      if (stat != null) {
        statSpecifiedByLocalParam = true;
        if (stat.parseParams(this)) {
          statsInResponse.add(stat);
        }
      }
    }

    // if no individual stat setting use the default set
    if (!(statSpecifiedByLocalParam
        // calcdistinct (as a local param) is a pseudo-stat, prevents default set
        || localParams.getBool("calcdistinct", false))) {
      statsInResponse.addAll(DEFAULT_STATS);
    }

    // calcDistinct is a pseudo-stat with optional top level param default behavior
    // if not overridden by the specific individual stats
    if (localParams.getBool("calcdistinct", topLevelCalcDistinct)) {
      for (Stat stat : CALCDISTINCT_PSEUDO_STAT) {
        // assume true, but don't include if specific stat overrides
        if (localParams.getBool(stat.name(), true)) {
          statsInResponse.add(stat);
        }
      }
    }

    for (Stat stat : statsInResponse) {
      statsToCalculate.addAll(stat.getDistribDeps());
    }
  }

  public boolean calculateStats(Stat stat) {
    return statsToCalculate.contains(stat);
  }

  public boolean includeInResponse(Stat stat) {
    if (isShard) {
      // shards must return every dependency needed for the distributed merge,
      // not just the stats the user explicitly asked for
      return statsToCalculate.contains(stat);
    }
    return statsInResponse.contains(stat);
  }

  public List<Double> getPercentilesList() {
    return percentilesList;
  }

  public boolean getIsShard() {
    return isShard;
  }

  public double getTdigestCompression() {
    return tdigestCompression;
  }

  public HllOptions getHllOptions() {
    return hllOpts;
  }

  /**
   * Helper Struct for parsing and encapsulating all of the options related to building a {@link
   * HLL}
   *
   * @see Stat#cardinality
   * @lucene.internal
   */
  public static final class HllOptions {
    final HashFunction hasher;

    // NOTE: this explanation linked to from the java-hll jdocs...
    // https://github.com/aggregateknowledge/postgresql-hll/blob/master/README.markdown#explanation-of-parameters-and-tuning
    // ..if i'm understanding the regwidth chart correctly, a value of 6 should be a enough
    // to support any max cardinality given that we're always dealing with hashes and
    // the cardinality of the set of all long values is 2**64 == 1.9e19
    //
    // But i guess that assumes a *perfect* hash and high log2m? ... if the hash algo is imperfect
    // and/or log2m is low (ie: user is less concerned about accuracy), then many diff hash values
    // might fall in the same register (ie: bucket) and having a wider register to count more of
    // them may be useful
    final int log2m;
    final int regwidth;

    static final String ERR =
        "cardinality must be specified as 'true' (for default tunning) or decimal number between 0 and 1 to adjust accuracy vs memory usage (large number is more memory and more accuracy)";

    private HllOptions(int log2m, int regwidth, HashFunction hasher) {
      this.log2m = log2m;
      this.regwidth = regwidth;
      this.hasher = hasher;
    }

    /**
     * Creates an HllOptions based on the (local) params specified (if appropriate).
     *
     * @param localParams the LocalParams for this {@link StatsField}
     * @param field the field corresponding to this {@link StatsField}, may be null if these stats
     *     are over a value source
     * @return the {@link HllOptions} to use based on the params, or null if no {@link HLL} should
     *     be computed
     * @throws SolrException if there are invalid options
     */
    public static HllOptions parseHllOptions(SolrParams localParams, SchemaField field)
        throws SolrException {

      String cardinalityOpt = localParams.get(Stat.cardinality.name());
      if (StrUtils.isBlank(cardinalityOpt)) {
        return null;
      }

      final NumberType hashableNumType = getHashableNumericType(field);

      // some sane defaults
      int log2m = 13; // roughly equivalent to "cardinality='0.33'"
      int regwidth = 6; // with decent hash, this is plenty for all valid long hashes

      if (NumberType.FLOAT.equals(hashableNumType) || NumberType.INTEGER.equals(hashableNumType)) {
        // for 32bit values, we can adjust our default regwidth down a bit
        regwidth--;

        // NOTE: EnumField uses LegacyNumericType.INT, and in theory we could be super conservative
        // with it, but there's no point - just let the EXPLICIT HLL handle it
      }

      // TODO: we could attempt additional reductions in the default regwidth based on index
      // statistics -- but thta doesn't seem worth the effort.  for tiny indexes, the
      // EXPLICIT and SPARSE HLL representations have us nicely covered, and in general we don't
      // want to be too aggresive about lowering regwidth or we could really poor results if
      // log2m is also low and there is heavy hashkey collision

      try {
        // NFE will short out here if it's not a number
        final double accuracyOpt = Double.parseDouble(cardinalityOpt);

        // if a float between 0 and 1 is specified, treat it as a prefrence of accuracy
        // - 0 means accuracy is not a concern, save RAM
        // - 1 means be as accurate as possible, using as much RAM as needed.

        if (accuracyOpt < 0D || 1.0D < accuracyOpt) {
          throw new SolrException(ErrorCode.BAD_REQUEST, ERR);
        }

        // use accuracyOpt as a scaling factor between min & max legal log2m values
        log2m =
            HLL.MINIMUM_LOG2M_PARAM
                + (int)
                    Math.round(accuracyOpt * (HLL.MAXIMUM_LOG2M_PARAM - HLL.MINIMUM_LOG2M_PARAM));

        // use accuracyOpt as a scaling factor for regwidth as well, BUT...
        // be more conservative -- HLL.MIN_REGWIDTH_PARAM is too absurdly low to be useful
        // use previously computed (hashableNumType) default regwidth -1 as lower bound for scaling
        final int MIN_HUERISTIC_REGWIDTH = regwidth - 1;
        regwidth =
            MIN_HUERISTIC_REGWIDTH
                + (int)
                    Math.round(accuracyOpt * (HLL.MAXIMUM_REGWIDTH_PARAM - MIN_HUERISTIC_REGWIDTH));

      } catch (NumberFormatException nfe) {
        // param value isn't a number -- let's check for simple true/false
        if (!localParams.getBool(Stat.cardinality.name(), false)) {
          return null;
        }
      }

      // let explicit params override both the default and/or any accuracy specification
      log2m = localParams.getInt("hllLog2m", log2m);
      regwidth = localParams.getInt("hllRegwidth", regwidth);

      // validate legal values
      if (log2m < HLL.MINIMUM_LOG2M_PARAM || HLL.MAXIMUM_LOG2M_PARAM < log2m) {
        throw new SolrException(
            ErrorCode.BAD_REQUEST,
            "hllLog2m must be at least "
                + HLL.MINIMUM_LOG2M_PARAM
                + " and at most "
                + HLL.MAXIMUM_LOG2M_PARAM
                + " ("
                + log2m
                + ")");
      }
      if (regwidth < HLL.MINIMUM_REGWIDTH_PARAM || HLL.MAXIMUM_REGWIDTH_PARAM < regwidth) {
        throw new SolrException(
            ErrorCode.BAD_REQUEST,
            "hllRegwidth must be at least "
                + HLL.MINIMUM_REGWIDTH_PARAM
                + " and at most "
                + HLL.MAXIMUM_REGWIDTH_PARAM);
      }

      HashFunction hasher =
          localParams.getBool("hllPreHashed", false) ? null : Hashing.murmur3_128();

      if (null == hasher) {
        // if this is a function, or a non Long field, pre-hashed is invalid
        // NOTE: we ignore hashableNumType - it's LONG for non numerics like Strings
        if (null == field
            || !(NumberType.LONG.equals(field.getType().getNumberType())
                || NumberType.DATE.equals(field.getType().getNumberType()))) {
          throw new SolrException(
              ErrorCode.BAD_REQUEST, "hllPreHashed is only supported with Long based fields");
        }
      }

      // if we're still here, then we need an HLL...
      return new HllOptions(log2m, regwidth, hasher);
    }

    /**
     * @see HLL
     */
    public int getLog2m() {
      return log2m;
    }

    /**
     * @see HLL
     */
    public int getRegwidth() {
      return regwidth;
    }

    /** May be null if user has indicated that field values are pre-hashed */
    public HashFunction getHasher() {
      return hasher;
    }

    public HLL newHLL() {
      // Although it (in theory) saves memory for "medium" size sets, the SPARSE type seems to have
      // some nasty impacts on response time as it gets larger - particularly in distrib requests.
      // Merging large SPARSE HLLs is much much slower then merging FULL HLLs with the same num docs
      //
      // TODO: add more tunning options for this.
      return new HLL(
          getLog2m(),
          getRegwidth(),
          -1 /* auto explict threshold */,
          false /* no sparse representation */,
          HLLType.EMPTY);
    }
  }

  /**
   * Returns the effective {@link NumberType} for the field for the purposes of hash values. ie: If
   * the field has an explict NumberType that is returned; If the field has no explicit NumberType
   * then {@link NumberType#LONG} is returned; If field is null, then {@link NumberType#FLOAT} is
   * assumed for ValueSource.
   */
  private static NumberType getHashableNumericType(SchemaField field) {
    if (null == field) {
      return NumberType.FLOAT;
    }
    final NumberType result = field.getType().getNumberType();
    return null == result ? NumberType.LONG : result;
  }
}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy