org.apache.solr.handler.component.StatsValuesFactory Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of solr-core Show documentation
Apache Solr (module: core)
There is a newer version: 9.7.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.component;

import com.google.common.hash.HashFunction;
import com.tdunning.math.stats.AVLTreeDigest;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.queries.function.FunctionValues;
import org.apache.lucene.queries.function.ValueSource;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.common.EnumFieldValue;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.handler.component.StatsField.Stat;
import org.apache.solr.schema.AbstractEnumField;
import org.apache.solr.schema.DatePointField;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.PointField;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.schema.StrField;
import org.apache.solr.schema.TrieDateField;
import org.apache.solr.schema.TrieField;
import org.apache.solr.util.hll.HLL;
import org.apache.solr.util.hll.HLLType;

/** Factory class for creating instance of {@link org.apache.solr.handler.component.StatsValues} */
public class StatsValuesFactory {

  /**
   * Builds the {@link StatsValues} implementation that matches the supplied {@link StatsField}.
   *
   * <p>If the stats field has no {@link SchemaField} (function stats) a numeric collector is
   * returned. Otherwise the collector is picked from the schema field's {@link FieldType}: date,
   * numeric, string or enum. For multi-valued date and numeric fields the collector is wrapped in
   * the corresponding "sorted" implementation.
   *
   * @param statsField {@link StatsField} whose statistics will be gathered by the returned {@link
   *     StatsValues}
   * @return a {@link StatsValues} that computes statistics over values of the given {@link
   *     StatsField}
   * @throws SolrException with {@code BAD_REQUEST} when the field type is none of the handled
   *     kinds
   */
  public static StatsValues createStatsValues(StatsField statsField) {
    final SchemaField schemaField = statsField.getSchemaField();
    if (schemaField == null) {
      // function stats
      return new NumericStatsValues(statsField);
    }

    // TODO: allow FieldType to provide impl.
    final FieldType type = schemaField.getType();

    // Date types are tested ahead of the generic numeric types; keep this ordering.
    if (type instanceof TrieDateField || type instanceof DatePointField) {
      final DateStatsValues dateStats = new DateStatsValues(statsField);
      if (schemaField.multiValued()) {
        return new SortedDateStatsValues(dateStats, statsField);
      }
      return dateStats;
    }

    if (type instanceof TrieField || type instanceof PointField) {
      final NumericStatsValues numericStats = new NumericStatsValues(statsField);
      if (schemaField.multiValued()) {
        return new SortedNumericStatsValues(numericStats, statsField);
      }
      return numericStats;
    }

    if (type instanceof StrField) {
      return new StringStatsValues(statsField);
    }

    if (type instanceof AbstractEnumField) {
      return new EnumStatsValues(statsField);
    }

    throw new SolrException(
        SolrException.ErrorCode.BAD_REQUEST, "Field type " + type + " is not currently supported");
  }

  /**
   * Abstract implementation of {@link StatsValues} that provides the default behavior for most
   * StatsValues implementations.
   *
   * There are very few requirements placed on what statistics concrete implementations should
   * collect, with the only required statistics being the minimum and maximum values.
   */
  private abstract static class AbstractStatsValues implements StatsValues {
    private static final String FACETS = "facets";

    /** Tracks all data about the stats we need to collect */
    protected final StatsField statsField;

    /** may be null if we are collecting stats directly from a function ValueSource */
    protected final SchemaField sf;

    /** may be null if we are collecting stats directly from a function ValueSource */
    protected final FieldType ft;

    // final booleans from StatsField to allow better inlining & JIT optimizing
    protected final boolean computeCount;
    protected final boolean computeMissing;
    protected final boolean
        computeCalcDistinct; // needed for either countDistinct or distinctValues
    protected final boolean computeMin;
    protected final boolean computeMax;
    protected final boolean computeMinOrMax;
    protected final boolean computeCardinality;

    /**
     * Either a function value source to collect from, or the ValueSource associated with a single
     * valued field we are collecting from. Will be null until/unless {@link #setNextReader} is
     * called at least once
     */
    private ValueSource valueSource;

    /**
     * Context to use when retrieving FunctionValues, will be null until/unless {@link
     * #setNextReader} is called at least once
     */
    private Map vsContext;

    /**
     * Values to collect, will be null until/unless {@link #setNextReader} is called at least once
     */
    protected FunctionValues values;

    protected T max;
    protected T min;
    protected long missing;
    protected long count;
    protected long countDistinct;
    protected final Set distinctValues;

    /** Hash function that must be used by implementations of {@link #hash} */
    protected final HashFunction hasher;

    // if null, no HLL logic can be computed; not final because of "union" optimization (see below)
    private HLL hll;

    // facetField facetValue
    protected Map> facets = new HashMap<>();

    protected AbstractStatsValues(StatsField statsField) {
      this.statsField = statsField;
      this.computeCount = statsField.calculateStats(Stat.count);
      this.computeMissing = statsField.calculateStats(Stat.missing);
      this.computeCalcDistinct =
          statsField.calculateStats(Stat.countDistinct)
              || statsField.calculateStats(Stat.distinctValues);
      this.computeMin = statsField.calculateStats(Stat.min);
      this.computeMax = statsField.calculateStats(Stat.max);
      this.computeMinOrMax = computeMin || computeMax;

      this.distinctValues = computeCalcDistinct ? new TreeSet<>() : null;

      this.computeCardinality = statsField.calculateStats(Stat.cardinality);
      if (computeCardinality) {

        hasher = statsField.getHllOptions().getHasher();
        hll = statsField.getHllOptions().newHLL();
        assert null != hll : "Cardinality requires an HLL";
      } else {
        hll = null;
        hasher = null;
      }

      // alternatively, we could refactor a common base class that doesn't know/care
      // about either SchemaField or ValueSource - but then there would be a lot of
      // duplicate code between "NumericSchemaFieldStatsValues" and
      // "NumericValueSourceStatsValues" which would have diff parent classes
      //
      // part of the complexity here being that the StatsValues API serves two
      // leaders: collecting concrete Values from things like DocValuesStats and
      // the distributed aggregation logic, but also collecting docIds which it
      // then
      // uses to go out and pull concreate values from the ValueSource
      // (from a func, or single valued field)
      if (null != statsField.getSchemaField()) {
        assert null == statsField.getValueSource();
        this.sf = statsField.getSchemaField();
        this.ft = sf.getType();
      } else {
        assert null != statsField.getValueSource();
        assert null == statsField.getSchemaField();
        this.sf = null;
        this.ft = null;
      }
    }

    /**
     * Merges an already-computed set of statistics into this instance.
     *
     * <p>For every enabled statistic the matching entry ({@code "count"}, {@code "missing"},
     * {@code "distinctValues"}, {@code "min"}, {@code "max"}, {@code "cardinality"}) is read from
     * {@code stv} and folded into the local state; those entries are expected to be present when
     * the statistic is enabled. Type specific statistics are delegated to {@code
     * updateTypeSpecificStats}. If {@code stv} carries a {@code "facets"} entry, each facet value's
     * statistics are merged recursively into the per-facet {@link StatsValues}, creating them on
     * first use.
     *
     * @param stv statistics to merge into this instance
     */
    @Override
    @SuppressWarnings({"unchecked"})
    public void accumulate(NamedList stv) {
      if (computeCount) {
        final Long otherCount = (Long) stv.get("count");
        count += otherCount;
      }
      if (computeMissing) {
        final Long otherMissing = (Long) stv.get("missing");
        missing += otherMissing;
      }
      if (computeCalcDistinct) {
        distinctValues.addAll((Collection) stv.get("distinctValues"));
        countDistinct = distinctValues.size();
      }

      if (computeMinOrMax) {
        updateMinMax((T) stv.get("min"), (T) stv.get("max"));
      }

      if (computeCardinality) {
        final HLL incoming = HLL.fromBytes((byte[]) stv.get("cardinality"));
        if (!hll.getType().equals(HLLType.EMPTY)) {
          hll.union(incoming);
        } else {
          // HLL.union goes out of its way not to modify the "other" HLL. When merging into an
          // EMPTY HLL (guaranteed to happen at least once in every coordination of shard
          // requests) that means cloning all of the internal storage. Since "incoming" is
          // discarded after the merge anyway, adopt it directly and skip the short term
          // doubling of RAM.
          hll = incoming;
        }
      }

      updateTypeSpecificStats(stv);

      NamedList incomingFacets = (NamedList) stv.get(FACETS);
      if (incomingFacets == null) {
        // nothing faceted to merge
        return;
      }

      for (int fieldIdx = 0; fieldIdx < incomingFacets.size(); fieldIdx++) {
        String facetField = incomingFacets.getName(fieldIdx);
        NamedList perValue = (NamedList) incomingFacets.getVal(fieldIdx);

        // locate (or lazily create) the map of facet value -> stats for this facet field
        Map merged = facets.get(facetField);
        if (merged == null) {
          merged = new HashMap<>();
          facets.put(facetField, merged);
        }

        for (int valueIdx = 0; valueIdx < perValue.size(); valueIdx++) {
          String facetValue = perValue.getName(valueIdx);
          StatsValues target = merged.get(facetValue);
          if (target == null) {
            target = createStatsValues(statsField);
            merged.put(facetValue, target);
          }
          target.accumulate((NamedList) perValue.getVal(valueIdx));
        }
      }
    }

    /**
     * Converts {@code value} to an object through the field type's {@code toObject} and hands it,
     * together with {@code count}, to the typed {@code accumulate} overload.
     *
     * @param value raw bytes to convert using the schema field and its type
     * @param count forwarded unchanged to the typed overload
     * @throws IllegalStateException if this instance has no {@link FieldType} (i.e. {@code ft} is
     *     null)
     */
    @Override
    @SuppressWarnings({"unchecked"})
    public void accumulate(BytesRef value, int count) {
      if (null != ft) {
        T converted = (T) ft.toObject(sf, value);
        accumulate(converted, count);
        return;
      }
      // no FieldType available, so the bytes cannot be converted
      throw new IllegalStateException(
          "Can't collect & convert BytesRefs on stats that do't use a a FieldType: "
              + statsField);
    }

    /**
     * Folds one typed value into every statistic that is enabled, then lets the concrete
     * implementation update its type specific statistics.
     *
     * @param value value to accumulate; must not be null (checked by assertion)
     * @param count amount added to the running count when counting is enabled; also forwarded to
     *     {@code updateTypeSpecificStats}
     */
    public void accumulate(T value, int count) {
      assert null != value : "Can't accumulate null";

      if (computeCount) {
        this.count = this.count + count;
      }

      if (computeCalcDistinct) {
        distinctValues.add(value);
        countDistinct = distinctValues.size();
      }

      if (computeMinOrMax) {
        // a single value is both the candidate minimum and the candidate maximum
        updateMinMax(value, value);
      }

      if (computeCardinality) {
        if (null != hasher) {
          hll.addRaw(hash(value));
        } else {
          // without a hasher the value itself is added to the HLL as its raw long
          assert value instanceof Number : "pre-hashed value support only works with numeric longs";
          hll.addRaw(((Number) value).longValue());
        }
      }

      updateTypeSpecificStats(value, count);
    }

    /** Increments the missing counter by one, but only when the missing stat is being computed. */
    @Override
    public void missing() {
      if (!computeMissing) {
        return;
      }
      missing += 1;
    }

    /**
     * Adds {@code count} to the missing counter.
     *
     * <p>NOTE(review): unlike {@link #missing()}, this does not consult {@code computeMissing};
     * confirm that is intentional.
     *
     * @param count amount to add to the missing counter
     */
    @Override
    public void addMissing(int count) {
      this.missing = this.missing + count;
    }

    /**
     * Stores the given map under {@code facetName}, replacing any map previously registered for
     * that name. The map is kept by reference, not copied.
     *
     * @param facetName key under which the facet statistics are stored
     * @param facetValues map to associate with {@code facetName}
     */
    @Override
    public void addFacet(String facetName, Map facetValues) {
      this.facets.put(facetName, facetValues);
    }

    @Override
    public NamedList getStatsValues() {
      NamedList