All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hadoop.hive.ql.stats.StatsUtils Maven / Gradle / Ivy

There is a newer version: 4.0.1
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.stats;

import java.io.IOException;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Map.Entry;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.StatsSetupConst;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
import org.apache.hadoop.hive.metastore.api.AggrStats;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
import org.apache.hadoop.hive.metastore.api.Decimal;
import org.apache.hadoop.hive.metastore.utils.MetaStoreUtils;
import org.apache.hadoop.hive.ql.exec.ColumnInfo;
import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.RowSchema;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.metadata.Hive;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.PartitionIterable;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.parse.ColumnStatsList;
import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer;
import org.apache.hadoop.hive.ql.plan.ColStatistics;
import org.apache.hadoop.hive.ql.plan.ColStatistics.Range;
import org.apache.hadoop.hive.ql.plan.ExprDynamicParamDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnListDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeFieldDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.Statistics;
import org.apache.hadoop.hive.ql.plan.Statistics.State;
import org.apache.hadoop.hive.ql.stats.BasicStats.Factory;
import org.apache.hadoop.hive.ql.stats.estimator.StatEstimator;
import org.apache.hadoop.hive.ql.stats.estimator.StatEstimatorProvider;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFSum;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge;
import org.apache.hadoop.hive.ql.udf.generic.NDV;
import org.apache.hadoop.hive.ql.util.JavaDataModel;
import org.apache.hadoop.hive.ql.util.NamedForkJoinWorkerThreadFactory;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
import org.apache.hadoop.hive.serde2.objectinspector.StandardConstantListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StandardConstantMapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StandardConstantStructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StandardListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StandardMapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.UnionObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveCharObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveVarcharObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableBinaryObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableBooleanObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableByteObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableDateObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableDoubleObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableFloatObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableHiveDecimalObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableIntObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableLongObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableShortObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableStringObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableTimestampLocalTZObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableTimestampObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hive.common.util.AnnotationUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Joiner;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.google.common.math.LongMath;
import com.google.common.primitives.Doubles;
import com.google.common.primitives.Longs;
import com.google.common.util.concurrent.ThreadFactoryBuilder;

public class StatsUtils {

  // Class-wide SLF4J logger, named after this class.
  private static final Logger LOG = LoggerFactory.getLogger(StatsUtils.class.getName());

  // Range lower limit for date type when not defined (days, heuristic): '1999-01-01'
  // NOTE(review): assuming days are counted from 1970-01-01, 10593 is 1999-01-02, whereas
  // TIMESTAMP_RANGE_LOWER_LIMIT below equals 10592 * 86400 — confirm the one-day offset is intended.
  private static final int DATE_RANGE_LOWER_LIMIT = 10593;
  // Range upper limit for date type when not defined (days, heuristic): '2024-12-31'
  // NOTE(review): assuming days are counted from 1970-01-01, 20089 is 2025-01-01, whereas
  // TIMESTAMP_RANGE_UPPER_LIMIT below equals 20089 * 86400 - 1 — confirm the one-day offset is intended.
  private static final int DATE_RANGE_UPPER_LIMIT = 20089;
  // Range lower limit for timestamp type when not defined (seconds, heuristic): '1999-01-01 00:00:00'
  private static final long TIMESTAMP_RANGE_LOWER_LIMIT = 915148800L;
  // Range upper limit for timestamp type when not defined (seconds, heuristic): '2024-12-31 23:59:59'
  private static final long TIMESTAMP_RANGE_UPPER_LIMIT = 1735689599L;

  // Pool used by collectStatistics to build the basic stats of the partitions in parallel.
  // Arguments, in order: parallelism (one worker per available processor), the worker thread
  // factory (created with the "basic-stats-" name argument), the handler that logs uncaught
  // worker failures, and asyncMode = false.
  private static final ForkJoinPool statsForkJoinPool = new ForkJoinPool(
          Runtime.getRuntime().availableProcessors(),
          new NamedForkJoinWorkerThreadFactory("basic-stats-"),
          getUncaughtExceptionHandler(),
          false
  );

  /**
   * Builds the handler handed to the stats fork-join pool. It reports, at error level,
   * the name of the thread that terminated together with the throwable that ended it.
   *
   * @return handler that logs uncaught worker-thread failures
   */
  private static Thread.UncaughtExceptionHandler getUncaughtExceptionHandler() {
    return new Thread.UncaughtExceptionHandler() {
      @Override
      public void uncaughtException(Thread thread, Throwable failure) {
        String message = String.format("Thread %s exited with error", thread.getName());
        LOG.error(message, failure);
      }
    };
  }

  /**
   * Collects table, partition and column level statistics for a table scan.
   * The schema signature, the needed columns and the referenced columns are read from
   * the operator and forwarded to the list-based overload.
   *
   * @param conf
   *          - hive configuration
   * @param partList
   *          - partition list
   * @param colStatsCache
   *          - column statistics cache, passed through unchanged
   * @param table
   *          - table
   * @param tableScanOperator
   *          - table scan operator supplying schema, needed and referenced columns
   * @return statistics object
   * @throws HiveException
   *           propagated from the overload this method delegates to
   */
  public static Statistics collectStatistics(HiveConf conf, PrunedPartitionList partList, ColumnStatsList colStatsCache,
      Table table, TableScanOperator tableScanOperator) throws HiveException {
    // column level statistics are required only for the columns that are needed
    return collectStatistics(
        conf,
        partList,
        table,
        tableScanOperator.getSchema().getSignature(),
        tableScanOperator.getNeededColumns(),
        colStatsCache,
        tableScanOperator.getReferencedColumns());
  }

  /**
   * Reads the two switches of a collection run from the configuration and forwards to the
   * full overload: {@code HIVE_STATS_FETCH_COLUMN_STATS} is passed as {@code needColStats}
   * and {@code HIVE_IN_TEST} is passed as {@code failIfCacheMiss}.
   */
  private static Statistics collectStatistics(HiveConf conf, PrunedPartitionList partList,
      Table table, List schema, List neededColumns, ColumnStatsList colStatsCache,
      List referencedColumns) throws HiveException {
    final boolean needColStats = HiveConf.getBoolVar(conf, ConfVars.HIVE_STATS_FETCH_COLUMN_STATS);
    final boolean failIfCacheMiss = HiveConf.getBoolVar(conf, ConfVars.HIVE_IN_TEST);

    return collectStatistics(conf, partList, table, schema, neededColumns, colStatsCache,
        referencedColumns, needColStats, failIfCacheMiss);
  }

  /**
   * Returns the aggregated row count of a table: of the table itself when it is not
   * partitioned, otherwise of its not-denied partitions. When
   * {@code HIVE_STATS_ESTIMATE_STATS} is enabled, a data size estimator and a row number
   * estimator (fed with the row size derived from the schema) are registered before the
   * stats are built. The aggregate is passed through {@code SetMinRowNumber01} before the
   * row count is read.
   * RelOptHiveTable's getRowCount uses this.
   *
   * @param noColsMissingStats incremented once for every input whose un-enhanced row count
   *          is not positive
   */
  public static long getNumRows(HiveConf conf, List schema, Table table, PrunedPartitionList partitionList, AtomicInteger noColsMissingStats) {

    final List<Partish> targets = new ArrayList<>();
    if (!table.isPartitioned()) {
      targets.add(Partish.buildFor(table));
    } else {
      for (Partition partition : partitionList.getNotDeniedPartns()) {
        targets.add(Partish.buildFor(table, partition));
      }
    }

    Factory statsFactory = new BasicStats.Factory();
    if (HiveConf.getBoolVar(conf, ConfVars.HIVE_STATS_ESTIMATE_STATS)) {
      statsFactory.addEnhancer(new BasicStats.DataSizeEstimator(conf));
      statsFactory.addEnhancer(new BasicStats.RowNumEstimator(estimateRowSizeFromSchema(conf, schema)));
    }

    // FIXME: this point will be lost after the factory; check that it's really a warning....cleanup/etc
    for (Partish target : targets) {
      long rawRowCount = new BasicStats(target).getNumRows();
      if (rawRowCount <= 0) {
        // row count is missing for this input; only the counter records it
        noColsMissingStats.getAndIncrement();
      }
    }

    BasicStats total = BasicStats.buildFrom(statsFactory.buildAll(conf, targets));
    total.apply(new BasicStats.SetMinRowNumber01());
    return total.getNumRows();
  }

  /**
   * Appends estimated statistics to {@code columnStats} for every needed column that does
   * not yet have an entry in it. Columns are matched by the name reported by
   * {@code ColStatistics.getColumnName()}; nothing is added when no column is missing.
   */
  private static void estimateStatsForMissingCols(List neededColumns, List columnStats,
                                           Table table, HiveConf conf, long nr, List schema) {

    Set wanted = new HashSet<>(neededColumns);
    Set covered = new HashSet<>();
    for (ColStatistics known : columnStats) {
      covered.add(known.getColumnName());
    }

    List uncovered = new ArrayList(Sets.difference(wanted, covered));
    if (uncovered.isEmpty()) {
      return;
    }

    columnStats.addAll(estimateStats(table, schema, uncovered, conf, nr));
  }

  /**
   * Collects statistics with an explicit choice of whether column statistics are wanted.
   * Delegates to the full overload with {@code failIfCacheMiss} switched off.
   */
  public static Statistics collectStatistics(HiveConf conf, PrunedPartitionList partList,
      Table table, List schema, List neededColumns, ColumnStatsList colStatsCache,
      List referencedColumns, boolean needColStats)
      throws HiveException {
    final boolean failIfCacheMiss = false;
    return collectStatistics(conf, partList, table, schema, neededColumns, colStatsCache,
        referencedColumns, needColStats, failIfCacheMiss);
  }

  /**
   * Builds the {@link Statistics} of a table from its basic stats and, when requested, its
   * column stats.
   * <ul>
   *   <li>Non-partitioned table: basic stats come from a single {@code BasicStats} build;
   *       column stats (skipped for meta tables) come from {@code getTableColumnStats} and are
   *       completed with estimates when {@code HIVE_STATS_ESTIMATE_STATS} is on.</li>
   *   <li>Partitioned table with a partition list: basic stats are built per not-denied
   *       partition on {@code statsForkJoinPool} and aggregated; column stats are taken from
   *       {@code colStatsCache} where present, otherwise fetched from the metastore (only if
   *       {@code HIVE_STATS_FETCH_COLUMN_STATS} is on) or estimated.</li>
   *   <li>Partitioned table with {@code partList == null}: nothing is collected.</li>
   * </ul>
   *
   * @param needColStats whether column level statistics should be attached
   * @param failIfCacheMiss debugging aid: fail when the column state is COMPLETE although some
   *          columns had to be looked up outside of {@code colStatsCache}
   * @return the statistics, or {@code null} for a partitioned table without a partition list
   * @throws HiveException if the parallel basic stats build fails or is interrupted, if the
   *           cache-miss check trips, or when propagated from a callee
   */
  private static Statistics collectStatistics(HiveConf conf, PrunedPartitionList partList, Table table,
      List schema, List neededColumns, ColumnStatsList colStatsCache,
      List referencedColumns, boolean needColStats, boolean failIfCacheMiss) throws HiveException {

    Statistics stats = null;

    boolean fetchColStats =
        HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_FETCH_COLUMN_STATS);
    boolean estimateStats = HiveConf.getBoolVar(conf, ConfVars.HIVE_STATS_ESTIMATE_STATS);
    boolean metaTable = table.getMetaTable() != null;

    if (!table.isPartitioned()) {

      Factory basicStatsFactory = new BasicStats.Factory();

      if (estimateStats) {
        basicStatsFactory.addEnhancer(new BasicStats.DataSizeEstimator(conf));
      }

      basicStatsFactory.addEnhancer(new BasicStats.RowNumEstimator(estimateRowSizeFromSchema(conf, schema)));
      basicStatsFactory.addEnhancer(new BasicStats.SetMinRowNumber01());

      BasicStats basicStats = basicStatsFactory.build(Partish.buildFor(table));

      long ds = basicStats.getDataSize();
      long nr = basicStats.getNumRows();
      long fs = basicStats.getTotalFileSize();
      List colStats = Collections.emptyList();

      long numErasureCodedFiles = getErasureCodedFiles(table);

      if (needColStats && !metaTable) {
        colStats = getTableColumnStats(table, schema, neededColumns, colStatsCache, fetchColStats);
        if (estimateStats) {
          estimateStatsForMissingCols(neededColumns, colStats, table, conf, nr, schema);
        }
        // we should have stats for all columns (estimated or actual)
        if (neededColumns.size() == colStats.size()) {
          // prefer the data size derived from column stats when it is usable
          long betterDS = getDataSizeFromColumnStats(nr, colStats);
          ds = (betterDS < 1 || colStats.isEmpty()) ? ds : betterDS;
        }
      }

      stats = new Statistics(nr, ds, fs, numErasureCodedFiles);
      // infer if any column can be primary key based on column statistics
      inferAndSetPrimaryKey(stats.getNumRows(), colStats);

      stats.setColumnStatsState(deriveStatType(colStats, neededColumns));
      stats.addToColumnStats(colStats);
    } else if (partList != null) {

      // For partitioned tables, get the size of all the partitions after pruning
      // the partitions that are not required

      Factory basicStatsFactory = new Factory();
      if (estimateStats) {
        // FIXME: misses parallel
        basicStatsFactory.addEnhancer(new BasicStats.DataSizeEstimator(conf));
      }

      basicStatsFactory.addEnhancer(new BasicStats.RowNumEstimator(estimateRowSizeFromSchema(conf, schema)));

      List partStats = null;
      try {
        // run the parallel stream inside the dedicated pool and wait for the result
        partStats = statsForkJoinPool.submit(() ->
          partList.getNotDeniedPartns().parallelStream().
                  map(p -> basicStatsFactory.build(Partish.buildFor(table, p))).
                  collect(Collectors.toList())
        ).get();
      } catch (InterruptedException e) {
        // keep the interrupt visible to the caller's thread before reporting the failure
        Thread.currentThread().interrupt();
        throw new HiveException(e);
      } catch (Exception e) {
        throw new HiveException(e);
      }

      BasicStats bbs = BasicStats.buildFrom(partStats);

      long nr = bbs.getNumRows();
      long ds = bbs.getDataSize();
      long fs = bbs.getTotalFileSize();

      List erasureCodedFiles = getBasicStatForPartitions(table, partList.getNotDeniedPartns(),
          StatsSetupConst.NUM_ERASURE_CODED_FILES);
      long numErasureCodedFiles = getSumIgnoreNegatives(erasureCodedFiles);

      if (nr == 0) {
        nr = 1;
      }
      stats = new Statistics(nr, ds, fs, numErasureCodedFiles);
      stats.setBasicStatsState(bbs.getState());
      if (nr > 0) {
        // FIXME: this promotion process should be removed later
        if (State.PARTIAL.morePreciseThan(bbs.getState())) {
          stats.setBasicStatsState(State.PARTIAL);
        }
      }

      if (needColStats) {
        List partitionCols = getPartitionColumns(schema, neededColumns, referencedColumns);

        // We will retrieve stats from the metastore only for columns that are not cached
        List columnStats = new ArrayList<>();
        List neededColsToRetrieve = extractColumnStates(table, neededColumns, colStatsCache, columnStats);
        List partitionColsToRetrieve = extractColumnStates(table, partitionCols, colStatsCache, columnStats);

        // List of partitions
        List partNames = new ArrayList<>(partList.getNotDeniedPartns().size());
        for (Partition part : partList.getNotDeniedPartns()) {
          partNames.add(part.getName());
        }

        AggrStats aggrStats = null;
        // We check the sizes of neededColumns and partNames here. If either
        // size is 0, aggrStats is null after several retries. Thus, we can
        // skip the step to connect to the metastore.
        if (fetchColStats && !neededColsToRetrieve.isEmpty() && !partNames.isEmpty()) {
          aggrStats = Hive.get().getAggrColStatsFor(table.getDbName(), table.getTableName(),
              neededColsToRetrieve, partNames, false);
        }

        boolean statsRetrieved = aggrStats != null &&
            aggrStats.getColStats() != null && aggrStats.getColStatsSize() != 0;
        if (neededColumns.isEmpty() || (!neededColsToRetrieve.isEmpty() && !statsRetrieved)) {
          estimateStatsForMissingCols(neededColsToRetrieve, columnStats, table, conf, nr, schema);
          // There are some partitions with no state (or we didn't fetch any state).
          // Update the stats with empty list to reflect that in the
          // state/initialize structures.

          // add partition column stats
          addPartitionColumnStats(conf, partitionColsToRetrieve, schema, table, partList, columnStats);

          // FIXME: this add seems suspicious...10 lines below the value returned by this method used as betterDS
          stats.addToDataSize(getDataSizeFromColumnStats(nr, columnStats));
          stats.updateColumnStatsState(deriveStatType(columnStats, referencedColumns));

          stats.addToColumnStats(columnStats);
        } else {
          if (statsRetrieved) {
            columnStats.addAll(convertColStats(aggrStats.getColStats(), table.getTableName()));
          }
          // expected count at this point: every needed column plus the cached partition columns
          int colStatsAvailable = neededColumns.size() + partitionCols.size() - partitionColsToRetrieve.size();
          if (columnStats.size() != colStatsAvailable) {
            // first placeholder is the expected count, second is what was actually obtained
            LOG.debug("Column stats requested for : {} columns. Able to retrieve for {} columns",
                    colStatsAvailable, columnStats.size());
          }

          addPartitionColumnStats(conf, partitionColsToRetrieve, schema, table, partList, columnStats);
          long betterDS = getDataSizeFromColumnStats(nr, columnStats);
          stats.setDataSize((betterDS < 1 || columnStats.isEmpty()) ? ds : betterDS);
          // infer if any column can be primary key based on column statistics
          inferAndSetPrimaryKey(stats.getNumRows(), columnStats);

          stats.addToColumnStats(columnStats);

          // Infer column stats state
          stats.setColumnStatsState(deriveStatType(columnStats, referencedColumns));
          if (neededColumns.size() != neededColsToRetrieve.size() ||
              partitionCols.size() != partitionColsToRetrieve.size()) {
            // Include state for cached columns
            stats.updateColumnStatsState(colStatsCache.getState());
          }
          // Change if we could not retrieve for all partitions
          if (aggrStats != null && aggrStats.getPartsFound() != partNames.size() && stats.getColumnStatsState() != State.NONE) {
            stats.updateColumnStatsState(State.PARTIAL);
            LOG.debug("Column stats requested for : {} partitions. Able to retrieve for {} partitions",
                    partNames.size(), aggrStats.getPartsFound());
          }
        }

        if (partStats.isEmpty()) {
          // all partitions are filtered by partition pruning
          stats.setBasicStatsState(State.COMPLETE);
        }

        // This block exists for debugging purposes: we want to check whether
        // the col stats cache is working properly and we are retrieving the
        // stats from metastore only once.
        if (colStatsCache != null && failIfCacheMiss &&
            stats.getColumnStatsState().equals(State.COMPLETE) &&
            (!neededColsToRetrieve.isEmpty() || !partitionColsToRetrieve.isEmpty())) {
          throw new HiveException("Cache has been loaded in logical planning phase for all columns; "
              + "however, stats for some columns could not be retrieved from it "
              + "(see messages above)");
        }
      }
    }
    return stats;
  }

  /**
   * Splits {@code columns} into those whose statistics are present in the cache and those
   * that are not. Cached statistics are appended to {@code columnStats}; each lookup result
   * is traced at debug level.
   *
   * @return names of the columns without cached statistics; the {@code columns} list itself
   *         when {@code colStatsCache} is {@code null}
   */
  private static List extractColumnStates(Table table, List columns,
      ColumnStatsList colStatsCache, List columnStats) {
    if (colStatsCache == null) {
      return columns;
    }
    List cacheMisses = new ArrayList<>(columns.size());
    for (String column : columns) {
      ColStatistics cached = colStatsCache.getColStats().get(column);
      if (cached != null) {
        columnStats.add(cached);
        LOG.debug("Stats for column {} in table {} retrieved from cache", column, table.getCompleteName());
        continue;
      }
      cacheMisses.add(column);
      LOG.debug("Stats for column {} in table {} could not be retrieved from cache", column,
          table.getCompleteName());
    }
    return cacheMisses;
  }


  /**
   * Flags the columns that look like a primary key. A column is flagged when its
   * distinct-value count is at least {@code numRows}; failing that, when both range bounds
   * are known and {@code max - min + 1} (bounds read as longs) equals {@code numRows}.
   * A {@code null} list and {@code null} entries are skipped.
   * @param numRows - number of rows
   * @param colStats - column statistics
   */
  public static void inferAndSetPrimaryKey(long numRows, List colStats) {
    if (colStats == null) {
      return;
    }
    for (ColStatistics cs : colStats) {
      if (cs == null) {
        continue;
      }
      if (cs.getCountDistint() >= numRows) {
        cs.setPrimaryKey(true);
        continue;
      }
      ColStatistics.Range bounds = cs.getRange();
      if (bounds == null || bounds.minValue == null || bounds.maxValue == null) {
        continue;
      }
      long span = (bounds.maxValue.longValue() - bounds.minValue.longValue()) + 1;
      if (numRows == span) {
        cs.setPrimaryKey(true);
      }
    }
  }

  /**
   * Infers a foreign key relationship from column statistics: the candidate qualifies when
   * the other column is flagged as primary key, both columns carry a range, and the
   * candidate's range lies within the primary key's range.
   * @param csPK - column statistics of primary key
   * @param csFK - column statistics of potential foreign key
   * @return {@code true} only when all of the above hold; {@code false} otherwise, including
   *         when either argument is {@code null}
   */
  public static boolean inferForeignKey(ColStatistics csPK, ColStatistics csFK) {
    if (csPK == null || csFK == null || !csPK.isPrimaryKey()) {
      return false;
    }
    ColStatistics.Range pkRange = csPK.getRange();
    ColStatistics.Range fkRange = csFK.getRange();
    return pkRange != null && fkRange != null && isWithin(fkRange, pkRange);
  }

  /**
   * Scales selectivity by the ratio of the key ranges.
   * @param csPK - column statistics of primary key
   * @param csFK - column statistics of potential foreign key
   * @return primary key range delta divided by foreign key range delta when {@code csPK} is
   *         flagged as primary key, both ranges exist, both deltas are positive and the
   *         foreign key delta is the smaller one; {@code 1.0f} in every other case
   */
  public static float getScaledSelectivity(ColStatistics csPK, ColStatistics csFK) {
    final float unscaled = 1.0f;
    if (csPK == null || csFK == null || !csPK.isPrimaryKey()) {
      return unscaled;
    }
    if (csPK.getRange() == null || csFK.getRange() == null) {
      return unscaled;
    }
    // Use Max-Min Range as NDV gets scaled by selectivity.
    long pkDelta = getRangeDelta(csPK.getRange());
    long fkDelta = getRangeDelta(csFK.getRange());
    boolean fkNarrower = fkDelta > 0 && pkDelta > 0 && fkDelta < pkDelta;
    return fkNarrower ? (float) pkDelta / (float) fkDelta : unscaled;
  }

  /**
   * Returns the distance between the bounds of a range, both read as longs.
   *
   * @param range range to measure; must not be {@code null}
   * @return {@code max - min}, or 0 when either bound is missing
   */
  public static long getRangeDelta(ColStatistics.Range range) {
    boolean bounded = range.minValue != null && range.maxValue != null;
    return bounded ? range.maxValue.longValue() - range.minValue.longValue() : 0;
  }

  /**
   * Tells whether {@code range1} is contained in {@code range2}, comparing the bounds as
   * longs. Any missing bound on either range yields {@code false}.
   */
  private static boolean isWithin(ColStatistics.Range range1, ColStatistics.Range range2) {
    if (range1.minValue == null || range2.minValue == null
        || range1.maxValue == null || range2.maxValue == null) {
      return false;
    }
    return range1.minValue.longValue() >= range2.minValue.longValue()
        && range1.maxValue.longValue() <= range2.maxValue.longValue();
  }

  /**
   * For every name in {@code partitionCols}, finds the schema columns with that internal
   * name and appends the statistics computed by {@code getColStatsForPartCol} over the
   * partitions of {@code partList} to {@code colStats}.
   */
  private static void addPartitionColumnStats(HiveConf conf, List partitionCols,
      List schema, Table table, PrunedPartitionList partList, List colStats)
          throws HiveException {
    for (String partitionCol : partitionCols) {
      for (ColumnInfo candidate : schema) {
        // conditions for being partition column
        if (!partitionCol.equals(candidate.getInternalName())) {
          continue;
        }
        PartitionIterable partitions = new PartitionIterable(partList.getPartitions());
        colStats.add(getColStatsForPartCol(candidate, partitions, conf));
      }
    }
  }

  /**
   * Picks the referenced columns that are not needed columns and whose schema entry is a
   * virtual, non-hidden column. Returns an empty list when there are not more referenced
   * than needed columns.
   */
  private static List getPartitionColumns(List schema,
      List neededColumns,
      List referencedColumns) {
    List partitionCols = new ArrayList<>(referencedColumns.size());
    if (referencedColumns.size() <= neededColumns.size()) {
      return partitionCols;
    }

    // extra columns is difference between referenced columns vs needed
    // columns. The difference could be partition columns.
    List extraCols = Lists.newArrayList(referencedColumns);
    extraCols.removeAll(neededColumns);
    for (String extra : extraCols) {
      for (ColumnInfo ci : schema) {
        // conditions for being partition column
        boolean sameName = extra.equals(ci.getInternalName());
        if (sameName && ci.getIsVirtualCol() && !ci.isHiddenVirtualCol()) {
          partitionCols.add(extra);
        }
      }
    }
    return partitionCols;
  }

  /**
   * Derives the statistics of a partition column from the partition list: the distinct
   * value count, the average column length and, where supported for the type, the value
   * range.
   *
   * @param ci the partition column
   * @param partList partitions whose specs are inspected
   * @param conf configuration; supplies the default partition name
   * @return statistics named after the column's internal name
   */
  public static ColStatistics getColStatsForPartCol(ColumnInfo ci,PartitionIterable partList, HiveConf conf) {
    // currently metastore does not store column stats for
    // partition column, so we calculate the NDV from partition list
    final String colName = ci.getInternalName();
    final String typeName = ci.getType().getTypeName();

    ColStatistics partCS = new ColStatistics(colName, typeName);

    long distinctPartitionValues = getNDVPartitionColumn(partList, colName);
    partCS.setCountDistint(distinctPartitionValues);

    partCS.setAvgColLen(StatsUtils.getAvgColLenOf(conf, ci.getObjectInspector(), partCS.getColumnType()));

    String defaultPartName = conf.getVar(ConfVars.DEFAULTPARTITIONNAME);
    partCS.setRange(getRangePartitionColumn(partList, colName, typeName, defaultPartName));
    return partCS;
  }

  /**
   * Counts the distinct values the given partition column takes across the partitions,
   * as read from each partition's spec.
   */
  public static int getNDVPartitionColumn(PartitionIterable partitions, String partColName) {
    Set<String> seenValues = new HashSet<>();
    for (Partition current : partitions) {
      String value = current.getSpec().get(partColName);
      seenValues.add(value);
    }
    return seenValues.size();
  }

  /**
   * Computes the min/max range of a partition column from the partition specs.
   * Integral types (tinyint, smallint, int, bigint) are parsed as longs; float, double and
   * decimal types are folded as doubles. Partitions whose value equals
   * {@code defaultPartName} (the column is null there) or whose spec has no entry for the
   * column are ignored.
   *
   * @param partitions partitions to inspect
   * @param partColName name of the partition column in the partition spec
   * @param colType type name of the column (compared case-insensitively)
   * @param defaultPartName partition value that stands for null
   * @return the range, or {@code null} when the type is not supported or no partition
   *         contributes a value
   * @throws NumberFormatException if a partition value cannot be parsed for the column type
   */
  private static Range getRangePartitionColumn(PartitionIterable partitions, String partColName,
      String colType, String defaultPartName) {
    String colTypeLowerCase = colType.toLowerCase();

    boolean integral = colTypeLowerCase.equals(serdeConstants.TINYINT_TYPE_NAME)
        || colTypeLowerCase.equals(serdeConstants.SMALLINT_TYPE_NAME)
        || colTypeLowerCase.equals(serdeConstants.INT_TYPE_NAME)
        || colTypeLowerCase.equals(serdeConstants.BIGINT_TYPE_NAME);
    if (integral) {
      long min = Long.MAX_VALUE;
      long max = Long.MIN_VALUE;
      boolean found = false;
      for (Partition partition : partitions) {
        String partVal = partition.getSpec().get(partColName);
        if (partVal == null || partVal.equals(defaultPartName)) {
          // partition column value is null.
          continue;
        }
        long value = Long.parseLong(partVal);
        min = Math.min(min, value);
        max = Math.max(max, value);
        found = true;
      }
      // without any value the sentinels would form an inverted range; report "no range" instead
      return found ? new Range(min, max) : null;
    }

    boolean floating = colTypeLowerCase.equals(serdeConstants.FLOAT_TYPE_NAME)
        || colTypeLowerCase.equals(serdeConstants.DOUBLE_TYPE_NAME);
    boolean decimal = !floating && colTypeLowerCase.startsWith(serdeConstants.DECIMAL_TYPE_NAME);
    if (!floating && !decimal) {
      // Columns statistics for complex datatypes are not supported yet
      return null;
    }

    // Double.MIN_VALUE is the smallest POSITIVE double, so it must not seed the maximum:
    // negative-only inputs would otherwise end up with a positive upper bound.
    double min = Double.POSITIVE_INFINITY;
    double max = Double.NEGATIVE_INFINITY;
    boolean found = false;
    for (Partition partition : partitions) {
      String partVal = partition.getSpec().get(partColName);
      if (partVal == null || partVal.equals(defaultPartName)) {
        // partition column value is null.
        continue;
      }
      double value = decimal ? new BigDecimal(partVal).doubleValue() : Double.parseDouble(partVal);
      min = Math.min(min, value);
      max = Math.max(max, value);
      found = true;
    }
    return found ? new Range(min, max) : null;
  }

  /**
   * Returns the average size of a column based on its lower-cased type name. String and
   * binary types (exact match) and varchar, char, list, map, struct and union types (prefix
   * match) are sized by {@code getAvgColLenOf}; every other type is sized by
   * {@code getAvgColLenOfFixedLengthTypes}.
   */
  private static long getAvgColSize(final ColumnInfo columnInfo, HiveConf conf) {
    ObjectInspector oi = columnInfo.getObjectInspector();
    String typeName = columnInfo.getTypeName().toLowerCase();

    boolean exactVariableLength = typeName.equals(serdeConstants.STRING_TYPE_NAME)
        || typeName.equals(serdeConstants.BINARY_TYPE_NAME);
    boolean prefixedVariableLength = typeName.startsWith(serdeConstants.VARCHAR_TYPE_NAME)
        || typeName.startsWith(serdeConstants.CHAR_TYPE_NAME)
        || typeName.startsWith(serdeConstants.LIST_TYPE_NAME)
        || typeName.startsWith(serdeConstants.MAP_TYPE_NAME)
        || typeName.startsWith(serdeConstants.STRUCT_TYPE_NAME)
        || typeName.startsWith(serdeConstants.UNION_TYPE_NAME);

    if (exactVariableLength || prefixedVariableLength) {
      return getAvgColLenOf(conf, oi, typeName);
    }
    return getAvgColLenOfFixedLengthTypes(typeName);
  }

  /**
   * Estimates the size of a row as the sum of the average sizes of all schema columns.
   *
   * @param conf hive configuration
   * @param schema columns of the row
   * @return summed average column size; 0 for an empty schema
   */
  public static long estimateRowSizeFromSchema(HiveConf conf, List schema) {
    long rowSize = 0;
    for (ColumnInfo column : schema) {
      long columnSize = getAvgColSize(column, conf);
      rowSize = rowSize + columnSize;
    }
    return rowSize;
  }

  /**
   * Estimates the size of a row restricted to the needed columns. Names that have no
   * matching column in the schema contribute nothing.
   *
   * @param conf hive configuration
   * @param schema columns to look the names up in
   * @param neededColumns names of the columns to account for
   * @return summed average size of the needed columns found in the schema
   */
  public static long estimateRowSizeFromSchema(HiveConf conf, List schema,
      List neededColumns) {
    long rowSize = 0;
    for (String name : neededColumns) {
      ColumnInfo match = getColumnInfoForColumn(name, schema);
      // No need to collect statistics of index columns
      if (match != null) {
        rowSize += getAvgColSize(match, conf);
      }
    }
    return rowSize;
  }

  /**
   * Looks up the first schema column whose internal name equals {@code neededCol},
   * ignoring case.
   *
   * @return the matching column, or {@code null} when the schema has none
   */
  private static ColumnInfo getColumnInfoForColumn(String neededCol, List schema) {
    for (ColumnInfo candidate : schema) {
      boolean nameMatches = candidate.getInternalName().equalsIgnoreCase(neededCol);
      if (nameMatches) {
        return candidate;
      }
    }
    return null;
  }

  /**
   * Find the bytes on disk occupied by a table
   * @param conf
   *          - hive conf
   * @param table
   *          - table
   * @return size on disk, or 0 when the size could not be determined
   */
  public static long getFileSizeForTable(HiveConf conf, Table table) {
    Path path = table.getPath();
    try {
      FileSystem fs = path.getFileSystem(conf);
      return fs.getContentSummary(path).getLength();
    } catch (Exception e) {
      // Best effort: any failure (including a missing path) is reported as size 0,
      // but the cause is logged instead of being dropped silently.
      LOG.debug("Unable to determine file size for table path " + path, e);
      return 0;
    }
  }

  /**
   * Find the bytes on disks occupied by list of partitions
   * <p>
   * The sizes are computed concurrently on a fixed pool of daemon threads. A partition whose
   * size lookup fails with an {@link IOException} is reported as 0. If waiting for the results
   * is interrupted or a task fails with another exception, the sizes gathered so far are
   * returned, so the result may be shorter than {@code parts}.
   *
   * @param conf
   *          - hive conf
   * @param parts
   *          - partition list
   * @return sizes of partitions
   */
  @Deprecated
  public static List<Long> getFileSizeForPartitions(final HiveConf conf, List<Partition> parts) {
    LOG.info("Number of partitions : " + parts.size());
    final List<Future<Long>> futures = new ArrayList<>(parts.size());

    int threads = Math.max(1, conf.getIntVar(ConfVars.METASTORE_FS_HANDLER_THREADS_COUNT));
    final ExecutorService pool = Executors.newFixedThreadPool(threads,
                new ThreadFactoryBuilder()
                    .setDaemon(true)
                    .setNameFormat("Get-Partitions-Size-%d")
                    .build());

    final List<Long> sizes = new ArrayList<>(parts.size());
    try {
      // Submission happens inside the try block so the pool is shut down even when
      // resolving a partition location throws.
      for (final Partition part : parts) {
        final Path path = part.getDataLocation();
        final Callable<Long> sizeTask = () -> {
          try {
            LOG.debug("Partition path : " + path);
            FileSystem fs = path.getFileSystem(conf);
            return fs.getContentSummary(path).getLength();
          } catch (IOException e) {
            return 0L;
          }
        };
        futures.add(pool.submit(sizeTask));
      }

      for (Future<Long> future : futures) {
        sizes.add(future.get());
      }
    } catch (InterruptedException e) {
      // Preserve the interrupt status for callers further up the stack.
      Thread.currentThread().interrupt();
      LOG.warn("Exception in processing files ", e);
    } catch (ExecutionException e) {
      LOG.warn("Exception in processing files ", e);
    } finally {
      pool.shutdownNow();
    }
    return sizes;
  }

  /**
   * Check whether the list holds at least one value that is zero or negative.
   *
   * @param vals
   *          - list of values
   * @return {@code true} if any value is {@code <= 0}, {@code false} otherwise
   *         (including for an empty list)
   */
  public static boolean containsNonPositives(List<Long> vals) {
    for (Long val : vals) {
      if (val <= 0L) {
        return true;
      }
    }
    return false;
  }

  /**
   * Get sum of all values in the list that are >0
   * Values that are zero or negative are skipped; the remaining values are accumulated
   * with {@code safeAdd}.
   * @param vals
   *          - list of values
   * @return sum
   */
  public static long getSumIgnoreNegatives(List<Long> vals) {
    long result = 0;
    for (Long l : vals) {
      if (l > 0) {
        result = safeAdd(result, l);
      }
    }
    return result;
  }

  /**
   * Derive the state of the column statistics for the needed columns.
   * A column counts as missing when its statistics entry is {@code null} or estimated, and
   * also when {@code colStats} is {@code null} or has fewer entries than {@code neededColumns}.
   *
   * @param colStats
   *          - column statistics that were found, may be {@code null}
   * @param neededColumns
   *          - columns for which statistics were requested
   * @return COMPLETE when real statistics exist and none is missing (or no column is needed),
   *         PARTIAL when some real statistics exist but others are missing, NONE otherwise
   */
  private static Statistics.State deriveStatType(
      List<ColStatistics> colStats, List<String> neededColumns) {
    boolean hasStats = false;
    boolean hasNull = (colStats == null) || (colStats.size() < neededColumns.size());
    if (colStats != null) {
      for (ColStatistics cs : colStats) {
        // either colstats is null or is estimated
        boolean isNull = (cs == null) || cs.isEstimated();
        hasStats |= !isNull;
        hasNull |= isNull;
        if (hasNull && hasStats) {
          // Both kinds were seen; the outcome is already PARTIAL.
          break;
        }
      }
    }
    if (hasStats) {
      return hasNull ? Statistics.State.PARTIAL : Statistics.State.COMPLETE;
    }
    return neededColumns.isEmpty() ? Statistics.State.COMPLETE : Statistics.State.NONE;
  }

  /**
   * Convert ColumnStatisticsObj to ColStatistics
   * The fields that get populated depend on the lower-cased column type of {@code cso}.
   * @param cso
   *          - ColumnStatisticsObj
   * @param tabName
   *          - table name
   * @param colName
   *          - column name
   * @return ColStatistics, or {@code null} for types that are not handled here
   */
  public static ColStatistics getColStatistics(ColumnStatisticsObj cso, String tabName,
      String colName) {
    final String type = cso.getColType().toLowerCase();
    final ColStatistics result = new ColStatistics(colName, type);
    final ColumnStatisticsData data = cso.getStatsData();

    final boolean bigint = type.equals(serdeConstants.BIGINT_TYPE_NAME);
    final boolean narrowInteger = type.equals(serdeConstants.TINYINT_TYPE_NAME)
        || type.equals(serdeConstants.SMALLINT_TYPE_NAME)
        || type.equals(serdeConstants.INT_TYPE_NAME);
    final boolean singlePrecision = type.equals(serdeConstants.FLOAT_TYPE_NAME);
    final boolean doublePrecision = type.equals(serdeConstants.DOUBLE_TYPE_NAME);

    if (narrowInteger || bigint) {
      // All integral types read the long stats; only the average length differs.
      result.setCountDistint(data.getLongStats().getNumDVs());
      result.setNumNulls(data.getLongStats().getNumNulls());
      result.setAvgColLen(bigint
          ? JavaDataModel.get().primitive2()
          : JavaDataModel.get().primitive1());
      result.setRange(data.getLongStats().getLowValue(), data.getLongStats().getHighValue());
      result.setBitVectors(data.getLongStats().getBitVectors());
      result.setHistogram(data.getLongStats().getHistogram());
      return result;
    }

    if (singlePrecision || doublePrecision) {
      // Both floating point types read the double stats; only the average length differs.
      result.setCountDistint(data.getDoubleStats().getNumDVs());
      result.setNumNulls(data.getDoubleStats().getNumNulls());
      result.setAvgColLen(doublePrecision
          ? JavaDataModel.get().primitive2()
          : JavaDataModel.get().primitive1());
      result.setRange(data.getDoubleStats().getLowValue(), data.getDoubleStats().getHighValue());
      result.setBitVectors(data.getDoubleStats().getBitVectors());
      result.setHistogram(data.getDoubleStats().getHistogram());
      return result;
    }

    if (type.equals(serdeConstants.STRING_TYPE_NAME)
        || type.startsWith(serdeConstants.CHAR_TYPE_NAME)
        || type.startsWith(serdeConstants.VARCHAR_TYPE_NAME)) {
      result.setCountDistint(data.getStringStats().getNumDVs());
      result.setNumNulls(data.getStringStats().getNumNulls());
      result.setAvgColLen(data.getStringStats().getAvgColLen());
      result.setBitVectors(data.getStringStats().getBitVectors());
      return result;
    }

    if (type.equals(serdeConstants.BOOLEAN_TYPE_NAME)) {
      // Two distinct values only when both true and false were observed.
      final boolean bothValuesSeen = data.getBooleanStats().getNumFalses() > 0
          && data.getBooleanStats().getNumTrues() > 0;
      result.setCountDistint(bothValuesSeen ? 2 : 1);
      result.setNumTrues(data.getBooleanStats().getNumTrues());
      result.setNumFalses(data.getBooleanStats().getNumFalses());
      result.setNumNulls(data.getBooleanStats().getNumNulls());
      result.setAvgColLen(JavaDataModel.get().primitive1());
      return result;
    }

    if (type.equals(serdeConstants.BINARY_TYPE_NAME)) {
      result.setAvgColLen(data.getBinaryStats().getAvgColLen());
      result.setNumNulls(data.getBinaryStats().getNumNulls());
      return result;
    }

    if (type.equals(serdeConstants.TIMESTAMP_TYPE_NAME)) {
      result.setAvgColLen(JavaDataModel.get().lengthOfTimestamp());
      result.setNumNulls(data.getTimestampStats().getNumNulls());
      // The range is expressed in seconds since epoch; a missing bound stays null.
      Long lowSeconds = null;
      if (data.getTimestampStats().getLowValue() != null) {
        lowSeconds = data.getTimestampStats().getLowValue().getSecondsSinceEpoch();
      }
      Long highSeconds = null;
      if (data.getTimestampStats().getHighValue() != null) {
        highSeconds = data.getTimestampStats().getHighValue().getSecondsSinceEpoch();
      }
      result.setRange(lowSeconds, highSeconds);
      result.setHistogram(data.getTimestampStats().getHistogram());
      return result;
    }

    if (type.equals(serdeConstants.TIMESTAMPLOCALTZ_TYPE_NAME)) {
      result.setAvgColLen(JavaDataModel.get().lengthOfTimestamp());
      return result;
    }

    if (type.startsWith(serdeConstants.DECIMAL_TYPE_NAME)) {
      result.setAvgColLen(JavaDataModel.get().lengthOfDecimal());
      result.setCountDistint(data.getDecimalStats().getNumDVs());
      result.setNumNulls(data.getDecimalStats().getNumNulls());
      final Decimal high = data.getDecimalStats().getHighValue();
      final Decimal low = data.getDecimalStats().getLowValue();
      final boolean boundsPresent = high != null && high.getUnscaled() != null
          && low != null && low.getUnscaled() != null;
      if (boundsPresent) {
        final HiveDecimal highDec =
            HiveDecimal.create(new BigInteger(high.getUnscaled()), high.getScale());
        final BigDecimal highVal = (highDec != null) ? highDec.bigDecimalValue() : null;
        final HiveDecimal lowDec =
            HiveDecimal.create(new BigInteger(low.getUnscaled()), low.getScale());
        final BigDecimal lowVal = (lowDec != null) ? lowDec.bigDecimalValue() : null;

        // The range is only set when both bounds could be converted.
        if (lowVal != null && highVal != null) {
          result.setRange(lowVal, highVal);
        }
      }
      result.setBitVectors(data.getDecimalStats().getBitVectors());
      result.setHistogram(data.getDecimalStats().getHistogram());
      return result;
    }

    if (type.equals(serdeConstants.DATE_TYPE_NAME)) {
      result.setAvgColLen(JavaDataModel.get().lengthOfDate());
      result.setNumNulls(data.getDateStats().getNumNulls());
      // The range is expressed in days since epoch; a missing bound stays null.
      Long lowDays = null;
      if (data.getDateStats().getLowValue() != null) {
        lowDays = data.getDateStats().getLowValue().getDaysSinceEpoch();
      }
      Long highDays = null;
      if (data.getDateStats().getHighValue() != null) {
        highDays = data.getDateStats().getHighValue().getDaysSinceEpoch();
      }
      result.setRange(lowDays, highDays);
      result.setBitVectors(data.getDateStats().getBitVectors());
      result.setHistogram(data.getDateStats().getHistogram());
      return result;
    }

    // Columns statistics for complex datatypes are not supported yet
    return null;
  }

  /**
   * Build estimated column statistics for a column, based only on its type and the row count.
   * The result is flagged as estimated. The number of distinct values and the number of nulls
   * are derived from the configured percentages (each capped at 100) of {@code numRows}.
   *
   * @param numRows
   *          - number of rows
   * @param colName
   *          - internal name of the column; it must be present in {@code schema}
   * @param conf
   *          - hive conf
   * @param schema
   *          - columns in which {@code colName} is looked up
   * @return estimated column statistics
   */
  private static ColStatistics estimateColStats(long numRows, String colName, HiveConf conf,
      List<ColumnInfo> schema) {
    ColumnInfo cinfo = getColumnInfoForColumn(colName, schema);
    ColStatistics cs = new ColStatistics(colName, cinfo.getTypeName());
    cs.setIsEstimated(true);

    String colTypeLowerCase = cinfo.getTypeName().toLowerCase();

    float ndvPercent = Math.min(100L, HiveConf.getFloatVar(conf, ConfVars.HIVE_STATS_NDV_ESTIMATE_PERC));
    float nullPercent = Math.min(100L, HiveConf.getFloatVar(conf, ConfVars.HIVE_STATS_NUM_NULLS_ESTIMATE_PERC));

    cs.setCountDistint(Math.max(1, (long)(numRows * ndvPercent/100.00)));
    cs.setNumNulls(Math.min(numRows, (long)(numRows * nullPercent/100.00)));

    if (colTypeLowerCase.equals(serdeConstants.TINYINT_TYPE_NAME)) {
      cs.setAvgColLen(JavaDataModel.get().primitive1());
      cs.setRange(-128, 127);
    } else if (colTypeLowerCase.equals(serdeConstants.SMALLINT_TYPE_NAME)) {
      cs.setAvgColLen(JavaDataModel.get().primitive1());
      cs.setRange(-32768, 32767);
    } else if (colTypeLowerCase.equals(serdeConstants.INT_TYPE_NAME)) {
      cs.setAvgColLen(JavaDataModel.get().primitive1());
      // 32-bit column: bounded by the int limits.
      cs.setRange(Integer.MIN_VALUE, Integer.MAX_VALUE);
    } else if (colTypeLowerCase.equals(serdeConstants.BIGINT_TYPE_NAME)) {
      cs.setAvgColLen(JavaDataModel.get().primitive2());
      // 64-bit column: bounded by the long limits.
      cs.setRange(Long.MIN_VALUE, Long.MAX_VALUE);
    } else if (colTypeLowerCase.equals(serdeConstants.FLOAT_TYPE_NAME)) {
      cs.setAvgColLen(JavaDataModel.get().primitive1());
      // NOTE(review): Float.MIN_VALUE is the smallest positive float, not the most negative
      // one, so this range excludes negative values. Left unchanged until it is confirmed how
      // consumers compute range widths.
      cs.setRange(Float.MIN_VALUE, Float.MAX_VALUE);
    } else if (colTypeLowerCase.equals(serdeConstants.DOUBLE_TYPE_NAME)) {
      cs.setAvgColLen(JavaDataModel.get().primitive2());
      // NOTE(review): same remark as for float; Double.MIN_VALUE is a positive value.
      cs.setRange(Double.MIN_VALUE, Double.MAX_VALUE);
    } else if (colTypeLowerCase.equals(serdeConstants.STRING_TYPE_NAME)
        || colTypeLowerCase.startsWith(serdeConstants.BINARY_TYPE_NAME)
        || colTypeLowerCase.startsWith(serdeConstants.CHAR_TYPE_NAME)
        || colTypeLowerCase.startsWith(serdeConstants.VARCHAR_TYPE_NAME)) {
      cs.setAvgColLen(getAvgColLenOf(conf, cinfo.getObjectInspector(), cinfo.getTypeName()));
    } else if (colTypeLowerCase.equals(serdeConstants.BOOLEAN_TYPE_NAME)) {
      // Booleans have at most two distinct values; assume an even split, at least one each.
      cs.setCountDistint(2);
      cs.setNumTrues(Math.max(1, numRows/2));
      cs.setNumFalses(Math.max(1, numRows/2));
      cs.setAvgColLen(JavaDataModel.get().primitive1());
    } else if (colTypeLowerCase.equals(serdeConstants.TIMESTAMP_TYPE_NAME)) {
      cs.setAvgColLen(JavaDataModel.get().lengthOfTimestamp());
      // epoch, seconds since epoch
      cs.setRange(TIMESTAMP_RANGE_LOWER_LIMIT, TIMESTAMP_RANGE_UPPER_LIMIT);
    } else if (colTypeLowerCase.equals(serdeConstants.TIMESTAMPLOCALTZ_TYPE_NAME)) {
      cs.setAvgColLen(JavaDataModel.get().lengthOfTimestamp());
    } else if (colTypeLowerCase.startsWith(serdeConstants.DECIMAL_TYPE_NAME)) {
      cs.setAvgColLen(JavaDataModel.get().lengthOfDecimal());
      // NOTE(review): same remark as for float; the lower bound is a positive value.
      cs.setRange(Float.MIN_VALUE, Float.MAX_VALUE);
    } else if (colTypeLowerCase.equals(serdeConstants.DATE_TYPE_NAME)) {
      cs.setAvgColLen(JavaDataModel.get().lengthOfDate());
      // epoch, days since epoch
      cs.setRange(DATE_RANGE_LOWER_LIMIT, DATE_RANGE_UPPER_LIMIT);
    } else {
      cs.setAvgColLen(getSizeOfComplexTypes(conf, cinfo.getObjectInspector()));
    }
    return cs;
  }

  /**
   * Estimate column statistics for every needed column.
   *
   * @param table
   *          - table (not used by the estimate itself)
   * @param schema
   *          - columns in which the needed columns are looked up
   * @param neededColumns
   *          - internal names of the columns to estimate
   * @param conf
   *          - hive conf
   * @param nr
   *          - number of rows
   * @return one estimated statistics entry per needed column, in the same order
   */
  private static List<ColStatistics> estimateStats(Table table, List<ColumnInfo> schema,
      List<String> neededColumns, HiveConf conf, long nr) {

    List<ColStatistics> stats = new ArrayList<>(neededColumns.size());

    for (String neededColumn : neededColumns) {
      stats.add(estimateColStats(nr, neededColumn, conf, schema));
    }
    return stats;
  }

  /**
   * Get table level column statistics from metastore for needed columns
   * Columns already present in the cache are not fetched again; their cached statistics are
   * appended after the fetched ones. A failure to fetch is logged and yields no fetched
   * statistics.
   * @param table
   *          - table
   * @param schema
   *          - output schema
   * @param neededColumns
   *          - list of needed columns
   * @param colStatsCache
   *          - cache of already known column statistics, may be {@code null}
   * @param fetchColStats
   *          - whether statistics for uncached columns should be fetched at all
   * @return column statistics
   */
  public static List<ColStatistics> getTableColumnStats(
      Table table, List<ColumnInfo> schema, List<String> neededColumns,
      ColumnStatsList colStatsCache, boolean fetchColStats) {
    List<ColStatistics> stats = new ArrayList<>();
    if (table.isMaterializedTable()) {
      LOG.debug("Materialized table does not contain table statistics");
      return stats;
    }
    // We will retrieve stats from the metastore only for columns that are not cached
    List<String> colStatsToRetrieve;
    if (colStatsCache != null) {
      colStatsToRetrieve = new ArrayList<>(neededColumns.size());
      for (String colName : neededColumns) {
        if (!colStatsCache.getColStats().containsKey(colName)) {
          colStatsToRetrieve.add(colName);
        }
      }
    } else {
      colStatsToRetrieve = neededColumns;
    }
    // Retrieve stats from metastore
    String dbName = table.getDbName();
    String tabName = table.getTableName();
    if (SemanticAnalyzer.DUMMY_DATABASE.equals(dbName) &&
        SemanticAnalyzer.DUMMY_TABLE.equals(tabName)) {
      // insert into values gets written into insert from select dummy_table
      // This table is dummy and has no stats
      return stats;
    }
    if (fetchColStats && !colStatsToRetrieve.isEmpty()) {
      try {
        List<ColumnStatisticsObj> colStat;
        if (table.isNonNative() && table.getStorageHandler().canProvideColStatistics(table)) {
          colStat = table.getStorageHandler().getColStatistics(table);
        } else {
          colStat = Hive.get().getTableColumnStatistics(dbName, tabName, colStatsToRetrieve, false);
        }
        // Copy into the local mutable list: the converted list may be immutable, and the
        // cached statistics are appended to it below.
        stats.addAll(convertColStats(colStat, tabName));
      } catch (HiveException e) {
        LOG.error("Failed to retrieve table statistics: ", e);
      }
    }
    // Merge cached stats with the stats retrieved from the metastore
    if (colStatsCache != null) {
      for (String col : neededColumns) {
        ColStatistics cs = colStatsCache.getColStats().get(col);
        if (cs != null) {
          stats.add(cs);
          if (LOG.isDebugEnabled()) {
            LOG.debug("Stats for column " + cs.getColumnName() +
                " in table " + table.getCompleteName() + " retrieved from cache");
          }
        }
      }
    }
    return stats;
  }

  /**
   * Convert a list of ColumnStatisticsObj into ColStatistics.
   * Entries that cannot be converted (conversion returns {@code null}) are left out.
   *
   * @param colStats
   *          - statistics objects to convert, may be {@code null}
   * @param tabName
   *          - table name
   * @return converted statistics; always a fresh, mutable list (empty for {@code null} input)
   *         because callers append to it
   */
  private static List<ColStatistics> convertColStats(List<ColumnStatisticsObj> colStats,
      String tabName) {
    if (colStats == null) {
      return new ArrayList<>();
    }
    List<ColStatistics> stats = new ArrayList<>(colStats.size());
    for (ColumnStatisticsObj statObj : colStats) {
      ColStatistics cs = getColStatistics(statObj, tabName, statObj.getColName());
      if (cs != null) {
        stats.add(cs);
      }
    }
    return stats;
  }

  /**
   * Get the raw data size of variable length data types
   * Constant projections are sized from the constant itself. Types other than string,
   * varchar, char and binary are delegated to {@code getSizeOfComplexTypes}.
   * @param conf
   *          - hive conf
   * @param oi
   *          - object inspector
   * @param colType
   *          - column type
   * @return raw data size
   * @throws IllegalArgumentException
   *           if the object inspector does not match the string, varchar, char or binary
   *           column type
   */
  public static long getAvgColLenOf(HiveConf conf, ObjectInspector oi,
      String colType) {

    final long maxVariableLength =
        HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_STATS_MAX_VARIABLE_LENGTH);
    final String typeName = colType.toLowerCase();

    final boolean isString = typeName.equals(serdeConstants.STRING_TYPE_NAME);
    final boolean isVarchar = !isString && typeName.startsWith(serdeConstants.VARCHAR_TYPE_NAME);
    final boolean isChar =
        !isString && !isVarchar && typeName.startsWith(serdeConstants.CHAR_TYPE_NAME);

    if (isString || isVarchar || isChar) {
      // constant projection Ex: select "hello" from table
      if (oi instanceof ConstantObjectInspector) {
        // if writable constant is null then return size 0
        final Object constant = ((ConstantObjectInspector) oi).getWritableConstantValue();
        if (constant == null) {
          return 0;
        }
        return constant.toString().length();
      }
      if (isString && oi instanceof StringObjectInspector) {
        // some UDFs may emit strings of variable length. like pattern matching
        // UDFs. it's hard to find the length of such UDFs.
        // return the variable length from config
        return maxVariableLength;
      }
      if (isVarchar && oi instanceof HiveVarcharObjectInspector) {
        final VarcharTypeInfo varcharType =
            (VarcharTypeInfo) ((HiveVarcharObjectInspector) oi).getTypeInfo();
        return varcharType.getLength();
      }
      if (isChar && oi instanceof HiveCharObjectInspector) {
        final CharTypeInfo charType =
            (CharTypeInfo) ((HiveCharObjectInspector) oi).getTypeInfo();
        return charType.getLength();
      }
    } else if (typeName.equals(serdeConstants.BINARY_TYPE_NAME)) {
      // constant byte arrays
      if (oi instanceof ConstantObjectInspector) {
        // if writable constant is null then return size 0
        final BytesWritable bytes =
            (BytesWritable) ((ConstantObjectInspector) oi).getWritableConstantValue();
        if (bytes == null) {
          return 0;
        }
        return bytes.getLength();
      }
      if (oi instanceof BinaryObjectInspector) {
        // return the variable length from config
        return maxVariableLength;
      }
    } else {
      // complex types (map, list, struct, union)
      return getSizeOfComplexTypes(conf, oi);
    }

    // The inspector did not match any form expected for the column type.
    throw new IllegalArgumentException("Size requested for unknown type: " + colType + " OI: " + oi.getTypeName());
  }

  /**
   * Get the size of complex data types
   * The object inspector tree is walked recursively; constant lists, maps and structs are
   * sized from their constant value, other lists and maps from the configured number of
   * entries.
   * @param conf
   *          - hive conf
   * @param oi
   *          - object inspector
   * @return raw data size
   */
  public static long getSizeOfComplexTypes(HiveConf conf, ObjectInspector oi) {
    final int listEntries = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_STATS_LIST_NUM_ENTRIES);
    final int mapEntries = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_STATS_MAP_NUM_ENTRIES);

    switch (oi.getCategory()) {
    case PRIMITIVE: {
      final String typeName = oi.getTypeName().toLowerCase();
      if (typeName.equals(serdeConstants.STRING_TYPE_NAME)
          || typeName.startsWith(serdeConstants.VARCHAR_TYPE_NAME)
          || typeName.startsWith(serdeConstants.CHAR_TYPE_NAME)) {
        final int stringLen = (int) getAvgColLenOf(conf, oi, typeName);
        return JavaDataModel.get().lengthForStringOfLength(stringLen);
      }
      if (typeName.equals(serdeConstants.BINARY_TYPE_NAME)) {
        final int byteLen = (int) getAvgColLenOf(conf, oi, typeName);
        return JavaDataModel.get().lengthForByteArrayOfSize(byteLen);
      }
      return getAvgColLenOfFixedLengthTypes(typeName);
    }
    case LIST: {
      if (!(oi instanceof StandardConstantListObjectInspector)) {
        final StandardListObjectInspector listOi = (StandardListObjectInspector) oi;

        // list overhead + (configured number of element in list * size of element)
        final long elemSize = getSizeOfComplexTypes(conf, listOi.getListElementObjectInspector());
        return JavaDataModel.get().arrayList() + (listEntries * elemSize);
      }

      // constant list projection of known length
      final StandardConstantListObjectInspector constListOi =
          (StandardConstantListObjectInspector) oi;
      final List<?> constant = constListOi.getWritableConstantValue();
      final int length = (constant == null) ? 0 : constant.size();

      // check if list elements are primitive or Objects
      final ObjectInspector elemOi = constListOi.getListElementObjectInspector();
      if (elemOi.getCategory().equals(ObjectInspector.Category.PRIMITIVE)) {
        final int maxVarLen =
            HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_STATS_MAX_VARIABLE_LENGTH);
        return getSizeOfPrimitiveTypeArraysFromType(elemOi.getTypeName(), length, maxVarLen);
      }
      return JavaDataModel.get().lengthForObjectArrayOfSize(length);
    }
    case MAP: {
      if (oi instanceof StandardConstantMapObjectInspector) {
        // constant map projection of known length
        return getSizeOfMap((StandardConstantMapObjectInspector) oi);
      }
      final StandardMapObjectInspector mapOi = (StandardMapObjectInspector) oi;
      long mapSize = getSizeOfComplexTypes(conf, mapOi.getMapKeyObjectInspector());
      mapSize += getSizeOfComplexTypes(conf, mapOi.getMapValueObjectInspector());

      // hash map overhead
      mapSize += JavaDataModel.get().hashMap(mapEntries);
      return mapSize;
    }
    case STRUCT: {
      if (oi instanceof StandardConstantStructObjectInspector) {
        // constant struct projection of known length
        return getSizeOfStruct((StandardConstantStructObjectInspector) oi);
      }
      final StructObjectInspector structOi = (StructObjectInspector) oi;

      // add constant object overhead for struct
      long structSize = JavaDataModel.get().object();

      // add constant struct field names references overhead
      structSize += structOi.getAllStructFieldRefs().size() * JavaDataModel.get().ref();
      for (StructField field : structOi.getAllStructFieldRefs()) {
        structSize += getSizeOfComplexTypes(conf, field.getFieldObjectInspector());
      }
      return structSize;
    }
    case UNION: {
      final UnionObjectInspector unionOi = (UnionObjectInspector) oi;

      // add constant object overhead for union
      long unionSize = JavaDataModel.get().object();

      // add constant size for unions tags
      unionSize += unionOi.getObjectInspectors().size() * JavaDataModel.get().primitive1();
      for (ObjectInspector memberOi : unionOi.getObjectInspectors()) {
        unionSize += getSizeOfComplexTypes(conf, memberOi);
      }
      return unionSize;
    }
    default:
      // any other category contributes no size
      return 0;
    }
  }

  /**
   * Get size of fixed length primitives.
   * Type names that are not recognized yield 0.
   *
   * @param colType column type
   * @return raw data size
   * @throws NullPointerException if colType is {@code null}
   */
  public static long getAvgColLenOfFixedLengthTypes(final String colType) {
    final String typeName = Objects.requireNonNull(colType).toLowerCase();
    final long size;
    switch (typeName) {
    case serdeConstants.TINYINT_TYPE_NAME:
    case serdeConstants.SMALLINT_TYPE_NAME:
    case serdeConstants.INT_TYPE_NAME:
    case serdeConstants.VOID_TYPE_NAME:
    case serdeConstants.BOOLEAN_TYPE_NAME:
    case serdeConstants.FLOAT_TYPE_NAME:
      size = JavaDataModel.get().primitive1();
      break;
    case serdeConstants.DOUBLE_TYPE_NAME:
    case serdeConstants.BIGINT_TYPE_NAME:
    case serdeConstants.INTERVAL_YEAR_MONTH_TYPE_NAME:
    case "long":
      size = JavaDataModel.get().primitive2();
      break;
    case serdeConstants.TIMESTAMP_TYPE_NAME:
    case serdeConstants.TIMESTAMPLOCALTZ_TYPE_NAME:
      size = JavaDataModel.get().lengthOfTimestamp();
      break;
    case serdeConstants.DATE_TYPE_NAME:
      size = JavaDataModel.get().lengthOfDate();
      break;
    case serdeConstants.INTERVAL_DAY_TIME_TYPE_NAME:
      size = JavaDataModel.JAVA32_META;
      break;
    default:
      // Decimal is matched by prefix rather than by an exact case label.
      // TODO: support complex types
      // for complex type we simply return 0
      size = typeName.startsWith(serdeConstants.DECIMAL_TYPE_NAME)
          ? JavaDataModel.get().lengthOfDecimal()
          : 0;
      break;
    }
    return size;
  }

  /**
   * Get the size of arrays of primitive types.
   * Varchar, char and decimal are matched by type-name prefix; all other types are matched
   * by exact name. Type names that are not recognized yield 0.
   *
   * @param colType The column type
   * @param length The length of the column type
   * @param maxLength The maximum length of the field
   * @return raw data size
   * @throws NullPointerException if colType is {@code null}
   */
  public static long getSizeOfPrimitiveTypeArraysFromType(final String colType, final int length, final int maxLength) {
    String colTypeLowerCase = Objects.requireNonNull(colType).toLowerCase();
    if (colTypeLowerCase.startsWith(serdeConstants.VARCHAR_TYPE_NAME)
        || colTypeLowerCase.startsWith(serdeConstants.CHAR_TYPE_NAME)) {
      int charTypeLen = JavaDataModel.get().lengthForStringOfLength(maxLength);
      return JavaDataModel.get().lengthForPrimitiveArrayOfSize(charTypeLen, length);
    }
    if (colTypeLowerCase.startsWith(serdeConstants.DECIMAL_TYPE_NAME)) {
      // Prefix match, consistent with getAvgColLenOfFixedLengthTypes: an exact match would
      // send a type name that carries a suffix to the default branch and size it as 0.
      return JavaDataModel.get().lengthForDecimalArrayOfSize(length);
    }
    switch (colTypeLowerCase) {
    case serdeConstants.TINYINT_TYPE_NAME:
    case serdeConstants.SMALLINT_TYPE_NAME:
    case serdeConstants.INT_TYPE_NAME:
    case serdeConstants.FLOAT_TYPE_NAME:
      return JavaDataModel.get().lengthForIntArrayOfSize(length);
    case serdeConstants.DOUBLE_TYPE_NAME:
      return JavaDataModel.get().lengthForDoubleArrayOfSize(length);
    case serdeConstants.BIGINT_TYPE_NAME:
    case "long":
      return JavaDataModel.get().lengthForLongArrayOfSize(length);
    case serdeConstants.BINARY_TYPE_NAME:
      return JavaDataModel.get().lengthForByteArrayOfSize(length);
    case serdeConstants.BOOLEAN_TYPE_NAME:
      return JavaDataModel.get().lengthForBooleanArrayOfSize(length);
    case serdeConstants.TIMESTAMP_TYPE_NAME:
    case serdeConstants.DATETIME_TYPE_NAME:
    case serdeConstants.INTERVAL_YEAR_MONTH_TYPE_NAME:
    case serdeConstants.INTERVAL_DAY_TIME_TYPE_NAME:
    case serdeConstants.TIMESTAMPLOCALTZ_TYPE_NAME:
      return JavaDataModel.get().lengthForTimestampArrayOfSize(length);
    case serdeConstants.DATE_TYPE_NAME:
      return JavaDataModel.get().lengthForDateArrayOfSize(length);
    case serdeConstants.STRING_TYPE_NAME:
      int stringTypeLen = JavaDataModel.get().lengthForStringOfLength(maxLength);
      return JavaDataModel.get().lengthForPrimitiveArrayOfSize(stringTypeLen, length);
    default:
      return 0;
    }
  }

  /**
   * Estimate the size of map object
   * Every key and value of the constant map is sized with {@code getWritableSize}, and the
   * hash map overhead for the number of entries is added on top.
   * @param scmoi
   *          - object inspector
   * @return size of map, 0 when the constant map value is {@code null}
   */
  public static long getSizeOfMap(StandardConstantMapObjectInspector scmoi) {
    Map<?, ?> map = scmoi.getWritableConstantValue();
    if (null == map) {
      return 0L;
    }
    ObjectInspector koi = scmoi.getMapKeyObjectInspector();
    ObjectInspector voi = scmoi.getMapValueObjectInspector();
    long result = 0;
    for (Map.Entry<?, ?> entry : map.entrySet()) {
      result += getWritableSize(koi, entry.getKey());
      result += getWritableSize(voi, entry.getValue());
    }

    // add additional overhead of each map entries
    result += JavaDataModel.get().hashMap(map.size());
    return result;
  }

  /**
   * Estimate the size of a constant struct
   * The object overhead and one reference per field are always counted. The field values are
   * added only when the constant value is present and has exactly one entry per field.
   * @param soi
   *          - object inspector
   * @return size of struct
   */
  public static long getSizeOfStruct(StandardConstantStructObjectInspector soi) {
    List<? extends StructField> fields = soi.getAllStructFieldRefs();

    // add constant object overhead for struct
    long result = JavaDataModel.get().object();

    // add constant struct field names references overhead
    result += fields.size() * JavaDataModel.get().ref();

    List<?> value = soi.getWritableConstantValue();
    if (value == null || value.size() != fields.size()) {
      return result;
    }
    for (int i = 0; i < fields.size(); i++) {
      result += getWritableSize(fields.get(i).getFieldObjectInspector(), value.get(i));
    }
    return result;
  }

  /**
   * Get size of primitive data types based on their respective writable object inspector
   * Object inspectors that are not recognized yield 0.
   * @param oi
   *          - object inspector
   * @param value
   *          - value; only read for string and binary inspectors, where {@code null}
   *          counts as length 0
   * @return raw data size
   */
  public static long getWritableSize(ObjectInspector oi, Object value) {
    if (oi instanceof WritableStringObjectInspector) {
      int stringLength = 0;
      if (value != null) {
        final WritableStringObjectInspector stringOi = (WritableStringObjectInspector) oi;
        stringLength = stringOi.getPrimitiveWritableObject(value).getLength();
      }
      return JavaDataModel.get().lengthForStringOfLength(stringLength);
    }
    if (oi instanceof WritableBinaryObjectInspector) {
      int byteLength = 0;
      if (value != null) {
        final WritableBinaryObjectInspector binaryOi = (WritableBinaryObjectInspector) oi;
        byteLength = binaryOi.getPrimitiveWritableObject(value).getLength();
      }
      return JavaDataModel.get().lengthForByteArrayOfSize(byteLength);
    }
    if (oi instanceof WritableBooleanObjectInspector
        || oi instanceof WritableByteObjectInspector) {
      return JavaDataModel.get().primitive1();
    }
    if (oi instanceof WritableDateObjectInspector) {
      return JavaDataModel.get().lengthOfDate();
    }
    if (oi instanceof WritableDoubleObjectInspector) {
      return JavaDataModel.get().primitive2();
    }
    if (oi instanceof WritableFloatObjectInspector) {
      return JavaDataModel.get().primitive1();
    }
    if (oi instanceof WritableHiveDecimalObjectInspector) {
      return JavaDataModel.get().lengthOfDecimal();
    }
    if (oi instanceof WritableIntObjectInspector) {
      return JavaDataModel.get().primitive1();
    }
    if (oi instanceof WritableLongObjectInspector) {
      return JavaDataModel.get().primitive2();
    }
    if (oi instanceof WritableShortObjectInspector) {
      return JavaDataModel.get().primitive1();
    }
    if (oi instanceof WritableTimestampObjectInspector
        || oi instanceof WritableTimestampLocalTZObjectInspector) {
      return JavaDataModel.get().lengthOfTimestamp();
    }

    // not one of the writable inspectors handled above
    return 0;
  }

  /**
   * Get column statistics from parent statistics.
   * When both the column expression map and the row schema are available, statistics are
   * derived per output column from its expression; otherwise the parent column statistics
   * are passed on unchanged.
   * @param conf
   *          - hive conf
   * @param parentStats
   *          - parent statistics
   * @param colExprMap
   *          - column expression map
   * @param rowSchema
   *          - row schema
   * @return column statistics
   */
  public static List<ColStatistics> getColStatisticsFromExprMap(HiveConf conf,
      Statistics parentStats, Map<String, ExprNodeDesc> colExprMap, RowSchema rowSchema) {

    List<ColStatistics> cs = new ArrayList<>();
    if (colExprMap == null || rowSchema == null) {
      // In cases where column expression map or row schema is missing, just pass on the parent
      // column stats. This could happen in cases like TS -> FIL where FIL does not map input
      // column names to internal names.
      if (parentStats.getColumnStats() != null) {
        cs.addAll(parentStats.getColumnStats());
      }
      return cs;
    }

    for (ColumnInfo ci : rowSchema.getSignature()) {
      String outColName = ci.getInternalName();
      ExprNodeDesc end = colExprMap.get(outColName);
      ColStatistics colStat = getColStatisticsFromExpression(conf, parentStats, end);
      if (colStat != null) {
        colStat.setColumnName(outColName);
        cs.add(colStat);
      }
    }
    // sometimes RowSchema is empty, so fetch stats of columns in exprMap
    for (Entry<String, ExprNodeDesc> pair : colExprMap.entrySet()) {
      if (rowSchema.getColumnInfo(pair.getKey()) == null) {
        ColStatistics colStat = getColStatisticsFromExpression(conf, parentStats, pair.getValue());
        if (colStat != null) {
          colStat.setColumnName(pair.getKey());
          cs.add(colStat);
        }
      }
    }
    return cs;
  }

  /**
   * Get column statistics from parent statistics given the
   * row schema of its child. Each parent column statistic is cloned into the returned list.
   * If the parent carries no column statistics at all, an empty list is returned.
   *
   * @param parentStats
   *          - parent statistics
   * @param rowSchema
   *          - row schema (not consulted by the current implementation)
   * @return column statistics, never null
   */
  public static List getColStatisticsUpdatingTableAlias(
          Statistics parentStats, RowSchema rowSchema) {

    List<ColStatistics> cs = new ArrayList<>();

    List<ColStatistics> parentColStats = parentStats.getColumnStats();
    if (parentColStats == null) {
      // The parent may not carry column statistics; getColStatisticsFromExprMap guards the very
      // same call, so do the same here instead of failing with a NullPointerException.
      return cs;
    }

    for (ColStatistics parentColStat : parentColStats) {
      ColStatistics colStat = parentColStat.clone();
      if (colStat != null) {
        cs.add(colStat);
      }
    }

    return cs;
  }

  /**
   * Computes the column statistics of a single expression, based on the statistics of the parent.
   *
   * <p>Depending on the kind of expression:
   * <ul>
   * <li>column reference: a clone of the parent's statistics for that column; for partition or
   * virtual columns without parent statistics, new statistics whose distinct count equals the
   * parent's row count; for other columns without parent statistics, {@code null}</li>
   * <li>constant: delegated to {@code buildColStatForConstant}</li>
   * <li>function call: for a widening cast directly on a column, the column's statistics with
   * adjusted name, type and average length; otherwise the function's stat estimator (when
   * enabled and applicable); otherwise the distinct count computed by {@code getNDVFor}</li>
   * <li>column list and field access: new statistics whose distinct count equals the parent's
   * row count</li>
   * <li>dynamic parameter: {@code null}</li>
   * </ul>
   *
   * @param conf
   *          - hive conf
   * @param parentStats
   *          - parent statistics
   * @param end
   *          - expression node, may be null
   * @return column statistics, or null if {@code end} is null or no statistics can be provided
   * @throws IllegalArgumentException if the expression type is not one of the kinds listed above
   */
  public static ColStatistics getColStatisticsFromExpression(HiveConf conf, Statistics parentStats,
      ExprNodeDesc end) {

    if (end == null) {
      return null;
    }

    // Accumulated by the branches below; branches that do not return early fall through to the
    // common construction code at the end of the method.
    String colName = null;
    String colType = null;
    double avgColSize = 0;
    long countDistincts = 0;
    // never reassigned: statistics built at the end of the method always report zero nulls
    long numNulls = 0;
    ObjectInspector oi = end.getWritableObjectInspector();
    long numRows = parentStats.getNumRows();

    if (end instanceof ExprNodeColumnDesc) {
      // column projection
      ExprNodeColumnDesc encd = (ExprNodeColumnDesc) end;
      colName = encd.getColumn();

      if (encd.getIsPartitionColOrVirtualCol()) {

        ColStatistics colStats = parentStats.getColumnStatisticsFromColName(colName);
        if (colStats != null) {
          /* If statistics for the column already exist use it. */
            return colStats.clone();
        }

        // virtual columns
        // no parent statistics: every row is taken to hold a distinct value
        colType = encd.getTypeInfo().getTypeName();
        countDistincts = numRows;
      } else {

        // clone the column stats and return
        ColStatistics result = parentStats.getColumnStatisticsFromColName(colName);
        if (result != null) {
            return result.clone();
        }
        // regular column without parent statistics: nothing can be said about it
        return null;
      }
    } else if (end instanceof ExprNodeConstantDesc) {
      return buildColStatForConstant(conf, numRows, (ExprNodeConstantDesc) end);
    } else if (end instanceof ExprNodeGenericFuncDesc) {
      ExprNodeGenericFuncDesc engfd = (ExprNodeGenericFuncDesc) end;
      colName = engfd.getName();
      colType = engfd.getTypeString();

      // If it is a widening cast, we do not change NDV, min, max
      if (isWideningCast(engfd) && engfd.getChildren().get(0) instanceof ExprNodeColumnDesc) {
        // cast on single column
        ColStatistics stats = parentStats.getColumnStatisticsFromColName(engfd.getCols().get(0));
        if (stats != null) {
          ColStatistics newStats;
          newStats = stats.clone();
          newStats.setColumnName(colName);
          colType = colType.toLowerCase();
          newStats.setColumnType(colType);
          // only the average length is recomputed for the target type
          newStats.setAvgColLen(getAvgColLenOf(conf, oi, colType));
          return newStats;
        }
      }

      if (conf.getBoolVar(ConfVars.HIVE_STATS_ESTIMATORS_ENABLE)) {
        Optional sep = engfd.getGenericUDF().adapt(StatEstimatorProvider.class);
        if (sep.isPresent()) {
          StatEstimator se = sep.get().getStatEstimator();
          // gather the statistics of every argument; stop at the first one that has none
          List csList = new ArrayList();
          for (ExprNodeDesc child : engfd.getChildren()) {
            ColStatistics cs = getColStatisticsFromExpression(conf, parentStats, child);
            if (cs == null) {
              break;
            }
            csList.add(cs);
          }
          // the estimator is only consulted when all arguments have statistics
          if (csList.size() == engfd.getChildren().size()) {
            Optional res = se.estimate(csList);
            if (res.isPresent()) {
              ColStatistics newStats = res.get();
              colType = colType.toLowerCase();
              newStats.setColumnType(colType);
              newStats.setColumnName(colName);
              return newStats;
            }
          }
        }
      }
      // fallback to default
      countDistincts = getNDVFor(engfd, numRows, parentStats);
    } else if (end instanceof ExprNodeColumnListDesc) {

      // column list
      ExprNodeColumnListDesc encd = (ExprNodeColumnListDesc) end;
      colName = Joiner.on(",").join(encd.getCols());
      colType = serdeConstants.LIST_TYPE_NAME;
      countDistincts = numRows;
    } else if (end instanceof ExprNodeFieldDesc) {

      // field within complex type
      ExprNodeFieldDesc enfd = (ExprNodeFieldDesc) end;
      colName = enfd.getFieldName();
      colType = enfd.getTypeString();
      countDistincts = numRows;
    } else if (end instanceof ExprDynamicParamDesc) {
      //skip collecting stats for parameters
      // ideally we should estimate and create colstats object, because otherwise it could lead to
      // planning as if stats are missing. But since colstats require column name and type it is not
      // possible to create colstats object
      return null;
    } else {
      throw new IllegalArgumentException("not supported expr type " + end.getClass());
    }

    // common path: build fresh statistics from the values gathered above
    colType = colType.toLowerCase();
    avgColSize = getAvgColLenOf(conf, oi, colType);
    ColStatistics colStats = new ColStatistics(colName, colType);
    colStats.setAvgColLen(avgColSize);
    colStats.setCountDistint(countDistincts);
    colStats.setNumNulls(numNulls);

    return colStats;
  }

  /**
   * Builds the column statistics of a constant expression.
   *
   * <p>A null constant is reported as null on all {@code numRows} rows with zero distinct values;
   * any other constant has one distinct value and no nulls. When {@code getConstValue} yields a
   * numeric value for the constant, it is used as both ends of the range.
   *
   * @param conf
   *          - hive conf
   * @param numRows
   *          - number of rows of the parent
   * @param encd
   *          - constant expression
   * @return column statistics of the constant
   */
  private static ColStatistics buildColStatForConstant(HiveConf conf, long numRows, ExprNodeConstantDesc encd) {

    boolean nullProjection = encd.getValue() == null;
    long nullCount = nullProjection ? numRows : 0;
    long distinctCount = nullProjection ? 0 : 1;

    String lowerCaseType = encd.getTypeString().toLowerCase();
    double avgLen = getAvgColLenOf(conf, encd.getWritableObjectInspector(), lowerCaseType);

    ColStatistics constStats = new ColStatistics(encd.getName(), lowerCaseType);
    constStats.setAvgColLen(avgLen);
    constStats.setCountDistint(distinctCount);
    constStats.setNumNulls(nullCount);

    // a constant spans a degenerate range: min == max == the constant itself
    Optional<Number> constValue = getConstValue(encd);
    constValue.ifPresent(v -> constStats.setRange(v, v));
    return constStats;
  }

  /**
   * Extracts the numeric value of a constant expression, if it has one.
   *
   * <p>The string form of the constant is parsed as a long or as a double, depending on the
   * primitive category that {@code GenericUDAFSum.getReturnType} reports for the constant's type.
   *
   * @param encd
   *          - constant expression
   * @return the parsed value; empty if the constant is null, no category is reported, the
   *         category is not handled, or the text cannot be parsed
   */
  private static Optional getConstValue(ExprNodeConstantDesc encd) {
    Object rawValue = encd.getValue();
    if (rawValue == null) {
      return Optional.empty();
    }
    String text = rawValue.toString();
    PrimitiveCategory category = GenericUDAFSum.getReturnType(encd.getTypeInfo());
    if (category == null) {
      return Optional.empty();
    }

    // tryParse returns null for unparsable text, which ofNullable turns into an empty Optional
    Number parsed;
    switch (category) {
    case BYTE:
    case SHORT:
    case INT:
    case LONG:
      parsed = Longs.tryParse(text);
      break;
    case FLOAT:
    case DOUBLE:
    case DECIMAL:
      parsed = Doubles.tryParse(text);
      break;
    default:
      parsed = null;
      break;
    }
    return Optional.ofNullable(parsed);
  }

  /**
   * Tells whether the function call is a cast whose argument type is implicitly convertible to
   * the result type.
   *
   * @param engfd
   *          - function expression
   * @return true only if the function is a cast operator and
   *         {@code TypeInfoUtils.implicitConvertible} accepts the conversion from the type of
   *         the first child to the type of the expression
   */
  private static boolean isWideningCast(ExprNodeGenericFuncDesc engfd) {
    boolean isCast = FunctionRegistry.isOpCast(engfd.getGenericUDF());
    // the conversion check is evaluated for casts only
    return isCast
        && TypeInfoUtils.implicitConvertible(
            engfd.getChildren().get(0).getTypeInfo(), engfd.getTypeInfo());
  }

  /**
   * Combines several distinct-value counts (NDVs) with exponential back-off.
   *
   * <p>The NDVs are sorted in descending order and combined as
   * {@code NDV1 * (NDV2 ^ (1/2)) * (NDV3 ^ (1/4)) * ...}; the running product is truncated to a
   * long after every step.
   *
   * <p>The list passed in is left untouched; sorting happens on a private copy.
   *
   * @param distinctVals
   *          - distinct-value counts (as {@code Long} values), at least one
   * @return the combined distinct-value estimate
   * @throws IndexOutOfBoundsException if {@code distinctVals} is empty
   */
  public static Long addWithExpDecay (List distinctVals) {
    // Exponential back-off for NDVs.
    // 1) Descending order sort of NDVs
    // 2) denominator = NDV1 * (NDV2 ^ (1/2)) * (NDV3 ^ (1/4))) * ....
    // Sort a copy so that the caller's list keeps its order.
    List<Long> sortedNdvs = new ArrayList<Long>(distinctVals);
    Collections.sort(sortedNdvs, Collections.reverseOrder());

    long denom = sortedNdvs.get(0);
    for (int i = 1; i < sortedNdvs.size(); i++) {
      // The exponent 2^-i is computed in floating point. The int expression (1 << i) wraps
      // around for i >= 31: it turns negative at 31 and is 1 again at 32, which would multiply
      // the 33rd NDV in with its full weight.
      double exponent = Math.scalb(1.0, -i);
      denom = (long) (denom * Math.pow(sortedNdvs.get(i), exponent));
    }

    return denom;
  }

  /**
   * Estimates the number of distinct values produced by a function call.
   *
   * <p>Functions that are neither deterministic nor runtime constants get {@code numRows}. If
   * the UDF class carries an {@code NDV} annotation, the estimate is {@code numRows} capped by
   * the annotation's {@code maxNdv()}. Otherwise the distinct counts of the referenced columns
   * that have parent statistics are combined with {@code addWithExpDecay} and capped by
   * {@code numRows}; if none of the columns has statistics the estimate is {@code numRows}.
   *
   * @param engfd
   *          - function expression
   * @param numRows
   *          - number of rows of the parent
   * @param parentStats
   *          - parent statistics
   * @return estimated number of distinct values, at most {@code numRows}
   */
  private static long getNDVFor(ExprNodeGenericFuncDesc engfd, long numRows, Statistics parentStats) {

    GenericUDF udf = engfd.getGenericUDF();
    boolean repeatable = FunctionRegistry.isDeterministic(udf) || FunctionRegistry.isRuntimeConstant(udf);
    if (!repeatable) {
      return numRows;
    }

    Class<?> udfClass = udf instanceof GenericUDFBridge ? ((GenericUDFBridge) udf).getUdfClass() : udf.getClass();
    NDV ndvAnnotation = AnnotationUtils.getAnnotation(udfClass, NDV.class);
    if (ndvAnnotation != null) {
      // annotated functions ignore the input columns; only the declared bound applies
      return Math.min(numRows, ndvAnnotation.maxNdv());
    }

    List<Long> inputNdvs = new ArrayList<>();
    for (String col : engfd.getCols()) {
      ColStatistics stats = parentStats.getColumnStatisticsFromColName(col);
      if (stats != null) {
        inputNdvs.add(stats.getCountDistint());
      }
    }
    if (inputNdvs.isEmpty()) {
      return numRows;
    }
    long combined = addWithExpDecay(inputNdvs);
    return Math.min(combined, numRows);
  }

  /**
   * Get number of rows of a given table, read from the table's
   * {@code StatsSetupConst.ROW_COUNT} statistic.
   * @param table
   *          - table
   * @return number of rows, as returned by {@code getBasicStatForTable}
   */
  @Deprecated
  public static long getNumRows(Table table) {
    final String statKey = StatsSetupConst.ROW_COUNT;
    return getBasicStatForTable(table, statKey);
  }

  /**
   * Get raw data size of a given table, read from the table's
   * {@code StatsSetupConst.RAW_DATA_SIZE} statistic.
   * @param table
   *          - table
   * @return raw data size, as returned by {@code getBasicStatForTable}
   */
  public static long getRawDataSize(Table table) {
    final String statKey = StatsSetupConst.RAW_DATA_SIZE;
    return getBasicStatForTable(table, statKey);
  }

  /**
   * Get total size of a given table, read from the table's
   * {@code StatsSetupConst.TOTAL_SIZE} statistic.
   * @param table
   *          - table
   * @return total size, as returned by {@code getBasicStatForTable}
   */
  public static long getTotalSize(Table table) {
    final String statKey = StatsSetupConst.TOTAL_SIZE;
    return getBasicStatForTable(table, statKey);
  }

  /**
   * Get number of Erasure Coded files for a table, read from the table's
   * {@code StatsSetupConst.NUM_ERASURE_CODED_FILES} statistic.
   * @param table
   *          - table
   * @return count of EC files, as returned by {@code getBasicStatForTable}
   */
  public static long getErasureCodedFiles(Table table) {
    final String statKey = StatsSetupConst.NUM_ERASURE_CODED_FILES;
    return getBasicStatForTable(table, statKey);
  }

  /**
   * Get basic stats of table, read from the table parameters.
   * @param table
   *          - table
   * @param statType
   *          - type of stats (the parameter key to look up)
   * @return value of stats, or -1 if the table has no parameters or the value stored under
   *         {@code statType} is missing or not a valid long
   */
  @Deprecated
  public static long getBasicStatForTable(Table table, String statType) {
    Map<String, String> params = table.getParameters();
    if (params == null) {
      return -1;
    }
    try {
      // a missing key yields null, which parseLong rejects with a NumberFormatException as well
      return Long.parseLong(params.get(statType));
    } catch (NumberFormatException e) {
      return -1;
    }
  }

  /**
   * Get basic stats of partitions, read from the parameters of each partition.
   *
   * <p>A partition whose value is missing or not a valid long contributes 0. A partition that
   * has no parameters at all contributes no entry, so the result can be shorter than
   * {@code parts}.
   *
   * @param table
   *          - table (not consulted by the current implementation)
   * @param parts
   *          - partitions
   * @param statType
   *          - type of stats (the parameter key to look up)
   * @return value of stats, one entry per partition that has parameters
   */
  public static List getBasicStatForPartitions(Table table, List parts,
      String statType) {

    List<Long> values = new ArrayList<>();
    for (Partition part : parts) {
      Map<String, String> params = part.getParameters();
      if (params == null) {
        continue;
      }
      long value;
      try {
        value = Long.parseLong(params.get(statType));
      } catch (NumberFormatException e) {
        value = 0;
      }
      values.add(value);
    }
    return values;
  }

  /**
   * Compute raw data size from column statistics.
   *
   * <p>For every column the per-value size is determined from its type (the recorded average
   * column length for fixed-width numeric, boolean and unknown types; a {@code JavaDataModel}
   * estimate for strings, binary, timestamps, decimals and dates) and multiplied by the number
   * of rows counted for that column. All arithmetic is bounded: on overflow the result is
   * {@code Long.MAX_VALUE} rather than a wrapped-around value.
   *
   * @param numRows
   *          - number of rows
   * @param colStats
   *          - column statistics
   * @return raw data size; 0 if {@code numRows} is not positive or {@code colStats} is null
   */
  public static long getDataSizeFromColumnStats(long numRows, List colStats) {
    long result = 0;

    if (numRows <= 0 || colStats == null) {
      return result;
    }

    if (colStats.isEmpty()) {
      // this may happen if we are not projecting any column from current operator
      // think count(*) where we are projecting rows without any columns
      // in such a case we estimate empty row to be of size of empty java object.
      // Bounded multiplication, like the per-column path below: numRows may already be
      // Long.MAX_VALUE after an earlier overflow, and a plain product would wrap around.
      return safeMult(numRows, (long) JavaDataModel.JAVA64_REF);
    }

    for (ColStatistics cs : colStats) {
      if (cs != null) {
        String colTypeLowerCase = cs.getColumnType().toLowerCase();
        // NOTE(review): when nulls are present the null rows are removed and one row is added
        // back - presumably to account for the null entry itself; confirm the intent.
        long nonNullCount = cs.getNumNulls() > 0 ? numRows - cs.getNumNulls() + 1 : numRows;
        double sizeOf = 0;
        if (colTypeLowerCase.equals(serdeConstants.TINYINT_TYPE_NAME)
            || colTypeLowerCase.equals(serdeConstants.SMALLINT_TYPE_NAME)
            || colTypeLowerCase.equals(serdeConstants.INT_TYPE_NAME)
            || colTypeLowerCase.equals(serdeConstants.BIGINT_TYPE_NAME)
            || colTypeLowerCase.equals(serdeConstants.BOOLEAN_TYPE_NAME)
            || colTypeLowerCase.equals(serdeConstants.FLOAT_TYPE_NAME)
            || colTypeLowerCase.equals(serdeConstants.DOUBLE_TYPE_NAME)) {
          sizeOf = cs.getAvgColLen();
        } else if (colTypeLowerCase.equals(serdeConstants.STRING_TYPE_NAME)
            || colTypeLowerCase.startsWith(serdeConstants.VARCHAR_TYPE_NAME)
            || colTypeLowerCase.startsWith(serdeConstants.CHAR_TYPE_NAME)) {
          // varchar/char carry a length suffix in their type name, hence startsWith
          int acl = (int) Math.round(cs.getAvgColLen());
          sizeOf = JavaDataModel.get().lengthForStringOfLength(acl);
        } else if (colTypeLowerCase.equals(serdeConstants.BINARY_TYPE_NAME)) {
          int acl = (int) Math.round(cs.getAvgColLen());
          sizeOf = JavaDataModel.get().lengthForByteArrayOfSize(acl);
        } else if (colTypeLowerCase.equals(serdeConstants.TIMESTAMP_TYPE_NAME) ||
            colTypeLowerCase.equals(serdeConstants.TIMESTAMPLOCALTZ_TYPE_NAME)) {
          sizeOf = JavaDataModel.get().lengthOfTimestamp();
        } else if (colTypeLowerCase.startsWith(serdeConstants.DECIMAL_TYPE_NAME)) {
          sizeOf = JavaDataModel.get().lengthOfDecimal();
        } else if (colTypeLowerCase.equals(serdeConstants.DATE_TYPE_NAME)) {
          sizeOf = JavaDataModel.get().lengthOfDate();
        } else {
          // any other type: fall back to the recorded average column length
          sizeOf = cs.getAvgColLen();
        }
        result = safeAdd(result, safeMult(nonNullCount, sizeOf));
      }
    }

    return result;
  }

  /**
   * Builds the qualified name of a table from its database and table name, joined with a dot.
   * A name that is null or empty is left out.
   *
   * @param dbName
   *          - database name
   * @param tabName
   *          - table name
   * @return the qualified table name
   */
  @Deprecated
  public static String getFullyQualifiedTableName(String dbName, String tabName) {
    String qualifiedName = getFullyQualifiedName(dbName, tabName);
    return qualifiedName;
  }

  /**
   * Joins the given name parts with a dot, leaving out parts that are null or empty.
   *
   * @param names
   *          - name parts, in order
   * @return the joined name; the empty string if no part qualifies
   */
  @Deprecated
  private static String getFullyQualifiedName(String... names) {
    StringBuilder qualified = new StringBuilder();
    for (String part : names) {
      if (part == null || part.isEmpty()) {
        continue;
      }
      // only non-empty parts are appended, so a non-empty builder means a separator is due
      if (qualified.length() > 0) {
        qualified.append(".");
      }
      qualified.append(part);
    }
    return qualified.toString();
  }

  /**
   * Get qualified column name from output key column names. Each key is prefixed with the
   * string form of {@code Utilities.ReduceField.KEY} followed by a dot.
   * @param keyExprs
   *          - output key names, may be null
   * @return list of qualified names; empty if {@code keyExprs} is null
   */
  public static List getQualifedReducerKeyNames(List keyExprs) {
    List<String> qualified = new ArrayList<>();
    if (keyExprs == null) {
      return qualified;
    }
    String prefix = Utilities.ReduceField.KEY.toString() + ".";
    for (String key : keyExprs) {
      qualified.add(prefix + key);
    }
    return qualified;
  }

  /**
   * Replaces a negative value by {@code Long.MAX_VALUE}. Negative row counts or data sizes are
   * invalid; they may be the result of a long overflow, in which case the largest long is the
   * closest valid value.
   * @param val - input value
   * @return Long.MAX_VALUE if val is negative else val
   */
  public static long getMaxIfOverflow(long val) {
    if (val < 0) {
      return Long.MAX_VALUE;
    }
    return val;
  }

  /**
   * Bounded multiplication of a long by a double - a product above {@code Long.MAX_VALUE}
   * becomes MAX_VALUE; otherwise the product is converted to a long (fraction dropped).
   */
  public static long safeMult(long a, double b) {
    final double product = a * b;
    if (product > Long.MAX_VALUE) {
      return Long.MAX_VALUE;
    }
    return (long) product;
  }

  /**
   * Bounded addition - overflows become MAX_VALUE.
   *
   * <p>Uses the JDK's {@code Math.addExact}, which signals overflow with an
   * {@code ArithmeticException}.
   *
   * @param a
   *          - first operand
   * @param b
   *          - second operand
   * @return {@code a + b}, or {@code Long.MAX_VALUE} if the sum does not fit into a long
   */
  public static long safeAdd(long a, long b) {
    try {
      return Math.addExact(a, b);
    } catch (ArithmeticException ex) {
      // the sum does not fit into a long
      return Long.MAX_VALUE;
    }
  }

  /**
   * Bounded multiplication - overflows become MAX_VALUE.
   *
   * <p>Uses the JDK's {@code Math.multiplyExact}, which signals overflow with an
   * {@code ArithmeticException}.
   *
   * @param a
   *          - first factor
   * @param b
   *          - second factor
   * @return {@code a * b}, or {@code Long.MAX_VALUE} if the product does not fit into a long
   */
  public static long safeMult(long a, long b) {
    try {
      return Math.multiplyExact(a, b);
    } catch (ArithmeticException ex) {
      // the product does not fit into a long
      return Long.MAX_VALUE;
    }
  }

  /**
   * Multiplies every element of a list by a factor, using the bounded long-by-double
   * multiplication.
   *
   * @param l
   *          - values to scale (as {@code Long} values)
   * @param b
   *          - factor
   * @return a new list holding the scaled values, in the order of {@code l}
   */
  public static List safeMult(List l, float b) {
    List<Long> scaled = new ArrayList<>();
    // widen the factor once; this is the conversion the per-element call applies anyway
    final double factor = b;
    for (Long value : l) {
      scaled.add(safeMult(value.longValue(), factor));
    }
    return scaled;
  }

  /**
   * Tells whether the column has a range over a discrete primitive type.
   *
   * @param colStat
   *          - column statistics
   * @return true if the statistics carry a range and the column type is a primitive of
   *         category BOOLEAN, BYTE, SHORT, INT or LONG; false otherwise
   */
  public static boolean hasDiscreteRange(ColStatistics colStat) {
    if (colStat.getRange() == null) {
      return false;
    }
    TypeInfo colType = TypeInfoUtils.getTypeInfoFromTypeString(colStat.getColumnType());
    if (colType.getCategory() != Category.PRIMITIVE) {
      return false;
    }
    switch (((PrimitiveTypeInfo) colType).getPrimitiveCategory()) {
      case BOOLEAN:
      case BYTE:
      case SHORT:
      case INT:
      case LONG:
        return true;
      default:
        return false;
    }
  }

  /**
   * Combines two overlapping ranges into one range spanning both. The bounds are compared by
   * their {@code longValue()}.
   *
   * @param range1
   *          - first range
   * @param range2
   *          - second range
   * @return the range from the smaller minimum to the larger maximum, or null if any bound is
   *         missing or the two ranges do not overlap
   */
  public static Range combineRange(Range range1, Range range2) {
    boolean fullyBounded = range1.minValue != null && range1.maxValue != null
        && range2.minValue != null && range2.maxValue != null;
    if (!fullyBounded) {
      return null;
    }

    long lo1 = range1.minValue.longValue();
    long hi1 = range1.maxValue.longValue();
    long lo2 = range2.minValue.longValue();
    long hi2 = range2.maxValue.longValue();

    // the ranges overlap when each one ends at or after the start of the other
    boolean overlapping = hi1 >= lo2 && hi2 >= lo1;
    if (!overlapping) {
      return null;
    }
    return new ColStatistics.Range(Math.min(lo1, lo2), Math.max(hi1, hi2));
  }

  /**
   * Tells whether statistics of the table can be used.
   *
   * @param table
   *          - table
   * @return true for a table that is not external; for an external table, true only if it is a
   *         non-native table whose storage handler reports that it can provide basic statistics
   */
  public static boolean checkCanProvideStats(Table table) {
    if (!MetaStoreUtils.isExternalTable(table.getTTable())) {
      return true;
    }
    // external table: the storage handler is asked only for non-native tables
    return MetaStoreUtils.isNonNativeTable(table.getTTable())
        && table.getStorageHandler().canProvideBasicStatistics();
  }

  /**
   * Are the basic stats for the table up-to-date for query planning.
   * Can run additional checks compared to the version in StatsSetupConst: the table must also
   * pass {@code checkCanProvideStats}, which is evaluated first.
   */
  public static boolean areBasicStatsUptoDateForQueryAnswering(Table table, Map params) {
    return checkCanProvideStats(table) && StatsSetupConst.areBasicStatsUptoDate(params);
  }

  /**
   * Are the column stats for the table up-to-date for query planning.
   * Can run additional checks compared to the version in StatsSetupConst: the table must also
   * pass {@code checkCanProvideStats}, which is evaluated first.
   */
  public static boolean areColumnStatsUptoDateForQueryAnswering(Table table, Map params, String colName) {
    return checkCanProvideStats(table) && StatsSetupConst.areColumnStatsUptoDate(params, colName);
  }

  /**
   * Update the basic statistics of the statistics object based on the row number.
   * No column is treated as affected, i.e. this delegates to the five-argument variant with an
   * empty set of affected columns.
   * @param stats
   *          - statistics to be updated
   * @param newNumRows
   *          - new number of rows
   * @param useColStats
   *          - use column statistics to compute data size
   * @param op
   *          - operator the statistics belong to; used in debug log messages
   */
  public static void updateStats(Statistics stats, long newNumRows,
      boolean useColStats, Operator op) {
    updateStats(stats, newNumRows, useColStats, op, Collections.emptySet());
  }

  /**
   * Update the row count of the statistics object and rescale the dependent statistics.
   *
   * <p>A negative {@code newNumRows} is replaced by {@code Long.MAX_VALUE} and a zero one by 1.
   * With {@code useColStats}, the distinct count of every affected column is scaled by the
   * ratio of new to old row count (only when the row count does not grow), every distinct
   * count and null count is capped by the new row count, and the data size is recomputed from
   * the column statistics. Without it, the data size is scaled by the ratio.
   *
   * @param stats
   *          - statistics to be updated
   * @param newNumRows
   *          - new number of rows
   * @param useColStats
   *          - use column statistics to compute data size
   * @param op
   *          - operator the statistics belong to; used in debug log messages
   * @param affectedColumns
   *          - names of the columns whose distinct count is to be rescaled
   */
  public static void updateStats(Statistics stats, long newNumRows,
      boolean useColStats, Operator op,
      Set affectedColumns) {

    if (newNumRows < 0) {
      LOG.debug("STATS-" + op.toString() + ": Overflow in number of rows. "
          + newNumRows + " rows will be set to Long.MAX_VALUE");
      newNumRows = getMaxIfOverflow(newNumRows);
    }
    if (newNumRows == 0) {
      LOG.debug("STATS-" + op.toString() + ": Equals 0 in number of rows. "
          + newNumRows + " rows will be set to 1");
      newNumRows = 1;
    }

    // the ratio is taken against the row count before it is overwritten
    final double ratio = (double) newNumRows / (double) stats.getNumRows();
    stats.setNumRows(newNumRows);

    if (!useColStats) {
      long scaledDataSize = (long) (ratio * stats.getDataSize());
      stats.setDataSize(getMaxIfOverflow(scaledDataSize));
      return;
    }

    List<ColStatistics> colStats = stats.getColumnStats();
    for (ColStatistics cs : colStats) {
      long distinctCount = cs.getCountDistint();
      if (affectedColumns.contains(cs.getColumnName())) {
        // if ratio is greater than 1, then number of rows increases. This can happen
        // when some operators like GROUPBY duplicates the input rows in which case
        // number of distincts should not change. Update the distinct count only when
        // the output number of rows is less than input number of rows.
        if (ratio <= 1.0) {
          distinctCount = (long) Math.ceil(ratio * distinctCount);
        }
        cs.setCountDistint(distinctCount);
        cs.setFilterColumn();
      }
      // there cannot be more distinct values than rows
      if (distinctCount > newNumRows) {
        cs.setCountDistint(newNumRows);
      }
      long scaledNumNulls = Math.round(ratio * cs.getNumNulls());
      cs.setNumNulls(Math.min(scaledNumNulls, newNumRows));
    }
    stats.setColumnStats(colStats);
    long recomputedDataSize = getDataSizeFromColumnStats(newNumRows, colStats);
    stats.setDataSize(getMaxIfOverflow(recomputedDataSize));
  }

  /**
   * Scales the counters of the given column statistics in place.
   *
   * <p>The number of falses, trues and nulls is multiplied by {@code factor} with bounded
   * multiplication. The distinct count is scaled (rounded up) only when {@code factor} is
   * below 1; otherwise it is left as it is.
   *
   * @param colStats
   *          - column statistics to scale
   * @param factor
   *          - scaling factor
   */
  public static void scaleColStatistics(List colStats, double factor) {
    final boolean shrinking = factor < 1.0;
    for (ColStatistics cs : colStats) {
      cs.setNumFalses(safeMult(cs.getNumFalses(), factor));
      cs.setNumTrues(safeMult(cs.getNumTrues(), factor));
      cs.setNumNulls(safeMult(cs.getNumNulls(), factor));
      if (!shrinking) {
        continue;
      }
      final double scaledNdv = Math.ceil(cs.getCountDistint() * factor);
      long boundedNdv;
      if (scaledNdv > Long.MAX_VALUE) {
        boundedNdv = Long.MAX_VALUE;
      } else {
        boundedNdv = (long) scaledNdv;
      }
      cs.setCountDistint(boundedNdv);
    }
  }

  /**
   * Computes the combined number of distinct values of a set of grouping columns.
   *
   * @param colStats
   *          - statistics of the grouping columns
   * @param parentStats
   *          - parent statistics
   * @param expDecay
   *          - combine the per-column values with {@code addWithExpDecay} instead of a plain
   *          (bounded) product
   * @return 0 if the per-column values cannot be determined, 1 if there is no value to combine,
   *         otherwise the combined value
   */
  public static long computeNDVGroupingColumns(List colStats, Statistics parentStats,
      boolean expDecay) {
    List<Long> ndvValues = extractNDVGroupingColumns(colStats, parentStats);
    if (ndvValues == null) {
      return 0L;
    }
    if (ndvValues.isEmpty()) {
      // No grouping columns, one row
      return 1L;
    }
    if (expDecay) {
      return addWithExpDecay(ndvValues);
    }
    long product = 1L;
    for (long ndv : ndvValues) {
      product = safeMult(product, ndv);
    }
    return product;
  }

  /**
   * Collects the distinct-value count of every grouping column; a column that has nulls counts
   * one more value.
   *
   * @param colStats
   *          - statistics of the grouping columns; entries may be null
   * @param parentStats
   *          - parent statistics
   * @return the per-column values, or null if statistics for a column are missing while the
   *         parent's column statistics state is not COMPLETE
   */
  private static List extractNDVGroupingColumns(List colStats, Statistics parentStats) {
    List<Long> ndvValues = new ArrayList<>(colStats.size());

    // compute product of distinct values of grouping columns
    for (ColStatistics cs : colStats) {
      if (cs == null) {
        if (parentStats.getColumnStatsState().equals(Statistics.State.COMPLETE)) {
          // the column must be an aggregate column inserted by GBY. We
          // don't have to account for this column when computing product
          // of NDVs
          continue;
        }
        // partial column statistics on grouping attributes case.
        // if column statistics on grouping attribute is missing, then
        // assume worst case.
        return null;
      }
      long ndv = cs.getCountDistint();
      if (cs.getNumNulls() > 0) {
        ndv = safeAdd(ndv, 1);
      }
      ndvValues.add(ndv);
    }

    return ndvValues;
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy