All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hadoop.hive.ql.stats.StatsUtils Maven / Gradle / Ivy

There is a newer version: 1.2.1.spark2
Show newest version
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.stats;

import com.google.common.base.Joiner;
import com.google.common.collect.Lists;
import com.google.common.math.LongMath;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.StatsSetupConst;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
import org.apache.hadoop.hive.metastore.api.AggrStats;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
import org.apache.hadoop.hive.metastore.api.Decimal;
import org.apache.hadoop.hive.ql.exec.ColumnInfo;
import org.apache.hadoop.hive.ql.exec.RowSchema;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.metadata.Hive;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
import org.apache.hadoop.hive.ql.plan.ColStatistics;
import org.apache.hadoop.hive.ql.plan.ColStatistics.Range;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnListDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeFieldDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.plan.Statistics;
import org.apache.hadoop.hive.ql.plan.Statistics.State;
import org.apache.hadoop.hive.ql.util.JavaDataModel;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StandardConstantListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StandardConstantMapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StandardListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StandardMapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.UnionObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableBinaryObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableBooleanObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableByteObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableConstantBinaryObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableConstantHiveCharObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableConstantHiveVarcharObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableConstantStringObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableDateObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableDoubleObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableFloatObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableHiveCharObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableHiveDecimalObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableHiveVarcharObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableIntObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableLongObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableShortObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableStringObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableTimestampObjectInspector;
import org.apache.hadoop.io.BytesWritable;
import org.apache.tez.mapreduce.hadoop.MRJobConfig;

import java.math.BigDecimal;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

public class StatsUtils {

  private static final Log LOG = LogFactory.getLog(StatsUtils.class.getName());


  /**
   * Collect table, partition and column level statistics
   * @param conf
   *          - hive configuration
   * @param partList
   *          - partition list
   * @param table
   *          - table
   * @param tableScanOperator
   *          - table scan operator
   * @return statistics object
   * @throws HiveException
   */
  public static Statistics collectStatistics(HiveConf conf, PrunedPartitionList partList,
      Table table, TableScanOperator tableScanOperator) throws HiveException {

    // column level statistics are required only for the columns that are needed
    List schema = tableScanOperator.getSchema().getSignature();
    List neededColumns = tableScanOperator.getNeededColumns();
    List referencedColumns = tableScanOperator.getReferencedColumns();

    return collectStatistics(conf, partList, table, schema, neededColumns, referencedColumns);
  }

  private static Statistics collectStatistics(HiveConf conf, PrunedPartitionList partList,
      Table table, List schema, List neededColumns,
      List referencedColumns) throws HiveException {

    boolean fetchColStats =
        HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_FETCH_COLUMN_STATS);
    boolean fetchPartStats =
        HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_FETCH_PARTITION_STATS);

    return collectStatistics(conf, partList, table, schema, neededColumns, referencedColumns,
        fetchColStats, fetchPartStats);
  }

  public static Statistics collectStatistics(HiveConf conf, PrunedPartitionList partList,
      Table table, List schema, List neededColumns,
      List referencedColumns, boolean fetchColStats, boolean fetchPartStats)
      throws HiveException {

    Statistics stats = new Statistics();

    float deserFactor =
        HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVE_STATS_DESERIALIZATION_FACTOR);

    if (!table.isPartitioned()) {
      long nr = getNumRows(table);
      long ds = getRawDataSize(table);
      if (ds <= 0) {
        ds = getTotalSize(table);

        // if data size is still 0 then get file size
        if (ds <= 0) {
          ds = getFileSizeForTable(conf, table);
        }

        ds = (long) (ds * deserFactor);
      }

      // number of rows -1 means that statistics from metastore is not reliable
      // and 0 means statistics gathering is disabled
      if (nr <= 0) {
        int avgRowSize = estimateRowSizeFromSchema(conf, schema, neededColumns);
        if (avgRowSize > 0) {
          if (LOG.isDebugEnabled()) {
            LOG.debug("Estimated average row size: " + avgRowSize);
          }
          nr = ds / avgRowSize;
        }
      }
      if (nr == 0) {
        nr = 1;
      }
      stats.setNumRows(nr);
      stats.setDataSize(ds);

      List colStats = Lists.newArrayList();
      if (fetchColStats) {
        colStats = getTableColumnStats(table, schema, neededColumns);
      }

      // infer if any column can be primary key based on column statistics
      inferAndSetPrimaryKey(stats.getNumRows(), colStats);

      stats.setColumnStatsState(deriveStatType(colStats, neededColumns));
      stats.addToColumnStats(colStats);
    } else if (partList != null) {
      // For partitioned tables, get the size of all the partitions after pruning
      // the partitions that are not required
      long nr = 0;
      long ds = 0;

      List rowCounts = Lists.newArrayList();
      List dataSizes = Lists.newArrayList();

      if (fetchPartStats) {
        rowCounts = getBasicStatForPartitions(
            table, partList.getNotDeniedPartns(), StatsSetupConst.ROW_COUNT);
        dataSizes =  getBasicStatForPartitions(
            table, partList.getNotDeniedPartns(), StatsSetupConst.RAW_DATA_SIZE);

        nr = getSumIgnoreNegatives(rowCounts);
        ds = getSumIgnoreNegatives(dataSizes);
        if (ds <= 0) {
          dataSizes = getBasicStatForPartitions(
              table, partList.getNotDeniedPartns(), StatsSetupConst.TOTAL_SIZE);
          ds = getSumIgnoreNegatives(dataSizes);
        }
      }

      // if data size still could not be determined, then fall back to filesytem to get file
      // sizes
      if (ds <= 0) {
        dataSizes = getFileSizeForPartitions(conf, partList.getNotDeniedPartns());
      }
      ds = getSumIgnoreNegatives(dataSizes);
      ds = (long) (ds * deserFactor);

      int avgRowSize = estimateRowSizeFromSchema(conf, schema, neededColumns);
      if (avgRowSize > 0) {
        setUnknownRcDsToAverage(rowCounts, dataSizes, avgRowSize);
        nr = getSumIgnoreNegatives(rowCounts);
        ds = getSumIgnoreNegatives(dataSizes);

        // number of rows -1 means that statistics from metastore is not reliable
        if (nr <= 0) {
          nr = ds / avgRowSize;
        }
      }
      if (nr == 0) {
        nr = 1;
      }
      stats.addToNumRows(nr);
      stats.addToDataSize(ds);

      // if at least a partition does not contain row count then mark basic stats state as PARTIAL
      if (containsNonPositives(rowCounts) &&
          stats.getBasicStatsState().equals(State.COMPLETE)) {
        stats.setBasicStatsState(State.PARTIAL);
      }
      if (fetchColStats) {
        List partNames = new ArrayList(partList.getNotDeniedPartns().size());
        for (Partition part : partList.getNotDeniedPartns()) {
          partNames.add(part.getName());
        }
        neededColumns = processNeededColumns(schema, neededColumns);
        AggrStats aggrStats = Hive.get().getAggrColStatsFor(table.getDbName(), table.getTableName(),
            neededColumns, partNames);
        if (null == aggrStats || null == aggrStats.getColStats()
            || aggrStats.getColStatsSize() == 0) {
          // There are some partitions with no state (or we didn't fetch any state).
          // Update the stats with empty list to reflect that in the
          // state/initialize structures.
          List emptyStats = Lists.newArrayList();

          // add partition column stats
          addParitionColumnStats(conf, neededColumns, referencedColumns, schema, table, partList,
              emptyStats);

          stats.addToColumnStats(emptyStats);
          stats.updateColumnStatsState(deriveStatType(emptyStats, referencedColumns));
        } else {
          List colStats = aggrStats.getColStats();
          if (colStats.size() != neededColumns.size()) {
            LOG.debug("Column stats requested for : " + neededColumns.size() + " columns. Able to" +
                " retrieve for " + colStats.size() + " columns");
          }
          List columnStats = convertColStats(colStats, table.getTableName());

          addParitionColumnStats(conf, neededColumns, referencedColumns, schema, table, partList,
              columnStats);

          // infer if any column can be primary key based on column statistics
          inferAndSetPrimaryKey(stats.getNumRows(), columnStats);

          stats.addToColumnStats(columnStats);
          State colState = deriveStatType(columnStats, referencedColumns);
          if (aggrStats.getPartsFound() != partNames.size() && colState != State.NONE) {
            LOG.debug("Column stats requested for : " + partNames.size() + " partitions. "
                + "Able to retrieve for " + aggrStats.getPartsFound() + " partitions");
            colState = State.PARTIAL;
          }
          stats.setColumnStatsState(colState);
        }
      }
    }
    return stats;
  }


  /**
   * Based on the provided column statistics and number of rows, this method infers if the column
   * can be primary key. It checks if the difference between the min and max value is equal to
   * number of rows specified.
   * @param numRows - number of rows
   * @param colStats - column statistics
   */
  public static void inferAndSetPrimaryKey(long numRows, List colStats) {
    if (colStats != null) {
      for (ColStatistics cs : colStats) {
        if (cs != null && cs.getCountDistint() >= numRows) {
          cs.setPrimaryKey(true);
        }
        else if (cs != null && cs.getRange() != null && cs.getRange().minValue != null &&
            cs.getRange().maxValue != null) {
          if (numRows ==
              ((cs.getRange().maxValue.longValue() - cs.getRange().minValue.longValue()) + 1)) {
            cs.setPrimaryKey(true);
          }
        }
      }
    }
  }

  /**
   * Infer foreign key relationship from given column statistics.
   * @param csPK - column statistics of primary key
   * @param csFK - column statistics of potential foreign key
   * @return
   */
  public static boolean inferForeignKey(ColStatistics csPK, ColStatistics csFK) {
    if (csPK != null && csFK != null) {
      if (csPK.isPrimaryKey()) {
        if (csPK.getRange() != null && csFK.getRange() != null) {
          ColStatistics.Range pkRange = csPK.getRange();
          ColStatistics.Range fkRange = csFK.getRange();
          return isWithin(fkRange, pkRange);
        }
      }
    }
    return false;
  }

  /**
   * Scale selectivity based on key range ratio.
   * @param csPK - column statistics of primary key
   * @param csFK - column statistics of potential foreign key
   * @return
   */
  public static float getScaledSelectivity(ColStatistics csPK, ColStatistics csFK) {
    float scaledSelectivity = 1.0f;
    if (csPK != null && csFK != null) {
      if (csPK.isPrimaryKey()) {
        // Use Max-Min Range as NDV gets scaled by selectivity.
        if (csPK.getRange() != null && csFK.getRange() != null) {
          long pkRangeDelta = getRangeDelta(csPK.getRange());
          long fkRangeDelta = getRangeDelta(csFK.getRange());
          if (fkRangeDelta > 0 && pkRangeDelta > 0 && fkRangeDelta < pkRangeDelta) {
            scaledSelectivity = (float) pkRangeDelta / (float) fkRangeDelta;
          }
        }
      }
    }
    return scaledSelectivity;
  }

  private static long getRangeDelta(ColStatistics.Range range) {
    if (range.minValue != null && range.maxValue != null) {
      return (range.maxValue.longValue() - range.minValue.longValue());
    }
    return 0;
  }

  private static boolean isWithin(ColStatistics.Range range1, ColStatistics.Range range2) {
    if (range1.minValue != null && range2.minValue != null && range1.maxValue != null &&
        range2.maxValue != null) {
      if (range1.minValue.longValue() >= range2.minValue.longValue() &&
          range1.maxValue.longValue() <= range2.maxValue.longValue()) {
        return true;
      }
    }
    return false;
  }

  private static void addParitionColumnStats(HiveConf conf, List neededColumns,
      List referencedColumns, List schema, Table table,
      PrunedPartitionList partList, List colStats)
      throws HiveException {

    // extra columns is difference between referenced columns vs needed
    // columns. The difference could be partition columns.
    List extraCols = Lists.newArrayList(referencedColumns);
    if (referencedColumns.size() > neededColumns.size()) {
      extraCols.removeAll(neededColumns);
      for (String col : extraCols) {
        for (ColumnInfo ci : schema) {
          // conditions for being partition column
          if (col.equals(ci.getInternalName()) && ci.getIsVirtualCol() &&
              !ci.isHiddenVirtualCol()) {
            // currently metastore does not store column stats for
            // partition column, so we calculate the NDV from pruned
            // partition list
            ColStatistics partCS = new ColStatistics(ci.getInternalName(), ci.getType()
                .getTypeName());
            long numPartitions = getNDVPartitionColumn(partList.getPartitions(),
                ci.getInternalName());
            partCS.setCountDistint(numPartitions);
            partCS.setAvgColLen(StatsUtils.getAvgColLenOfVariableLengthTypes(conf,
                ci.getObjectInspector(), partCS.getColumnType()));
            partCS.setRange(getRangePartitionColumn(partList.getPartitions(), ci.getInternalName(),
                ci.getType().getTypeName(), conf.getVar(ConfVars.DEFAULTPARTITIONNAME)));
            colStats.add(partCS);
          }
        }
      }
    }
  }

  public static int getNDVPartitionColumn(Set partitions, String partColName) {
    Set distinctVals = new HashSet(partitions.size());
    for (Partition partition : partitions) {
      distinctVals.add(partition.getSpec().get(partColName));
    }
    return distinctVals.size();
  }

  private static Range getRangePartitionColumn(Set partitions, String partColName,
      String colType, String defaultPartName) {
    Range range = null;
    String partVal;
    if (colType.equalsIgnoreCase(serdeConstants.TINYINT_TYPE_NAME)
        || colType.equalsIgnoreCase(serdeConstants.SMALLINT_TYPE_NAME)
        || colType.equalsIgnoreCase(serdeConstants.INT_TYPE_NAME)
        || colType.equalsIgnoreCase(serdeConstants.BIGINT_TYPE_NAME)) {
      long min = Long.MAX_VALUE;
      long max = Long.MIN_VALUE;
      for (Partition partition : partitions) {
        partVal = partition.getSpec().get(partColName);
        if (partVal.equals(defaultPartName)) {
          // partition column value is null.
          continue;
        } else {
          long value = Long.parseLong(partVal);
          min = Math.min(min, value);
          max = Math.max(max, value);
        }
      }
      range = new Range(min, max);
    } else if (colType.equalsIgnoreCase(serdeConstants.FLOAT_TYPE_NAME)
        || colType.equalsIgnoreCase(serdeConstants.DOUBLE_TYPE_NAME)) {
      double min = Double.MAX_VALUE;
      double max = Double.MIN_VALUE;
      for (Partition partition : partitions) {
        partVal = partition.getSpec().get(partColName);
        if (partVal.equals(defaultPartName)) {
          // partition column value is null.
          continue;
        } else {
          double value = Double.parseDouble(partVal);
          min = Math.min(min, value);
          max = Math.max(max, value);
        }
      }
      range = new Range(min, max);
    } else if (colType.startsWith(serdeConstants.DECIMAL_TYPE_NAME)) {
      double min = Double.MAX_VALUE;
      double max = Double.MIN_VALUE;
      for (Partition partition : partitions) {
        partVal = partition.getSpec().get(partColName);
        if (partVal.equals(defaultPartName)) {
          // partition column value is null.
          continue;
        } else {
          double value = new BigDecimal(partVal).doubleValue();
          min = Math.min(min, value);
          max = Math.max(max, value);
        }
      }
      range = new Range(min, max);
    } else {
      // Columns statistics for complex datatypes are not supported yet
      return null;
    }
    return range;
  }

  private static void setUnknownRcDsToAverage(
      List rowCounts, List dataSizes, int avgRowSize) {
    if (LOG.isDebugEnabled()) {
      LOG.debug("Estimated average row size: " + avgRowSize);
    }
    for (int i = 0; i < rowCounts.size(); i++) {
      long rc = rowCounts.get(i);
      long s = dataSizes.get(i);
      if (rc <= 0 && s > 0) {
        rc = s / avgRowSize;
        rowCounts.set(i, rc);
      }

      if (s <= 0 && rc > 0) {
        s = safeMult(rc, avgRowSize);
        dataSizes.set(i, s);
      }
    }
  }

  public static int estimateRowSizeFromSchema(HiveConf conf, List schema,
      List neededColumns) {
    int avgRowSize = 0;
    for (String neededCol : neededColumns) {
      ColumnInfo ci = getColumnInfoForColumn(neededCol, schema);
      if (ci == null) {
        // No need to collect statistics of index columns
        continue;
      }
      ObjectInspector oi = ci.getObjectInspector();
      String colType = ci.getTypeName();
      if (colType.equalsIgnoreCase(serdeConstants.STRING_TYPE_NAME)
          || colType.equalsIgnoreCase(serdeConstants.BINARY_TYPE_NAME)
          || colType.startsWith(serdeConstants.VARCHAR_TYPE_NAME)
          || colType.startsWith(serdeConstants.CHAR_TYPE_NAME)
          || colType.startsWith(serdeConstants.LIST_TYPE_NAME)
          || colType.startsWith(serdeConstants.MAP_TYPE_NAME)
          || colType.startsWith(serdeConstants.STRUCT_TYPE_NAME)
          || colType.startsWith(serdeConstants.UNION_TYPE_NAME)) {
        avgRowSize += getAvgColLenOfVariableLengthTypes(conf, oi, colType);
      } else {
        avgRowSize += getAvgColLenOfFixedLengthTypes(colType);
      }
    }
    return avgRowSize;
  }

  private static ColumnInfo getColumnInfoForColumn(String neededCol, List schema) {
    for (ColumnInfo ci : schema) {
      if (ci.getInternalName().equalsIgnoreCase(neededCol)) {
        return ci;
      }
    }
    return null;
  }

  /**
   * Find the bytes on disk occupied by a table
   * @param conf
   *          - hive conf
   * @param table
   *          - table
   * @return size on disk
   */
  public static long getFileSizeForTable(HiveConf conf, Table table) {
    Path path = table.getPath();
    long size = 0;
    try {
      FileSystem fs = path.getFileSystem(conf);
      size = fs.getContentSummary(path).getLength();
    } catch (Exception e) {
      size = 0;
    }
    return size;
  }

  /**
   * Find the bytes on disks occupied by list of partitions
   * @param conf
   *          - hive conf
   * @param parts
   *          - partition list
   * @return sizes of patitions
   */
  public static List getFileSizeForPartitions(HiveConf conf, List parts) {
    List sizes = Lists.newArrayList();
    for (Partition part : parts) {
      Path path = part.getDataLocation();
      long size = 0;
      try {
        FileSystem fs = path.getFileSystem(conf);
        size = fs.getContentSummary(path).getLength();
      } catch (Exception e) {
        size = 0;
      }
      sizes.add(size);
    }
    return sizes;
  }

  private static boolean containsNonPositives(List vals) {
    for (Long val : vals) {
      if (val <= 0L) {
        return true;
      }
    }
    return false;
  }

  /**
   * Get sum of all values in the list that are >0
   * @param vals
   *          - list of values
   * @return sum
   */
  public static long getSumIgnoreNegatives(List vals) {
    long result = 0;
    for (Long l : vals) {
      if (l > 0) {
        result = safeAdd(result, l);
      }
    }
    return result;
  }

  private static Statistics.State deriveStatType(
      List colStats, List neededColumns) {
    boolean hasStats = false,
        hasNull = (colStats == null) || (colStats.size() < neededColumns.size());
    if (colStats != null) {
      for (ColStatistics cs : colStats) {
        boolean isNull = cs == null;
        hasStats |= !isNull;
        hasNull |= isNull;
        if (hasNull && hasStats) break;
      }
    }
    State result = (hasStats
        ? (hasNull ? Statistics.State.PARTIAL : Statistics.State.COMPLETE)
        : (neededColumns.isEmpty() ? Statistics.State.COMPLETE : Statistics.State.NONE));
    return result;
  }

  /**
   * Convert ColumnStatisticsObj to ColStatistics
   * @param cso
   *          - ColumnStatisticsObj
   * @param tabName
   *          - table name
   * @param colName
   *          - column name
   * @return ColStatistics
   */
  public static ColStatistics getColStatistics(ColumnStatisticsObj cso, String tabName,
      String colName) {
    ColStatistics cs = new ColStatistics(colName, cso.getColType());
    String colType = cso.getColType();
    ColumnStatisticsData csd = cso.getStatsData();
    if (colType.equalsIgnoreCase(serdeConstants.TINYINT_TYPE_NAME)
        || colType.equalsIgnoreCase(serdeConstants.SMALLINT_TYPE_NAME)
        || colType.equalsIgnoreCase(serdeConstants.INT_TYPE_NAME)) {
      cs.setCountDistint(csd.getLongStats().getNumDVs());
      cs.setNumNulls(csd.getLongStats().getNumNulls());
      cs.setAvgColLen(JavaDataModel.get().primitive1());
      cs.setRange(csd.getLongStats().getLowValue(), csd.getLongStats().getHighValue());
    } else if (colType.equalsIgnoreCase(serdeConstants.BIGINT_TYPE_NAME)) {
      cs.setCountDistint(csd.getLongStats().getNumDVs());
      cs.setNumNulls(csd.getLongStats().getNumNulls());
      cs.setAvgColLen(JavaDataModel.get().primitive2());
      cs.setRange(csd.getLongStats().getLowValue(), csd.getLongStats().getHighValue());
    } else if (colType.equalsIgnoreCase(serdeConstants.FLOAT_TYPE_NAME)) {
      cs.setCountDistint(csd.getDoubleStats().getNumDVs());
      cs.setNumNulls(csd.getDoubleStats().getNumNulls());
      cs.setAvgColLen(JavaDataModel.get().primitive1());
      cs.setRange(csd.getDoubleStats().getLowValue(), csd.getDoubleStats().getHighValue());
    } else if (colType.equalsIgnoreCase(serdeConstants.DOUBLE_TYPE_NAME)) {
      cs.setCountDistint(csd.getDoubleStats().getNumDVs());
      cs.setNumNulls(csd.getDoubleStats().getNumNulls());
      cs.setAvgColLen(JavaDataModel.get().primitive2());
      cs.setRange(csd.getDoubleStats().getLowValue(), csd.getDoubleStats().getHighValue());
    } else if (colType.equalsIgnoreCase(serdeConstants.STRING_TYPE_NAME)
        || colType.startsWith(serdeConstants.CHAR_TYPE_NAME)
        || colType.startsWith(serdeConstants.VARCHAR_TYPE_NAME)) {
      cs.setCountDistint(csd.getStringStats().getNumDVs());
      cs.setNumNulls(csd.getStringStats().getNumNulls());
      cs.setAvgColLen(csd.getStringStats().getAvgColLen());
    } else if (colType.equalsIgnoreCase(serdeConstants.BOOLEAN_TYPE_NAME)) {
      if (csd.getBooleanStats().getNumFalses() > 0 && csd.getBooleanStats().getNumTrues() > 0) {
        cs.setCountDistint(2);
      } else {
        cs.setCountDistint(1);
      }
      cs.setNumTrues(csd.getBooleanStats().getNumTrues());
      cs.setNumFalses(csd.getBooleanStats().getNumFalses());
      cs.setNumNulls(csd.getBooleanStats().getNumNulls());
      cs.setAvgColLen(JavaDataModel.get().primitive1());
    } else if (colType.equalsIgnoreCase(serdeConstants.BINARY_TYPE_NAME)) {
      cs.setAvgColLen(csd.getBinaryStats().getAvgColLen());
      cs.setNumNulls(csd.getBinaryStats().getNumNulls());
    } else if (colType.equalsIgnoreCase(serdeConstants.TIMESTAMP_TYPE_NAME)) {
      cs.setAvgColLen(JavaDataModel.get().lengthOfTimestamp());
    } else if (colType.startsWith(serdeConstants.DECIMAL_TYPE_NAME)) {
      cs.setAvgColLen(JavaDataModel.get().lengthOfDecimal());
      cs.setCountDistint(csd.getDecimalStats().getNumDVs());
      cs.setNumNulls(csd.getDecimalStats().getNumNulls());
      Decimal val = csd.getDecimalStats().getHighValue();
      BigDecimal maxVal = HiveDecimal.
          create(new BigInteger(val.getUnscaled()), val.getScale()).bigDecimalValue();
      val = csd.getDecimalStats().getLowValue();
      BigDecimal minVal = HiveDecimal.
          create(new BigInteger(val.getUnscaled()), val.getScale()).bigDecimalValue();
      cs.setRange(minVal, maxVal);
    } else if (colType.equalsIgnoreCase(serdeConstants.DATE_TYPE_NAME)) {
      cs.setAvgColLen(JavaDataModel.get().lengthOfDate());
    } else {
      // Columns statistics for complex datatypes are not supported yet
      return null;
    }

    return cs;
  }

  /**
   * Get table level column statistics from metastore for needed columns
   * @param table
   *          - table
   * @param schema
   *          - output schema
   * @param neededColumns
   *          - list of needed columns
   * @return column statistics
   */
  public static List getTableColumnStats(
      Table table, List schema, List neededColumns) {
    String dbName = table.getDbName();
    String tabName = table.getTableName();
    List neededColsInTable = processNeededColumns(schema, neededColumns);
    List stats = null;
    try {
      List colStat = Hive.get().getTableColumnStatistics(
          dbName, tabName, neededColsInTable);
      stats = convertColStats(colStat, tabName);
    } catch (HiveException e) {
      LOG.error("Failed to retrieve table statistics: ", e);
      stats = null;
    }
    return stats;
  }

  private static List convertColStats(List colStats, String tabName) {
    List stats = new ArrayList(colStats.size());
    for (ColumnStatisticsObj statObj : colStats) {
      ColStatistics cs = getColStatistics(statObj, tabName, statObj.getColName());
      stats.add(cs);
    }
    return stats;
  }
  private static List processNeededColumns(List schema,
      List neededColumns) {
    // Remove hidden virtual columns, as well as needed columns that are not
    // part of the table. TODO: the latter case should not really happen...
    List neededColsInTable = null;
    int limit = neededColumns.size();
    for (int i = 0; i < limit; ++i) {
      if (neededColsInTable == null) {
        neededColsInTable = Lists.newArrayList(neededColumns);
      }
      neededColsInTable.remove(i--);
      --limit;
    }
    return (neededColsInTable == null || neededColsInTable.size() == 0) ? neededColumns
        : neededColsInTable;
  }

  /**
   * Get the raw data size of variable length data types
   * @param conf
   *          - hive conf
   * @param oi
   *          - object inspector
   * @param colType
   *          - column type
   * @return raw data size
   */
  public static long getAvgColLenOfVariableLengthTypes(HiveConf conf, ObjectInspector oi,
      String colType) {

    long configVarLen = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_STATS_MAX_VARIABLE_LENGTH);

    if (colType.equalsIgnoreCase(serdeConstants.STRING_TYPE_NAME)) {

      // constant string projection Ex: select "hello" from table
      if (oi instanceof ConstantObjectInspector) {
        ConstantObjectInspector coi = (ConstantObjectInspector) oi;

        // if writable constant is null then return size 0
        if (coi.getWritableConstantValue() == null) {
          return 0;
        }

        return coi.getWritableConstantValue().toString().length();
      } else if (oi instanceof WritableConstantStringObjectInspector) {

        // some UDFs return writable constant strings (fixed width)
        // Ex: select upper("hello") from table
        WritableConstantStringObjectInspector wcsoi = (WritableConstantStringObjectInspector) oi;

        return wcsoi.getWritableConstantValue().toString().length();
      } else if (oi instanceof WritableStringObjectInspector) {

        // some UDFs may emit strings of variable length. like pattern matching
        // UDFs. it's hard to find the length of such UDFs.
        // return the variable length from config
        return configVarLen;
      }
    } else if (colType.startsWith(serdeConstants.VARCHAR_TYPE_NAME)) {

      // constant varchar projection
      if (oi instanceof ConstantObjectInspector) {
        ConstantObjectInspector coi = (ConstantObjectInspector) oi;

        // if writable constant is null then return size 0
        if (coi.getWritableConstantValue() == null) {
          return 0;
        }

        return coi.getWritableConstantValue().toString().length();
      } else if (oi instanceof WritableConstantHiveVarcharObjectInspector) {

        WritableConstantHiveVarcharObjectInspector wcsoi =
            (WritableConstantHiveVarcharObjectInspector) oi;
        return wcsoi.getWritableConstantValue().toString().length();
      } else if (oi instanceof WritableHiveVarcharObjectInspector) {
        return ((WritableHiveVarcharObjectInspector) oi).getMaxLength();
      }
    } else if (colType.startsWith(serdeConstants.CHAR_TYPE_NAME)) {

      // constant char projection
      if (oi instanceof ConstantObjectInspector) {
        ConstantObjectInspector coi = (ConstantObjectInspector) oi;

        // if writable constant is null then return size 0
        if (coi.getWritableConstantValue() == null) {
          return 0;
        }

        return coi.getWritableConstantValue().toString().length();
      } else if (oi instanceof WritableConstantHiveCharObjectInspector) {

        WritableConstantHiveCharObjectInspector wcsoi =
            (WritableConstantHiveCharObjectInspector) oi;
        return wcsoi.getWritableConstantValue().toString().length();
      } else if (oi instanceof WritableHiveCharObjectInspector) {
        return ((WritableHiveCharObjectInspector) oi).getMaxLength();
      }
    } else if (colType.equalsIgnoreCase(serdeConstants.BINARY_TYPE_NAME)) {

      // constant byte arrays
      if (oi instanceof ConstantObjectInspector) {
        ConstantObjectInspector coi = (ConstantObjectInspector) oi;

        // if writable constant is null then return size 0
        if (coi.getWritableConstantValue() == null) {
          return 0;
        }

        BytesWritable bw = ((BytesWritable) coi.getWritableConstantValue());
        return bw.getLength();
      } else if (oi instanceof WritableConstantBinaryObjectInspector) {

        // writable constant byte arrays
        WritableConstantBinaryObjectInspector wcboi = (WritableConstantBinaryObjectInspector) oi;

        return wcboi.getWritableConstantValue().getLength();
      } else if (oi instanceof WritableBinaryObjectInspector) {

        // return the variable length from config
        return configVarLen;
      }
    } else {

      // complex types (map, list, struct, union)
      return getSizeOfComplexTypes(conf, oi);
    }

    return 0;
  }

  /**
   * Get the size of complex data types
   * @param conf
   *          - hive conf
   * @param oi
   *          - object inspector
   * @return raw data size
   */
  public static long getSizeOfComplexTypes(HiveConf conf, ObjectInspector oi) {
    long result = 0;
    int length = 0;
    int listEntries = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_STATS_LIST_NUM_ENTRIES);
    int mapEntries = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVE_STATS_MAP_NUM_ENTRIES);

    switch (oi.getCategory()) {
    case PRIMITIVE:
      String colType = oi.getTypeName();
      if (colType.equalsIgnoreCase(serdeConstants.STRING_TYPE_NAME)
          || colType.startsWith(serdeConstants.VARCHAR_TYPE_NAME)
          || colType.startsWith(serdeConstants.CHAR_TYPE_NAME)) {
        int avgColLen = (int) getAvgColLenOfVariableLengthTypes(conf, oi, colType);
        result += JavaDataModel.get().lengthForStringOfLength(avgColLen);
      } else if (colType.equalsIgnoreCase(serdeConstants.BINARY_TYPE_NAME)) {
        int avgColLen = (int) getAvgColLenOfVariableLengthTypes(conf, oi, colType);
        result += JavaDataModel.get().lengthForByteArrayOfSize(avgColLen);
      } else {
        result += getAvgColLenOfFixedLengthTypes(colType);
      }
      break;
    case LIST:
      if (oi instanceof StandardConstantListObjectInspector) {

        // constant list projection of known length
        StandardConstantListObjectInspector scloi = (StandardConstantListObjectInspector) oi;
        length = scloi.getWritableConstantValue().size();

        // check if list elements are primitive or Objects
        ObjectInspector leoi = scloi.getListElementObjectInspector();
        if (leoi.getCategory().equals(ObjectInspector.Category.PRIMITIVE)) {
          result += getSizeOfPrimitiveTypeArraysFromType(leoi.getTypeName(), length);
        } else {
          result += JavaDataModel.get().lengthForObjectArrayOfSize(length);
        }
      } else {
        StandardListObjectInspector sloi = (StandardListObjectInspector) oi;

        // list overhead + (configured number of element in list * size of element)
        long elemSize = getSizeOfComplexTypes(conf, sloi.getListElementObjectInspector());
        result += JavaDataModel.get().arrayList() + (listEntries * elemSize);
      }
      break;
    case MAP:
      if (oi instanceof StandardConstantMapObjectInspector) {

        // constant map projection of known length
        StandardConstantMapObjectInspector scmoi = (StandardConstantMapObjectInspector) oi;
        result += getSizeOfMap(scmoi);
      } else {
        StandardMapObjectInspector smoi = (StandardMapObjectInspector) oi;
        result += getSizeOfComplexTypes(conf, smoi.getMapKeyObjectInspector());
        result += getSizeOfComplexTypes(conf, smoi.getMapValueObjectInspector());

        // hash map overhead
        result += JavaDataModel.get().hashMap(mapEntries);
      }
      break;
    case STRUCT:
      StructObjectInspector soi = (StructObjectInspector) oi;

      // add constant object overhead for struct
      result += JavaDataModel.get().object();

      // add constant struct field names references overhead
      result += soi.getAllStructFieldRefs().size() * JavaDataModel.get().ref();
      for (StructField field : soi.getAllStructFieldRefs()) {
        result += getSizeOfComplexTypes(conf, field.getFieldObjectInspector());
      }
      break;
    case UNION:
      UnionObjectInspector uoi = (UnionObjectInspector) oi;

      // add constant object overhead for union
      result += JavaDataModel.get().object();

      // add constant size for unions tags
      result += uoi.getObjectInspectors().size() * JavaDataModel.get().primitive1();
      for (ObjectInspector foi : uoi.getObjectInspectors()) {
        result += getSizeOfComplexTypes(conf, foi);
      }
      break;
    default:
      break;
    }

    return result;
  }

  /**
   * Get size of fixed length primitives
   * @param colType
   *          - column type
   * @return raw data size
   */
  public static long getAvgColLenOfFixedLengthTypes(String colType) {
    if (colType.equalsIgnoreCase(serdeConstants.TINYINT_TYPE_NAME)
        || colType.equalsIgnoreCase(serdeConstants.SMALLINT_TYPE_NAME)
        || colType.equalsIgnoreCase(serdeConstants.INT_TYPE_NAME)
        || colType.equalsIgnoreCase(serdeConstants.BOOLEAN_TYPE_NAME)
        || colType.equalsIgnoreCase(serdeConstants.FLOAT_TYPE_NAME)) {
      return JavaDataModel.get().primitive1();
    } else if (colType.equalsIgnoreCase(serdeConstants.DOUBLE_TYPE_NAME)
        || colType.equalsIgnoreCase(serdeConstants.BIGINT_TYPE_NAME)
        || colType.equalsIgnoreCase("long")) {
      return JavaDataModel.get().primitive2();
    } else if (colType.equalsIgnoreCase(serdeConstants.TIMESTAMP_TYPE_NAME)) {
      return JavaDataModel.get().lengthOfTimestamp();
    } else if (colType.equalsIgnoreCase(serdeConstants.DATE_TYPE_NAME)) {
      return JavaDataModel.get().lengthOfDate();
    } else if (colType.startsWith(serdeConstants.DECIMAL_TYPE_NAME)) {
      return JavaDataModel.get().lengthOfDecimal();
    } else {
      return 0;
    }
  }

  /**
   * Get the size of arrays of primitive types
   * @param colType
   *          - column type
   * @param length
   *          - array length
   * @return raw data size
   */
  public static long getSizeOfPrimitiveTypeArraysFromType(String colType, int length) {
    if (colType.equalsIgnoreCase(serdeConstants.TINYINT_TYPE_NAME)
        || colType.equalsIgnoreCase(serdeConstants.SMALLINT_TYPE_NAME)
        || colType.equalsIgnoreCase(serdeConstants.INT_TYPE_NAME)
        || colType.equalsIgnoreCase(serdeConstants.FLOAT_TYPE_NAME)) {
      return JavaDataModel.get().lengthForIntArrayOfSize(length);
    } else if (colType.equalsIgnoreCase(serdeConstants.DOUBLE_TYPE_NAME)) {
      return JavaDataModel.get().lengthForDoubleArrayOfSize(length);
    } else if (colType.equalsIgnoreCase(serdeConstants.BIGINT_TYPE_NAME)
        || colType.equalsIgnoreCase("long")) {
      return JavaDataModel.get().lengthForLongArrayOfSize(length);
    } else if (colType.equalsIgnoreCase(serdeConstants.BINARY_TYPE_NAME)) {
      return JavaDataModel.get().lengthForByteArrayOfSize(length);
    } else if (colType.equalsIgnoreCase(serdeConstants.BOOLEAN_TYPE_NAME)) {
      return JavaDataModel.get().lengthForBooleanArrayOfSize(length);
    } else if (colType.equalsIgnoreCase(serdeConstants.TIMESTAMP_TYPE_NAME)) {
      return JavaDataModel.get().lengthForTimestampArrayOfSize(length);
    } else if (colType.equalsIgnoreCase(serdeConstants.DATE_TYPE_NAME)) {
      return JavaDataModel.get().lengthForDateArrayOfSize(length);
    } else if (colType.startsWith(serdeConstants.DECIMAL_TYPE_NAME)) {
      return JavaDataModel.get().lengthForDecimalArrayOfSize(length);
    } else {
      return 0;
    }
  }

  /**
   * Estimate the size of map object
   * @param scmoi
   *          - object inspector
   * @return size of map
   */
  public static long getSizeOfMap(StandardConstantMapObjectInspector scmoi) {
    Map map = scmoi.getWritableConstantValue();
    ObjectInspector koi = scmoi.getMapKeyObjectInspector();
    ObjectInspector voi = scmoi.getMapValueObjectInspector();
    long result = 0;
    for (Map.Entry entry : map.entrySet()) {
      result += getWritableSize(koi, entry.getKey());
      result += getWritableSize(voi, entry.getValue());
    }

    // add additional overhead of each map entries
    result += JavaDataModel.get().hashMap(map.entrySet().size());
    return result;
  }

  /**
   * Get size of primitive data types based on their respective writable object inspector
   * @param oi
   *          - object inspector
   * @param value
   *          - value
   * @return raw data size
   */
  public static long getWritableSize(ObjectInspector oi, Object value) {
    if (oi instanceof WritableStringObjectInspector) {
      WritableStringObjectInspector woi = (WritableStringObjectInspector) oi;
      return JavaDataModel.get().lengthForStringOfLength(
          woi.getPrimitiveWritableObject(value).getLength());
    } else if (oi instanceof WritableBinaryObjectInspector) {
      WritableBinaryObjectInspector woi = (WritableBinaryObjectInspector) oi;
      return JavaDataModel.get().lengthForByteArrayOfSize(
          woi.getPrimitiveWritableObject(value).getLength());
    } else if (oi instanceof WritableBooleanObjectInspector) {
      return JavaDataModel.get().primitive1();
    } else if (oi instanceof WritableByteObjectInspector) {
      return JavaDataModel.get().primitive1();
    } else if (oi instanceof WritableDateObjectInspector) {
      return JavaDataModel.get().lengthOfDate();
    } else if (oi instanceof WritableDoubleObjectInspector) {
      return JavaDataModel.get().primitive2();
    } else if (oi instanceof WritableFloatObjectInspector) {
      return JavaDataModel.get().primitive1();
    } else if (oi instanceof WritableHiveDecimalObjectInspector) {
      return JavaDataModel.get().lengthOfDecimal();
    } else if (oi instanceof WritableIntObjectInspector) {
      return JavaDataModel.get().primitive1();
    } else if (oi instanceof WritableLongObjectInspector) {
      return JavaDataModel.get().primitive2();
    } else if (oi instanceof WritableShortObjectInspector) {
      return JavaDataModel.get().primitive1();
    } else if (oi instanceof WritableTimestampObjectInspector) {
      return JavaDataModel.get().lengthOfTimestamp();
    }

    return 0;
  }

  /**
   * Get column statistics from parent statistics.
   * @param conf
   *          - hive conf
   * @param parentStats
   *          - parent statistics
   * @param colExprMap
   *          - column expression map
   * @param rowSchema
   *          - row schema
   * @return column statistics
   */
  public static List getColStatisticsFromExprMap(HiveConf conf,
      Statistics parentStats, Map colExprMap, RowSchema rowSchema) {

    List cs = Lists.newArrayList();
    if (colExprMap != null  && rowSchema != null) {
      for (ColumnInfo ci : rowSchema.getSignature()) {
        String outColName = ci.getInternalName();
        ExprNodeDesc end = colExprMap.get(outColName);
        ColStatistics colStat = getColStatisticsFromExpression(conf, parentStats, end);
        if (colStat != null) {
          colStat.setColumnName(outColName);
          cs.add(colStat);
        }
      }

      return cs;
    }

    // In cases where column expression map or row schema is missing, just pass on the parent column
    // stats. This could happen in cases like TS -> FIL where FIL does not map input column names to
    // internal names.
    if (colExprMap == null || rowSchema == null) {
      if (parentStats.getColumnStats() != null) {
        cs.addAll(parentStats.getColumnStats());
      }
    }
    return cs;
  }

  /**
   * Get column statistics from parent statistics given the
   * row schema of its child.
   * @param parentStats
   *          - parent statistics
   * @param rowSchema
   *          - row schema
   * @return column statistics
   */
  public static List getColStatisticsUpdatingTableAlias(
          Statistics parentStats, RowSchema rowSchema) {

    List cs = Lists.newArrayList();

    for (ColStatistics parentColStat : parentStats.getColumnStats()) {
      ColStatistics colStat;
      try {
        colStat = parentColStat.clone();
      } catch (CloneNotSupportedException e) {
        colStat = null;
      }
      if (colStat != null) {
        cs.add(colStat);
      }
    }

    return cs;
  }

  /**
   * Get column statistics expression nodes
   * @param conf
   *          - hive conf
   * @param parentStats
   *          - parent statistics
   * @param end
   *          - expression nodes
   * @return column statistics
   */
  public static ColStatistics getColStatisticsFromExpression(HiveConf conf, Statistics parentStats,
      ExprNodeDesc end) {

    if (end == null) {
      return null;
    }

    String colName = null;
    String colType = null;
    double avgColSize = 0;
    long countDistincts = 0;
    long numNulls = 0;
    ObjectInspector oi = null;
    long numRows = parentStats.getNumRows();

    if (end instanceof ExprNodeColumnDesc) {
      // column projection
      ExprNodeColumnDesc encd = (ExprNodeColumnDesc) end;
      colName = encd.getColumn();

      if (encd.getIsPartitionColOrVirtualCol()) {

        ColStatistics colStats = parentStats.getColumnStatisticsFromColName(colName);
        if (colStats != null) {
          /* If statistics for the column already exist use it. */
          return colStats;
        }

        // virtual columns
        colType = encd.getTypeInfo().getTypeName();
        countDistincts = numRows;
        oi = encd.getWritableObjectInspector();
      } else {

        // clone the column stats and return
        ColStatistics result = parentStats.getColumnStatisticsFromColName(colName);
        if (result != null) {
          try {
            return result.clone();
          } catch (CloneNotSupportedException e) {
            return null;
          }
        }
        return null;
      }
    } else if (end instanceof ExprNodeConstantDesc) {

      // constant projection
      ExprNodeConstantDesc encd = (ExprNodeConstantDesc) end;

      // null projection
      if (encd.getValue() == null) {
        colName = encd.getName();
        colType = "null";
        numNulls = numRows;
      } else {
        colName = encd.getName();
        colType = encd.getTypeString();
        countDistincts = 1;
        oi = encd.getWritableObjectInspector();
      }
    } else if (end instanceof ExprNodeGenericFuncDesc) {

      // udf projection
      ExprNodeGenericFuncDesc engfd = (ExprNodeGenericFuncDesc) end;
      colName = engfd.getName();
      colType = engfd.getTypeString();
      countDistincts = numRows;
      oi = engfd.getWritableObjectInspector();
    } else if (end instanceof ExprNodeColumnListDesc) {

      // column list
      ExprNodeColumnListDesc encd = (ExprNodeColumnListDesc) end;
      colName = Joiner.on(",").join(encd.getCols());
      colType = "array";
      countDistincts = numRows;
      oi = encd.getWritableObjectInspector();
    } else if (end instanceof ExprNodeFieldDesc) {

      // field within complex type
      ExprNodeFieldDesc enfd = (ExprNodeFieldDesc) end;
      colName = enfd.getFieldName();
      colType = enfd.getTypeString();
      countDistincts = numRows;
      oi = enfd.getWritableObjectInspector();
    }

    if (colType.equalsIgnoreCase(serdeConstants.STRING_TYPE_NAME)
        || colType.equalsIgnoreCase(serdeConstants.BINARY_TYPE_NAME)
        || colType.startsWith(serdeConstants.VARCHAR_TYPE_NAME)
        || colType.startsWith(serdeConstants.CHAR_TYPE_NAME)
        || colType.startsWith(serdeConstants.LIST_TYPE_NAME)
        || colType.startsWith(serdeConstants.MAP_TYPE_NAME)
        || colType.startsWith(serdeConstants.STRUCT_TYPE_NAME)
        || colType.startsWith(serdeConstants.UNION_TYPE_NAME)) {
      avgColSize = getAvgColLenOfVariableLengthTypes(conf, oi, colType);
    } else {
      avgColSize = getAvgColLenOfFixedLengthTypes(colType);
    }

    ColStatistics colStats = new ColStatistics(colName, colType);
    colStats.setAvgColLen(avgColSize);
    colStats.setCountDistint(countDistincts);
    colStats.setNumNulls(numNulls);

    return colStats;
  }

  /**
   * Get number of rows of a give table
   * @return number of rows
   */
  public static long getNumRows(Table table) {
    return getBasicStatForTable(table, StatsSetupConst.ROW_COUNT);
  }

  /**
   * Get raw data size of a give table
   * @return raw data size
   */
  public static long getRawDataSize(Table table) {
    return getBasicStatForTable(table, StatsSetupConst.RAW_DATA_SIZE);
  }

  /**
   * Get total size of a give table
   * @return total size
   */
  public static long getTotalSize(Table table) {
    return getBasicStatForTable(table, StatsSetupConst.TOTAL_SIZE);
  }

  /**
   * Get basic stats of table
   * @param table
   *          - table
   * @param statType
   *          - type of stats
   * @return value of stats
   */
  public static long getBasicStatForTable(Table table, String statType) {
    Map params = table.getParameters();
    long result = 0;

    if (params != null) {
      try {
        result = Long.parseLong(params.get(statType));
      } catch (NumberFormatException e) {
        result = 0;
      }
    }
    return result;
  }

  /**
   * Get basic stats of partitions
   * @param table
   *          - table
   * @param parts
   *          - partitions
   * @param statType
   *          - type of stats
   * @return value of stats
   */
  public static List getBasicStatForPartitions(Table table, List parts,
      String statType) {

    List stats = Lists.newArrayList();
    for (Partition part : parts) {
      Map params = part.getParameters();
      long result = 0;
      if (params != null) {
        try {
          result = Long.parseLong(params.get(statType));
        } catch (NumberFormatException e) {
          result = 0;
        }
        stats.add(result);
      }
    }
    return stats;
  }

  /**
   * Compute raw data size from column statistics
   * @param numRows
   *          - number of rows
   * @param colStats
   *          - column statistics
   * @return raw data size
   */
  public static long getDataSizeFromColumnStats(long numRows, List colStats) {
    long result = 0;

    if (numRows <= 0) {
      return result;
    }

    for (ColStatistics cs : colStats) {
      if (cs != null) {
        String colType = cs.getColumnType();
        long nonNullCount = numRows - cs.getNumNulls();
        double sizeOf = 0;
        if (colType.equalsIgnoreCase(serdeConstants.TINYINT_TYPE_NAME)
            || colType.equalsIgnoreCase(serdeConstants.SMALLINT_TYPE_NAME)
            || colType.equalsIgnoreCase(serdeConstants.INT_TYPE_NAME)
            || colType.equalsIgnoreCase(serdeConstants.BIGINT_TYPE_NAME)
            || colType.equalsIgnoreCase(serdeConstants.BOOLEAN_TYPE_NAME)
            || colType.equalsIgnoreCase(serdeConstants.FLOAT_TYPE_NAME)
            || colType.equalsIgnoreCase(serdeConstants.DOUBLE_TYPE_NAME)) {
          sizeOf = cs.getAvgColLen();
        } else if (colType.equalsIgnoreCase(serdeConstants.STRING_TYPE_NAME)
            || colType.startsWith(serdeConstants.VARCHAR_TYPE_NAME)
            || colType.startsWith(serdeConstants.CHAR_TYPE_NAME)) {
          int acl = (int) Math.round(cs.getAvgColLen());
          sizeOf = JavaDataModel.get().lengthForStringOfLength(acl);
        } else if (colType.equalsIgnoreCase(serdeConstants.BINARY_TYPE_NAME)) {
          int acl = (int) Math.round(cs.getAvgColLen());
          sizeOf = JavaDataModel.get().lengthForByteArrayOfSize(acl);
        } else if (colType.equalsIgnoreCase(serdeConstants.TIMESTAMP_TYPE_NAME)) {
          sizeOf = JavaDataModel.get().lengthOfTimestamp();
        } else if (colType.startsWith(serdeConstants.DECIMAL_TYPE_NAME)) {
          sizeOf = JavaDataModel.get().lengthOfDecimal();
        } else if (colType.equalsIgnoreCase(serdeConstants.DATE_TYPE_NAME)) {
          sizeOf = JavaDataModel.get().lengthOfDate();
        } else {
          sizeOf = cs.getAvgColLen();
        }
        result = safeAdd(result, safeMult(nonNullCount, sizeOf));
      }
    }

    return result;
  }

  public static String getFullyQualifiedTableName(String dbName, String tabName) {
    return getFullyQualifiedName(dbName, tabName);
  }

  private static String getFullyQualifiedName(String... names) {
    List nonNullAndEmptyNames = Lists.newArrayList();
    for (String name : names) {
      if (name != null && !name.isEmpty()) {
        nonNullAndEmptyNames.add(name);
      }
    }
    return Joiner.on(".").join(nonNullAndEmptyNames);
  }

  /**
   * Get qualified column name from output key column names
   * @param keyExprs
   *          - output key names
   * @return list of qualified names
   */
  public static List getQualifedReducerKeyNames(List keyExprs) {
    List result = Lists.newArrayList();
    if (keyExprs != null) {
      for (String key : keyExprs) {
        result.add(Utilities.ReduceField.KEY.toString() + "." + key);
      }
    }
    return result;
  }

  public static long getAvailableMemory(Configuration conf) {
    int memory = HiveConf.getIntVar(conf, HiveConf.ConfVars.HIVETEZCONTAINERSIZE);
    if (memory <= 0) {
      memory = conf.getInt(MRJobConfig.MAP_MEMORY_MB, MRJobConfig.DEFAULT_MAP_MEMORY_MB);
      if (memory <= 0) {
        memory = 1024;
      }
    }
    return memory;
  }

  /**
   * negative number of rows or data sizes are invalid. It could be because of
   * long overflow in which case return Long.MAX_VALUE
   * @param val - input value
   * @return Long.MAX_VALUE if val is negative else val
   */
  public static long getMaxIfOverflow(long val) {
    return val < 0 ? Long.MAX_VALUE : val;
  }

  /** Bounded multiplication - overflows become MAX_VALUE */
  public static long safeMult(long a, double b) {
    double result = a * b;
    return (result > Long.MAX_VALUE) ? Long.MAX_VALUE : (long)result;
  }

  /** Bounded addition - overflows become MAX_VALUE */
  public static long safeAdd(long a, long b) {
    try {
      return LongMath.checkedAdd(a, b);
    } catch (ArithmeticException ex) {
      return Long.MAX_VALUE;
    }
  }

  /** Bounded multiplication - overflows become MAX_VALUE */
  public static long safeMult(long a, long b) {
    try {
      return LongMath.checkedMultiply(a, b);
    } catch (ArithmeticException ex) {
      return Long.MAX_VALUE;
    }
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy