io.trino.plugin.hive.metastore.thrift.ThriftSparkMetastoreUtil
This is a Databricks build of Trino's Hive plugin that includes support for HTTP-based transport
for its Hive metastore Thrift interface.
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.trino.plugin.hive.metastore.thrift;

import com.google.common.annotations.VisibleForTesting;
import io.trino.hive.thrift.metastore.FieldSchema;
import io.trino.hive.thrift.metastore.Table;
import io.trino.plugin.hive.HiveBasicStatistics;
import io.trino.plugin.hive.HiveType;
import io.trino.plugin.hive.PartitionStatistics;
import io.trino.plugin.hive.metastore.HiveColumnStatistics;
import io.trino.plugin.hive.type.PrimitiveTypeInfo;
import io.trino.plugin.hive.type.TypeInfo;
import java.util.AbstractMap;
import java.util.Map;
import java.util.OptionalDouble;
import java.util.OptionalLong;

import static com.google.common.collect.ImmutableMap.toImmutableMap;
import static io.trino.plugin.hive.metastore.HiveColumnStatistics.createBinaryColumnStatistics;
import static io.trino.plugin.hive.metastore.HiveColumnStatistics.createBooleanColumnStatistics;
import static io.trino.plugin.hive.metastore.HiveColumnStatistics.createDateColumnStatistics;
import static io.trino.plugin.hive.metastore.HiveColumnStatistics.createDecimalColumnStatistics;
import static io.trino.plugin.hive.metastore.HiveColumnStatistics.createDoubleColumnStatistics;
import static io.trino.plugin.hive.metastore.HiveColumnStatistics.createIntegerColumnStatistics;
import static io.trino.plugin.hive.metastore.HiveColumnStatistics.createStringColumnStatistics;
import static io.trino.plugin.hive.metastore.thrift.ThriftMetastoreParameterParserUtils.toDate;
import static io.trino.plugin.hive.metastore.thrift.ThriftMetastoreParameterParserUtils.toDecimal;
import static io.trino.plugin.hive.metastore.thrift.ThriftMetastoreParameterParserUtils.toDouble;
import static io.trino.plugin.hive.metastore.thrift.ThriftMetastoreParameterParserUtils.toLong;
import static io.trino.plugin.hive.metastore.thrift.ThriftMetastoreUtil.NUM_ROWS;
import static io.trino.plugin.hive.metastore.thrift.ThriftMetastoreUtil.getTotalSizeInBytes;
import static io.trino.plugin.hive.type.Category.PRIMITIVE;

final class ThriftSparkMetastoreUtil
{
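    // Spark persists its statistics in the metastore table parameters, keyed by the
    // "spark.sql.statistics." prefix for table-level statistics and by
    // "spark.sql.statistics.colStats.<columnName>." for per-column statistics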
private static final String SPARK_SQL_STATS_PREFIX = "spark.sql.statistics.";
private static final String COLUMN_STATS_PREFIX = SPARK_SQL_STATS_PREFIX + "colStats.";
private static final String NUM_FILES = "numFiles";
private static final String RAW_DATA_SIZE = "rawDataSize";
private static final String TOTAL_SIZE = "totalSize";
private static final String COLUMN_MIN = "min";
private static final String COLUMN_MAX = "max";

    private ThriftSparkMetastoreUtil() {}
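
    /**
     * Derives Trino table statistics from the Spark-written table parameters. Returns
     * {@link PartitionStatistics#empty()} when no Spark row count is present, because the
     * per-column statistics are normalized against the row count.
     */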
public static PartitionStatistics getTableStatistics(Table table)
{
        Map<String, String> parameters = table.getParameters();
HiveBasicStatistics sparkBasicStatistics = getSparkBasicStatistics(parameters);
if (sparkBasicStatistics.getRowCount().isEmpty()) {
return PartitionStatistics.empty();
}
        Map<String, HiveColumnStatistics> columnStatistics = table.getSd().getCols().stream()
.map(fieldSchema -> new AbstractMap.SimpleEntry<>(
fieldSchema.getName(),
fromMetastoreColumnStatistics(fieldSchema, parameters, sparkBasicStatistics.getRowCount().getAsLong())))
.collect(toImmutableMap(Map.Entry::getKey, Map.Entry::getValue));
return new PartitionStatistics(sparkBasicStatistics, columnStatistics);
}
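
    /**
     * Reads Spark's basic table statistics (file count, row count, raw data size and
     * total size) from the {@code spark.sql.statistics.*} table parameters. The row count
     * is mandatory; without it, empty statistics are returned.
     */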
    public static HiveBasicStatistics getSparkBasicStatistics(Map<String, String> parameters)
{
OptionalLong rowCount = toLong(parameters.get(SPARK_SQL_STATS_PREFIX + NUM_ROWS));
if (rowCount.isEmpty()) {
return HiveBasicStatistics.createEmptyStatistics();
}
OptionalLong fileCount = toLong(parameters.get(SPARK_SQL_STATS_PREFIX + NUM_FILES));
OptionalLong inMemoryDataSizeInBytes = toLong(parameters.get(SPARK_SQL_STATS_PREFIX + RAW_DATA_SIZE));
OptionalLong onDiskDataSizeInBytes = toLong(parameters.get(SPARK_SQL_STATS_PREFIX + TOTAL_SIZE));
return new HiveBasicStatistics(fileCount, rowCount, inMemoryDataSizeInBytes, onDiskDataSizeInBytes);
}
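
    /**
     * Maps the Spark-written {@code spark.sql.statistics.colStats.<columnName>.*}
     * parameters (min, max, nullCount, distinctCount, avgLen, maxLen) to Trino column
     * statistics. Non-primitive columns yield empty statistics, since Spark does not
     * collect statistics for them.
     */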
@VisibleForTesting
    static HiveColumnStatistics fromMetastoreColumnStatistics(FieldSchema fieldSchema, Map<String, String> columnStatistics, long rowCount)
{
HiveType type = HiveType.valueOf(fieldSchema.getType());
TypeInfo typeInfo = type.getTypeInfo();
if (typeInfo.getCategory() != PRIMITIVE) {
// Spark does not support table statistics for non-primitive types
return HiveColumnStatistics.empty();
}
String field = COLUMN_STATS_PREFIX + fieldSchema.getName() + ".";
OptionalLong maxLength = toLong(columnStatistics.get(field + "maxLen"));
OptionalDouble avgLength = toDouble(columnStatistics.get(field + "avgLen"));
OptionalLong nullsCount = toLong(columnStatistics.get(field + "nullCount"));
OptionalLong distinctValuesCount = toLong(columnStatistics.get(field + "distinctCount"));
return switch (((PrimitiveTypeInfo) typeInfo).getPrimitiveCategory()) {
case BOOLEAN -> createBooleanColumnStatistics(
OptionalLong.empty(),
OptionalLong.empty(),
nullsCount);
case BYTE, SHORT, INT, LONG -> createIntegerColumnStatistics(
toLong(columnStatistics.get(field + COLUMN_MIN)),
toLong(columnStatistics.get(field + COLUMN_MAX)),
nullsCount,
fromMetastoreDistinctValuesCount(distinctValuesCount, nullsCount, rowCount));
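            // Spark's serialized timestamp min/max values are not parsed here; only the
            // null and distinct counts are mapped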
case TIMESTAMP -> createIntegerColumnStatistics(
OptionalLong.empty(),
OptionalLong.empty(),
nullsCount,
fromMetastoreDistinctValuesCount(distinctValuesCount, nullsCount, rowCount));
case FLOAT, DOUBLE -> createDoubleColumnStatistics(
toDouble(columnStatistics.get(field + COLUMN_MIN)),
toDouble(columnStatistics.get(field + COLUMN_MAX)),
nullsCount,
fromMetastoreDistinctValuesCount(distinctValuesCount, nullsCount, rowCount));
case STRING, VARCHAR, CHAR -> createStringColumnStatistics(
maxLength,
getTotalSizeInBytes(avgLength, OptionalLong.of(rowCount), nullsCount),
nullsCount,
fromMetastoreDistinctValuesCount(distinctValuesCount, nullsCount, rowCount));
case DATE -> createDateColumnStatistics(
toDate(columnStatistics.get(field + COLUMN_MIN)),
toDate(columnStatistics.get(field + COLUMN_MAX)),
nullsCount,
fromMetastoreDistinctValuesCount(distinctValuesCount, nullsCount, rowCount));
case BINARY -> createBinaryColumnStatistics(
maxLength,
getTotalSizeInBytes(avgLength, OptionalLong.of(rowCount), nullsCount),
nullsCount);
case DECIMAL -> createDecimalColumnStatistics(
toDecimal(columnStatistics.get(field + COLUMN_MIN)),
toDecimal(columnStatistics.get(field + COLUMN_MAX)),
nullsCount,
fromMetastoreDistinctValuesCount(distinctValuesCount, nullsCount, rowCount));
case TIMESTAMPLOCALTZ, INTERVAL_YEAR_MONTH, INTERVAL_DAY_TIME, VOID, UNKNOWN -> HiveColumnStatistics.empty();
};
}

    /**
* Hive calculates NDV considering null as a distinct value, but Spark doesn't
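     * <p>
     * For example, with rowCount = 10 and nullsCount = 2 there are 8 non-null rows, so a
     * stored distinctValuesCount of 9 is capped to min(9, 10 - 2) = 8; a distinct count
     * of 0 with non-null rows present is raised to 1.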
*/
private static OptionalLong fromMetastoreDistinctValuesCount(OptionalLong distinctValuesCount, OptionalLong nullsCount, long rowCount)
{
if (distinctValuesCount.isPresent() && nullsCount.isPresent()) {
return OptionalLong.of(fromMetastoreDistinctValuesCount(distinctValuesCount.getAsLong(), nullsCount.getAsLong(), rowCount));
}
return OptionalLong.empty();
}

    private static long fromMetastoreDistinctValuesCount(long distinctValuesCount, long nullsCount, long rowCount)
{
long nonNullsCount = rowCount - nullsCount;
// normalize distinctValuesCount in case there is a non-null element
if (nonNullsCount > 0 && distinctValuesCount == 0) {
distinctValuesCount = 1;
}
// the metastore may store an estimate, so the value stored may be higher than the total number of rows
return Math.min(distinctValuesCount, nonNullsCount);
}
}
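
// Illustrative usage (a hypothetical caller; the class is package-private, so it would
// have to live in io.trino.plugin.hive.metastore.thrift):
//
//     Table table = ...; // thrift table whose parameters were written by Spark
//     PartitionStatistics stats = ThriftSparkMetastoreUtil.getTableStatistics(table);
//     OptionalLong rowCount = stats.getBasicStatistics().getRowCount();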