// io.trino.plugin.hive.metastore.MetastoreUtil (Maven / Gradle / Ivy)
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.trino.plugin.hive.metastore;
import com.google.common.base.Joiner;
import com.google.common.base.VerifyException;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableMultimap;
import com.google.common.collect.ImmutableSet;
import com.google.common.primitives.Longs;
import io.airlift.compress.Compressor;
import io.airlift.compress.zstd.ZstdCompressor;
import io.airlift.slice.Slice;
import io.trino.hive.thrift.metastore.ResourceType;
import io.trino.hive.thrift.metastore.ResourceUri;
import io.trino.plugin.hive.HiveBasicStatistics;
import io.trino.plugin.hive.HiveColumnHandle;
import io.trino.plugin.hive.PartitionOfflineException;
import io.trino.plugin.hive.TableOfflineException;
import io.trino.spi.TrinoException;
import io.trino.spi.connector.SchemaTableName;
import io.trino.spi.connector.TableNotFoundException;
import io.trino.spi.predicate.Domain;
import io.trino.spi.predicate.TupleDomain;
import io.trino.spi.type.BigintType;
import io.trino.spi.type.BooleanType;
import io.trino.spi.type.CharType;
import io.trino.spi.type.DateType;
import io.trino.spi.type.DecimalType;
import io.trino.spi.type.Decimals;
import io.trino.spi.type.DoubleType;
import io.trino.spi.type.Int128;
import io.trino.spi.type.IntegerType;
import io.trino.spi.type.RealType;
import io.trino.spi.type.SmallintType;
import io.trino.spi.type.TimestampType;
import io.trino.spi.type.TinyintType;
import io.trino.spi.type.Type;
import io.trino.spi.type.VarcharType;
import jakarta.annotation.Nullable;
import org.joda.time.format.DateTimeFormatter;
import org.joda.time.format.ISODateTimeFormat;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.OptionalLong;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Strings.isNullOrEmpty;
import static com.google.common.base.Strings.nullToEmpty;
import static com.google.common.collect.ImmutableList.toImmutableList;
import static com.google.common.hash.Hashing.sha256;
import static com.google.common.io.BaseEncoding.base64Url;
import static io.trino.hive.formats.HiveClassNames.AVRO_SERDE_CLASS;
import static io.trino.hive.thrift.metastore.hive_metastoreConstants.BUCKET_COUNT;
import static io.trino.hive.thrift.metastore.hive_metastoreConstants.BUCKET_FIELD_NAME;
import static io.trino.hive.thrift.metastore.hive_metastoreConstants.FILE_INPUT_FORMAT;
import static io.trino.hive.thrift.metastore.hive_metastoreConstants.FILE_OUTPUT_FORMAT;
import static io.trino.hive.thrift.metastore.hive_metastoreConstants.META_TABLE_COLUMNS;
import static io.trino.hive.thrift.metastore.hive_metastoreConstants.META_TABLE_COLUMN_TYPES;
import static io.trino.hive.thrift.metastore.hive_metastoreConstants.META_TABLE_LOCATION;
import static io.trino.hive.thrift.metastore.hive_metastoreConstants.META_TABLE_NAME;
import static io.trino.hive.thrift.metastore.hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS;
import static io.trino.hive.thrift.metastore.hive_metastoreConstants.META_TABLE_PARTITION_COLUMN_TYPES;
import static io.trino.plugin.hive.HiveMetadata.AVRO_SCHEMA_LITERAL_KEY;
import static io.trino.plugin.hive.HiveMetadata.AVRO_SCHEMA_URL_KEY;
import static io.trino.plugin.hive.HiveSplitManager.PRESTO_OFFLINE;
import static io.trino.plugin.hive.HiveStorageFormat.AVRO;
import static io.trino.plugin.hive.metastore.SparkMetastoreUtil.getSparkBasicStatistics;
import static io.trino.plugin.hive.util.HiveUtil.makePartName;
import static io.trino.plugin.hive.util.SerdeConstants.LIST_COLUMN_COMMENTS;
import static io.trino.plugin.hive.util.SerdeConstants.SERIALIZATION_LIB;
import static io.trino.spi.StandardErrorCode.NOT_SUPPORTED;
import static io.trino.spi.predicate.TupleDomain.withColumnDomains;
import static io.trino.spi.security.PrincipalType.USER;
import static io.trino.spi.type.Chars.padSpaces;
import static java.lang.String.format;
import static java.util.Objects.requireNonNull;
import static java.util.stream.Collectors.toList;
public final class MetastoreUtil
{
// Static utility holder; not meant to be instantiated.
private MetastoreUtil() {}

// Value emitted in a partition-value list for a key that is not pinned to a single value.
private static final String HIVE_PARTITION_VALUE_WILDCARD = "";

// Parameter keys under which the basic statistics are stored.
public static final String NUM_ROWS = "numRows";
public static final String NUM_FILES = "numFiles";
public static final String RAW_DATA_SIZE = "rawDataSize";
public static final String TOTAL_SIZE = "totalSize";

// All basic-statistics keys; updateStatisticsParameters drops these before writing fresh values.
public static final Set<String> STATS_PROPERTIES = ImmutableSet.of(NUM_FILES, NUM_ROWS, RAW_DATA_SIZE, TOTAL_SIZE);
/**
 * Builds the Hive schema property map for a table.
 * <p>
 * The table's own storage is used as the storage descriptor and no separate
 * table-level storage is supplied.
 *
 * @param table table whose storage, columns, parameters and names are read
 * @return schema properties keyed by Hive property name
 */
public static Map<String, String> getHiveSchema(Table table)
{
    // Mimics function in Hive: MetaStoreUtils.getTableMetadata(Table)
    return getHiveSchema(
            table.getStorage(),
            Optional.empty(),
            table.getDataColumns(),
            table.getParameters(),
            table.getDatabaseName(),
            table.getTableName(),
            table.getPartitionColumns());
}
/**
 * Builds the Hive schema property map for a partition of a table.
 * <p>
 * The partition's storage is the primary storage descriptor and the table's
 * storage is passed alongside it. Data columns, parameters, names and partition
 * columns are all taken from the table, not from the partition.
 *
 * @param partition partition whose storage is used
 * @param table table the partition belongs to
 * @return schema properties keyed by Hive property name
 */
public static Map<String, String> getHiveSchema(Partition partition, Table table)
{
    // Mimics function in Hive: MetaStoreUtils.getSchema(Partition, Table)
    return getHiveSchema(
            partition.getStorage(),
            Optional.of(table.getStorage()),
            table.getDataColumns(),
            table.getParameters(),
            table.getDatabaseName(),
            table.getTableName(),
            table.getPartitionColumns());
}
/**
 * Assembles the flat Hive schema property map from a storage descriptor plus table metadata.
 * <p>
 * Entries are added in this order: storage formats, qualified table name, location,
 * bucketing, serde parameters of {@code sd}, serde parameters of {@code tableSd}
 * (only when {@code sd} uses the Avro serde), serialization library, data column
 * names/types/comments, partition column names/types, and finally {@code parameters}.
 * The map is built with {@code buildKeepingLast()}, so when a key is added more than
 * once the value added last is the one that is kept.
 *
 * @param sd storage descriptor that supplies formats, location, bucketing and serde parameters
 * @param tableSd table-level storage; its serde parameters are merged in only for the Avro serde
 * @param tableDataColumns data columns, written as comma-separated names, colon-separated types
 * and NUL-separated comments
 * @param parameters extra parameters; may be {@code null}, and entries with a null value are skipped
 * @param databaseName database part of the qualified table name
 * @param tableName table part of the qualified table name
 * @param partitionKeys partition columns, written as slash-separated names and colon-separated types
 * @return the assembled schema properties
 */
private static Map<String, String> getHiveSchema(
        Storage sd,
        Optional<Storage> tableSd,
        List<Column> tableDataColumns,
        Map<String, String> parameters,
        String databaseName,
        String tableName,
        List<Column> partitionKeys)
{
    // Mimics function in Hive:
    // MetaStoreUtils.getSchema(StorageDescriptor, StorageDescriptor, Map, String, String, List)
    ImmutableMap.Builder<String, String> schema = ImmutableMap.builder();

    schema.put(FILE_INPUT_FORMAT, sd.getStorageFormat().getInputFormat());
    schema.put(FILE_OUTPUT_FORMAT, sd.getStorageFormat().getOutputFormat());
    schema.put(META_TABLE_NAME, databaseName + "." + tableName);
    schema.put(META_TABLE_LOCATION, sd.getLocation());

    if (sd.getBucketProperty().isPresent()) {
        schema.put(BUCKET_FIELD_NAME, Joiner.on(",").join(sd.getBucketProperty().get().bucketedBy()));
        schema.put(BUCKET_COUNT, Integer.toString(sd.getBucketProperty().get().bucketCount()));
    }
    else {
        schema.put(BUCKET_COUNT, "0");
    }

    // Null serde parameter values are stored as empty strings
    for (Map.Entry<String, String> param : sd.getSerdeParameters().entrySet()) {
        schema.put(param.getKey(), nullToEmpty(param.getValue()));
    }
    if (sd.getStorageFormat().getSerde().equals(AVRO_SERDE_CLASS) && tableSd.isPresent()) {
        for (Map.Entry<String, String> param : tableSd.get().getSerdeParameters().entrySet()) {
            schema.put(param.getKey(), nullToEmpty(param.getValue()));
        }
    }
    schema.put(SERIALIZATION_LIB, sd.getStorageFormat().getSerde());

    StringBuilder columnNameBuilder = new StringBuilder();
    StringBuilder columnTypeBuilder = new StringBuilder();
    StringBuilder columnCommentBuilder = new StringBuilder();
    boolean first = true;
    for (Column column : tableDataColumns) {
        if (!first) {
            columnNameBuilder.append(",");
            columnTypeBuilder.append(":");
            columnCommentBuilder.append('\0');
        }
        columnNameBuilder.append(column.getName());
        columnTypeBuilder.append(column.getType());
        columnCommentBuilder.append(column.getComment().orElse(""));
        first = false;
    }
    schema.put(META_TABLE_COLUMNS, columnNameBuilder.toString());
    schema.put(META_TABLE_COLUMN_TYPES, columnTypeBuilder.toString());
    schema.put(LIST_COLUMN_COMMENTS, columnCommentBuilder.toString());

    StringBuilder partString = new StringBuilder();
    String partStringSep = "";
    StringBuilder partTypesString = new StringBuilder();
    String partTypesStringSep = "";
    for (Column partKey : partitionKeys) {
        partString.append(partStringSep);
        partString.append(partKey.getName());
        partTypesString.append(partTypesStringSep);
        partTypesString.append(partKey.getType().getHiveTypeName().toString());
        // Separators stay empty for the first key and are switched on afterwards
        if (partStringSep.length() == 0) {
            partStringSep = "/";
            partTypesStringSep = ":";
        }
    }
    // Partition properties are only written when there is at least one partition column name
    if (partString.length() > 0) {
        schema.put(META_TABLE_PARTITION_COLUMNS, partString.toString());
        schema.put(META_TABLE_PARTITION_COLUMN_TYPES, partTypesString.toString());
    }

    if (parameters != null) {
        for (Map.Entry<String, String> entry : parameters.entrySet()) {
            // add non-null parameters to the schema
            if (entry.getValue() != null) {
                schema.put(entry.getKey(), entry.getValue());
            }
        }
    }

    return schema.buildKeepingLast();
}
/**
 * Reads the protect mode from a partition's parameters.
 *
 * @param partition partition whose parameters are inspected
 * @return the protect mode derived from those parameters
 */
public static ProtectMode getProtectMode(Partition partition)
{
    Map<String, String> partitionParameters = partition.getParameters();
    return getProtectMode(partitionParameters);
}
/**
 * Reads the protect mode from a table's parameters.
 *
 * @param table table whose parameters are inspected
 * @return the protect mode derived from those parameters
 */
public static ProtectMode getProtectMode(Table table)
{
    Map<String, String> tableParameters = table.getParameters();
    return getProtectMode(tableParameters);
}
/**
 * Tells whether a table uses the Avro serde and carries an explicit Avro schema.
 * <p>
 * A schema counts as set when either the schema URL key or the schema literal key
 * has a non-null value in the table parameters or in the storage serde parameters.
 *
 * @param table table to inspect
 * @return {@code true} only for an Avro table with a schema URL or schema literal
 */
public static boolean isAvroTableWithSchemaSet(Table table)
{
    if (!AVRO.getSerde().equals(table.getStorage().getStorageFormat().getSerDeNullable())) {
        return false;
    }
    // URL key is checked before the literal key; table parameters before serde parameters
    for (String schemaKey : List.of(AVRO_SCHEMA_URL_KEY, AVRO_SCHEMA_LITERAL_KEY)) {
        if (table.getParameters().get(schemaKey) != null) {
            return true;
        }
        if (table.getStorage().getSerdeParameters().get(schemaKey) != null) {
            return true;
        }
    }
    return false;
}
/**
 * Builds the partition name for a partition of the given table.
 *
 * @param table table supplying the partition columns
 * @param partition partition supplying the partition values
 * @return the partition name
 */
public static String makePartitionName(Table table, Partition partition)
{
    List<Column> partitionColumns = table.getPartitionColumns();
    List<String> partitionValues = partition.getValues();
    return makePartitionName(partitionColumns, partitionValues);
}
/**
 * Builds the partition name from partition columns and their values.
 *
 * @param partitionColumns partition columns; only their names are used
 * @param values partition values, one per column and in the same order
 * @return the partition name
 * @throws IllegalArgumentException if the sizes differ or any value is null
 */
public static String makePartitionName(List<Column> partitionColumns, List<String> values)
{
    return toPartitionName(partitionColumns.stream().map(Column::getName).collect(toList()), values);
}
/**
 * Builds the partition name from column names and values after validating them.
 *
 * @param names partition column names
 * @param values partition values, one per name and in the same order
 * @return the partition name produced by {@code makePartName}
 * @throws IllegalArgumentException if the two lists differ in size or any value is null
 */
public static String toPartitionName(List<String> names, List<String> values)
{
    checkArgument(names.size() == values.size(), "partition value count must match partition column count");
    checkArgument(values.stream().allMatch(Objects::nonNull), "partition value must not be null");
    return makePartName(names, values);
}
/**
 * Returns the storage location of the partition, or of the table when no partition is given.
 *
 * @param table table whose location is used when {@code partition} is empty
 * @param partition optional partition whose location takes precedence
 * @return the storage location
 */
public static String getPartitionLocation(Table table, Optional<Partition> partition)
{
    // Deliberately not Optional.map(..): a null partition location must not fall back to the table's
    if (partition.isEmpty()) {
        return table.getStorage().getLocation();
    }
    return partition.get().getStorage().getLocation();
}
/**
 * Parses the protect mode stored under {@code ProtectMode.PARAMETER_NAME}.
 * A missing parameter is parsed as the empty string.
 *
 * @param parameters table or partition parameters
 * @return the parsed protect mode
 */
private static ProtectMode getProtectMode(Map<String, String> parameters)
{
    return ProtectMode.valueOf(nullToEmpty(parameters.get(ProtectMode.PARAMETER_NAME)));
}
/**
 * Fails when a table or partition is marked offline.
 * <p>
 * Two markers are checked in order: the protect mode's offline flag, and a non-empty
 * {@code PRESTO_OFFLINE} parameter whose value is forwarded to the exception.
 *
 * @param tableName table being accessed
 * @param partitionName partition being accessed; when present a partition-level exception is thrown
 * @param protectMode protect mode of the table or partition
 * @param parameters table or partition parameters
 * @throws PartitionOfflineException if offline and {@code partitionName} is present
 * @throws TableOfflineException if offline and {@code partitionName} is empty
 */
public static void verifyOnline(SchemaTableName tableName, Optional<String> partitionName, ProtectMode protectMode, Map<String, String> parameters)
{
    if (protectMode.offline()) {
        if (partitionName.isPresent()) {
            throw new PartitionOfflineException(tableName, partitionName.get(), false, null);
        }
        throw new TableOfflineException(tableName, false, null);
    }

    String prestoOffline = parameters.get(PRESTO_OFFLINE);
    if (!isNullOrEmpty(prestoOffline)) {
        if (partitionName.isPresent()) {
            throw new PartitionOfflineException(tableName, partitionName.get(), true, prestoOffline);
        }
        throw new TableOfflineException(tableName, true, prestoOffline);
    }
}
/**
 * Checks that a column may be dropped from a table.
 *
 * @param metastore metastore used to look up the table
 * @param databaseName database of the table
 * @param tableName name of the table
 * @param columnName column requested to be dropped
 * @throws TableNotFoundException if the table does not exist
 * @throws TrinoException with {@code NOT_SUPPORTED} if the column is a partition column,
 * or if the table has at most one data column
 */
public static void verifyCanDropColumn(HiveMetastore metastore, String databaseName, String tableName, String columnName)
{
    Table table = metastore.getTable(databaseName, tableName)
            .orElseThrow(() -> new TableNotFoundException(new SchemaTableName(databaseName, tableName)));

    for (Column partitionColumn : table.getPartitionColumns()) {
        if (partitionColumn.getName().equals(columnName)) {
            throw new TrinoException(NOT_SUPPORTED, "Cannot drop partition columns");
        }
    }

    int dataColumnCount = table.getDataColumns().size();
    if (dataColumnCount < 2) {
        throw new TrinoException(NOT_SUPPORTED, "Cannot drop the only non-partition column in a table");
    }
}
/**
 * Creates the initial privileges for a newly owned table.
 * <p>
 * The owner, as a {@code USER} principal, receives SELECT, INSERT, UPDATE and DELETE
 * under the key {@code tableOwner}. Each entry is created with the boolean flag set to
 * {@code true} and the owner passed as both principal arguments. The second privilege
 * map is left empty.
 *
 * @param tableOwner name of the owning user
 * @return the initial privilege set
 */
public static PrincipalPrivileges buildInitialPrivilegeSet(String tableOwner)
{
    HivePrincipal owner = new HivePrincipal(USER, tableOwner);
    return new PrincipalPrivileges(
            // Explicit type witness: without it the builder is inferred as <Object, Object>
            ImmutableMultimap.<String, HivePrivilegeInfo>builder()
                    .put(tableOwner, new HivePrivilegeInfo(HivePrivilegeInfo.HivePrivilege.SELECT, true, owner, owner))
                    .put(tableOwner, new HivePrivilegeInfo(HivePrivilegeInfo.HivePrivilege.INSERT, true, owner, owner))
                    .put(tableOwner, new HivePrivilegeInfo(HivePrivilegeInfo.HivePrivilege.UPDATE, true, owner, owner))
                    .put(tableOwner, new HivePrivilegeInfo(HivePrivilegeInfo.HivePrivilege.DELETE, true, owner, owner))
                    .build(),
            ImmutableMultimap.of());
}
/**
* @param assumeCanonicalPartitionKeys allow conversion of non-char types (eg BIGINT, timestamp) to canonical string formats. If false, non-char types will be replaced
* with the wildcard
* @return the domain for each partition key to either the wildcard or an equals check, or empty if {@code TupleDomain.isNone()}
*/
public static Optional> partitionKeyFilterToStringList(List columnNames, TupleDomain partitionKeysFilter, boolean assumeCanonicalPartitionKeys)
{
if (partitionKeysFilter.isNone()) {
return Optional.empty();
}
Map domainMap = partitionKeysFilter.getDomains().orElseThrow(VerifyException::new);
return Optional.of(columnNames.stream()
.map(cn -> domainToString(domainMap.get(cn), assumeCanonicalPartitionKeys, HIVE_PARTITION_VALUE_WILDCARD))
.collect(toImmutableList()));
}
/**
 * Renders a column domain as a partition value string.
 *
 * @param domain domain expression for the column; {@code null} is treated like an unconstrained column
 * @param assumeCanonicalPartitionKeys forwarded to the scalar conversion
 * @param partitionWildcardString string returned when the domain is not a nullable single value
 * @return the string form of the single value, or the wildcard
 */
private static String domainToString(Domain domain, boolean assumeCanonicalPartitionKeys, String partitionWildcardString)
{
    boolean pinnedToSingleValue = domain != null && domain.isNullableSingleValue();
    if (!pinnedToSingleValue) {
        return partitionWildcardString;
    }
    return sqlScalarToStringForParts(
            domain.getType(),
            domain.getNullableSingleValue(),
            assumeCanonicalPartitionKeys,
            partitionWildcardString);
}
/**
 * Tells whether a value of the given type can be rendered as a partition value string.
 * <p>
 * Timestamps are never convertible. Char and varchar always are. Every other type is
 * convertible only when {@code assumeCanonicalPartitionKeys} is set.
 *
 * @param type SQL type of the partition key
 * @param assumeCanonicalPartitionKeys whether non-character types may be converted
 * @return {@code true} if the conversion is allowed
 */
public static boolean canConvertSqlTypeToStringForParts(Type type, boolean assumeCanonicalPartitionKeys)
{
    if (type instanceof TimestampType) {
        return false;
    }
    if (assumeCanonicalPartitionKeys) {
        return true;
    }
    return type instanceof CharType || type instanceof VarcharType;
}
/**
 * Renders a partition key value as a string, or returns the wildcard when the type is not convertible.
 * <p>
 * A {@code null} value of a convertible type is rendered as this class's own wildcard
 * constant (the empty string), not as {@code partitionWildcardString}.
 *
 * @param type SQL type of the value
 * @param value value to render; may be {@code null}
 * @param assumeCanonicalPartitionKeys whether non-character types may be converted
 * @param partitionWildcardString string returned when the type cannot be converted
 * @return canonical string representation of the value, or {@code partitionWildcardString}
 */
public static String sqlScalarToStringForParts(Type type, Object value, boolean assumeCanonicalPartitionKeys, String partitionWildcardString)
{
    boolean convertible = canConvertSqlTypeToStringForParts(type, assumeCanonicalPartitionKeys);
    return convertible
            ? sqlScalarToString(type, value, HIVE_PARTITION_VALUE_WILDCARD)
            : partitionWildcardString;
}
/**
 * Renders a scalar value as a string according to its SQL type.
 *
 * @param type SQL type of the value
 * @param value value to render; char/varchar values are cast to {@code Slice}, long decimals
 * to {@code Int128}, short decimals and dates to {@code long}
 * @param nullString string returned when {@code value} is {@code null}
 * @return canonical string representation of a given value according to its type.
 * @throws TrinoException if the type is not supported
 */
public static String sqlScalarToString(Type type, Object value, String nullString)
{
    if (value == null) {
        return nullString;
    }

    if (type instanceof CharType charType) {
        return padSpaces((Slice) value, charType).toStringUtf8();
    }
    if (type instanceof VarcharType) {
        return ((Slice) value).toStringUtf8();
    }
    if (type instanceof DecimalType decimalType) {
        int scale = decimalType.getScale();
        if (decimalType.isShort()) {
            return Decimals.toString((long) value, scale);
        }
        return Decimals.toString((Int128) value, scale);
    }
    if (type instanceof DateType) {
        // The value is a day count; it is converted to milliseconds and printed as an ISO date in UTC
        long millis = TimeUnit.DAYS.toMillis((long) value);
        DateTimeFormatter isoDate = ISODateTimeFormat.date().withZoneUTC();
        return isoDate.print(millis);
    }
    if (type instanceof TimestampType) {
        // we throw on this type as we don't have timezone. Callers should not ask for this conversion type, but document for possible future work (?)
        throw new TrinoException(NOT_SUPPORTED, "TimestampType conversion to scalar expressions is not supported");
    }

    boolean rendersViaToString = type instanceof TinyintType
            || type instanceof SmallintType
            || type instanceof IntegerType
            || type instanceof BigintType
            || type instanceof DoubleType
            || type instanceof RealType
            || type instanceof BooleanType;
    if (!rendersViaToString) {
        throw new TrinoException(NOT_SUPPORTED, format("Unsupported partition key type: %s", type.getDisplayName()));
    }
    return value.toString();
}
/**
 * Restricts a predicate to the given partition keys and re-keys it by column name.
 * <p>
 * Partition keys that have no domain in {@code effectivePredicate} are left out of the
 * result. The result follows the order of {@code partitionKeys}.
 *
 * @param partitionKeys partition key columns to keep
 * @param effectivePredicate predicate over column handles; its domains must be present
 * @return filtered version of relevant Domains in effectivePredicate.
 * @throws IllegalArgumentException if {@code effectivePredicate} has no domains
 */
public static TupleDomain<String> computePartitionKeyFilter(List<HiveColumnHandle> partitionKeys, TupleDomain<HiveColumnHandle> effectivePredicate)
{
    checkArgument(effectivePredicate.getDomains().isPresent());

    // Resolved once up front instead of on every loop iteration
    Map<HiveColumnHandle, Domain> predicateDomains = effectivePredicate.getDomains().get();

    Map<String, Domain> domains = new LinkedHashMap<>();
    for (HiveColumnHandle partitionKey : partitionKeys) {
        String name = partitionKey.getName();
        Domain domain = predicateDomains.get(partitionKey);
        if (domain != null) {
            domains.put(name, domain);
        }
    }

    return withColumnDomains(domains);
}
/**
 * Derives the name under which a function is stored in the metastore.
 * <p>
 * The result has the form {@code trino__<functionName>__<hash>}, where the hash is the
 * string form of the SHA-256 hash of the signature token's characters.
 *
 * @param functionName name of the function
 * @param signatureToken token identifying the function signature
 * @return the metastore function name
 */
public static String metastoreFunctionName(String functionName, String signatureToken)
{
    String signatureHash = sha256().hashUnencodedChars(signatureToken).toString();
    return "trino__" + functionName + "__" + signatureHash;
}
/**
 * Compresses a byte array with Zstandard and splits it into base64url-encoded resource URIs.
 * <p>
 * The compressed bytes are cut into consecutive chunks of at most 750 bytes. Each chunk
 * becomes one {@code ResourceUri} of type {@code FILE}, in order. Empty compressed output
 * yields an empty list.
 *
 * @param input raw bytes to store
 * @return resource URIs holding the encoded chunks
 */
public static List<ResourceUri> toResourceUris(byte[] input)
{
    // 750 bytes encode to exactly 1000 base64 characters
    // NOTE(review): presumably chosen to fit a metastore limit on URI length; confirm
    final int chunkSize = 750;

    Compressor compressor = new ZstdCompressor();
    byte[] compressed = new byte[compressor.maxCompressedLength(input.length)];
    int outputSize = compressor.compress(input, 0, input.length, compressed, 0, compressed.length);

    ImmutableList.Builder<ResourceUri> resourceUris = ImmutableList.builder();
    for (int offset = 0; offset < outputSize; offset += chunkSize) {
        int length = Math.min(chunkSize, outputSize - offset);
        String encoded = base64Url().encode(compressed, offset, length);
        resourceUris.add(new ResourceUri(ResourceType.FILE, encoded));
    }
    return resourceUris.build();
}
/**
 * Returns parameters with the {@code numRows} value shifted by the given amount.
 * <p>
 * When the parameters hold no row count, the same map is returned untouched.
 * Otherwise an immutable copy carrying the new row count is returned.
 *
 * @param parameters table or partition parameters
 * @param description description of the adjusted object, used in error messages
 * @param rowCountAdjustment amount added to the existing row count; may be negative
 * @return the original map, or an immutable copy with the adjusted row count
 * @throws NullPointerException if the existing row count cannot be parsed as a long
 * @throws IllegalArgumentException if the adjusted row count would be negative
 */
public static Map<String, String> adjustRowCount(Map<String, String> parameters, String description, long rowCountAdjustment)
{
    String existingRowCount = parameters.get(NUM_ROWS);
    if (existingRowCount == null) {
        return parameters;
    }
    // tryParse yields null for unparseable input, which requireNonNull reports below
    Long count = Longs.tryParse(existingRowCount);
    requireNonNull(count, format("For %s, the existing row count (%s) is not a digit string", description, existingRowCount));
    long newRowCount = count + rowCountAdjustment;
    checkArgument(newRowCount >= 0, "For %s, the subtracted row count (%s) is less than zero, existing count %s, rows deleted %s", description, newRowCount, existingRowCount, rowCountAdjustment);
    Map<String, String> copiedParameters = new HashMap<>(parameters);
    copiedParameters.put(NUM_ROWS, String.valueOf(newRowCount));
    return ImmutableMap.copyOf(copiedParameters);
}
/**
 * Reads the basic statistics stored in table or partition parameters.
 * <p>
 * Each statistic is empty when its parameter is missing, unparseable or negative.
 *
 * @param parameters table or partition parameters
 * @return file count, row count, raw data size and total size read from the parameters
 */
public static HiveBasicStatistics getHiveBasicStatistics(Map<String, String> parameters)
{
    OptionalLong numFiles = toLong(parameters.get(NUM_FILES));
    OptionalLong numRows = toLong(parameters.get(NUM_ROWS));
    OptionalLong inMemoryDataSizeInBytes = toLong(parameters.get(RAW_DATA_SIZE));
    OptionalLong onDiskDataSizeInBytes = toLong(parameters.get(TOTAL_SIZE));
    return new HiveBasicStatistics(numFiles, numRows, inMemoryDataSizeInBytes, onDiskDataSizeInBytes);
}
/**
 * Reads basic statistics, preferring Hive's values and falling back to Spark's.
 * <p>
 * The Spark statistics are used only when Hive's row count is absent or zero and
 * the Spark statistics carry a row count.
 *
 * @param parameters table or partition parameters
 * @return Hive's basic statistics, or Spark's when the fallback applies
 */
public static HiveBasicStatistics getBasicStatisticsWithSparkFallback(Map<String, String> parameters)
{
    HiveBasicStatistics basicStatistics = getHiveBasicStatistics(parameters);
    // Partitioned table without statistics
    if (basicStatistics.getRowCount().isEmpty() || basicStatistics.getRowCount().getAsLong() == 0L) {
        HiveBasicStatistics sparkBasicStatistics = getSparkBasicStatistics(parameters);
        if (sparkBasicStatistics.getRowCount().isPresent()) {
            return sparkBasicStatistics;
        }
    }

    return basicStatistics;
}
/**
 * Returns a copy of the parameters with the basic-statistics entries replaced.
 * <p>
 * Every key in {@code STATS_PROPERTIES} is dropped from the input, then written back only
 * for the statistics that are present. A statistic that is absent therefore ends up with
 * no entry at all.
 *
 * @param parameters existing table or partition parameters
 * @param statistics statistics to store
 * @return an immutable map holding the non-statistics parameters plus the new statistics
 */
public static Map<String, String> updateStatisticsParameters(Map<String, String> parameters, HiveBasicStatistics statistics)
{
    ImmutableMap.Builder<String, String> result = ImmutableMap.builder();

    parameters.forEach((key, value) -> {
        if (!STATS_PROPERTIES.contains(key)) {
            result.put(key, value);
        }
    });

    statistics.getFileCount().ifPresent(count -> result.put(NUM_FILES, Long.toString(count)));
    statistics.getRowCount().ifPresent(count -> result.put(NUM_ROWS, Long.toString(count)));
    statistics.getInMemoryDataSizeInBytes().ifPresent(size -> result.put(RAW_DATA_SIZE, Long.toString(size)));
    statistics.getOnDiskDataSizeInBytes().ifPresent(size -> result.put(TOTAL_SIZE, Long.toString(size)));

    // CDH 5.16 metastore ignores stats unless STATS_GENERATED_VIA_STATS_TASK is set
    // https://github.com/cloudera/hive/blob/cdh5.16.2-release/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java#L227-L231
    // An existing value was already copied by the loop above, so it is only added when missing
    if (!parameters.containsKey("STATS_GENERATED_VIA_STATS_TASK")) {
        result.put("STATS_GENERATED_VIA_STATS_TASK", "workaround for potential lack of HIVE-12730");
    }

    return result.buildOrThrow();
}
/**
 * Parses a statistics parameter into a non-negative long.
 *
 * @param parameterValue raw parameter value; may be {@code null}
 * @return the parsed value, or empty when the input is null, unparseable or negative
 */
private static OptionalLong toLong(@Nullable String parameterValue)
{
    Long parsed = (parameterValue == null) ? null : Longs.tryParse(parameterValue);
    if (parsed != null && parsed >= 0) {
        return OptionalLong.of(parsed);
    }
    return OptionalLong.empty();
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy