// io.trino.plugin.hive.metastore.glue.DefaultGlueColumnStatisticsProvider - Maven / Gradle / Ivy
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.trino.plugin.hive.metastore.glue;
import com.amazonaws.services.glue.AWSGlueAsync;
import com.amazonaws.services.glue.model.ColumnStatistics;
import com.amazonaws.services.glue.model.ColumnStatisticsData;
import com.amazonaws.services.glue.model.ColumnStatisticsType;
import com.amazonaws.services.glue.model.DateColumnStatisticsData;
import com.amazonaws.services.glue.model.DecimalColumnStatisticsData;
import com.amazonaws.services.glue.model.DeleteColumnStatisticsForPartitionRequest;
import com.amazonaws.services.glue.model.DeleteColumnStatisticsForTableRequest;
import com.amazonaws.services.glue.model.DoubleColumnStatisticsData;
import com.amazonaws.services.glue.model.EntityNotFoundException;
import com.amazonaws.services.glue.model.GetColumnStatisticsForPartitionRequest;
import com.amazonaws.services.glue.model.GetColumnStatisticsForPartitionResult;
import com.amazonaws.services.glue.model.GetColumnStatisticsForTableRequest;
import com.amazonaws.services.glue.model.GetColumnStatisticsForTableResult;
import com.amazonaws.services.glue.model.LongColumnStatisticsData;
import com.amazonaws.services.glue.model.UpdateColumnStatisticsForPartitionRequest;
import com.amazonaws.services.glue.model.UpdateColumnStatisticsForTableRequest;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import io.trino.plugin.hive.HiveBasicStatistics;
import io.trino.plugin.hive.metastore.Column;
import io.trino.plugin.hive.metastore.HiveColumnStatistics;
import io.trino.plugin.hive.metastore.Partition;
import io.trino.plugin.hive.metastore.Table;
import io.trino.plugin.hive.metastore.thrift.ThriftMetastoreUtil;
import io.trino.spi.TrinoException;
import io.trino.spi.statistics.ColumnStatisticType;
import io.trino.spi.type.Type;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.Executor;
import static com.google.common.collect.ImmutableList.toImmutableList;
import static com.google.common.collect.Sets.difference;
import static io.airlift.concurrent.MoreFutures.getFutureValue;
import static io.trino.plugin.hive.HiveErrorCode.HIVE_METASTORE_ERROR;
import static io.trino.plugin.hive.HiveErrorCode.HIVE_PARTITION_NOT_FOUND;
import static io.trino.plugin.hive.metastore.glue.converter.GlueStatConverter.fromGlueColumnStatistics;
import static io.trino.plugin.hive.metastore.glue.converter.GlueStatConverter.toGlueColumnStatistics;
import static io.trino.plugin.hive.metastore.thrift.ThriftMetastoreUtil.getHiveBasicStatistics;
import static java.util.concurrent.CompletableFuture.allOf;
import static java.util.concurrent.CompletableFuture.runAsync;
import static java.util.stream.Collectors.toUnmodifiableList;
public class DefaultGlueColumnStatisticsProvider
implements GlueColumnStatisticsProvider
{
// Read limit for AWS Glue API GetColumnStatisticsForPartition
// https://docs.aws.amazon.com/glue/latest/dg/aws-glue-api-catalog-partitions.html#aws-glue-api-catalog-partitions-GetColumnStatisticsForPartition
// Also used by getTableColumnStatistics as the chunk size for table-level reads.
private static final int GLUE_COLUMN_READ_STAT_PAGE_SIZE = 100;
// Write limit for AWS Glue API UpdateColumnStatisticsForPartition
// https://docs.aws.amazon.com/glue/latest/dg/aws-glue-api-catalog-partitions.html#aws-glue-api-catalog-partitions-UpdateColumnStatisticsForPartition
private static final int GLUE_COLUMN_WRITE_STAT_PAGE_SIZE = 25;
// Asynchronous Glue client through which column statistics requests are issued.
private final AWSGlueAsync glueClient;
// Catalog ID set on Glue requests via withCatalogId.
private final String catalogId;
// Executor on which column statistics read requests are run.
private final Executor readExecutor;
// NOTE(review): presumably the executor for statistics update/delete requests;
// its use lies outside the visible part of this file - confirm.
private final Executor writeExecutor;
/**
 * Creates a provider that keeps the given Glue client, catalog ID, and executors for later use.
 * The arguments are stored as-is; no validation is performed here.
 *
 * @param glueClient asynchronous AWS Glue client used to issue requests
 * @param catalogId catalog ID set on Glue requests
 * @param readExecutor executor on which statistics read requests are run
 * @param writeExecutor executor kept in the {@code writeExecutor} field
 *        (NOTE(review): presumably for write-side requests - confirm against the rest of the class)
 */
public DefaultGlueColumnStatisticsProvider(AWSGlueAsync glueClient, String catalogId, Executor readExecutor, Executor writeExecutor)
{
    // Executors first, then the request-level settings; the assignments are independent.
    this.writeExecutor = writeExecutor;
    this.readExecutor = readExecutor;
    this.catalogId = catalogId;
    this.glueClient = glueClient;
}
/**
 * Returns the column statistic types reported as supported for the given type.
 * This is a straight delegation to {@code ThriftMetastoreUtil.getSupportedColumnStatistics}.
 *
 * @param type column type to look up
 * @return the set produced by {@code ThriftMetastoreUtil.getSupportedColumnStatistics(type)}
 */
@Override
public Set<ColumnStatisticType> getSupportedColumnStatistics(Type type)
{
    return ThriftMetastoreUtil.getSupportedColumnStatistics(type);
}
/**
 * Fetches column statistics from Glue for the columns returned by {@code getAllColumns(table)}.
 *
 * <p>The column names are split into chunks of at most {@code GLUE_COLUMN_READ_STAT_PAGE_SIZE}.
 * One {@code GetColumnStatisticsForTable} request per chunk is submitted to {@code readExecutor},
 * and the responses are then collected in submission order.
 *
 * @param table table whose column statistics are requested
 * @return immutable map from column name to converted statistics; it holds only the columns
 *         that appear in the Glue responses
 * @throws TrinoException with {@code HIVE_METASTORE_ERROR} when any step fails with a
 *         {@link RuntimeException}
 */
@Override
public Map<String, HiveColumnStatistics> getTableColumnStatistics(Table table)
{
    try {
        List<String> columnNames = getAllColumns(table);
        // Chunk the column names so that no single request exceeds the read page size.
        List<List<String>> columnChunks = Lists.partition(columnNames, GLUE_COLUMN_READ_STAT_PAGE_SIZE);
        // Submit every chunk to readExecutor before blocking on any individual result.
        List<CompletableFuture<GetColumnStatisticsForTableResult>> getStatsFutures = columnChunks.stream()
                .map(partialColumns -> CompletableFuture.supplyAsync(() -> {
                    GetColumnStatisticsForTableRequest request = new GetColumnStatisticsForTableRequest()
                            .withCatalogId(catalogId)
                            .withDatabaseName(table.getDatabaseName())
                            .withTableName(table.getTableName())
                            .withColumnNames(partialColumns);
                    return glueClient.getColumnStatisticsForTable(request);
                }, readExecutor))
                .collect(toImmutableList());

        // The table row count is handed to the converter together with each column's Glue statistics.
        HiveBasicStatistics tableStatistics = getHiveBasicStatistics(table.getParameters());
        ImmutableMap.Builder<String, HiveColumnStatistics> columnStatsMapBuilder = ImmutableMap.builder();
        for (CompletableFuture<GetColumnStatisticsForTableResult> future : getStatsFutures) {
            GetColumnStatisticsForTableResult tableColumnsStats = getFutureValue(future, TrinoException.class);
            for (ColumnStatistics columnStatistics : tableColumnsStats.getColumnStatisticsList()) {
                columnStatsMapBuilder.put(
                        columnStatistics.getColumnName(),
                        fromGlueColumnStatistics(columnStatistics.getStatisticsData(), tableStatistics.getRowCount()));
            }
        }
        return columnStatsMapBuilder.build();
    }
    catch (RuntimeException ex) {
        // Report any failure (request, conversion, or map construction) as a metastore error, keeping the cause.
        throw new TrinoException(HIVE_METASTORE_ERROR, ex);
    }
}
private Optional
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy