org.apache.flink.table.api.internal.AnalyzeTableUtil
This module contains the Table/SQL API for writing table programs
within the table ecosystem using the Java programming language.
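The utility below implements Flink's ANALYZE TABLE statement, which a table program reaches through TableEnvironment.executeSql. As a minimal usage sketch (the table name MyTable is hypothetical, and ANALYZE TABLE is only supported in batch mode):

import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.TableEnvironment;

public class AnalyzeTableExample {
    public static void main(String[] args) {
        TableEnvironment tableEnv =
                TableEnvironment.create(EnvironmentSettings.inBatchMode());
        // Collect only table-level statistics (row count).
        tableEnv.executeSql("ANALYZE TABLE MyTable COMPUTE STATISTICS");
        // Additionally collect column-level statistics for every column;
        // single partitions can be targeted with PARTITION(...).
        tableEnv.executeSql("ANALYZE TABLE MyTable COMPUTE STATISTICS FOR ALL COLUMNS");
    }
}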
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.table.api.internal;
import org.apache.flink.annotation.Internal;
import org.apache.flink.table.api.TableException;
import org.apache.flink.table.api.TableResult;
import org.apache.flink.table.catalog.Catalog;
import org.apache.flink.table.catalog.CatalogPartitionSpec;
import org.apache.flink.table.catalog.Column;
import org.apache.flink.table.catalog.ObjectIdentifier;
import org.apache.flink.table.catalog.ObjectPath;
import org.apache.flink.table.catalog.exceptions.PartitionNotExistException;
import org.apache.flink.table.catalog.exceptions.TableNotExistException;
import org.apache.flink.table.catalog.exceptions.TablePartitionedException;
import org.apache.flink.table.catalog.stats.CatalogColumnStatistics;
import org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataBase;
import org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataBinary;
import org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataBoolean;
import org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataDate;
import org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataDouble;
import org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataLong;
import org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataString;
import org.apache.flink.table.catalog.stats.CatalogTableStatistics;
import org.apache.flink.table.catalog.stats.Date;
import org.apache.flink.table.operations.ddl.AnalyzeTableOperation;
import org.apache.flink.types.Row;
import org.apache.flink.util.CollectionUtil;
import org.apache.flink.util.Preconditions;
import javax.annotation.Nullable;
import java.math.BigDecimal;
import java.time.Instant;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.LocalTime;
import java.time.ZoneOffset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
/** Utility for executing ANALYZE TABLE statements. */
@Internal
public class AnalyzeTableUtil {
private AnalyzeTableUtil() {}
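    /**
     * Executes the given ANALYZE TABLE operation: generates and runs a statistics-gathering
     * query over the table (or over every targeted partition in a single UNION ALL query) and
     * writes the collected statistics back to the catalog, merging new column statistics into
     * the existing ones.
     */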
public static TableResultInternal analyzeTable(
TableEnvironmentImpl tableEnv, AnalyzeTableOperation operation)
throws TableNotExistException, PartitionNotExistException, TablePartitionedException {
        List<Column> columns = operation.getColumns();
        // the TableIdentifier has been validated before
Catalog catalog =
tableEnv.getCatalogManager()
.getCatalog(operation.getTableIdentifier().getCatalogName())
.orElseThrow(() -> new TableException("This should not happen."));
ObjectPath objectPath = operation.getTableIdentifier().toObjectPath();
if (operation.getPartitionSpecs().isPresent()) {
            List<CatalogPartitionSpec> targetPartitions = operation.getPartitionSpecs().get();
if (targetPartitions.isEmpty()) {
return TableResultImpl.TABLE_RESULT_OK;
}
String statSql =
generateAnalyzeSqlForMultiParts(
operation.getTableIdentifier(), targetPartitions, columns);
int partitionCount = targetPartitions.size();
            Map<Integer, StatisticsWrapper> results =
executeSqlAndGenerateStatisticsForMultiParts(
tableEnv, columns, statSql, partitionCount);
for (int i = 0; i < partitionCount; ++i) {
StatisticsWrapper result = results.get(i);
CatalogPartitionSpec partitionSpec = targetPartitions.get(i);
catalog.alterPartitionStatistics(
objectPath, partitionSpec, result.tableStat, false);
CatalogColumnStatistics newColumnStat = result.columnStat;
if (newColumnStat != null) {
CatalogColumnStatistics oldColumnStat =
catalog.getPartitionColumnStatistics(objectPath, partitionSpec);
// merge stats
CatalogColumnStatistics mergedColumnStatistics =
mergeColumnStatistics(oldColumnStat, newColumnStat);
catalog.alterPartitionColumnStatistics(
objectPath, partitionSpec, mergedColumnStatistics, false);
}
}
} else {
String statSql = generateAnalyzeSql(operation.getTableIdentifier(), null, columns, -1);
StatisticsWrapper result = executeSqlAndGenerateStatistics(tableEnv, columns, statSql);
catalog.alterTableStatistics(objectPath, result.tableStat, false);
CatalogColumnStatistics newColumnStat = result.columnStat;
if (newColumnStat != null) {
CatalogColumnStatistics oldColumnStat =
catalog.getTableColumnStatistics(objectPath);
// merge stats.
CatalogColumnStatistics mergedColumnStatistics =
mergeColumnStatistics(oldColumnStat, newColumnStat);
catalog.alterTableColumnStatistics(objectPath, mergedColumnStatistics, false);
}
}
return TableResultImpl.TABLE_RESULT_OK;
}
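    /**
     * Merges old and new column statistics: statistics for re-analyzed columns are overwritten,
     * while statistics for columns not covered by the new result are retained.
     */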
private static CatalogColumnStatistics mergeColumnStatistics(
CatalogColumnStatistics oldColumnStatistics,
CatalogColumnStatistics newColumnStatistics) {
CatalogColumnStatistics columnStatistics = oldColumnStatistics.copy();
columnStatistics
.getColumnStatisticsData()
.putAll(newColumnStatistics.getColumnStatisticsData());
return columnStatistics;
}
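    /**
     * Runs the generated statistics query, which must return exactly one row, and converts that
     * row into table statistics and (if columns were analyzed) column statistics.
     */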
private static StatisticsWrapper executeSqlAndGenerateStatistics(
            TableEnvironmentImpl tableEnv, List<Column> columns, String statSql) {
        TableResult tableResult = tableEnv.executeSql(statSql);
        List<Row> result = CollectionUtil.iteratorToList(tableResult.collect());
Preconditions.checkArgument(result.size() == 1);
Row row = result.get(0);
CatalogTableStatistics tableStat = convertToTableStatistics(row);
CatalogColumnStatistics columnStat = null;
if (!columns.isEmpty()) {
columnStat = convertToColumnStatistics(row, columns);
}
return new StatisticsWrapper(tableStat, columnStat);
}
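    /**
     * Runs the UNION ALL statistics query for all targeted partitions and groups the resulting
     * rows by their partition index, which the generated SQL emits as an extra column.
     */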
    private static Map<Integer, StatisticsWrapper> executeSqlAndGenerateStatisticsForMultiParts(
            TableEnvironmentImpl tableEnv,
            List<Column> columns,
            String statSql,
            int partitionCount) {
        TableResult tableResult = tableEnv.executeSql(statSql);
        List<Row> result = CollectionUtil.iteratorToList(tableResult.collect());
        Preconditions.checkArgument(result.size() == partitionCount);
        Map<Integer, StatisticsWrapper> map = new HashMap<>();
for (Row row : result) {
CatalogTableStatistics tableStat = convertToTableStatistics(row);
CatalogColumnStatistics columnStat = null;
if (!columns.isEmpty()) {
columnStat = convertToColumnStatistics(row, columns);
}
int index = row.getFieldAs(getPartitionIdxColumn());
map.put(index, new StatisticsWrapper(tableStat, columnStat));
}
return map;
}
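    /**
     * Generates one statistics query per targeted partition and concatenates them with
     * UNION ALL, so that statistics for all partitions are computed by a single job.
     */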
private static String generateAnalyzeSqlForMultiParts(
ObjectIdentifier tableIdentifier,
            List<CatalogPartitionSpec> partitionSpecs,
            List<Column> columns) {
        List<String> sqlList = new ArrayList<>();
for (int i = 0; i < partitionSpecs.size(); ++i) {
sqlList.add(generateAnalyzeSql(tableIdentifier, partitionSpecs.get(i), columns, i));
}
return String.join("\n UNION ALL \n", sqlList);
}
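    /**
     * Generates the statistics query for a single table or partition. For a table {@code t} with
     * an INT column {@code a} (hypothetical names), the generated SQL has the shape:
     *
     * <pre>
     * SELECT COUNT(1) AS rowCount,
     *        (COUNT(1) - COUNT(`a`)) AS a_nullCount,
     *        APPROX_COUNT_DISTINCT(`a`) AS a_ndv,
     *        MAX(`a`) AS a_max,
     *        MIN(`a`) AS a_min
     * FROM t
     * </pre>
     *
     * <p>A partition index column and a partition WHERE filter are appended when a single
     * partition is analyzed.
     */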
private static String generateAnalyzeSql(
ObjectIdentifier tableIdentifier,
@Nullable CatalogPartitionSpec partitionSpec,
            List<Column> columns,
int index) {
String partitionFilter;
if (partitionSpec != null) {
partitionFilter =
" WHERE "
+ partitionSpec.getPartitionSpec().entrySet().stream()
.map(e -> e.getKey() + "=" + e.getValue())
.collect(Collectors.joining(" AND "));
} else {
partitionFilter = "";
}
final String columnStatsSelects;
if (columns.isEmpty()) {
columnStatsSelects = "";
} else {
columnStatsSelects = ", " + getColumnStatsSelects(columns);
}
return String.format(
"SELECT COUNT(1) AS %s %s %s FROM %s %s",
getRowCountColumn(),
columnStatsSelects,
index >= 0 ? String.format(", %s as %s", index, getPartitionIdxColumn()) : "",
tableIdentifier,
partitionFilter);
}
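    /**
     * Builds the per-column aggregate expressions: a null count for every column, plus
     * type-dependent aggregates such as true/false counts for BOOLEAN, ndv/max/min for numeric
     * and time-like types, and ndv/average length/max length for CHAR and VARCHAR.
     */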
    private static String getColumnStatsSelects(List<Column> columns) {
return columns.stream()
.flatMap(
f -> {
String c = f.getName();
                            List<String> columnStatSelect = new ArrayList<>();
String computeNullCount =
String.format(
"(COUNT(1) - COUNT(`%s`)) AS %s",
c, getNullCountColumn(c));
columnStatSelect.add(computeNullCount);
String computeNdv =
String.format(
"APPROX_COUNT_DISTINCT(`%s`) AS %s",
c, getNdvColumn(c));
switch (f.getDataType().getLogicalType().getTypeRoot()) {
case BOOLEAN:
columnStatSelect.add(
String.format(
"COUNT(`%s`) FILTER (WHERE `%s` IS TRUE) AS %s",
c, c, getTrueCountColumn(c)));
columnStatSelect.add(
String.format(
"COUNT(`%s`) FILTER (WHERE `%s` IS FALSE) AS %s",
c, c, getFalseCountColumn(c)));
break;
case TINYINT:
case SMALLINT:
case INTEGER:
case FLOAT:
case DATE:
case TIME_WITHOUT_TIME_ZONE:
case BIGINT:
case DOUBLE:
case DECIMAL:
case TIMESTAMP_WITH_LOCAL_TIME_ZONE:
case TIMESTAMP_WITHOUT_TIME_ZONE:
columnStatSelect.add(computeNdv);
columnStatSelect.add(
String.format("MAX(`%s`) AS %s", c, getMaxColumn(c)));
columnStatSelect.add(
String.format("MIN(`%s`) AS %s", c, getMinColumn(c)));
break;
case CHAR:
case VARCHAR:
columnStatSelect.add(computeNdv);
columnStatSelect.add(
String.format(
"AVG(CAST(CHAR_LENGTH(`%s`) AS DOUBLE)) AS %s",
c, getAvgLenColumn(c)));
columnStatSelect.add(
String.format(
"MAX(CAST(CHAR_LENGTH(`%s`) AS BIGINT)) AS %s",
c, getMaxLenColumn(c)));
break;
default:
break;
}
return columnStatSelect.stream();
})
.collect(Collectors.joining(", "));
}
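    /**
     * Converts the row-count field of the result row into table statistics; file count, total
     * size, and raw data size are unknown here and therefore set to -1.
     */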
private static CatalogTableStatistics convertToTableStatistics(Row row) {
Long rowCount = row.getFieldAs(getRowCountColumn());
return new CatalogTableStatistics(rowCount, -1, -1, -1);
}
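    /**
     * Converts the per-column fields of the result row into catalog column statistics; columns
     * whose type yields no statistics are skipped.
     */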
private static CatalogColumnStatistics convertToColumnStatistics(
            Row row, List<Column> columns) {
        Preconditions.checkArgument(!columns.isEmpty());
        Map<String, CatalogColumnStatisticsDataBase> columnStatMap = new HashMap<>();
for (Column column : columns) {
CatalogColumnStatisticsDataBase columnStat = convertToColumnStatisticsData(row, column);
if (columnStat != null) {
columnStatMap.put(column.getName(), columnStat);
}
}
return new CatalogColumnStatistics(columnStatMap);
}
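    /**
     * Converts the per-column fields of the result row into a type-specific statistics object,
     * e.g. integral and time-like types map to long-based statistics, CHAR/VARCHAR map to string
     * statistics, and BINARY/VARBINARY only carry a null count. Returns null for types without
     * collected statistics.
     */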
private static CatalogColumnStatisticsDataBase convertToColumnStatisticsData(
Row row, Column column) {
String c = column.getName();
Long nullCount = row.getFieldAs(getNullCountColumn(c));
switch (column.getDataType().getLogicalType().getTypeRoot()) {
case BOOLEAN:
Long trueCount = row.getFieldAs(getTrueCountColumn(c));
Long falseCount = row.getFieldAs(getFalseCountColumn(c));
return new CatalogColumnStatisticsDataBoolean(trueCount, falseCount, nullCount);
case TINYINT:
Byte maxByte = row.getFieldAs(getMaxColumn(c));
Byte minByte = row.getFieldAs(getMinColumn(c));
Long ndvByte = row.getFieldAs(getNdvColumn(c));
return new CatalogColumnStatisticsDataLong(
minByte != null ? minByte.longValue() : null,
maxByte != null ? maxByte.longValue() : null,
ndvByte,
nullCount);
case SMALLINT:
Short maxShort = row.getFieldAs(getMaxColumn(c));
Short minShort = row.getFieldAs(getMinColumn(c));
Long ndvShort = row.getFieldAs(getNdvColumn(c));
return new CatalogColumnStatisticsDataLong(
minShort != null ? minShort.longValue() : null,
maxShort != null ? maxShort.longValue() : null,
ndvShort,
nullCount);
case INTEGER:
Integer maxInt = row.getFieldAs(getMaxColumn(c));
Integer minInt = row.getFieldAs(getMinColumn(c));
Long ndvInt = row.getFieldAs(getNdvColumn(c));
return new CatalogColumnStatisticsDataLong(
minInt != null ? minInt.longValue() : null,
maxInt != null ? maxInt.longValue() : null,
ndvInt,
nullCount);
case BIGINT:
Long ndvLong = row.getFieldAs(getNdvColumn(c));
Long maxLong = row.getFieldAs(getMaxColumn(c));
Long minLong = row.getFieldAs(getMinColumn(c));
return new CatalogColumnStatisticsDataLong(minLong, maxLong, ndvLong, nullCount);
case TIMESTAMP_WITHOUT_TIME_ZONE:
Long ndvTs = row.getFieldAs(getNdvColumn(c));
LocalDateTime maxTs = row.getFieldAs(getMaxColumn(c));
LocalDateTime minTs = row.getFieldAs(getMinColumn(c));
return new CatalogColumnStatisticsDataLong(
minTs != null ? minTs.toEpochSecond(ZoneOffset.UTC) : null,
maxTs != null ? maxTs.toEpochSecond(ZoneOffset.UTC) : null,
ndvTs,
nullCount);
case TIMESTAMP_WITH_LOCAL_TIME_ZONE:
Long ndvTsLtz = row.getFieldAs(getNdvColumn(c));
Instant maxTsLtz = row.getFieldAs(getMaxColumn(c));
Instant minTsLtz = row.getFieldAs(getMinColumn(c));
return new CatalogColumnStatisticsDataLong(
minTsLtz != null ? minTsLtz.toEpochMilli() : null,
maxTsLtz != null ? maxTsLtz.toEpochMilli() : null,
ndvTsLtz,
nullCount);
case FLOAT:
Long ndvFloat = row.getFieldAs(getNdvColumn(c));
Float maxFloat = row.getFieldAs(getMaxColumn(c));
Float minFloat = row.getFieldAs(getMinColumn(c));
return new CatalogColumnStatisticsDataDouble(
minFloat != null ? minFloat.doubleValue() : null,
maxFloat != null ? maxFloat.doubleValue() : null,
ndvFloat,
nullCount);
case DOUBLE:
Long ndvDouble = row.getFieldAs(getNdvColumn(c));
Double maxDouble = row.getFieldAs(getMaxColumn(c));
Double minDouble = row.getFieldAs(getMinColumn(c));
return new CatalogColumnStatisticsDataDouble(
minDouble, maxDouble, ndvDouble, nullCount);
case DECIMAL:
Long ndvDecimal = row.getFieldAs(getNdvColumn(c));
BigDecimal maxDecimal = row.getFieldAs(getMaxColumn(c));
BigDecimal minDecimal = row.getFieldAs(getMinColumn(c));
return new CatalogColumnStatisticsDataDouble(
minDecimal != null ? minDecimal.doubleValue() : null,
maxDecimal != null ? maxDecimal.doubleValue() : null,
ndvDecimal,
nullCount);
case DATE:
Long ndvDate = row.getFieldAs(getNdvColumn(c));
LocalDate maxDate = row.getFieldAs(getMaxColumn(c));
LocalDate minDate = row.getFieldAs(getMinColumn(c));
return new CatalogColumnStatisticsDataDate(
minDate != null ? new Date(minDate.toEpochDay()) : null,
maxDate != null ? new Date(maxDate.toEpochDay()) : null,
ndvDate,
nullCount);
case TIME_WITHOUT_TIME_ZONE:
Long ndvTime = row.getFieldAs(getNdvColumn(c));
LocalTime maxTime = row.getFieldAs(getMaxColumn(c));
LocalTime minTime = row.getFieldAs(getMinColumn(c));
return new CatalogColumnStatisticsDataLong(
minTime != null ? minTime.toNanoOfDay() : null,
maxTime != null ? maxTime.toNanoOfDay() : null,
ndvTime,
nullCount);
case CHAR:
case VARCHAR:
Long ndvString = row.getFieldAs(getNdvColumn(c));
Double avgLen = row.getFieldAs(getAvgLenColumn(c));
Long maxLen = row.getFieldAs(getMaxLenColumn(c));
return new CatalogColumnStatisticsDataString(maxLen, avgLen, ndvString, nullCount);
case BINARY:
case VARBINARY:
return new CatalogColumnStatisticsDataBinary(null, null, nullCount);
default:
return null;
}
}
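    /** Simple holder pairing table-level statistics with optional column-level statistics. */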
private static class StatisticsWrapper {
private final CatalogTableStatistics tableStat;
private final CatalogColumnStatistics columnStat;
private StatisticsWrapper(
CatalogTableStatistics tableStat, CatalogColumnStatistics columnStat) {
this.tableStat = tableStat;
this.columnStat = columnStat;
}
}
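    // The helpers below derive the column aliases used in the generated statistics SQL.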
private static String getRowCountColumn() {
return "rowCount";
}
private static String getNullCountColumn(String column) {
return String.format("%s_nullCount", column);
}
private static String getNdvColumn(String column) {
return String.format("%s_ndv", column);
}
private static String getTrueCountColumn(String column) {
return String.format("%s_trueCount", column);
}
private static String getFalseCountColumn(String column) {
return String.format("%s_falseCount", column);
}
private static String getMaxColumn(String column) {
return String.format("%s_max", column);
}
private static String getMinColumn(String column) {
return String.format("%s_min", column);
}
private static String getAvgLenColumn(String column) {
return String.format("%s_avgLen", column);
}
private static String getMaxLenColumn(String column) {
return String.format("%s_maxLen", column);
}
private static String getPartitionIdxColumn() {
return "part_idx";
}
}