All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.paimon.flink.utils.TableStatsUtil Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.paimon.flink.utils;

import org.apache.paimon.Snapshot;
import org.apache.paimon.data.Decimal;
import org.apache.paimon.stats.ColStats;
import org.apache.paimon.stats.Statistics;
import org.apache.paimon.table.FileStoreTable;
import org.apache.paimon.types.DataField;
import org.apache.paimon.types.DataTypeRoot;

import org.apache.flink.table.catalog.exceptions.CatalogException;
import org.apache.flink.table.catalog.stats.CatalogColumnStatistics;
import org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataBase;
import org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataBinary;
import org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataBoolean;
import org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataDate;
import org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataDouble;
import org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataLong;
import org.apache.flink.table.catalog.stats.CatalogColumnStatisticsDataString;
import org.apache.flink.table.catalog.stats.CatalogTableStatistics;

import javax.annotation.Nullable;

import java.math.BigDecimal;
import java.sql.Timestamp;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/** Utility methods for analysis table. */
public class TableStatsUtil {

    /** create Paimon statistics. */
    @Nullable
    public static Statistics createTableStats(
            FileStoreTable table, CatalogTableStatistics catalogTableStatistics) {
        Snapshot snapshot = table.snapshotManager().latestSnapshot();
        if (snapshot == null) {
            return null;
        }
        return new Statistics(
                snapshot.id(),
                snapshot.schemaId(),
                catalogTableStatistics.getRowCount(),
                catalogTableStatistics.getTotalSize());
    }

    /** Create Paimon statistics from given Flink columnStatistics. */
    @Nullable
    public static Statistics createTableColumnStats(
            FileStoreTable table, CatalogColumnStatistics columnStatistics) {
        if (!table.statistics().isPresent()) {
            return null;
        }
        Statistics statistics = table.statistics().get();
        List fields = table.schema().fields();
        Map> tableColumnStatsMap = new HashMap<>(fields.size());
        for (DataField field : fields) {
            CatalogColumnStatisticsDataBase catalogColumnStatisticsDataBase =
                    columnStatistics.getColumnStatisticsData().get(field.name());
            if (catalogColumnStatisticsDataBase == null) {
                continue;
            }
            tableColumnStatsMap.put(
                    field.name(), getPaimonColStats(field, catalogColumnStatisticsDataBase));
        }
        statistics.colStats().putAll(tableColumnStatsMap);
        return statistics;
    }

    /** Convert Flink ColumnStats to Paimon ColStats according to Paimon column type. */
    private static ColStats getPaimonColStats(
            DataField field, CatalogColumnStatisticsDataBase colStat) {
        DataTypeRoot typeRoot = field.type().getTypeRoot();
        if (colStat instanceof CatalogColumnStatisticsDataString) {
            CatalogColumnStatisticsDataString stringColStat =
                    (CatalogColumnStatisticsDataString) colStat;
            if (typeRoot.equals(DataTypeRoot.CHAR) || typeRoot.equals(DataTypeRoot.VARCHAR)) {
                return ColStats.newColStats(
                        field.id(),
                        null != stringColStat.getNdv() ? stringColStat.getNdv() : null,
                        null,
                        null,
                        null != stringColStat.getNullCount() ? stringColStat.getNullCount() : null,
                        null != stringColStat.getAvgLength()
                                ? stringColStat.getAvgLength().longValue()
                                : null,
                        null != stringColStat.getMaxLength() ? stringColStat.getMaxLength() : null);
            }
        } else if (colStat instanceof CatalogColumnStatisticsDataBoolean) {
            CatalogColumnStatisticsDataBoolean booleanColStat =
                    (CatalogColumnStatisticsDataBoolean) colStat;
            if (typeRoot.equals(DataTypeRoot.BOOLEAN)) {
                return ColStats.newColStats(
                        field.id(),
                        (booleanColStat.getFalseCount() > 0 ? 1L : 0)
                                + (booleanColStat.getTrueCount() > 0 ? 1L : 0),
                        null,
                        null,
                        booleanColStat.getNullCount(),
                        null,
                        null);
            }
        } else if (colStat instanceof CatalogColumnStatisticsDataLong) {
            CatalogColumnStatisticsDataLong longColStat = (CatalogColumnStatisticsDataLong) colStat;
            if (typeRoot.equals(DataTypeRoot.INTEGER)) {
                return ColStats.newColStats(
                        field.id(),
                        null != longColStat.getNdv() ? longColStat.getNdv() : null,
                        null != longColStat.getMin() ? longColStat.getMin().intValue() : null,
                        null != longColStat.getMax() ? longColStat.getMax().intValue() : null,
                        null != longColStat.getNullCount() ? longColStat.getNullCount() : null,
                        null,
                        null);
            } else if (typeRoot.equals(DataTypeRoot.TINYINT)) {
                return ColStats.newColStats(
                        field.id(),
                        null != longColStat.getNdv() ? longColStat.getNdv() : null,
                        null != longColStat.getMin() ? longColStat.getMin().byteValue() : null,
                        null != longColStat.getMax() ? longColStat.getMax().byteValue() : null,
                        null != longColStat.getNullCount() ? longColStat.getNullCount() : null,
                        null,
                        null);

            } else if (typeRoot.equals(DataTypeRoot.SMALLINT)) {
                return ColStats.newColStats(
                        field.id(),
                        null != longColStat.getNdv() ? longColStat.getNdv() : null,
                        null != longColStat.getMin() ? longColStat.getMin().shortValue() : null,
                        null != longColStat.getMax() ? longColStat.getMax().shortValue() : null,
                        null != longColStat.getNullCount() ? longColStat.getNullCount() : null,
                        null,
                        null);
            } else if (typeRoot.equals(DataTypeRoot.BIGINT)) {
                return ColStats.newColStats(
                        field.id(),
                        null != longColStat.getNdv() ? longColStat.getNdv() : null,
                        null != longColStat.getMin() ? longColStat.getMin() : null,
                        null != longColStat.getMax() ? longColStat.getMax() : null,
                        null != longColStat.getNullCount() ? longColStat.getNullCount() : null,
                        null,
                        null);
            } else if (typeRoot.equals(DataTypeRoot.TIMESTAMP_WITH_LOCAL_TIME_ZONE)) {
                return ColStats.newColStats(
                        field.id(),
                        null != longColStat.getNdv() ? longColStat.getNdv() : null,
                        null != longColStat.getMin()
                                ? org.apache.paimon.data.Timestamp.fromSQLTimestamp(
                                        new Timestamp(longColStat.getMin()))
                                : null,
                        null != longColStat.getMax()
                                ? org.apache.paimon.data.Timestamp.fromSQLTimestamp(
                                        new Timestamp(longColStat.getMax()))
                                : null,
                        null != longColStat.getNullCount() ? longColStat.getNullCount() : null,
                        null,
                        null);
            }
        } else if (colStat instanceof CatalogColumnStatisticsDataDouble) {
            CatalogColumnStatisticsDataDouble doubleColumnStatsData =
                    (CatalogColumnStatisticsDataDouble) colStat;
            if (typeRoot.equals(DataTypeRoot.FLOAT)) {
                return ColStats.newColStats(
                        field.id(),
                        null != doubleColumnStatsData.getNdv()
                                ? doubleColumnStatsData.getNdv()
                                : null,
                        null != doubleColumnStatsData.getMin()
                                ? doubleColumnStatsData.getMin().floatValue()
                                : null,
                        null != doubleColumnStatsData.getMax()
                                ? doubleColumnStatsData.getMax().floatValue()
                                : null,
                        null != doubleColumnStatsData.getNullCount()
                                ? doubleColumnStatsData.getNullCount()
                                : null,
                        null,
                        null);
            } else if (typeRoot.equals(DataTypeRoot.DOUBLE)) {
                return ColStats.newColStats(
                        field.id(),
                        null != doubleColumnStatsData.getNdv()
                                ? doubleColumnStatsData.getNdv()
                                : null,
                        null != doubleColumnStatsData.getMin()
                                ? doubleColumnStatsData.getMin()
                                : null,
                        null != doubleColumnStatsData.getMax()
                                ? doubleColumnStatsData.getMax()
                                : null,
                        null != doubleColumnStatsData.getNullCount()
                                ? doubleColumnStatsData.getNullCount()
                                : null,
                        null,
                        null);
            } else if (typeRoot.equals(DataTypeRoot.DECIMAL)) {
                BigDecimal max = BigDecimal.valueOf(doubleColumnStatsData.getMax());
                BigDecimal min = BigDecimal.valueOf(doubleColumnStatsData.getMin());
                return ColStats.newColStats(
                        field.id(),
                        null != doubleColumnStatsData.getNdv()
                                ? doubleColumnStatsData.getNdv()
                                : null,
                        null != doubleColumnStatsData.getMin()
                                ? Decimal.fromBigDecimal(min, min.precision(), min.scale())
                                : null,
                        null != doubleColumnStatsData.getMax()
                                ? Decimal.fromBigDecimal(max, max.precision(), max.scale())
                                : null,
                        null != doubleColumnStatsData.getNullCount()
                                ? doubleColumnStatsData.getNullCount()
                                : null,
                        null,
                        null);
            }
        } else if (colStat instanceof CatalogColumnStatisticsDataDate) {
            CatalogColumnStatisticsDataDate dateColumnStatsData =
                    (CatalogColumnStatisticsDataDate) colStat;
            if (typeRoot.equals(DataTypeRoot.DATE)) {
                return ColStats.newColStats(
                        field.id(),
                        null != dateColumnStatsData.getNdv() ? dateColumnStatsData.getNdv() : null,
                        null != dateColumnStatsData.getMin()
                                ? new Long(dateColumnStatsData.getMin().getDaysSinceEpoch())
                                        .intValue()
                                : null,
                        null != dateColumnStatsData.getMax()
                                ? new Long(dateColumnStatsData.getMax().getDaysSinceEpoch())
                                        .intValue()
                                : null,
                        null != dateColumnStatsData.getNullCount()
                                ? dateColumnStatsData.getNullCount()
                                : null,
                        null,
                        null);
            }
        } else if (colStat instanceof CatalogColumnStatisticsDataBinary) {
            CatalogColumnStatisticsDataBinary binaryColumnStatsData =
                    (CatalogColumnStatisticsDataBinary) colStat;
            if (typeRoot.equals(DataTypeRoot.VARBINARY) || typeRoot.equals(DataTypeRoot.BINARY)) {
                return ColStats.newColStats(
                        field.id(),
                        null,
                        null,
                        null,
                        null != binaryColumnStatsData.getNullCount()
                                ? binaryColumnStatsData.getNullCount()
                                : null,
                        null != binaryColumnStatsData.getAvgLength()
                                ? binaryColumnStatsData.getAvgLength().longValue()
                                : null,
                        null != binaryColumnStatsData.getMaxLength()
                                ? binaryColumnStatsData.getMaxLength()
                                : null);
            }
        }
        throw new CatalogException(
                String.format(
                        "Flink does not support convert ColumnStats '%s' for Paimon column "
                                + "type '%s' yet",
                        colStat, field.type()));
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy