All Downloads are FREE. Search and download functionalities are using the official Maven repository.

io.prestosql.tests.hive.TestHiveBasicTableStatistics Maven / Gradle / Ivy

There is a newer version: 350
Show newest version
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.prestosql.tests.hive;

import com.google.common.primitives.Longs;
import io.prestosql.tempto.Requires;
import io.prestosql.tempto.fulfillment.table.hive.tpch.ImmutableTpchTablesRequirements.ImmutableNationTable;
import io.prestosql.tempto.query.QueryExecutor;
import io.prestosql.tempto.query.QueryResult;
import org.testng.annotations.Test;

import java.util.List;
import java.util.Optional;
import java.util.OptionalLong;

import static com.google.common.base.Verify.verify;
import static io.prestosql.tests.TestGroups.SKIP_ON_CDH;
import static io.prestosql.tests.hive.util.TableLocationUtils.getTableLocation;
import static io.prestosql.tests.utils.QueryExecutors.onHive;
import static io.prestosql.tests.utils.QueryExecutors.onPresto;
import static java.lang.String.format;
import static java.util.Objects.requireNonNull;
import static org.assertj.core.api.Assertions.assertThat;

@Requires(ImmutableNationTable.class)
public class TestHiveBasicTableStatistics
        extends HiveProductTest
{
    @Test
    public void testCreateUnpartitioned()
    {
        String tableName = "test_basic_statistics_unpartitioned_ctas_presto";

        onPresto().executeQuery(format("DROP TABLE IF EXISTS %s", tableName));
        onPresto().executeQuery(format("CREATE TABLE %s AS SELECT * FROM nation", tableName));

        try {
            BasicStatistics statistics = getBasicStatisticsForTable(onHive(), tableName);
            assertThatStatisticsAreNonZero(statistics);
            assertThat(statistics.getNumRows().getAsLong()).isEqualTo(25);
        }
        finally {
            onPresto().executeQuery(format("DROP TABLE IF EXISTS %s", tableName));
        }
    }

    @Test(groups = SKIP_ON_CDH /* CDH 5 metastore automatically gathers raw data size statistics on its own */)
    public void testCreateExternalUnpartitioned()
    {
        String tableName = "test_basic_statistics_external_unpartitioned_presto";

        onPresto().executeQuery(format("DROP TABLE IF EXISTS %s", tableName));

        try {
            String location = getTableLocation("nation");
            onPresto().executeQuery(format("" +
                            "CREATE TABLE %s (" +
                            "   n_nationkey bigint, " +
                            "   n_regionkey bigint, " +
                            "   n_name varchar(25), " +
                            "   n_comment varchar(152)) " +
                            "WITH (external_location = '%s', format = 'TEXTFILE', textfile_field_separator = '|')",
                    tableName,
                    location));
            BasicStatistics statistics = getBasicStatisticsForTable(onHive(), tableName);
            assertThatStatisticsAreNotPresent(statistics);
        }
        finally {
            onPresto().executeQuery(format("DROP TABLE IF EXISTS %s", tableName));
        }
    }

    @Test
    public void testCreateTableWithNoData()
    {
        String tableName = "test_basic_statistics_unpartitioned_ctas_presto_with_no_data";

        onPresto().executeQuery(format("DROP TABLE IF EXISTS %s", tableName));
        onPresto().executeQuery(format("CREATE TABLE %s AS SELECT * FROM nation WITH NO DATA", tableName));

        try {
            BasicStatistics statistics = getBasicStatisticsForTable(onHive(), tableName);
            assertThatStatisticsAreZero(statistics);
        }
        finally {
            onPresto().executeQuery(format("DROP TABLE IF EXISTS %s", tableName));
        }
    }

    @Test
    public void testInsertUnpartitioned()
    {
        String tableName = "test_basic_statistics_unpartitioned_insert_presto";

        onPresto().executeQuery(format("DROP TABLE IF EXISTS %s", tableName));
        onPresto().executeQuery(format("" +
                "CREATE TABLE %s (" +
                "   n_nationkey bigint, " +
                "   n_regionkey bigint, " +
                "   n_name varchar(25), " +
                "   n_comment varchar(152)" +
                ")", tableName));

        try {
            BasicStatistics statisticsEmpty = getBasicStatisticsForTable(onHive(), tableName);
            assertThatStatisticsAreZero(statisticsEmpty);

            insertNationData(onPresto(), tableName);

            BasicStatistics statisticsFirstInsert = getBasicStatisticsForTable(onHive(), tableName);
            assertThatStatisticsAreNonZero(statisticsFirstInsert);
            assertThat(statisticsFirstInsert.getNumRows().getAsLong()).isEqualTo(25);

            insertNationData(onPresto(), tableName);

            BasicStatistics statisticsSecondInsert = getBasicStatisticsForTable(onHive(), tableName);
            assertThatStatisticsAreNonZero(statisticsSecondInsert);
            assertThatStatisticsValuesHaveIncreased(statisticsFirstInsert, statisticsSecondInsert);
            assertThat(statisticsSecondInsert.getNumRows().getAsLong()).isEqualTo(50);
        }
        finally {
            onPresto().executeQuery(format("DROP TABLE IF EXISTS %s", tableName));
        }
    }

    @Test
    public void testCreatePartitioned()
    {
        String tableName = "test_basic_statistics_partitioned_ctas_presto";

        onPresto().executeQuery(format("DROP TABLE IF EXISTS %s", tableName));
        onPresto().executeQuery(format("" +
                "CREATE TABLE %s " +
                "WITH (" +
                "   partitioned_by = ARRAY['n_regionkey'] " +
                ") " +
                "AS " +
                "SELECT n_nationkey, n_name, n_comment, n_regionkey " +
                "FROM nation " +
                // turns out there are exactly 5 countries in each region
                // let's change records count for one of the regions to verify statistics are different
                "WHERE n_nationkey <> 23", tableName));

        try {
            if (getHiveVersionMajor() < 3) {
                BasicStatistics tableStatistics = getBasicStatisticsForTable(onHive(), tableName);
                assertThatStatisticsAreNotPresent(tableStatistics);
            }

            BasicStatistics firstPartitionStatistics = getBasicStatisticsForPartition(onHive(), tableName, "n_regionkey=1");
            assertThatStatisticsAreNonZero(firstPartitionStatistics);
            assertThat(firstPartitionStatistics.getNumRows().getAsLong()).isEqualTo(5);

            BasicStatistics secondPartitionStatistics = getBasicStatisticsForPartition(onHive(), tableName, "n_regionkey=3");
            assertThatStatisticsAreNonZero(secondPartitionStatistics);
            assertThat(secondPartitionStatistics.getNumRows().getAsLong()).isEqualTo(4);
        }
        finally {
            onPresto().executeQuery(format("DROP TABLE IF EXISTS %s", tableName));
        }
    }

    @Test(groups = SKIP_ON_CDH /* CDH 5 metastore automatically gathers raw data size statistics on its own */)
    public void testAnalyzePartitioned()
    {
        String tableName = "test_basic_statistics_analyze_partitioned";

        onPresto().executeQuery("DROP TABLE IF EXISTS " + tableName);
        onPresto().executeQuery(format("" +
                "CREATE TABLE %s " +
                "WITH ( " +
                "   partitioned_by = ARRAY['n_regionkey'], " +
                "   bucketed_by = ARRAY['n_nationkey'], " +
                "   bucket_count = 10 " +
                ") " +
                "AS " +
                "SELECT n_nationkey, n_name, n_comment, n_regionkey " +
                "FROM nation " +
                "WHERE n_regionkey = 1", tableName));

        try {
            if (getHiveVersionMajor() < 3) {
                BasicStatistics tableStatistics = getBasicStatisticsForTable(onHive(), tableName);
                assertThatStatisticsAreNotPresent(tableStatistics);
            }

            BasicStatistics partitionStatisticsBefore = getBasicStatisticsForPartition(onHive(), tableName, "n_regionkey=1");
            assertThatStatisticsArePresent(partitionStatisticsBefore);

            // run ANALYZE
            onPresto().executeQuery(format("ANALYZE %s", tableName));
            BasicStatistics partitionStatisticsAfter = getBasicStatisticsForPartition(onHive(), tableName, "n_regionkey=1");
            assertThatStatisticsArePresent(partitionStatisticsAfter);

            // ANALYZE must not change the basic stats
            assertThat(partitionStatisticsBefore.getNumRows().getAsLong()).isEqualTo(partitionStatisticsAfter.getNumRows().getAsLong());
            assertThat(partitionStatisticsBefore.getNumFiles().getAsLong()).isEqualTo(partitionStatisticsAfter.getNumFiles().getAsLong());
            assertThat(partitionStatisticsBefore.getRawDataSize().getAsLong()).isEqualTo(partitionStatisticsAfter.getRawDataSize().getAsLong());
            assertThat(partitionStatisticsBefore.getTotalSize().getAsLong()).isEqualTo(partitionStatisticsAfter.getTotalSize().getAsLong());
        }
        finally {
            onPresto().executeQuery(format("DROP TABLE %s", tableName));
        }
    }

    @Test
    public void testAnalyzeUnpartitioned()
    {
        String tableName = "test_basic_statistics_analyze_unpartitioned";

        onPresto().executeQuery("DROP TABLE IF EXISTS " + tableName);
        onPresto().executeQuery(format("" +
                "CREATE TABLE %s " +
                "AS " +
                "SELECT n_nationkey, n_name, n_comment, n_regionkey " +
                "FROM nation " +
                "WHERE n_regionkey = 1", tableName));

        try {
            BasicStatistics tableStatisticsBefore = getBasicStatisticsForTable(onHive(), tableName);
            assertThatStatisticsArePresent(tableStatisticsBefore);

            // run ANALYZE
            onPresto().executeQuery(format("ANALYZE %s", tableName));
            BasicStatistics tableStatisticsAfter = getBasicStatisticsForTable(onHive(), tableName);
            assertThatStatisticsArePresent(tableStatisticsAfter);

            // ANALYZE must not change the basic stats
            assertThat(tableStatisticsBefore.getNumRows()).isEqualTo(tableStatisticsAfter.getNumRows());
            assertThat(tableStatisticsBefore.getNumFiles()).isEqualTo(tableStatisticsAfter.getNumFiles());
            assertThat(tableStatisticsBefore.getRawDataSize()).isEqualTo(tableStatisticsAfter.getRawDataSize());
            assertThat(tableStatisticsBefore.getTotalSize()).isEqualTo(tableStatisticsAfter.getTotalSize());
        }
        finally {
            onPresto().executeQuery(format("DROP TABLE %s", tableName));
        }
    }

    @Test
    public void testInsertPartitioned()
    {
        String tableName = "test_basic_statistics_partitioned_insert_presto";

        onPresto().executeQuery(format("DROP TABLE IF EXISTS %s", tableName));
        onPresto().executeQuery(format("" +
                "CREATE TABLE %s (" +
                "   n_nationkey bigint, " +
                "   n_name varchar(25), " +
                "   n_comment varchar(152), " +
                "   n_regionkey bigint " +
                ")" +
                "WITH (" +
                "   partitioned_by = ARRAY['n_regionkey'] " +
                ") ", tableName));

        try {
            if (getHiveVersionMajor() < 3) {
                BasicStatistics tableStatisticsAfterCreate = getBasicStatisticsForTable(onHive(), tableName);
                assertThatStatisticsAreNotPresent(tableStatisticsAfterCreate);
            }

            insertNationData(onPresto(), tableName);

            BasicStatistics partitionStatisticsFirstInsert = getBasicStatisticsForPartition(onHive(), tableName, "n_regionkey=3");
            assertThatStatisticsAreNonZero(partitionStatisticsFirstInsert);
            assertThat(partitionStatisticsFirstInsert.getNumRows().getAsLong()).isEqualTo(5);

            insertNationData(onPresto(), tableName);

            BasicStatistics statisticsSecondInsert = getBasicStatisticsForPartition(onHive(), tableName, "n_regionkey=3");
            assertThat(statisticsSecondInsert.getNumRows().getAsLong()).isEqualTo(10);
            assertThatStatisticsValuesHaveIncreased(partitionStatisticsFirstInsert, statisticsSecondInsert);
        }
        finally {
            onPresto().executeQuery(format("DROP TABLE IF EXISTS %s", tableName));
        }
    }

    @Test
    public void testInsertBucketed()
    {
        String tableName = "test_basic_statistics_bucketed_insert_presto";

        onPresto().executeQuery(format("DROP TABLE IF EXISTS %s", tableName));
        onPresto().executeQuery(format("" +
                "CREATE TABLE %s " +
                "WITH ( " +
                "   bucketed_by = ARRAY['n_nationkey'], " +
                "   bucket_count = 50 " +
                ") " +
                "AS " +
                "SELECT n_nationkey, n_name, n_comment, n_regionkey " +
                "FROM nation", tableName));

        try {
            BasicStatistics statisticsAfterCreate = getBasicStatisticsForTable(onHive(), tableName);
            assertThatStatisticsAreNonZero(statisticsAfterCreate);
            assertThat(statisticsAfterCreate.getNumRows().getAsLong()).isEqualTo(25);
            assertThat(statisticsAfterCreate.getNumFiles().getAsLong()).isEqualTo(25); // no files for empty buckets

            insertNationData(onPresto(), tableName);

            BasicStatistics statisticsAfterInsert = getBasicStatisticsForTable(onHive(), tableName);
            assertThat(statisticsAfterInsert.getNumRows().getAsLong()).isEqualTo(50);
            assertThat(statisticsAfterInsert.getNumFiles().getAsLong()).isEqualTo(50); // no files for empty buckets

            insertNationData(onPresto(), tableName);

            BasicStatistics statisticsAfterInsert2 = getBasicStatisticsForTable(onHive(), tableName);
            assertThat(statisticsAfterInsert2.getNumRows().getAsLong()).isEqualTo(75);
            assertThat(statisticsAfterInsert2.getNumFiles().getAsLong()).isEqualTo(75); // no files for empty buckets
        }
        finally {
            onPresto().executeQuery(format("DROP TABLE IF EXISTS %s", tableName));
        }
    }

    @Test
    public void testInsertBucketedPartitioned()
    {
        String tableName = "test_basic_statistics_bucketed_partitioned_insert_presto";

        onPresto().executeQuery(format("DROP TABLE IF EXISTS %s", tableName));
        onPresto().executeQuery(format("" +
                "CREATE TABLE %s " +
                "WITH ( " +
                "   partitioned_by = ARRAY['n_regionkey'], " +
                "   bucketed_by = ARRAY['n_nationkey'], " +
                "   bucket_count = 10 " +
                ") " +
                "AS " +
                "SELECT n_nationkey, n_name, n_comment, n_regionkey " +
                "FROM nation " +
                "WHERE n_regionkey = 1", tableName));

        try {
            if (getHiveVersionMajor() < 3) {
                BasicStatistics tableStatistics = getBasicStatisticsForTable(onHive(), tableName);
                assertThatStatisticsAreNotPresent(tableStatistics);
            }

            BasicStatistics firstPartitionStatistics = getBasicStatisticsForPartition(onHive(), tableName, "n_regionkey=1");
            assertThatStatisticsAreNonZero(firstPartitionStatistics);
            assertThat(firstPartitionStatistics.getNumRows().getAsLong()).isEqualTo(5);
            assertThat(firstPartitionStatistics.getNumFiles().getAsLong()).isEqualTo(5); // no files for empty buckets

            String insert = format("INSERT INTO %s (n_nationkey, n_regionkey, n_name, n_comment) " +
                    "SELECT n_nationkey, n_regionkey, n_name, n_comment " +
                    "FROM nation " +
                    "WHERE n_regionkey = 2", tableName);

            onPresto().executeQuery(insert);

            BasicStatistics secondPartitionStatistics = getBasicStatisticsForPartition(onHive(), tableName, "n_regionkey=2");
            assertThat(secondPartitionStatistics.getNumRows().getAsLong()).isEqualTo(5);
            assertThat(secondPartitionStatistics.getNumFiles().getAsLong()).isEqualTo(4); // no files for empty buckets

            onPresto().executeQuery(insert);

            BasicStatistics secondPartitionUpdatedStatistics = getBasicStatisticsForPartition(onHive(), tableName, "n_regionkey=2");
            assertThat(secondPartitionUpdatedStatistics.getNumRows().getAsLong()).isEqualTo(10);
            assertThat(secondPartitionUpdatedStatistics.getNumFiles().getAsLong()).isEqualTo(8); // no files for empty buckets
        }
        finally {
            onPresto().executeQuery(format("DROP TABLE IF EXISTS %s", tableName));
        }
    }

    private static void insertNationData(QueryExecutor executor, String tableName)
    {
        executor.executeQuery(format("" +
                "INSERT INTO %s (n_nationkey, n_regionkey, n_name, n_comment) " +
                "SELECT n_nationkey, n_regionkey, n_name, n_comment FROM nation", tableName));
    }

    private static void assertThatStatisticsAreNonZero(BasicStatistics statistics)
    {
        assertThatStatisticsArePresent(statistics);
        assertThat(statistics.getNumRows().getAsLong()).isGreaterThan(0);
        assertThat(statistics.getNumFiles().getAsLong()).isGreaterThan(0);
        assertThat(statistics.getRawDataSize().getAsLong()).isGreaterThan(0);
        assertThat(statistics.getTotalSize().getAsLong()).isGreaterThan(0);
    }

    private static void assertThatStatisticsAreZero(BasicStatistics statistics)
    {
        assertThatStatisticsArePresent(statistics);
        assertThat(statistics.getNumRows().getAsLong()).isEqualTo(0);
        assertThat(statistics.getNumFiles().getAsLong()).isEqualTo(0);
        assertThat(statistics.getRawDataSize().getAsLong()).isEqualTo(0);
        assertThat(statistics.getTotalSize().getAsLong()).isEqualTo(0);
    }

    private static void assertThatStatisticsArePresent(BasicStatistics statistics)
    {
        assertThat(statistics.getNumRows()).isPresent();
        assertThat(statistics.getNumFiles()).isPresent();
        assertThat(statistics.getRawDataSize()).isPresent();
        assertThat(statistics.getTotalSize()).isPresent();
    }

    private static void assertThatStatisticsAreNotPresent(BasicStatistics statistics)
    {
        assertThat(statistics.getNumRows()).isNotPresent();
        assertThat(statistics.getNumFiles()).isNotPresent();
        assertThat(statistics.getRawDataSize()).isNotPresent();
        assertThat(statistics.getTotalSize()).isNotPresent();
    }

    private static void assertThatStatisticsValuesHaveIncreased(BasicStatistics first, BasicStatistics second)
    {
        assertThat(second.getNumRows().getAsLong()).isGreaterThan(first.getNumRows().getAsLong());
        assertThat(second.getNumFiles().getAsLong()).isGreaterThan(first.getNumFiles().getAsLong());
        assertThat(second.getTotalSize().getAsLong()).isGreaterThan(first.getTotalSize().getAsLong());
        assertThat(second.getRawDataSize().getAsLong()).isGreaterThan(first.getRawDataSize().getAsLong());
    }

    private static BasicStatistics getBasicStatisticsForTable(QueryExecutor executor, String table)
    {
        QueryResult result = executor.executeQuery(format("DESCRIBE FORMATTED %s", table));
        return basicStatisticsFromDescribeResult(result);
    }

    private static BasicStatistics getBasicStatisticsForPartition(QueryExecutor executor, String table, String partition)
    {
        QueryResult result = executor.executeQuery(format("DESCRIBE FORMATTED %s partition (%s)", table, partition));
        return basicStatisticsFromDescribeResult(result);
    }

    private static BasicStatistics basicStatisticsFromDescribeResult(QueryResult result)
    {
        OptionalLong numFiles = getTableParameterValue(result, "numFiles");
        OptionalLong numRows = getTableParameterValue(result, "numRows");
        OptionalLong rawDataSize = getTableParameterValue(result, "rawDataSize");
        OptionalLong totalSize = getTableParameterValue(result, "totalSize");
        return new BasicStatistics(numFiles, numRows, rawDataSize, totalSize);
    }

    private static OptionalLong getTableParameterValue(QueryResult describeResult, String key)
    {
        verify(describeResult.getColumnsCount() == 3, "describe result is expected to have 3 columns");
        for (List row : describeResult.rows()) {
            Optional parameterKey = Optional.ofNullable(row.get(1))
                    .map(Object::toString)
                    .map(String::trim);

            if (parameterKey.isPresent() && key.equals(parameterKey.get())) {
                return Optional.ofNullable(row.get(2))
                        .map(Object::toString)
                        .map(String::trim)
                        .map(TestHiveBasicTableStatistics::tryParse)
                        .get();
            }
        }
        return OptionalLong.empty();
    }

    private static OptionalLong tryParse(String value)
    {
        Long number = Longs.tryParse(value);
        if (number != null) {
            return OptionalLong.of(number);
        }
        return OptionalLong.empty();
    }

    private static class BasicStatistics
    {
        private final OptionalLong numFiles;
        private final OptionalLong numRows;
        private final OptionalLong rawDataSize;
        private final OptionalLong totalSize;

        public BasicStatistics(OptionalLong numFiles, OptionalLong numRows, OptionalLong rawDataSize, OptionalLong totalSize)
        {
            this.numFiles = requireNonNull(numFiles, "numFiles is null");
            this.numRows = requireNonNull(numRows, "numRows is null");
            this.rawDataSize = requireNonNull(rawDataSize, "rawDataSize is null");
            this.totalSize = requireNonNull(totalSize, "totalSize is null");
        }

        public OptionalLong getNumFiles()
        {
            return numFiles;
        }

        public OptionalLong getNumRows()
        {
            return numRows;
        }

        public OptionalLong getRawDataSize()
        {
            return rawDataSize;
        }

        public OptionalLong getTotalSize()
        {
            return totalSize;
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy