
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.trino.tests.product.hive;
import com.google.common.collect.ImmutableList;
import com.google.inject.Inject;
import io.trino.tempto.ProductTest;
import io.trino.tempto.hadoop.hdfs.HdfsClient;
import org.testng.SkipException;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.io.IOException;
import java.math.BigDecimal;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.util.ArrayList;
import java.util.List;
import static com.google.common.collect.ImmutableList.toImmutableList;
import static io.trino.tempto.assertions.QueryAssert.Row;
import static io.trino.tempto.assertions.QueryAssert.Row.row;
import static io.trino.tempto.assertions.QueryAssert.assertQueryFailure;
import static io.trino.testing.TestingNames.randomNameSuffix;
import static io.trino.tests.product.TestGroups.HIVE_SPARK;
import static io.trino.tests.product.TestGroups.HIVE_SPARK_NO_STATS_FALLBACK;
import static io.trino.tests.product.TestGroups.PROFILE_SPECIFIC_TESTS;
import static io.trino.tests.product.utils.QueryExecutors.onHive;
import static io.trino.tests.product.utils.QueryExecutors.onSpark;
import static io.trino.tests.product.utils.QueryExecutors.onTrino;
import static java.lang.String.format;
import static java.lang.String.join;
import static java.util.Collections.nCopies;
import static java.util.Locale.ENGLISH;
import static org.assertj.core.api.Assertions.assertThat;
public class TestHiveSparkCompatibility
extends ProductTest
{
// see spark-defaults.conf
private static final String TRINO_CATALOG = "hive";
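// HDFS client used to inspect the files that Spark and Hive write for the tables under test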
@Inject
private HdfsClient hdfsClient;
@Test(groups = {HIVE_SPARK, PROFILE_SPECIFIC_TESTS}, dataProvider = "testReadSparkCreatedTableDataProvider")
public void testReadSparkCreatedTable(String sparkTableFormat, String expectedTrinoTableFormat)
{
String sparkTableName = "spark_created_table_" + sparkTableFormat.replaceAll("[^a-zA-Z]", "").toLowerCase(ENGLISH) + "_" + randomNameSuffix();
String trinoTableName = format("%s.default.%s", TRINO_CATALOG, sparkTableName);
onSpark().executeQuery(
"CREATE TABLE default." + sparkTableName + "( " +
" a_boolean boolean, " +
" a_tinyint tinyint, " +
" a_smallint smallint, " +
" an_integer int, " +
" a_bigint bigint, " +
" a_real float, " +
" a_double double, " +
" a_short_decimal decimal(11, 4), " +
" a_long_decimal decimal(26, 7), " +
" a_string string, " +
// TODO " a_binary binary, " +
" a_date date, " +
" a_timestamp_seconds timestamp, " +
" a_timestamp_millis timestamp, " +
" a_timestamp_micros timestamp, " +
" a_timestamp_nanos timestamp, " +
// TODO interval
// TODO array
// TODO struct
// TODO map
" a_dummy string) " +
sparkTableFormat + " " +
// By default, Spark creates the table with "transactional"="true", but the table conforms to neither the Hive
// transactional format nor its file naming convention, so such a table cannot be read. As a workaround, force
// the table to be marked non-transactional.
"TBLPROPERTIES ('transactional'='false')");
// nulls
onSpark().executeQuery("INSERT INTO " + sparkTableName + " VALUES (" + join(",", nCopies(16, "NULL")) + ")");
// positive values
onSpark().executeQuery(
"INSERT INTO " + sparkTableName + " VALUES (" +
"true, " + // a_boolean
"127, " + // a_tinyint
"32767, " + // a_smallint
"1000000000, " + // an_integer
"1000000000000000, " + // a_bigint
"10000000.123, " + // a_real
"100000000000.123, " + // a_double
"CAST('1234567.8901' AS decimal(11, 4)), " + // a_short_decimal
"CAST('1234567890123456789.0123456' AS decimal(26, 7)), " + // a_short_decimal
"'some string', " + // a_string
"DATE '2005-09-10', " + // a_date
"TIMESTAMP '2005-09-10 13:00:00', " + // a_timestamp_seconds
"TIMESTAMP '2005-09-10 13:00:00.123', " + // a_timestamp_millis
"TIMESTAMP '2005-09-10 13:00:00.123456', " + // a_timestamp_micros
"TIMESTAMP '2005-09-10 13:00:00.123456789', " + // a_timestamp_nanos
"'dummy')");
// negative values
onSpark().executeQuery(
"INSERT INTO " + sparkTableName + " VALUES (" +
"false, " + // a_boolean
"-128, " + // a_tinyint
"-32768, " + // a_smallint
"-1000000012, " + // an_integer
"-1000000000000012, " + // a_bigint
"-10000000.123, " + // a_real
"-100000000000.123, " + // a_double
"CAST('-1234567.8901' AS decimal(11, 4)), " + // a_short_decimal
"CAST('-1234567890123456789.0123456' AS decimal(26, 7)), " + // a_short_decimal
"'', " + // a_string
"DATE '1965-09-10', " + // a_date
"TIMESTAMP '1965-09-10 13:00:00', " + // a_timestamp_seconds
"TIMESTAMP '1965-09-10 13:00:00.123', " + // a_timestamp_millis
"TIMESTAMP '1965-09-10 13:00:00.123456', " + // a_timestamp_micros
"TIMESTAMP '1965-09-10 13:00:00.123456789', " + // a_timestamp_nanos
"'dummy')");
List<Row> expected = List.of(
row(nCopies(16, null).toArray()),
row(
true, // a_boolean
(byte) 127, // a_tinyint
(short) 32767, // a_smallint
1000000000, // an_integer
1000000000000000L, // a_bigint
10000000.123F, // a_real
100000000000.123, // a_double
new BigDecimal("1234567.8901"), // a_short_decimal
new BigDecimal("1234567890123456789.0123456"), // a_long_decimal
"some string", // a_string
java.sql.Date.valueOf(LocalDate.of(2005, 9, 10)), // a_date
java.sql.Timestamp.valueOf(LocalDateTime.of(2005, 9, 10, 13, 0, 0)), // a_timestamp_seconds
java.sql.Timestamp.valueOf(LocalDateTime.of(2005, 9, 10, 13, 0, 0, 123_000_000)), // a_timestamp_millis
java.sql.Timestamp.valueOf(LocalDateTime.of(2005, 9, 10, 13, 0, 0, 123_456_000)), // a_timestamp_micros
java.sql.Timestamp.valueOf(LocalDateTime.of(2005, 9, 10, 13, 0, 0, 123_456_000)), // a_timestamp_nanos; note that Spark timestamp has microsecond precision
"dummy"),
row(
false, // a_boolean
(byte) -128, // a_tinyint
(short) -32768, // a_smallint
-1000000012, // an_integer
-1000000000000012L, // a_bigint
-10000000.123F, // a_real
-100000000000.123, // a_double
new BigDecimal("-1234567.8901"), // a_short_decimal
new BigDecimal("-1234567890123456789.0123456"), // a_long_decimal
"", // a_string
java.sql.Date.valueOf(LocalDate.of(1965, 9, 10)), // a_date
java.sql.Timestamp.valueOf(LocalDateTime.of(1965, 9, 10, 13, 0, 0)), // a_timestamp_seconds
java.sql.Timestamp.valueOf(LocalDateTime.of(1965, 9, 10, 13, 0, 0, 123_000_000)), // a_timestamp_millis
java.sql.Timestamp.valueOf(LocalDateTime.of(1965, 9, 10, 13, 0, 0, 123_456_000)), // a_timestamp_micros
java.sql.Timestamp.valueOf(LocalDateTime.of(1965, 9, 10, 13, 0, 0, 123_456_000)), // a_timestamp_nanos; note that Spark timestamp has microsecond precision
"dummy"));
assertThat(onSpark().executeQuery("SELECT * FROM " + sparkTableName)).containsOnly(expected);
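// hive.timestamp_precision defaults to milliseconds in Trino; raise it so the sub-millisecond digits stored by Spark survive the read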
onTrino().executeQuery("SET SESSION hive.timestamp_precision = 'NANOSECONDS'");
assertThat(onTrino().executeQuery("SELECT * FROM " + trinoTableName)).containsOnly(expected);
assertThat(onTrino().executeQuery("SHOW CREATE TABLE " + trinoTableName))
.containsOnly(row(format(
"CREATE TABLE %s (\n" +
" a_boolean boolean,\n" +
" a_tinyint tinyint,\n" +
" a_smallint smallint,\n" +
" an_integer integer,\n" +
" a_bigint bigint,\n" +
" a_real real,\n" +
" a_double double,\n" +
" a_short_decimal decimal(11, 4),\n" +
" a_long_decimal decimal(26, 7),\n" +
" a_string varchar,\n" +
" a_date date,\n" +
" a_timestamp_seconds timestamp(9),\n" +
" a_timestamp_millis timestamp(9),\n" +
" a_timestamp_micros timestamp(9),\n" +
" a_timestamp_nanos timestamp(9),\n" +
" a_dummy varchar\n" +
")\n" +
"WITH (\n" +
" format = '%s'\n" +
")",
trinoTableName,
expectedTrinoTableFormat)));
onSpark().executeQuery("DROP TABLE " + sparkTableName);
}
@Test(groups = {HIVE_SPARK, PROFILE_SPECIFIC_TESTS}, dataProvider = "sparkParquetTimestampFormats")
public void testSparkParquetTimestampCompatibility(String sparkTimestampFormat, String sparkTimestamp, String[] expectedValues)
{
String sparkTableName = "test_spark_parquet_timestamp_compatibility_" + sparkTimestampFormat.toLowerCase(ENGLISH) + "_" + randomNameSuffix();
String trinoTableName = format("%s.default.%s", TRINO_CATALOG, sparkTableName);
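// spark.sql.parquet.outputTimestampType selects the physical Parquet timestamp representation Spark writes (INT96, TIMESTAMP_MICROS or TIMESTAMP_MILLIS)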
onSpark().executeQuery("SET spark.sql.parquet.outputTimestampType = " + sparkTimestampFormat);
onSpark().executeQuery(
"CREATE TABLE default." + sparkTableName + "(a_timestamp timestamp) " +
"USING PARQUET " +
"TBLPROPERTIES ('transactional'='false')");
onSpark().executeQuery("INSERT INTO " + sparkTableName + " VALUES (TIMESTAMP '" + sparkTimestamp + "')");
for (int i = 0; i < HIVE_TIMESTAMP_PRECISIONS.length; i++) {
String trinoTimestampPrecision = HIVE_TIMESTAMP_PRECISIONS[i];
String expected = expectedValues[i];
onTrino().executeQuery("SET SESSION hive.timestamp_precision = '" + trinoTimestampPrecision + "'");
assertThat(onTrino().executeQuery("SELECT * FROM " + trinoTableName)).containsOnly(row(java.sql.Timestamp.valueOf(expected)));
assertThat(onTrino().executeQuery("SELECT count(*) FROM " + trinoTableName + " WHERE a_timestamp = TIMESTAMP '" + expected + "'")).containsOnly(row(1));
assertThat(onTrino().executeQuery("SELECT count(*) FROM " + trinoTableName + " WHERE a_timestamp != TIMESTAMP '" + expected + "'")).containsOnly(row(0));
}
onSpark().executeQuery("DROP TABLE " + sparkTableName);
}
@Test(groups = {HIVE_SPARK, PROFILE_SPECIFIC_TESTS})
public void testSparkClusteringCaseSensitiveCompatibility()
{
String sparkTableNameWithClusteringDifferentCase = "test_spark_clustering_case_sensitive_" + randomNameSuffix();
onSpark().executeQuery(
String.format("CREATE TABLE %s (row_id int, `segment_id` int, value long) ", sparkTableNameWithClusteringDifferentCase) +
"USING PARQUET " +
"PARTITIONED BY (`part` string) " +
"CLUSTERED BY (`SEGMENT_ID`) " +
" SORTED BY (`SEGMENT_ID`) " +
" INTO 10 BUCKETS");
onSpark().executeQuery(format("INSERT INTO %s ", sparkTableNameWithClusteringDifferentCase) +
"VALUES " +
" (1, 1, 100, 'part1')," +
" (100, 1, 123, 'part2')," +
" (101, 2, 202, 'part2')");
// Ensure that Trino can successfully read from the Spark bucketed table even though the clustering
// column `SEGMENT_ID` is in a different case than the data column `segment_id`
assertThat(onTrino().executeQuery(format("SELECT * FROM %s.default.%s", TRINO_CATALOG, sparkTableNameWithClusteringDifferentCase)))
.containsOnly(List.of(
row(1, 1, 100, "part1"),
row(100, 1, 123, "part2"),
row(101, 2, 202, "part2")));
}
@Test(groups = {HIVE_SPARK, PROFILE_SPECIFIC_TESTS})
public void testSparkParquetBloomFilterCompatibility()
{
String sparkTableNameWithBloomFilter = "test_spark_parquet_bloom_filter_compatibility_enabled_" + randomNameSuffix();
String sparkTableNameNoBloomFilter = "test_spark_parquet_bloom_filter_compatibility_disabled_" + randomNameSuffix();
// Disable dictionary encoding so that bloom filter statistics, rather than dictionary filtering, are exercised by the predicates below
onSpark().executeQuery(
String.format("CREATE TABLE %s (testInteger INT, testLong Long, testString STRING, testDouble DOUBLE, testFloat FLOAT) ", sparkTableNameWithBloomFilter) +
"USING PARQUET OPTIONS (" +
"'parquet.bloom.filter.enabled'='true'," +
"'parquet.enable.dictionary'='false'" +
")");
onSpark().executeQuery(
String.format("CREATE TABLE %s (testInteger INT, testLong Long, testString STRING, testDouble DOUBLE, testFloat FLOAT) ", sparkTableNameNoBloomFilter) +
"USING PARQUET OPTIONS (" +
"'parquet.bloom.filter.enabled'='false'," +
"'parquet.enable.dictionary'='false'" +
")");
String[] sparkTables = new String[] {sparkTableNameWithBloomFilter, sparkTableNameNoBloomFilter};
String[] trinoTables = new String[] {
format("%s.default.%s", TRINO_CATALOG, sparkTableNameWithBloomFilter),
format("%s.default.%s", TRINO_CATALOG, sparkTableNameNoBloomFilter)};
// Control the number of Spark output files via a hint: https://issues.apache.org/jira/browse/SPARK-24940
// The inserted values include 'aaaaaaaaaaa' and 'zzzzzzzzzzz' so that file-level min/max statistics alone cannot satisfy the predicates
for (String sparkTable : sparkTables) {
onSpark().executeQuery(format(
"INSERT INTO %s " +
"SELECT /*+ REPARTITION(1) */ testInteger, testLong, testString, testDouble, testFloat FROM VALUES " +
" (-999999, -999999, 'aaaaaaaaaaa', -9999999999.99D, -9999999.9999F)" +
", (3, 30, 'fdsvxxbv33cb', 97662.2D, 98862.2F)" +
", (5324, 2466, 'refgfdfrexx', 8796.1D, -65496.1F)" +
", (999999, 9999999999999, 'zzzzzzzzzzz', 9999999999.99D, -9999999.9999F)" +
", (9444, 4132455, 'ff34322vxff', 32137758.7892D, 9978.129887F) AS DATA(testInteger, testLong, testString, testDouble, testFloat)",
sparkTable));
}
// Explicitly enable bloom filter usage
onTrino().executeQuery("set session hive.parquet_use_bloom_filter=true");
assertTrinoBloomFilterTableSelectResult(trinoTables);
// Explicitly disable bloom filter usage; the results must be identical
onTrino().executeQuery("set session hive.parquet_use_bloom_filter=false");
assertTrinoBloomFilterTableSelectResult(trinoTables);
for (String sparkTable : sparkTables) {
onSpark().executeQuery("DROP TABLE " + sparkTable);
}
}
@Test(groups = {HIVE_SPARK, PROFILE_SPECIFIC_TESTS})
public void testTrinoSparkParquetBloomFilterCompatibility()
{
String trinoTableNameWithBloomFilter = "test_trino_spark_parquet_bloom_filter_compatibility_enabled_" + randomNameSuffix();
String trinoTableNameNoBloomFilter = "test_trino_spark_parquet_bloom_filter_compatibility_disabled_" + randomNameSuffix();
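// The parquet_bloom_filter_columns table property makes Trino write Parquet bloom filters for the listed columns; the second table is created without it as a control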
onTrino().executeQuery(
String.format("CREATE TABLE %s.default.%s (testInteger INTEGER, testLong BIGINT, testString VARCHAR, testDouble DOUBLE, testFloat REAL) ", TRINO_CATALOG, trinoTableNameWithBloomFilter) +
"WITH (" +
"format = 'PARQUET'," +
"parquet_bloom_filter_columns = ARRAY['testInteger', 'testLong', 'testString', 'testDouble', 'testFloat']" +
")");
onTrino().executeQuery(
String.format("CREATE TABLE %s.default.%s (testInteger INTEGER, testLong BIGINT, testString VARCHAR, testDouble DOUBLE, testFloat REAL) WITH (FORMAT = 'PARQUET')", TRINO_CATALOG, trioTableNameNoBloomFilter));
String[] sparkTables = new String[] {trinoTableNameWithBloomFilter, trinoTableNameNoBloomFilter};
String[] trinoTables = new String[] {
format("%s.default.%s", TRINO_CATALOG, trinoTableNameWithBloomFilter),
format("%s.default.%s", TRINO_CATALOG, trioTableNameNoBloomFilter)};
for (String trinoTable : trinoTables) {
onTrino().executeQuery(format(
"INSERT INTO %s " +
"SELECT testInteger, testLong, testString, testDouble, testFloat FROM (VALUES " +
" (-999999, -999999, 'aaaaaaaaaaa', DOUBLE '-9999999999.99', REAL '-9999999.9999')" +
", (3, 30, 'fdsvxxbv33cb', DOUBLE '97662.2', REAL '98862.2')" +
", (5324, 2466, 'refgfdfrexx', DOUBLE '8796.1', REAL '-65496.1')" +
", (999999, 9999999999999, 'zzzzzzzzzzz', DOUBLE '9999999999.99', REAL '-9999999.9999')" +
", (9444, 4132455, 'ff34322vxff', DOUBLE '32137758.7892', REAL '9978.129887')) AS DATA(testInteger, testLong, testString, testDouble, testFloat)",
trinoTable));
}
assertSparkBloomFilterTableSelectResult(sparkTables);
for (String trinoTable : trinoTables) {
onTrino().executeQuery("DROP TABLE " + trinoTable);
}
}
private static void assertTrinoBloomFilterTableSelectResult(String[] trinoTables)
{
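// Each IN list contains exactly one value that is present in the table, so the count must be 1 whether or not bloom filters are consulted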
for (String trinoTable : trinoTables) {
assertThat(onTrino().executeQuery("SELECT COUNT(*) FROM " + trinoTable + " WHERE testInteger IN (9444, -88777, 6711111)")).containsOnly(List.of(row(1)));
assertThat(onTrino().executeQuery("SELECT COUNT(*) FROM " + trinoTable + " WHERE testLong IN (4132455, 321324, 312321321322)")).containsOnly(List.of(row(1)));
assertThat(onTrino().executeQuery("SELECT COUNT(*) FROM " + trinoTable + " WHERE testString IN ('fdsvxxbv33cb', 'cxxx322', 'cxxx323')")).containsOnly(List.of(row(1)));
assertThat(onTrino().executeQuery("SELECT COUNT(*) FROM " + trinoTable + " WHERE testDouble IN (DOUBLE '97662.2', DOUBLE '-97221.2', DOUBLE '-88777.22233')")).containsOnly(List.of(row(1)));
assertThat(onTrino().executeQuery("SELECT COUNT(*) FROM " + trinoTable + " WHERE testFloat IN (REAL '-65496.1', REAL '98211862.2', REAL '6761111555.1222')")).containsOnly(List.of(row(1)));
}
}
private static void assertSparkBloomFilterTableSelectResult(String[] sparkTables)
{
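// Same single-match predicates as the Trino-side assertion, executed through Spark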
for (String sparkTable : sparkTables) {
assertThat(onSpark().executeQuery("SELECT COUNT(*) FROM " + sparkTable + " WHERE testInteger IN (9444, -88777, 6711111)")).containsOnly(List.of(row(1)));
assertThat(onSpark().executeQuery("SELECT COUNT(*) FROM " + sparkTable + " WHERE testLong IN (4132455, 321324, 312321321322)")).containsOnly(List.of(row(1)));
assertThat(onSpark().executeQuery("SELECT COUNT(*) FROM " + sparkTable + " WHERE testString IN ('fdsvxxbv33cb', 'cxxx322', 'cxxx323')")).containsOnly(List.of(row(1)));
assertThat(onSpark().executeQuery("SELECT COUNT(*) FROM " + sparkTable + " WHERE testDouble IN (97662.2D, -97221.2D, -88777.22233D)")).containsOnly(List.of(row(1)));
assertThat(onSpark().executeQuery("SELECT COUNT(*) FROM " + sparkTable + " WHERE testFloat IN (-65496.1F, 98211862.2F, 6761111555.1222F)")).containsOnly(List.of(row(1)));
}
}
@Test(groups = {HIVE_SPARK, PROFILE_SPECIFIC_TESTS})
public void testInsertFailsOnBucketedTableCreatedBySpark()
{
String hiveTableName = "spark_insert_bucketed_table_" + randomNameSuffix();
onSpark().executeQuery(
"CREATE TABLE default." + hiveTableName + "(a_key integer, a_value integer) " +
"USING PARQUET " +
"CLUSTERED BY (a_key) INTO 3 BUCKETS");
assertQueryFailure(() -> onTrino().executeQuery("INSERT INTO default." + hiveTableName + " VALUES (1, 100)"))
.hasMessageContaining("Inserting into Spark bucketed tables is not supported");
onSpark().executeQuery("DROP TABLE " + hiveTableName);
}
@Test(groups = {HIVE_SPARK, PROFILE_SPECIFIC_TESTS})
public void testUpdateFailsOnBucketedTableCreatedBySpark()
{
String hiveTableName = "spark_update_bucketed_table_" + randomNameSuffix();
onSpark().executeQuery(
"CREATE TABLE default." + hiveTableName + "(a_key integer, a_value integer) " +
"USING ORC " +
"CLUSTERED BY (a_key) INTO 3 BUCKETS");
assertQueryFailure(() -> onTrino().executeQuery("UPDATE default." + hiveTableName + " SET a_value = 100 WHERE a_key = 1"))
.hasMessageContaining("Merging into Spark bucketed tables is not supported");
onSpark().executeQuery("DROP TABLE " + hiveTableName);
}
@Test(groups = {HIVE_SPARK, PROFILE_SPECIFIC_TESTS})
public void testDeleteFailsOnBucketedTableCreatedBySpark()
{
String hiveTableName = "spark_delete_bucketed_table_" + randomNameSuffix();
onSpark().executeQuery(
"CREATE TABLE default." + hiveTableName + "(a_key integer, a_value integer) " +
"USING ORC " +
"CLUSTERED BY (a_key) INTO 3 BUCKETS");
assertQueryFailure(() -> onTrino().executeQuery("DELETE FROM default." + hiveTableName + " WHERE a_key = 1"))
.hasMessageContaining("Merging into Spark bucketed tables is not supported");
onSpark().executeQuery("DROP TABLE " + hiveTableName);
}
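// Values accepted by the hive.timestamp_precision session property, ordered from lowest to highest precision (matches the ordering used by sparkParquetTimestampFormats)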
private static final String[] HIVE_TIMESTAMP_PRECISIONS = new String[] {"MILLISECONDS", "MICROSECONDS", "NANOSECONDS"};
@DataProvider
public static Object[][] sparkParquetTimestampFormats()
{
String millisTimestamp = "2005-09-10 13:00:00.123";
String microsTimestamp = "2005-09-10 13:00:00.123456";
String nanosTimestamp = "2005-09-10 13:00:00.123456789";
// Ordering of expected values matches the ordering in HIVE_TIMESTAMP_PRECISIONS
return new Object[][] {
{"TIMESTAMP_MILLIS", millisTimestamp, new String[] {millisTimestamp, millisTimestamp, millisTimestamp}},
{"TIMESTAMP_MICROS", microsTimestamp, new String[] {millisTimestamp, microsTimestamp, microsTimestamp}},
// note that Spark timestamp has microsecond precision
{"INT96", nanosTimestamp, new String[] {millisTimestamp, microsTimestamp, microsTimestamp}},
};
}
@DataProvider
public static Object[][] testReadSparkCreatedTableDataProvider()
{
return new Object[][] {
{"USING ORC", "ORC"},
{"USING PARQUET", "PARQUET"},
// TODO add Avro
};
}
@Test(groups = {HIVE_SPARK, PROFILE_SPECIFIC_TESTS})
public void testReadTrinoCreatedOrcTable()
{
testReadTrinoCreatedTable("using_orc", "ORC");
}
@Test(groups = {HIVE_SPARK, PROFILE_SPECIFIC_TESTS})
public void testReadTrinoCreatedParquetTable()
{
testReadTrinoCreatedTable("using_parquet", "PARQUET");
}
private void testReadTrinoCreatedTable(String tableName, String tableFormat)
{
String sparkTableName = "trino_created_table_" + tableName + "_" + randomNameSuffix();
String trinoTableName = format("%s.default.%s", TRINO_CATALOG, sparkTableName);
// Spark timestamps are in microsecond precision
onTrino().executeQuery("SET SESSION hive.timestamp_precision = 'MICROSECONDS'");
onTrino().executeQuery(format(
"CREATE TABLE %s ( " +
" a_boolean boolean, " +
" a_tinyint tinyint, " +
" a_smallint smallint, " +
" an_integer integer, " +
" a_bigint bigint, " +
" a_real real, " +
" a_double double, " +
" a_short_decimal decimal(11, 4), " +
" a_long_decimal decimal(26, 7), " +
" a_string varchar, " +
// TODO binary
" a_date date, " +
" a_timestamp_seconds timestamp(6), " +
" a_timestamp_millis timestamp(6), " +
" a_timestamp_micros timestamp(6), " +
// TODO interval
// TODO array
// TODO struct
// TODO map
" a_dummy varchar " +
") " +
"WITH ( " +
" format = '%s' " +
")",
trinoTableName,
tableFormat));
// nulls
onTrino().executeQuery("INSERT INTO " + trinoTableName + " VALUES (" + join(",", nCopies(15, "NULL")) + ")");
// positive values
onTrino().executeQuery(
"INSERT INTO " + trinoTableName + " VALUES (" +
"true, " + // a_boolean
"127, " + // a_tinyint
"32767, " + // a_smallint
"1000000000, " + // an_integer
"1000000000000000, " + // a_bigint
"10000000.123, " + // a_real
"100000000000.123, " + // a_double
"CAST('1234567.8901' AS decimal(11, 4)), " + // a_short_decimal
"CAST('1234567890123456789.0123456' AS decimal(26, 7)), " + // a_short_decimal
"'some string', " + // a_string
"DATE '2005-09-10', " + // a_date
"TIMESTAMP '2005-09-10 13:00:00', " + // a_timestamp_seconds
"TIMESTAMP '2005-09-10 13:00:00.123', " + // a_timestamp_millis
"TIMESTAMP '2005-09-10 13:00:00.123456', " + // a_timestamp_micros
"'dummy')");
// negative values
onTrino().executeQuery(
"INSERT INTO " + trinoTableName + " VALUES (" +
"false, " + // a_boolean
"-128, " + // a_tinyint
"-32768, " + // a_smallint
"-1000000012, " + // an_integer
"-1000000000000012, " + // a_bigint
"-10000000.123, " + // a_real
"-100000000000.123, " + // a_double
"CAST('-1234567.8901' AS decimal(11, 4)), " + // a_short_decimal
"CAST('-1234567890123456789.0123456' AS decimal(26, 7)), " + // a_short_decimal
"'', " + // a_string
"DATE '1965-09-10', " + // a_date
"TIMESTAMP '1965-09-10 13:00:00', " + // a_timestamp_seconds
"TIMESTAMP '1965-09-10 13:00:00.123', " + // a_timestamp_millis
"TIMESTAMP '1965-09-10 13:00:00.123456', " + // a_timestamp_micros
"'dummy')");
List<Row> expected = List.of(
row(nCopies(15, null).toArray()),
row(
true, // a_boolean
(byte) 127, // a_tinyint
(short) 32767, // a_smallint
1000000000, // an_integer
1000000000000000L, // a_bigint
10000000.123F, // a_real
100000000000.123, // a_double
new BigDecimal("1234567.8901"), // a_short_decimal
new BigDecimal("1234567890123456789.0123456"), // a_long_decimal
"some string", // a_string
java.sql.Date.valueOf(LocalDate.of(2005, 9, 10)), // a_date
java.sql.Timestamp.valueOf(LocalDateTime.of(2005, 9, 10, 13, 0, 0)), // a_timestamp_seconds
java.sql.Timestamp.valueOf(LocalDateTime.of(2005, 9, 10, 13, 0, 0, 123_000_000)), // a_timestamp_millis
java.sql.Timestamp.valueOf(LocalDateTime.of(2005, 9, 10, 13, 0, 0, 123_456_000)), // a_timestamp_micros
"dummy"),
row(
false, // a_boolean
(byte) -128, // a_tinyint
(short) -32768, // a_smallint
-1000000012, // an_integer
-1000000000000012L, // a_bigint
-10000000.123F, // a_real
-100000000000.123, // a_double
new BigDecimal("-1234567.8901"), // a_short_decimal
new BigDecimal("-1234567890123456789.0123456"), // a_long_decimal
"", // a_string
java.sql.Date.valueOf(LocalDate.of(1965, 9, 10)), // a_date
java.sql.Timestamp.valueOf(LocalDateTime.of(1965, 9, 10, 13, 0, 0)), // a_timestamp_seconds
java.sql.Timestamp.valueOf(LocalDateTime.of(1965, 9, 10, 13, 0, 0, 123_000_000)), // a_timestamp_millis
java.sql.Timestamp.valueOf(LocalDateTime.of(1965, 9, 10, 13, 0, 0, 123_456_000)), // a_timestamp_micros
"dummy"));
assertThat(onSpark().executeQuery("SELECT * FROM " + sparkTableName)).containsOnly(expected);
assertThat(onTrino().executeQuery("SELECT * FROM " + trinoTableName)).containsOnly(expected);
onTrino().executeQuery("DROP TABLE " + trinoTableName);
}
@Test(groups = {HIVE_SPARK, PROFILE_SPECIFIC_TESTS})
public void testReadSparkDateAndTimePartitionName()
{
String sparkTableName = "test_trino_reading_spark_date_and_time_type_partitioned_" + randomNameSuffix();
String trinoTableName = format("%s.default.%s", TRINO_CATALOG, sparkTableName);
onSpark().executeQuery(format("CREATE TABLE default.%s (value integer) PARTITIONED BY (dt date)", sparkTableName));
// Spark allows creating partition values that include a time component
// Hive refuses to create such partitions, but can read them
onSpark().executeQuery(format("INSERT INTO %s PARTITION(dt='2022-04-13 00:00:00.000000000') VALUES (1)", sparkTableName));
onSpark().executeQuery(format("INSERT INTO %s PARTITION(dt='2022-04-13 00:00:00') VALUES (2)", sparkTableName));
onSpark().executeQuery(format("INSERT INTO %s PARTITION(dt='2022-04-13 00:00') VALUES (3)", sparkTableName));
onSpark().executeQuery(format("INSERT INTO %s PARTITION(dt='12345-06-07') VALUES (4)", sparkTableName));
// This INSERT statement was accepted by older Spark versions
assertQueryFailure(() -> onSpark().executeQuery(format("INSERT INTO %s PARTITION(dt='123-04-05') VALUES (5)", sparkTableName)))
.hasMessageContaining("cannot be cast to \"DATE\" because it is malformed");
onSpark().executeQuery(format("INSERT INTO %s PARTITION(dt='-0001-01-01') VALUES (6)", sparkTableName));
assertThat(onTrino().executeQuery("SELECT value, \"$partition\" FROM " + trinoTableName))
.containsOnly(List.of(
row(1, "dt=2022-04-13"),
row(2, "dt=2022-04-13"),
row(3, "dt=2022-04-13"),
row(4, "dt=+12345-06-07"),
row(6, "dt=-0001-01-01")));
// Use date_format function to avoid exception due to java.sql.Date.valueOf() with 5 digit year
assertThat(onSpark().executeQuery("SELECT value, date_format(dt, 'yyyy-MM-dd') FROM " + sparkTableName))
.containsOnly(List.of(
row(1, "2022-04-13"),
row(2, "2022-04-13"),
row(3, "2022-04-13"),
row(4, "+12345-06-07"),
row(6, "-0001-01-01")));
// Use date_format function to avoid exception due to java.sql.Date.valueOf() with 5 digit year
assertThat(onHive().executeQuery("SELECT value, date_format(dt, 'yyyy-MM-dd') FROM " + sparkTableName))
.containsOnly(List.of(
row(1, "2022-04-13"),
row(2, "2022-04-13"),
row(3, "2022-04-13"),
row(4, null),
row(6, "0002-01-03")));
// Cast to varchar so that we can compare with Spark & Hive easily
assertThat(onTrino().executeQuery("SELECT value, CAST(dt AS VARCHAR) FROM " + trinoTableName))
.containsOnly(List.of(
row(1, "2022-04-13"),
row(2, "2022-04-13"),
row(3, "2022-04-13"),
row(4, "12345-06-07"),
row(6, "-0001-01-01")));
onTrino().executeQuery("DROP TABLE " + trinoTableName);
}
@Test(groups = {HIVE_SPARK, PROFILE_SPECIFIC_TESTS})
public void testTextInputFormatWithParquetHiveSerDe()
throws IOException
{
String tableName = "test_text_input_format_with_parquet_hive_ser_de" + randomNameSuffix();
onHive().executeQuery("" +
"CREATE EXTERNAL TABLE " + tableName +
"(col INT) " +
"ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' " +
"STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' " +
"OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' " +
"LOCATION '/tmp/" + tableName + "'");
onSpark().executeQuery("INSERT INTO " + tableName + " VALUES(1)");
assertThat(onSpark().executeQuery("SELECT * FROM " + tableName)).containsOnly(row(1));
assertThat(onTrino().executeQuery("SELECT * FROM " + tableName)).containsOnly(row(1));
List<String> files = hdfsClient.listDirectory("/tmp/" + tableName + "/");
assertThat(files).hasSize(2);
assertThat(files.stream().filter(name -> !name.contains("SUCCESS")).collect(toImmutableList()).get(0)).endsWith("parquet");
List