// io.trino.plugin.deltalake.transactionlog.DeltaLakeSchemaSupport
// (retrieved from the Maven Central repository)

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.trino.plugin.deltalake.transactionlog;

import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Enums;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Sets;
import com.google.common.collect.Streams;
import io.airlift.json.ObjectMapperProvider;
import io.trino.plugin.deltalake.DeltaLakeColumnHandle;
import io.trino.plugin.deltalake.DeltaLakeColumnMetadata;
import io.trino.plugin.deltalake.transactionlog.statistics.DeltaLakeFileStatistics;
import io.trino.plugin.hive.util.HiveUtil;
import io.trino.spi.Location;
import io.trino.spi.TrinoException;
import io.trino.spi.connector.ColumnMetadata;
import io.trino.spi.type.ArrayType;
import io.trino.spi.type.DecimalType;
import io.trino.spi.type.MapType;
import io.trino.spi.type.RowType;
import io.trino.spi.type.TimestampType;
import io.trino.spi.type.TimestampWithTimeZoneType;
import io.trino.spi.type.Type;
import io.trino.spi.type.TypeManager;
import io.trino.spi.type.TypeNotFoundException;
import io.trino.spi.type.TypeSignature;
import io.trino.spi.type.TypeSignatureParameter;
import io.trino.spi.type.VarcharType;
import jakarta.annotation.Nullable;

import java.util.AbstractMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.OptionalInt;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Function;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Strings.isNullOrEmpty;
import static com.google.common.base.Verify.verify;
import static com.google.common.collect.ImmutableList.toImmutableList;
import static com.google.common.collect.ImmutableMap.toImmutableMap;
import static com.google.common.collect.Streams.stream;
import static com.google.common.primitives.Booleans.countTrue;
import static io.trino.plugin.deltalake.DeltaLakeColumnType.PARTITION_KEY;
import static io.trino.plugin.deltalake.DeltaLakeErrorCode.DELTA_LAKE_INVALID_SCHEMA;
import static io.trino.plugin.deltalake.transactionlog.MetadataEntry.DELTA_CHANGE_DATA_FEED_ENABLED_PROPERTY;
import static io.trino.spi.StandardErrorCode.NOT_SUPPORTED;
import static io.trino.spi.type.BigintType.BIGINT;
import static io.trino.spi.type.BooleanType.BOOLEAN;
import static io.trino.spi.type.DateType.DATE;
import static io.trino.spi.type.DoubleType.DOUBLE;
import static io.trino.spi.type.IntegerType.INTEGER;
import static io.trino.spi.type.RealType.REAL;
import static io.trino.spi.type.SmallintType.SMALLINT;
import static io.trino.spi.type.TimestampType.TIMESTAMP_MICROS;
import static io.trino.spi.type.TimestampWithTimeZoneType.TIMESTAMP_TZ_MILLIS;
import static io.trino.spi.type.TinyintType.TINYINT;
import static io.trino.spi.type.VarbinaryType.VARBINARY;
import static io.trino.spi.type.VarcharType.VARCHAR;
import static java.lang.Boolean.parseBoolean;
import static java.lang.String.format;
import static java.util.Locale.ENGLISH;
import static java.util.Objects.requireNonNull;

public final class DeltaLakeSchemaSupport
{
    private DeltaLakeSchemaSupport() {}

    public static final String APPEND_ONLY_CONFIGURATION_KEY = "delta.appendOnly";
    public static final String COLUMN_MAPPING_MODE_CONFIGURATION_KEY = "delta.columnMapping.mode";
    public static final String COLUMN_MAPPING_PHYSICAL_NAME_CONFIGURATION_KEY = "delta.columnMapping.physicalName";
    public static final String MAX_COLUMN_ID_CONFIGURATION_KEY = "delta.columnMapping.maxColumnId";
    private static final String DELETION_VECTORS_CONFIGURATION_KEY = "delta.enableDeletionVectors";

    // https://github.com/delta-io/delta/blob/master/PROTOCOL.md#valid-feature-names-in-table-features
    private static final String APPEND_ONLY_FEATURE_NAME = "appendOnly";
    private static final String CHANGE_DATA_FEED_FEATURE_NAME = "changeDataFeed";
    private static final String CHECK_CONSTRAINTS_FEATURE_NAME = "checkConstraints";
    private static final String COLUMN_MAPPING_FEATURE_NAME = "columnMapping";
    private static final String DELETION_VECTORS_FEATURE_NAME = "deletionVectors";
    private static final String IDENTITY_COLUMNS_FEATURE_NAME = "identityColumns";
    private static final String INVARIANTS_FEATURE_NAME = "invariants";
    public static final String TIMESTAMP_NTZ_FEATURE_NAME = "timestampNtz";

    private static final Set SUPPORTED_READER_FEATURES = ImmutableSet.builder()
            .add(COLUMN_MAPPING_FEATURE_NAME)
            .add(TIMESTAMP_NTZ_FEATURE_NAME)
            .add(DELETION_VECTORS_FEATURE_NAME)
            .build();
    private static final Set SUPPORTED_WRITER_FEATURES = ImmutableSet.builder()
            .add(APPEND_ONLY_FEATURE_NAME)
            .add(INVARIANTS_FEATURE_NAME)
            .add(CHECK_CONSTRAINTS_FEATURE_NAME)
            .add(CHANGE_DATA_FEED_FEATURE_NAME)
            .add(COLUMN_MAPPING_FEATURE_NAME)
            .add(TIMESTAMP_NTZ_FEATURE_NAME)
            .build();

    public enum ColumnMappingMode
    {
        ID,
        NAME,
        NONE,
        UNKNOWN,
        /**/;
    }

    // only non-parametrized types are stored here
    private static final Map PRIMITIVE_TYPE_MAPPING = ImmutableMap.builder()
            .put(BIGINT, "long")
            .put(INTEGER, "integer")
            .put(SMALLINT, "short")
            .put(TINYINT, "byte")
            .put(REAL, "float")
            .put(DOUBLE, "double")
            .put(BOOLEAN, "boolean")
            .put(VARBINARY, "binary")
            .put(DATE, "date")
            .buildOrThrow();

    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapperProvider().get();

    public static boolean isAppendOnly(MetadataEntry metadataEntry, ProtocolEntry protocolEntry)
    {
        if (protocolEntry.supportsWriterFeatures() && !protocolEntry.writerFeaturesContains(APPEND_ONLY_FEATURE_NAME)) {
            return false;
        }
        return parseBoolean(metadataEntry.getConfiguration().getOrDefault(APPEND_ONLY_CONFIGURATION_KEY, "false"));
    }

    public static boolean isDeletionVectorEnabled(MetadataEntry metadataEntry, ProtocolEntry protocolEntry)
    {
        if (protocolEntry.supportsWriterFeatures() && !protocolEntry.writerFeaturesContains(DELETION_VECTORS_FEATURE_NAME)) {
            return false;
        }
        return parseBoolean(metadataEntry.getConfiguration().get(DELETION_VECTORS_CONFIGURATION_KEY));
    }

    public static ColumnMappingMode getColumnMappingMode(MetadataEntry metadata, ProtocolEntry protocolEntry)
    {
        if (protocolEntry.supportsReaderFeatures() || protocolEntry.supportsWriterFeatures()) {
            boolean supportsColumnMappingReader = protocolEntry.readerFeaturesContains(COLUMN_MAPPING_FEATURE_NAME);
            boolean supportsColumnMappingWriter = protocolEntry.writerFeaturesContains(COLUMN_MAPPING_FEATURE_NAME);
            int columnMappingEnabled = countTrue(supportsColumnMappingReader, supportsColumnMappingWriter);
            checkArgument(
                    columnMappingEnabled == 0 || columnMappingEnabled == 2,
                    "Both reader and writer features should must the same value for 'columnMapping'. reader: %s, writer: %s", supportsColumnMappingReader, supportsColumnMappingWriter);
            if (columnMappingEnabled == 0) {
                return ColumnMappingMode.NONE;
            }
        }
        String columnMappingMode = metadata.getConfiguration().getOrDefault(COLUMN_MAPPING_MODE_CONFIGURATION_KEY, "none");
        return Enums.getIfPresent(ColumnMappingMode.class, columnMappingMode.toUpperCase(ENGLISH)).or(ColumnMappingMode.UNKNOWN);
    }

    public static int getMaxColumnId(MetadataEntry metadata)
    {
        String maxColumnId = metadata.getConfiguration().get(MAX_COLUMN_ID_CONFIGURATION_KEY);
        requireNonNull(maxColumnId, MAX_COLUMN_ID_CONFIGURATION_KEY + " metadata configuration property not found");
        return Integer.parseInt(maxColumnId);
    }

    public static List extractPartitionColumns(MetadataEntry metadataEntry, ProtocolEntry protocolEntry, TypeManager typeManager)
    {
        return extractPartitionColumns(extractSchema(metadataEntry, protocolEntry, typeManager), metadataEntry.getOriginalPartitionColumns());
    }

    public static List extractPartitionColumns(List schema, List originalPartitionColumns)
    {
        if (originalPartitionColumns.isEmpty()) {
            return ImmutableList.of();
        }
        return schema.stream()
                .filter(entry -> originalPartitionColumns.contains(entry.getName()))
                .map(entry -> new DeltaLakeColumnHandle(entry.getName(), entry.getType(), OptionalInt.empty(), entry.getPhysicalName(), entry.getPhysicalColumnType(), PARTITION_KEY, Optional.empty()))
                .collect(toImmutableList());
    }

    public static String serializeSchemaAsJson(
            List columnNames,
            Map columnTypes,
            Map columnComments,
            Map columnNullability,
            Map> columnMetadata)
    {
        try {
            return OBJECT_MAPPER.writeValueAsString(serializeStructType(columnNames, columnTypes, columnComments, columnNullability, columnMetadata));
        }
        catch (JsonProcessingException e) {
            throw new TrinoException(DELTA_LAKE_INVALID_SCHEMA, getLocation(e), "Failed to encode Delta Lake schema", e);
        }
    }

    private static Map serializeStructType(
            List columnNames,
            Map columnTypes,
            Map columnComments,
            Map columnNullability,
            Map> columnMetadata)
    {
        // https://github.com/delta-io/delta/blob/master/PROTOCOL.md#struct-type
        ImmutableMap.Builder schema = ImmutableMap.builder();

        schema.put("type", "struct");
        schema.put("fields", columnNames.stream()
                .map(columnName -> serializeStructField(
                        columnName,
                        columnTypes.get(columnName),
                        columnComments.get(columnName),
                        columnNullability.get(columnName),
                        columnMetadata.get(columnName)))
                .collect(toImmutableList()));

        return schema.buildOrThrow();
    }

    private static Map serializeStructField(String name, Object type, @Nullable String comment, @Nullable Boolean nullable, @Nullable Map metadata)
    {
        // https://github.com/delta-io/delta/blob/master/PROTOCOL.md#struct-field
        ImmutableMap.Builder fieldContents = ImmutableMap.builder();

        fieldContents.put("name", name);
        fieldContents.put("type", type);
        fieldContents.put("nullable", nullable != null ? nullable : true);

        ImmutableMap.Builder columnMetadata = ImmutableMap.builder();
        if (comment != null) {
            columnMetadata.put("comment", comment);
        }
        if (metadata != null) {
            metadata.entrySet().stream()
                    .filter(entry -> !entry.getKey().equals("comment"))
                    .forEach(entry -> columnMetadata.put(entry.getKey(), entry.getValue()));
        }
        fieldContents.put("metadata", columnMetadata.buildOrThrow());

        return fieldContents.buildOrThrow();
    }

    public static Object serializeColumnType(ColumnMappingMode columnMappingMode, AtomicInteger maxColumnId, Type columnType)
    {
        if (columnType instanceof ArrayType) {
            return serializeArrayType(columnMappingMode, maxColumnId, (ArrayType) columnType);
        }
        if (columnType instanceof RowType) {
            return serializeStructType(columnMappingMode, maxColumnId, (RowType) columnType);
        }
        if (columnType instanceof MapType) {
            return serializeMapType(columnMappingMode, maxColumnId, (MapType) columnType);
        }
        return serializePrimitiveType(columnType);
    }

    private static Map serializeArrayType(ColumnMappingMode columnMappingMode, AtomicInteger maxColumnId, ArrayType arrayType)
    {
        // https://github.com/delta-io/delta/blob/master/PROTOCOL.md#array-type
        ImmutableMap.Builder fields = ImmutableMap.builder();

        fields.put("type", "array");
        fields.put("elementType", serializeColumnType(columnMappingMode, maxColumnId, arrayType.getElementType()));
        fields.put("containsNull", true);

        return fields.buildOrThrow();
    }

    private static Map serializeMapType(ColumnMappingMode columnMappingMode, AtomicInteger maxColumnId, MapType mapType)
    {
        // https://github.com/delta-io/delta/blob/master/PROTOCOL.md#map-type
        ImmutableMap.Builder fields = ImmutableMap.builder();

        fields.put("type", "map");
        fields.put("keyType", serializeColumnType(columnMappingMode, maxColumnId, mapType.getKeyType()));
        fields.put("valueType", serializeColumnType(columnMappingMode, maxColumnId, mapType.getValueType()));
        fields.put("valueContainsNull", true);

        return fields.buildOrThrow();
    }

    private static Map serializeStructType(ColumnMappingMode columnMappingMode, AtomicInteger maxColumnId, RowType rowType)
    {
        ImmutableMap.Builder fields = ImmutableMap.builder();

        fields.put("type", "struct");
        fields.put("fields", rowType.getFields().stream()
                .map(field -> {
                    Object fieldType = serializeColumnType(columnMappingMode, maxColumnId, field.getType());
                    Map metadata = generateColumnMetadata(columnMappingMode, maxColumnId);
                    return serializeStructField(field.getName().orElse(null), fieldType, null, null, metadata);
                })
                .collect(toImmutableList()));

        return fields.buildOrThrow();
    }

    public static Map generateColumnMetadata(ColumnMappingMode columnMappingMode, AtomicInteger maxColumnId)
    {
        return switch (columnMappingMode) {
            case NONE -> {
                verify(maxColumnId.get() == 0, "maxColumnId must be 0 for column mapping mode 'none'");
                yield ImmutableMap.of();
            }
            case ID, NAME -> ImmutableMap.builder()
                    // Set both 'id' and 'physicalName' regardless of the mode https://github.com/delta-io/delta/blob/master/PROTOCOL.md#column-mapping
                    // > There are two modes of column mapping, by name and by id.
                    // > In both modes, every column - nested or leaf - is assigned a unique physical name, and a unique 32-bit integer as an id.
                    .put("delta.columnMapping.id", maxColumnId.incrementAndGet())
                    .put("delta.columnMapping.physicalName", "col-" + UUID.randomUUID()) // This logic is same as DeltaColumnMapping.generatePhysicalName in Delta Lake
                    .buildOrThrow();
            default -> throw new IllegalArgumentException("Unexpected column mapping mode: " + columnMappingMode);
        };
    }

    private static String serializePrimitiveType(Type type)
    {
        return serializeSupportedPrimitiveType(type)
                .orElseThrow(() -> new TypeNotFoundException(type.getTypeSignature()));
    }

    private static Optional serializeSupportedPrimitiveType(Type type)
    {
        if (type instanceof TimestampType) {
            return Optional.of("timestamp_ntz");
        }
        if (type instanceof TimestampWithTimeZoneType) {
            return Optional.of("timestamp");
        }
        if (type instanceof VarcharType) {
            return Optional.of("string");
        }
        if (type instanceof DecimalType decimalType) {
            return Optional.of(String.format("decimal(%s,%s)", decimalType.getPrecision(), decimalType.getScale()));
        }
        return Optional.ofNullable(PRIMITIVE_TYPE_MAPPING.get(type));
    }

    public static void validateType(Type type)
    {
        validateType(Optional.empty(), type);
    }

    private static void validateType(Optional rootType, Type type)
    {
        if (HiveUtil.isStructuralType(type)) {
            validateStructuralType(Optional.of(rootType.orElse(type)), type);
        }
        else {
            validatePrimitiveType(type);
        }
    }

    private static void validateStructuralType(Optional rootType, Type type)
    {
        if (type instanceof ArrayType) {
            validateType(rootType, ((ArrayType) type).getElementType());
        }

        if (type instanceof MapType mapType) {
            validateType(rootType, mapType.getKeyType());
            validateType(rootType, mapType.getValueType());
        }

        if (type instanceof RowType rowType) {
            rowType.getFields().forEach(field -> validateType(rootType, field.getType()));
        }
    }

    private static void validatePrimitiveType(Type type)
    {
        if (serializeSupportedPrimitiveType(type).isEmpty() ||
                (type instanceof TimestampType && ((TimestampType) type).getPrecision() != 6) ||
                (type instanceof TimestampWithTimeZoneType && ((TimestampWithTimeZoneType) type).getPrecision() != 3)) {
            throw new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Unsupported type: " + type);
        }
    }

    public static String serializeStatsAsJson(DeltaLakeFileStatistics fileStatistics)
            throws JsonProcessingException
    {
        return OBJECT_MAPPER.writeValueAsString(fileStatistics);
    }

    public static List extractColumnMetadata(MetadataEntry metadataEntry, ProtocolEntry protocolEntry, TypeManager typeManager)
    {
        return extractSchema(metadataEntry, protocolEntry, typeManager).stream()
                .map(DeltaLakeColumnMetadata::getColumnMetadata)
                .collect(toImmutableList());
    }

    public static List extractSchema(MetadataEntry metadataEntry, ProtocolEntry protocolEntry, TypeManager typeManager)
    {
        ColumnMappingMode mappingMode = getColumnMappingMode(metadataEntry, protocolEntry);
        verifySupportedColumnMapping(mappingMode);
        return Optional.ofNullable(metadataEntry.getSchemaString())
                .map(json -> getColumnMetadata(json, typeManager, mappingMode))
                .orElseThrow(() -> new IllegalStateException("Serialized schema not found in transaction log for " + metadataEntry.getName()));
    }

    public static void verifySupportedColumnMapping(ColumnMappingMode mappingMode)
    {
        if (mappingMode != ColumnMappingMode.ID && mappingMode != ColumnMappingMode.NAME && mappingMode != ColumnMappingMode.NONE) {
            throw new TrinoException(NOT_SUPPORTED, format("Only 'id', 'name' or 'none' is supported for the '%s' table property", COLUMN_MAPPING_MODE_CONFIGURATION_KEY));
        }
    }

    public static List getColumnMetadata(String json, TypeManager typeManager, ColumnMappingMode mappingMode)
    {
        try {
            return stream(OBJECT_MAPPER.readTree(json).get("fields").elements())
                    .map(node -> mapColumn(typeManager, node, mappingMode))
                    .collect(toImmutableList());
        }
        catch (JsonProcessingException e) {
            throw new TrinoException(DELTA_LAKE_INVALID_SCHEMA, getLocation(e), "Failed to parse serialized schema: " + json, e);
        }
    }

    private static DeltaLakeColumnMetadata mapColumn(TypeManager typeManager, JsonNode node, ColumnMappingMode mappingMode)
    {
        String fieldName = node.get("name").asText();
        JsonNode typeNode = node.get("type");
        boolean nullable = node.get("nullable").asBoolean();
        Type columnType = buildType(typeManager, typeNode, false);
        OptionalInt fieldId = OptionalInt.empty();
        String physicalName;
        Type physicalColumnType;
        switch (mappingMode) {
            case ID:
                String columnMappingId = node.get("metadata").get("delta.columnMapping.id").asText();
                verify(!isNullOrEmpty(columnMappingId), "id is null or empty");
                fieldId = OptionalInt.of(Integer.parseInt(columnMappingId));
                // Databricks stores column statistics with physical name
                physicalName = node.get("metadata").get("delta.columnMapping.physicalName").asText();
                verify(!isNullOrEmpty(physicalName), "physicalName is null or empty");
                physicalColumnType = buildType(typeManager, typeNode, true);
                break;
            case NAME:
                physicalName = node.get("metadata").get("delta.columnMapping.physicalName").asText();
                verify(!isNullOrEmpty(physicalName), "physicalName is null or empty");
                physicalColumnType = buildType(typeManager, typeNode, true);
                break;
            default:
                physicalName = fieldName;
                physicalColumnType = columnType;
        }
        ColumnMetadata columnMetadata = ColumnMetadata.builder()
                .setName(fieldName)
                .setType(columnType)
                .setNullable(nullable)
                .setComment(Optional.ofNullable(getComment(node)))
                .build();
        return new DeltaLakeColumnMetadata(columnMetadata, fieldName, fieldId, physicalName, physicalColumnType);
    }

    public static Map getColumnTypes(MetadataEntry metadataEntry)
    {
        return getColumnProperties(metadataEntry, node -> OBJECT_MAPPER.convertValue(node.get("type"), new TypeReference<>(){}));
    }

    public static Map getColumnComments(MetadataEntry metadataEntry)
    {
        return getColumnProperties(metadataEntry, DeltaLakeSchemaSupport::getComment);
    }

    @Nullable
    private static String getComment(JsonNode node)
    {
        JsonNode comment = node.get("metadata").get("comment");
        return comment == null ? null : comment.asText();
    }

    public static Map getColumnsNullability(MetadataEntry metadataEntry)
    {
        return getColumnProperties(metadataEntry, node -> node.get("nullable").asBoolean());
    }

    public static Map getColumnIdentities(MetadataEntry metadataEntry, ProtocolEntry protocolEntry)
    {
        if (protocolEntry.supportsWriterFeatures() && !protocolEntry.writerFeaturesContains(IDENTITY_COLUMNS_FEATURE_NAME)) {
            return ImmutableMap.of();
        }
        return getColumnProperties(metadataEntry, DeltaLakeSchemaSupport::isIdentityColumn);
    }

    private static boolean isIdentityColumn(JsonNode node)
    {
        return Streams.stream(node.get("metadata").fieldNames())
                .anyMatch(name -> name.startsWith("delta.identity."));
    }

    public static Map getColumnInvariants(MetadataEntry metadataEntry, ProtocolEntry protocolEntry)
    {
        if (protocolEntry.supportsWriterFeatures()) {
            if (!protocolEntry.writerFeaturesContains(INVARIANTS_FEATURE_NAME)) {
                return ImmutableMap.of();
            }
            return getColumnProperties(metadataEntry, DeltaLakeSchemaSupport::getInvariantsWriterFeature);
        }
        return getColumnProperties(metadataEntry, DeltaLakeSchemaSupport::getInvariants);
    }

    @Nullable
    private static String getInvariantsWriterFeature(JsonNode node)
    {
        JsonNode invariants = node.get("metadata").get("delta.invariants");
        return invariants == null ? null : invariants.asText();
    }

    @Nullable
    private static String getInvariants(JsonNode node)
    {
        JsonNode invariants = node.get("metadata").get("delta.invariants");
        return invariants == null ? null : extractInvariantsExpression(invariants.asText());
    }

    private static String extractInvariantsExpression(String invariants)
    {
        try {
            return OBJECT_MAPPER.readTree(invariants).get("expression").get("expression").asText();
        }
        catch (JsonProcessingException e) {
            throw new TrinoException(DELTA_LAKE_INVALID_SCHEMA, getLocation(e), "Failed to parse invariants expression: " + invariants, e);
        }
    }

    public static Map getGeneratedColumnExpressions(MetadataEntry metadataEntry)
    {
        return getColumnProperties(metadataEntry, DeltaLakeSchemaSupport::getGeneratedColumnExpressions);
    }

    @Nullable
    private static String getGeneratedColumnExpressions(JsonNode node)
    {
        JsonNode generationExpression = node.get("metadata").get("delta.generationExpression");
        return generationExpression == null ? null : generationExpression.asText();
    }

    public static Map getCheckConstraints(MetadataEntry metadataEntry, ProtocolEntry protocolEntry)
    {
        if (protocolEntry.supportsWriterFeatures() && !protocolEntry.writerFeaturesContains(CHECK_CONSTRAINTS_FEATURE_NAME)) {
            return ImmutableMap.of();
        }
        return metadataEntry.getConfiguration().entrySet().stream()
                .filter(entry -> entry.getKey().startsWith("delta.constraints."))
                .collect(toImmutableMap(Map.Entry::getKey, Map.Entry::getValue));
    }

    public static Optional changeDataFeedEnabled(MetadataEntry metadataEntry, ProtocolEntry protocolEntry)
    {
        if (protocolEntry.supportsWriterFeatures() && !protocolEntry.writerFeaturesContains(CHANGE_DATA_FEED_FEATURE_NAME)) {
            return Optional.empty();
        }
        String enableChangeDataFeed = metadataEntry.getConfiguration().get(DELTA_CHANGE_DATA_FEED_ENABLED_PROPERTY);
        if (enableChangeDataFeed == null) {
            return Optional.empty();
        }
        return Optional.of(parseBoolean(enableChangeDataFeed));
    }

    public static Map> getColumnsMetadata(MetadataEntry metadataEntry)
    {
        return getColumnProperties(metadataEntry, node -> OBJECT_MAPPER.convertValue(node.get("metadata"), new TypeReference<>(){}));
    }

    public static  Map getColumnProperties(MetadataEntry metadataEntry, Function extractor)
    {
        return Optional.ofNullable(metadataEntry.getSchemaString())
                .map(json -> getColumnProperty(json, extractor))
                .orElseThrow(() -> new IllegalStateException("Serialized schema not found in transaction log for " + metadataEntry.getName()));
    }

    private static  Map getColumnProperty(String json, Function extractor)
    {
        try {
            return stream(OBJECT_MAPPER.readTree(json).get("fields").elements())
                    .map(field -> new AbstractMap.SimpleEntry<>(field.get("name").asText(), extractor.apply(field)))
                    .filter(entry -> entry.getValue() != null)
                    .collect(toImmutableMap(Map.Entry::getKey, Map.Entry::getValue));
        }
        catch (JsonProcessingException e) {
            throw new TrinoException(DELTA_LAKE_INVALID_SCHEMA, getLocation(e), "Failed to parse serialized schema: " + json, e);
        }
    }

    /**
     * @return the case-sensitive column names
     */
    public static List getExactColumnNames(MetadataEntry metadataEntry)
    {
        try {
            return stream(OBJECT_MAPPER.readTree(metadataEntry.getSchemaString()).get("fields").elements())
                    .map(field -> field.get("name").asText())
                    .collect(toImmutableList());
        }
        catch (JsonProcessingException e) {
            throw new TrinoException(DELTA_LAKE_INVALID_SCHEMA, getLocation(e), "Failed to parse serialized schema: " + metadataEntry.getSchemaString(), e);
        }
    }

    public static Set unsupportedReaderFeatures(Set features)
    {
        return Sets.difference(features, SUPPORTED_READER_FEATURES);
    }

    public static Set unsupportedWriterFeatures(Set features)
    {
        return Sets.difference(features, SUPPORTED_WRITER_FEATURES);
    }

    public static Type deserializeType(TypeManager typeManager, Object type, boolean usePhysicalName)
    {
        try {
            String json = OBJECT_MAPPER.writeValueAsString(type);
            return buildType(typeManager, OBJECT_MAPPER.readTree(json), usePhysicalName);
        }
        catch (JsonProcessingException e) {
            throw new TrinoException(DELTA_LAKE_INVALID_SCHEMA, "Failed to deserialize type: " + type);
        }
    }

    private static Type buildType(TypeManager typeManager, JsonNode typeNode, boolean usePhysicalName)
    {
        if (typeNode.isContainerNode()) {
            return buildContainerType(typeManager, typeNode, usePhysicalName);
        }
        String primitiveType = typeNode.asText();
        if (primitiveType.startsWith("decimal")) {
            return typeManager.fromSqlType(primitiveType);
        }
        return switch (primitiveType) {
            case "string" -> VARCHAR;
            case "long" -> BIGINT;
            case "integer" -> INTEGER;
            case "short" -> SMALLINT;
            case "byte" -> TINYINT;
            case "float" -> REAL;
            case "double" -> DOUBLE;
            case "boolean" -> BOOLEAN;
            case "binary" -> VARBINARY;
            case "date" -> DATE;
            // https://github.com/delta-io/delta/blob/master/PROTOCOL.md#timestamp-without-timezone-timestampntz
            case "timestamp_ntz" -> TIMESTAMP_MICROS;
            // Spark/DeltaLake stores timestamps in UTC, but renders them in session time zone.
            // For more info, see https://delta-users.slack.com/archives/GKTUWT03T/p1585760533005400
            // and https://cwiki.apache.org/confluence/display/Hive/Different+TIMESTAMP+types
            case "timestamp" -> TIMESTAMP_TZ_MILLIS;
            default -> throw new TypeNotFoundException(new TypeSignature(primitiveType));
        };
    }

    private static Type buildContainerType(TypeManager typeManager, JsonNode typeNode, boolean usePhysicalName)
    {
        String containerType = typeNode.get("type").asText();
        return switch (containerType) {
            case "array" -> buildArrayType(typeManager, typeNode, usePhysicalName);
            case "map" -> buildMapType(typeManager, typeNode, usePhysicalName);
            case "struct" -> buildRowType(typeManager, typeNode, usePhysicalName);
            default -> throw new TypeNotFoundException(new TypeSignature(containerType));
        };
    }

    private static RowType buildRowType(TypeManager typeManager, JsonNode typeNode, boolean usePhysicalName)
    {
        return (RowType) typeManager.getType(TypeSignature.rowType(stream(typeNode.get("fields").elements())
                .map(element -> {
                    String fieldName = usePhysicalName ? element.get("metadata").get("delta.columnMapping.physicalName").asText() : element.get("name").asText();
                    verify(!isNullOrEmpty(fieldName), "fieldName is null or empty");
                    return TypeSignatureParameter.namedField(
                            // We lower case the struct field names.
                            // Otherwise, Trino will refuse to write to columns whose struct type has field names containing upper case characters.
                            // Users can't work around this by casting in their queries because Trino parser always lower case types.
                            // TODO: This is a hack. Engine should be able to handle identifiers in a case insensitive way where necessary.
                            // See also HiveTypeTranslator#toTypeSingature.
                            TransactionLogAccess.canonicalizeColumnName(fieldName),
                            buildType(typeManager, element.get("type"), usePhysicalName).getTypeSignature());
                })
                .collect(toImmutableList())));
    }

    private static ArrayType buildArrayType(TypeManager typeManager, JsonNode typeNode, boolean usePhysicalName)
    {
        return (ArrayType) typeManager.getType(TypeSignature.arrayType(buildType(typeManager, typeNode.get("elementType"), usePhysicalName).getTypeSignature()));
    }

    private static MapType buildMapType(TypeManager typeManager, JsonNode typeNode, boolean usePhysicalName)
    {
        return (MapType) typeManager.getType(TypeSignature.mapType(
                buildType(typeManager, typeNode.get("keyType"), usePhysicalName).getTypeSignature(),
                buildType(typeManager, typeNode.get("valueType"), usePhysicalName).getTypeSignature()));
    }

    private static Optional getLocation(JsonProcessingException e)
    {
        return Optional.ofNullable(e.getLocation()).map(location -> new Location(location.getLineNr(), location.getColumnNr()));
    }
}




// © 2015 - 2024 Weber Informatics LLC | Privacy Policy