All Downloads are FREE. Search and download functionalities are using the official Maven repository.

io.trino.plugin.deltalake.DeltaLakeParquetSchemas Maven / Gradle / Ivy

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.trino.plugin.deltalake;

import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import io.airlift.json.ObjectMapperProvider;
import io.trino.plugin.deltalake.transactionlog.DeltaLakeSchemaSupport;
import io.trino.plugin.deltalake.transactionlog.MetadataEntry;
import io.trino.plugin.deltalake.transactionlog.ProtocolEntry;
import io.trino.spi.Location;
import io.trino.spi.TrinoException;
import io.trino.spi.type.DecimalType;
import io.trino.spi.type.StandardTypes;
import io.trino.spi.type.Type;
import io.trino.spi.type.TypeManager;
import io.trino.spi.type.TypeNotFoundException;
import io.trino.spi.type.TypeSignature;
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.LogicalTypeAnnotation;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.Types;

import java.util.List;
import java.util.Optional;
import java.util.OptionalInt;

import static com.google.common.base.Strings.isNullOrEmpty;
import static com.google.common.base.Verify.verify;
import static com.google.common.collect.Streams.stream;
import static io.trino.plugin.deltalake.DeltaLakeErrorCode.DELTA_LAKE_INVALID_SCHEMA;
import static io.trino.plugin.deltalake.transactionlog.DeltaLakeSchemaSupport.getColumnMappingMode;
import static io.trino.spi.StandardErrorCode.NOT_SUPPORTED;
import static io.trino.spi.type.BigintType.BIGINT;
import static io.trino.spi.type.BooleanType.BOOLEAN;
import static io.trino.spi.type.DateType.DATE;
import static io.trino.spi.type.Decimals.MAX_PRECISION;
import static io.trino.spi.type.DoubleType.DOUBLE;
import static io.trino.spi.type.IntegerType.INTEGER;
import static io.trino.spi.type.RealType.REAL;
import static io.trino.spi.type.SmallintType.SMALLINT;
import static io.trino.spi.type.TimestampType.TIMESTAMP_MICROS;
import static io.trino.spi.type.TimestampType.TIMESTAMP_MILLIS;
import static io.trino.spi.type.TinyintType.TINYINT;
import static io.trino.spi.type.VarbinaryType.VARBINARY;
import static io.trino.spi.type.VarcharType.VARCHAR;
import static java.lang.String.format;
import static java.util.Objects.requireNonNull;
import static org.apache.parquet.schema.LogicalTypeAnnotation.decimalType;
import static org.apache.parquet.schema.Type.Repetition.OPTIONAL;
import static org.apache.parquet.schema.Type.Repetition.REQUIRED;

/**
 * Delta Lake specific utility which converts the Delta table schema to
 * a Parquet schema.
 * This utility is used instead of Hive's
 * {@link io.trino.parquet.writer.ParquetSchemaConverter}
 * in order to be able to include the field IDs in the Parquet schema.
 */
public final class DeltaLakeParquetSchemas
{
    // Map precision to the number bytes needed for binary conversion.
    // Based on org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe
    private static final int[] PRECISION_TO_BYTE_COUNT = new int[MAX_PRECISION + 1];

    static {
        for (int precision = 1; precision <= MAX_PRECISION; precision++) {
            // Estimated number of bytes needed.
            PRECISION_TO_BYTE_COUNT[precision] = (int) Math.ceil((Math.log(Math.pow(10, precision) - 1) / Math.log(2) + 1) / 8);
        }
    }

    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapperProvider().get();

    private DeltaLakeParquetSchemas() {}

    public static DeltaLakeParquetSchemaMapping createParquetSchemaMapping(MetadataEntry metadataEntry, ProtocolEntry protocolEntry, TypeManager typeManager)
    {
        return createParquetSchemaMapping(metadataEntry, protocolEntry, typeManager, false);
    }

    public static DeltaLakeParquetSchemaMapping createParquetSchemaMapping(MetadataEntry metadataEntry, ProtocolEntry protocolEntry, TypeManager typeManager, boolean addChangeDataFeedFields)
    {
        DeltaLakeSchemaSupport.ColumnMappingMode columnMappingMode = getColumnMappingMode(metadataEntry, protocolEntry);
        return createParquetSchemaMapping(
                metadataEntry.getSchemaString(),
                typeManager,
                columnMappingMode,
                metadataEntry.getOriginalPartitionColumns(),
                addChangeDataFeedFields);
    }

    public static DeltaLakeParquetSchemaMapping createParquetSchemaMapping(
            String jsonSchema,
            TypeManager typeManager,
            DeltaLakeSchemaSupport.ColumnMappingMode columnMappingMode,
            List partitionColumnNames)
    {
        return createParquetSchemaMapping(jsonSchema, typeManager, columnMappingMode, partitionColumnNames, false);
    }

    private static DeltaLakeParquetSchemaMapping createParquetSchemaMapping(
            String jsonSchema,
            TypeManager typeManager,
            DeltaLakeSchemaSupport.ColumnMappingMode columnMappingMode,
            List partitionColumnNames,
            boolean addChangeDataFeedFields)
    {
        requireNonNull(typeManager, "typeManager is null");
        requireNonNull(columnMappingMode, "columnMappingMode is null");
        requireNonNull(partitionColumnNames, "partitionColumnNames is null");
        Types.MessageTypeBuilder builder = Types.buildMessage();
        ImmutableMap.Builder, Type> primitiveTypesBuilder = ImmutableMap.builder();
        try {
            stream(OBJECT_MAPPER.readTree(jsonSchema).get("fields").elements())
                    .filter(fieldNode -> !partitionColumnNames.contains(fieldNode.get("name").asText()))
                    .map(fieldNode -> buildType(fieldNode, typeManager, columnMappingMode, ImmutableList.of(), primitiveTypesBuilder))
                    .forEach(builder::addField);
        }
        catch (JsonProcessingException e) {
            throw new TrinoException(DELTA_LAKE_INVALID_SCHEMA, getLocation(e), "Failed to parse serialized schema: " + jsonSchema, e);
        }
        if (addChangeDataFeedFields) {
            builder.addField(buildPrimitiveType("string", typeManager, OPTIONAL, DeltaLakeCdfPageSink.CHANGE_TYPE_COLUMN_NAME, OptionalInt.empty(), ImmutableList.of(), primitiveTypesBuilder));
        }

        return new DeltaLakeParquetSchemaMapping(builder.named("trino_schema"), primitiveTypesBuilder.buildOrThrow());
    }

    private static org.apache.parquet.schema.Type buildType(
            JsonNode fieldNode,
            TypeManager typeManager,
            DeltaLakeSchemaSupport.ColumnMappingMode columnMappingMode,
            List parent,
            ImmutableMap.Builder, Type> primitiveTypesBuilder)
    {
        JsonNode typeNode = fieldNode.get("type");
        OptionalInt fieldId = OptionalInt.empty();
        String physicalName;

        switch (columnMappingMode) {
            case ID -> {
                String columnMappingId = fieldNode.get("metadata").get("delta.columnMapping.id").asText();
                verify(!isNullOrEmpty(columnMappingId), "id is null or empty");
                fieldId = OptionalInt.of(Integer.parseInt(columnMappingId));
                // Databricks stores column statistics with physical name
                physicalName = fieldNode.get("metadata").get("delta.columnMapping.physicalName").asText();
                verify(!isNullOrEmpty(physicalName), "physicalName is null or empty");
            }
            case NAME -> {
                physicalName = fieldNode.get("metadata").get("delta.columnMapping.physicalName").asText();
                verify(!isNullOrEmpty(physicalName), "physicalName is null or empty");
            }
            case NONE -> {
                physicalName = fieldNode.get("name").asText();
                verify(!isNullOrEmpty(physicalName), "name is null or empty");
            }
            default -> throw new UnsupportedOperationException("Unsupported parameter columnMappingMode");
        }

        return buildType(typeNode, typeManager, OPTIONAL, physicalName, fieldId, columnMappingMode, parent, primitiveTypesBuilder);
    }

    private static org.apache.parquet.schema.Type buildType(
            JsonNode typeNode,
            TypeManager typeManager,
            org.apache.parquet.schema.Type.Repetition repetition,
            String name,
            OptionalInt id,
            DeltaLakeSchemaSupport.ColumnMappingMode columnMappingMode,
            List parent,
            ImmutableMap.Builder, Type> primitiveTypesBuilder)
    {
        if (typeNode.isContainerNode()) {
            return buildContainerType(typeNode, typeManager, repetition, name, id, columnMappingMode, parent, primitiveTypesBuilder);
        }

        String primitiveType = typeNode.asText();
        return buildPrimitiveType(primitiveType, typeManager, repetition, name, id, parent, primitiveTypesBuilder);
    }

    private static org.apache.parquet.schema.Type buildPrimitiveType(
            String primitiveType,
            TypeManager typeManager,
            org.apache.parquet.schema.Type.Repetition repetition,
            String name,
            OptionalInt id,
            List parent,
            ImmutableMap.Builder, Type> primitiveTypesBuilder)
    {
        Types.PrimitiveBuilder typeBuilder;
        Type trinoType;
        if (primitiveType.startsWith(StandardTypes.DECIMAL)) {
            trinoType = typeManager.fromSqlType(primitiveType);
            verify(trinoType instanceof DecimalType, "type %s does not map to Trino decimal".formatted(primitiveType));
            DecimalType trinoDecimalType = (DecimalType) trinoType;
            if (trinoDecimalType.getPrecision() <= 9) {
                typeBuilder = Types.primitive(PrimitiveType.PrimitiveTypeName.INT32, repetition);
            }
            else if (trinoDecimalType.isShort()) {
                typeBuilder = Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, repetition);
            }
            else {
                typeBuilder = Types.primitive(PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY, repetition)
                        .length(PRECISION_TO_BYTE_COUNT[trinoDecimalType.getPrecision()]);
            }
            typeBuilder = typeBuilder.as(decimalType(trinoDecimalType.getScale(), trinoDecimalType.getPrecision()));
            return buildType(name, id, parent, typeBuilder, trinoType, primitiveTypesBuilder);
        }
        switch (primitiveType) {
            case "string" -> {
                typeBuilder = Types.primitive(PrimitiveType.PrimitiveTypeName.BINARY, repetition).as(LogicalTypeAnnotation.stringType());
                trinoType = VARCHAR;
            }
            case "byte" -> {
                typeBuilder = Types.primitive(PrimitiveType.PrimitiveTypeName.INT32, repetition);
                trinoType = TINYINT;
            }
            case "short" -> {
                typeBuilder = Types.primitive(PrimitiveType.PrimitiveTypeName.INT32, repetition);
                trinoType = SMALLINT;
            }
            case "integer" -> {
                typeBuilder = Types.primitive(PrimitiveType.PrimitiveTypeName.INT32, repetition);
                trinoType = INTEGER;
            }
            case "long" -> {
                typeBuilder = Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, repetition);
                trinoType = BIGINT;
            }
            case "float" -> {
                typeBuilder = Types.primitive(PrimitiveType.PrimitiveTypeName.FLOAT, repetition);
                trinoType = REAL;
            }
            case "double" -> {
                typeBuilder = Types.primitive(PrimitiveType.PrimitiveTypeName.DOUBLE, repetition);
                trinoType = DOUBLE;
            }
            case "boolean" -> {
                typeBuilder = Types.primitive(PrimitiveType.PrimitiveTypeName.BOOLEAN, repetition);
                trinoType = BOOLEAN;
            }
            case "binary" -> {
                typeBuilder = Types.primitive(PrimitiveType.PrimitiveTypeName.BINARY, repetition);
                trinoType = VARBINARY;
            }
            case "date" -> {
                typeBuilder = Types.primitive(PrimitiveType.PrimitiveTypeName.INT32, repetition).as(LogicalTypeAnnotation.dateType());
                trinoType = DATE;
            }
            case "timestamp" -> {
                // Spark / Delta Lake stores timestamps in UTC, but renders them in session time zone.
                typeBuilder = Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, repetition).as(LogicalTypeAnnotation.timestampType(false, LogicalTypeAnnotation.TimeUnit.MILLIS));
                trinoType = TIMESTAMP_MILLIS;
            }
            case "timestamp_ntz" -> {
                typeBuilder = Types.primitive(PrimitiveType.PrimitiveTypeName.INT64, repetition).as(LogicalTypeAnnotation.timestampType(false, LogicalTypeAnnotation.TimeUnit.MICROS));
                trinoType = TIMESTAMP_MICROS;
            }
            default -> throw new TrinoException(NOT_SUPPORTED, format("Unsupported primitive type: %s", primitiveType));
        }

        return buildType(name, id, parent, typeBuilder, trinoType, primitiveTypesBuilder);
    }

    private static PrimitiveType buildType(String name, OptionalInt id, List parent, Types.PrimitiveBuilder typeBuilder, Type trinoType, ImmutableMap.Builder, Type> primitiveTypesBuilder)
    {
        if (id.isPresent()) {
            typeBuilder.id(id.getAsInt());
        }

        List fullName = ImmutableList.builder().addAll(parent).add(name).build();
        primitiveTypesBuilder.put(fullName, trinoType);

        return typeBuilder.named(name);
    }

    private static org.apache.parquet.schema.Type buildContainerType(
            JsonNode typeNode,
            TypeManager typeManager,
            org.apache.parquet.schema.Type.Repetition repetition,
            String name,
            OptionalInt id,
            DeltaLakeSchemaSupport.ColumnMappingMode columnMappingMode,
            List parent,
            ImmutableMap.Builder, Type> primitiveTypesBuilder)
    {
        String containerType = typeNode.get("type").asText();
        return switch (containerType) {
            case "array" -> buildArrayType(typeNode, typeManager, repetition, name, id, columnMappingMode, parent, primitiveTypesBuilder);
            case "map" -> buildMapType(typeNode, typeManager, repetition, name, id, columnMappingMode, parent, primitiveTypesBuilder);
            case "struct" -> buildRowType(typeNode, typeManager, repetition, name, id, columnMappingMode, parent, primitiveTypesBuilder);
            default -> throw new TypeNotFoundException(new TypeSignature(containerType));
        };
    }

    private static org.apache.parquet.schema.Type buildArrayType(
            JsonNode typeNode,
            TypeManager typeManager,
            org.apache.parquet.schema.Type.Repetition repetition,
            String name,
            OptionalInt id,
            DeltaLakeSchemaSupport.ColumnMappingMode columnMappingMode,
            List parent,
            ImmutableMap.Builder, Type> primitiveTypesBuilder)
    {
        parent = ImmutableList.builder().addAll(parent).add(name).add("list").build();

        JsonNode elementTypeNode = typeNode.get("elementType");
        org.apache.parquet.schema.Type elementType;

        if (elementTypeNode.isContainerNode()) {
            elementType = buildContainerType(elementTypeNode, typeManager, OPTIONAL, "element", OptionalInt.empty(), columnMappingMode, parent, primitiveTypesBuilder);
        }
        else {
            elementType = buildType(elementTypeNode, typeManager, OPTIONAL, "element", OptionalInt.empty(), columnMappingMode, parent, primitiveTypesBuilder);
        }

        GroupType arrayType = Types.list(repetition)
                .element(elementType)
                .named(name);
        if (id.isPresent()) {
            arrayType = arrayType.withId(id.getAsInt());
        }
        return arrayType;
    }

    private static org.apache.parquet.schema.Type buildMapType(
            JsonNode typeNode,
            TypeManager typeManager,
            org.apache.parquet.schema.Type.Repetition repetition,
            String name,
            OptionalInt id,
            DeltaLakeSchemaSupport.ColumnMappingMode columnMappingMode,
            List parent,
            ImmutableMap.Builder, Type> primitiveTypesBuilder)
    {
        Types.MapBuilder builder = Types.map(repetition);
        if (id.isPresent()) {
            builder.id(id.getAsInt());
        }

        parent = ImmutableList.builder().addAll(parent).add(name).add("key_value").build();

        JsonNode keyTypeNode = typeNode.get("keyType");
        org.apache.parquet.schema.Type keyType;
        if (keyTypeNode.isContainerNode()) {
            keyType = buildContainerType(keyTypeNode, typeManager, REQUIRED, "key", OptionalInt.empty(), columnMappingMode, parent, primitiveTypesBuilder);
        }
        else {
            keyType = buildType(keyTypeNode, typeManager, REQUIRED, "key", OptionalInt.empty(), columnMappingMode, parent, primitiveTypesBuilder);
        }
        JsonNode valueTypeNode = typeNode.get("valueType");
        org.apache.parquet.schema.Type valueType;
        if (valueTypeNode.isContainerNode()) {
            valueType = buildContainerType(valueTypeNode, typeManager, OPTIONAL, "value", OptionalInt.empty(), columnMappingMode, parent, primitiveTypesBuilder);
        }
        else {
            valueType = buildType(valueTypeNode, typeManager, OPTIONAL, "value", OptionalInt.empty(), columnMappingMode, parent, primitiveTypesBuilder);
        }

        return builder
                .key(keyType)
                .value(valueType)
                .named(name);
    }

    private static org.apache.parquet.schema.Type buildRowType(
            JsonNode typeNode,
            TypeManager typeManager,
            org.apache.parquet.schema.Type.Repetition repetition,
            String name,
            OptionalInt id,
            DeltaLakeSchemaSupport.ColumnMappingMode columnMappingMode,
            List parent,
            ImmutableMap.Builder, Type> primitiveTypesBuilder)
    {
        Types.GroupBuilder builder = Types.buildGroup(repetition);
        if (id.isPresent()) {
            builder.id(id.getAsInt());
        }
        List currentParent = ImmutableList.builder().addAll(parent).add(name).build();
        stream(typeNode.get("fields").elements())
                .map(node -> buildType(node, typeManager, columnMappingMode, currentParent, primitiveTypesBuilder))
                .forEach(builder::addField);
        return builder.named(name);
    }

    private static Optional getLocation(JsonProcessingException e)
    {
        return Optional.ofNullable(e.getLocation()).map(location -> new Location(location.getLineNr(), location.getColumnNr()));
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy