All Downloads are FREE. Search and download functionalities are using the official Maven repository.

io.trino.plugin.hive.avro.AvroHiveFileUtils Maven / Gradle / Ivy

There is a newer version: 468
Show newest version
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.trino.plugin.hive.avro;

import com.google.common.base.Splitter;
import io.trino.filesystem.Location;
import io.trino.filesystem.TrinoFileSystem;
import io.trino.filesystem.TrinoInputFile;
import io.trino.filesystem.TrinoInputStream;
import io.trino.hive.formats.avro.NativeLogicalTypesAvroTypeManager;
import io.trino.metastore.HiveType;
import io.trino.metastore.type.CharTypeInfo;
import io.trino.metastore.type.DecimalTypeInfo;
import io.trino.metastore.type.ListTypeInfo;
import io.trino.metastore.type.MapTypeInfo;
import io.trino.metastore.type.PrimitiveCategory;
import io.trino.metastore.type.PrimitiveTypeInfo;
import io.trino.metastore.type.StructTypeInfo;
import io.trino.metastore.type.TypeInfo;
import io.trino.metastore.type.UnionTypeInfo;
import io.trino.metastore.type.VarcharTypeInfo;
import org.apache.avro.LogicalType;
import org.apache.avro.LogicalTypes;
import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.atomic.AtomicInteger;

import static com.google.common.collect.ImmutableMap.toImmutableMap;
import static io.trino.hive.formats.avro.AvroHiveConstants.CHAR_TYPE_LOGICAL_NAME;
import static io.trino.hive.formats.avro.AvroHiveConstants.SCHEMA_DOC;
import static io.trino.hive.formats.avro.AvroHiveConstants.SCHEMA_LITERAL;
import static io.trino.hive.formats.avro.AvroHiveConstants.SCHEMA_NAME;
import static io.trino.hive.formats.avro.AvroHiveConstants.SCHEMA_NAMESPACE;
import static io.trino.hive.formats.avro.AvroHiveConstants.SCHEMA_NONE;
import static io.trino.hive.formats.avro.AvroHiveConstants.SCHEMA_URL;
import static io.trino.hive.formats.avro.AvroHiveConstants.TABLE_NAME;
import static io.trino.hive.formats.avro.AvroHiveConstants.VARCHAR_AND_CHAR_LOGICAL_TYPE_LENGTH_PROP;
import static io.trino.hive.formats.avro.AvroHiveConstants.VARCHAR_TYPE_LOGICAL_NAME;
import static io.trino.metastore.Table.TABLE_COMMENT;
import static io.trino.plugin.hive.util.HiveUtil.getColumnNames;
import static io.trino.plugin.hive.util.HiveUtil.getColumnTypes;
import static io.trino.plugin.hive.util.SerdeConstants.LIST_COLUMN_COMMENTS;
import static java.util.Collections.emptyList;
import static java.util.function.Predicate.not;
import static java.util.function.UnaryOperator.identity;

/**
 * Helpers for resolving the Avro {@link Schema} of a Hive table from its table properties.
 * <p>
 * The schema is taken, in order of preference, from the literal schema property, from the schema URL
 * property, or is synthesized from the table's Hive column names and types.
 */
public final class AvroHiveFileUtils
{
    // Numbers the records generated for nested structs ("record_0", "record_1", ...).
    // It is an instance field so that numbering restarts for every schema that is built.
    private final AtomicInteger recordNameSuffix = new AtomicInteger(0);

    private AvroHiveFileUtils() {}

    /**
     * Resolves the Avro schema described by the given table properties.
     * <p>
     * Lifted and shifted from org.apache.hadoop.hive.serde2.avro.AvroSerdeUtils.determineSchemaOrThrowException
     *
     * @param fileSystem file system used to open the schema file when a schema URL is configured
     * @param properties table properties
     * @return the schema parsed from the literal property, read from the schema URL, or derived from the columns
     * @throws IOException if the schema file does not exist or cannot be read, or if column names or
     * types cannot be obtained from the properties
     */
    public static Schema determineSchemaOrThrowException(TrinoFileSystem fileSystem, Map<String, String> properties)
            throws IOException
    {
        // Try to pull schema from literal table property
        String schemaString = properties.getOrDefault(SCHEMA_LITERAL, "");
        if (!schemaString.isBlank() && !schemaString.equals(SCHEMA_NONE)) {
            return getSchemaParser().parse(schemaString);
        }

        // Try to pull schema directly from URL
        String schemaURL = properties.getOrDefault(SCHEMA_URL, "");
        if (!schemaURL.isBlank()) {
            TrinoInputFile schemaFile = fileSystem.newInputFile(Location.of(schemaURL));
            if (!schemaFile.exists()) {
                throw new IOException("Avro schema file not found at " + schemaURL);
            }
            try (TrinoInputStream inputStream = schemaFile.newStream()) {
                return getSchemaParser().parse(inputStream);
            }
            catch (IOException e) {
                // Re-throw with the location attached, keeping the original failure as the cause
                throw new IOException("Unable to read avro schema file from given path: " + schemaURL, e);
            }
        }

        // No explicit schema was supplied: derive one from the Hive column definitions
        return getSchemaFromProperties(properties);
    }

    /**
     * Builds an Avro record schema from the column names, column types and (optional) column comments
     * stored in the table properties.
     *
     * @throws IOException if no column names or no column types can be obtained
     * @throws IllegalArgumentException if the number of column names and column types differ
     */
    private static Schema getSchemaFromProperties(Map<String, String> schema)
            throws IOException
    {
        List<String> columnNames = getColumnNames(schema);
        List<HiveType> columnTypes = getColumnTypes(schema);
        if (columnNames.isEmpty() || columnTypes.isEmpty()) {
            throw new IOException("Unable to parse column names or column types from schema to create Avro Schema");
        }
        if (columnNames.size() != columnTypes.size()) {
            throw new IllegalArgumentException("Avro Schema initialization failed. Number of column name and column type differs. columnNames = %s, columnTypes = %s".formatted(columnNames, columnTypes));
        }
        // Column comments are stored as a single NUL-separated string; a missing or blank value means no comments
        List<String> columnComments = Optional.ofNullable(schema.get(LIST_COLUMN_COMMENTS))
                .filter(not(String::isBlank))
                .map(Splitter.on('\0')::splitToList)
                .orElse(emptyList());

        String tableName = schema.get(TABLE_NAME);
        String tableComment = schema.get(TABLE_COMMENT);

        // The record name and doc fall back to the table name and table comment when not set explicitly
        return constructSchemaFromParts(
                columnNames,
                columnTypes,
                columnComments,
                Optional.ofNullable(schema.get(SCHEMA_NAMESPACE)),
                Optional.ofNullable(schema.getOrDefault(SCHEMA_NAME, tableName)),
                Optional.ofNullable(schema.getOrDefault(SCHEMA_DOC, tableComment)));
    }

    /**
     * Assembles the top-level record schema. Every column becomes a field whose type is produced by
     * {@link #avroSchemaForHiveType(HiveType)} and whose default is {@code null}.
     *
     * @param columnComments per-column docs; may be shorter than {@code columnNames}, in which case the
     * remaining fields get no doc
     * @param name record name; {@code "baseRecord"} is used when absent
     */
    private static Schema constructSchemaFromParts(List<String> columnNames, List<HiveType> columnTypes,
            List<String> columnComments, Optional<String> namespace, Optional<String> name, Optional<String> doc)
    {
        // create instance of this class to keep nested record naming consistent for any given inputs
        AvroHiveFileUtils recordIncrementingUtil = new AvroHiveFileUtils();
        SchemaBuilder.RecordBuilder<Schema> schemaBuilder = SchemaBuilder.record(name.orElse("baseRecord"));
        namespace.ifPresent(schemaBuilder::namespace);
        doc.ifPresent(schemaBuilder::doc);
        SchemaBuilder.FieldAssembler<Schema> fieldBuilder = schemaBuilder.fields();

        for (int i = 0; i < columnNames.size(); ++i) {
            String comment = columnComments.size() > i ? columnComments.get(i) : null;
            Schema fieldSchema = recordIncrementingUtil.avroSchemaForHiveType(columnTypes.get(i));
            fieldBuilder = fieldBuilder
                    .name(columnNames.get(i))
                    .doc(comment)
                    .type(fieldSchema)
                    .withDefault(null);
        }
        return fieldBuilder.endRecord();
    }

    /**
     * Converts a Hive type to an Avro schema, recursing into element, value, field and member types.
     * The result is passed through {@link #wrapInUnionWithNull(Schema)} before being returned.
     *
     * @throws UnsupportedOperationException if a map key is not a Hive {@code STRING}, or a primitive
     * type has no Avro mapping
     */
    private Schema avroSchemaForHiveType(HiveType hiveType)
    {
        Schema schema = switch (hiveType.getCategory()) {
            case PRIMITIVE -> createAvroPrimitive(hiveType);
            case LIST -> {
                ListTypeInfo listTypeInfo = (ListTypeInfo) hiveType.getTypeInfo();
                yield Schema.createArray(avroSchemaForHiveType(HiveType.fromTypeInfo(listTypeInfo.getListElementTypeInfo())));
            }
            case MAP -> {
                MapTypeInfo mapTypeInfo = ((MapTypeInfo) hiveType.getTypeInfo());
                TypeInfo keyTypeInfo = mapTypeInfo.getMapKeyTypeInfo();
                if (!(keyTypeInfo instanceof PrimitiveTypeInfo primitiveKeyTypeInfo) ||
                        primitiveKeyTypeInfo.getPrimitiveCategory() != PrimitiveCategory.STRING) {
                    throw new UnsupportedOperationException("Key of Map must be a String");
                }
                TypeInfo valueTypeInfo = mapTypeInfo.getMapValueTypeInfo();
                yield Schema.createMap(avroSchemaForHiveType(HiveType.fromTypeInfo(valueTypeInfo)));
            }
            case STRUCT -> createAvroRecord(hiveType);
            case UNION -> {
                List<Schema> childSchemas = new ArrayList<>();
                for (TypeInfo childTypeInfo : ((UnionTypeInfo) hiveType.getTypeInfo()).getAllUnionObjectTypeInfos()) {
                    Schema childSchema = avroSchemaForHiveType(HiveType.fromTypeInfo(childTypeInfo));
                    // Flatten: the members of a child union are added directly instead of nesting the union
                    if (childSchema.getType() == Schema.Type.UNION) {
                        childSchemas.addAll(childSchema.getTypes());
                    }
                    else {
                        childSchemas.add(childSchema);
                    }
                }
                // Flattening can leave several null branches behind; collapse them to a single leading null
                yield Schema.createUnion(removeDuplicateNullSchemas(childSchemas));
            }
        };

        return wrapInUnionWithNull(schema);
    }

    /**
     * Maps a primitive Hive type to its Avro schema. CHAR and VARCHAR become strings tagged with a
     * logical type name and length property; DECIMAL becomes bytes with the Avro decimal logical type.
     *
     * @throws IllegalStateException if the type info is not a {@link PrimitiveTypeInfo}
     * @throws UnsupportedOperationException if the primitive category has no mapping here
     */
    private static Schema createAvroPrimitive(HiveType hiveType)
    {
        if (!(hiveType.getTypeInfo() instanceof PrimitiveTypeInfo primitiveTypeInfo)) {
            throw new IllegalStateException("HiveType in primitive category must have PrimitiveTypeInfo");
        }
        return switch (primitiveTypeInfo.getPrimitiveCategory()) {
            case STRING -> Schema.create(Schema.Type.STRING);
            case CHAR -> {
                Schema charSchema = SchemaBuilder.builder().type(Schema.create(Schema.Type.STRING));
                charSchema.addProp(LogicalType.LOGICAL_TYPE_PROP, CHAR_TYPE_LOGICAL_NAME);
                charSchema.addProp(VARCHAR_AND_CHAR_LOGICAL_TYPE_LENGTH_PROP, ((CharTypeInfo) primitiveTypeInfo).getLength());
                yield charSchema;
            }
            case VARCHAR -> {
                Schema varcharSchema = SchemaBuilder.builder().type(Schema.create(Schema.Type.STRING));
                varcharSchema.addProp(LogicalType.LOGICAL_TYPE_PROP, VARCHAR_TYPE_LOGICAL_NAME);
                varcharSchema.addProp(VARCHAR_AND_CHAR_LOGICAL_TYPE_LENGTH_PROP, ((VarcharTypeInfo) primitiveTypeInfo).getLength());
                yield varcharSchema;
            }
            case BINARY -> Schema.create(Schema.Type.BYTES);
            case BYTE, SHORT, INT -> Schema.create(Schema.Type.INT);
            case LONG -> Schema.create(Schema.Type.LONG);
            case FLOAT -> Schema.create(Schema.Type.FLOAT);
            case DOUBLE -> Schema.create(Schema.Type.DOUBLE);
            case BOOLEAN -> Schema.create(Schema.Type.BOOLEAN);
            case DECIMAL -> {
                DecimalTypeInfo decimalTypeInfo = (DecimalTypeInfo) primitiveTypeInfo;
                LogicalTypes.Decimal decimalLogicalType = LogicalTypes.decimal(decimalTypeInfo.precision(), decimalTypeInfo.scale());
                yield decimalLogicalType.addToSchema(Schema.create(Schema.Type.BYTES));
            }
            case DATE -> NativeLogicalTypesAvroTypeManager.DATE_SCHEMA;
            case TIMESTAMP -> NativeLogicalTypesAvroTypeManager.TIMESTAMP_MILLIS_SCHEMA;
            case VOID -> Schema.create(Schema.Type.NULL);
            default -> throw new UnsupportedOperationException(hiveType + " is not supported.");
        };
    }

    /**
     * Converts a Hive struct to an Avro record named {@code record_<n>}, where {@code n} comes from this
     * instance's counter. The record doc and each field doc are the string form of the corresponding
     * Hive type info; every field defaults to {@code null}.
     *
     * @throws IllegalStateException if the type info is not a {@link StructTypeInfo}
     * @throws IllegalArgumentException if the struct reports a different number of field names and field types
     */
    private Schema createAvroRecord(HiveType hiveType)
    {
        if (!(hiveType.getTypeInfo() instanceof StructTypeInfo structTypeInfo)) {
            throw new IllegalStateException("HiveType type info must be Struct Type info to make Avro Record");
        }

        List<String> allStructFieldNames = structTypeInfo.getAllStructFieldNames();
        List<TypeInfo> allStructFieldTypeInfo = structTypeInfo.getAllStructFieldTypeInfos();
        if (allStructFieldNames.size() != allStructFieldTypeInfo.size()) {
            throw new IllegalArgumentException("Failed to generate avro schema from hive schema. " +
                    "name and column type differs. names = " + allStructFieldNames + ", types = " +
                    allStructFieldTypeInfo);
        }

        SchemaBuilder.FieldAssembler<Schema> fieldAssembler = SchemaBuilder
                .record("record_" + recordNameSuffix.getAndIncrement())
                .doc(structTypeInfo.toString())
                .fields();

        for (int i = 0; i < allStructFieldNames.size(); ++i) {
            TypeInfo childTypeInfo = allStructFieldTypeInfo.get(i);
            Schema fieldSchema = avroSchemaForHiveType(HiveType.fromTypeInfo(childTypeInfo));
            fieldAssembler = fieldAssembler
                    .name(allStructFieldNames.get(i))
                    .doc(childTypeInfo.toString())
                    .type(fieldSchema)
                    .withDefault(null);
        }
        return fieldAssembler.endRecord();
    }

    /**
     * Makes a schema nullable in the way Hive does.
     * <ul>
     * <li>A NULL schema is returned unchanged.</li>
     * <li>A UNION is rebuilt with duplicate null branches collapsed into one leading null. A union that
     * has no null branch is NOT given one.</li>
     * <li>Any other schema is wrapped as {@code [null, schema]}.</li>
     * </ul>
     */
    public static Schema wrapInUnionWithNull(Schema schema)
    {
        return switch (schema.getType()) {
            case NULL -> schema;
            case UNION -> Schema.createUnion(removeDuplicateNullSchemas(schema.getTypes()));
            default -> Schema.createUnion(Arrays.asList(Schema.create(Schema.Type.NULL), schema));
        };
    }

    /**
     * Returns a new list holding the non-null schemas in their original order, preceded by a single
     * NULL schema if the input contained at least one. The input list is not modified.
     */
    private static List<Schema> removeDuplicateNullSchemas(List<Schema> childSchemas)
    {
        List<Schema> prunedSchemas = new ArrayList<>();
        boolean isNullPresent = false;
        for (Schema schema : childSchemas) {
            if (schema.getType() == Schema.Type.NULL) {
                isNullPresent = true;
            }
            else {
                prunedSchemas.add(schema);
            }
        }
        if (isNullPresent) {
            prunedSchemas.add(0, Schema.create(Schema.Type.NULL));
        }

        return prunedSchemas;
    }

    /**
     * Maps the lower-cased (using {@link Locale#ENGLISH}) name of every top-level field to the name as
     * written in the schema.
     * <p>
     * NOTE(review): Guava's {@code toImmutableMap} rejects duplicate keys, so a schema with two top-level
     * fields that differ only by case would presumably fail here - confirm this is intended.
     */
    static Map<String, String> getCanonicalToGivenFieldName(Schema schema)
    {
        // Lower case top level fields to allow for manually set avro schema (passed in via avro_schema_literal or avro_schema_url) to have uppercase field names
        return schema.getFields().stream()
                .map(Schema.Field::name)
                .collect(toImmutableMap(fieldName -> fieldName.toLowerCase(Locale.ENGLISH), identity()));
    }

    /**
     * Creates a fresh parser for each parse call, with default-value validation turned off.
     */
    private static Schema.Parser getSchemaParser()
    {
        // HIVE-24797: Disable validate default values when parsing Avro schemas.
        return new Schema.Parser().setValidateDefaults(false);
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy