
org.apache.hudi.util.AvroSchemaConverter

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.util;

import org.apache.avro.LogicalTypes;
import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.table.api.DataTypes;
import org.apache.flink.table.types.AtomicDataType;
import org.apache.flink.table.types.DataType;
import org.apache.flink.table.types.logical.ArrayType;
import org.apache.flink.table.types.logical.DecimalType;
import org.apache.flink.table.types.logical.IntType;
import org.apache.flink.table.types.logical.LocalZonedTimestampType;
import org.apache.flink.table.types.logical.LogicalType;
import org.apache.flink.table.types.logical.LogicalTypeFamily;
import org.apache.flink.table.types.logical.MapType;
import org.apache.flink.table.types.logical.MultisetType;
import org.apache.flink.table.types.logical.RowType;
import org.apache.flink.table.types.logical.TimeType;
import org.apache.flink.table.types.logical.TimestampType;
import org.apache.flink.table.types.logical.TypeInformationRawType;

import java.util.List;
import java.util.stream.Collectors;

/**
 * Converts an Avro schema into Flink's type information. It uses {@link
 * org.apache.flink.api.java.typeutils.RowTypeInfo} for representing objects and converts Avro
 * types into types that are compatible with Flink's Table & SQL API.
 *
 * <p>Note: Changes in this class need to be kept in sync with the corresponding runtime classes
 * {@code org.apache.flink.formats.avro.AvroRowDeserializationSchema} and
 * {@code org.apache.flink.formats.avro.AvroRowSerializationSchema}.
 *
 * <p>NOTE: referenced from Flink release 1.12.0; should be removed once the Flink version is
 * upgraded to that release.
 */
public class AvroSchemaConverter {

  /**
   * Converts an Avro schema {@code schema} into a nested row structure with deterministic
   * field order and data types that are compatible with Flink's Table & SQL API.
   *
   * @param schema Avro schema definition
   * @return data type matching the schema
   */
  public static DataType convertToDataType(Schema schema) {
    switch (schema.getType()) {
      case RECORD:
        final List<Schema.Field> schemaFields = schema.getFields();

        final DataTypes.Field[] fields = new DataTypes.Field[schemaFields.size()];
        for (int i = 0; i < schemaFields.size(); i++) {
          final Schema.Field field = schemaFields.get(i);
          fields[i] = DataTypes.FIELD(field.name(), convertToDataType(field.schema()));
        }
        return DataTypes.ROW(fields).notNull();
      case ENUM:
      case STRING:
        // convert Avro's Utf8/CharSequence to String
        return DataTypes.STRING().notNull();
      case ARRAY:
        return DataTypes.ARRAY(convertToDataType(schema.getElementType())).notNull();
      case MAP:
        return DataTypes.MAP(
                DataTypes.STRING().notNull(),
                convertToDataType(schema.getValueType()))
            .notNull();
      case UNION:
        final Schema actualSchema;
        final boolean nullable;
        if (schema.getTypes().size() == 2
            && schema.getTypes().get(0).getType() == Schema.Type.NULL) {
          actualSchema = schema.getTypes().get(1);
          nullable = true;
        } else if (schema.getTypes().size() == 2
            && schema.getTypes().get(1).getType() == Schema.Type.NULL) {
          actualSchema = schema.getTypes().get(0);
          nullable = true;
        } else if (schema.getTypes().size() == 1) {
          actualSchema = schema.getTypes().get(0);
          nullable = false;
        } else {
          List<Schema> nonNullTypes = schema.getTypes().stream()
              .filter(s -> s.getType() != Schema.Type.NULL)
              .collect(Collectors.toList());
          nullable = schema.getTypes().size() > nonNullTypes.size();
          // use Kryo for serialization
          DataType rawDataType = new AtomicDataType(
              new TypeInformationRawType<>(false, Types.GENERIC(Object.class)))
              .notNull();
          if (recordTypesOfSameNumFields(nonNullTypes)) {
            DataType converted = DataTypes.ROW(
                DataTypes.FIELD("wrapper", rawDataType))
                .notNull();
            return nullable ? converted.nullable() : converted;
          }
          // use Kryo for serialization
          return nullable ? rawDataType.nullable() : rawDataType;
        }
        DataType converted = convertToDataType(actualSchema);
        return nullable ? converted.nullable() : converted;
      case FIXED:
        // logical decimal type
        if (schema.getLogicalType() instanceof LogicalTypes.Decimal) {
          final LogicalTypes.Decimal decimalType =
              (LogicalTypes.Decimal) schema.getLogicalType();
          return DataTypes.DECIMAL(decimalType.getPrecision(), decimalType.getScale())
              .notNull();
        }
        // convert fixed size binary data to primitive byte arrays
        return DataTypes.VARBINARY(schema.getFixedSize()).notNull();
      case BYTES:
        // logical decimal type
        if (schema.getLogicalType() instanceof LogicalTypes.Decimal) {
          final LogicalTypes.Decimal decimalType =
              (LogicalTypes.Decimal) schema.getLogicalType();
          return DataTypes.DECIMAL(decimalType.getPrecision(), decimalType.getScale())
              .notNull();
        }
        return DataTypes.BYTES().notNull();
      case INT:
        // logical date and time type
        final org.apache.avro.LogicalType logicalType = schema.getLogicalType();
        if (logicalType == LogicalTypes.date()) {
          return DataTypes.DATE().notNull();
        } else if (logicalType == LogicalTypes.timeMillis()) {
          return DataTypes.TIME(3).notNull();
        }
        return DataTypes.INT().notNull();
      case LONG:
        // logical timestamp type
        if (schema.getLogicalType() == LogicalTypes.timestampMillis()) {
          return DataTypes.TIMESTAMP(3).notNull();
        } else if (schema.getLogicalType() == LogicalTypes.localTimestampMillis()) {
          return DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(3).notNull();
        } else if (schema.getLogicalType() == LogicalTypes.timestampMicros()) {
          return DataTypes.TIMESTAMP(6).notNull();
        } else if (schema.getLogicalType() == LogicalTypes.localTimestampMicros()) {
          return DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(6).notNull();
        } else if (schema.getLogicalType() == LogicalTypes.timeMillis()) {
          return DataTypes.TIME(3).notNull();
        } else if (schema.getLogicalType() == LogicalTypes.timeMicros()) {
          return DataTypes.TIME(6).notNull();
        }
        return DataTypes.BIGINT().notNull();
      case FLOAT:
        return DataTypes.FLOAT().notNull();
      case DOUBLE:
        return DataTypes.DOUBLE().notNull();
      case BOOLEAN:
        return DataTypes.BOOLEAN().notNull();
      case NULL:
        return DataTypes.NULL();
      default:
        throw new IllegalArgumentException("Unsupported Avro type '" + schema.getType() + "'.");
    }
  }
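
  // Illustrative usage, not part of the original class: a parsed Avro record schema
  // converts to a non-null ROW whose fields mirror the record, e.g.
  //
  //   Schema avro = new Schema.Parser().parse(
  //       "{\"type\":\"record\",\"name\":\"user\",\"fields\":["
  //           + "{\"name\":\"id\",\"type\":\"long\"},"
  //           + "{\"name\":\"name\",\"type\":[\"null\",\"string\"]}]}");
  //   DataType rowType = AvroSchemaConverter.convertToDataType(avro);
  //   // rowType: ROW<`id` BIGINT NOT NULL, `name` STRING> NOT NULL
  //
  // The ["null", "string"] union maps to a nullable STRING via the UNION branch above.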

  /**
   * Returns true if all the given types are RECORD types with the same number of fields.
   */
  private static boolean recordTypesOfSameNumFields(List<Schema> types) {
    if (types == null || types.size() == 0) {
      return false;
    }
    if (types.stream().anyMatch(s -> s.getType() != Schema.Type.RECORD)) {
      return false;
    }
    int numFields = types.get(0).getFields().size();
    return types.stream().allMatch(s -> s.getFields().size() == numFields);
  }
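
  // Illustrative note, added for clarity: a union with two or more distinct non-null
  // branches has no single SQL counterpart, so convertToDataType falls back to a
  // Kryo-serialized RAW type, e.g.
  //
  //   Schema union = new Schema.Parser().parse("[\"int\", \"string\"]");
  //   // convertToDataType(union): RAW type over java.lang.Object, Kryo-serialized
  //
  // recordTypesOfSameNumFields(...) above guards the special case where every branch
  // is a record with the same field count; such unions are wrapped as
  // ROW<`wrapper` RAW(...)> instead of a bare RAW type.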

Use "record" as the type name. * * @param schema the schema type, usually it should be the top level record type, e.g. not a * nested type * @return Avro's {@link Schema} matching this logical type. */ public static Schema convertToSchema(LogicalType schema) { return convertToSchema(schema, "record"); } /** * Converts Flink SQL {@link LogicalType} (can be nested) into an Avro schema. * *

The "{rowName}." is used as the nested row type name prefix in order to generate the right * schema. Nested record type that only differs with type name is still compatible. * * @param logicalType logical type * @param rowName the record name * @return Avro's {@link Schema} matching this logical type. */ public static Schema convertToSchema(LogicalType logicalType, String rowName) { int precision; boolean nullable = logicalType.isNullable(); switch (logicalType.getTypeRoot()) { case NULL: return SchemaBuilder.builder().nullType(); case BOOLEAN: Schema bool = SchemaBuilder.builder().booleanType(); return nullable ? nullableSchema(bool) : bool; case TINYINT: case SMALLINT: case INTEGER: Schema integer = SchemaBuilder.builder().intType(); return nullable ? nullableSchema(integer) : integer; case BIGINT: Schema bigint = SchemaBuilder.builder().longType(); return nullable ? nullableSchema(bigint) : bigint; case FLOAT: Schema f = SchemaBuilder.builder().floatType(); return nullable ? nullableSchema(f) : f; case DOUBLE: Schema d = SchemaBuilder.builder().doubleType(); return nullable ? nullableSchema(d) : d; case CHAR: case VARCHAR: Schema str = SchemaBuilder.builder().stringType(); return nullable ? nullableSchema(str) : str; case BINARY: case VARBINARY: Schema binary = SchemaBuilder.builder().bytesType(); return nullable ? nullableSchema(binary) : binary; case TIMESTAMP_WITHOUT_TIME_ZONE: // use long to represents Timestamp final TimestampType timestampType = (TimestampType) logicalType; precision = timestampType.getPrecision(); org.apache.avro.LogicalType timestampLogicalType; if (precision <= 3) { timestampLogicalType = LogicalTypes.timestampMillis(); } else if (precision <= 6) { timestampLogicalType = LogicalTypes.timestampMicros(); } else { throw new IllegalArgumentException( "Avro does not support TIMESTAMP type with precision: " + precision + ", it only support precisions <= 6."); } Schema timestamp = timestampLogicalType.addToSchema(SchemaBuilder.builder().longType()); return nullable ? nullableSchema(timestamp) : timestamp; case TIMESTAMP_WITH_LOCAL_TIME_ZONE: // use long to represents LocalZonedTimestampType final LocalZonedTimestampType localZonedTimestampType = (LocalZonedTimestampType) logicalType; precision = localZonedTimestampType.getPrecision(); org.apache.avro.LogicalType localZonedTimestampLogicalType; if (precision <= 3) { localZonedTimestampLogicalType = LogicalTypes.localTimestampMillis(); } else if (precision <= 6) { localZonedTimestampLogicalType = LogicalTypes.localTimestampMicros(); } else { throw new IllegalArgumentException( "Avro does not support LOCAL TIMESTAMP type with precision: " + precision + ", it only support precisions <= 6."); } Schema localZonedTimestamp = localZonedTimestampLogicalType.addToSchema(SchemaBuilder.builder().longType()); return nullable ? nullableSchema(localZonedTimestamp) : localZonedTimestamp; case DATE: // use int to represents Date Schema date = LogicalTypes.date().addToSchema(SchemaBuilder.builder().intType()); return nullable ? nullableSchema(date) : date; case TIME_WITHOUT_TIME_ZONE: precision = ((TimeType) logicalType).getPrecision(); if (precision > 3) { throw new IllegalArgumentException( "Avro does not support TIME type with precision: " + precision + ", it only supports precision less than 3."); } // use int to represents Time, we only support millisecond when deserialization Schema time = LogicalTypes.timeMillis().addToSchema(SchemaBuilder.builder().intType()); return nullable ? 
      case DECIMAL:
        DecimalType decimalType = (DecimalType) logicalType;
        // store BigDecimal as Fixed for Spark compatibility.
        Schema decimal =
            LogicalTypes.decimal(decimalType.getPrecision(), decimalType.getScale())
                .addToSchema(SchemaBuilder
                    .fixed(String.format("%s.fixed", rowName))
                    .size(computeMinBytesForDecimalPrecision(decimalType.getPrecision())));
        return nullable ? nullableSchema(decimal) : decimal;
      case ROW:
        RowType rowType = (RowType) logicalType;
        List<String> fieldNames = rowType.getFieldNames();
        // we have to make sure the record name is different in a Schema
        SchemaBuilder.FieldAssembler<Schema> builder =
            SchemaBuilder.builder().record(rowName).fields();
        for (int i = 0; i < rowType.getFieldCount(); i++) {
          String fieldName = fieldNames.get(i);
          LogicalType fieldType = rowType.getTypeAt(i);
          SchemaBuilder.GenericDefault<Schema> fieldBuilder =
              builder.name(fieldName)
                  .type(convertToSchema(fieldType, rowName + "." + fieldName));
          if (fieldType.isNullable()) {
            builder = fieldBuilder.withDefault(null);
          } else {
            builder = fieldBuilder.noDefault();
          }
        }
        Schema record = builder.endRecord();
        return nullable ? nullableSchema(record) : record;
      case MULTISET:
      case MAP:
        Schema map =
            SchemaBuilder.builder()
                .map()
                .values(
                    convertToSchema(
                        extractValueTypeToAvroMap(logicalType), rowName));
        return nullable ? nullableSchema(map) : map;
      case ARRAY:
        ArrayType arrayType = (ArrayType) logicalType;
        Schema array =
            SchemaBuilder.builder()
                .array()
                .items(convertToSchema(arrayType.getElementType(), rowName));
        return nullable ? nullableSchema(array) : array;
      case RAW:
      default:
        throw new UnsupportedOperationException(
            "Unsupported to derive Schema for type: " + logicalType);
    }
  }

  public static LogicalType extractValueTypeToAvroMap(LogicalType type) {
    LogicalType keyType;
    LogicalType valueType;
    if (type instanceof MapType) {
      MapType mapType = (MapType) type;
      keyType = mapType.getKeyType();
      valueType = mapType.getValueType();
    } else {
      MultisetType multisetType = (MultisetType) type;
      keyType = multisetType.getElementType();
      valueType = new IntType();
    }
    if (!DataTypeUtils.isFamily(keyType, LogicalTypeFamily.CHARACTER_STRING)) {
      throw new UnsupportedOperationException(
          "Avro format doesn't support non-string as key type of map. "
              + "The key type is: " + keyType.asSummaryString());
    }
    return valueType;
  }

  /**
   * Returns the schema with nullable set to true.
   */
  private static Schema nullableSchema(Schema schema) {
    return schema.isNullable()
        ? schema
        : Schema.createUnion(SchemaBuilder.builder().nullType(), schema);
  }

  private static int computeMinBytesForDecimalPrecision(int precision) {
    int numBytes = 1;
    while (Math.pow(2.0, 8 * numBytes - 1) < Math.pow(10.0, precision)) {
      numBytes += 1;
    }
    return numBytes;
  }
}
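
A minimal round-trip sketch, assuming the Flink Table API and Avro are on the classpath (the class name AvroSchemaConverterDemo is made up for illustration):

    import org.apache.avro.Schema;
    import org.apache.flink.table.api.DataTypes;
    import org.apache.flink.table.types.DataType;
    import org.apache.hudi.util.AvroSchemaConverter;

    public class AvroSchemaConverterDemo {
      public static void main(String[] args) {
        // Flink SQL type -> Avro schema
        DataType flinkType = DataTypes.ROW(
            DataTypes.FIELD("uuid", DataTypes.STRING().notNull()),
            DataTypes.FIELD("ts", DataTypes.TIMESTAMP(3))).notNull();
        Schema avro = AvroSchemaConverter.convertToSchema(flinkType.getLogicalType());
        System.out.println(avro.toString(true));

        // Avro schema -> Flink SQL type; field names and nullability survive the
        // round trip, but e.g. CHAR/VARCHAR lengths and TINYINT/SMALLINT widths do
        // not, since Avro only has plain string and int types.
        DataType back = AvroSchemaConverter.convertToDataType(avro);
        System.out.println(back); // ROW<`uuid` STRING NOT NULL, `ts` TIMESTAMP(3)> NOT NULL
      }
    }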




