/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.util;
import org.apache.avro.LogicalTypes;
import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.table.api.DataTypes;
import org.apache.flink.table.types.AtomicDataType;
import org.apache.flink.table.types.DataType;
import org.apache.flink.table.types.logical.ArrayType;
import org.apache.flink.table.types.logical.DecimalType;
import org.apache.flink.table.types.logical.IntType;
import org.apache.flink.table.types.logical.LocalZonedTimestampType;
import org.apache.flink.table.types.logical.LogicalType;
import org.apache.flink.table.types.logical.LogicalTypeFamily;
import org.apache.flink.table.types.logical.MapType;
import org.apache.flink.table.types.logical.MultisetType;
import org.apache.flink.table.types.logical.RowType;
import org.apache.flink.table.types.logical.TimeType;
import org.apache.flink.table.types.logical.TimestampType;
import org.apache.flink.table.types.logical.TypeInformationRawType;
import java.util.List;
import java.util.stream.Collectors;
/**
* Converts an Avro schema into Flink's {@link DataType} and, conversely, a Flink
* {@link LogicalType} into an Avro schema. Avro records are mapped to row types with
* deterministic field order, and Avro types are converted into types that are
* compatible with Flink's Table & SQL API.
*
* <p>Note: Changes in this class need to be kept in sync with the corresponding runtime
* classes {@code org.apache.flink.formats.avro.AvroRowDeserializationSchema} and
* {@code org.apache.flink.formats.avro.AvroRowSerializationSchema}.
*
* <p>NOTE: Copied from Flink release 1.12.0; this class should be removed once the
* Flink dependency is upgraded to that version.
*/
public class AvroSchemaConverter {
/**
* Converts an Avro schema {@code schema} into a nested row structure with deterministic field order and
* data types that are compatible with Flink's Table & SQL API.
*
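* <p>A minimal usage sketch (illustrative; the schema shape is hypothetical):
*
* <pre>{@code
* Schema schema = new Schema.Parser().parse(
*     "{\"type\": \"record\", \"name\": \"User\", \"fields\": ["
*         + "{\"name\": \"id\", \"type\": \"long\"},"
*         + "{\"name\": \"name\", \"type\": [\"null\", \"string\"], \"default\": null}]}");
* DataType dataType = AvroSchemaConverter.convertToDataType(schema);
* // => ROW<`id` BIGINT NOT NULL, `name` STRING> NOT NULL
* }</pre>
*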
* @param schema Avro schema definition
* @return data type matching the schema
*/
public static DataType convertToDataType(Schema schema) {
switch (schema.getType()) {
case RECORD:
final List<Schema.Field> schemaFields = schema.getFields();
final DataTypes.Field[] fields = new DataTypes.Field[schemaFields.size()];
for (int i = 0; i < schemaFields.size(); i++) {
final Schema.Field field = schemaFields.get(i);
fields[i] = DataTypes.FIELD(field.name(), convertToDataType(field.schema()));
}
return DataTypes.ROW(fields).notNull();
case ENUM:
case STRING:
// convert Avro's Utf8/CharSequence to String
return DataTypes.STRING().notNull();
case ARRAY:
return DataTypes.ARRAY(convertToDataType(schema.getElementType())).notNull();
case MAP:
return DataTypes.MAP(
DataTypes.STRING().notNull(),
convertToDataType(schema.getValueType()))
.notNull();
case UNION:
final Schema actualSchema;
final boolean nullable;
if (schema.getTypes().size() == 2
&& schema.getTypes().get(0).getType() == Schema.Type.NULL) {
actualSchema = schema.getTypes().get(1);
nullable = true;
} else if (schema.getTypes().size() == 2
&& schema.getTypes().get(1).getType() == Schema.Type.NULL) {
actualSchema = schema.getTypes().get(0);
nullable = true;
} else if (schema.getTypes().size() == 1) {
actualSchema = schema.getTypes().get(0);
nullable = false;
} else {
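// union with multiple non-null branches: there is no exact Flink counterpart,
// so fall back to a RAW type serialized with Kryo; when every branch is a record
// with the same number of fields, the RAW type is wrapped in a single-field row
// (presumably to keep a struct-like shape for downstream converters)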
List<Schema> nonNullTypes = schema.getTypes().stream()
.filter(s -> s.getType() != Schema.Type.NULL)
.collect(Collectors.toList());
nullable = schema.getTypes().size() > nonNullTypes.size();
// use Kryo for serialization
DataType rawDataType = new AtomicDataType(
new TypeInformationRawType<>(false, Types.GENERIC(Object.class)))
.notNull();
if (recordTypesOfSameNumFields(nonNullTypes)) {
DataType converted = DataTypes.ROW(
DataTypes.FIELD("wrapper", rawDataType))
.notNull();
return nullable ? converted.nullable() : converted;
}
// use Kryo for serialization
return nullable ? rawDataType.nullable() : rawDataType;
}
DataType converted = convertToDataType(actualSchema);
return nullable ? converted.nullable() : converted;
case FIXED:
// logical decimal type
if (schema.getLogicalType() instanceof LogicalTypes.Decimal) {
final LogicalTypes.Decimal decimalType =
(LogicalTypes.Decimal) schema.getLogicalType();
return DataTypes.DECIMAL(decimalType.getPrecision(), decimalType.getScale())
.notNull();
}
// convert fixed size binary data to primitive byte arrays
return DataTypes.VARBINARY(schema.getFixedSize()).notNull();
case BYTES:
// logical decimal type
if (schema.getLogicalType() instanceof LogicalTypes.Decimal) {
final LogicalTypes.Decimal decimalType =
(LogicalTypes.Decimal) schema.getLogicalType();
return DataTypes.DECIMAL(decimalType.getPrecision(), decimalType.getScale())
.notNull();
}
return DataTypes.BYTES().notNull();
case INT:
// logical date and time type
final org.apache.avro.LogicalType logicalType = schema.getLogicalType();
if (logicalType == LogicalTypes.date()) {
return DataTypes.DATE().notNull();
} else if (logicalType == LogicalTypes.timeMillis()) {
return DataTypes.TIME(3).notNull();
}
return DataTypes.INT().notNull();
case LONG:
// logical timestamp type
if (schema.getLogicalType() == LogicalTypes.timestampMillis()) {
return DataTypes.TIMESTAMP(3).notNull();
} else if (schema.getLogicalType() == LogicalTypes.localTimestampMillis()) {
return DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(3).notNull();
} else if (schema.getLogicalType() == LogicalTypes.timestampMicros()) {
return DataTypes.TIMESTAMP(6).notNull();
} else if (schema.getLogicalType() == LogicalTypes.localTimestampMicros()) {
return DataTypes.TIMESTAMP_WITH_LOCAL_TIME_ZONE(6).notNull();
} else if (schema.getLogicalType() == LogicalTypes.timeMillis()) {
return DataTypes.TIME(3).notNull();
} else if (schema.getLogicalType() == LogicalTypes.timeMicros()) {
return DataTypes.TIME(6).notNull();
}
return DataTypes.BIGINT().notNull();
case FLOAT:
return DataTypes.FLOAT().notNull();
case DOUBLE:
return DataTypes.DOUBLE().notNull();
case BOOLEAN:
return DataTypes.BOOLEAN().notNull();
case NULL:
return DataTypes.NULL();
default:
throw new IllegalArgumentException("Unsupported Avro type '" + schema.getType() + "'.");
}
}
/**
* Returns true if all the given types are RECORD types with the same number of fields.
*/
private static boolean recordTypesOfSameNumFields(List<Schema> types) {
if (types == null || types.isEmpty()) {
return false;
}
if (types.stream().anyMatch(s -> s.getType() != Schema.Type.RECORD)) {
return false;
}
int numFields = types.get(0).getFields().size();
return types.stream().allMatch(s -> s.getFields().size() == numFields);
}
/**
* Converts Flink SQL {@link LogicalType} (can be nested) into an Avro schema.
*
* Use "record" as the type name.
*
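* <p>A minimal usage sketch (illustrative; the row shape is hypothetical):
*
* <pre>{@code
* DataType dataType = DataTypes.ROW(
*     DataTypes.FIELD("id", DataTypes.BIGINT().notNull()),
*     DataTypes.FIELD("name", DataTypes.STRING())).notNull();
* Schema schema = AvroSchemaConverter.convertToSchema(dataType.getLogicalType());
* // => {"type": "record", "name": "record", "fields": [
* //      {"name": "id", "type": "long"},
* //      {"name": "name", "type": ["null", "string"], "default": null}]}
* }</pre>
*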
* @param schema the logical type, usually the top-level record type, i.e. not a nested type
* @return Avro's {@link Schema} matching this logical type.
*/
public static Schema convertToSchema(LogicalType schema) {
return convertToSchema(schema, "record");
}
/**
* Converts Flink SQL {@link LogicalType} (can be nested) into an Avro schema.
*
* <p>The "{rowName}." prefix is used as the nested row type name prefix in order to
* generate unique record names within the schema. Nested record types that differ
* only in type name are still compatible.
*
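* <p>A minimal sketch of the naming scheme (illustrative; field names are hypothetical):
*
* <pre>{@code
* LogicalType outer = DataTypes.ROW(
*         DataTypes.FIELD("inner", DataTypes.ROW(DataTypes.FIELD("a", DataTypes.INT()))))
*     .notNull().getLogicalType();
* Schema schema = AvroSchemaConverter.convertToSchema(outer, "record");
* // the record backing the nested field "inner" is named "record.inner"
* }</pre>
*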
* @param logicalType logical type
* @param rowName the record name
* @return Avro's {@link Schema} matching this logical type.
*/
public static Schema convertToSchema(LogicalType logicalType, String rowName) {
int precision;
boolean nullable = logicalType.isNullable();
switch (logicalType.getTypeRoot()) {
case NULL:
return SchemaBuilder.builder().nullType();
case BOOLEAN:
Schema bool = SchemaBuilder.builder().booleanType();
return nullable ? nullableSchema(bool) : bool;
case TINYINT:
case SMALLINT:
case INTEGER:
Schema integer = SchemaBuilder.builder().intType();
return nullable ? nullableSchema(integer) : integer;
case BIGINT:
Schema bigint = SchemaBuilder.builder().longType();
return nullable ? nullableSchema(bigint) : bigint;
case FLOAT:
Schema f = SchemaBuilder.builder().floatType();
return nullable ? nullableSchema(f) : f;
case DOUBLE:
Schema d = SchemaBuilder.builder().doubleType();
return nullable ? nullableSchema(d) : d;
case CHAR:
case VARCHAR:
Schema str = SchemaBuilder.builder().stringType();
return nullable ? nullableSchema(str) : str;
case BINARY:
case VARBINARY:
Schema binary = SchemaBuilder.builder().bytesType();
return nullable ? nullableSchema(binary) : binary;
case TIMESTAMP_WITHOUT_TIME_ZONE:
// use long to represent Timestamp
final TimestampType timestampType = (TimestampType) logicalType;
precision = timestampType.getPrecision();
org.apache.avro.LogicalType timestampLogicalType;
if (precision <= 3) {
timestampLogicalType = LogicalTypes.timestampMillis();
} else if (precision <= 6) {
timestampLogicalType = LogicalTypes.timestampMicros();
} else {
throw new IllegalArgumentException(
"Avro does not support TIMESTAMP type with precision: "
+ precision
+ ", it only support precisions <= 6.");
}
Schema timestamp = timestampLogicalType.addToSchema(SchemaBuilder.builder().longType());
return nullable ? nullableSchema(timestamp) : timestamp;
case TIMESTAMP_WITH_LOCAL_TIME_ZONE:
// use long to represent LocalZonedTimestampType
final LocalZonedTimestampType localZonedTimestampType = (LocalZonedTimestampType) logicalType;
precision = localZonedTimestampType.getPrecision();
org.apache.avro.LogicalType localZonedTimestampLogicalType;
if (precision <= 3) {
localZonedTimestampLogicalType = LogicalTypes.localTimestampMillis();
} else if (precision <= 6) {
localZonedTimestampLogicalType = LogicalTypes.localTimestampMicros();
} else {
throw new IllegalArgumentException(
"Avro does not support LOCAL TIMESTAMP type with precision: "
+ precision
+ ", it only support precisions <= 6.");
}
Schema localZonedTimestamp = localZonedTimestampLogicalType.addToSchema(SchemaBuilder.builder().longType());
return nullable ? nullableSchema(localZonedTimestamp) : localZonedTimestamp;
case DATE:
// use int to represent Date
Schema date = LogicalTypes.date().addToSchema(SchemaBuilder.builder().intType());
return nullable ? nullableSchema(date) : date;
case TIME_WITHOUT_TIME_ZONE:
precision = ((TimeType) logicalType).getPrecision();
if (precision > 3) {
throw new IllegalArgumentException(
"Avro does not support TIME type with precision: "
+ precision
+ ", it only supports precision less than 3.");
}
// use int to represent Time; only millisecond precision is supported during deserialization
Schema time =
LogicalTypes.timeMillis().addToSchema(SchemaBuilder.builder().intType());
return nullable ? nullableSchema(time) : time;
case DECIMAL:
DecimalType decimalType = (DecimalType) logicalType;
// store BigDecimal as a fixed-length byte array for Spark compatibility
Schema decimal =
LogicalTypes.decimal(decimalType.getPrecision(), decimalType.getScale())
.addToSchema(SchemaBuilder
.fixed(String.format("%s.fixed", rowName))
.size(computeMinBytesForDecimalPrecision(decimalType.getPrecision())));
return nullable ? nullableSchema(decimal) : decimal;
case ROW:
RowType rowType = (RowType) logicalType;
List<String> fieldNames = rowType.getFieldNames();
// record names must be unique within a schema, hence the "{rowName}." prefix for nested records
SchemaBuilder.FieldAssembler<Schema> builder =
SchemaBuilder.builder().record(rowName).fields();
for (int i = 0; i < rowType.getFieldCount(); i++) {
String fieldName = fieldNames.get(i);
LogicalType fieldType = rowType.getTypeAt(i);
SchemaBuilder.GenericDefault<Schema> fieldBuilder =
builder.name(fieldName)
.type(convertToSchema(fieldType, rowName + "." + fieldName));
if (fieldType.isNullable()) {
builder = fieldBuilder.withDefault(null);
} else {
builder = fieldBuilder.noDefault();
}
}
Schema record = builder.endRecord();
return nullable ? nullableSchema(record) : record;
case MULTISET:
case MAP:
Schema map =
SchemaBuilder.builder()
.map()
.values(
convertToSchema(
extractValueTypeToAvroMap(logicalType), rowName));
return nullable ? nullableSchema(map) : map;
case ARRAY:
ArrayType arrayType = (ArrayType) logicalType;
Schema array =
SchemaBuilder.builder()
.array()
.items(convertToSchema(arrayType.getElementType(), rowName));
return nullable ? nullableSchema(array) : array;
case RAW:
default:
throw new UnsupportedOperationException(
"Unsupported to derive Schema for type: " + logicalType);
}
}
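/**
* Extracts the Avro map value type for the given MAP or MULTISET type. Avro maps always
* use string keys, so the key type must belong to the CHARACTER_STRING family; a MULTISET
* is represented as a map from element to its multiplicity (an int).
*
* @param type the MAP or MULTISET logical type
* @return the logical type to use as the Avro map value type
*/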
public static LogicalType extractValueTypeToAvroMap(LogicalType type) {
LogicalType keyType;
LogicalType valueType;
if (type instanceof MapType) {
MapType mapType = (MapType) type;
keyType = mapType.getKeyType();
valueType = mapType.getValueType();
} else {
MultisetType multisetType = (MultisetType) type;
keyType = multisetType.getElementType();
valueType = new IntType();
}
if (!DataTypeUtils.isFamily(keyType, LogicalTypeFamily.CHARACTER_STRING)) {
throw new UnsupportedOperationException(
"Avro format doesn't support non-string as key type of map. "
+ "The key type is: "
+ keyType.asSummaryString());
}
return valueType;
}
/**
* Returns a nullable variant of the given schema: a union of null and the schema, unless it is already nullable.
*/
private static Schema nullableSchema(Schema schema) {
return schema.isNullable()
? schema
: Schema.createUnion(SchemaBuilder.builder().nullType(), schema);
}
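/**
* Returns the minimum number of bytes a two's-complement fixed-length byte array needs to
* hold the unscaled value of a decimal with the given precision, i.e. the smallest n such
* that 2^(8n - 1) >= 10^precision. For example, precision 10 requires 5 bytes, since
* 2^31 < 10^10 <= 2^39.
*/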
private static int computeMinBytesForDecimalPrecision(int precision) {
int numBytes = 1;
while (Math.pow(2.0, 8 * numBytes - 1) < Math.pow(10.0, precision)) {
numBytes += 1;
}
return numBytes;
}
}