All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hudi.internal.schema.convert.AvroInternalSchemaConverter Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.internal.schema.convert;

import org.apache.hudi.avro.AvroSchemaUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.VisibleForTesting;
import org.apache.hudi.exception.HoodieNullSchemaTypeException;
import org.apache.hudi.internal.schema.HoodieSchemaException;
import org.apache.hudi.internal.schema.InternalSchema;
import org.apache.hudi.internal.schema.Type;
import org.apache.hudi.internal.schema.Types;
import org.apache.hudi.internal.schema.utils.InternalSchemaUtils;

import org.apache.avro.JsonProperties;
import org.apache.avro.LogicalType;
import org.apache.avro.LogicalTypes;
import org.apache.avro.Schema;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Deque;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;

import static org.apache.avro.Schema.Type.STRING;
import static org.apache.avro.Schema.Type.UNION;

/**
 * Auxiliary class.
 * Converts an Avro schema into an InternalSchema, or converts an InternalSchema into an Avro schema.
 */
public class AvroInternalSchemaConverter {

  // NOTE: Names of nested records are built by joining field names with a dot,
  //       so that Avro interprets the resulting qualified name as the
  //       combination of an Avro namespace and the record's actual name.
  //       For example, for the qualified nested field name "trip.fare.amount",
  //       Avro will produce a record with
  //          - Namespace: "trip.fare"
  //          - Name: "amount"
  //
  //       This is a crucial aspect of maintaining compatibility between schemas after
  //       converting Avro [[Schema]]s to [[InternalSchema]]s and back.
  private static final String AVRO_NAME_DELIMITER = ".";

  /**
   * Builds the Avro representation of a Hudi {@link InternalSchema}.
   *
   * @param internalSchema the internal schema to convert.
   * @param name record name handed to the Avro schema builder.
   * @return the resulting Avro schema.
   */
  public static Schema convert(InternalSchema internalSchema, String name) {
    final Schema avroSchema = buildAvroSchemaFromInternalSchema(internalSchema, name);
    return avroSchema;
  }

  /**
   * Prunes an internal schema using the leaf column names found in an Avro schema.
   *
   * @param schema Avro schema whose leaf column names are collected.
   * @param originSchema internal schema handed to {@code InternalSchemaUtils.pruneInternalSchema}.
   * @return the internal schema returned by {@code InternalSchemaUtils.pruneInternalSchema}
   *         for the collected names.
   */
  public static InternalSchema pruneAvroSchemaToInternalSchema(Schema schema, InternalSchema originSchema) {
    return InternalSchemaUtils.pruneInternalSchema(originSchema, collectColNamesFromSchema(schema));
  }

  /**
   * Collects the full names of all leaf nodes (anything that is not a record,
   * array or map) of an Avro schema.
   *
   * @param schema an Avro schema.
   * @return full names of the leaf nodes, in the order they are encountered.
   */
  @VisibleForTesting
  static List<String> collectColNamesFromSchema(Schema schema) {
    List<String> result = new ArrayList<>();
    // Stack of enclosing field names leading to the node currently being visited
    Deque<String> visited = new LinkedList<>();
    collectColNamesFromAvroSchema(schema, visited, result);
    return result;
  }

  /**
   * Recursively walks an Avro schema and appends the full name of every leaf node to {@code resultSet}.
   *
   * <ul>
   *   <li>RECORD: every field is visited with its name pushed on {@code visited}; afterwards the
   *       field itself is recorded if it is a leaf.</li>
   *   <li>UNION: the walk continues with the schema returned by
   *       {@code AvroSchemaUtils.resolveNullableSchema} (presumably the non-null branch).</li>
   *   <li>ARRAY: the element type is visited under the path segment "element".</li>
   *   <li>MAP: "key" is always recorded as a leaf; the value type is visited under the path
   *       segment "value".</li>
   *   <li>Anything else has no children and is left to the caller to record.</li>
   * </ul>
   *
   * @param schema the Avro schema node to visit.
   * @param visited stack of enclosing field names; restored to its initial state on return.
   * @param resultSet accumulator receiving the leaf full names.
   */
  private static void collectColNamesFromAvroSchema(Schema schema, Deque<String> visited, List<String> resultSet) {
    switch (schema.getType()) {
      case RECORD:
        List<Schema.Field> fields = schema.getFields();
        for (Schema.Field f : fields) {
          visited.push(f.name());
          collectColNamesFromAvroSchema(f.schema(), visited, resultSet);
          visited.pop();
          // The field's own name is appended only after popping, so `visited` holds just its parents
          addFullNameIfLeafNode(f.schema(), f.name(), visited, resultSet);
        }
        return;

      case UNION:
        collectColNamesFromAvroSchema(AvroSchemaUtils.resolveNullableSchema(schema), visited, resultSet);
        return;

      case ARRAY:
        visited.push("element");
        collectColNamesFromAvroSchema(schema.getElementType(), visited, resultSet);
        visited.pop();
        addFullNameIfLeafNode(schema.getElementType(), "element", visited, resultSet);
        return;

      case MAP:
        addFullNameIfLeafNode(STRING, "key", visited, resultSet);
        visited.push("value");
        collectColNamesFromAvroSchema(schema.getValueType(), visited, resultSet);
        visited.pop();
        addFullNameIfLeafNode(schema.getValueType(), "value", visited, resultSet);
        return;

      default:
        // No nested structure to descend into
        break;
    }
  }

  /**
   * Records the full name of {@code name} if {@code schema} is a leaf type. The type that is
   * checked is the one of the schema returned by {@code AvroSchemaUtils.resolveNullableSchema}.
   *
   * @param schema Avro schema of the node.
   * @param name name of the node.
   * @param visited stack of enclosing field names.
   * @param resultSet accumulator receiving the leaf full names.
   */
  private static void addFullNameIfLeafNode(Schema schema, String name, Deque<String> visited, List<String> resultSet) {
    addFullNameIfLeafNode(AvroSchemaUtils.resolveNullableSchema(schema).getType(), name, visited, resultSet);
  }

  /**
   * Appends the full name built from {@code name} and {@code visited} to {@code resultSet},
   * unless {@code type} is a nested type (record, array or map).
   *
   * @param type Avro type of the node.
   * @param name name of the node.
   * @param visited stack of enclosing field names.
   * @param resultSet accumulator receiving the leaf full names.
   */
  private static void addFullNameIfLeafNode(Schema.Type type, String name, Deque<String> visited, List<String> resultSet) {
    switch (type) {
      case RECORD:
      case ARRAY:
      case MAP:
        // Nested types are not leaves; their children are recorded individually
        return;
      default:
        resultSet.add(InternalSchemaUtils.createFullName(name, visited));
    }
  }

  /**
   * Normalizes an Avro schema so that null comes first in its unions.
   *
   * <p>A round trip avro -> internal schema -> avro always places null first in unions.
   * Comparing such a schema against one that never went through the internal schema can
   * fail purely because of the different ordering, so every Avro schema entering Hudi is
   * normalized to have null first.
   * AvroSchemaUtils.isProjectionOfInternal uses index based comparison for unions.
   * Spark and Flink don't support complex unions, so this would not be an issue for them,
   * but for the metadata table HoodieMetadata.avsc uses a trick where a bunch of
   * different types are wrapped in a record for col stats.
   *
   * @param schema avro schema; may be null.
   * @return a NULL schema when the input is null, the input itself when it is of type NULL,
   *         otherwise the schema obtained by converting to an internal schema and back.
   */
  public static Schema fixNullOrdering(Schema schema) {
    if (schema == null) {
      return Schema.create(Schema.Type.NULL);
    }
    if (schema.getType() == Schema.Type.NULL) {
      return schema;
    }
    // Round trip through the internal representation, keeping the original full name
    InternalSchema internalSchema = convert(schema);
    return convert(internalSchema, schema.getFullName());
  }

  /**
   * Builds the Avro schema for an internal {@link Types.RecordType}.
   *
   * @param type internal record type to convert.
   * @param name record name handed to the Avro schema builder.
   * @return the resulting Avro schema.
   */
  public static Schema convert(Types.RecordType type, String name) {
    final Schema avroSchema = buildAvroSchemaFromType(type, name);
    return avroSchema;
  }

  /**
   * Builds the Avro schema for an arbitrary internal {@link Type}.
   *
   * @param type internal type to convert.
   * @param name record name handed to the Avro schema builder.
   * @return the resulting Avro schema.
   */
  public static Schema convert(Type type, String name) {
    final Schema avroSchema = buildAvroSchemaFromType(type, name);
    return avroSchema;
  }

  /**
   * Converts an Avro schema into an internal type, without any existing
   * field-name-to-position mapping (an empty map is passed along).
   *
   * @param schema the Avro schema to convert.
   * @return the matching internal type.
   */
  public static Type convertToField(Schema schema) {
    final Type internalType = buildTypeFromAvroSchema(schema, Collections.emptyMap());
    return internalType;
  }

  /**
   * Converts an Avro schema into an internal type, passing the given mapping of field names
   * to positions through to {@link #buildTypeFromAvroSchema}.
   *
   * @param schema the Avro schema to convert.
   * @param existingFieldNameToPositionMapping field name to position mapping; may be empty.
   * @return the matching internal type.
   */
  private static Type convertToField(Schema schema, Map<String, Integer> existingFieldNameToPositionMapping) {
    return buildTypeFromAvroSchema(schema, existingFieldNameToPositionMapping);
  }

  /**
   * Converts an Avro schema into an {@link InternalSchema}.
   *
   * <p>The converted type is cast to {@link Types.RecordType}, so the Avro schema is
   * expected to convert to a record type.
   *
   * @param schema the Avro schema to convert.
   * @param existingFieldNameToPositionMapping field name to position mapping passed on to the
   *        type conversion; may be empty.
   * @return the internal schema wrapping the converted record type.
   */
  public static InternalSchema convert(Schema schema, Map<String, Integer> existingFieldNameToPositionMapping) {
    return new InternalSchema((Types.RecordType) convertToField(schema, existingFieldNameToPositionMapping));
  }

  /**
   * Converts an Avro schema into an {@link InternalSchema}.
   *
   * <p>The converted type is cast to {@link Types.RecordType}, so the Avro schema is
   * expected to convert to a record type.
   *
   * @param schema the Avro schema to convert.
   * @return the internal schema wrapping the converted record type.
   */
  public static InternalSchema convert(Schema schema) {
    final Types.RecordType recordType = (Types.RecordType) convertToField(schema);
    return new InternalSchema(recordType);
  }

  /**
   * Checks whether an Avro schema is optional, i.e. a union of exactly two
   * branches of which at least one is of type NULL.
   *
   * @param schema the Avro schema to inspect.
   * @return true for a two-branch union containing NULL, false otherwise.
   */
  public static boolean isOptional(Schema schema) {
    if (schema.getType() != UNION) {
      return false;
    }
    List<Schema> branches = schema.getTypes();
    if (branches.size() != 2) {
      return false;
    }
    return branches.get(0).getType() == Schema.Type.NULL
        || branches.get(1).getType() == Schema.Type.NULL;
  }

  /**
   * Returns a nullable version of the given schema.
   *
   * <p>A non-union schema is wrapped into a union with NULL as the first branch. A union
   * that is already optional is returned unchanged.
   *
   * @param schema the Avro schema to make nullable.
   * @return the nullable schema.
   * @throws HoodieSchemaException if the schema is a union that is not optional.
   */
  public static Schema nullableSchema(Schema schema) {
    if (schema.getType() != UNION) {
      return Schema.createUnion(Schema.create(Schema.Type.NULL), schema);
    }
    if (isOptional(schema)) {
      return schema;
    }
    throw new HoodieSchemaException(String.format("Union schemas are not supported: %s", schema));
  }

  /**
   * Builds a hudi type from an Avro schema.
   *
   * @param schema an Avro schema.
   * @param existingNameToPositions mapping from a field's dot-separated path to its position,
   *        used to order record fields; when empty, the fields keep the order declared in
   *        the Avro schema.
   * @return a hudi type.
   */
  public static Type buildTypeFromAvroSchema(Schema schema, Map<String, Integer> existingNameToPositions) {
    // Full names of the records currently being converted; used to detect recursive records.
    Deque<String> visited = new LinkedList<>();
    // Ids are handed out starting from 0
    AtomicInteger nextId = new AtomicInteger(0);
    return visitAvroSchemaToBuildType(schema, visited, "", nextId, existingNameToPositions);
  }

  /**
   * Verifies that a converted field type is not null, descending into array element types
   * and map value types.
   *
   * @param fieldType the converted type of the field; null means the Avro type was NULL.
   * @param fieldName name of the field being checked.
   * @param visited stack of names leading to the field; it is walked from the oldest to the
   *        newest entry to build the path reported in the error message.
   * @throws HoodieNullSchemaTypeException if the type, or a nested element/value type, is null.
   */
  private static void checkNullType(Type fieldType, String fieldName, Deque<String> visited) {
    if (fieldType == null) {
      StringBuilder sb = new StringBuilder();
      sb.append("Field '");
      // Entries are pushed on the head of the deque, so the descending iterator yields the outermost name first
      Iterator<String> visitedIterator = visited.descendingIterator();
      while (visitedIterator.hasNext()) {
        sb.append(visitedIterator.next());
        sb.append(".");
      }
      sb.append(fieldName);
      sb.append("' has type null");
      throw new HoodieNullSchemaTypeException(sb.toString());
    } else if (fieldType.typeId() == Type.TypeID.ARRAY) {
      visited.push(fieldName);
      checkNullType(((Types.ArrayType) fieldType).elementType(), InternalSchema.ARRAY_ELEMENT, visited);
      visited.pop();
    } else if (fieldType.typeId() == Type.TypeID.MAP) {
      visited.push(fieldName);
      checkNullType(((Types.MapType) fieldType).valueType(), InternalSchema.MAP_VALUE, visited);
      visited.pop();
    }
  }

  /**
   * Converts an avro schema into hudi type.
   *
   * @param schema an avro schema.
   * @param visited tracks the full names of the records currently being traversed; used to reject recursive avro records.
   * @param currentFieldPath the dot-separated path to the current field; empty at the root and always ends in a '.' otherwise for ease of concatenation.
   * @param nextId counter providing the next id to assign; advanced as ids are handed out to fields, array elements and map keys/values.
   * @param existingNameToPosition mapping from a field's dot-separated path to its position; when non-empty, the fields of each record
   *        are sorted by it, with unmapped fields placed last.
   * @return a hudi type matching the avro schema; null for the avro NULL type.
   * @throws HoodieSchemaException if a record is defined recursively.
   * @throws HoodieNullSchemaTypeException if a record field resolves to a null type.
   */
  private static Type visitAvroSchemaToBuildType(Schema schema, Deque<String> visited, String currentFieldPath, AtomicInteger nextId, Map<String, Integer> existingNameToPosition) {
    switch (schema.getType()) {
      case RECORD:
        String name = schema.getFullName();
        if (visited.contains(name)) {
          throw new HoodieSchemaException(String.format("cannot convert recursive avro record %s", name));
        }
        visited.push(name);
        List<Schema.Field> fields = existingNameToPosition.isEmpty() ? schema.getFields() :
            schema.getFields().stream()
                .sorted(Comparator.comparing(field -> existingNameToPosition.getOrDefault(currentFieldPath + field.name(), Integer.MAX_VALUE)))
                .collect(Collectors.toList());
        List<Type> fieldTypes = new ArrayList<>(fields.size());
        // Reserve a contiguous range of ids for this record's own fields before descending,
        // so that nested types receive ids after all of the fields of the enclosing record
        int nextAssignId = nextId.get();
        nextId.set(nextAssignId + fields.size());
        fields.forEach(field -> {
          Type fieldType = visitAvroSchemaToBuildType(field.schema(), visited, currentFieldPath + field.name() + ".", nextId, existingNameToPosition);
          checkNullType(fieldType, field.name(), visited);
          fieldTypes.add(fieldType);
        });
        visited.pop();
        List<Types.Field> internalFields = new ArrayList<>(fields.size());

        for (int i  = 0; i < fields.size(); i++) {
          Schema.Field field = fields.get(i);
          Type fieldType = fieldTypes.get(i);
          internalFields.add(Types.Field.get(nextAssignId, AvroInternalSchemaConverter.isOptional(field.schema()), field.name(), fieldType, field.doc()));
          nextAssignId += 1;
        }
        // NOTE: We're keeping a tab of full-name here to make sure we stay
        //       compatible across various Spark (>= 2.4) and Avro (>= 1.8.2) versions;
        //       Avro will be properly handling fully-qualified names on its own (splitting
        //       them up into namespace/struct-name pair)
        return Types.RecordType.get(internalFields, schema.getFullName());
      case UNION:
        List<Type> fTypes = new ArrayList<>(2);
        schema.getTypes().forEach(t -> {
          fTypes.add(visitAvroSchemaToBuildType(t, visited, currentFieldPath, nextId, existingNameToPosition));
        });
        // The NULL branch converts to null; pick the second branch when the first one is null
        return fTypes.get(0) == null ? fTypes.get(1) : fTypes.get(0);
      case ARRAY:
        String elementPath = currentFieldPath + InternalSchema.ARRAY_ELEMENT + ".";
        Schema elementSchema = schema.getElementType();
        // The element id is taken before descending into the element type
        int elementId = nextId.get();
        nextId.set(elementId + 1);
        Type elementType = visitAvroSchemaToBuildType(elementSchema, visited, elementPath, nextId, existingNameToPosition);
        return Types.ArrayType.get(elementId, AvroInternalSchemaConverter.isOptional(schema.getElementType()), elementType);
      case MAP:
        // Key and value receive two consecutive ids before descending into the value type
        int keyId = nextId.get();
        int valueId = keyId + 1;
        nextId.set(valueId + 1);
        String valuePath = currentFieldPath + InternalSchema.MAP_VALUE + ".";
        Type valueType = visitAvroSchemaToBuildType(schema.getValueType(), visited, valuePath, nextId, existingNameToPosition);
        return Types.MapType.get(keyId, valueId, Types.StringType.get(), valueType, AvroInternalSchemaConverter.isOptional(schema.getValueType()));
      default:
        return visitAvroPrimitiveToBuildInternalType(schema);
    }
  }

  /**
   * Converts a non-nested Avro schema into the matching hudi type.
   *
   * <p>Logical types are examined first (decimal, date, time, timestamp, uuid); a schema
   * without a recognized logical type is mapped according to its plain Avro type.
   *
   * @param primitive a non-record, non-union, non-array, non-map Avro schema.
   * @return the matching hudi type; null for the Avro NULL type.
   * @throws UnsupportedOperationException if the Avro type is not supported.
   */
  private static Type visitAvroPrimitiveToBuildInternalType(Schema primitive) {
    final LogicalType logicalType = primitive.getLogicalType();
    if (logicalType != null) {
      if (logicalType instanceof LogicalTypes.Decimal) {
        LogicalTypes.Decimal decimal = (LogicalTypes.Decimal) logicalType;
        return Types.DecimalType.get(decimal.getPrecision(), decimal.getScale());
      }
      if (logicalType instanceof LogicalTypes.Date) {
        return Types.DateType.get();
      }
      // Both time precisions map to the same internal type
      if (logicalType instanceof LogicalTypes.TimeMillis || logicalType instanceof LogicalTypes.TimeMicros) {
        return Types.TimeType.get();
      }
      // Both timestamp precisions map to the same internal type
      if (logicalType instanceof LogicalTypes.TimestampMillis || logicalType instanceof LogicalTypes.TimestampMicros) {
        return Types.TimestampType.get();
      }
      if (LogicalTypes.uuid().getName().equals(logicalType.getName())) {
        return Types.UUIDType.get();
      }
      // Unrecognized logical types fall through to the plain type mapping below
    }

    switch (primitive.getType()) {
      case BOOLEAN:
        return Types.BooleanType.get();
      case INT:
        return Types.IntType.get();
      case LONG:
        return Types.LongType.get();
      case FLOAT:
        return Types.FloatType.get();
      case DOUBLE:
        return Types.DoubleType.get();
      case ENUM:
      case STRING:
        // Enums are represented as plain strings
        return Types.StringType.get();
      case FIXED:
        return Types.FixedType.getFixed(primitive.getFixedSize());
      case BYTES:
        return Types.BinaryType.get();
      case NULL:
        return null;
      default:
        throw new UnsupportedOperationException("Unsupported primitive type: " + primitive);
    }
  }

  /**
   * Builds an Avro schema out of a hudi type, starting with an empty conversion cache.
   *
   * @param type a hudi type.
   * @param recordName the record name.
   * @return an Avro schema matching this type.
   */
  public static Schema buildAvroSchemaFromType(Type type, String recordName) {
    return visitInternalSchemaToBuildAvroSchema(type, new HashMap<>(), recordName);
  }

  /**
   * Builds an Avro schema out of the record of a hudi internal schema, starting with an
   * empty conversion cache.
   *
   * @param schema a hudi internal schema.
   * @param recordName the record name.
   * @return an Avro schema matching the hudi internal schema.
   */
  public static Schema buildAvroSchemaFromInternalSchema(InternalSchema schema, String recordName) {
    return visitInternalSchemaToBuildAvroSchema(schema.getRecord(), new HashMap<>(), recordName);
  }

  /**
   * Converts hudi type into an Avro Schema.
   *
   * <p>Nested types are converted children-first. For records, arrays and maps the cache is
   * consulted after the children have been converted, and a previously built schema is
   * returned if present. Primitive schemas are always rebuilt and stored in the cache.
   *
   * @param type a hudi type.
   * @param cache used to cache intermediate conversion results.
   * @param recordName auto-generated record name used as a fallback, in case
   * {@link org.apache.hudi.internal.schema.Types.RecordType} doesn't bear original record-name
   * @return an Avro schema matching this type
   */
  private static Schema visitInternalSchemaToBuildAvroSchema(Type type, Map<Type, Schema> cache, String recordName) {
    switch (type.typeId()) {
      case RECORD:
        Types.RecordType record = (Types.RecordType) type;
        List<Schema> schemas = new ArrayList<>();
        record.fields().forEach(f -> {
          // Nested names are qualified with the enclosing record name
          String nestedRecordName = recordName + AVRO_NAME_DELIMITER + f.name();
          Schema tempSchema = visitInternalSchemaToBuildAvroSchema(f.type(), cache, nestedRecordName);
          // Optional fields are wrapped into a nullable union
          Schema result = f.isOptional() ? AvroInternalSchemaConverter.nullableSchema(tempSchema) : tempSchema;
          schemas.add(result);
        });
        // Reuse the schema if this record type has been converted before
        Schema recordSchema;
        recordSchema = cache.get(record);
        if (recordSchema != null) {
          return recordSchema;
        }
        recordSchema = visitInternalRecordToBuildAvroRecord(record, schemas, recordName);
        cache.put(record, recordSchema);
        return recordSchema;
      case ARRAY:
        Types.ArrayType array = (Types.ArrayType) type;
        Schema elementSchema;
        elementSchema = visitInternalSchemaToBuildAvroSchema(array.elementType(), cache, recordName);
        Schema arraySchema;
        arraySchema = cache.get(array);
        if (arraySchema != null) {
          return arraySchema;
        }
        arraySchema = visitInternalArrayToBuildAvroArray(array, elementSchema);
        cache.put(array, arraySchema);
        return arraySchema;
      case MAP:
        Types.MapType map = (Types.MapType) type;
        Schema keySchema;
        Schema valueSchema;
        keySchema = visitInternalSchemaToBuildAvroSchema(map.keyType(), cache, recordName);
        valueSchema = visitInternalSchemaToBuildAvroSchema(map.valueType(), cache, recordName);
        Schema mapSchema;
        mapSchema = cache.get(map);
        if (mapSchema != null) {
          return mapSchema;
        }
        mapSchema = visitInternalMapToBuildAvroMap(map, keySchema, valueSchema);
        cache.put(map, mapSchema);
        return mapSchema;
      default:
        Schema primitiveSchema = visitInternalPrimitiveToBuildAvroPrimitiveType((Type.PrimitiveType) type, recordName);
        cache.put(type, primitiveSchema);
        return primitiveSchema;
    }
  }

  /**
   * Converts hudi RecordType to Avro RecordType.
   * This is an auxiliary function used by visitInternalSchemaToBuildAvroSchema.
   *
   * @param recordType the hudi record type.
   * @param fieldSchemas already converted Avro schemas of the record's fields, in field order.
   * @param recordNameFallback record name used when the record type carries no name of its own.
   * @return the Avro record schema.
   */
  private static Schema visitInternalRecordToBuildAvroRecord(Types.RecordType recordType, List<Schema> fieldSchemas, String recordNameFallback) {
    List<Types.Field> fields = recordType.fields();
    List<Schema.Field> avroFields = new ArrayList<>();
    for (int i = 0; i < fields.size(); i++) {
      Types.Field f = fields.get(i);
      // Optional fields default to null; required fields get no default value
      Schema.Field field = new Schema.Field(f.name(), fieldSchemas.get(i), f.doc(), f.isOptional() ? JsonProperties.NULL_VALUE : null);
      avroFields.add(field);
    }
    String recordName = Option.ofNullable(recordType.name()).orElse(recordNameFallback);
    return Schema.createRecord(recordName, null, null, false, avroFields);
  }

  /**
   * Converts hudi ArrayType to Avro ArrayType.
   * This is an auxiliary function used by visitInternalSchemaToBuildAvroSchema.
   *
   * @param array the hudi array type.
   * @param elementSchema already converted Avro schema of the element type.
   * @return an Avro array schema whose element schema is made nullable when the
   *         array's elements are optional.
   */
  private static Schema visitInternalArrayToBuildAvroArray(Types.ArrayType array, Schema elementSchema) {
    Schema avroElement = array.isElementOptional()
        ? AvroInternalSchemaConverter.nullableSchema(elementSchema)
        : elementSchema;
    return Schema.createArray(avroElement);
  }

  /**
   * Converts hudi MapType to Avro MapType.
   * This is an auxiliary function used by visitInternalSchemaToBuildAvroSchema.
   *
   * @param map the hudi map type.
   * @param keySchema already converted Avro schema of the key type; must be of type STRING.
   * @param valueSchema already converted Avro schema of the value type.
   * @return an Avro map schema whose value schema is made nullable when the map's values
   *         are optional.
   * @throws HoodieSchemaException if the key schema is not of type STRING.
   */
  private static Schema visitInternalMapToBuildAvroMap(Types.MapType map, Schema keySchema, Schema valueSchema) {
    if (keySchema.getType() != Schema.Type.STRING) {
      throw new HoodieSchemaException("only support StringType key for avro MapType");
    }
    Schema avroValue = map.isValueOptional()
        ? AvroInternalSchemaConverter.nullableSchema(valueSchema)
        : valueSchema;
    return Schema.createMap(avroValue);
  }

  /**
   * Converts hudi PrimitiveType to Avro PrimitiveType.
   * This is an auxiliary function used by visitInternalSchemaToBuildAvroSchema.
   *
   * @param primitive the hudi primitive type.
   * @param recordName name of the enclosing record; used as the prefix of the names of
   *        generated Avro FIXED schemas.
   * @return the matching Avro schema.
   * @throws UnsupportedOperationException if the type id is not supported.
   */
  private static Schema visitInternalPrimitiveToBuildAvroPrimitiveType(Type.PrimitiveType primitive, String recordName) {
    // NOTE: All schemas corresponding to Avro's type [[FIXED]] are generated
    //       with the "fixed" name to stay compatible w/ [[SchemaConverters]]
    final String fixedName = recordName + AVRO_NAME_DELIMITER + "fixed";

    switch (primitive.typeId()) {
      case BOOLEAN:
        return Schema.create(Schema.Type.BOOLEAN);
      case INT:
        return Schema.create(Schema.Type.INT);
      case LONG:
        return Schema.create(Schema.Type.LONG);
      case FLOAT:
        return Schema.create(Schema.Type.FLOAT);
      case DOUBLE:
        return Schema.create(Schema.Type.DOUBLE);
      case STRING:
        return Schema.create(Schema.Type.STRING);
      case BINARY:
        return Schema.create(Schema.Type.BYTES);

      // Logical types layered on top of INT/LONG
      case DATE:
        return LogicalTypes.date().addToSchema(Schema.create(Schema.Type.INT));
      case TIME:
        return LogicalTypes.timeMicros().addToSchema(Schema.create(Schema.Type.LONG));
      case TIMESTAMP:
        return LogicalTypes.timestampMicros().addToSchema(Schema.create(Schema.Type.LONG));

      // Types backed by an Avro FIXED schema
      case UUID: {
        Schema uuidFixed = Schema.createFixed(fixedName, null, null, 16);
        return LogicalTypes.uuid().addToSchema(uuidFixed);
      }
      case FIXED: {
        Types.FixedType fixedType = (Types.FixedType) primitive;
        return Schema.createFixed(fixedName, null, null, fixedType.getFixedSize());
      }
      case DECIMAL: {
        Types.DecimalType decimalType = (Types.DecimalType) primitive;
        int byteSize = computeMinBytesForPrecision(decimalType.precision());
        Schema decimalFixed = Schema.createFixed(fixedName, null, null, byteSize);
        return LogicalTypes.decimal(decimalType.precision(), decimalType.scale()).addToSchema(decimalFixed);
      }

      default:
        throw new UnsupportedOperationException(
                "Unsupported type ID: " + primitive.typeId());
    }
  }

  /**
   * Returns the minimum number of bytes needed to store a decimal with the given 'precision':
   * the smallest byte count {@code n >= 1} for which {@code 2^(8n - 1)} is not below
   * {@code 10^precision}.
   * Referenced from Spark release 3.1.
   *
   * @param precision number of decimal digits.
   * @return the minimum number of bytes.
   */
  private static int computeMinBytesForPrecision(int precision) {
    final double upperBound = Math.pow(10.0, precision);
    int byteCount = 1;
    for (; Math.pow(2.0, 8 * byteCount - 1) < upperBound; byteCount++) {
      // widen by one byte until the signed range covers the precision
    }
    return byteCount;
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy