/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.avro;

import org.apache.hudi.common.util.Option;
import org.apache.hudi.exception.HoodieAvroSchemaException;
import org.apache.hudi.exception.InvalidUnionTypeException;
import org.apache.hudi.exception.MissingSchemaFieldException;
import org.apache.hudi.exception.SchemaBackwardsCompatibilityException;
import org.apache.hudi.exception.SchemaCompatibilityException;

import org.apache.avro.Schema;
import org.apache.avro.SchemaCompatibility;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Deque;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.function.BiFunction;
import java.util.stream.Collectors;

import static org.apache.hudi.common.util.ValidationUtils.checkState;

/**
 * Utils for Avro Schema.
 */
public class AvroSchemaUtils {

  private static final Logger LOG = LoggerFactory.getLogger(AvroSchemaUtils.class);

  private AvroSchemaUtils() {}

  /**
   * See {@link #isSchemaCompatible(Schema, Schema, boolean, boolean)} doc for more details
   */
  public static boolean isSchemaCompatible(Schema prevSchema, Schema newSchema) {
    return isSchemaCompatible(prevSchema, newSchema, true);
  }

  /**
   * See {@link #isSchemaCompatible(Schema, Schema, boolean, boolean)} doc for more details
   */
  public static boolean isSchemaCompatible(Schema prevSchema, Schema newSchema, boolean allowProjection) {
    return isSchemaCompatible(prevSchema, newSchema, true, allowProjection);
  }

  /**
   * Establishes whether {@code newSchema} is compatible w/ {@code prevSchema}, as
   * defined by Avro's {@link AvroSchemaCompatibility}.
 * From Avro's compatibility standpoint, {@code prevSchema} is the writer schema and
 * {@code newSchema} is the reader schema.
 * {@code newSchema} is considered compatible with {@code prevSchema} iff data written
 * using {@code prevSchema} can be read using {@code newSchema}.
   *
   * @param prevSchema previous instance of the schema
   * @param newSchema new instance of the schema
   * @param checkNaming controls whether schemas' fully-qualified names should be checked
   * @param allowProjection controls whether {@code newSchema} is allowed to omit fields present in {@code prevSchema}
   */
  public static boolean isSchemaCompatible(Schema prevSchema, Schema newSchema, boolean checkNaming, boolean allowProjection) {
    // NOTE: We're establishing compatibility of the {@code prevSchema} and {@code newSchema}
    //       as following: {@code newSchema} is considered compatible to {@code prevSchema},
    //       iff data written using {@code prevSchema} could be read by {@code newSchema}

    // In case schema projection is not allowed, new schema has to have all the same fields as the
    // old schema
    if (!allowProjection) {
      if (!canProject(prevSchema, newSchema)) {
        return false;
      }
    }

    AvroSchemaCompatibility.SchemaPairCompatibility result =
        AvroSchemaCompatibility.checkReaderWriterCompatibility(newSchema, prevSchema, checkNaming);
    return result.getType() == AvroSchemaCompatibility.SchemaCompatibilityType.COMPATIBLE;
  }
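
  // Illustrative usage sketch (not part of the upstream source; the schema literals
  // below are hypothetical):
  //
  //   Schema prev = new Schema.Parser().parse(
  //       "{\"type\":\"record\",\"name\":\"rec\",\"fields\":["
  //           + "{\"name\":\"id\",\"type\":\"long\"}]}");
  //   Schema next = new Schema.Parser().parse(
  //       "{\"type\":\"record\",\"name\":\"rec\",\"fields\":["
  //           + "{\"name\":\"id\",\"type\":\"long\"},"
  //           + "{\"name\":\"note\",\"type\":[\"null\",\"string\"],\"default\":null}]}");
  //
  //   // Adding a nullable field with a default is backwards-compatible:
  //   isSchemaCompatible(prev, next);              // true
  //   // With projection disallowed, dropping a field fails the check:
  //   isSchemaCompatible(next, prev, true, false); // false ("note" is missing)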

  /**
   * Check that each field in the prevSchema can be populated in the newSchema
   * @param prevSchema prev schema.
   * @param newSchema new schema
   * @return true if {@code prevSchema} is a projection of {@code newSchema}, i.e. every field of {@code prevSchema} is present in {@code newSchema}.
   */
  public static boolean canProject(Schema prevSchema, Schema newSchema) {
    return findMissingFields(prevSchema, newSchema, Collections.emptySet()).isEmpty();
  }

  /**
   * Check that each top-level field in the prevSchema can be populated in the newSchema, except for the specified columns
   * @param prevSchema prev schema.
   * @param newSchema new schema
   * @param exceptCols column names to exclude from the check
   * @return list of fields from {@code prevSchema} that are missing from {@code newSchema}
   */
  private static List<Schema.Field> findMissingFields(Schema prevSchema, Schema newSchema, Set<String> exceptCols) {
    return prevSchema.getFields().stream()
        .filter(f -> !exceptCols.contains(f.name()))
        .filter(oldSchemaField -> SchemaCompatibility.lookupWriterField(newSchema, oldSchemaField) == null)
        .collect(Collectors.toList());
  }
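
  // Illustrative note (hypothetical schemas): for a prevSchema with fields {id, name}
  // and a newSchema with only {id}, findMissingFields(prevSchema, newSchema,
  // Collections.emptySet()) returns the "name" field, so canProject(prevSchema, newSchema)
  // is false; passing Collections.singleton("name") as exceptCols excludes it from the check.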

  /**
   * Generates a fully-qualified name for the Avro schema based on the table's name
   *
   * NOTE: PLEASE READ CAREFULLY BEFORE CHANGING
   *       This method should not change for compatibility reasons as older versions
   *       of Avro might be comparing fully-qualified names rather than just the record
   *       names
   */
  public static String getAvroRecordQualifiedName(String tableName) {
    String sanitizedTableName = HoodieAvroUtils.sanitizeName(tableName);
    return "hoodie." + sanitizedTableName + "." + sanitizedTableName + "_record";
  }
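
  // For example, getAvroRecordQualifiedName("trips") returns "hoodie.trips.trips_record"
  // (assuming sanitizeName leaves an already-valid Avro name such as "trips" unchanged).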

  /**
   * Validate whether the {@code targetSchema} is a "compatible" projection of {@code sourceSchema}.
   * The only difference between this method and {@link #isStrictProjectionOf(Schema, Schema)} is
   * that it allows some legitimate type promotions (like {@code int -> long},
   * {@code decimal(3, 2) -> decimal(5, 2)}, etc.), allowing the projection to have a "wider"
   * atomic type (whereas a strict projection requires atomic types to be identical)
   */
  public static boolean isCompatibleProjectionOf(Schema sourceSchema, Schema targetSchema) {
    return isProjectionOfInternal(sourceSchema, targetSchema,
        AvroSchemaUtils::isAtomicSchemasCompatible);
  }
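
  // Illustrative sketch (hypothetical schemas): widening an atomic type passes the
  // compatible-projection check but not the strict one.
  //
  //   Schema source = new Schema.Parser().parse(
  //       "{\"type\":\"record\",\"name\":\"rec\",\"fields\":[{\"name\":\"id\",\"type\":\"int\"}]}");
  //   Schema target = new Schema.Parser().parse(
  //       "{\"type\":\"record\",\"name\":\"rec\",\"fields\":[{\"name\":\"id\",\"type\":\"long\"}]}");
  //
  //   isCompatibleProjectionOf(source, target); // true: int -> long is a legal promotion
  //   isStrictProjectionOf(source, target);     // false: atomic types are not identical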

  private static boolean isAtomicSchemasCompatible(Schema oneAtomicType, Schema anotherAtomicType) {
    // NOTE: Checking for compatibility of atomic types, we should ignore their
    //       corresponding fully-qualified names (as irrelevant)
    return isSchemaCompatible(oneAtomicType, anotherAtomicType, false, true);
  }

  /**
   * Validate whether the {@code targetSchema} is a strict projection of {@code sourceSchema}.
   *
   * Schema B is considered a strict projection of schema A iff
   * <ol>
   *   <li>Schemas A and B are equal, or</li>
   *   <li>Schemas A and B are array schemas and the element-type of B is a strict projection
   *   of the element-type of A, or</li>
   *   <li>Schemas A and B are map schemas and the value-type of B is a strict projection
   *   of the value-type of A, or</li>
   *   <li>Schemas A and B are union schemas (of the same size) and every element-type of B
   *   is a strict projection of the corresponding element-type of A, or</li>
   *   <li>Schemas A and B are record schemas and every field of record B has a corresponding
   *   counterpart (w/ the same name) in schema A, such that the schema of the field of schema B
   *   is also a strict projection of the A field's schema</li>
   * </ol>
   */
  public static boolean isStrictProjectionOf(Schema sourceSchema, Schema targetSchema) {
    return isProjectionOfInternal(sourceSchema, targetSchema, Objects::equals);
  }

  private static boolean isProjectionOfInternal(Schema sourceSchema,
                                                Schema targetSchema,
                                                BiFunction<Schema, Schema, Boolean> atomicTypeEqualityPredicate) {
    if (sourceSchema.getType() == targetSchema.getType()) {
      if (sourceSchema.getType() == Schema.Type.RECORD) {
        for (Schema.Field targetField : targetSchema.getFields()) {
          Schema.Field sourceField = sourceSchema.getField(targetField.name());
          if (sourceField == null || !isProjectionOfInternal(sourceField.schema(), targetField.schema(), atomicTypeEqualityPredicate)) {
            return false;
          }
        }
        return true;
      } else if (sourceSchema.getType() == Schema.Type.ARRAY) {
        return isProjectionOfInternal(sourceSchema.getElementType(), targetSchema.getElementType(), atomicTypeEqualityPredicate);
      } else if (sourceSchema.getType() == Schema.Type.MAP) {
        return isProjectionOfInternal(sourceSchema.getValueType(), targetSchema.getValueType(), atomicTypeEqualityPredicate);
      } else if (sourceSchema.getType() == Schema.Type.UNION) {
        List<Schema> sourceNestedSchemas = sourceSchema.getTypes();
        List<Schema> targetNestedSchemas = targetSchema.getTypes();
        if (sourceNestedSchemas.size() != targetNestedSchemas.size()) {
          return false;
        }

        for (int i = 0; i < sourceNestedSchemas.size(); ++i) {
          if (!isProjectionOfInternal(sourceNestedSchemas.get(i), targetNestedSchemas.get(i), atomicTypeEqualityPredicate)) {
            return false;
          }
        }
        return true;
      }
    }

    return atomicTypeEqualityPredicate.apply(sourceSchema, targetSchema);
  }

  public static Option<Schema.Field> findNestedField(Schema schema, String fieldName) {
    return findNestedField(schema, fieldName.split("\\."), 0);
  }

  private static Option<Schema.Field> findNestedField(Schema schema, String[] fieldParts, int index) {
    if (schema.getType().equals(Schema.Type.UNION)) {
      Option<Schema.Field> notUnion = findNestedField(resolveNullableSchema(schema), fieldParts, index);
      if (!notUnion.isPresent()) {
        return Option.empty();
      }
      Schema.Field nu = notUnion.get();
      return Option.of(new Schema.Field(nu.name(), nu.schema(), nu.doc(), nu.defaultVal()));
    }
    if (fieldParts.length <= index) {
      return Option.empty();
    }
    Schema.Field foundField = schema.getField(fieldParts[index]);
    if (foundField == null) {
      return Option.empty();
    }
    if (index == fieldParts.length - 1) {
      return Option.of(new Schema.Field(foundField.name(), foundField.schema(), foundField.doc(), foundField.defaultVal()));
    }
    Schema foundSchema = foundField.schema();
    Option<Schema.Field> nestedPart = findNestedField(foundSchema, fieldParts, index + 1);
    if (!nestedPart.isPresent()) {
      return Option.empty();
    }
    boolean isUnion = false;
    if (foundSchema.getType().equals(Schema.Type.UNION)) {
      isUnion = true;
      foundSchema = resolveNullableSchema(foundSchema);
    }
    Schema newSchema = createNewSchemaFromFieldsWithReference(foundSchema, Collections.singletonList(nestedPart.get()));
    return Option.of(new Schema.Field(foundField.name(), isUnion ? createNullableSchema(newSchema) : newSchema, foundField.doc(), foundField.defaultVal()));
  }

  public static Schema appendFieldsToSchemaDedupNested(Schema schema, List<Schema.Field> newFields) {
    return appendFieldsToSchemaBase(schema, newFields, true);
  }

  public static Schema mergeSchemas(Schema a, Schema b) {
    if (!a.getType().equals(Schema.Type.RECORD)) {
      return a;
    }
    List<Schema.Field> fields = new ArrayList<>();
    for (Schema.Field f : a.getFields()) {
      Schema.Field foundField = b.getField(f.name());
      fields.add(new Schema.Field(f.name(), foundField == null ? f.schema() : mergeSchemas(f.schema(), foundField.schema()),
          f.doc(), f.defaultVal()));
    }

    for (Schema.Field f : b.getFields()) {
      if (a.getField(f.name()) == null) {
        fields.add(new Schema.Field(f.name(), f.schema(), f.doc(), f.defaultVal()));
      }
    }
    return createNewSchemaFromFieldsWithReference(a, fields);
  }
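
  // Illustrative note (not from the upstream source): given two record schemas with
  // fields {id, name} and {id, ts}, mergeSchemas returns a record with fields
  // {id, name, ts}; overlapping record-typed fields are merged recursively, and the
  // record name, namespace and properties are carried over from the first schema.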
  /**
   * Appends provided new fields at the end of the given schema
   *
   * NOTE: No deduplication is made, this method simply appends fields at the end of the list
   *       of the source schema as is
   */
  public static Schema appendFieldsToSchema(Schema schema, List<Schema.Field> newFields) {
    return appendFieldsToSchemaBase(schema, newFields, false);
  }

  private static Schema appendFieldsToSchemaBase(Schema schema, List<Schema.Field> newFields, boolean dedupNested) {
    List<Schema.Field> fields = schema.getFields().stream()
        .map(field -> new Schema.Field(field.name(), field.schema(), field.doc(), field.defaultVal()))
        .collect(Collectors.toList());
    if (dedupNested) {
      for (Schema.Field f : newFields) {
        Schema.Field foundField = schema.getField(f.name());
        if (foundField != null) {
          fields.set(foundField.pos(), new Schema.Field(foundField.name(),
              mergeSchemas(foundField.schema(), f.schema()), foundField.doc(), foundField.defaultVal()));
        } else {
          fields.add(f);
        }
      }
    } else {
      fields.addAll(newFields);
    }
    return createNewSchemaFromFieldsWithReference(schema, fields);
  }

  /**
   * Create a new schema but maintain all meta info from the old schema
   *
   * @param schema schema to get the meta info from
   * @param fields list of fields in order that will be in the new schema
   *
   * @return schema with fields from fields, and metadata from schema
   */
  public static Schema createNewSchemaFromFieldsWithReference(Schema schema, List<Schema.Field> fields) {
    if (schema == null) {
      throw new IllegalArgumentException("Schema must not be null");
    }
    Schema newSchema = Schema.createRecord(schema.getName(), schema.getDoc(), schema.getNamespace(), schema.isError());
    Map<String, Object> schemaProps = Collections.emptyMap();
    try {
      schemaProps = schema.getObjectProps();
    } catch (Exception e) {
      LOG.warn("Error while getting object properties from schema: {}", schema, e);
    }
    for (Map.Entry<String, Object> prop : schemaProps.entrySet()) {
      newSchema.addProp(prop.getKey(), prop.getValue());
    }
    newSchema.setFields(fields);
    return newSchema;
  }

  /**
   * Passed in {@code Union} schema and will try to resolve the field with the {@code fieldSchemaFullName}
   * w/in the union returning its corresponding schema
   *
   * @param schema target schema to be inspected
   * @param fieldSchemaFullName target field-name to be looked up w/in the union
   * @return schema of the field w/in the union identified by the {@code fieldSchemaFullName}
   */
  public static Schema resolveUnionSchema(Schema schema, String fieldSchemaFullName) {
    if (schema.getType() != Schema.Type.UNION) {
      return schema;
    }

    List<Schema> innerTypes = schema.getTypes();

    if (innerTypes.size() == 2 && isNullable(schema)) {
      // this is a basic nullable field so handle it more efficiently
      return resolveNullableSchema(schema);
    }

    Schema nonNullType =
        innerTypes.stream()
            .filter(it -> it.getType() != Schema.Type.NULL && Objects.equals(it.getFullName(), fieldSchemaFullName))
            .findFirst()
            .orElse(null);

    if (nonNullType == null) {
      throw new HoodieAvroSchemaException(
          String.format("Unsupported Avro UNION type %s: Only UNION of a null type and a non-null type is supported", schema));
    }

    return nonNullType;
  }

  /**
   * Returns true in case provided {@link Schema} is nullable (ie accepting null values),
   * returns false otherwise
   */
  public static boolean isNullable(Schema schema) {
    if (schema.getType() != Schema.Type.UNION) {
      return false;
    }

    List<Schema> innerTypes = schema.getTypes();
    return innerTypes.size() > 1 && innerTypes.stream().anyMatch(it -> it.getType() == Schema.Type.NULL);
  }
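
  // Hedged usage note: appendFieldsToSchema simply appends new fields at the tail of
  // the record, whereas appendFieldsToSchemaDedupNested merges a new field into an
  // existing field of the same name (via mergeSchemas) instead of duplicating it,
  // e.g. when adding a hypothetical nested column "address.zip" to a record that
  // already contains an "address" struct.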
  /**
   * Resolves typical Avro's nullable schema definition: {@code Union(Schema.Type.NULL, <NonNullType>)},
   * decomposing union and returning the target non-null type
   */
  public static Schema resolveNullableSchema(Schema schema) {
    if (schema.getType() != Schema.Type.UNION) {
      return schema;
    }

    List<Schema> innerTypes = schema.getTypes();

    if (innerTypes.size() != 2) {
      throw new HoodieAvroSchemaException(
          String.format("Unsupported Avro UNION type %s: Only UNION of a null type and a non-null type is supported", schema));
    }

    Schema firstInnerType = innerTypes.get(0);
    Schema secondInnerType = innerTypes.get(1);
    if ((firstInnerType.getType() != Schema.Type.NULL && secondInnerType.getType() != Schema.Type.NULL)
        || (firstInnerType.getType() == Schema.Type.NULL && secondInnerType.getType() == Schema.Type.NULL)) {
      throw new HoodieAvroSchemaException(
          String.format("Unsupported Avro UNION type %s: Only UNION of a null type and a non-null type is supported", schema));
    }
    return firstInnerType.getType() == Schema.Type.NULL ? secondInnerType : firstInnerType;
  }

  /**
   * Creates schema following Avro's typical nullable schema definition: {@code Union(Schema.Type.NULL, <NonNullType>)},
   * wrapping around provided target non-null type
   */
  public static Schema createNullableSchema(Schema.Type avroType) {
    return createNullableSchema(Schema.create(avroType));
  }

  public static Schema createNullableSchema(Schema schema) {
    checkState(schema.getType() != Schema.Type.NULL);
    return Schema.createUnion(Schema.create(Schema.Type.NULL), schema);
  }

  /**
   * Returns true in case when schema contains the field w/ provided name
   */
  public static boolean containsFieldInSchema(Schema schema, String fieldName) {
    try {
      Schema.Field field = schema.getField(fieldName);
      return field != null;
    } catch (Exception e) {
      return false;
    }
  }
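
  // For example, createNullableSchema(Schema.Type.STRING) yields the union
  // ["null", "string"], and resolveNullableSchema of that union returns the plain
  // "string" schema; a union like ["int", "string"] would throw HoodieAvroSchemaException.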
  /**
   * Checks whether writer schema is compatible with table schema considering {@code AVRO_SCHEMA_VALIDATE_ENABLE}
   * and {@code SCHEMA_ALLOW_AUTO_EVOLUTION_COLUMN_DROP} options.
   * To avoid a collision of {@code SCHEMA_ALLOW_AUTO_EVOLUTION_COLUMN_DROP} and {@code DROP_PARTITION_COLUMNS},
   * partition column names should be passed as {@code dropPartitionColNames}.
   * Passing an empty set means {@code DROP_PARTITION_COLUMNS} is disabled.
   *
   * @param tableSchema the latest dataset schema
   * @param writerSchema writer schema
   * @param shouldValidate whether the {@link AvroSchemaCompatibility} check should be performed
   * @param allowProjection whether the writer schema may omit (project out) columns of the table schema
   * @param dropPartitionColNames partition column names to be excluded from the column-dropping check
   * @throws SchemaCompatibilityException if writer schema is not compatible
   */
  public static void checkSchemaCompatible(
      Schema tableSchema,
      Schema writerSchema,
      boolean shouldValidate,
      boolean allowProjection,
      Set<String> dropPartitionColNames) throws SchemaCompatibilityException {

    if (!allowProjection) {
      List<Schema.Field> missingFields = findMissingFields(tableSchema, writerSchema, dropPartitionColNames);
      if (!missingFields.isEmpty()) {
        throw new MissingSchemaFieldException(missingFields.stream().map(Schema.Field::name).collect(Collectors.toList()),
            writerSchema, tableSchema);
      }
    }

    // TODO(HUDI-4772) re-enable validations in case partition columns
    //                 being dropped from the data-file after fixing the write schema
    if (dropPartitionColNames.isEmpty() && shouldValidate) {
      AvroSchemaCompatibility.SchemaPairCompatibility result =
          AvroSchemaCompatibility.checkReaderWriterCompatibility(writerSchema, tableSchema, true);
      if (result.getType() != AvroSchemaCompatibility.SchemaCompatibilityType.COMPATIBLE) {
        throw new SchemaBackwardsCompatibilityException(result, writerSchema, tableSchema);
      }
    }
  }

  /**
   * Validate whether the {@code incomingSchema} is a valid evolution of {@code tableSchema}.
   *
   * @param incomingSchema schema of the incoming dataset
   * @param tableSchema latest table schema
   */
  public static void checkValidEvolution(Schema incomingSchema, Schema tableSchema) {
    if (incomingSchema.getType() == Schema.Type.NULL) {
      return;
    }

    // not strictly needed for `hoodie.write.set.null.for.missing.columns` but good to check anyway
    List<String> missingFields = new ArrayList<>();
    findAnyMissingFields(incomingSchema, tableSchema, new ArrayDeque<>(), missingFields);
    if (!missingFields.isEmpty()) {
      throw new MissingSchemaFieldException(missingFields, incomingSchema, tableSchema);
    }

    // make sure that the table schema can be read using the incoming schema
    AvroSchemaCompatibility.SchemaPairCompatibility result =
        AvroSchemaCompatibility.checkReaderWriterCompatibility(incomingSchema, tableSchema, false);
    if (result.getType() != AvroSchemaCompatibility.SchemaCompatibilityType.COMPATIBLE) {
      throw new SchemaBackwardsCompatibilityException(result, incomingSchema, tableSchema);
    }
  }
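
  // Hedged usage sketch: a writer-side validation might call
  //
  //   checkSchemaCompatible(tableSchema, writerSchema,
  //       /* shouldValidate */ true, /* allowProjection */ false, Collections.emptySet());
  //
  // which throws MissingSchemaFieldException if writerSchema dropped a table column,
  // and SchemaBackwardsCompatibilityException if data written with tableSchema could
  // not be read using writerSchema.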
  /**
   * Find all fields in the latest table schema that are not in the incoming schema.
   */
  private static void findAnyMissingFields(Schema incomingSchema, Schema latestTableSchema,
                                           Deque<String> visited, List<String> missingFields) {
    findAnyMissingFieldsRec(incomingSchema, latestTableSchema, visited, missingFields, incomingSchema, latestTableSchema);
  }

  /**
   * We want to pass the full schemas so that the error message has the entire schema to print from
   */
  private static void findAnyMissingFieldsRec(Schema incomingSchema, Schema latestTableSchema,
                                              Deque<String> visited, List<String> missingFields,
                                              Schema fullIncomingSchema, Schema fullTableSchema) {
    if (incomingSchema.getType() == latestTableSchema.getType()) {
      if (incomingSchema.getType() == Schema.Type.RECORD) {
        visited.addLast(latestTableSchema.getName());
        for (Schema.Field targetField : latestTableSchema.getFields()) {
          visited.addLast(targetField.name());
          Schema.Field sourceField = incomingSchema.getField(targetField.name());
          if (sourceField == null) {
            missingFields.add(String.join(".", visited));
          } else {
            findAnyMissingFieldsRec(sourceField.schema(), targetField.schema(), visited, missingFields,
                fullIncomingSchema, fullTableSchema);
          }
          visited.removeLast();
        }
        visited.removeLast();
      } else if (incomingSchema.getType() == Schema.Type.ARRAY) {
        visited.addLast("element");
        findAnyMissingFieldsRec(incomingSchema.getElementType(), latestTableSchema.getElementType(),
            visited, missingFields, fullIncomingSchema, fullTableSchema);
        visited.removeLast();
      } else if (incomingSchema.getType() == Schema.Type.MAP) {
        visited.addLast("value");
        findAnyMissingFieldsRec(incomingSchema.getValueType(), latestTableSchema.getValueType(),
            visited, missingFields, fullIncomingSchema, fullTableSchema);
        visited.removeLast();
      } else if (incomingSchema.getType() == Schema.Type.UNION) {
        List<Schema> incomingNestedSchemas = incomingSchema.getTypes();
        List<Schema> latestTableNestedSchemas = latestTableSchema.getTypes();
        if (incomingNestedSchemas.size() != latestTableNestedSchemas.size()) {
          throw new InvalidUnionTypeException(createSchemaErrorString(
              String.format("Incoming batch field '%s' has union with %d types, while the table schema has %d types",
                  String.join(".", visited), incomingNestedSchemas.size(), latestTableNestedSchemas.size()),
              fullIncomingSchema, fullTableSchema));
        }
        if (incomingNestedSchemas.size() > 2) {
          throw new InvalidUnionTypeException(createSchemaErrorString(
              String.format("Union for incoming batch field '%s' should not have more than 2 types but has %d",
                  String.join(".", visited), incomingNestedSchemas.size()),
              fullIncomingSchema, fullTableSchema));
        }
        for (int i = 0; i < incomingNestedSchemas.size(); ++i) {
          findAnyMissingFieldsRec(incomingNestedSchemas.get(i), latestTableNestedSchemas.get(i),
              visited, missingFields, fullIncomingSchema, fullTableSchema);
        }
      }
    }
  }

  public static String createSchemaErrorString(String errorMessage, Schema writerSchema, Schema tableSchema) {
    return String.format("%s\nwriterSchema: %s\ntableSchema: %s", errorMessage, writerSchema, tableSchema);
  }
}



