// org.apache.hudi.avro.AvroSchemaCompatibility (Maven / Gradle / Ivy source listing)
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.avro;
import org.apache.hudi.common.util.Either;
import org.apache.hudi.common.util.Option;
import org.apache.avro.AvroRuntimeException;
import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.avro.Schema.Type;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Deque;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.TreeSet;
import java.util.stream.Collectors;
import static org.apache.hudi.avro.HoodieAvroUtils.isTypeNumeric;
import static org.apache.hudi.common.util.ValidationUtils.checkState;
/**
* Evaluate the compatibility between a reader schema and a writer schema. A
* reader and a writer schema are declared compatible if all datum instances of
* the writer schema can be successfully decoded using the specified reader
* schema.
*
* NOTE: PLEASE READ CAREFULLY BEFORE CHANGING
*
* This code is borrowed from Avro 1.10, with the following modifications:
*
* - Compatibility checks ignore schema name, unless schema is held inside
* a union
*
*/
public class AvroSchemaCompatibility {
// Logger shared with the nested checker, which traces every reader/writer pair at debug level.
private static final Logger LOG = LoggerFactory.getLogger(AvroSchemaCompatibility.class);
/**
* Utility class cannot be instantiated.
*/
private AvroSchemaCompatibility() {
}
/**
* Message to annotate reader/writer schema pairs that are compatible.
* Used by {@code checkReaderWriterCompatibility} as the description of a COMPATIBLE result.
*/
public static final String READER_WRITER_COMPATIBLE_MESSAGE = "Reader schema can always successfully decode data written using the writer schema.";
/**
 * Checks whether avro data written with {@code writer} can be decoded with {@code reader}.
 *
 * @param reader              schema the data will be decoded with.
 * @param writer              schema the data was encoded with.
 * @param checkNamingOverride flag handed to the underlying compatibility checker as its
 *                            {@code checkNaming} setting.
 * @return the reader/writer pair bundled with the compatibility result and a
 *         human-readable description of that result.
 * @throws AvroRuntimeException if the checker reports anything other than
 *                              COMPATIBLE or INCOMPATIBLE.
 */
public static SchemaPairCompatibility checkReaderWriterCompatibility(final Schema reader,
                                                                     final Schema writer,
                                                                     boolean checkNamingOverride) {
  final ReaderWriterCompatibilityChecker checker = new ReaderWriterCompatibilityChecker(checkNamingOverride);
  final SchemaCompatibilityResult compatibility = checker.getCompatibility(reader, writer);

  final String message;
  switch (compatibility.getCompatibility()) {
    case COMPATIBLE:
      message = READER_WRITER_COMPATIBLE_MESSAGE;
      break;
    case INCOMPATIBLE:
      // Pretty-print both schemas so the failure can be diagnosed from the message alone.
      message = String.format(
          "Data encoded using writer schema:%n%s%n" + "will or may fail to decode using reader schema:%n%s%n",
          writer.toString(true), reader.toString(true));
      break;
    default:
      // Any other state (e.g. an unfinished recursion marker) must never reach the caller.
      throw new AvroRuntimeException("Unknown compatibility: " + compatibility);
  }

  return new SchemaPairCompatibility(compatibility, reader, writer, message);
}
// -----------------------------------------------------------------------------------------------
/**
 * Decides whether two named Avro schemas carry matching names.
 * <p>
 * The names match when the simple names are equal, or when the reader lists the
 * writer's full name among its aliases.
 *
 * @param reader Named reader schema.
 * @param writer Named writer schema.
 * @return {@code true} if the names match directly or through a reader alias.
 */
public static boolean schemaNameEquals(final Schema reader, final Schema writer) {
  final boolean sameName = objectsEqual(reader.getName(), writer.getName());
  // Reader aliases are only consulted when the names themselves differ.
  return sameName || reader.getAliases().contains(writer.getFullName());
}
/**
 * Identifies the writer field that corresponds to the specified reader field.
 * <p>
 * A writer field corresponds when it is named like the reader field or like one
 * of the reader field's aliases.
 *
 * @param writerSchema Schema of the record where to look for the writer field.
 * @param readerField  Reader field to identify the corresponding writer field of.
 * @return the writer field, or {@code null} if no writer field corresponds.
 * @throws AvroRuntimeException if the reader field's name and aliases select more
 *                              than one writer field.
 */
public static Field lookupWriterField(final Schema writerSchema, final Field readerField) {
  assert (writerSchema.getType() == Type.RECORD);
  // Typed list: a raw List would hand back Object and could not be returned as a Field.
  final List<Field> writerFields = new ArrayList<>();
  final Field direct = writerSchema.getField(readerField.name());
  if (direct != null) {
    writerFields.add(direct);
  }
  for (final String readerFieldAliasName : readerField.aliases()) {
    final Field writerField = writerSchema.getField(readerFieldAliasName);
    if (writerField != null) {
      writerFields.add(writerField);
    }
  }
  switch (writerFields.size()) {
    case 0:
      return null;
    case 1:
      return writerFields.get(0);
    default:
      // Ambiguous mapping: name and/or aliases matched several writer fields.
      throw new AvroRuntimeException(String.format(
          "Reader record field %s matches multiple fields in writer record schema %s", readerField, writerSchema));
  }
}
/**
 * Key type pairing a reader schema with a writer schema, usable in hash-based maps.
 * <p>
 * Two keys are equal only when they wrap the very same Schema instances; hashing
 * is based on the identity hash codes of those instances rather than on schema content.
 */
private static final class ReaderWriter {
  private final Schema mReader;
  private final Schema mWriter;

  /**
   * Creates a key for the given pair.
   *
   * @param reader Reader schema.
   * @param writer Writer schema.
   */
  public ReaderWriter(final Schema reader, final Schema writer) {
    this.mReader = reader;
    this.mWriter = writer;
  }

  /**
   * Combines the identity hash codes of both schemas.
   */
  @Override
  public int hashCode() {
    final int readerHash = System.identityHashCode(mReader);
    final int writerHash = System.identityHashCode(mWriter);
    return readerHash ^ writerHash;
  }

  /**
   * Equal only to another pair holding the same reader and writer instances.
   */
  @Override
  public boolean equals(Object obj) {
    if (obj instanceof ReaderWriter) {
      final ReaderWriter other = (ReaderWriter) obj;
      // Reference comparison on purpose: schemas are told apart by instance.
      return mReader == other.mReader && mWriter == other.mWriter;
    }
    return false;
  }

  /**
   * Renders both schemas for diagnostics.
   */
  @Override
  public String toString() {
    return String.format("ReaderWriter{reader:%s, writer:%s}", mReader, mWriter);
  }
}
/**
* Determines the compatibility of a reader/writer schema pair.
* <p>
* Provides memoization to handle recursive schemas.
*/
private static final class ReaderWriterCompatibilityChecker {
// Looks up the default value of a reader field that has no writer counterpart.
private final AvroDefaultValueAccessor defaultValueAccessor = new AvroDefaultValueAccessor();
// Result per reader/writer pair; also holds the "in progress" marker that breaks recursion.
// Typed so that lookups yield SchemaCompatibilityResult instead of Object.
private final Map<ReaderWriter, SchemaCompatibilityResult> mMemoizeMap = new HashMap<>();
// Naming-check switch supplied by the caller; it is consumed outside this section of the file.
private final boolean checkNaming;

/**
 * Creates a checker with an empty memoization map.
 *
 * @param checkNaming naming-check flag stored for the duration of this checker.
 */
public ReaderWriterCompatibilityChecker(boolean checkNaming) {
  this.checkNaming = checkNaming;
}
/**
 * Reports the compatibility of a reader/writer schema pair.
 * <p>
 * Memoizes the compatibility results.
 *
 * @param reader Reader schema to test.
 * @param writer Writer schema to test.
 * @return the compatibility of the reader/writer schema pair.
 */
public SchemaCompatibilityResult getCompatibility(final Schema reader, final Schema writer) {
  // The location path starts at the root of the reader schema.
  final Deque<LocationInfo> locations = new ArrayDeque<>(
      Collections.singletonList(new LocationInfo(reader.getName(), reader.getType())));
  return getCompatibility(reader, writer, locations);
}
/**
 * Reports the compatibility of a reader/writer schema pair.
 * <p>
 * Memoizes the compatibility results. A pair that is looked up while its own
 * calculation is still running (a recursive schema) is reported as compatible.
 *
 * @param reader    Reader schema to test.
 * @param writer    Writer schema to test.
 * @param locations Stack tracking the path (chain of locations) within the
 *                  schema.
 * @return the compatibility of the reader/writer schema pair.
 */
private SchemaCompatibilityResult getCompatibility(final Schema reader,
                                                   final Schema writer,
                                                   final Deque<LocationInfo> locations) {
  LOG.debug("Checking compatibility of reader {} with writer {}", reader, writer);
  final ReaderWriter pair = new ReaderWriter(reader, writer);
  final SchemaCompatibilityResult memoized = mMemoizeMap.get(pair);

  if (memoized == null) {
    // Mark this reader/writer pair as "in progress" before descending into it.
    mMemoizeMap.put(pair, SchemaCompatibilityResult.recursionInProgress());
    final SchemaCompatibilityResult computed = calculateCompatibility(reader, writer, locations);
    mMemoizeMap.put(pair, computed);
    return computed;
  }

  if (memoized.getCompatibility() == SchemaCompatibilityType.RECURSION_IN_PROGRESS) {
    // Break the recursion here: schemas are compatible unless proven incompatible.
    return SchemaCompatibilityResult.compatible();
  }
  return memoized;
}
/**
 * Renders the location stack as a dot-separated path for use in error messages.
 * <p>
 * A map location is followed by {@code .value} and an array location by
 * {@code .element}, except when that location is the last one and the reader
 * type is itself a map/array. In that case the mismatch is on the field as a
 * whole (the writer is not a map/array), not on the value/element schema.
 *
 * @param locations  path from the schema root to the location being reported.
 * @param readerType type of the reader schema at the reported location.
 * @return the dotted path name.
 */
private static String getLocationName(final Deque<LocationInfo> locations, Type readerType) {
  final StringBuilder sb = new StringBuilder();
  final Iterator<LocationInfo> locationInfoIterator = locations.iterator();
  boolean addDot = false;
  while (locationInfoIterator.hasNext()) {
    if (addDot) {
      sb.append(".");
    } else {
      addDot = true;
    }
    final LocationInfo next = locationInfoIterator.next();
    sb.append(next.name);
    // The reader type only matters for the final location (see the method description).
    final boolean hasMore = locationInfoIterator.hasNext();
    if (next.type == Type.MAP) {
      if (hasMore || readerType != Type.MAP) {
        sb.append(".value");
      }
    } else if (next.type == Type.ARRAY) {
      if (hasMore || readerType != Type.ARRAY) {
        sb.append(".element");
      }
    }
  }
  return sb.toString();
}
/**
 * Calculates the compatibility of a reader/writer schema pair.
 * <p>
 * Relies on external memoization performed by
 * {@link #getCompatibility(Schema, Schema, Deque)}.
 * <p>
 * When both schemas have the same type, the nested parts (element, value, fields,
 * symbols, size, union branches) are compared. When the types differ, a writer
 * union is compatible if the reader can decode each of its branches, a reader
 * union is compatible if one of its branches can decode the writer, and
 * otherwise only the type promotions listed in the second switch are accepted.
 *
 * @param reader    Reader schema to test.
 * @param writer    Writer schema to test.
 * @param locations Stack with which to track the location within the schema.
 * @return the compatibility of the reader/writer schema pair.
 * @throws AvroRuntimeException if the reader schema type is not one of the handled types.
 */
private SchemaCompatibilityResult calculateCompatibility(final Schema reader, final Schema writer,
                                                         final Deque<LocationInfo> locations) {
  SchemaCompatibilityResult result = SchemaCompatibilityResult.compatible();
  final Type readerType = reader.getType();
  final Type writerType = writer.getType();

  if (readerType == writerType) {
    switch (readerType) {
      case NULL:
      case BOOLEAN:
      case INT:
      case LONG:
      case FLOAT:
      case DOUBLE:
      case BYTES:
      case STRING:
        // Identical primitive types are always compatible.
        return result;
      case ARRAY:
        return result.mergedWith(getCompatibility(reader.getElementType(), writer.getElementType(), locations));
      case MAP:
        return result.mergedWith(getCompatibility(reader.getValueType(), writer.getValueType(), locations));
      case FIXED:
        result = result.mergedWith(checkSchemaNames(reader, writer, locations));
        return result.mergedWith(checkFixedSize(reader, writer, locations));
      case ENUM:
        result = result.mergedWith(checkSchemaNames(reader, writer, locations));
        return result.mergedWith(checkReaderEnumContainsAllWriterEnumSymbols(reader, writer, locations));
      case RECORD:
        result = result.mergedWith(checkSchemaNames(reader, writer, locations));
        return result.mergedWith(checkReaderWriterRecordFields(reader, writer, locations));
      case UNION: {
        // Check that each individual branch of the writer union can be decoded:
        for (final Schema writerBranch : writer.getTypes()) {
          final SchemaCompatibilityResult compatibility = getCompatibility(reader, writerBranch, locations);
          if (compatibility.getCompatibility() == SchemaCompatibilityType.INCOMPATIBLE) {
            final String message = String.format("reader union lacking writer type: %s for field: '%s'",
                writerBranch.getType(), getLocationName(locations, readerType));
            result = result.mergedWith(SchemaCompatibilityResult.incompatible(
                SchemaIncompatibilityType.MISSING_UNION_BRANCH, reader, writer, message, asList(locations)));
          }
        }
        // Compatible only if every branch of the writer union could be decoded:
        return result;
      }
      default:
        throw new AvroRuntimeException("Unknown schema type: " + readerType);
    }
  }

  // Reader and writer have different schema types.

  // A reader that is compatible with all branches of a writer union is compatible.
  if (writerType == Type.UNION) {
    for (final Schema writerBranch : writer.getTypes()) {
      result = result.mergedWith(getCompatibility(reader, writerBranch, locations));
    }
    return result;
  }

  switch (readerType) {
    case NULL:
    case BOOLEAN:
    case INT:
    case ARRAY:
    case MAP:
    case FIXED:
    case ENUM:
    case RECORD:
      // These reader types accept no writer type other than their own.
      return result.mergedWith(typeMismatch(reader, writer, locations));
    case LONG:
      return (writerType == Type.INT)
          ? result
          : result.mergedWith(typeMismatch(reader, writer, locations));
    case FLOAT:
      return (writerType == Type.INT || writerType == Type.LONG)
          ? result
          : result.mergedWith(typeMismatch(reader, writer, locations));
    case DOUBLE:
      return (writerType == Type.INT || writerType == Type.LONG || writerType == Type.FLOAT)
          ? result
          : result.mergedWith(typeMismatch(reader, writer, locations));
    case BYTES:
      return (writerType == Type.STRING)
          ? result
          : result.mergedWith(typeMismatch(reader, writer, locations));
    case STRING:
      // A string reader accepts bytes and every writer type that isTypeNumeric reports as numeric.
      return (isTypeNumeric(writerType) || writerType == Type.BYTES)
          ? result
          : result.mergedWith(typeMismatch(reader, writer, locations));
    case UNION: {
      // One compatible reader branch is enough to decode the writer schema.
      for (final Schema readerBranch : reader.getTypes()) {
        final SchemaCompatibilityResult compatibility = getCompatibility(readerBranch, writer, locations);
        if (compatibility.getCompatibility() == SchemaCompatibilityType.COMPATIBLE) {
          return result;
        }
      }
      // No branch in the reader union has been found compatible with the writer schema:
      final String message = String.format("reader union lacking writer type: %s for field: '%s'",
          writerType, getLocationName(locations, readerType));
      return result.mergedWith(SchemaCompatibilityResult
          .incompatible(SchemaIncompatibilityType.MISSING_UNION_BRANCH, reader, writer, message, asList(locations)));
    }
    default:
      throw new AvroRuntimeException("Unknown schema type: " + readerType);
  }
}
/**
 * Checks that every field of the reader record can be populated from the writer record.
 * <p>
 * A reader field with a matching writer field (by name or alias) is checked
 * recursively against that writer field's schema. A reader field without a match
 * must provide a default value; otherwise an incompatibility is recorded.
 *
 * @param reader    Reader record schema.
 * @param writer    Writer record schema.
 * @param locations Stack tracking the path within the schema; each field is pushed
 *                  while it is being checked and popped afterwards.
 * @return the merged compatibility result over all reader fields.
 */
private SchemaCompatibilityResult checkReaderWriterRecordFields(final Schema reader, final Schema writer,
                                                                final Deque<LocationInfo> locations) {
  SchemaCompatibilityResult result = SchemaCompatibilityResult.compatible();
  for (final Field readerField : reader.getFields()) {
    final Field writerField = lookupWriterField(writer, readerField);
    if (writerField != null) {
      // Descend into the field, keeping the location path in step with the recursion.
      locations.addLast(new LocationInfo(readerField.name(), readerField.schema().getType()));
      result = result.mergedWith(getCompatibility(readerField.schema(), writerField.schema(), locations));
      locations.removeLast();
      continue;
    }
    // The reader field does not correspond to any field in the writer record schema,
    // so the reader field must have a default value.
    if (defaultValueAccessor.getDefaultValue(readerField) == null) {
      final String message = String.format("Field '%s.%s' has no default value",
          getLocationName(locations, readerField.schema().getType()), readerField.name());
      result = result.mergedWith(
          SchemaCompatibilityResult.incompatible(SchemaIncompatibilityType.READER_FIELD_MISSING_DEFAULT_VALUE,
              reader, writer, message, asList(locations)));
    }
  }
  return result;
}
private static class AvroDefaultValueAccessor {
// Avro <= 1.8.2
// Result of looking up a member named "defaultValue" through loadMethodNoThrow.
// NOTE(review): declared as raw Option here; presumably Option<Method> given the
// java.lang.reflect.Method import. Confirm against loadMethodNoThrow.
private final Option legacyDefaultValueMethod = loadMethodNoThrow("defaultValue");
// Avro >= 1.10.0
// Result of looking up a member named "defaultVal" through loadMethodNoThrow.
private final Option newDefaultValueMethod = loadMethodNoThrow("defaultVal");
/**
* Returns the default value of the given field, or {@code null} when the lookup chain yields nothing.
*
* The accessor starts from {@code newDefaultValueMethod} combined with
* {@code legacyDefaultValueMethod} via {@code or}. Presumably the newer accessor wins
* when present and the legacy one is the fallback; confirm against {@code Option.or}.
*
* @param field field whose default value is requested.
* @return the value extracted from the invocation result, or {@code null}.
*/
public Object getDefaultValue(Field field) {
return newDefaultValueMethod.or(legacyDefaultValueMethod)
// NOTE(review): asLeft() presumably unwraps the successful invocation result. Verify against invokeMethodNoThrow.
.map(m -> invokeMethodNoThrow(m, field).asLeft())
.orElse(null);
}
private static Either