com.netease.arctic.shade.org.apache.avro.Resolver Maven / Gradle / Ivy
The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.netease.arctic.shade.org.apache.avro;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.netease.arctic.shade.org.apache.avro.generic.GenericData;
import com.netease.arctic.shade.org.apache.avro.Schema.Field;
import com.netease.arctic.shade.org.apache.avro.Schema.SeenPair;
import com.netease.arctic.shade.org.apache.avro.Resolver.ErrorAction.ErrorType;
/**
* Encapsulate schema-resolution logic in an easy-to-consume representation. See
* {@link #resolve} and also the separate document entitled
* refactoring-resolution for more information. It might also be
* helpful to study {@link com.netease.arctic.shade.org.apache.avro.io.parsing.ResolvingGrammarGenerator}
* as an example of how to use this class.
*/
public class Resolver {
/**
* Returns a {@link Resolver.Action} tree for resolving the writer schema
* writer and the reader schema reader.
*
* This method walks the reader's and writer's schemas together, generating an
* appropriate subclass of {@link Action} to encapsulate the information needed
* to resolve the corresponding parts of each schema tree. For convenience,
* every {@link Action} object has a pointer to the corresponding parts of the
* reader's and writer's trees being resolved by the action. Each subclass of
* {@link Action} has additional information needed for different types of
* schema, e.g., the {@link EnumAdjust} subclass has information about
* re-ordering and deletion of enumeration symbols, while {@link RecordAdjust}
* has information about re-ordering and deletion of record fields.
*
* Note that aliases are applied to the writer's schema before resolution
* actually takes place. This means that the writer field of the
* resulting {@link Action} objects will not be the same schema as provided to
* this method. However, the reader field will be.
*
* @param writer The schema used by the writer
* @param reader The schema used by the reader
* @param data Used for getDefaultValue and getting conversions
* @return Nested actions for resolving the two
*/
public static Action resolve(Schema writer, Schema reader, GenericData data) {
return resolve(Schema.applyAliases(writer, reader), reader, data, new HashMap<>());
}
/**
* Uses GenericData.get() for the data param.
*/
public static Action resolve(Schema writer, Schema reader) {
return resolve(writer, reader, GenericData.get());
}
private static Action resolve(Schema w, Schema r, GenericData d, Map seen) {
final Schema.Type wType = w.getType();
final Schema.Type rType = r.getType();
if (wType == Schema.Type.UNION) {
return WriterUnion.resolve(w, r, d, seen);
}
if (wType == rType) {
switch (wType) {
case NULL:
case BOOLEAN:
case INT:
case LONG:
case FLOAT:
case DOUBLE:
case STRING:
case BYTES:
return new DoNothing(w, r, d);
case FIXED:
if (w.getFullName() != null && !w.getFullName().equals(r.getFullName())) {
return new ErrorAction(w, r, d, ErrorType.NAMES_DONT_MATCH);
} else if (w.getFixedSize() != r.getFixedSize()) {
return new ErrorAction(w, r, d, ErrorType.SIZES_DONT_MATCH);
} else {
return new DoNothing(w, r, d);
}
case ARRAY:
Action et = resolve(w.getElementType(), r.getElementType(), d, seen);
return new Container(w, r, d, et);
case MAP:
Action vt = resolve(w.getValueType(), r.getValueType(), d, seen);
return new Container(w, r, d, vt);
case ENUM:
return EnumAdjust.resolve(w, r, d);
case RECORD:
return RecordAdjust.resolve(w, r, d, seen);
default:
throw new IllegalArgumentException("Unknown type for schema: " + wType);
}
} else if (rType == Schema.Type.UNION) {
return ReaderUnion.resolve(w, r, d, seen);
} else {
return Promote.resolve(w, r, d);
}
}
/**
* An abstract class for an action to be taken to resolve a writer's schema
* (found in public instance variable writer) against a reader's schema
* (in reader). Ordinarily, neither field can be null, except
* that the reader field can be null in a {@link Skip}, which
* is used to skip a field in a writer's record that doesn't exist in the
* reader's (and thus there is no reader schema to resolve to).
*/
public static abstract class Action {
/** Helps us traverse faster. */
public enum Type {
DO_NOTHING, ERROR, PROMOTE, CONTAINER, ENUM, SKIP, RECORD, WRITER_UNION, READER_UNION
}
public final Schema writer, reader;
public final Type type;
/**
* If the reader has a logical type, it's stored here for fast access, otherwise
* this will be null.
*/
public final LogicalType logicalType;
/**
* If the reader has a conversion that needs to be applied, it's stored here for
* fast access, otherwise this will be null.
*/
public final Conversion> conversion;
protected Action(Schema w, Schema r, GenericData data, Type t) {
this.writer = w;
this.reader = r;
this.type = t;
if (r == null) {
this.logicalType = null;
this.conversion = null;
} else {
this.logicalType = r.getLogicalType();
this.conversion = data.getConversionFor(logicalType);
}
}
}
/**
* In this case, there's nothing to be done for resolution: the two schemas are
* effectively the same. This action will be generated only for
* primitive types and fixed types, and not for any other kind of schema.
*/
public static class DoNothing extends Action {
public DoNothing(Schema w, Schema r, GenericData d) {
super(w, r, d, Action.Type.DO_NOTHING);
}
}
/**
* In this case there is an error. We put error Actions into trees because Avro
* reports these errors in a lazy fashion: if a particular input doesn't
* "tickle" the error (typically because it's in a branch of a union that isn't
* found in the data being read), then it's safe to ignore it.
*/
public static class ErrorAction extends Action {
public enum ErrorType {
/**
* Use when Schema types don't match and can't be converted. For example,
* resolving "int" and "enum".
*/
INCOMPATIBLE_SCHEMA_TYPES,
/**
* Use when Schema types match but, in the case of record, enum, or fixed, the
* names don't match.
*/
NAMES_DONT_MATCH,
/**
* Use when two fixed types match and their names match by their sizes don't.
*/
SIZES_DONT_MATCH,
/**
* Use when matching two records and the reader has a field with no default
* value and that field is missing in the writer..
*/
MISSING_REQUIRED_FIELD,
/**
* Use when matching a reader's union against a non-union and can't find a
* branch that matches.
*/
NO_MATCHING_BRANCH
}
public final ErrorType error;
public ErrorAction(Schema w, Schema r, GenericData d, ErrorType e) {
super(w, r, d, Action.Type.ERROR);
this.error = e;
}
@Override
public String toString() {
switch (this.error) {
case INCOMPATIBLE_SCHEMA_TYPES:
case NAMES_DONT_MATCH:
case SIZES_DONT_MATCH:
case NO_MATCHING_BRANCH:
return "Found " + writer.getFullName() + ", expecting " + reader.getFullName();
case MISSING_REQUIRED_FIELD: {
final List rfields = reader.getFields();
String fname = "";
for (Field rf : rfields) {
if (writer.getField(rf.name()) == null && rf.defaultValue() == null) {
fname = rf.name();
}
}
return ("Found " + writer.getFullName() + ", expecting " + reader.getFullName() + ", missing required field "
+ fname);
}
default:
throw new IllegalArgumentException("Unknown error.");
}
}
}
/**
* In this case, the writer's type needs to be promoted to the reader's. These
* are constructed by {@link Promote#resolve}, which will only construct one
* when the writer's and reader's schemas are different (ie, no "self
* promotion"), and whent the promotion is one allowed by the Avro spec.
*/
public static class Promote extends Action {
private Promote(Schema w, Schema r, GenericData d) {
super(w, r, d, Action.Type.PROMOTE);
}
/**
* Return a promotion.
*
* @param w Writer's schema
* @param r Rearder's schema
* @return a {@link Promote} schema if the two schemas are compatible, or
* {@link ErrorType#INCOMPATIBLE_SCHEMA_TYPES} if they are not.
* @throws IllegalArgumentException if getType() of the two schemas are
* not different.
*/
public static Action resolve(Schema w, Schema r, GenericData d) {
if (isValid(w, r)) {
return new Promote(w, r, d);
} else {
return new ErrorAction(w, r, d, ErrorType.INCOMPATIBLE_SCHEMA_TYPES);
}
}
/**
* Returns true iff w and r are both primitive types and
* either they are the same type or w is promotable to r.
* Should
*/
public static boolean isValid(Schema w, Schema r) {
if (w.getType() == r.getType())
throw new IllegalArgumentException("Only use when reader and writer are different.");
Schema.Type wt = w.getType();
switch (r.getType()) {
case INT:
switch (wt) {
case INT:
return true;
}
break;
case LONG:
switch (wt) {
case INT:
case LONG:
return true;
}
break;
case FLOAT:
switch (wt) {
case INT:
case LONG:
case FLOAT:
return true;
}
break;
case DOUBLE:
switch (wt) {
case INT:
case LONG:
case FLOAT:
case DOUBLE:
return true;
}
break;
case BYTES:
case STRING:
switch (wt) {
case STRING:
case BYTES:
return true;
}
break;
}
return false;
}
}
/**
* Used for array and map schemas: the public instance variable
* elementAction contains the resolving action needed for the element
* type of an array or value top of a map.
*/
public static class Container extends Action {
public final Action elementAction;
public Container(Schema w, Schema r, GenericData d, Action e) {
super(w, r, d, Action.Type.CONTAINER);
this.elementAction = e;
}
}
/**
* Contains information needed to resolve enumerations. When resolving enums,
* adjustments need to be made in two scenarios: the index for an enum symbol
* might be different in the reader or writer, or the reader might not have a
* symbol that was written out for the writer (which is an error, but one we can
* only detect when decoding data).
*
* These adjustments are reflected in the instance variable
* adjustments. For the symbol with index i in the writer's
* enum definition, adjustments[i] -- and integer -- contains the
* adjustment for that symbol. If the integer is positive, then reader also has
* the symbol and the integer is its index in the reader's schema. If
* adjustment[i] is negative, then the reader does not have
* the corresponding symbol (which is the error case).
*
* Sometimes there's no adjustments needed: all symbols in the reader have the
* same index in the reader's and writer's schema. This is a common case, and it
* allows for some optimization. To signal that this is the case,
* noAdjustmentsNeeded is set to true.
*/
public static class EnumAdjust extends Action {
public final int[] adjustments;
public final Object[] values;
public final boolean noAdjustmentsNeeded;
private EnumAdjust(Schema w, Schema r, GenericData d, int[] adj, Object[] values) {
super(w, r, d, Action.Type.ENUM);
this.adjustments = adj;
boolean noAdj;
int rsymCount = r.getEnumSymbols().size();
int count = Math.min(rsymCount, adj.length);
noAdj = (adj.length <= rsymCount);
for (int i = 0; noAdj && i < count; i++) {
noAdj &= (i == adj[i]);
}
this.noAdjustmentsNeeded = noAdj;
this.values = values;
}
/**
* If writer and reader don't have same name, a
* {@link ErrorAction.ErrorType#NAMES_DONT_MATCH} is returned, otherwise an
* appropriate {@link EnumAdjust} is.
*/
public static Action resolve(Schema w, Schema r, GenericData d) {
if (w.getFullName() != null && !w.getFullName().equals(r.getFullName()))
return new ErrorAction(w, r, d, ErrorType.NAMES_DONT_MATCH);
final List wsymbols = w.getEnumSymbols();
final List rsymbols = r.getEnumSymbols();
final int defaultIndex = (r.getEnumDefault() == null ? -1 : rsymbols.indexOf(r.getEnumDefault()));
int[] adjustments = new int[wsymbols.size()];
Object[] values = new Object[wsymbols.size()];
Object defaultValue = (defaultIndex == -1) ? null : d.createEnum(r.getEnumDefault(), r);
for (int i = 0; i < adjustments.length; i++) {
int j = rsymbols.indexOf(wsymbols.get(i));
if (j < 0) {
j = defaultIndex;
}
adjustments[i] = j;
values[i] = (j == defaultIndex) ? defaultValue : d.createEnum(rsymbols.get(j), r);
}
return new EnumAdjust(w, r, d, adjustments, values);
}
}
/**
* This only appears inside {@link RecordAdjust#fieldActions}, i.e., the actions
* for adjusting the fields of a record. This action indicates that the writer's
* schema has a field that the reader's does not have, and thus the
* field should be skipped. Since there is no corresponding reader's schema for
* the writer's in this case, the {@link Action#reader} field is null
* for this subclass.
*/
public static class Skip extends Action {
public Skip(Schema w, GenericData d) {
super(w, null, d, Action.Type.SKIP);
}
}
/**
* Instructions for resolving two record schemas. Includes instructions on how
* to recursively resolve each field, an indication of when to skip (writer
* fields), plus information about which reader fields should be populated by
* defaults (because the writer doesn't have corresponding fields).
*/
public static class RecordAdjust extends Action {
/**
* An action for each field of the writer. If the corresponding field is to be
* skipped during reading, then this will contain a {@link Skip}. For fields to
* be read into the reading datum, will contain a regular action for resolving
* the writer/reader schemas of the matching fields.
*/
public final Action[] fieldActions;
/**
* Contains (all of) the reader's fields. The first n of these are the
* fields that will be read from the writer: these n are in the order
* dictated by writer's schema. The remaining m fields will be read from
* default values (actions for these default values are found in
* {@link RecordAdjust#defaults}.
*/
public final Field[] readerOrder;
/**
* Pointer into {@link RecordAdjust#readerOrder} of the first reader field whose
* value comes from a default value. Set to length of
* {@link RecordAdjust#readerOrder} if there are none.
*/
public final int firstDefault;
/**
* Contains the default values to be used for the last
* readerOrder.length-firstDefault fields in rearderOrder. The
* ith element of defaults is the default value for the
* i+firstDefault member of readerOrder.
*/
public final Object[] defaults;
/**
* Supplier that offers an optimized alternative to data.newRecord()
*/
public final GenericData.InstanceSupplier instanceSupplier;
/**
* Returns true iff i == readerOrder[i].pos()
for all
* indices i
. Which is to say: the order of the reader's fields is
* the same in both the reader's and writer's schema.
*/
public boolean noReorder() {
boolean result = true;
for (int i = 0; result && i < readerOrder.length; i++) {
result &= (i == readerOrder[i].pos());
}
return result;
}
private RecordAdjust(Schema w, Schema r, GenericData d, Action[] fa, Field[] ro, int firstD, Object[] defaults) {
super(w, r, d, Action.Type.RECORD);
this.fieldActions = fa;
this.readerOrder = ro;
this.firstDefault = firstD;
this.defaults = defaults;
this.instanceSupplier = d.getNewRecordSupplier(r);
}
/**
* Returns a {@link RecordAdjust} for the two schemas, or an {@link ErrorAction}
* if there was a problem resolving. An {@link ErrorAction} is returned when
* either the two record-schemas don't have the same name, or if the writer is
* missing a field for which the reader does not have a default value.
*
* @throws RuntimeException if writer and reader schemas are not both records
*/
static Action resolve(Schema writeSchema, Schema readSchema, GenericData data, Map seen) {
final SeenPair writeReadPair = new SeenPair(writeSchema, readSchema);
Action result = seen.get(writeReadPair);
if (result != null) {
return result;
}
/*
* Current implementation doesn't do this check. To pass regressions tests, we
* can't either. if (w.getFullName() != null && !
* w.getFullName().equals(r.getFullName())) { result = new ErrorAction(w, r, d,
* ErrorType.NAMES_DONT_MATCH); seen.put(wr, result); return result; }
*/
final List writeFields = writeSchema.getFields();
final List readFields = readSchema.getFields();
int firstDefault = 0;
for (Schema.Field writeField : writeFields) {
// The writeFields that are also in the readschema
if (readSchema.getField(writeField.name()) != null) {
++firstDefault;
}
}
final Action[] actions = new Action[writeFields.size()];
final Field[] reordered = new Field[readFields.size()];
final Object[] defaults = new Object[reordered.length - firstDefault];
result = new RecordAdjust(writeSchema, readSchema, data, actions, reordered, firstDefault, defaults);
seen.put(writeReadPair, result); // Insert early to handle recursion
int i = 0;
int ridx = 0;
for (Field writeField : writeFields) {
final Field readField = readSchema.getField(writeField.name());
if (readField != null) {
reordered[ridx++] = readField;
actions[i++] = Resolver.resolve(writeField.schema(), readField.schema(), data, seen);
} else {
actions[i++] = new Skip(writeField.schema(), data);
}
}
for (Field readField : readFields) {
// The field is not in the writeSchema, so we can never read it
// Use the default value, or throw an error otherwise
final Field writeField = writeSchema.getField(readField.name());
if (writeField == null) {
if (readField.defaultValue() == null) {
result = new ErrorAction(writeSchema, readSchema, data, ErrorType.MISSING_REQUIRED_FIELD);
seen.put(writeReadPair, result);
return result;
} else {
defaults[ridx - firstDefault] = data.getDefaultValue(readField);
reordered[ridx++] = readField;
}
}
}
return result;
}
}
/**
* In this case, the writer was a union. There are two subcases here:
*
* If the reader and writer are the same union, then the unionEquiv
* variable is set to true and the actions list holds the resolutions
* of each branch of the writer against the corresponding branch of the reader
* (which will result in no material resolution work, because the branches will
* be equivalent). If they reader is not a union or is a different union, then
* unionEquiv is false and the actions list holds the
* resolution of each of the writer's branches against the entire schema of the
* reader (if the reader is a union, that will result in ReaderUnion actions).
*/
public static class WriterUnion extends Action {
public final Action[] actions;
public final boolean unionEquiv;
private WriterUnion(Schema w, Schema r, GenericData d, boolean ue, Action[] a) {
super(w, r, d, Action.Type.WRITER_UNION);
unionEquiv = ue;
actions = a;
}
public static Action resolve(Schema writeSchema, Schema readSchema, GenericData data, Map seen) {
boolean unionEquivalent = unionEquiv(writeSchema, readSchema, new HashMap<>());
final List writeTypes = writeSchema.getTypes();
final List readTypes = (unionEquivalent ? readSchema.getTypes() : null);
int writeTypeLength = writeTypes.size();
final Action[] actions = new Action[writeTypeLength];
for (int i = 0; i < writeTypeLength; i++) {
actions[i] = Resolver.resolve(writeTypes.get(i), (unionEquivalent ? readTypes.get(i) : readSchema), data, seen);
}
return new WriterUnion(writeSchema, readSchema, data, unionEquivalent, actions);
}
}
/**
* In this case, the reader is a union and the writer is not. For this case, we
* need to pick the first branch of the reader that matches the writer and
* pretend to the reader that the index of this branch was found in the writer's
* data stream.
*
* To support this case, the {@link ReaderUnion} object has two (public) fields:
* firstMatch gives the index of the first matching branch in the
* reader's schema, and actualResolution is the {@link Action} that
* resolves the writer's schema with the schema found in the firstMatch
* branch of the reader's schema.
*/
public static class ReaderUnion extends Action {
public final int firstMatch;
public final Action actualAction;
public ReaderUnion(Schema w, Schema r, GenericData d, int firstMatch, Action actual) {
super(w, r, d, Action.Type.READER_UNION);
this.firstMatch = firstMatch;
this.actualAction = actual;
}
/**
* Returns a {@link ReaderUnion} action for resolving w and r,
* or an {@link ErrorAction} if there is no branch in the reader that matches
* the writer.
*
* @throws RuntimeException if r is not a union schema or w
* is a union schema
*/
public static Action resolve(Schema w, Schema r, GenericData d, Map seen) {
if (w.getType() == Schema.Type.UNION) {
throw new IllegalArgumentException("Writer schema is union.");
}
int i = firstMatchingBranch(w, r, d, seen);
if (0 <= i) {
return new ReaderUnion(w, r, d, i, Resolver.resolve(w, r.getTypes().get(i), d, seen));
}
return new ErrorAction(w, r, d, ErrorType.NO_MATCHING_BRANCH);
}
// Note: This code was taken verbatim from the 1.8.x branch of Avro. It
// implements
// a "soft match" algorithm that seems to disagree with the spec. However, in
// the
// interest of "bug-for-bug" compatibility, we imported the old algorithm.
private static int firstMatchingBranch(Schema w, Schema r, GenericData d, Map seen) {
final Schema.Type vt = w.getType();
// first scan for exact match
int j = 0;
int structureMatch = -1;
for (Schema b : r.getTypes()) {
if (vt == b.getType()) {
if (vt == Schema.Type.RECORD || vt == Schema.Type.ENUM || vt == Schema.Type.FIXED) {
final String vname = w.getFullName();
final String bname = b.getFullName();
// return immediately if the name matches exactly according to spec
if (vname != null && vname.equals(bname))
return j;
if (vt == Schema.Type.RECORD && !hasMatchError(RecordAdjust.resolve(w, b, d, seen))) {
final String vShortName = w.getName();
final String bShortName = b.getName();
// use the first structure match or one where the name matches
if ((structureMatch < 0) || (vShortName != null && vShortName.equals(bShortName))) {
structureMatch = j;
}
}
} else {
return j;
}
}
j++;
}
// if there is a record structure match, return it
if (structureMatch >= 0) {
return structureMatch;
}
// then scan match via numeric promotion
j = 0;
for (Schema b : r.getTypes()) {
switch (vt) {
case INT:
switch (b.getType()) {
case LONG:
case DOUBLE:
case FLOAT:
return j;
}
break;
case LONG:
switch (b.getType()) {
case DOUBLE:
case FLOAT:
return j;
}
break;
case FLOAT:
switch (b.getType()) {
case DOUBLE:
return j;
}
break;
case STRING:
switch (b.getType()) {
case BYTES:
return j;
}
break;
case BYTES:
switch (b.getType()) {
case STRING:
return j;
}
break;
}
j++;
}
return -1;
}
private static boolean hasMatchError(Action action) {
if (action instanceof ErrorAction)
return true;
else
for (Action a : ((RecordAdjust) action).fieldActions) {
if (a instanceof ErrorAction) {
return true;
}
}
return false;
}
}
private static boolean unionEquiv(Schema write, Schema read, Map seen) {
final Schema.Type wt = write.getType();
if (wt != read.getType()) {
return false;
}
// Previously, the spec was somewhat ambiguous as to whether getFullName or
// getName should be used here. Using name rather than fully qualified name
// maintains backwards compatibility.
if ((wt == Schema.Type.RECORD || wt == Schema.Type.FIXED || wt == Schema.Type.ENUM)
&& !(write.getName() == null || write.getName().equals(read.getName()))) {
return false;
}
switch (wt) {
case NULL:
case BOOLEAN:
case INT:
case LONG:
case FLOAT:
case DOUBLE:
case STRING:
case BYTES:
return true;
case ARRAY:
return unionEquiv(write.getElementType(), read.getElementType(), seen);
case MAP:
return unionEquiv(write.getValueType(), read.getValueType(), seen);
case FIXED:
return write.getFixedSize() == read.getFixedSize();
case ENUM: {
final List ws = write.getEnumSymbols();
final List rs = read.getEnumSymbols();
return ws.equals(rs);
}
case UNION: {
final List wb = write.getTypes();
final List rb = read.getTypes();
if (wb.size() != rb.size()) {
return false;
}
for (int i = 0; i < wb.size(); i++) {
if (!unionEquiv(wb.get(i), rb.get(i), seen)) {
return false;
}
}
return true;
}
case RECORD: {
final SeenPair wsc = new SeenPair(write, read);
if (!seen.containsKey(wsc)) {
seen.put(wsc, true); // Be optimistic, but we may change our minds
final List wb = write.getFields();
final List rb = read.getFields();
if (wb.size() != rb.size()) {
seen.put(wsc, false);
} else {
for (int i = 0; i < wb.size(); i++) {
// Loop through each of the elements, and check if they are equal
if (!wb.get(i).name().equals(rb.get(i).name())
|| !unionEquiv(wb.get(i).schema(), rb.get(i).schema(), seen)) {
seen.put(wsc, false);
break;
}
}
}
}
return seen.get(wsc);
}
default:
throw new IllegalArgumentException("Unknown schema type: " + write.getType());
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy