/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.hadoop.utils;

import org.apache.avro.JsonProperties;
import org.apache.avro.LogicalTypes;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericEnumSymbol;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.specific.SpecificRecordBase;
import org.apache.avro.util.Utf8;
import org.apache.hadoop.hive.common.type.HiveChar;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.common.type.HiveVarchar;
import org.apache.hadoop.hive.serde2.avro.AvroSerdeUtils;
import org.apache.hadoop.hive.serde2.avro.InstanceCache;
import org.apache.hadoop.hive.serde2.io.DateWritable;
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.UnionObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.DateObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableDateObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.exception.HoodieException;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;

import java.math.BigDecimal;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

import static org.apache.hudi.avro.AvroSchemaUtils.resolveNullableSchema;
import static org.apache.hudi.avro.AvroSchemaUtils.resolveUnionSchema;
import static org.apache.hudi.avro.HoodieAvroUtils.isMetadataField;

/**
 * Helper class to serialize Hive writable types into Avro records.
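 *
 * <p>A minimal usage sketch ({@code rowInspector}, {@code hiveRow} and {@code avroSchema}
 * are illustrative placeholders for objects normally obtained from the table's SerDe and
 * writer schema):
 * <pre>{@code
 *   List<String> names = Arrays.asList("id", "name");
 *   List<TypeInfo> types = Arrays.asList(TypeInfoFactory.longTypeInfo, TypeInfoFactory.stringTypeInfo);
 *   HiveAvroSerializer serializer = new HiveAvroSerializer(rowInspector, names, types);
 *   GenericRecord avroRecord = serializer.serialize(hiveRow, avroSchema);
 * }</pre>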
 */
public class HiveAvroSerializer {

  private final List<String> columnNames;
  private final List<TypeInfo> columnTypes;
  private final ObjectInspector objectInspector;

  private static final Logger LOG = LogManager.getLogger(HiveAvroSerializer.class);

  public HiveAvroSerializer(ObjectInspector objectInspector, List<String> columnNames, List<TypeInfo> columnTypes) {
    this.columnNames = columnNames;
    this.columnTypes = columnTypes;
    this.objectInspector = objectInspector;
  }

  private static final Schema STRING_SCHEMA = Schema.create(Schema.Type.STRING);

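  /**
   * Serializes one Hive row into an Avro {@link GenericRecord} of the given schema,
   * reading each field through the struct object inspector supplied at construction time.
   */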
  public GenericRecord serialize(Object o, Schema schema) {

    StructObjectInspector soi = (StructObjectInspector) objectInspector;
    GenericData.Record record = new GenericData.Record(schema);

    List<? extends StructField> outputFieldRefs = soi.getAllStructFieldRefs();
    if (outputFieldRefs.size() != columnNames.size()) {
      throw new HoodieException("Number of input columns was different than output columns (in = " + columnNames.size() + " vs out = " + outputFieldRefs.size());
    }

    int size = schema.getFields().size();

    List<? extends StructField> allStructFieldRefs = soi.getAllStructFieldRefs();
    List<Object> structFieldsDataAsList = soi.getStructFieldsDataAsList(o);

    for (int i = 0; i < size; i++) {
      Schema.Field field = schema.getFields().get(i);
      if (i >= columnTypes.size()) {
        break;
      }
      try {
        setUpRecordFieldFromWritable(columnTypes.get(i), structFieldsDataAsList.get(i),
            allStructFieldRefs.get(i).getFieldObjectInspector(), record, field);
      } catch (Exception e) {
        LOG.error(String.format("current columnNames: %s", columnNames.stream().collect(Collectors.joining(","))));
        LOG.error(String.format("current type: %s", columnTypes.stream().map(f -> f.getTypeName()).collect(Collectors.joining(","))));
        LOG.error(String.format("current value: %s", HoodieRealtimeRecordReaderUtils.arrayWritableToString((ArrayWritable) o)));
        throw e;
      }
    }
    return record;
  }

  private void setUpRecordFieldFromWritable(TypeInfo typeInfo, Object structFieldData, ObjectInspector fieldOI, GenericData.Record record, Schema.Field field) {
    Object val = serialize(typeInfo, fieldOI, structFieldData, field.schema());
    if (val == null) {
      if (field.defaultVal() instanceof JsonProperties.Null) {
        record.put(field.name(), null);
      } else {
        record.put(field.name(), field.defaultVal());
      }
    } else {
      record.put(field.name(), val);
    }
  }

  /**
   * Determine if an Avro schema is of type Union[T, NULL].  Avro supports nullable
   * types via a union of type T and null.  This is a very common use case.
   * As such, we want to silently convert it to just T and allow the value to be null.
   *
   * When a Hive union type is used with AVRO, the schema type becomes
   * Union[NULL, T1, T2, ...]. The NULL in the union should be silently removed
   *
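   * For example:
   * <pre>{@code
   *   Schema nullableString = Schema.createUnion(Arrays.asList(
   *       Schema.create(Schema.Type.NULL), Schema.create(Schema.Type.STRING)));
   *   isNullableType(nullableString);                    // true
   *   isNullableType(Schema.create(Schema.Type.STRING)); // false: not a union
   * }</pre>
   *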
   * @return true if type represents Union[T, Null], false otherwise
   */
  public static boolean isNullableType(Schema schema) {
    if (!schema.getType().equals(Schema.Type.UNION)) {
      return false;
    }

    List<Schema> itemSchemas = schema.getTypes();
    if (itemSchemas.size() < 2) {
      return false;
    }

    for (Schema itemSchema : itemSchemas) {
      if (Schema.Type.NULL.equals(itemSchema.getType())) {
        return true;
      }
    }

    // [null, null] not allowed, so this check is ok.
    return false;
  }

  /**
   * If the union schema is a nullable union, get the schema for the non-nullable type.
   * This method does no checking that the provided Schema is nullable. If the provided
   * union schema is non-nullable, it simply returns the union schema.
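   *
   * <p>For example:
   * <pre>{@code
   *   Schema nullableInt = Schema.createUnion(Arrays.asList(
   *       Schema.create(Schema.Type.NULL), Schema.create(Schema.Type.INT)));
   *   getOtherTypeFromNullableType(nullableInt).getType(); // Schema.Type.INT
   * }</pre>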
   */
  public static Schema getOtherTypeFromNullableType(Schema unionSchema) {
    final List<Schema> types = unionSchema.getTypes();
    if (types.size() == 2) { // most common scenario
      if (types.get(0).getType() == Schema.Type.NULL) {
        return types.get(1);
      }
      if (types.get(1).getType() == Schema.Type.NULL) {
        return types.get(0);
      }
      // not a nullable union
      return unionSchema;
    }

    final List<Schema> itemSchemas = new ArrayList<>();
    for (Schema itemSchema : types) {
      if (!Schema.Type.NULL.equals(itemSchema.getType())) {
        itemSchemas.add(itemSchema);
      }
    }

    if (itemSchemas.size() > 1) {
      return Schema.createUnion(itemSchemas);
    } else {
      return itemSchemas.get(0);
    }
  }

  private Object serialize(TypeInfo typeInfo, ObjectInspector fieldOI, Object structFieldData, Schema schema) throws HoodieException {
    if (null == structFieldData) {
      return null;
    }

    if (isNullableType(schema)) {
      schema = getOtherTypeFromNullableType(schema);
    }
    /* Because we use Hive's 'string' type when Avro calls for enum, we have to expressly check for enum-ness */
    if (Schema.Type.ENUM.equals(schema.getType())) {
      assert fieldOI instanceof PrimitiveObjectInspector;
      return serializeEnum((PrimitiveObjectInspector) fieldOI, structFieldData, schema);
    }
    switch (typeInfo.getCategory()) {
      case PRIMITIVE:
        assert fieldOI instanceof PrimitiveObjectInspector;
        return serializePrimitive((PrimitiveObjectInspector) fieldOI, structFieldData, schema);
      case MAP:
        assert fieldOI instanceof MapObjectInspector;
        assert typeInfo instanceof MapTypeInfo;
        return serializeMap((MapTypeInfo) typeInfo, (MapObjectInspector) fieldOI, structFieldData, schema);
      case LIST:
        assert fieldOI instanceof ListObjectInspector;
        assert typeInfo instanceof ListTypeInfo;
        return serializeList((ListTypeInfo) typeInfo, (ListObjectInspector) fieldOI, structFieldData, schema);
      case UNION:
        assert fieldOI instanceof UnionObjectInspector;
        assert typeInfo instanceof UnionTypeInfo;
        return serializeUnion((UnionTypeInfo) typeInfo, (UnionObjectInspector) fieldOI, structFieldData, schema);
      case STRUCT:
        assert fieldOI instanceof StructObjectInspector;
        assert typeInfo instanceof StructTypeInfo;
        return serializeStruct((StructTypeInfo) typeInfo, (StructObjectInspector) fieldOI, structFieldData, schema);
      default:
        throw new HoodieException("Ran out of TypeInfo Categories: " + typeInfo.getCategory());
    }
  }

  /** private cache to avoid lots of EnumSymbol creation while serializing.
   *  Two levels because the enum symbol is specific to a schema.
   *  Object because we want to avoid the overhead of repeated toString calls while maintaining compatibility.
   *  Provided there are few enum types per record, and few symbols per enum, memory use should be moderate.
   *  eg 20 types with 50 symbols each as length-10 Strings should be on the order of 100KB per AvroSerializer.
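   *
   *  Illustrative lookup (the enum schema and symbol are hypothetical):
   *    enums.retrieve(colorSchema).retrieve("RED") yields a cached GenericData.EnumSymbol.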
   */
  final InstanceCache<Schema, InstanceCache<Object, GenericEnumSymbol>> enums = new InstanceCache<Schema, InstanceCache<Object, GenericEnumSymbol>>() {
    @Override
    protected InstanceCache<Object, GenericEnumSymbol> makeInstance(final Schema schema,
                                                                    Set<Schema> seenSchemas) {
      return new InstanceCache<Object, GenericEnumSymbol>() {
        @Override
        protected GenericEnumSymbol makeInstance(Object seed, Set<Schema> seenSchemas) {
          return new GenericData.EnumSymbol(schema, seed.toString());
        }
      };
    }
  };

  private Object serializeEnum(PrimitiveObjectInspector fieldOI, Object structFieldData, Schema schema) throws HoodieException {
    try {
      return enums.retrieve(schema).retrieve(serializePrimitive(fieldOI, structFieldData, schema));
    } catch (Exception e) {
      throw new HoodieException(e);
    }
  }

  private Object serializeStruct(StructTypeInfo typeInfo, StructObjectInspector ssoi, Object o, Schema schema) {
    int size = schema.getFields().size();
    List<? extends StructField> allStructFieldRefs = ssoi.getAllStructFieldRefs();
    List<Object> structFieldsDataAsList = ssoi.getStructFieldsDataAsList(o);
    GenericData.Record record = new GenericData.Record(schema);
    ArrayList<TypeInfo> allStructFieldTypeInfos = typeInfo.getAllStructFieldTypeInfos();

    for (int i = 0; i < size; i++) {
      Schema.Field field = schema.getFields().get(i);
      setUpRecordFieldFromWritable(allStructFieldTypeInfos.get(i), structFieldsDataAsList.get(i),
          allStructFieldRefs.get(i).getFieldObjectInspector(), record, field);
    }
    return record;
  }

  private Object serializePrimitive(PrimitiveObjectInspector fieldOI, Object structFieldData, Schema schema) throws HoodieException {
    switch (fieldOI.getPrimitiveCategory()) {
      case BINARY:
        if (schema.getType() == Schema.Type.BYTES) {
          return AvroSerdeUtils.getBufferFromBytes((byte[])fieldOI.getPrimitiveJavaObject(structFieldData));
        } else if (schema.getType() == Schema.Type.FIXED) {
          GenericData.Fixed fixed = new GenericData.Fixed(schema, (byte[])fieldOI.getPrimitiveJavaObject(structFieldData));
          return fixed;
        } else {
          throw new HoodieException("Unexpected Avro schema for Binary TypeInfo: " + schema.getType());
        }
      case DECIMAL:
        HiveDecimal dec = (HiveDecimal)fieldOI.getPrimitiveJavaObject(structFieldData);
        LogicalTypes.Decimal decimal = (LogicalTypes.Decimal)schema.getLogicalType();
        BigDecimal bd = new BigDecimal(dec.toString()).setScale(decimal.getScale());
        return HoodieAvroUtils.DECIMAL_CONVERSION.toFixed(bd, schema, decimal);
      case CHAR:
        HiveChar ch = (HiveChar)fieldOI.getPrimitiveJavaObject(structFieldData);
        return new Utf8(ch.getStrippedValue());
      case VARCHAR:
        HiveVarchar vc = (HiveVarchar)fieldOI.getPrimitiveJavaObject(structFieldData);
        return new Utf8(vc.getValue());
      case STRING:
        String string = (String)fieldOI.getPrimitiveJavaObject(structFieldData);
        return new Utf8(string);
      case DATE:
        return DateWritable.dateToDays(((DateObjectInspector)fieldOI).getPrimitiveJavaObject(structFieldData));
      case TIMESTAMP:
        Timestamp timestamp =
            ((TimestampObjectInspector) fieldOI).getPrimitiveJavaObject(structFieldData);
        return timestamp.getTime();
      case INT:
        if (schema.getLogicalType() != null && schema.getLogicalType().getName().equals("date")) {
          return DateWritable.dateToDays(new WritableDateObjectInspector().getPrimitiveJavaObject(structFieldData));
        }
        return fieldOI.getPrimitiveJavaObject(structFieldData);
      case UNKNOWN:
        throw new HoodieException("Received UNKNOWN primitive category.");
      case VOID:
        return null;
      default: // All other primitive types are simple
        return fieldOI.getPrimitiveJavaObject(structFieldData);
    }
  }

  private Object serializeUnion(UnionTypeInfo typeInfo, UnionObjectInspector fieldOI, Object structFieldData, Schema schema) throws HoodieException {
    byte tag = fieldOI.getTag(structFieldData);

    // Invariant that Avro's tag ordering must match Hive's.
    return serialize(typeInfo.getAllUnionObjectTypeInfos().get(tag),
        fieldOI.getObjectInspectors().get(tag),
        fieldOI.getField(structFieldData),
        schema.getTypes().get(tag));
  }

  private Object serializeList(ListTypeInfo typeInfo, ListObjectInspector fieldOI, Object structFieldData, Schema schema) throws HoodieException {
    List<?> list = fieldOI.getList(structFieldData);
    List<Object> deserialized = new GenericData.Array<>(list.size(), schema);

    TypeInfo listElementTypeInfo = typeInfo.getListElementTypeInfo();
    ObjectInspector listElementObjectInspector = fieldOI.getListElementObjectInspector();
    // NOTE: We have to resolve nullable schema, since Avro permits array elements
    //       to be null
    Schema arrayNestedType = resolveNullableSchema(schema.getElementType());
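    // Lists written with the Parquet-style three-level layout wrap each element in a
    // single-field record named "element"; unwrap that wrapper (if present) to get
    // the actual element schema.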
    Schema elementType = arrayNestedType.getField("element") == null ? arrayNestedType : arrayNestedType.getField("element").schema();

    for (int i = 0; i < list.size(); i++) {
      Object childFieldData = list.get(i);
      if (childFieldData instanceof ArrayWritable && ((ArrayWritable) childFieldData).get().length != ((StructTypeInfo) listElementTypeInfo).getAllStructFieldNames().size()) {
        deserialized.add(i, serialize(listElementTypeInfo, listElementObjectInspector, ((ArrayWritable) childFieldData).get()[0], elementType));
      } else {
        deserialized.add(i, serialize(listElementTypeInfo, listElementObjectInspector, childFieldData, elementType));
      }
    }
    return deserialized;
  }

  private Object serializeMap(MapTypeInfo typeInfo, MapObjectInspector fieldOI, Object structFieldData, Schema schema) throws HoodieException {
    // Avro only allows maps with string keys
    if (!mapHasStringKey(fieldOI.getMapKeyObjectInspector())) {
      throw new HoodieException("Avro only supports maps with keys as Strings.  Current Map is: " + typeInfo.toString());
    }

    ObjectInspector mapKeyObjectInspector = fieldOI.getMapKeyObjectInspector();
    ObjectInspector mapValueObjectInspector = fieldOI.getMapValueObjectInspector();
    TypeInfo mapKeyTypeInfo = typeInfo.getMapKeyTypeInfo();
    TypeInfo mapValueTypeInfo = typeInfo.getMapValueTypeInfo();
    Map<?, ?> map = fieldOI.getMap(structFieldData);
    Schema valueType = schema.getValueType();

    Map<Object, Object> deserialized = new LinkedHashMap<>(fieldOI.getMapSize(structFieldData));

    for (Map.Entry<?, ?> entry : map.entrySet()) {
      deserialized.put(serialize(mapKeyTypeInfo, mapKeyObjectInspector, entry.getKey(), STRING_SCHEMA),
          serialize(mapValueTypeInfo, mapValueObjectInspector, entry.getValue(), valueType));
    }

    return deserialized;
  }

  private boolean mapHasStringKey(ObjectInspector mapKeyObjectInspector) {
    return mapKeyObjectInspector instanceof PrimitiveObjectInspector
        && ((PrimitiveObjectInspector) mapKeyObjectInspector).getPrimitiveCategory().equals(PrimitiveObjectInspector.PrimitiveCategory.STRING);
  }

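  /**
   * Rewrites {@code oldRecord} into {@code newSchema}, copying values for fields present
   * in both schemas and falling back to field defaults otherwise. As the name suggests,
   * the resulting record is not validated against the new schema.
   */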
  public static GenericRecord rewriteRecordIgnoreResultCheck(GenericRecord oldRecord, Schema newSchema) {
    GenericRecord newRecord = new GenericData.Record(newSchema);
    boolean isSpecificRecord = oldRecord instanceof SpecificRecordBase;
    for (Schema.Field f : newSchema.getFields()) {
      if (!(isSpecificRecord && isMetadataField(f.name()))) {
        copyOldValueOrSetDefault(oldRecord, newRecord, f);
      }
    }
    return newRecord;
  }

  private static void copyOldValueOrSetDefault(GenericRecord oldRecord, GenericRecord newRecord, Schema.Field field) {
    Schema oldSchema = oldRecord.getSchema();
    Object fieldValue = oldSchema.getField(field.name()) == null ? null : oldRecord.get(field.name());

    if (fieldValue != null) {
      // In case field's value is a nested record, we have to rewrite it as well
      Object newFieldValue;
      if (fieldValue instanceof GenericRecord) {
        GenericRecord record = (GenericRecord) fieldValue;
        newFieldValue = rewriteRecordIgnoreResultCheck(record, resolveUnionSchema(field.schema(), record.getSchema().getFullName()));
      } else {
        newFieldValue = fieldValue;
      }
      newRecord.put(field.name(), newFieldValue);
    } else if (field.defaultVal() instanceof JsonProperties.Null) {
      newRecord.put(field.name(), null);
    } else {
      newRecord.put(field.name(), field.defaultVal());
    }
  }
}