/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.serde2.avro;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.rmi.server.UID;
import java.time.ZoneId;
import java.time.ZoneOffset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.TimeZone;

import org.apache.avro.Schema;
import org.apache.avro.Schema.Type;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericData.Fixed;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.BinaryDecoder;
import org.apache.avro.io.BinaryEncoder;
import org.apache.avro.io.DecoderFactory;
import org.apache.avro.io.EncoderFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.common.type.Date;
import org.apache.hadoop.hive.common.type.Timestamp;
import org.apache.hadoop.hive.common.type.TimestampTZUtil;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.common.type.CalendarUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.hive.common.type.HiveChar;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.common.type.HiveVarchar;
import org.apache.hadoop.hive.serde2.io.DateWritableV2;
import org.apache.hadoop.hive.serde2.objectinspector.StandardUnionObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.JavaHiveDecimalObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo;
import org.apache.hadoop.io.Writable;

class AvroDeserializer {
  private static final Logger LOG = LoggerFactory.getLogger(AvroDeserializer.class);
  /**
   * Set of already seen and valid record reader IDs whose records do not need re-encoding.
   */
  private final HashSet<UID> noEncodingNeeded = new HashSet<UID>();
  /**
   * Map from record reader ID to the associated re-encoder. It contains only the record readers
   * whose records need to be re-encoded.
   */
  private final HashMap<UID, SchemaReEncoder> reEncoderCache = new HashMap<UID, SchemaReEncoder>();
  /**
   * Flag to print the re-encoding warning message only once, to avoid excessive logging for
   * each record.
   */
  private boolean warnedOnce = false;

  /**
   * Time zone in which file was written, which may be stored in metadata.
   */
  private ZoneId writerTimezone = null;

  /**
   * Whether the file was written using proleptic Gregorian or hybrid calendar.
   */
  private Boolean writerProleptic = null;

  private Boolean writerZoneConversionLegacy = null;

  private Configuration configuration = null;

  AvroDeserializer() {}

  AvroDeserializer(Configuration configuration) {
    this.configuration = configuration;
  }

  /**
   * When encountering a record with an older schema than the one we're trying
   * to read, it is necessary to re-encode with a reader against the newer schema.
   * Because Hive doesn't provide a way to pass extra information to the
   * inputformat, we're unable to provide the newer schema when we have it and it
   * would be most useful - when the inputformat is reading the file.
   *
   * This is a slow process, so we try to cache as many of the objects as possible.
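   *
   * A minimal usage sketch (the writerSchema/readerSchema variables here are hypothetical):
   * <pre>{@code
   *   SchemaReEncoder reEncoder = new SchemaReEncoder(writerSchema, readerSchema);
   *   GenericRecord evolved = reEncoder.reencode(recordWrittenWithWriterSchema);
   * }</pre>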
   */
  static class SchemaReEncoder {
    private final ByteArrayOutputStream baos = new ByteArrayOutputStream();
    private final GenericDatumWriter<GenericRecord> gdw = new GenericDatumWriter<GenericRecord>();
    private BinaryDecoder binaryDecoder = null;

    GenericDatumReader<GenericRecord> gdr = null;

    public SchemaReEncoder(Schema writer, Schema reader) {
      gdr = new GenericDatumReader<GenericRecord>(writer, reader);
    }

    public GenericRecord reencode(GenericRecord r)
        throws AvroSerdeException {
      baos.reset();

      BinaryEncoder be = EncoderFactory.get().directBinaryEncoder(baos, null);
      gdw.setSchema(r.getSchema());
      try {
        gdw.write(r, be);
        ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray());

        binaryDecoder = DecoderFactory.defaultFactory().createBinaryDecoder(bais, binaryDecoder);

        return gdr.read(r, binaryDecoder);

      } catch (IOException e) {
        throw new AvroSerdeException("Exception trying to re-encode record to new schema", e);
      }
    }
  }

  private List<Object> row;

  /**
   * Deserialize an Avro record, recursing into its component fields and
   * deserializing them as well.  Fields of the record are matched by name
   * against fields in the Hive row.
   *
   * Because Avro has some data types that Hive does not, these are converted
   * during deserialization to types Hive will work with.
   *
   * @param columnNames List of columns Hive is expecting from record.
   * @param columnTypes List of column types matched by index to names
   * @param writable Instance of AvroGenericRecordWritable to deserialize
   * @param readerSchema Schema of the writable to deserialize
   * @return A list of objects suitable for Hive to work with further
   * @throws AvroSerdeException For any exception during deserialization
   */
  public Object deserialize(List<String> columnNames, List<TypeInfo> columnTypes,
                            Writable writable, Schema readerSchema) throws AvroSerdeException {
    if(!(writable instanceof AvroGenericRecordWritable)) {
      throw new AvroSerdeException("Expecting a AvroGenericRecordWritable");
    }

    if(row == null || row.size() != columnNames.size()) {
      row = new ArrayList<Object>(columnNames.size());
    } else {
      row.clear();
    }

    AvroGenericRecordWritable recordWritable = (AvroGenericRecordWritable) writable;
    GenericRecord r = recordWritable.getRecord();
    Schema fileSchema = recordWritable.getFileSchema();
    writerTimezone = recordWritable.getWriterTimezone();
    writerProleptic = recordWritable.getWriterProleptic();
    writerZoneConversionLegacy = recordWritable.getWriterZoneConversionLegacy(); 

    UID recordReaderId = recordWritable.getRecordReaderID();
    // If the record reader (from which the record originated) has already been seen and is valid,
    // there is no need to re-encode the record.
    if(!noEncodingNeeded.contains(recordReaderId)) {
      SchemaReEncoder reEncoder = null;
      // Check whether this record reader's records have already been re-encoded once.
      // If so, reuse the existing re-encoder.
      if(reEncoderCache.containsKey(recordReaderId)) {
        reEncoder = reEncoderCache.get(recordReaderId); //Reuse the re-encoder
      } else if (!r.getSchema().equals(readerSchema)) { //Evolved schema?
        //Create and store new encoder in the map for re-use
        reEncoder = new SchemaReEncoder(r.getSchema(), readerSchema);
        reEncoderCache.put(recordReaderId, reEncoder);
      } else {
        LOG.debug("Adding new valid RRID: " + recordReaderId);
        noEncodingNeeded.add(recordReaderId);
      }
      if(reEncoder != null) {
        if (!warnedOnce) {
          LOG.warn("Received different schemas.  Have to re-encode: " +
              r.getSchema().toString(false) + "\nSIZE" + reEncoderCache + " ID " + recordReaderId);
          warnedOnce = true;
        }
        r = reEncoder.reencode(r);
      }
    }

    workerBase(row, fileSchema, columnNames, columnTypes, r);
    return row;
  }

  // The actual deserialization may involve nested records, which require recursion.
  private List<Object> workerBase(List<Object> objectRow, Schema fileSchema, List<String> columnNames,
                                  List<TypeInfo> columnTypes, GenericRecord record)
          throws AvroSerdeException {
    for(int i = 0; i < columnNames.size(); i++) {
      TypeInfo columnType = columnTypes.get(i);
      String columnName = columnNames.get(i);
      Object datum = record.get(columnName);
      Schema datumSchema = record.getSchema().getField(columnName).schema();
      Schema.Field field = AvroSerdeUtils.isNullableType(fileSchema)
          ? AvroSerdeUtils.getOtherTypeFromNullableType(fileSchema).getField(columnName)
          : fileSchema.getField(columnName);
      objectRow.add(worker(datum, field == null ? null : field.schema(), datumSchema, columnType));
    }

    return objectRow;
  }

  private Object worker(Object datum, Schema fileSchema, Schema recordSchema, TypeInfo columnType)
          throws AvroSerdeException {
    if (datum == null) {
      return null;
    }

    // Avro requires nullable types to be defined as unions of some type T
    // and NULL. This is annoying and we're going to hide it from the user.
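    // For example, a nullable string field arrives with the union schema ["null", "string"];
    // the unwrapping below keeps only the non-null branch before dispatching on the Hive column
    // category.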

    if (AvroSerdeUtils.isNullableType(recordSchema)) {
      recordSchema = AvroSerdeUtils.getOtherTypeFromNullableType(recordSchema);
    }
    if (fileSchema != null && AvroSerdeUtils.isNullableType(fileSchema)) {
      fileSchema = AvroSerdeUtils.getOtherTypeFromNullableType(fileSchema);
    }

    switch(columnType.getCategory()) {
    case STRUCT:
      return deserializeStruct((GenericData.Record) datum, fileSchema, (StructTypeInfo) columnType);
    case UNION:
      return deserializeUnion(datum, fileSchema, recordSchema, (UnionTypeInfo) columnType);
    case LIST:
      return deserializeList(datum, fileSchema, recordSchema, (ListTypeInfo) columnType);
    case MAP:
      return deserializeMap(datum, fileSchema, recordSchema, (MapTypeInfo) columnType);
    case PRIMITIVE:
      return deserializePrimitive(datum, fileSchema, recordSchema, (PrimitiveTypeInfo) columnType);
    default:
      throw new AvroSerdeException("Unknown TypeInfo: " + columnType.getCategory());
    }
  }

  private Object deserializePrimitive(Object datum, Schema fileSchema, Schema recordSchema,
      PrimitiveTypeInfo columnType) throws AvroSerdeException {
    switch (columnType.getPrimitiveCategory()){
    case STRING:
      return datum.toString(); // To workaround AvroUTF8
      // This also gets us around the Enum issue since we just take the value
      // and convert it to a string. Yay!
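      // E.g. (hypothetical value) an Avro enum symbol RED is surfaced to Hive as the plain
      // Java String "RED".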
    case BINARY:
      if (recordSchema.getType() == Type.FIXED){
        Fixed fixed = (Fixed) datum;
        return fixed.bytes();
      } else if (recordSchema.getType() == Type.BYTES){
        return AvroSerdeUtils.getBytesFromByteBuffer((ByteBuffer) datum);
      } else {
        throw new AvroSerdeException("Unexpected Avro schema for Binary TypeInfo: " + recordSchema.getType());
      }
    case DECIMAL:
      if (fileSchema == null) {
        throw new AvroSerdeException("File schema is missing for decimal field. Reader schema is " + columnType);
      }

      int scale = 0;
      try {
        scale = AvroSerdeUtils.getIntFromSchema(fileSchema, AvroSerDe.AVRO_PROP_SCALE);
      } catch(Exception ex) {
        throw new AvroSerdeException("Failed to obtain scale value from file schema: " + fileSchema, ex);
      }

      HiveDecimal dec = AvroSerdeUtils.getHiveDecimalFromByteBuffer((ByteBuffer) datum, scale);
      JavaHiveDecimalObjectInspector oi = (JavaHiveDecimalObjectInspector)
          PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector((DecimalTypeInfo)columnType);
      return oi.set(null, dec);
    case CHAR:
      if (fileSchema == null) {
        throw new AvroSerdeException("File schema is missing for char field. Reader schema is " + columnType);
      }

      int maxLength = 0;
      try {
        maxLength = AvroSerdeUtils.getIntFromSchema(fileSchema, AvroSerDe.AVRO_PROP_MAX_LENGTH);
      } catch (Exception ex) {
        throw new AvroSerdeException("Failed to obtain maxLength value for char field from file schema: " + fileSchema, ex);
      }

      String str = datum.toString();
      HiveChar hc = new HiveChar(str, maxLength);
      return hc;
    case VARCHAR:
      if (fileSchema == null) {
        throw new AvroSerdeException("File schema is missing for varchar field. Reader schema is " + columnType);
      }

      maxLength = 0;
      try {
        maxLength = AvroSerdeUtils.getIntFromSchema(fileSchema, AvroSerDe.AVRO_PROP_MAX_LENGTH);
      } catch (Exception ex) {
        throw new AvroSerdeException("Failed to obtain maxLength value for varchar field from file schema: " + fileSchema, ex);
      }

      str = datum.toString();
      HiveVarchar hvc = new HiveVarchar(str, maxLength);
      return hvc;
    case DATE: {
      if (recordSchema.getType() != Type.INT) {
        throw new AvroSerdeException("Unexpected Avro schema for Date TypeInfo: " + recordSchema.getType());
      }

      final boolean skipProlepticConversion;
      if (writerProleptic != null) {
        skipProlepticConversion = writerProleptic;
      } else {
        if (configuration != null) {
          skipProlepticConversion = HiveConf.getBoolVar(
              configuration, HiveConf.ConfVars.HIVE_AVRO_PROLEPTIC_GREGORIAN_DEFAULT);
        } else {
          skipProlepticConversion = HiveConf.ConfVars.HIVE_AVRO_PROLEPTIC_GREGORIAN_DEFAULT.defaultBoolVal;
        }
      }

      return Date.ofEpochMilli(DateWritableV2.daysToMillis(
          skipProlepticConversion ? (Integer) datum : CalendarUtils.convertDateToProleptic((Integer) datum)));
    }
    case TIMESTAMP: {
      if (recordSchema.getType() != Type.LONG) {
        throw new AvroSerdeException(
            "Unexpected Avro schema for Date TypeInfo: " + recordSchema.getType());
      }
      // If a time zone is found in file metadata (property name: writer.time.zone), convert the
      // timestamp to that (writer) time zone in order to emulate time zone agnostic behavior.
      // If not, then the file was written by an older version of hive, so we convert the timestamp
      // to the server's (reader) time zone for backwards compatibility reasons - unless the
      // session level configuration hive.avro.timestamp.skip.conversion is set to true, in which
      // case we assume it was written by a time zone agnostic writer, so we don't convert it.
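      // Illustration (hypothetical values): a stored value of 0L is 1970-01-01 00:00:00 UTC;
      // with writerTimezone = America/Los_Angeles it would be surfaced as 1969-12-31 16:00:00.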
      final boolean skipUTCConversion;
      if (configuration != null) {
        skipUTCConversion = HiveConf.getBoolVar(
            configuration, HiveConf.ConfVars.HIVE_AVRO_TIMESTAMP_SKIP_CONVERSION);
      } else {
        skipUTCConversion = HiveConf.ConfVars.HIVE_AVRO_TIMESTAMP_SKIP_CONVERSION.defaultBoolVal;
      }
      final boolean legacyConversion;
      if (writerZoneConversionLegacy != null) {
        legacyConversion = writerZoneConversionLegacy;
      } else if (writerTimezone != null) {
        legacyConversion = false;
      } else if (configuration != null) {
        legacyConversion =
            HiveConf.getBoolVar(configuration, HiveConf.ConfVars.HIVE_AVRO_TIMESTAMP_LEGACY_CONVERSION_ENABLED);
      } else {
        legacyConversion = HiveConf.ConfVars.HIVE_AVRO_TIMESTAMP_LEGACY_CONVERSION_ENABLED.defaultBoolVal;
      }
      ZoneId convertToTimeZone;
      if (writerTimezone != null) {
        convertToTimeZone = writerTimezone;
      } else if (skipUTCConversion) {
        convertToTimeZone = ZoneOffset.UTC;
      } else {
        convertToTimeZone = TimeZone.getDefault().toZoneId();
      }
      final boolean skipProlepticConversion;
      if (writerProleptic != null) {
        skipProlepticConversion = writerProleptic;
      } else {
        if (configuration != null) {
          skipProlepticConversion = HiveConf.getBoolVar(
              configuration, HiveConf.ConfVars.HIVE_AVRO_PROLEPTIC_GREGORIAN_DEFAULT);
        } else {
          skipProlepticConversion = HiveConf.ConfVars.HIVE_AVRO_PROLEPTIC_GREGORIAN_DEFAULT.defaultBoolVal;
        }
      }
      Timestamp timestamp = TimestampTZUtil.convertTimestampToZone(
          Timestamp.ofEpochMilli((Long) datum), ZoneOffset.UTC, convertToTimeZone, legacyConversion);
      if (!skipProlepticConversion) {
        timestamp = Timestamp.ofEpochMilli(
            CalendarUtils.convertTimeToProleptic(timestamp.toEpochMilli()));
      }
      return timestamp;
    }
    default:
      return datum;
    }
  }


  private Object deserializeStruct(GenericData.Record datum, Schema fileSchema, StructTypeInfo columnType)
          throws AvroSerdeException {
    // No equivalent Java type for the backing structure, need to recurse and build a list
    List<TypeInfo> innerFieldTypes = columnType.getAllStructFieldTypeInfos();
    List<String> innerFieldNames = columnType.getAllStructFieldNames();
    List<Object> innerObjectRow = new ArrayList<Object>(innerFieldTypes.size());

    return workerBase(innerObjectRow, fileSchema, innerFieldNames, innerFieldTypes, datum);
  }

  private Object deserializeUnion(Object datum, Schema fileSchema, Schema recordSchema,
                                  UnionTypeInfo columnType) throws AvroSerdeException {
    // Calculate the tags individually since the schema can evolve and the two schemas can have
    // different tags. In the worst case both schemas are the same and we compute the same tag twice.
    int fsTag = GenericData.get().resolveUnion(fileSchema, datum); // Determine index of value from fileSchema
    int rsTag = GenericData.get().resolveUnion(recordSchema, datum); // Determine index of value from recordSchema
    Object desired = worker(datum, fileSchema == null ? null : fileSchema.getTypes().get(fsTag),
        recordSchema.getTypes().get(rsTag), columnType.getAllUnionObjectTypeInfos().get(rsTag));
    return new StandardUnionObjectInspector.StandardUnion((byte)rsTag, desired);
  }

  private Object deserializeList(Object datum, Schema fileSchema, Schema recordSchema,
                                 ListTypeInfo columnType) throws AvroSerdeException {
    // Need to check the original schema to see if this is actually a Fixed.
    if(recordSchema.getType().equals(Schema.Type.FIXED)) {
      // We're faking out Hive to work through a type system impedance mismatch.
      // Pull out the backing array and convert to a list.
      GenericData.Fixed fixed = (GenericData.Fixed) datum;
      List<Byte> asList = new ArrayList<Byte>(fixed.bytes().length);
      for(int j = 0; j < fixed.bytes().length; j++) {
        asList.add(fixed.bytes()[j]);
      }
      return asList;
    } else if(recordSchema.getType().equals(Schema.Type.BYTES)) {
      // This is going to be slow... hold on.
      ByteBuffer bb = (ByteBuffer)datum;
      List<Byte> asList = new ArrayList<Byte>(bb.capacity());
      byte[] array = bb.array();
      for(int j = 0; j < array.length; j++) {
        asList.add(array[j]);
      }
      return asList;
    } else { // An actual list, deser its values
      List listData = (List) datum;
      Schema listSchema = recordSchema.getElementType();
      List<Object> listContents = new ArrayList<Object>(listData.size());
      for(Object obj : listData) {
        listContents.add(worker(obj, fileSchema == null ? null : fileSchema.getElementType(), listSchema,
            columnType.getListElementTypeInfo()));
      }
      return listContents;
    }
  }

  private Object deserializeMap(Object datum, Schema fileSchema, Schema mapSchema, MapTypeInfo columnType)
          throws AvroSerdeException {
    // Avro only allows maps with Strings for keys, so we only have to worry
    // about deserializing the values
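    // E.g. (hypothetical value) an Avro map {"a": 1} with schema {"type": "map", "values": "int"}
    // becomes a HashMap with the String key "a" and the deserialized Integer value 1.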
    Map<Object, Object> map = new HashMap<Object, Object>();
    Map<CharSequence, Object> mapDatum = (Map<CharSequence, Object>) datum;
    Schema valueSchema = mapSchema.getValueType();
    TypeInfo valueTypeInfo = columnType.getMapValueTypeInfo();
    for (CharSequence key : mapDatum.keySet()) {
      Object value = mapDatum.get(key);
      map.put(key.toString(), worker(value, fileSchema == null ? null : fileSchema.getValueType(),
          valueSchema, valueTypeInfo));
    }

    return map;
  }

  public HashSet<UID> getNoEncodingNeeded() {
    return noEncodingNeeded;
  }

  public HashMap<UID, SchemaReEncoder> getReEncoderCache() {
    return reEncoderCache;
  }

}