All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.hadoop.hive.serde2.avro.AvroSerDe Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.serde2.avro;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import org.apache.avro.Schema;
import org.apache.hadoop.hive.common.StringInternUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeSpec;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.io.Writable;

/**
 * Read or write Avro data from Hive.
 *
 * <p>The Avro {@link Schema} is resolved in {@link #initialize} either through
 * {@link AvroSerdeUtils#determineSchemaOrThrowException} or, when no external schema
 * property is set and the column name/type properties are present, by converting the
 * Hive columns with {@link #getSchemaFromCols}. If resolution fails, the instance is
 * marked as having a bad schema and {@link #serialize} / {@link #deserialize} throw
 * {@link BadSchemaException}.
 */
@SerDeSpec(schemaProps = {
    serdeConstants.LIST_COLUMNS, serdeConstants.LIST_COLUMN_TYPES,
    AvroSerDe.LIST_COLUMN_COMMENTS, AvroSerDe.TABLE_NAME, AvroSerDe.TABLE_COMMENT,
    AvroSerdeUtils.SCHEMA_LITERAL, AvroSerdeUtils.SCHEMA_URL,
    AvroSerdeUtils.SCHEMA_NAMESPACE, AvroSerdeUtils.SCHEMA_NAME, AvroSerdeUtils.SCHEMA_DOC})
public class AvroSerDe extends AbstractSerDe {
  private static final Logger LOG = LoggerFactory.getLogger(AvroSerDe.class);

  // Table property keys read by this class when building a schema from Hive columns.
  public static final String TABLE_NAME = "name";
  public static final String TABLE_COMMENT = "comment";
  public static final String LIST_COLUMN_COMMENTS = "columns.comments";

  // The constants below are not referenced inside this class; presumably they are shared
  // with the other Avro SerDe classes in this package -- verify against their callers.
  public static final String DECIMAL_TYPE_NAME = "decimal";
  public static final String CHAR_TYPE_NAME = "char";
  public static final String VARCHAR_TYPE_NAME = "varchar";
  public static final String DATE_TYPE_NAME = "date";
  public static final String TIMESTAMP_TYPE_NAME = "timestamp-millis";
  public static final String WRITER_TIME_ZONE = "writer.time.zone";
  public static final String WRITER_PROLEPTIC = "writer.proleptic";
  public static final String WRITER_ZONE_CONVERSION_LEGACY = "writer.zone.conversion.legacy";
  public static final String AVRO_PROP_LOGICAL_TYPE = "logicalType";
  public static final String AVRO_PROP_PRECISION = "precision";
  public static final String AVRO_PROP_SCALE = "scale";
  public static final String AVRO_PROP_MAX_LENGTH = "maxLength";
  public static final String AVRO_STRING_TYPE_NAME = "string";
  public static final String AVRO_INT_TYPE_NAME = "int";
  public static final String AVRO_LONG_TYPE_NAME = "long";

  /** Object inspector produced from the resolved schema. */
  private ObjectInspector oi;
  /** Column names derived from the resolved schema. */
  private List<String> columnNames;
  /** Column types derived from the resolved schema, parallel to {@link #columnNames}. */
  private List<TypeInfo> columnTypes;
  /** Schema resolved by the most recent {@link #initialize} call. */
  private Schema schema;
  private AvroDeserializer avroDeserializer = null;
  private AvroSerializer avroSerializer = null;

  /** True when the last {@link #initialize} call resolved to the signal "bad schema". */
  private boolean badSchema = false;

  /**
   * Resolves the Avro schema and (re)builds all per-instance state from it.
   *
   * <p>Side effects on {@code tableProperties}: the schema-literal property is always
   * overwritten with the resolved schema, and when the schema did not come from the
   * column properties, the column name and column type properties are overwritten to
   * match it. When a configuration is present on this SerDe, the schema is also stored
   * in it.
   *
   * @param configuration job configuration, forwarded to the parent class and to the
   *        serializer/deserializer
   * @param tableProperties table properties; read and modified in place
   * @param partitionProperties ignored; {@code null} is passed to the parent instead
   * @throws SerDeException if the schema could not be resolved (the instance is left
   *         flagged as bad and without a serializer/deserializer from this call)
   */
  @Override
  public void initialize(Configuration configuration, Properties tableProperties, Properties partitionProperties)
      throws SerDeException {
    /*
     * Avro should always use the table properties for initialization (see
     * HIVE-6835). The tableProperties is modified directly by this SerDe when
     * the user supplies a schema file so do not make a copy.
     */
    super.initialize(configuration, tableProperties, null);

    // Reset member variables so we don't get in a half-constructed state
    if (schema != null) {
      LOG.debug("Resetting already initialized AvroSerDe");
    }

    // Parameterized so the (possibly large) schema literal is only formatted when
    // debug logging is enabled.
    LOG.debug("AvroSerde::initialize(): Preset value of avro.schema.literal == {}",
        tableProperties.get(AvroSerdeUtils.AvroTableProperties.SCHEMA_LITERAL.getPropName()));

    schema = null;
    oi = null;
    columnNames = null;
    columnTypes = null;

    final String columnNameProperty = tableProperties.getProperty(serdeConstants.LIST_COLUMNS);
    final String columnTypeProperty = tableProperties.getProperty(serdeConstants.LIST_COLUMN_TYPES);
    final String columnCommentProperty = tableProperties.getProperty(LIST_COLUMN_COMMENTS, "");
    final String columnNameDelimiter = tableProperties.containsKey(serdeConstants.COLUMN_NAME_DELIMITER)
        ? tableProperties.getProperty(serdeConstants.COLUMN_NAME_DELIMITER)
        : String.valueOf(SerDeUtils.COMMA);

    boolean gotColTypesFromColProps = true;
    if (hasExternalSchema(tableProperties)
        || columnNameProperty == null || columnNameProperty.isEmpty()
        || columnTypeProperty == null || columnTypeProperty.isEmpty()) {
      // An explicit schema wins; it is also the fallback when column info is missing.
      schema = determineSchemaOrReturnErrorSchema(configuration, tableProperties);
      gotColTypesFromColProps = false;
    } else {
      // Parse the column names and types from the table properties
      columnNames = StringInternUtils.internStringsInList(
          Arrays.asList(columnNameProperty.split(columnNameDelimiter)));
      columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty);

      schema = getSchemaFromCols(tableProperties, columnNames, columnTypes, columnCommentProperty);
    }

    tableProperties.setProperty(AvroSerdeUtils.AvroTableProperties.SCHEMA_LITERAL.getPropName(), schema.toString());

    LOG.debug("Avro schema is: {}", schema);

    if (this.configuration.isPresent()) {
      this.configuration.get().set(AvroSerdeUtils.AvroTableProperties.AVRO_SERDE_SCHEMA.getPropName(),
          schema.toString(false));
    } else {
      LOG.debug("Configuration null, not inserting schema");
    }

    // Identity comparison is intentional: failure is signalled with one shared instance.
    badSchema = (schema == SchemaResolutionProblem.SIGNAL_BAD_SCHEMA);

    AvroObjectInspectorGenerator aoig = new AvroObjectInspectorGenerator(schema);
    this.columnNames = StringInternUtils.internStringsInList(aoig.getColumnNames());
    this.columnTypes = aoig.getColumnTypes();
    this.oi = aoig.getObjectInspector();
    // HIVE-22595: Update the column/type properties to reflect the current schema, since
    // these properties may be used later on.
    if (!gotColTypesFromColProps) {
      LOG.info("Updating column name/type properties based on current schema");
      // NOTE(review): names are joined with a literal comma rather than
      // columnNameDelimiter -- confirm downstream readers of this property expect that.
      tableProperties.setProperty(serdeConstants.LIST_COLUMNS, String.join(",", columnNames));
      tableProperties.setProperty(serdeConstants.LIST_COLUMN_TYPES,
          String.join(",", TypeInfoUtils.getTypeStringsFromTypeInfo(columnTypes)));
    }

    // Thrown only after the inspector and properties are populated from the signal schema.
    if (badSchema) {
      throw new SerDeException("Invalid schema reported");
    }

    this.avroSerializer = new AvroSerializer(configuration);
    this.avroDeserializer = new AvroDeserializer(configuration);
  }

  /**
   * Tells whether the properties carry a schema literal or a schema URL.
   *
   * @param properties table properties to inspect
   * @return {@code true} if either schema property resolves to a non-null value
   */
  private boolean hasExternalSchema(Properties properties) {
    return properties.getProperty(AvroSerdeUtils.AvroTableProperties.SCHEMA_LITERAL.getPropName()) != null
        || properties.getProperty(AvroSerdeUtils.AvroTableProperties.SCHEMA_URL.getPropName()) != null;
  }

  /**
   * Tells whether the table parameters contain a schema literal or a schema URL key.
   *
   * @param tableParams table parameters to inspect
   * @return {@code true} if either schema key is present
   */
  private boolean hasExternalSchema(Map<String, String> tableParams) {
    return tableParams.containsKey(AvroSerdeUtils.AvroTableProperties.SCHEMA_LITERAL.getPropName())
        || tableParams.containsKey(AvroSerdeUtils.AvroTableProperties.SCHEMA_URL.getPropName());
  }

  /**
   * Builds an Avro schema from Hive column names, types and comments.
   *
   * <p>The schema namespace, name and doc are taken from {@code properties}; the name and
   * doc default to the {@link #TABLE_NAME} and {@link #TABLE_COMMENT} properties.
   *
   * @param properties table properties supplying namespace, name and doc
   * @param columnNames Hive column names
   * @param columnTypes Hive column types, one per column name
   * @param columnCommentProperty column comments separated by {@code "\0"}; may be
   *        {@code null} or empty, in which case no comments are passed on
   * @return the schema produced by {@link TypeInfoToSchema#convert}
   * @throws IllegalArgumentException if the number of names and types differ
   */
  public static Schema getSchemaFromCols(Properties properties,
          List<String> columnNames, List<TypeInfo> columnTypes, String columnCommentProperty) {
    List<String> columnComments;
    if (columnCommentProperty == null || columnCommentProperty.isEmpty()) {
      columnComments = new ArrayList<>();
    } else {
      //Comments are separated by "\0" in columnCommentProperty, see method getSchema
      //in MetaStoreUtils where this string columns.comments is generated
      columnComments = Arrays.asList(columnCommentProperty.split("\0"));

      LOG.debug("columnComments is {}", columnCommentProperty);
    }
    if (columnNames.size() != columnTypes.size()) {
      throw new IllegalArgumentException("AvroSerde initialization failed. Number of column " +
          "name and column type differs. columnNames = " + columnNames + ", columnTypes = " +
          columnTypes);
    }

    final String tableName = properties.getProperty(TABLE_NAME);
    final String tableComment = properties.getProperty(TABLE_COMMENT);
    TypeInfoToSchema typeInfoToSchema = new TypeInfoToSchema();
    return typeInfoToSchema.convert(columnNames, columnTypes, columnComments,
        properties.getProperty(AvroSerdeUtils.AvroTableProperties.SCHEMA_NAMESPACE.getPropName()),
        properties.getProperty(AvroSerdeUtils.AvroTableProperties.SCHEMA_NAME.getPropName(), tableName),
        properties.getProperty(AvroSerdeUtils.AvroTableProperties.SCHEMA_DOC.getPropName(), tableComment));

  }

  /**
   * Attempt to determine the schema via the usual means, but do not throw
   * an exception if we fail.  Instead, signal failure via a special
   * schema.  This is used because Hive calls init on the serde during
   * any call, including calls to update the serde properties, meaning
   * if the serde is in a bad state, there is no way to update that state.
   *
   * @param conf configuration passed through to the schema lookup
   * @param props table properties passed through to the schema lookup
   * @return the resolved schema, or {@link SchemaResolutionProblem#SIGNAL_BAD_SCHEMA}
   *         if any exception was raised while resolving it
   */
  private Schema determineSchemaOrReturnErrorSchema(Configuration conf, Properties props) {
    try {
      return AvroSerdeUtils.determineSchemaOrThrowException(conf, props);
    } catch (AvroSerdeException he) {
      LOG.warn("Encountered AvroSerdeException determining schema. Returning signal schema to indicate problem",
          he);

      return SchemaResolutionProblem.SIGNAL_BAD_SCHEMA;
    } catch (Exception e) {
      // Deliberately broad: any failure here must degrade to the signal schema.
      LOG.warn("Encountered exception determining schema. Returning signal schema to indicate problem", e);
      return SchemaResolutionProblem.SIGNAL_BAD_SCHEMA;
    }
  }

  /**
   * @return {@link AvroGenericRecordWritable}, the writable type handled by this SerDe
   */
  @Override
  public Class<? extends Writable> getSerializedClass() {
    return AvroGenericRecordWritable.class;
  }

  /**
   * Serializes a row using the schema and columns resolved in {@link #initialize}.
   *
   * @param o the row object
   * @param objectInspector inspector describing {@code o}
   * @return the serialized row
   * @throws SerDeException {@link BadSchemaException} if the schema is flagged as bad,
   *         or whatever the underlying serializer raises
   */
  @Override
  public Writable serialize(Object o, ObjectInspector objectInspector) throws SerDeException {
    if (badSchema) {
      throw new BadSchemaException();
    }
    return avroSerializer.serialize(o, objectInspector, columnNames, columnTypes, schema);
  }

  /**
   * Deserializes a record using the schema and columns resolved in {@link #initialize}.
   *
   * @param writable the serialized record
   * @return the deserialized row object
   * @throws SerDeException {@link BadSchemaException} if the schema is flagged as bad,
   *         or whatever the underlying deserializer raises
   */
  @Override
  public Object deserialize(Writable writable) throws SerDeException {
    if (badSchema) {
      throw new BadSchemaException();
    }
    return avroDeserializer.deserialize(columnNames, columnTypes, writable, schema);
  }

  /**
   * @return the object inspector built by the last {@link #initialize} call, or
   *         {@code null} if none has been built
   */
  @Override
  public ObjectInspector getObjectInspector() throws SerDeException {
    return oi;
  }

  /**
   * @param tableParams table parameters to inspect
   * @return {@code false} when the parameters contain a schema literal or schema URL
   *         key, {@code true} otherwise
   */
  @Override
  public boolean shouldStoreFieldsInMetastore(Map<String, String> tableParams) {
    return !hasExternalSchema(tableParams);
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy