All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hadoop.hive.serde2.avro.AvroSerdeUtils Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.serde2.avro;


import org.apache.avro.Schema;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.mapred.JobConf;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.math.BigInteger;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.nio.Buffer;
import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.util.Properties;

/**
 * Utilities useful only to the AvroSerde itself.  Not mean to be used by
 * end-users but public for interop to the ql package.
 */
public class AvroSerdeUtils {
  private static final Logger LOG = LoggerFactory.getLogger(AvroSerdeUtils.class);

  /**
   * Enum container for all avro table properties.
   * If introducing a new avro-specific table property,
   * add it here. Putting them in an enum rather than separate strings
   * allows them to be programmatically grouped and referenced together.
   */
  public static enum AvroTableProperties {
    SCHEMA_LITERAL("avro.schema.literal"),
    SCHEMA_URL("avro.schema.url"),
    SCHEMA_NAMESPACE("avro.schema.namespace"),
    SCHEMA_NAME("avro.schema.name"),
    SCHEMA_DOC("avro.schema.doc"),
    AVRO_SERDE_SCHEMA("avro.serde.schema"),
    AVRO_SERDE_TYPE("avro.serde.type"),
    AVRO_SERDE_SKIP_BYTES("avro.serde.skip.bytes"),
    SCHEMA_RETRIEVER("avro.schema.retriever");

    private final String propName;

    AvroTableProperties(String propName) {
      this.propName = propName;
    }

    public String getPropName(){
      return this.propName;
    }
  }

  // Following parameters slated for removal, prefer usage of enum above, that allows programmatic access.
  @Deprecated public static final String SCHEMA_LITERAL = "avro.schema.literal";
  @Deprecated public static final String SCHEMA_URL = "avro.schema.url";
  @Deprecated public static final String SCHEMA_NAMESPACE = "avro.schema.namespace";
  @Deprecated public static final String SCHEMA_NAME = "avro.schema.name";
  @Deprecated public static final String SCHEMA_DOC = "avro.schema.doc";
  @Deprecated public static final String AVRO_SERDE_SCHEMA = AvroTableProperties.AVRO_SERDE_SCHEMA.getPropName();
  @Deprecated public static final String SCHEMA_RETRIEVER = AvroTableProperties.SCHEMA_RETRIEVER.getPropName();

  public static final String SCHEMA_NONE = "none";
  public static final String EXCEPTION_MESSAGE = "Neither "
      + AvroTableProperties.SCHEMA_LITERAL.getPropName() + " nor "
      + AvroTableProperties.SCHEMA_URL.getPropName() + " specified, can't determine table schema";



  /**
   * Determine the schema to that's been provided for Avro serde work.
   * @param properties containing a key pointing to the schema, one way or another
   * @return schema to use while serdeing the avro file
   * @throws IOException if error while trying to read the schema from another location
   * @throws AvroSerdeException if unable to find a schema or pointer to it in the properties
   */
  public static Schema determineSchemaOrThrowException(Configuration conf, Properties properties)
          throws IOException, AvroSerdeException {
    String schemaString = properties.getProperty(AvroTableProperties.SCHEMA_LITERAL.getPropName());
    if(schemaString != null && !schemaString.equals(SCHEMA_NONE))
      return AvroSerdeUtils.getSchemaFor(schemaString);

    // Try pulling directly from URL
    schemaString = properties.getProperty(AvroTableProperties.SCHEMA_URL.getPropName());
    if (schemaString == null) {
      final String columnNameProperty = properties.getProperty(serdeConstants.LIST_COLUMNS);
      final String columnTypeProperty = properties.getProperty(serdeConstants.LIST_COLUMN_TYPES);
      final String columnCommentProperty = properties.getProperty(AvroSerDe.LIST_COLUMN_COMMENTS);
      if (columnNameProperty == null || columnNameProperty.isEmpty()
        || columnTypeProperty == null || columnTypeProperty.isEmpty() ) {
        throw new AvroSerdeException(EXCEPTION_MESSAGE);
      }
      final String columnNameDelimiter = properties.containsKey(serdeConstants.COLUMN_NAME_DELIMITER) ? properties
          .getProperty(serdeConstants.COLUMN_NAME_DELIMITER) : String.valueOf(SerDeUtils.COMMA);
      // Get column names and types
      List columnNames = Arrays.asList(columnNameProperty.split(columnNameDelimiter));
      List columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty);

      Schema schema = AvroSerDe.getSchemaFromCols(properties, columnNames, columnTypes, columnCommentProperty);
      properties.setProperty(AvroTableProperties.SCHEMA_LITERAL.getPropName(), schema.toString());
      if (conf != null)
        conf.set(AvroTableProperties.AVRO_SERDE_SCHEMA.getPropName(), schema.toString(false));
      return schema;
    } else if(schemaString.equals(SCHEMA_NONE)) {
      throw new AvroSerdeException(EXCEPTION_MESSAGE);
    }

    try {
      Schema s = getSchemaFromFS(schemaString, conf);
      if (s == null) {
        //in case schema is not a file system
        return AvroSerdeUtils.getSchemaFor(new URL(schemaString));
      }
      return s;
    } catch (IOException ioe) {
      throw new AvroSerdeException("Unable to read schema from given path: " + schemaString, ioe);
    } catch (URISyntaxException urie) {
      throw new AvroSerdeException("Unable to read schema from given path: " + schemaString, urie);
    }
  }

  // Protected for testing and so we can pass in a conf for testing.
  protected static Schema getSchemaFromFS(String schemaFSUrl,
                          Configuration conf) throws IOException, URISyntaxException {
    FSDataInputStream in = null;
    FileSystem fs = null;
    try {
      fs = FileSystem.get(new URI(schemaFSUrl), conf);
    } catch (IOException ioe) {
      //return null only if the file system in schema is not recognized
      LOG.debug("Failed to open file system for uri {} assuming it is not a FileSystem url", schemaFSUrl, ioe);
      return null;
    }
    try {
      in = fs.open(new Path(schemaFSUrl));
      Schema s = AvroSerdeUtils.getSchemaFor(in);
      return s;
    } finally {
      if(in != null) in.close();
    }
  }

  /**
   * Determine if an Avro schema is of type Union[T, NULL].  Avro supports nullable
   * types via a union of type T and null.  This is a very common use case.
   * As such, we want to silently convert it to just T and allow the value to be null.
   *
   * When a Hive union type is used with AVRO, the schema type becomes
   * Union[NULL, T1, T2, ...]. The NULL in the union should be silently removed
   *
   * @return true if type represents Union[T, Null], false otherwise
   */
  public static boolean isNullableType(Schema schema) {
    if (!schema.getType().equals(Schema.Type.UNION)) {
      return false;
    }

    List itemSchemas = schema.getTypes();
    if (itemSchemas.size() < 2) {
      return false;
    }

    for (Schema itemSchema : itemSchemas) {
      if (Schema.Type.NULL.equals(itemSchema.getType())) {
        return true;
      }
    }

    // [null, null] not allowed, so this check is ok.
    return false;
  }

  /**
   * If the union schema is a nullable union, get the schema for the non-nullable type.
   * This method does no checking that the provided Schema is nullable. If the provided
   * union schema is non-nullable, it simply returns the union schema
   */
  public static Schema getOtherTypeFromNullableType(Schema unionSchema) {
    final List types = unionSchema.getTypes();
    if (types.size() == 2) { // most common scenario
      if (types.get(0).getType() == Schema.Type.NULL) {
        return types.get(1);
      }
      if (types.get(1).getType() == Schema.Type.NULL) {
        return types.get(0);
      }
      // not a nullable union
      return unionSchema;
    }

    final List itemSchemas = new ArrayList<>();
    for (Schema itemSchema : types) {
      if (!Schema.Type.NULL.equals(itemSchema.getType())) {
        itemSchemas.add(itemSchema);
      }
    }

    if (itemSchemas.size() > 1) {
      return Schema.createUnion(itemSchemas);
    } else {
      return itemSchemas.get(0);
    }
  }

  /**
   * Determine if we're being executed from within an MR job or as part
   * of a select * statement.  The signals for this varies between Hive versions.
   * @param job that contains things that are or are not set in a job
   * @return Are we in a job or not?
   */
  public static boolean insideMRJob(JobConf job) {
    return job != null
           && (HiveConf.getVar(job, HiveConf.ConfVars.PLAN) != null)
           && (!HiveConf.getVar(job, HiveConf.ConfVars.PLAN).isEmpty());
  }

  public static Buffer getBufferFromBytes(byte[] input) {
    ByteBuffer bb = ByteBuffer.wrap(input);
    return bb.rewind();
  }

  public static Buffer getBufferFromDecimal(HiveDecimal dec, int scale) {
    if (dec == null) {
      return null;
    }

    // NOTE: Previously, we did OldHiveDecimal.setScale(scale), called OldHiveDecimal
    //       unscaledValue().toByteArray().
    return AvroSerdeUtils.getBufferFromBytes(dec.bigIntegerBytesScaled(scale));
  }

  public static byte[] getBytesFromByteBuffer(ByteBuffer byteBuffer) {
    byteBuffer.rewind();
    byte[] result = new byte[byteBuffer.limit()];
    byteBuffer.get(result);
    return result;
  }

  public static HiveDecimal getHiveDecimalFromByteBuffer(ByteBuffer byteBuffer, int scale) {
    byte[] result = getBytesFromByteBuffer(byteBuffer);
    HiveDecimal dec = HiveDecimal.create(new BigInteger(result), scale);
    return dec;
  }

  private static Schema.Parser getSchemaParser() {
    // HIVE-24797: Disable validate default values when parsing Avro schemas.
    return new Schema.Parser().setValidateDefaults(false);
  }

  public static Schema getSchemaFor(String str) {
    Schema schema = getSchemaParser().parse(str);
    return schema;
  }

  public static Schema getSchemaFor(File file) {
    Schema schema;
    try {
      schema = getSchemaParser().parse(file);
    } catch (IOException e) {
      throw new RuntimeException("Failed to parse Avro schema from " + file.getName(), e);
    }
    return schema;
  }

  public static Schema getSchemaFor(InputStream stream) {
    Schema schema;
    try {
      schema = getSchemaParser().parse(stream);
    } catch (IOException e) {
      throw new RuntimeException("Failed to parse Avro schema", e);
    }
    return schema;
  }

  public static Schema getSchemaFor(URL url) {
    InputStream in = null;
    try {
      in = url.openStream();
      return getSchemaFor(in);
    } catch (Exception e) {
      throw new RuntimeException("Failed to parse Avro schema", e);
    } finally {
      if (in != null) {
        try {
          in.close();
        } catch (IOException e) {
          // Ignore
        }
      }
    }
  }

  public static int getIntFromSchema(Schema schema, String name) {
    Object obj = schema.getObjectProp(name);
    if (obj instanceof String) {
      return Integer.parseInt((String) obj);
    } else if (obj instanceof Integer) {
      return (int) obj;
    } else {
      throw new IllegalArgumentException("Expect integer or string value from property " + name
        + " but found type " + obj.getClass().getName());
    }
  }

  /**
   * Called on specific alter table events, removes schema url and schema literal from given tblproperties
   * After the change, HMS solely will be responsible for handling the schema
   *
   * @param conf
   * @param serializationLib
   * @param parameters
   */
  public static void handleAlterTableForAvro(HiveConf conf, String serializationLib, Map parameters) {
    if (AvroSerDe.class.getName().equals(serializationLib)) {
      String literalPropName = AvroTableProperties.SCHEMA_LITERAL.getPropName();
      String urlPropName = AvroTableProperties.SCHEMA_URL.getPropName();

      if (parameters.containsKey(literalPropName) || parameters.containsKey(urlPropName)) {
          throw new RuntimeException("Not allowed to alter schema of Avro stored table having external schema." +
                  " Consider removing "+AvroTableProperties.SCHEMA_LITERAL.getPropName() + " or " +
                  AvroTableProperties.SCHEMA_URL.getPropName() + " from table properties.");
      }
    }
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy