/*
* Copyright © 2015-2016 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.hydrator.plugin.batch.commons;

import co.cask.cdap.api.data.schema.Schema;
import com.google.common.base.Preconditions;
import com.google.common.base.Throwables;
import com.google.common.collect.Lists;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hive.hcatalog.common.HCatException;
import org.apache.hive.hcatalog.data.schema.HCatFieldSchema;
import org.apache.hive.hcatalog.data.schema.HCatSchema;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.List;

/**
 * Hive schema converter class to convert a CDAP {@link Schema} to a Hive {@link HCatSchema} and vice versa.
 * Note: If the {@link HCatSchema} contains a non-primitive type, the conversion to {@link Schema} will fail.
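 * <p>
 * A minimal usage sketch (illustrative only; {@code tableSchema} is assumed to come from the target table's
 * HCatalog metadata, and the field names are hypothetical):
 * <pre>{@code
 * Schema cdapSchema = Schema.recordOf("record",
 *   Schema.Field.of("id", Schema.of(Schema.Type.INT)),
 *   Schema.Field.of("name", Schema.of(Schema.Type.STRING)));
 * HCatSchema hiveSchema = HiveSchemaConverter.toHiveSchema(cdapSchema, tableSchema);
 * Schema roundTripped = HiveSchemaConverter.toSchema(hiveSchema);
 * }</pre>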
*/
public class HiveSchemaConverter {
  private static final Logger LOG = LoggerFactory.getLogger(HiveSchemaConverter.class);

/**
   * Converts a CDAP {@link Schema} to a Hive {@link HCatSchema}, verifying that every field in the given
   * {@link Schema} exists in the table. The {@link Schema} types which can be converted into an
   * {@link HCatSchema} are boolean, int, long, float, double, string and bytes.
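   * <p>
   * For example (illustrative only; {@code tableSchema} is assumed to describe a table that has a
   * {@code price double} column):
   * <pre>{@code
   * Schema schema = Schema.recordOf("record",
   *   Schema.Field.of("price", Schema.of(Schema.Type.DOUBLE)));
   * HCatSchema hcatSchema = HiveSchemaConverter.toHiveSchema(schema, tableSchema);
   * }</pre>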
*
   * @param schema the {@link Schema} to convert
   * @param tableSchema the {@link HCatSchema} of the target table
   * @return {@link HCatSchema} for the given {@link Schema}
   * @throws NullPointerException if a field in the given {@link Schema} is not found in the table's {@link HCatSchema}
   * @throws IllegalArgumentException if an {@link HCatFieldSchema} cannot be created for a field
*/
public static HCatSchema toHiveSchema(Schema schema, HCatSchema tableSchema) {
    List<HCatFieldSchema> fields = Lists.newArrayList();
for (Schema.Field field : schema.getFields()) {
String name = field.getName();
try {
// this field of the schema must exist in the table and should be of the same type
HCatFieldSchema hCatFieldSchema = tableSchema.get(name);
Preconditions.checkNotNull(hCatFieldSchema, "Missing field %s in table schema", name);
PrimitiveTypeInfo hiveType = hCatFieldSchema.getTypeInfo();
PrimitiveTypeInfo type = getType(name, field.getSchema());
if (!hiveType.equals(type)) {
LOG.warn("The given schema {} for the field {} does not match the schema {} from the table. " +
"The schema {} for field {} will be used.", type, name, hiveType, hiveType, name);
}
fields.add(hCatFieldSchema);
} catch (HCatException e) {
        throw new IllegalArgumentException(
          String.format("Failed to create HCatFieldSchema for field '%s' of type %s", name,
                        field.getSchema().getType()), e);
}
}
return new HCatSchema(fields);
  }

/**
   * Returns the Hive {@link PrimitiveTypeInfo} for the given field's {@link Schema.Type}.
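   * <p>
   * A minimal illustrative call (the field name is hypothetical); nullable schemas are unwrapped first:
   * <pre>{@code
   * getType("ts", Schema.nullableOf(Schema.of(Schema.Type.LONG)));   // -> TypeInfoFactory.longTypeInfo
   * }</pre>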
*
* @param name name of the field
* @param schema {@link Schema} of the field
   * @return {@link PrimitiveTypeInfo} for the given {@link Schema.Type} which is compatible with Hive
   * @throws IllegalArgumentException if the field's type is not supported
*/
private static PrimitiveTypeInfo getType(String name, Schema schema) {
Schema.Type type = schema.isNullable() ? schema.getNonNullable().getType() : schema.getType();
switch (type) {
case BOOLEAN:
return TypeInfoFactory.booleanTypeInfo;
case INT:
return TypeInfoFactory.intTypeInfo;
case LONG:
return TypeInfoFactory.longTypeInfo;
case FLOAT:
return TypeInfoFactory.floatTypeInfo;
case DOUBLE:
return TypeInfoFactory.doubleTypeInfo;
case STRING:
return TypeInfoFactory.stringTypeInfo;
case BYTES:
return TypeInfoFactory.binaryTypeInfo;
default:
throw new IllegalArgumentException(String.format(
"Schema contains field '%s' with unsupported type %s. " +
"You should provide an schema with this field dropped to work with this table.", name, type));
}
  }

/**
   * Converts an {@link HCatSchema} from Hive to a CDAP {@link Schema}.
   * Note: This conversion does not support non-primitive types; if the {@link HCatSchema} contains a complex
   * type, the conversion will fail. The conversion might also change the primitive type.
   * See {@link #getType(String, PrimitiveObjectInspector.PrimitiveCategory)} for details.
   * The {@link HCatFieldSchema} types which can be converted into a {@link Schema} are boolean, byte, char,
   * short, int, long, float, double, string, varchar and binary.
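   * <p>
   * For example (illustrative only), a table described by a hypothetical {@code tableSchema} with columns
   * {@code (id int, name varchar(64))} would yield a record schema whose fields are a nullable int
   * {@code id} and a nullable string {@code name}:
   * <pre>{@code
   * Schema schema = HiveSchemaConverter.toSchema(tableSchema);
   * }</pre>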
*
* @param hiveSchema the {@link HCatSchema} of the hive table
   * @return {@link Schema} for the given {@link HCatSchema}
   * @throws IllegalArgumentException if the {@link HCatSchema} contains a complex or unsupported type
*/
public static Schema toSchema(HCatSchema hiveSchema) {
    List<Schema.Field> fields = Lists.newArrayList();
for (HCatFieldSchema field : hiveSchema.getFields()) {
String name = field.getName();
if (field.isComplex()) {
throw new IllegalArgumentException(String.format(
"Table schema contains field '%s' with complex type %s. Only primitive types are supported.",
name, field.getTypeString()));
}
fields.add(Schema.Field.of(name, getType(name, field.getTypeInfo().getPrimitiveCategory())));
}
return Schema.recordOf("record", fields);
  }

/**
   * Returns a nullable {@link Schema} compatible with this field's Hive type.
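   * <p>
   * For example (illustrative only), narrower Hive types are widened and every result is nullable:
   * <pre>{@code
   * getType("age", PrimitiveObjectInspector.PrimitiveCategory.SHORT);
   * //   -> Schema.nullableOf(Schema.of(Schema.Type.INT))
   * getType("name", PrimitiveObjectInspector.PrimitiveCategory.VARCHAR);
   * //   -> Schema.nullableOf(Schema.of(Schema.Type.STRING))
   * }</pre>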
*
* @param name name of the field
* @param category the field's {@link PrimitiveObjectInspector.PrimitiveCategory}
   * @return the nullable {@link Schema} for this field
   * @throws IllegalArgumentException if the category is not supported
*/
private static Schema getType(String name, PrimitiveObjectInspector.PrimitiveCategory category) {
    LOG.trace("Converting Hive primitive category {} for field '{}'.", category, name);
switch (category) {
case BOOLEAN:
return Schema.nullableOf(Schema.of(Schema.Type.BOOLEAN));
case BYTE:
case SHORT:
case INT:
return Schema.nullableOf(Schema.of(Schema.Type.INT));
case LONG:
return Schema.nullableOf(Schema.of(Schema.Type.LONG));
case FLOAT:
return Schema.nullableOf(Schema.of(Schema.Type.FLOAT));
case DOUBLE:
return Schema.nullableOf(Schema.of(Schema.Type.DOUBLE));
case CHAR:
case STRING:
case VARCHAR:
return Schema.nullableOf(Schema.of(Schema.Type.STRING));
case BINARY:
return Schema.nullableOf(Schema.of(Schema.Type.BYTES));
      // We could support VOID by using a null Schema type, but HCatRecord does not support VOID. Since we read
      // and write through HCatSchema and HCatRecord, we do not support VOID either, for consistent behavior.
case VOID:
case DATE:
case TIMESTAMP:
case DECIMAL:
case UNKNOWN:
default:
throw new IllegalArgumentException(String.format("Table schema contains field '%s' with unsupported type %s",
name, category.name()));
}
}
}