/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.serde2.teradata;

import com.google.common.collect.ImmutableMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.common.type.Timestamp;
import org.apache.hadoop.hive.serde2.io.ByteWritable;
import org.apache.hadoop.hive.serde2.io.DateWritableV2;
import org.apache.hadoop.hive.serde2.io.DoubleWritable;
import org.apache.hadoop.hive.serde2.io.HiveCharWritable;
import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable;
import org.apache.hadoop.hive.serde2.io.ShortWritable;
import org.apache.hadoop.hive.serde2.io.TimestampWritableV2;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeSpec;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.DateObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveCharObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveDecimalObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveVarcharObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Writable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.hive.common.type.Date;

import java.io.ByteArrayInputStream;
import java.io.EOFException;
import java.io.IOException;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import static java.lang.String.format;

/**
 * https://cwiki.apache.org/confluence/display/Hive/TeradataBinarySerde.
 * TeradataBinarySerde handles the serialization and deserialization of Teradata Binary Records
 * passed in from TeradataBinaryRecordReader.
 *
 * The Teradata Binary Record uses little-endian byte order for SHORT, INT, LONG, DOUBLE, etc.
 * We extend SwappedDataInputStream to handle these types, and additionally handle the
 * Teradata-specific types such as VARCHAR, CHAR, TIMESTAMP, DATE, etc.
 *
 * Currently we support 11 Teradata data types: VARCHAR, INTEGER, TIMESTAMP, FLOAT, DATE,
 * BYTEINT, BIGINT, CHAR, DECIMAL, SMALLINT, VARBYTE.
 * The mapping between Teradata data types and Hive data types is:
 * Teradata Data Type: Hive Data Type
 * VARCHAR: VARCHAR,
 * INTEGER: INT,
 * TIMESTAMP: TIMESTAMP,
 * FLOAT: DOUBLE,
 * DATE: DATE,
 * BYTEINT: TINYINT,
 * BIGINT: BIGINT,
 * CHAR: CHAR,
 * DECIMAL: DECIMAL,
 * SMALLINT: SMALLINT,
 * VARBYTE: BINARY.
 *
 * TeradataBinarySerde currently doesn't support complex types such as MAP, ARRAY and STRUCT.
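 *
 * A minimal standalone usage sketch (the column names, types, property values and the
 * null partition-properties argument are illustrative assumptions, not part of this class):
 *
 *   Properties tbl = new Properties();
 *   tbl.setProperty(serdeConstants.LIST_COLUMNS, "id,name,ts");
 *   tbl.setProperty(serdeConstants.LIST_COLUMN_TYPES, "int,varchar(100),timestamp");
 *   tbl.setProperty(TeradataBinarySerde.TD_TIMESTAMP_PRECISION, "6");
 *   tbl.setProperty(TeradataBinarySerde.TD_CHAR_SET, "LATIN");
 *   TeradataBinarySerde serde = new TeradataBinarySerde();
 *   serde.initialize(new Configuration(), tbl, null);
 *   Object row = serde.deserialize(record);  // record: a BytesWritable from TeradataBinaryRecordReader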
 */
@SerDeSpec(schemaProps = { serdeConstants.LIST_COLUMNS, serdeConstants.LIST_COLUMN_TYPES })
public class TeradataBinarySerde extends AbstractSerDe {
  private static final Logger LOG = LoggerFactory.getLogger(TeradataBinarySerde.class);

  public static final String TD_SCHEMA_LITERAL = "teradata.schema.literal";

  private StructObjectInspector rowOI;
  private ArrayList<Object> row;
  private byte[] inForNull;

  private int numCols;
  private List<String> columnNames;
  private List<TypeInfo> columnTypes;

  private TeradataBinaryDataOutputStream out;
  private BytesWritable serializeBytesWritable;
  private byte[] outForNull;

  public static final String TD_TIMESTAMP_PRECISION = "teradata.timestamp.precision";
  private int timestampPrecision;
  private static final int DEFAULT_TIMESTAMP_BYTE_NUM = 19;
  private static final String DEFAULT_TIMESTAMP_PRECISION = "6";

  public static final String TD_CHAR_SET = "teradata.char.charset";
  private String charCharset;
  private static final String DEFAULT_CHAR_CHARSET = "UNICODE";
  private static final Map<String, Integer> CHARSET_TO_BYTE_NUM = ImmutableMap.of("LATIN", 2, "UNICODE", 3);

  @Override
  public void initialize(Configuration configuration, Properties tableProperties, Properties partitionProperties)
      throws SerDeException {
    super.initialize(configuration, tableProperties, partitionProperties);

    columnNames = Arrays.asList(properties.getProperty(serdeConstants.LIST_COLUMNS).split(","));

    String columnTypeProperty = properties.getProperty(serdeConstants.LIST_COLUMN_TYPES);
    LOG.debug(serdeConstants.LIST_COLUMN_TYPES + ": " + columnTypeProperty);
    if (columnTypeProperty.length() == 0) {
      columnTypes = new ArrayList<>();
    } else {
      columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty);
    }

    assert columnNames.size() == columnTypes.size();
    numCols = columnNames.size();

    // get the configured teradata timestamp precision
    // the binary file generated by TPT/BTEQ can be configured to carry timestamps of a different precision
    timestampPrecision = Integer.parseInt(properties.getProperty(TD_TIMESTAMP_PRECISION, DEFAULT_TIMESTAMP_PRECISION));

    // get the configured teradata char charset
    // in Teradata, the LATIN charset uses 2 bytes per char and UNICODE uses 3 bytes per char
    charCharset = properties.getProperty(TD_CHAR_SET, DEFAULT_CHAR_CHARSET);
    if (!CHARSET_TO_BYTE_NUM.containsKey(charCharset)) {
      throw new SerDeException(
          format("%s isn't a supported Teradata char charset; supported charsets are %s", charCharset,
              CHARSET_TO_BYTE_NUM.keySet()));
    }

    // All columns have to be primitive.
    // Constructing the row ObjectInspector:
    List<ObjectInspector> columnOIs = new ArrayList<>(numCols);
    for (int i = 0; i < numCols; i++) {
      if (columnTypes.get(i).getCategory() != ObjectInspector.Category.PRIMITIVE) {
        throw new SerDeException(
            getClass().getName() + " only accepts primitive columns, but column[" + i + "] named " + columnNames.get(i)
                + " has category " + columnTypes.get(i).getCategory());
      }
      columnOIs.add(TypeInfoUtils.getStandardWritableObjectInspectorFromTypeInfo(columnTypes.get(i)));
    }

    rowOI = ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, columnOIs);

    // Construct the row object; it will be reused for all rows
    row = new ArrayList<>(numCols);
    for (int i = 0; i < numCols; i++) {
      row.add(null);
    }

    // Initialize vars related to Null Array which represents the null bitmap
    int byteNumForNullArray = (numCols / 8) + ((numCols % 8 == 0) ? 0 : 1);
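    // Worked example: numCols = 10 -> 10 / 8 + 1 = 2 bytes; a null in column 9
    // (0-based) sets bit 7 - (9 % 8) = 6 of byte 9 / 8 = 1, i.e. mask 0x40.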
    LOG.debug(format("The Null Bytes for each record will have %s bytes", byteNumForNullArray));
    inForNull = new byte[byteNumForNullArray];

    out = new TeradataBinaryDataOutputStream();
    serializeBytesWritable = new BytesWritable();
    outForNull = new byte[byteNumForNullArray];
  }

  /**
   * Returns the Writable class that would be returned by the serialize method.
   * This is used to initialize SequenceFile header.
   */
  @Override public Class<? extends Writable> getSerializedClass() {
    return BytesWritable.class;
  }

  /**
   * Serialize an object by navigating inside the Object with the
   * ObjectInspector. In most cases, the return value of this function will be
   * constant since the function will reuse the Writable object. If the client
   * wants to keep a copy of the Writable, the client needs to clone the
   * returned value.
   *
   * @param obj
   * @param objInspector
   */
  @Override public Writable serialize(Object obj, ObjectInspector objInspector) throws SerDeException {
    try {
      out.reset();
      final StructObjectInspector outputRowOI = (StructObjectInspector) objInspector;
      final List<? extends StructField> fieldRefs = outputRowOI.getAllStructFieldRefs();

      if (fieldRefs.size() != numCols) {
        throw new SerDeException(
            "Cannot serialize the object because there are " + fieldRefs.size() + " fieldRefs but the table defined "
                + numCols + " columns.");
      }

      // Fully refresh the null array before writing it to the output
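      // A set bit marks a NULL field. Bits are MSB-first within each byte:
      // field i maps to byte i / 8, bit 7 - (i % 8), the same layout that
      // deserialize() checks with (128 >> (i % 8)).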
      for (int i = 0; i < numCols; i++) {
        Object objectForField = outputRowOI.getStructFieldData(obj, fieldRefs.get(i));
        if (objectForField == null) {
          outForNull[i / 8] = (byte) (outForNull[i / 8] | (0x01 << (7 - (i % 8))));
        } else {
          outForNull[i / 8] = (byte) (outForNull[i / 8] & ~(0x01 << (7 - (i % 8))));
        }
      }
      out.write(outForNull);

      // serialize each field using FieldObjectInspector
      for (int i = 0; i < numCols; i++) {
        Object objectForField = outputRowOI.getStructFieldData(obj, fieldRefs.get(i));
        serializeField(objectForField, fieldRefs.get(i).getFieldObjectInspector(), columnTypes.get(i));
      }

      serializeBytesWritable.set(out.toByteArray(), 0, out.size());
      return serializeBytesWritable;
    } catch (IOException e) {
      throw new SerDeException(e);
    }
  }

  private void serializeField(Object objectForField, ObjectInspector oi, TypeInfo ti)
      throws IOException, SerDeException {
    switch (oi.getCategory()) {
    case PRIMITIVE:
      PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi;
      switch (poi.getPrimitiveCategory()) {
      // Teradata Type: BYTEINT
      case BYTE:
        ByteObjectInspector boi = (ByteObjectInspector) poi;
        byte b = 0;
        if (objectForField != null) {
          b = boi.get(objectForField);
        }
        out.write(b);
        return;
      // Teradata Type: SMALLINT
      case SHORT:
        ShortObjectInspector spoi = (ShortObjectInspector) poi;
        short s = 0;
        if (objectForField != null) {
          s = spoi.get(objectForField);
        }
        out.writeShort(s);
        return;
      // Teradata Type: INT
      case INT:
        IntObjectInspector ioi = (IntObjectInspector) poi;
        int i = 0;
        if (objectForField != null) {
          i = ioi.get(objectForField);
        }
        out.writeInt(i);
        return;
      // Teradata Type: BIGINT
      case LONG:
        LongObjectInspector loi = (LongObjectInspector) poi;
        long l = 0;
        if (objectForField != null) {
          l = loi.get(objectForField);
        }
        out.writeLong(l);
        return;
      // Teradata Type: FLOAT
      case DOUBLE:
        DoubleObjectInspector doi = (DoubleObjectInspector) poi;
        double d = 0;
        if (objectForField != null) {
          d = doi.get(objectForField);
        }
        out.writeDouble(d);
        return;
      // Teradata Type: VARCHAR
      case VARCHAR:
        HiveVarcharObjectInspector hvoi = (HiveVarcharObjectInspector) poi;
        HiveVarcharWritable hv = hvoi.getPrimitiveWritableObject(objectForField);
        // assert the length of varchar record fits into the table definition
        if (hv != null) {
          assert ((VarcharTypeInfo) ti).getLength() >= hv.getHiveVarchar().getCharacterLength();
        }
        out.writeVarChar(hv);
        return;
      // Teradata Type: TIMESTAMP
      case TIMESTAMP:
        TimestampObjectInspector tsoi = (TimestampObjectInspector) poi;
        TimestampWritableV2 ts = tsoi.getPrimitiveWritableObject(objectForField);
        out.writeTimestamp(ts, getTimeStampByteNum(timestampPrecision));
        return;
      // Teradata Type: DATE
      case DATE:
        DateObjectInspector dtoi = (DateObjectInspector) poi;
        DateWritableV2 dw = dtoi.getPrimitiveWritableObject(objectForField);
        out.writeDate(dw);
        return;
      // Teradata Type: CHAR
      case CHAR:
        HiveCharObjectInspector coi = (HiveCharObjectInspector) poi;
        HiveCharWritable hc = coi.getPrimitiveWritableObject(objectForField);
        // assert the length of char record fits into the table definition
        if (hc != null) {
          assert ((CharTypeInfo) ti).getLength() >= hc.getHiveChar().getCharacterLength();
        }
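        // fixed-width field: bytes-per-char for the configured charset times the
        // declared CHAR length, e.g. CHAR(10) with UNICODE -> 3 * 10 = 30 bytes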
        out.writeChar(hc, getCharByteNum(charCharset) * ((CharTypeInfo) ti).getLength());
        return;
      // Teradata Type: DECIMAL
      case DECIMAL:
        DecimalTypeInfo dtype = (DecimalTypeInfo) ti;
        int precision = dtype.precision();
        int scale = dtype.scale();
        HiveDecimalObjectInspector hdoi = (HiveDecimalObjectInspector) poi;
        HiveDecimalWritable hd = hdoi.getPrimitiveWritableObject(objectForField);
        // assert the precision of decimal record fits into the table definition
        if (hd != null) {
          assert (dtype.getPrecision() >= hd.precision());
        }
        out.writeDecimal(hd, getDecimalByteNum(precision), scale);
        return;
      // Teradata Type: VARBYTE
      case BINARY:
        BinaryObjectInspector bnoi = (BinaryObjectInspector) poi;
        BytesWritable byw = bnoi.getPrimitiveWritableObject(objectForField);
        out.writeVarByte(byw);
        return;
      default:
        throw new SerDeException("Unrecognized type: " + poi.getPrimitiveCategory());
      }
      // Currently, serialization of complex types is not supported
    case LIST:
    case MAP:
    case STRUCT:
    default:
      throw new SerDeException("Unrecognized type: " + oi.getCategory());
    }
  }

  /**
   * Deserialize an object out of a Writable blob. In most cases, the return
   * value of this function will be constant since the function will reuse the
   * returned object. If the client wants to keep a copy of the object, the
   * client needs to clone the returned value by calling
   * ObjectInspectorUtils.getStandardObject().
   *
   * @param blob
   *          The Writable object containing a serialized object
   * @return A Java object representing the contents in the blob.
   */
  @Override public Object deserialize(Writable blob) throws SerDeException {
    try {
      BytesWritable data = (BytesWritable) blob;

      // initialize the data to be the input stream
      TeradataBinaryDataInputStream in =
          new TeradataBinaryDataInputStream(new ByteArrayInputStream(data.getBytes(), 0, data.getLength()));

      int numOfByteRead = in.read(inForNull);

      if (inForNull.length != 0 && numOfByteRead != inForNull.length) {
        throw new EOFException("not enough bytes for one object");
      }

      boolean isNull;
      for (int i = 0; i < numCols; i++) {
        // get if the ith field is null or not
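        // (128 >> (i % 8)) selects the MSB-first bit that serialize() sets via (0x01 << (7 - (i % 8)))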
        isNull = ((inForNull[i / 8] & (128 >> (i % 8))) != 0);
        row.set(i, deserializeField(in, columnTypes.get(i), row.get(i), isNull));
      }

      // After deserializing all the fields, the input should be exhausted, in which case in.read() returns -1
      if (in.read() != -1) {
        throw new EOFException("The input stream has more bytes after all fields were deserialized - this is unexpected");
      }
    } catch (EOFException e) {
      LOG.warn("Catch thrown exception", e);
      LOG.warn("This record has been polluted. We have reset all the row fields to be null");
      for (int i = 0; i < numCols; i++) {
        row.set(i, null);
      }
    } catch (IOException e) {
      throw new SerDeException(e);
    } catch (ParseException e) {
      throw new SerDeException(e);
    }
    return row;
  }

  private Object deserializeField(TeradataBinaryDataInputStream in, TypeInfo type, Object reuse, boolean isNull)
      throws IOException, ParseException, SerDeException {
    // isNull:
    // In the Teradata Binary file, even if a field is null (isNull=true),
    // the data still contains default values that pad out the record,
    // so these bytes must be read even though they are not used.
    switch (type.getCategory()) {
    case PRIMITIVE:
      PrimitiveTypeInfo ptype = (PrimitiveTypeInfo) type;
      switch (ptype.getPrimitiveCategory()) {
      case VARCHAR: // Teradata Type: VARCHAR
        String st = in.readVarchar();
        if (isNull) {
          return null;
        } else {
          HiveVarcharWritable r = reuse == null ? new HiveVarcharWritable() : (HiveVarcharWritable) reuse;
          r.set(st, ((VarcharTypeInfo) type).getLength());
          return r;
        }
      case INT: // Teradata Type: INT
        int i = in.readInt();
        if (isNull) {
          return null;
        } else {
          IntWritable r = reuse == null ? new IntWritable() : (IntWritable) reuse;
          r.set(i);
          return r;
        }
      case TIMESTAMP: // Teradata Type: TIMESTAMP
        Timestamp ts = in.readTimestamp(getTimeStampByteNum(timestampPrecision));
        if (isNull) {
          return null;
        } else {
          TimestampWritableV2 r = reuse == null ? new TimestampWritableV2() : (TimestampWritableV2) reuse;
          r.set(ts);
          return r;
        }
      case DOUBLE: // Teradata Type: FLOAT
        double d = in.readDouble();
        if (isNull) {
          return null;
        } else {
          DoubleWritable r = reuse == null ? new DoubleWritable() : (DoubleWritable) reuse;
          r.set(d);
          return r;
        }
      case DATE: // Teradata Type: DATE
        Date dt = in.readDate();
        if (isNull) {
          return null;
        } else {
          DateWritableV2 r = reuse == null ? new DateWritableV2() : (DateWritableV2) reuse;
          r.set(dt);
          return r;
        }
      case BYTE: // Teradata Type: BYTEINT
        byte bt = in.readByte();
        if (isNull) {
          return null;
        } else {
          ByteWritable r = reuse == null ? new ByteWritable() : (ByteWritable) reuse;
          r.set(bt);
          return r;
        }
      case LONG: // Teradata Type: BIGINT
        long l = in.readLong();
        if (isNull) {
          return null;
        } else {
          LongWritable r = reuse == null ? new LongWritable() : (LongWritable) reuse;
          r.set(l);
          return r;
        }
      case CHAR: // Teradata Type: CHAR
        CharTypeInfo ctype = (CharTypeInfo) type;
        int length = ctype.getLength();
        String c = in.readChar(length * getCharByteNum(charCharset));
        if (isNull) {
          return null;
        } else {
          HiveCharWritable r = reuse == null ? new HiveCharWritable() : (HiveCharWritable) reuse;
          r.set(c, length);
          return r;
        }
      case DECIMAL: // Teradata Type: DECIMAL
        DecimalTypeInfo dtype = (DecimalTypeInfo) type;
        int precision = dtype.precision();
        int scale = dtype.scale();
        HiveDecimal hd = in.readDecimal(scale, getDecimalByteNum(precision));
        if (isNull) {
          return null;
        } else {
          HiveDecimalWritable r = (reuse == null ? new HiveDecimalWritable() : (HiveDecimalWritable) reuse);
          r.set(hd);
          return r;
        }
      case SHORT: // Teradata Type: SMALLINT
        short s = in.readShort();
        if (isNull) {
          return null;
        } else {
          ShortWritable r = reuse == null ? new ShortWritable() : (ShortWritable) reuse;
          r.set(s);
          return r;
        }
      case BINARY: // Teradata Type: VARBYTE
        byte[] content = in.readVarbyte();
        if (isNull) {
          return null;
        } else {
          BytesWritable r = new BytesWritable();
          r.set(content, 0, content.length);
          return r;
        }
      default:
        throw new SerDeException("Unrecognized type: " + ptype.getPrimitiveCategory());
      }
      // Currently, deserialization of complex types is not supported
    case LIST:
    case MAP:
    case STRUCT:
    default:
      throw new SerDeException("Unsupported category: " + type.getCategory());
    }
  }

  /**
   * Get the object inspector that can be used to navigate through the internal
   * structure of the Object returned from deserialize(...).
   */
  @Override public ObjectInspector getObjectInspector() throws SerDeException {
    return rowOI;
  }

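  // Teradata carries TIMESTAMP as a fixed-width character field:
  // "yyyy-mm-dd hh:mm:ss" takes 19 bytes (DEFAULT_TIMESTAMP_BYTE_NUM); a nonzero
  // precision p adds a '.' plus p fractional digits, e.g. p = 6 -> 19 + 1 + 6 = 26 bytes.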
  private int getTimeStampByteNum(int precision) {
    if (precision == 0) {
      return DEFAULT_TIMESTAMP_BYTE_NUM;
    } else {
      return precision + 1 + DEFAULT_TIMESTAMP_BYTE_NUM;
    }
  }

  private int getCharByteNum(String charset) throws SerDeException {
    if (!CHARSET_TO_BYTE_NUM.containsKey(charset)) {
      throw new SerDeException(
          format("%s isn't a supported Teradata char charset; supported charsets are %s", charset,
              CHARSET_TO_BYTE_NUM.keySet()));
    } else {
      return CHARSET_TO_BYTE_NUM.get(charset);
    }
  }

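  // Byte widths follow Teradata's DECIMAL storage sizes:
  // precision 1-2 -> 1 byte, 3-4 -> 2, 5-9 -> 4, 10-18 -> 8, 19-38 -> 16.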
  private int getDecimalByteNum(int precision) throws SerDeException {
    if (precision <= 0) {
      throw new SerDeException(format("the precision of Decimal should be bigger than 0. %d is illegal", precision));
    }
    if (precision <= 2) {
      return 1;
    }
    if (precision <= 4) {
      return 2;
    }
    if (precision <= 9) {
      return 4;
    }
    if (precision <= 18) {
      return 8;
    }
    if (precision <= 38) {
      return 16;
    }
    throw new SerDeException(
        format("the precision of Decimal should be smaller than 39. %d is illegal", precision));
  }
}