org.apache.hadoop.hive.serde2.teradata.TeradataBinaryDataInputStream Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of hive-serde
The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.serde2.teradata;

import org.apache.commons.io.input.SwappedDataInputStream;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.hive.common.type.Date;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.common.type.Timestamp;

import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;

import static java.lang.String.format;

/**
 * Input stream that decodes the fields of a record stored in the Teradata binary format.
 * The Teradata binary format stores SHORT, INT, LONG, DOUBLE, etc. in little-endian order
 * while Hadoop uses big-endian, so this class extends SwappedDataInputStream for those types
 * and adds readers for the Teradata specific types like VARCHAR, CHAR, TIMESTAMP, DATE...
 */
public class TeradataBinaryDataInputStream extends SwappedDataInputStream {

  /** Number of characters in the YYYYMMDD representation of a date. */
  private static final int DATE_STRING_LENGTH = 8;

  /**
   * Instantiates a new Teradata binary data input stream.
   *
   * @param input the input
   */
  public TeradataBinaryDataInputStream(InputStream input) {
    super(input);
  }

  /**
   * Read VARCHAR(N).
   * The representation of Varchar in Teradata binary format is:
   * the first two bytes represent the length N of this varchar field,
   * the next N bytes represent the content of this varchar field.
   * To pad the null varchar, the length will be 0 and the content will be none.
   *
   * @return the string, decoded as UTF-8 (empty when the length prefix is 0)
   * @throws IOException the io exception; an EOFException if the stream ends before N bytes are read
   */
  public String readVarchar() throws IOException {
    int varcharLength = readUnsignedShort();
    byte[] varcharContent = readExactly(varcharLength, "varchar");
    //force it to be UTF8 string
    return new String(varcharContent, StandardCharsets.UTF_8);
  }

  /**
   * Read TIMESTAMP(P).
   * The representation of timestamp in Teradata binary format is:
   * the byte number to read is based on the precision of timestamp,
   * each byte represents one char and the timestamp is using string representation,
   * eg: for TIMESTAMP(6), we need to read 26 bytes
   * 31 39  31 31 2d 31 31 2d 31 31 20 31 39 3a 32 30 3a 32 31 2e 34 33 33 32 30 30
   * will represent 1911-11-11 19:20:21.433200.
   * the null timestamp will use space to pad.
   *
   * @param byteNum the byte number that will be read from inputstream
   * @return the timestamp, or null when the field contains only whitespace
   * @throws IOException the io exception; an EOFException if the stream ends before byteNum bytes are read
   */
  public Timestamp readTimestamp(Integer byteNum) throws IOException {
    // yyyy-mm-dd hh:mm:ss
    byte[] timestampContent = readExactly(byteNum, "timestamp");
    String timestampStr = new String(timestampContent, StandardCharsets.UTF_8);
    // a blank (space padded) field is the encoding of a null timestamp
    if (timestampStr.trim().isEmpty()) {
      return null;
    }
    return Timestamp.valueOf(timestampStr);
  }

  /**
   * Read DATE.
   * The representation of date in Teradata binary format is:
   * The Date D is a int with 4 bytes using little endian,
   * The representation is (D+19000000).ToString -> YYYYMMDD,
   * eg: Date 07 b2 01 00 -> 111111 in little endian -> 19111111 - > 1911.11.11.
   * the null date will use 0 to pad.
   *
   * @return the date, or null when the stored int is 0
   * @throws IOException the io exception
   * @throws ParseException the parse exception (declared for interface compatibility)
   */
  public Date readDate() throws IOException, ParseException {
    int di = readInt();
    if (di == 0) {
      return null;
    }
    // leftPad returns the string unchanged when it already has DATE_STRING_LENGTH characters,
    // so years below 1000 are zero-padded and everything else passes through untouched
    String dateString = StringUtils.leftPad(String.valueOf(di + 19000000), DATE_STRING_LENGTH, '0');
    Date date = new Date();
    date.setYear(Integer.parseInt(dateString.substring(0, 4)));
    date.setMonth(Integer.parseInt(dateString.substring(4, 6)));
    date.setDayOfMonth(Integer.parseInt(dateString.substring(6, 8)));
    return date;
  }

  /**
   * Read CHAR(N).
   * The representation of char in Teradata binary format is
   * the byte number to read is based on the [charLength] * [bytePerChar] <- totalLength,
   * bytePerChar is decided by the charset: LATIN charset is 2 bytes per char and UNICODE charset is 3 bytes per char.
   * the null char will use space to pad.
   *
   * @param totalLength the total length in bytes
   * @return the string, decoded as UTF-8 (padding is not stripped)
   * @throws IOException the io exception; an EOFException if the stream ends before totalLength bytes are read
   */
  public String readChar(int totalLength) throws IOException {
    byte[] charContent = readExactly(totalLength, "char");
    return new String(charContent, StandardCharsets.UTF_8);
  }

  /**
   * Read DECIMAL(P, S).
   * The representation of decimal in Teradata binary format is
   * the byte number to read is decided solely by the precision(P),
   * HiveDecimal is constructed through the byte array and scale.
   * the null DECIMAL will use 0x00 to pad.
   *
   * @param scale the scale
   * @param byteNum the byte num
   * @return the hive decimal
   * @throws IOException the io exception; an EOFException if the stream ends before byteNum bytes are read
   */
  public HiveDecimal readDecimal(int scale, int byteNum) throws IOException {
    byte[] decimalContent = readExactly(byteNum, "decimal");
    // the unscaled value is stored little-endian while BigInteger expects big-endian two's complement
    ArrayUtils.reverse(decimalContent);
    return HiveDecimal.create(new BigInteger(decimalContent), scale);
  }

  /**
   * Read VARBYTE(N).
   * The representation of VARBYTE in Teradata binary format is:
   * the first two bytes represent the length N of this varbyte field
   * the next N bytes represent the content of this varbyte field.
   * To pad the null varbyte, the length will be 0 and the content will be none.
   *
   * @return the byte [ ] (empty when the length prefix is 0)
   * @throws IOException the io exception; an EOFException if the stream ends before N bytes are read
   */
  public byte[] readVarbyte() throws IOException {
    int varbyteLength = readUnsignedShort();
    return readExactly(varbyteLength, "varbyte");
  }

  /**
   * Reads exactly {@code length} bytes from the underlying stream.
   * A single {@code InputStream.read} call may return fewer bytes than requested even when more
   * data follows, so the read is repeated until the buffer is full or the stream ends.
   *
   * @param length the number of bytes to read
   * @param typeName the Teradata type being read, used only in the error message
   * @return a new array holding the {@code length} bytes read
   * @throws EOFException if the stream ends before {@code length} bytes are available
   * @throws IOException if the underlying stream fails
   */
  private byte[] readExactly(int length, String typeName) throws IOException {
    byte[] content = new byte[length];
    int totalRead = 0;
    while (totalRead < length) {
      int count = in.read(content, totalRead, length - totalRead);
      if (count < 0) {
        throw new EOFException(
            format("Fail to read the %s. Expect %d bytes, get %d bytes", typeName, length, totalRead));
      }
      totalRead += count;
    }
    return content;
  }
}