All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hadoop.hive.serde2.lazybinary.LazyBinaryUtils Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.serde2.lazybinary;

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ConcurrentHashMap;

import org.apache.hadoop.hive.serde2.ByteStream.RandomAccessOutput;
import org.apache.hadoop.hive.serde2.io.TimestampLocalTZWritable;
import org.apache.hadoop.hive.serde2.io.TimestampWritableV2;
import org.apache.hadoop.hive.serde2.lazybinary.objectinspector.LazyBinaryObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo;
import org.apache.hadoop.io.WritableUtils;

/**
 * LazyBinaryUtils.
 *
 */
public final class LazyBinaryUtils {

  /**
   * Convert the byte array to an int starting from the given offset. Refer to
   * code by aeden on DZone Snippets:
   *
   * @param b
   *          the byte array
   * @param offset
   *          the array offset
   * @return the integer
   */
  public static int byteArrayToInt(byte[] b, int offset) {
    int value = 0;
    for (int i = 0; i < 4; i++) {
      int shift = (4 - 1 - i) * 8;
      value += (b[i + offset] & 0x000000FF) << shift;
    }
    return value;
  }

  /**
   * Convert the byte array to a long starting from the given offset.
   *
   * @param b
   *          the byte array
   * @param offset
   *          the array offset
   * @return the long
   */
  public static long byteArrayToLong(byte[] b, int offset) {
    long value = 0;
    for (int i = 0; i < 8; i++) {
      int shift = (8 - 1 - i) * 8;
      value += ((long) (b[i + offset] & 0x00000000000000FF)) << shift;
    }
    return value;
  }

  /**
   * Convert the byte array to a short starting from the given offset.
   *
   * @param b
   *          the byte array
   * @param offset
   *          the array offset
   * @return the short
   */
  public static short byteArrayToShort(byte[] b, int offset) {
    short value = 0;
    value += (b[offset] & 0x000000FF) << 8;
    value += (b[offset + 1] & 0x000000FF);
    return value;
  }

  /**
   * Record is the unit that data is serialized in. A record includes two parts.
   * The first part stores the size of the element and the second part stores
   * the real element. size element record -> |----|-------------------------|
   *
   * A RecordInfo stores two information of a record, the size of the "size"
   * part which is the element offset and the size of the element part which is
   * element size.
   */
  public static class RecordInfo {
    public RecordInfo() {
      elementOffset = 0;
      elementSize = 0;
    }

    public byte elementOffset;
    public int elementSize;

    @Override
    public String toString() {
      return "(" + elementOffset + ", " + elementSize + ")";
    }
  }

  /**
   * Check a particular field and set its size and offset in bytes based on the
   * field type and the bytes arrays.
   *
   * For void, boolean, byte, short, int, long, float and double, there is no
   * offset and the size is fixed. For string, map, list, struct, the first four
   * bytes are used to store the size. So the offset is 4 and the size is
   * computed by concating the first four bytes together. The first four bytes
   * are defined with respect to the offset in the bytes arrays.
   * For timestamp, if the first bit is 0, the record length is 4, otherwise
   * a VInt begins at the 5th byte and its length is added to 4.
   *
   * @param objectInspector
   *          object inspector of the field
   * @param bytes
   *          bytes arrays store the table row
   * @param offset
   *          offset of this field
   * @param recordInfo
   *          modify this byteinfo object and return it
   */
  public static void checkObjectByteInfo(ObjectInspector objectInspector,
      byte[] bytes, int offset, RecordInfo recordInfo, VInt vInt) {
    Category category = objectInspector.getCategory();
    switch (category) {
    case PRIMITIVE:
      PrimitiveCategory primitiveCategory = ((PrimitiveObjectInspector) objectInspector)
          .getPrimitiveCategory();
      switch (primitiveCategory) {
      case VOID:
        recordInfo.elementOffset = 0;
        recordInfo.elementSize = 0;
        break;
      case BOOLEAN:
      case BYTE:
        recordInfo.elementOffset = 0;
        recordInfo.elementSize = 1;
        break;
      case SHORT:
        recordInfo.elementOffset = 0;
        recordInfo.elementSize = 2;
        break;
      case FLOAT:
        recordInfo.elementOffset = 0;
        recordInfo.elementSize = 4;
        break;
      case DOUBLE:
        recordInfo.elementOffset = 0;
        recordInfo.elementSize = 8;
        break;
      case INT:
        recordInfo.elementOffset = 0;
        recordInfo.elementSize = WritableUtils.decodeVIntSize(bytes[offset]);
        break;
      case LONG:
        recordInfo.elementOffset = 0;
        recordInfo.elementSize = WritableUtils.decodeVIntSize(bytes[offset]);
        break;
      case STRING:
        // using vint instead of 4 bytes
        LazyBinaryUtils.readVInt(bytes, offset, vInt);
        recordInfo.elementOffset = vInt.length;
        recordInfo.elementSize = vInt.value;
        break;
      case CHAR:
      case VARCHAR:
        LazyBinaryUtils.readVInt(bytes, offset, vInt);
        recordInfo.elementOffset = vInt.length;
        recordInfo.elementSize = vInt.value;
        break;
      case BINARY:
        // using vint instead of 4 bytes
        LazyBinaryUtils.readVInt(bytes, offset, vInt);
        recordInfo.elementOffset = vInt.length;
        recordInfo.elementSize = vInt.value;
        break;
      case DATE:
        recordInfo.elementOffset = 0;
        recordInfo.elementSize = WritableUtils.decodeVIntSize(bytes[offset]);
        break;
      case TIMESTAMP:
        recordInfo.elementOffset = 0;
        recordInfo.elementSize = TimestampWritableV2.getTotalLength(bytes, offset);
        break;
      case TIMESTAMPLOCALTZ:
        recordInfo.elementOffset = 0;
        recordInfo.elementSize = TimestampLocalTZWritable.getTotalLength(bytes, offset);
        break;
      case INTERVAL_YEAR_MONTH:
        recordInfo.elementOffset = 0;
        recordInfo.elementSize = WritableUtils.decodeVIntSize(bytes[offset]);
        break;
      case INTERVAL_DAY_TIME:
        recordInfo.elementOffset = 0;
        int secondsSize = WritableUtils.decodeVIntSize(bytes[offset]);
        int nanosSize = WritableUtils.decodeVIntSize(bytes[offset + secondsSize]);
        recordInfo.elementSize = secondsSize + nanosSize;
        break;
      case DECIMAL:
        // using vint instead of 4 bytes
        LazyBinaryUtils.readVInt(bytes, offset, vInt);
        recordInfo.elementOffset = 0;
        recordInfo.elementSize = vInt.length;
        LazyBinaryUtils.readVInt(bytes, offset + vInt.length, vInt);
        recordInfo.elementSize += vInt.length + vInt.value;
        break;
      default: {
        throw new RuntimeException("Unrecognized primitive type: "
            + primitiveCategory);
      }
      }
      break;
    case LIST:
    case MAP:
    case STRUCT:
    case UNION:
      recordInfo.elementOffset = 4;
      recordInfo.elementSize = LazyBinaryUtils.byteArrayToInt(bytes, offset);
      break;
    default: {
      throw new RuntimeException("Unrecognized non-primitive type: " + category);
    }
    }
  }

  /**
   * A zero-compressed encoded long.
   */
  public static class VLong {
    public VLong() {
      value = 0;
      length = 0;
    }

    public long value;
    public byte length;
  };

  /**
   * Reads a zero-compressed encoded long from a byte array and returns it.
   *
   * @param bytes
   *          the byte array
   * @param offset
   *          offset of the array to read from
   * @param vlong
   *          storing the deserialized long and its size in byte
   */
  public static void readVLong(byte[] bytes, int offset, VLong vlong) {
    byte firstByte = bytes[offset];
    vlong.length = (byte) WritableUtils.decodeVIntSize(firstByte);
    if (vlong.length == 1) {
      vlong.value = firstByte;
      return;
    }
    long i = 0;
    for (int idx = 0; idx < vlong.length - 1; idx++) {
      byte b = bytes[offset + 1 + idx];
      i = i << 8;
      i = i | (b & 0xFF);
    }
    vlong.value = (WritableUtils.isNegativeVInt(firstByte) ? (i ^ -1L) : i);
  }

  /**
   * A zero-compressed encoded integer.
   */
  public static class VInt {
    public VInt() {
      value = 0;
      length = 0;
    }

    public int value;
    public byte length;
  };

  public static final ThreadLocal threadLocalVInt = new ThreadLocal() {
    @Override
    protected VInt initialValue() {
      return new VInt();
    }
  };

  /**
   * Reads a zero-compressed encoded int from a byte array and returns it.
   *
   * @param bytes
   *          the byte array
   * @param offset
   *          offset of the array to read from
   * @param vInt
   *          storing the deserialized int and its size in byte
   */
  public static void readVInt(byte[] bytes, int offset, VInt vInt) {
    byte firstByte = bytes[offset];
    vInt.length = (byte) WritableUtils.decodeVIntSize(firstByte);
    if (vInt.length == 1) {
      vInt.value = firstByte;
      return;
    }
    int i = 0;
    for (int idx = 0; idx < vInt.length - 1; idx++) {
      byte b = bytes[offset + 1 + idx];
      i = i << 8;
      i = i | (b & 0xFF);
    }
    vInt.value = (WritableUtils.isNegativeVInt(firstByte) ? (i ^ -1) : i);
  }

  /**
   * Writes a zero-compressed encoded int to a byte array.
   *
   * @param byteStream
   *          the byte array/stream
   * @param i
   *          the int
   */
  public static void writeVInt(RandomAccessOutput byteStream, int i) {
    writeVLong(byteStream, i);
  }

  public static void writeVInt(RandomAccessOutput byteStream, int i,
      byte[] scratchBytes) {
    writeVLong(byteStream, i, scratchBytes);
  }

  /**
   * Read a zero-compressed encoded long from a byte array.
   *
   * @param bytes the byte array
   * @param offset the offset in the byte array where the VLong is stored
   * @return the long
   */
  public static long readVLongFromByteArray(final byte[] bytes, int offset) {
    byte firstByte = bytes[offset++];
    int len = WritableUtils.decodeVIntSize(firstByte);
    if (len == 1) {
      return firstByte;
    }
    long i = 0;
    for (int idx = 0; idx < len-1; idx++) {
      byte b = bytes[offset++];
      i = i << 8;
      i = i | (b & 0xFF);
    }
    return (WritableUtils.isNegativeVInt(firstByte) ? ~i : i);
  }

  /**
   * Write a zero-compressed encoded long to a byte array.
   *
   * @param bytes
   *          the byte array/stream
   * @param l
   *          the long
   */
  public static int writeVLongToByteArray(byte[] bytes, long l) {
    return LazyBinaryUtils.writeVLongToByteArray(bytes, 0, l);
  }

  public static int writeVLongToByteArray(byte[] bytes, int offset, long l) {
    if (l >= -112 && l <= 127) {
      bytes[offset] = (byte) l;
      return 1;
    }

    int len = -112;
    if (l < 0) {
      l ^= -1L; // take one's complement'
      len = -120;
    }

    long tmp = l;
    while (tmp != 0) {
      tmp = tmp >> 8;
      len--;
    }

    bytes[offset] = (byte) len;

    len = (len < -120) ? -(len + 120) : -(len + 112);

    for (int idx = len; idx != 0; idx--) {
      int shiftbits = (idx - 1) * 8;
      long mask = 0xFFL << shiftbits;
      bytes[offset+1-(idx - len)] = (byte) ((l & mask) >> shiftbits);
    }
    return 1 + len;
  }

  public static final int VLONG_BYTES_LEN = 9;

  private static ThreadLocal vLongBytesThreadLocal = new ThreadLocal() {
    @Override
    public byte[] initialValue() {
      return new byte[VLONG_BYTES_LEN];
    }
  };

  public static void writeVLong(RandomAccessOutput byteStream, long l) {
    byte[] vLongBytes = vLongBytesThreadLocal.get();
    int len = LazyBinaryUtils.writeVLongToByteArray(vLongBytes, l);
    byteStream.write(vLongBytes, 0, len);
  }

  public static void writeVLong(RandomAccessOutput byteStream, long l,
      byte[] scratchBytes) {
    int len = LazyBinaryUtils.writeVLongToByteArray(scratchBytes, l);
    byteStream.write(scratchBytes, 0, len);
  }

  public static void writeDouble(RandomAccessOutput byteStream, double d) {
    long v = Double.doubleToLongBits(d);
    byteStream.write((byte) (v >> 56));
    byteStream.write((byte) (v >> 48));
    byteStream.write((byte) (v >> 40));
    byteStream.write((byte) (v >> 32));
    byteStream.write((byte) (v >> 24));
    byteStream.write((byte) (v >> 16));
    byteStream.write((byte) (v >> 8));
    byteStream.write((byte) (v));
  }

  static ConcurrentHashMap cachedLazyBinaryObjectInspector =
      new ConcurrentHashMap();

  /**
   * Returns the lazy binary object inspector that can be used to inspect an
   * lazy binary object of that typeInfo
   *
   * For primitive types, we use the standard writable object inspector.
   */
  public static ObjectInspector getLazyBinaryObjectInspectorFromTypeInfo(
      TypeInfo typeInfo) {
    ObjectInspector result = cachedLazyBinaryObjectInspector.get(typeInfo);
    if (result == null) {
      switch (typeInfo.getCategory()) {
      case PRIMITIVE: {
        result = PrimitiveObjectInspectorFactory
            .getPrimitiveWritableObjectInspector(((PrimitiveTypeInfo) typeInfo));
        break;
      }
      case LIST: {
        ObjectInspector elementObjectInspector = getLazyBinaryObjectInspectorFromTypeInfo(((ListTypeInfo) typeInfo)
            .getListElementTypeInfo());
        result = LazyBinaryObjectInspectorFactory
            .getLazyBinaryListObjectInspector(elementObjectInspector);
        break;
      }
      case MAP: {
        MapTypeInfo mapTypeInfo = (MapTypeInfo) typeInfo;
        ObjectInspector keyObjectInspector = getLazyBinaryObjectInspectorFromTypeInfo(mapTypeInfo
            .getMapKeyTypeInfo());
        ObjectInspector valueObjectInspector = getLazyBinaryObjectInspectorFromTypeInfo(mapTypeInfo
            .getMapValueTypeInfo());
        result = LazyBinaryObjectInspectorFactory
            .getLazyBinaryMapObjectInspector(keyObjectInspector,
            valueObjectInspector);
        break;
      }
      case STRUCT: {
        StructTypeInfo structTypeInfo = (StructTypeInfo) typeInfo;
        List fieldNames = structTypeInfo.getAllStructFieldNames();
        List fieldTypeInfos = structTypeInfo
            .getAllStructFieldTypeInfos();
        List fieldObjectInspectors = new ArrayList(
            fieldTypeInfos.size());
        for (int i = 0; i < fieldTypeInfos.size(); i++) {
          fieldObjectInspectors
              .add(getLazyBinaryObjectInspectorFromTypeInfo(fieldTypeInfos
              .get(i)));
        }
        result = LazyBinaryObjectInspectorFactory
            .getLazyBinaryStructObjectInspector(fieldNames,
            fieldObjectInspectors);
        break;
      }
      case UNION: {
        UnionTypeInfo unionTypeInfo = (UnionTypeInfo) typeInfo;
        final List fieldTypeInfos = unionTypeInfo.getAllUnionObjectTypeInfos();
        List fieldObjectInspectors = new ArrayList(
          fieldTypeInfos.size());
        for (int i = 0; i < fieldTypeInfos.size(); i++) {
          fieldObjectInspectors
            .add(getLazyBinaryObjectInspectorFromTypeInfo(fieldTypeInfos
                                                            .get(i)));
        }
        result = LazyBinaryObjectInspectorFactory
            .getLazyBinaryUnionObjectInspector(fieldObjectInspectors);
        break;
      }
      default: {
        result = null;
      }
      }
      ObjectInspector prev =
        cachedLazyBinaryObjectInspector.putIfAbsent(typeInfo, result);
      if (prev != null) {
        result = prev;
      }
    }
    return result;
  }

  private LazyBinaryUtils() {
    // prevent instantiation
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy