All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe Maven / Gradle / Ivy

There is a newer version: 4.0.1
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.serde2.columnar;

import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeSpec;
import org.apache.hadoop.hive.serde2.lazy.LazySerDeParameters;
import org.apache.hadoop.hive.serde2.lazybinary.LazyBinaryFactory;
import org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.io.Writable;


/**
 * LazyBinaryColumnarSerDe. This serde combines elements of columnar serde and lazybinary serde
 * to produce a serde which serializes columns into a BytesRefArrayWritable in a compact binary
 * format and which is deserialized in a lazy, i.e. on-demand fashion.
 *
 * <p>Each struct field is written with {@link LazyBinarySerDe}, except non-null empty strings,
 * which are written as the single marker byte {@link #INVALID_UTF__SINGLE_BYTE}.
 *
 * <p>NOTE(review): {@code cachedObjectInspector}, {@code cachedLazyStruct},
 * {@code serializeStream}, {@code serializedSize}, {@code field}, {@code serializeCache} and the
 * {@code lastOperation*} flags are not declared in this class; presumably they are inherited
 * from {@link ColumnarSerDeBase} - confirm against that class.
 */
@SerDeSpec(schemaProps = {serdeConstants.LIST_COLUMNS, serdeConstants.LIST_COLUMN_TYPES})
public class LazyBinaryColumnarSerDe extends ColumnarSerDeBase {

  /**
   * Marker written in place of an empty string: one byte with the bit pattern 10111111 (0xBF).
   * A valid utf stream cannot produce this as a single-byte sequence, so it cannot be confused
   * with real string content.
   */
  static final byte[] INVALID_UTF__SINGLE_BYTE = {(byte) 0xBF};

  /** Column names taken from the serde parameters in {@link #initialize}. */
  private List<String> columnNames;

  /** Column types taken from the serde parameters in {@link #initialize}. */
  private List<TypeInfo> columnTypes;

  /**
   * Describes this serde as {@code <class>[<column names>:<column types>]}.
   *
   * @return the class description followed by the configured column names and types
   */
  @Override
  public String toString() {
    return getClass().toString()
        + "["
        + columnNames
        + ":"
        + columnTypes + "]";
  }

  /**
   * Reads the column names and types from the table properties, builds the columnar struct
   * inspector and the reusable lazy struct, and sizes the base class for the column count.
   *
   * @param conf job configuration; when {@code null}, or when it requests all columns, every
   *     column is marked as read, otherwise the projected column ids are taken from it
   * @param tbl table properties that carry the column names and column types
   * @throws SerDeException if the serde parameters cannot be built or the base class fails to
   *     initialize
   */
  @Override
  public void initialize(Configuration conf, Properties tbl) throws SerDeException {
    LazySerDeParameters serdeParams = new LazySerDeParameters(conf, tbl, getClass().getName());

    columnNames = serdeParams.getColumnNames();
    columnTypes = serdeParams.getColumnTypes();

    cachedObjectInspector = LazyBinaryFactory.createColumnarStructInspector(
        columnNames, columnTypes);

    int size = columnTypes.size();
    List<Integer> notSkipIDs;
    if (conf == null || ColumnProjectionUtils.isReadAllColumns(conf)) {
      // No projection information available: read every column.
      notSkipIDs = new ArrayList<Integer>(size);
      for (int i = 0; i < size; i++) {
        notSkipIDs.add(i);
      }
    } else {
      notSkipIDs = ColumnProjectionUtils.getReadColumnIDs(conf);
    }
    cachedLazyStruct = new LazyBinaryColumnarStruct(cachedObjectInspector, notSkipIDs);

    super.initialize(size);
  }

  /**
   * Serializes one struct row, writing every field back to back into the shared stream and
   * pointing {@code field[i]} at the byte range that belongs to field {@code i}.
   *
   * @param obj the row object to serialize
   * @param objInspector inspector describing {@code obj}; must be of category STRUCT
   * @return the reusable {@code serializeCache} writable
   * @throws SerDeException if {@code objInspector} is not a struct inspector, or if field
   *     serialization fails
   */
  @Override
  public Writable serialize(Object obj, ObjectInspector objInspector) throws SerDeException {
    if (objInspector.getCategory() != Category.STRUCT) {
      throw new SerDeException(getClass().toString()
          + " can only serialize struct types, but we got: "
          + objInspector.getTypeName());
    }

    StructObjectInspector soi = (StructObjectInspector) objInspector;
    List<? extends StructField> fields = soi.getAllStructFieldRefs();
    List<Object> list = soi.getStructFieldsDataAsList(obj);

    LazyBinarySerDe.BooleanRef warnedOnceNullMapKey = new LazyBinarySerDe.BooleanRef(false);
    serializeStream.reset();
    serializedSize = 0;
    int streamOffset = 0;
    // Serialize each field
    for (int i = 0; i < fields.size(); i++) {
      // Get the field objectInspector and the field object.
      ObjectInspector foi = fields.get(i).getFieldObjectInspector();
      Object f = (list == null ? null : list.get(i));
      if (isEmptyString(f, foi)) {
        // Empty strings are marked by an invalid utf single byte sequence.
        serializeStream.write(INVALID_UTF__SINGLE_BYTE, 0, 1);
      } else {
        LazyBinarySerDe.serialize(serializeStream, f, foi, true, warnedOnceNullMapKey);
      }
      // Fetch the buffer only after writing, then hand field i the bytes written since the
      // previous field ended.
      int streamEnd = serializeStream.getLength();
      field[i].set(serializeStream.getData(), streamOffset, streamEnd - streamOffset);
      streamOffset = streamEnd;
    }
    serializedSize = serializeStream.getLength();
    lastOperationSerialize = true;
    lastOperationDeserialize = false;
    return serializeCache;
  }

  /**
   * Tells whether a field value is a non-null, zero-length string.
   *
   * @param value the field value, possibly {@code null}
   * @param inspector the inspector describing {@code value}
   * @return {@code true} only for a primitive STRING field whose Java string has length zero
   */
  private static boolean isEmptyString(Object value, ObjectInspector inspector) {
    if (value == null || inspector.getCategory() != Category.PRIMITIVE) {
      return false;
    }
    if (((PrimitiveObjectInspector) inspector).getPrimitiveCategory()
        != PrimitiveObjectInspector.PrimitiveCategory.STRING) {
      return false;
    }
    return ((StringObjectInspector) inspector).getPrimitiveJavaObject(value).length() == 0;
  }
}