/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.io;


import com.google.common.collect.Lists;

import org.apache.hadoop.hive.llap.DebugUtils;

import java.util.Arrays;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.function.Consumer;

import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.Decimal64ColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.UnionColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx;
import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
import org.apache.hadoop.hive.serde2.io.ByteWritable;
import org.apache.hadoop.hive.serde2.io.DateWritableV2;
import org.apache.hadoop.hive.serde2.io.DoubleWritable;
import org.apache.hadoop.hive.serde2.io.HiveCharWritable;
import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable;
import org.apache.hadoop.hive.serde2.io.ShortWritable;
import org.apache.hadoop.hive.serde2.io.TimestampWritableV2;
import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.RecordReader;

/**
 * A record reader wrapper that converts a VectorizedRowBatch (VRB) based reader into an
 * ObjectInspector (OI) based row reader. Because changing table OIs in the plan after
 * compilation is nearly impossible, this is an abstract class where type-specific
 * implementations plug in the details, so that the rows produced by wrapping a vectorized
 * reader conform to the original OIs.
 */
public abstract class BatchToRowReader<StructType, UnionType>
    implements RecordReader<NullWritable, Object> {
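  // Subclass sketch (hypothetical, for illustration only): a format-specific implementation
  // binds StructType/UnionType to its own row containers and fills in the abstract hooks
  // declared below. An ORC-flavored subclass might look roughly like:
  //
  //   class OrcBatchToRowReader extends BatchToRowReader<OrcStruct, OrcUnion> {
  //     OrcBatchToRowReader(RecordReader<NullWritable, VectorizedRowBatch> reader,
  //         VectorizedRowBatchCtx ctx, List<Integer> includedCols) {
  //       super(reader, ctx, includedCols);
  //     }
  //     @Override
  //     protected OrcStruct createStructObject(Object previous, List<TypeInfo> childrenTypes) {
  //       return previous instanceof OrcStruct
  //           ? (OrcStruct) previous : new OrcStruct(childrenTypes.size());
  //     }
  //     // ...remaining hooks delegate to OrcStruct/OrcUnion field accessors.
  //   }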
  protected static final Logger LOG = LoggerFactory.getLogger(BatchToRowReader.class);

  private final NullWritable key;
  private final VectorizedRowBatch batch;
  private final RecordReader<NullWritable, VectorizedRowBatch> vrbReader;

  private final List<TypeInfo> schema;
  private final boolean[] included;
  private int rowInBatch = 0;

  protected List<VirtualColumnHandler> virtualColumnHandlers;

  public BatchToRowReader(RecordReader<NullWritable, VectorizedRowBatch> vrbReader,
      VectorizedRowBatchCtx vrbCtx, List<Integer> includedCols) {
    this.vrbReader = vrbReader;
    this.key = vrbReader.createKey();
    this.batch = vrbReader.createValue();
    this.schema = Lists.newArrayList(vrbCtx.getRowColumnTypeInfos());
    // TODO: does this include partition columns?
    boolean[] included = new boolean[schema.size()];
    if (includedCols != null) {
      for (int colIx : includedCols) {
        included[colIx] = true;
      }
    } else {
      Arrays.fill(included, true);
    }

    virtualColumnHandlers = requestedVirtualColumns();
    for (VirtualColumnHandler handler : virtualColumnHandlers) {
      int idx = vrbCtx.findVirtualColumnNum(handler.virtualColumn);
      if (idx >= 0) {
        included[idx] = true;
        handler.indexInSchema = idx;
        batch.cols[idx].noNulls = false;
        Arrays.fill(batch.cols[idx].isNull, true);
      }
    }

    if (LOG.isDebugEnabled()) {
      LOG.debug("Including the columns " + DebugUtils.toString(included));
    }
    this.included = included;
  }

  /**
   * Wrapper class that maps a virtual column to a handler defined by subclasses of
   * {@link BatchToRowReader}. The handler is a setter that writes the virtual column's
   * value into the current row.
   */
  public static class VirtualColumnHandler {
    private final VirtualColumn virtualColumn;
    private final Consumer<Object> handler;
    private int indexInSchema = -1;

    public VirtualColumnHandler(VirtualColumn virtualColumn, Consumer<Object> handler) {
      this.virtualColumn = virtualColumn;
      this.handler = handler;
    }
  }
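  // Usage sketch (hypothetical): a subclass that needs ROW__ID could return
  //   Lists.newArrayList(new VirtualColumnHandler(VirtualColumn.ROWID, value -> { ... }))
  // from requestedVirtualColumns(). next() first invokes each handler with null to reset
  // its state, then again with the column's value once the row has been populated.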

  protected abstract List<VirtualColumnHandler> requestedVirtualColumns();
  protected abstract StructType createStructObject(Object previous, List<TypeInfo> childrenTypes);
  protected abstract void setStructCol(StructType structObj, int i, Object value);
  protected abstract Object getStructCol(StructType structObj, int i);
  protected abstract int getStructLength(StructType structObj);
  protected abstract UnionType createUnionObject(List<TypeInfo> childrenTypes, Object previous);
  protected abstract void setUnion(UnionType unionObj, byte tag, Object object);
  protected abstract Object getUnionField(UnionType unionObj);

  @Override
  public NullWritable createKey() {
    return key;
  }

  @Override
  public Object createValue() {
    return createStructObject(null, schema);
  }

  @Override
  public long getPos() throws IOException {
    return -1;
  }

  @Override
  public float getProgress() throws IOException {
    return 0;
  }

  @Override
  @SuppressWarnings("unchecked")
  public boolean next(NullWritable key, Object previous) throws IOException {
    if (!ensureBatch()) {
      return false;
    }

    virtualColumnHandlers.forEach(handler -> handler.handler.accept(null));

    StructType value = (StructType) previous;
    for (int i = 0; i < schema.size(); ++i) {
      if (!included[i] || i >= getStructLength(value)) continue;
      try {
        setStructCol(value, i,
            nextValue(batch.cols[i], rowInBatch, schema.get(i), getStructCol(value, i)));
      } catch (Throwable t) {
        throwIOException(i, t);
      }
    }

    for (VirtualColumnHandler handler : virtualColumnHandlers) {
      if (handler.indexInSchema < 0 || !included[handler.indexInSchema] ||
              handler.indexInSchema >= getStructLength(value)) {
        continue;
      }
      try {
        handler.handler.accept(getStructCol(value, handler.indexInSchema));
      } catch (Throwable t) {
        throwIOException(handler.indexInSchema, t);
      }
    }

    ++rowInBatch;
    return true;
  }

  private void throwIOException(int i, Throwable t) throws IOException {
    LOG.error("Error at row " + rowInBatch + "/" + batch.size + ", column " + i
        + "/" + schema.size() + " " + batch.cols[i], t);
    throw (t instanceof IOException) ? (IOException) t : new IOException(t);
  }

  /**
   * If the current batch is empty, get a new one.
   * @return true if we have rows available.
   */
  private boolean ensureBatch() throws IOException {
    if (rowInBatch >= batch.size) {
      rowInBatch = 0;
      return vrbReader.next(key, batch) && batch.size > 0;
    }
    return true;
  }


  @Override
  public void close() throws IOException {
    vrbReader.close();
    batch.cols = null;
  }

  /* Routines for stubbing into Writables */
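  // Shared pattern for the next*() helpers below: collapse isRepeating vectors to row 0,
  // return null for a null cell, and reuse the 'previous' Writable when its runtime type
  // matches, allocating a new instance only when it does not.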

  public static BooleanWritable nextBoolean(ColumnVector vector,
                                     int row,
                                     Object previous) {
    if (vector.isRepeating) {
      row = 0;
    }
    if (vector.noNulls || !vector.isNull[row]) {
      BooleanWritable result;
      if (previous == null || previous.getClass() != BooleanWritable.class) {
        result = new BooleanWritable();
      } else {
        result = (BooleanWritable) previous;
      }
      result.set(((LongColumnVector) vector).vector[row] != 0);
      return result;
    } else {
      return null;
    }
  }

  public static ByteWritable nextByte(ColumnVector vector,
                               int row,
                               Object previous) {
    if (vector.isRepeating) {
      row = 0;
    }
    if (vector.noNulls || !vector.isNull[row]) {
      ByteWritable result;
      if (previous == null || previous.getClass() != ByteWritable.class) {
        result = new ByteWritable();
      } else {
        result = (ByteWritable) previous;
      }
      result.set((byte) ((LongColumnVector) vector).vector[row]);
      return result;
    } else {
      return null;
    }
  }

  public static ShortWritable nextShort(ColumnVector vector,
                                 int row,
                                 Object previous) {
    if (vector.isRepeating) {
      row = 0;
    }
    if (vector.noNulls || !vector.isNull[row]) {
      ShortWritable result;
      if (previous == null || previous.getClass() != ShortWritable.class) {
        result = new ShortWritable();
      } else {
        result = (ShortWritable) previous;
      }
      result.set((short) ((LongColumnVector) vector).vector[row]);
      return result;
    } else {
      return null;
    }
  }

  public static IntWritable nextInt(ColumnVector vector,
                             int row,
                             Object previous) {
    if (vector.isRepeating) {
      row = 0;
    }
    if (vector.noNulls || !vector.isNull[row]) {
      IntWritable result;
      if (previous == null || previous.getClass() != IntWritable.class) {
        result = new IntWritable();
      } else {
        result = (IntWritable) previous;
      }
      result.set((int) ((LongColumnVector) vector).vector[row]);
      return result;
    } else {
      return null;
    }
  }

  public static LongWritable nextLong(ColumnVector vector,
                               int row,
                               Object previous) {
    if (vector.isRepeating) {
      row = 0;
    }
    if (vector.noNulls || !vector.isNull[row]) {
      LongWritable result;
      if (previous == null || previous.getClass() != LongWritable.class) {
        result = new LongWritable();
      } else {
        result = (LongWritable) previous;
      }
      result.set(((LongColumnVector) vector).vector[row]);
      return result;
    } else {
      return null;
    }
  }

  public static FloatWritable nextFloat(ColumnVector vector,
                                 int row,
                                 Object previous) {
    if (vector.isRepeating) {
      row = 0;
    }
    if (vector.noNulls || !vector.isNull[row]) {
      FloatWritable result;
      if (previous == null || previous.getClass() != FloatWritable.class) {
        result = new FloatWritable();
      } else {
        result = (FloatWritable) previous;
      }
      result.set((float) ((DoubleColumnVector) vector).vector[row]);
      return result;
    } else {
      return null;
    }
  }

  public static DoubleWritable nextDouble(ColumnVector vector,
                                   int row,
                                   Object previous) {
    if (vector.isRepeating) {
      row = 0;
    }
    if (vector.noNulls || !vector.isNull[row]) {
      DoubleWritable result;
      if (previous == null || previous.getClass() != DoubleWritable.class) {
        result = new DoubleWritable();
      } else {
        result = (DoubleWritable) previous;
      }
      result.set(((DoubleColumnVector) vector).vector[row]);
      return result;
    } else {
      return null;
    }
  }

  public static Text nextString(ColumnVector vector,
                         int row,
                         Object previous) {
    if (vector.isRepeating) {
      row = 0;
    }
    if (vector.noNulls || !vector.isNull[row]) {
      Text result;
      if (previous == null || previous.getClass() != Text.class) {
        result = new Text();
      } else {
        result = (Text) previous;
      }
      BytesColumnVector bytes = (BytesColumnVector) vector;
      result.set(bytes.vector[row], bytes.start[row], bytes.length[row]);
      return result;
    } else {
      return null;
    }
  }

  public static HiveCharWritable nextChar(ColumnVector vector,
                                   int row,
                                   int size,
                                   Object previous) {
    if (vector.isRepeating) {
      row = 0;
    }
    if (vector.noNulls || !vector.isNull[row]) {
      HiveCharWritable result;
      if (previous == null || previous.getClass() != HiveCharWritable.class) {
        result = new HiveCharWritable();
      } else {
        result = (HiveCharWritable) previous;
      }
      BytesColumnVector bytes = (BytesColumnVector) vector;
      result.set(bytes.toString(row), size);
      return result;
    } else {
      return null;
    }
  }

  public static HiveVarcharWritable nextVarchar(
      ColumnVector vector, int row, int size, Object previous) {
    if (vector.isRepeating) {
      row = 0;
    }
    if (vector.noNulls || !vector.isNull[row]) {
      HiveVarcharWritable result;
      if (previous == null || previous.getClass() != HiveVarcharWritable.class) {
        result = new HiveVarcharWritable();
      } else {
        result = (HiveVarcharWritable) previous;
      }
      BytesColumnVector bytes = (BytesColumnVector) vector;
      result.set(bytes.toString(row), size);
      return result;
    } else {
      return null;
    }
  }

  public static BytesWritable nextBinary(ColumnVector vector,
                                  int row,
                                  Object previous) {
    if (vector.isRepeating) {
      row = 0;
    }
    if (vector.noNulls || !vector.isNull[row]) {
      BytesWritable result;
      if (previous == null || previous.getClass() != BytesWritable.class) {
        result = new BytesWritable();
      } else {
        result = (BytesWritable) previous;
      }
      BytesColumnVector bytes = (BytesColumnVector) vector;
      result.set(bytes.vector[row], bytes.start[row], bytes.length[row]);
      return result;
    } else {
      return null;
    }
  }

  public static HiveDecimalWritable nextDecimal(ColumnVector vector,
                                         int row,
                                         Object previous) {
    if (vector.isRepeating) {
      row = 0;
    }
    if (vector.noNulls || !vector.isNull[row]) {
      HiveDecimalWritable result;
      if (previous == null || previous.getClass() != HiveDecimalWritable.class) {
        result = new HiveDecimalWritable();
      } else {
        result = (HiveDecimalWritable) previous;
      }
      if (vector instanceof Decimal64ColumnVector) {
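        // Decimal64 stores the unscaled value as a plain long; deserialize64() rescales it
        // into the writable using the vector's scale.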
        long value = ((Decimal64ColumnVector) vector).vector[row];
        result.deserialize64(value, ((Decimal64ColumnVector) vector).scale);
      } else {
        result.set(((DecimalColumnVector) vector).vector[row]);
      }
      return result;
    } else {
      return null;
    }
  }

  public static DateWritableV2 nextDate(ColumnVector vector,
                                        int row,
                                        Object previous) {
    if (vector.isRepeating) {
      row = 0;
    }
    if (vector.noNulls || !vector.isNull[row]) {
      DateWritableV2 result;
      if (previous == null || previous.getClass() != DateWritableV2.class) {
        result = new DateWritableV2();
      } else {
        result = (DateWritableV2) previous;
      }
      int date = (int) ((LongColumnVector) vector).vector[row];
      result.set(date);
      return result;
    } else {
      return null;
    }
  }

  public static TimestampWritableV2 nextTimestamp(ColumnVector vector,
                                                  int row,
                                                  Object previous) {
    if (vector.isRepeating) {
      row = 0;
    }
    if (vector.noNulls || !vector.isNull[row]) {
      TimestampWritableV2 result;
      if (previous == null || previous.getClass() != TimestampWritableV2.class) {
        result = new TimestampWritableV2();
      } else {
        result = (TimestampWritableV2) previous;
      }
      TimestampColumnVector tcv = (TimestampColumnVector) vector;
      result.setInternal(tcv.time[row], tcv.nanos[row]);
      return result;
    } else {
      return null;
    }
  }

  public StructType nextStruct(
      ColumnVector vector, int row, StructTypeInfo schema, Object previous) {
    if (vector.isRepeating) {
      row = 0;
    }
    if (vector.noNulls || !vector.isNull[row]) {
      List<TypeInfo> childrenTypes = schema.getAllStructFieldTypeInfos();
      StructType result = createStructObject(previous, childrenTypes);
      StructColumnVector struct = (StructColumnVector) vector;
      for (int f = 0; f < childrenTypes.size(); ++f) {
        setStructCol(result, f, nextValue(struct.fields[f], row,
            childrenTypes.get(f), getStructCol(result, f)));
      }
      return result;
    } else {
      return null;
    }
  }

  private UnionType nextUnion(
      ColumnVector vector, int row, UnionTypeInfo schema, Object previous) {
    if (vector.isRepeating) {
      row = 0;
    }
    if (vector.noNulls || !vector.isNull[row]) {
      List<TypeInfo> childrenTypes = schema.getAllUnionObjectTypeInfos();
      UnionType result = createUnionObject(childrenTypes, previous);
      UnionColumnVector union = (UnionColumnVector) vector;
      byte tag = (byte) union.tags[row];
      setUnion(result, tag, nextValue(union.fields[tag], row, childrenTypes.get(tag),
          getUnionField(result)));
      return result;
    } else {
      return null;
    }
  }

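  // nextList() reuses the previous ArrayList where possible: overlapping elements are
  // overwritten in place, the tail is trimmed when the new list is shorter, and new
  // values are appended when it is longer.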
  private ArrayList<Object> nextList(
      ColumnVector vector, int row, ListTypeInfo schema, Object previous) {
    if (vector.isRepeating) {
      row = 0;
    }
    if (vector.noNulls || !vector.isNull[row]) {
      ArrayList<Object> result;
      if (previous == null || previous.getClass() != ArrayList.class) {
        result = new ArrayList<>();
      } else {
        result = (ArrayList<Object>) previous;
      }
      ListColumnVector list = (ListColumnVector) vector;
      int length = (int) list.lengths[row];
      int offset = (int) list.offsets[row];
      result.ensureCapacity(length);
      int oldLength = result.size();
      int idx = 0;
      TypeInfo childType = schema.getListElementTypeInfo();
      while (idx < length && idx < oldLength) {
        result.set(idx, nextValue(list.child, offset + idx, childType,
            result.get(idx)));
        idx += 1;
      }
      if (length < oldLength) {
        for (int i = oldLength - 1; i >= length; --i) {
          result.remove(i);
        }
      } else if (oldLength < length) {
        while (idx < length) {
          result.add(nextValue(list.child, offset + idx, childType, null));
          idx += 1;
        }
      }
      return result;
    } else {
      return null;
    }
  }

  private Map<Object, Object> nextMap(
      ColumnVector vector, int row, MapTypeInfo schema, Object previous) {
    if (vector.isRepeating) {
      row = 0;
    }
    if (vector.noNulls || !vector.isNull[row]) {
      MapColumnVector map = (MapColumnVector) vector;
      int length = (int) map.lengths[row];
      int offset = (int) map.offsets[row];
      TypeInfo keyType = schema.getMapKeyTypeInfo();
      TypeInfo valueType = schema.getMapValueTypeInfo();
      LinkedHashMap<Object, Object> result;
      if (previous == null || previous.getClass() != LinkedHashMap.class) {
        result = new LinkedHashMap<>(length);
      } else {
        result = (LinkedHashMap<Object, Object>) previous;
        // I couldn't think of a good way to reuse the keys and value objects
        // without even more allocations, so take the easy and safe approach.
        result.clear();
      }
      for (int e = 0; e < length; ++e) {
        result.put(nextValue(map.keys, e + offset, keyType, null),
                   nextValue(map.values, e + offset, valueType, null));
      }
      return result;
    } else {
      return null;
    }
  }

  private Object nextValue(ColumnVector vector, int row, TypeInfo schema, Object previous) {
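    // Recursive dispatch on the type category: complex types recurse into the child vectors,
    // while primitives route to the static next*() helpers above, passing CHAR/VARCHAR max
    // lengths from the type info.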
    switch (schema.getCategory()) {
      case STRUCT:
        return nextStruct(vector, row, (StructTypeInfo)schema, previous);
      case UNION:
        return nextUnion(vector, row, (UnionTypeInfo)schema, previous);
      case LIST:
        return nextList(vector, row, (ListTypeInfo)schema, previous);
      case MAP:
        return nextMap(vector, row, (MapTypeInfo)schema, previous);
      case PRIMITIVE: {
        PrimitiveTypeInfo pschema = (PrimitiveTypeInfo)schema;
        switch (pschema.getPrimitiveCategory()) {
        case BOOLEAN:
          return nextBoolean(vector, row, previous);
        case BYTE:
          return nextByte(vector, row, previous);
        case SHORT:
          return nextShort(vector, row, previous);
        case INT:
          return nextInt(vector, row, previous);
        case LONG:
          return nextLong(vector, row, previous);
        case FLOAT:
          return nextFloat(vector, row, previous);
        case DOUBLE:
          return nextDouble(vector, row, previous);
        case STRING:
          return nextString(vector, row, previous);
        case CHAR:
          return nextChar(vector, row, ((CharTypeInfo)pschema).getLength(), previous);
        case VARCHAR:
          return nextVarchar(vector, row, ((VarcharTypeInfo)pschema).getLength(), previous);
        case BINARY:
          return nextBinary(vector, row, previous);
        case DECIMAL:
          return nextDecimal(vector, row, previous);
        case DATE:
          return nextDate(vector, row, previous);
        case TIMESTAMP:
          return nextTimestamp(vector, row, previous);
        default:
          throw new IllegalArgumentException("Unknown type " + schema);
        }
      }
      default:
        throw new IllegalArgumentException("Unknown type " + schema);
    }
  }
}