/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.exec.vector;
import java.io.IOException;
import java.sql.Date;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.common.type.HiveIntervalDayTime;
import org.apache.hadoop.hive.common.type.HiveIntervalYearMonth;
import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.io.HiveFileFormatUtils;
import org.apache.hadoop.hive.ql.io.IOPrepareCache;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.hive.serde2.Deserializer;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.io.DateWritable;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hive.common.util.DateUtils;
/**
* Context for Vectorized row batch. This class does eager deserialization of row data using the serde
* in the RecordReader layer.
* It supports partitions in this layer so that the vectorized batch is populated correctly
* with the partition column values.
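*
* A minimal, illustrative usage sketch (names such as {@code conf}, {@code split} and
* {@code rowWritable} are assumptions supplied by an enclosing record reader, not part of this class):
* <pre>{@code
* VectorizedRowBatchCtx rbCtx = new VectorizedRowBatchCtx();
* rbCtx.init(conf, split);                        // conf must carry the Hive MapWork plan
* VectorizedRowBatch batch = rbCtx.createVectorizedRowBatch();
* rbCtx.addPartitionColsToBatch(batch);           // fill repeating partition columns, if any
* DataOutputBuffer buffer = new DataOutputBuffer();
* rbCtx.addRowToBatch(0, rowWritable, batch, buffer);
* batch.size = 1;
* }</pre>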
*/
public class VectorizedRowBatchCtx {
private static final Log LOG = LogFactory.getLog(VectorizedRowBatchCtx.class.getName());
// OI for raw row data (i.e. without partition cols)
private StructObjectInspector rawRowOI;
// OI for the row (raw row OI + partition OI)
private StructObjectInspector rowOI;
// Deserializer for the row data
private Deserializer deserializer;
// Hash map of partition values. Key=TblColName value=PartitionValue
private Map<String, Object> partitionValues;
// Partition column types
private Map<String, PrimitiveCategory> partitionTypes;
// partition column positions, for use by classes that need to know whether a given column is a
// partition column
private Set<Integer> partitionCols;
// Column projection list - List of column indexes to include. This
// list does not contain partition columns
private List<Integer> colsToInclude;
private Map<Integer, String> scratchColumnTypeMap = null;
/**
* Constructor for VectorizedRowBatchCtx
*
* @param rawRowOI
* OI for raw row data (i.e. without partition cols)
* @param rowOI
* OI for the row (Raw row OI + partition OI)
* @param deserializer
* Deserializer for the row data
* @param partitionValues
* Hash map of partition values. Key=TblColName value=PartitionValue
* @param partitionTypes
* Hash map of partition column types. Key=TblColName value=PrimitiveCategory
*/
public VectorizedRowBatchCtx(StructObjectInspector rawRowOI, StructObjectInspector rowOI,
Deserializer deserializer, Map<String, Object> partitionValues,
Map<String, PrimitiveCategory> partitionTypes) {
this.rowOI = rowOI;
this.rawRowOI = rawRowOI;
this.deserializer = deserializer;
this.partitionValues = partitionValues;
this.partitionTypes = partitionTypes;
}
/**
* Constructor for VectorizedRowBatchCtx
*/
public VectorizedRowBatchCtx() {
}
/**
* Initializes the VectorizedRowBatch context based on a scratch column type map and
* object inspector.
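*
* A small illustrative sketch (the map contents, {@code ctx} and {@code existingRowOI} are
* assumed, hypothetical values, not something this class provides):
* <pre>{@code
* Map<Integer, String> scratchTypes = new HashMap<Integer, String>();
* scratchTypes.put(5, "long");     // scratch column 5 holds a long
* scratchTypes.put(6, "double");   // scratch column 6 holds a double
* ctx.init(scratchTypes, existingRowOI);
* }</pre>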
* @param scratchColumnTypeMap
* @param rowOI
* Object inspector that shapes the column types
*/
public void init(Map<Integer, String> scratchColumnTypeMap,
StructObjectInspector rowOI) {
this.scratchColumnTypeMap = scratchColumnTypeMap;
this.rowOI = rowOI;
this.rawRowOI = rowOI;
}
/**
* Initializes the VectorizedRowBatch context based on the
* split and the Hive configuration (job conf containing the Hive plan).
*
* @param hiveConf
* Hive configuration from which the Hive plan is extracted
* @param split
* File split of the file being read
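*
* Because this method declares several checked exceptions, callers commonly wrap the call;
* a hedged sketch (the {@code jobConf} and {@code fileSplit} variables are assumed to exist):
* <pre>{@code
* VectorizedRowBatchCtx rbCtx = new VectorizedRowBatchCtx();
* try {
*   rbCtx.init(jobConf, fileSplit);
* } catch (Exception e) {
*   throw new IOException("Failed to initialize VectorizedRowBatchCtx", e);
* }
* }</pre>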
* @throws ClassNotFoundException
* @throws IOException
* @throws SerDeException
* @throws InstantiationException
* @throws IllegalAccessException
* @throws HiveException
*/
public void init(Configuration hiveConf, FileSplit split) throws ClassNotFoundException,
IOException,
SerDeException,
InstantiationException,
IllegalAccessException,
HiveException {
Map<String, PartitionDesc> pathToPartitionInfo = Utilities
.getMapWork(hiveConf).getPathToPartitionInfo();
PartitionDesc part = HiveFileFormatUtils
.getPartitionDescFromPathRecursively(pathToPartitionInfo,
split.getPath(), IOPrepareCache.get().getPartitionDescMap());
String partitionPath = split.getPath().getParent().toString();
scratchColumnTypeMap = Utilities.getMapWorkVectorScratchColumnTypeMap(hiveConf);
// LOG.info("VectorizedRowBatchCtx init scratchColumnTypeMap " + scratchColumnTypeMap.toString());
Properties partProps =
(part.getPartSpec() == null || part.getPartSpec().isEmpty()) ?
part.getTableDesc().getProperties() : part.getProperties();
Class<?> serdeclass = hiveConf.getClassByName(part.getSerdeClassName());
Deserializer partDeserializer = (Deserializer) serdeclass.newInstance();
SerDeUtils.initializeSerDe(partDeserializer, hiveConf, part.getTableDesc().getProperties(),
partProps);
StructObjectInspector partRawRowObjectInspector = (StructObjectInspector) partDeserializer
.getObjectInspector();
deserializer = partDeserializer;
// Check to see if this split belongs to a partitioned table
String pcols = partProps.getProperty(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS);
String[] partKeys = null;
if (pcols != null && pcols.length() > 0) {
// Partitions exist for this table. Get the partition object inspector and
// raw row object inspector (row without partition cols)
LinkedHashMap<String, String> partSpec = part.getPartSpec();
partKeys = pcols.trim().split("/");
String pcolTypes = partProps.getProperty(hive_metastoreConstants.META_TABLE_PARTITION_COLUMN_TYPES);
String[] partKeyTypes = pcolTypes.trim().split(":");
if (partKeys.length > partKeyTypes.length) {
throw new HiveException("Internal error : partKeys length, " +partKeys.length +
" greater than partKeyTypes length, " + partKeyTypes.length);
}
List<String> partNames = new ArrayList<String>(partKeys.length);
List<ObjectInspector> partObjectInspectors = new ArrayList<ObjectInspector>(partKeys.length);
partitionValues = new LinkedHashMap<String, Object>();
partitionTypes = new LinkedHashMap<String, PrimitiveCategory>();
for (int i = 0; i < partKeys.length; i++) {
String key = partKeys[i];
partNames.add(key);
ObjectInspector objectInspector = null;
Object objectVal;
if (partSpec == null) {
// For a partitionless table, initialize the partition value to null.
// We can have a partitionless table even if we have partition keys
// when there is only one partition selected and the partition key is not
// part of the projection/include list.
objectVal = null;
objectInspector = PrimitiveObjectInspectorFactory.javaStringObjectInspector;
partitionTypes.put(key, PrimitiveCategory.STRING);
} else {
// Create a Standard java object Inspector
objectInspector = TypeInfoUtils.getStandardJavaObjectInspectorFromTypeInfo(
TypeInfoFactory.getPrimitiveTypeInfo(partKeyTypes[i]));
objectVal =
ObjectInspectorConverters.
getConverter(PrimitiveObjectInspectorFactory.
javaStringObjectInspector, objectInspector).
convert(partSpec.get(key));
partitionTypes.put(key, TypeInfoFactory.getPrimitiveTypeInfo(partKeyTypes[i]).getPrimitiveCategory());
}
if (LOG.isDebugEnabled()) {
LOG.debug("Partition column: name: " + key + ", value: " + objectVal + ", type: " + partitionTypes.get(key));
}
partitionValues.put(key, objectVal);
partObjectInspectors.add(objectInspector);
}
// Create partition OI
StructObjectInspector partObjectInspector = ObjectInspectorFactory
.getStandardStructObjectInspector(partNames, partObjectInspectors);
// Get row OI from partition OI and raw row OI
StructObjectInspector rowObjectInspector = ObjectInspectorFactory
.getUnionStructObjectInspector(Arrays
.asList(new StructObjectInspector[] {partRawRowObjectInspector, partObjectInspector}));
rowOI = rowObjectInspector;
rawRowOI = partRawRowObjectInspector;
// We have to do this after we've set rowOI, as getColIndexBasedOnColName uses it
partitionCols = new HashSet<Integer>();
if (pcols != null && pcols.length() > 0) {
for (int i = 0; i < partKeys.length; i++) {
partitionCols.add(getColIndexBasedOnColName(partKeys[i]));
}
}
} else {
// No partitions for this table, hence row OI equals raw row OI
rowOI = partRawRowObjectInspector;
rawRowOI = partRawRowObjectInspector;
}
colsToInclude = ColumnProjectionUtils.getReadColumnIDs(hiveConf);
}
/**
* Creates a Vectorized row batch and the column vectors.
*
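* A minimal sketch of the intended reuse pattern ({@code rbCtx} is an assumed, already
* initialized instance; the assumption is that one batch is created per reader and then
* recycled via {@link VectorizedRowBatch#reset()} between fills):
* <pre>{@code
* VectorizedRowBatch batch = rbCtx.createVectorizedRowBatch();
* // ... populate rows and hand the batch downstream ...
* batch.reset();   // reuse the same batch for the next set of rows
* }</pre>
*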
* @return VectorizedRowBatch
* @throws HiveException
*/
public VectorizedRowBatch createVectorizedRowBatch() throws HiveException
{
List<? extends StructField> fieldRefs = rowOI.getAllStructFieldRefs();
VectorizedRowBatch result = new VectorizedRowBatch(fieldRefs.size());
for (int j = 0; j < fieldRefs.size(); j++) {
// If the column is included in the include list or if the column is a
// partition column then create the column vector. Also note that partition columns are not
// in the included list.
if ((colsToInclude == null) || colsToInclude.contains(j)
|| ((partitionValues != null) &&
partitionValues.containsKey(fieldRefs.get(j).getFieldName()))) {
ObjectInspector foi = fieldRefs.get(j).getFieldObjectInspector();
switch (foi.getCategory()) {
case PRIMITIVE: {
PrimitiveObjectInspector poi = (PrimitiveObjectInspector) foi;
// Vectorization currently only supports the following data types:
// BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, BINARY, STRING, CHAR, VARCHAR,
// TIMESTAMP, DATE, DECIMAL, INTERVAL_YEAR_MONTH and INTERVAL_DAY_TIME
switch (poi.getPrimitiveCategory()) {
case BOOLEAN:
case BYTE:
case SHORT:
case INT:
case LONG:
case TIMESTAMP:
case DATE:
case INTERVAL_YEAR_MONTH:
case INTERVAL_DAY_TIME:
result.cols[j] = new LongColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
break;
case FLOAT:
case DOUBLE:
result.cols[j] = new DoubleColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
break;
case BINARY:
case STRING:
case CHAR:
case VARCHAR:
result.cols[j] = new BytesColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
break;
case DECIMAL:
DecimalTypeInfo tInfo = (DecimalTypeInfo) poi.getTypeInfo();
result.cols[j] = new DecimalColumnVector(VectorizedRowBatch.DEFAULT_SIZE,
tInfo.precision(), tInfo.scale());
break;
default:
throw new RuntimeException("Vectorization is not supported for data type: "
+ poi.getPrimitiveCategory());
}
break;
}
case LIST:
case MAP:
case STRUCT:
case UNION:
throw new HiveException("Vectorization is not supported for data type: "
+ foi.getCategory());
default:
throw new HiveException("Unknown ObjectInspector category!");
}
}
}
result.numCols = fieldRefs.size();
this.addScratchColumnsToBatch(result);
result.reset();
return result;
}
/**
* Deserializes the given row blob and adds the resulting row to the batch.
*
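* A hedged sketch of a typical reader loop ({@code rbCtx}, {@code batch}, {@code reader},
* {@code key} and {@code value} are assumptions standing in for the underlying row-by-row
* {@code RecordReader} state):
* <pre>{@code
* DataOutputBuffer buffer = new DataOutputBuffer();
* int rowIdx = 0;
* while (rowIdx < VectorizedRowBatch.DEFAULT_SIZE && reader.next(key, value)) {
*   rbCtx.addRowToBatch(rowIdx, value, batch, buffer);
*   rowIdx++;
* }
* batch.size = rowIdx;
* }</pre>
*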
* @param rowIndex
* Row index in the batch to which the row is added
* @param rowBlob
* Row blob (serialized version of row)
* @param batch
* Vectorized batch to which the row is added
* @param buffer a buffer to copy strings into
* @throws HiveException
* @throws SerDeException
*/
public void addRowToBatch(int rowIndex, Writable rowBlob,
VectorizedRowBatch batch,
DataOutputBuffer buffer
) throws HiveException, SerDeException
{
Object row = this.deserializer.deserialize(rowBlob);
VectorizedBatchUtil.addRowToBatch(row, this.rawRowOI, rowIndex, batch, buffer);
}
/**
* Deserializes a set of rows and populates the batch.
*
* @param rowBlob
* Blob containing the serialized rows
* @param rowsInBlob
* Number of rows in the blob
* @param batch
* Vectorized row batch which will contain the deserialized data
* @throws SerDeException
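*
* A hedged example of the intended use with a serde that implements {@link VectorizedSerde}
* ({@code rbCtx} is an assumed, initialized instance; {@code rowBatchBlob} and {@code rowCount}
* are hypothetical placeholders):
* <pre>{@code
* VectorizedRowBatch batch = rbCtx.createVectorizedRowBatch();
* rbCtx.convertRowBatchBlobToVectorizedBatch(rowBatchBlob, rowCount, batch);
* }</pre>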
*/
public void convertRowBatchBlobToVectorizedBatch(Object rowBlob, int rowsInBlob,
VectorizedRowBatch batch)
throws SerDeException {
if (deserializer instanceof VectorizedSerde) {
((VectorizedSerde) deserializer).deserializeVector(rowBlob, rowsInBlob, batch);
} else {
throw new SerDeException(
"Not able to deserialize row batch. Serde does not implement VectorizedSerde");
}
}
private int getColIndexBasedOnColName(String colName) throws HiveException
{
List<? extends StructField> fieldRefs = rowOI.getAllStructFieldRefs();
for (int i = 0; i < fieldRefs.size(); i++) {
if (fieldRefs.get(i).getFieldName().equals(colName)) {
return i;
}
}
throw new HiveException("Not able to find the column named " + colName + " in the row object inspector");
}
/**
* Add the partition values to the batch
*
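* This is typically called once per batch, right after the batch is created or reset, so the
* repeating partition values are in place before row data is added; a hedged sketch
* ({@code rbCtx} is an assumed, initialized instance):
* <pre>{@code
* VectorizedRowBatch batch = rbCtx.createVectorizedRowBatch();
* rbCtx.addPartitionColsToBatch(batch);   // no-op when the context has no partition values
* }</pre>
*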
* @param batch
* @throws HiveException
*/
public void addPartitionColsToBatch(VectorizedRowBatch batch) throws HiveException
{
int colIndex;
Object value;
PrimitiveCategory pCategory;
if (partitionValues != null) {
for (String key : partitionValues.keySet()) {
colIndex = getColIndexBasedOnColName(key);
value = partitionValues.get(key);
pCategory = partitionTypes.get(key);
switch (pCategory) {
case BOOLEAN: {
LongColumnVector lcv = (LongColumnVector) batch.cols[colIndex];
if (value == null) {
lcv.noNulls = false;
lcv.isNull[0] = true;
lcv.isRepeating = true;
} else {
lcv.fill((Boolean) value == true ? 1 : 0);
lcv.isNull[0] = false;
}
}
break;
case BYTE: {
LongColumnVector lcv = (LongColumnVector) batch.cols[colIndex];
if (value == null) {
lcv.noNulls = false;
lcv.isNull[0] = true;
lcv.isRepeating = true;
} else {
lcv.fill((Byte) value);
lcv.isNull[0] = false;
}
}
break;
case SHORT: {
LongColumnVector lcv = (LongColumnVector) batch.cols[colIndex];
if (value == null) {
lcv.noNulls = false;
lcv.isNull[0] = true;
lcv.isRepeating = true;
} else {
lcv.fill((Short) value);
lcv.isNull[0] = false;
}
}
break;
case INT: {
LongColumnVector lcv = (LongColumnVector) batch.cols[colIndex];
if (value == null) {
lcv.noNulls = false;
lcv.isNull[0] = true;
lcv.isRepeating = true;
} else {
lcv.fill((Integer) value);
lcv.isNull[0] = false;
}
}
break;
case LONG: {
LongColumnVector lcv = (LongColumnVector) batch.cols[colIndex];
if (value == null) {
lcv.noNulls = false;
lcv.isNull[0] = true;
lcv.isRepeating = true;
} else {
lcv.fill((Long) value);
lcv.isNull[0] = false;
}
}
break;
case DATE: {
LongColumnVector lcv = (LongColumnVector) batch.cols[colIndex];
if (value == null) {
lcv.noNulls = false;
lcv.isNull[0] = true;
lcv.isRepeating = true;
} else {
lcv.fill(DateWritable.dateToDays((Date) value));
lcv.isNull[0] = false;
}
}
break;
case TIMESTAMP: {
LongColumnVector lcv = (LongColumnVector) batch.cols[colIndex];
if (value == null) {
lcv.noNulls = false;
lcv.isNull[0] = true;
lcv.isRepeating = true;
} else {
lcv.fill(TimestampUtils.getTimeNanoSec((Timestamp) value));
lcv.isNull[0] = false;
}
}
break;
case INTERVAL_YEAR_MONTH: {
LongColumnVector lcv = (LongColumnVector) batch.cols[colIndex];
if (value == null) {
lcv.noNulls = false;
lcv.isNull[0] = true;
lcv.isRepeating = true;
} else {
lcv.fill(((HiveIntervalYearMonth) value).getTotalMonths());
lcv.isNull[0] = false;
}
}
break;
case INTERVAL_DAY_TIME: {
LongColumnVector lcv = (LongColumnVector) batch.cols[colIndex];
if (value == null) {
lcv.noNulls = false;
lcv.isNull[0] = true;
lcv.isRepeating = true;
} else {
lcv.fill(DateUtils.getIntervalDayTimeTotalNanos((HiveIntervalDayTime) value));
lcv.isNull[0] = false;
}
}
break;
case FLOAT: {
DoubleColumnVector dcv = (DoubleColumnVector) batch.cols[colIndex];
if (value == null) {
dcv.noNulls = false;
dcv.isNull[0] = true;
dcv.isRepeating = true;
} else {
dcv.fill((Float) value);
dcv.isNull[0] = false;
}
}
break;
case DOUBLE: {
DoubleColumnVector dcv = (DoubleColumnVector) batch.cols[colIndex];
if (value == null) {
dcv.noNulls = false;
dcv.isNull[0] = true;
dcv.isRepeating = true;
} else {
dcv.fill((Double) value);
dcv.isNull[0] = false;
}
}
break;
case DECIMAL: {
DecimalColumnVector dv = (DecimalColumnVector) batch.cols[colIndex];
if (value == null) {
dv.noNulls = false;
dv.isNull[0] = true;
dv.isRepeating = true;
} else {
HiveDecimal hd = (HiveDecimal) value;
dv.set(0, hd);
dv.isRepeating = true;
dv.isNull[0] = false;
}
}
break;
case BINARY: {
BytesColumnVector bcv = (BytesColumnVector) batch.cols[colIndex];
byte[] bytes = (byte[]) value;
if (bytes == null) {
bcv.noNulls = false;
bcv.isNull[0] = true;
bcv.isRepeating = true;
} else {
bcv.fill(bytes);
bcv.isNull[0] = false;
}
}
break;
case STRING:
case CHAR:
case VARCHAR: {
BytesColumnVector bcv = (BytesColumnVector) batch.cols[colIndex];
String sVal = (String) value;
if (sVal == null) {
bcv.noNulls = false;
bcv.isNull[0] = true;
bcv.isRepeating = true;
} else {
bcv.fill(sVal.getBytes());
bcv.isNull[0] = false;
}
}
break;
default:
throw new HiveException("Unable to recognize the partition type " + pCategory +
" for column " + key);
}
}
}
}
/**
* Determine whether a given column is a partition column
* @param colnum column number in
* {@link org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch}s created by this context.
* @return true if it is a partition column, false otherwise
*/
public final boolean isPartitionCol(int colnum) {
return (partitionCols == null) ? false : partitionCols.contains(colnum);
}
private void addScratchColumnsToBatch(VectorizedRowBatch vrb) throws HiveException {
if (scratchColumnTypeMap != null && !scratchColumnTypeMap.isEmpty()) {
int origNumCols = vrb.numCols;
int newNumCols = vrb.cols.length + scratchColumnTypeMap.size();
vrb.cols = Arrays.copyOf(vrb.cols, newNumCols);
for (int i = origNumCols; i < newNumCols; i++) {
String typeName = scratchColumnTypeMap.get(i);
if (typeName == null) {
throw new HiveException("No type entry found for column " + i + " in map " + scratchColumnTypeMap.toString());
}
vrb.cols[i] = allocateColumnVector(typeName,
VectorizedRowBatch.DEFAULT_SIZE);
}
vrb.numCols = vrb.cols.length;
}
}
/**
* Get the scale and precision for the given decimal type string. The decimal type is assumed to be
* of the format decimal(precision,scale) e.g. decimal(20,10).
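* For example (illustrative): {@code getScalePrecisionFromDecimalType("decimal(20,10)")}
* returns {@code [20, 10]}.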
* @param decimalType The given decimal type string.
* @return An integer array of size 2 with first element set to precision and second set to scale.
*/
private static int[] getScalePrecisionFromDecimalType(String decimalType) {
Pattern p = Pattern.compile("\\d+");
Matcher m = p.matcher(decimalType);
m.find();
int precision = Integer.parseInt(m.group());
m.find();
int scale = Integer.parseInt(m.group());
int [] precScale = { precision, scale };
return precScale;
}
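/**
* Allocates a column vector of the appropriate subtype for the given vectorization type name.
* A hedged usage sketch (the type strings below are illustrative examples only):
* <pre>{@code
* ColumnVector longCol = VectorizedRowBatchCtx.allocateColumnVector("long", 1024);
* ColumnVector strCol = VectorizedRowBatchCtx.allocateColumnVector("string", 1024);
* ColumnVector decCol = VectorizedRowBatchCtx.allocateColumnVector("decimal(10,2)", 1024);
* }</pre>
*
* @param type Vectorization type name, e.g. "double", a string-family type, "decimal(p,s)", "long", "date", "timestamp" or an interval type
* @param defaultSize Number of entries to allocate in the vector
* @return the allocated ColumnVector
*/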
public static ColumnVector allocateColumnVector(String type, int defaultSize) {
if (type.equalsIgnoreCase("double")) {
return new DoubleColumnVector(defaultSize);
} else if (VectorizationContext.isStringFamily(type)) {
return new BytesColumnVector(defaultSize);
} else if (VectorizationContext.decimalTypePattern.matcher(type).matches()){
int [] precisionScale = getScalePrecisionFromDecimalType(type);
return new DecimalColumnVector(defaultSize, precisionScale[0], precisionScale[1]);
} else if (type.equalsIgnoreCase("long") ||
type.equalsIgnoreCase("date") ||
type.equalsIgnoreCase("timestamp") ||
type.equalsIgnoreCase(serdeConstants.INTERVAL_YEAR_MONTH_TYPE_NAME) ||
type.equalsIgnoreCase(serdeConstants.INTERVAL_DAY_TIME_TYPE_NAME)) {
return new LongColumnVector(defaultSize);
} else {
throw new RuntimeException("Cannot allocate vector column for " + type);
}
}
}