/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.io;
import com.google.common.collect.Lists;
import org.apache.hadoop.hive.llap.DebugUtils;
import java.util.Arrays;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.UnionColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx;
import org.apache.hadoop.hive.serde2.io.ByteWritable;
import org.apache.hadoop.hive.serde2.io.DateWritable;
import org.apache.hadoop.hive.serde2.io.DoubleWritable;
import org.apache.hadoop.hive.serde2.io.HiveCharWritable;
import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable;
import org.apache.hadoop.hive.serde2.io.ShortWritable;
import org.apache.hadoop.hive.serde2.io.TimestampWritable;
import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.RecordReader;
/**
 * A record reader wrapper that converts a VectorizedRowBatch (VRB) reader into an
 * ObjectInspector (OI)-based row reader. Because changing table OIs in the plan after
 * compilation is nearly impossible, this is an abstract class where type-specific
 * implementations plug in the details, so that the data produced by wrapping a
 * vectorized reader conforms to the original OIs.
 */
public abstract class BatchToRowReader<StructType, UnionType>
    implements RecordReader<NullWritable, Object> {
protected static final Logger LOG = LoggerFactory.getLogger(BatchToRowReader.class);
private final NullWritable key;
private final VectorizedRowBatch batch;
  private final RecordReader<NullWritable, VectorizedRowBatch> vrbReader;
  private final List<TypeInfo> schema;
private final boolean[] included;
private int rowInBatch = 0;
  public BatchToRowReader(RecordReader<NullWritable, VectorizedRowBatch> vrbReader,
      VectorizedRowBatchCtx vrbCtx, List<Integer> includedCols) {
this.vrbReader = vrbReader;
this.key = vrbReader.createKey();
this.batch = vrbReader.createValue();
this.schema = Lists.newArrayList(vrbCtx.getRowColumnTypeInfos());
// TODO: does this include partition columns?
boolean[] included = new boolean[schema.size()];
if (includedCols != null) {
for (int colIx : includedCols) {
included[colIx] = true;
}
} else {
Arrays.fill(included, true);
}
if (LOG.isDebugEnabled()) {
LOG.debug("Including the columns " + DebugUtils.toString(included));
}
this.included = included;
}
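  /*
   * The hooks below let a type-specific subclass pick its own struct and union
   * representations (the StructType/UnionType type parameters). A minimal sketch,
   * assuming List-based structs and a two-slot {tag, value} array for unions
   * (hypothetical subclass for illustration only, plus a constructor delegating
   * to super(vrbReader, vrbCtx, includedCols)):
   *
   *   class ListBatchToRowReader extends BatchToRowReader<ArrayList<Object>, Object[]> {
   *     protected ArrayList<Object> createStructObject(Object prev, List<TypeInfo> types) {
   *       ArrayList<Object> r = (prev instanceof ArrayList)
   *           ? (ArrayList<Object>) prev : new ArrayList<>(types.size());
   *       while (r.size() < types.size()) { r.add(null); }
   *       return r;
   *     }
   *     protected void setStructCol(ArrayList<Object> s, int i, Object v) { s.set(i, v); }
   *     protected Object getStructCol(ArrayList<Object> s, int i) { return s.get(i); }
   *     protected Object[] createUnionObject(List<TypeInfo> types, Object prev) {
   *       return (prev instanceof Object[]) ? (Object[]) prev : new Object[2];
   *     }
   *     protected void setUnion(Object[] u, byte tag, Object v) { u[0] = tag; u[1] = v; }
   *     protected Object getUnionField(Object[] u) { return u[1]; }
   *   }
   */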
  protected abstract StructType createStructObject(Object previous, List<TypeInfo> childrenTypes);
protected abstract void setStructCol(StructType structObj, int i, Object value);
protected abstract Object getStructCol(StructType structObj, int i);
  protected abstract UnionType createUnionObject(List<TypeInfo> childrenTypes, Object previous);
protected abstract void setUnion(UnionType unionObj, byte tag, Object object);
protected abstract Object getUnionField(UnionType unionObj);
@Override
public NullWritable createKey() {
return key;
}
@Override
public Object createValue() {
return createStructObject(null, schema);
}
@Override
public long getPos() throws IOException {
return -1;
}
@Override
public float getProgress() throws IOException {
return 0;
}
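  /**
   * Fills {@code previous} (a subclass-provided struct) with one row from the current
   * batch, converting each included column vector value into its writable/object form.
   */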
@Override
public boolean next(NullWritable key, Object previous) throws IOException {
if (!ensureBatch()) {
return false;
}
@SuppressWarnings("unchecked")
StructType value = (StructType)previous;
for (int i = 0; i < schema.size(); ++i) {
if (!included[i]) continue; // TODO: shortcut for last col below length?
try {
setStructCol(value, i,
nextValue(batch.cols[i], rowInBatch, schema.get(i), getStructCol(value, i)));
} catch (Throwable t) {
LOG.error("Error at row " + rowInBatch + "/" + batch.size + ", column " + i
+ "/" + schema.size() + " " + batch.cols[i], t);
throw (t instanceof IOException) ? (IOException)t : new IOException(t);
}
}
++rowInBatch;
return true;
}
  /**
   * If the rows of the current batch have been consumed, fetch the next batch.
   * @return true if we have rows available.
   */
private boolean ensureBatch() throws IOException {
if (rowInBatch >= batch.size) {
rowInBatch = 0;
return vrbReader.next(key, batch) && batch.size > 0;
}
return true;
}
@Override
public void close() throws IOException {
vrbReader.close();
batch.cols = null;
}
/* Routines for stubbing into Writables */
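  // All of the nextXxx readers below share the same pattern: a repeating vector stores
  // its single value at index 0; a value is present iff noNulls is set or isNull[row]
  // is false; and the 'previous' object is reused when it has exactly the right class,
  // to avoid per-row allocation.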
public static BooleanWritable nextBoolean(ColumnVector vector,
int row,
Object previous) {
if (vector.isRepeating) {
row = 0;
}
if (vector.noNulls || !vector.isNull[row]) {
BooleanWritable result;
if (previous == null || previous.getClass() != BooleanWritable.class) {
result = new BooleanWritable();
} else {
result = (BooleanWritable) previous;
}
result.set(((LongColumnVector) vector).vector[row] != 0);
return result;
} else {
return null;
}
}
public static ByteWritable nextByte(ColumnVector vector,
int row,
Object previous) {
if (vector.isRepeating) {
row = 0;
}
if (vector.noNulls || !vector.isNull[row]) {
ByteWritable result;
if (previous == null || previous.getClass() != ByteWritable.class) {
result = new ByteWritable();
} else {
result = (ByteWritable) previous;
}
result.set((byte) ((LongColumnVector) vector).vector[row]);
return result;
} else {
return null;
}
}
public static ShortWritable nextShort(ColumnVector vector,
int row,
Object previous) {
if (vector.isRepeating) {
row = 0;
}
if (vector.noNulls || !vector.isNull[row]) {
ShortWritable result;
if (previous == null || previous.getClass() != ShortWritable.class) {
result = new ShortWritable();
} else {
result = (ShortWritable) previous;
}
result.set((short) ((LongColumnVector) vector).vector[row]);
return result;
} else {
return null;
}
}
public static IntWritable nextInt(ColumnVector vector,
int row,
Object previous) {
if (vector.isRepeating) {
row = 0;
}
if (vector.noNulls || !vector.isNull[row]) {
IntWritable result;
if (previous == null || previous.getClass() != IntWritable.class) {
result = new IntWritable();
} else {
result = (IntWritable) previous;
}
result.set((int) ((LongColumnVector) vector).vector[row]);
return result;
} else {
return null;
}
}
public static LongWritable nextLong(ColumnVector vector,
int row,
Object previous) {
if (vector.isRepeating) {
row = 0;
}
if (vector.noNulls || !vector.isNull[row]) {
LongWritable result;
if (previous == null || previous.getClass() != LongWritable.class) {
result = new LongWritable();
} else {
result = (LongWritable) previous;
}
result.set(((LongColumnVector) vector).vector[row]);
return result;
} else {
return null;
}
}
public static FloatWritable nextFloat(ColumnVector vector,
int row,
Object previous) {
if (vector.isRepeating) {
row = 0;
}
if (vector.noNulls || !vector.isNull[row]) {
FloatWritable result;
if (previous == null || previous.getClass() != FloatWritable.class) {
result = new FloatWritable();
} else {
result = (FloatWritable) previous;
}
result.set((float) ((DoubleColumnVector) vector).vector[row]);
return result;
} else {
return null;
}
}
public static DoubleWritable nextDouble(ColumnVector vector,
int row,
Object previous) {
if (vector.isRepeating) {
row = 0;
}
if (vector.noNulls || !vector.isNull[row]) {
DoubleWritable result;
if (previous == null || previous.getClass() != DoubleWritable.class) {
result = new DoubleWritable();
} else {
result = (DoubleWritable) previous;
}
result.set(((DoubleColumnVector) vector).vector[row]);
return result;
} else {
return null;
}
}
public static Text nextString(ColumnVector vector,
int row,
Object previous) {
if (vector.isRepeating) {
row = 0;
}
if (vector.noNulls || !vector.isNull[row]) {
Text result;
if (previous == null || previous.getClass() != Text.class) {
result = new Text();
} else {
result = (Text) previous;
}
BytesColumnVector bytes = (BytesColumnVector) vector;
result.set(bytes.vector[row], bytes.start[row], bytes.length[row]);
return result;
} else {
return null;
}
}
public static HiveCharWritable nextChar(ColumnVector vector,
int row,
int size,
Object previous) {
if (vector.isRepeating) {
row = 0;
}
if (vector.noNulls || !vector.isNull[row]) {
HiveCharWritable result;
if (previous == null || previous.getClass() != HiveCharWritable.class) {
result = new HiveCharWritable();
} else {
result = (HiveCharWritable) previous;
}
BytesColumnVector bytes = (BytesColumnVector) vector;
result.set(bytes.toString(row), size);
return result;
} else {
return null;
}
}
public static HiveVarcharWritable nextVarchar(
ColumnVector vector, int row, int size, Object previous) {
if (vector.isRepeating) {
row = 0;
}
if (vector.noNulls || !vector.isNull[row]) {
HiveVarcharWritable result;
if (previous == null || previous.getClass() != HiveVarcharWritable.class) {
result = new HiveVarcharWritable();
} else {
result = (HiveVarcharWritable) previous;
}
BytesColumnVector bytes = (BytesColumnVector) vector;
result.set(bytes.toString(row), size);
return result;
} else {
return null;
}
}
public static BytesWritable nextBinary(ColumnVector vector,
int row,
Object previous) {
if (vector.isRepeating) {
row = 0;
}
if (vector.noNulls || !vector.isNull[row]) {
BytesWritable result;
if (previous == null || previous.getClass() != BytesWritable.class) {
result = new BytesWritable();
} else {
result = (BytesWritable) previous;
}
BytesColumnVector bytes = (BytesColumnVector) vector;
result.set(bytes.vector[row], bytes.start[row], bytes.length[row]);
return result;
} else {
return null;
}
}
public static HiveDecimalWritable nextDecimal(ColumnVector vector,
int row,
Object previous) {
if (vector.isRepeating) {
row = 0;
}
if (vector.noNulls || !vector.isNull[row]) {
HiveDecimalWritable result;
if (previous == null || previous.getClass() != HiveDecimalWritable.class) {
result = new HiveDecimalWritable();
} else {
result = (HiveDecimalWritable) previous;
}
result.set(((DecimalColumnVector) vector).vector[row]);
return result;
} else {
return null;
}
}
public static DateWritable nextDate(ColumnVector vector,
int row,
Object previous) {
if (vector.isRepeating) {
row = 0;
}
if (vector.noNulls || !vector.isNull[row]) {
DateWritable result;
if (previous == null || previous.getClass() != DateWritable.class) {
result = new DateWritable();
} else {
result = (DateWritable) previous;
}
int date = (int) ((LongColumnVector) vector).vector[row];
result.set(date);
return result;
} else {
return null;
}
}
public static TimestampWritable nextTimestamp(ColumnVector vector,
int row,
Object previous) {
if (vector.isRepeating) {
row = 0;
}
if (vector.noNulls || !vector.isNull[row]) {
TimestampWritable result;
if (previous == null || previous.getClass() != TimestampWritable.class) {
result = new TimestampWritable();
} else {
result = (TimestampWritable) previous;
}
TimestampColumnVector tcv = (TimestampColumnVector) vector;
result.setInternal(tcv.time[row], tcv.nanos[row]);
return result;
} else {
return null;
}
}
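  /**
   * Converts one row of a StructColumnVector into the subclass-provided struct
   * representation, recursing into each child field via nextValue.
   */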
public StructType nextStruct(
ColumnVector vector, int row, StructTypeInfo schema, Object previous) {
if (vector.isRepeating) {
row = 0;
}
if (vector.noNulls || !vector.isNull[row]) {
      List<TypeInfo> childrenTypes = schema.getAllStructFieldTypeInfos();
StructType result = createStructObject(previous, childrenTypes);
StructColumnVector struct = (StructColumnVector) vector;
for (int f = 0; f < childrenTypes.size(); ++f) {
setStructCol(result, f, nextValue(struct.fields[f], row,
childrenTypes.get(f), getStructCol(result, f)));
}
return result;
} else {
return null;
}
}
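  /** Reads the union tag for this row, then converts only the selected child field. */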
private UnionType nextUnion(
ColumnVector vector, int row, UnionTypeInfo schema, Object previous) {
if (vector.isRepeating) {
row = 0;
}
if (vector.noNulls || !vector.isNull[row]) {
      List<TypeInfo> childrenTypes = schema.getAllUnionObjectTypeInfos();
UnionType result = createUnionObject(childrenTypes, previous);
UnionColumnVector union = (UnionColumnVector) vector;
byte tag = (byte) union.tags[row];
setUnion(result, tag, nextValue(union.fields[tag], row, childrenTypes.get(tag),
getUnionField(result)));
return result;
} else {
return null;
}
}
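  /**
   * Converts one row of a ListColumnVector into an ArrayList, reusing the elements of
   * the previous list in place and then shrinking or growing it to the new length.
   */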
  private ArrayList<Object> nextList(
ColumnVector vector, int row, ListTypeInfo schema, Object previous) {
if (vector.isRepeating) {
row = 0;
}
if (vector.noNulls || !vector.isNull[row]) {
      ArrayList<Object> result;
      if (previous == null || previous.getClass() != ArrayList.class) {
        result = new ArrayList<>();
      } else {
        result = (ArrayList<Object>) previous;
}
ListColumnVector list = (ListColumnVector) vector;
int length = (int) list.lengths[row];
int offset = (int) list.offsets[row];
result.ensureCapacity(length);
int oldLength = result.size();
int idx = 0;
TypeInfo childType = schema.getListElementTypeInfo();
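      // Overwrite, in place, the elements that can be reused from the previous list.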
while (idx < length && idx < oldLength) {
result.set(idx, nextValue(list.child, offset + idx, childType,
result.get(idx)));
idx += 1;
}
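      // Then trim leftover elements, or append newly converted ones, to reach the new length.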
if (length < oldLength) {
      for (int i = oldLength - 1; i >= length; --i) {
result.remove(i);
}
} else if (oldLength < length) {
while (idx < length) {
result.add(nextValue(list.child, offset + idx, childType, null));
idx += 1;
}
}
return result;
} else {
return null;
}
}
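  /**
   * Converts one row of a MapColumnVector into a HashMap. Unlike lists, keys and
   * values are not reused; the previous map is simply cleared and repopulated.
   */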
  private HashMap<Object, Object> nextMap(
ColumnVector vector, int row, MapTypeInfo schema, Object previous) {
if (vector.isRepeating) {
row = 0;
}
if (vector.noNulls || !vector.isNull[row]) {
MapColumnVector map = (MapColumnVector) vector;
int length = (int) map.lengths[row];
int offset = (int) map.offsets[row];
TypeInfo keyType = schema.getMapKeyTypeInfo();
TypeInfo valueType = schema.getMapValueTypeInfo();
      HashMap<Object, Object> result;
      if (previous == null || previous.getClass() != HashMap.class) {
        result = new HashMap<>(length);
      } else {
        result = (HashMap<Object, Object>) previous;
// I couldn't think of a good way to reuse the keys and value objects
// without even more allocations, so take the easy and safe approach.
result.clear();
}
      for (int e = 0; e < length; ++e) {
result.put(nextValue(map.keys, e + offset, keyType, null),
nextValue(map.values, e + offset, valueType, null));
}
return result;
} else {
return null;
}
}
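  /**
   * Dispatches on the declared Hive type, converting the value at (vector, row) into
   * the matching writable or nested object and reusing {@code previous} when possible.
   */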
private Object nextValue(ColumnVector vector, int row, TypeInfo schema, Object previous) {
switch (schema.getCategory()) {
case STRUCT:
return nextStruct(vector, row, (StructTypeInfo)schema, previous);
case UNION:
return nextUnion(vector, row, (UnionTypeInfo)schema, previous);
case LIST:
return nextList(vector, row, (ListTypeInfo)schema, previous);
case MAP:
return nextMap(vector, row, (MapTypeInfo)schema, previous);
case PRIMITIVE: {
PrimitiveTypeInfo pschema = (PrimitiveTypeInfo)schema;
switch (pschema.getPrimitiveCategory()) {
case BOOLEAN:
return nextBoolean(vector, row, previous);
case BYTE:
return nextByte(vector, row, previous);
case SHORT:
return nextShort(vector, row, previous);
case INT:
return nextInt(vector, row, previous);
case LONG:
return nextLong(vector, row, previous);
case FLOAT:
return nextFloat(vector, row, previous);
case DOUBLE:
return nextDouble(vector, row, previous);
case STRING:
return nextString(vector, row, previous);
case CHAR:
return nextChar(vector, row, ((CharTypeInfo)pschema).getLength(), previous);
case VARCHAR:
return nextVarchar(vector, row, ((VarcharTypeInfo)pschema).getLength(), previous);
case BINARY:
return nextBinary(vector, row, previous);
case DECIMAL:
return nextDecimal(vector, row, previous);
case DATE:
return nextDate(vector, row, previous);
case TIMESTAMP:
return nextTimestamp(vector, row, previous);
default:
throw new IllegalArgumentException("Unknown type " + schema);
}
}
default:
throw new IllegalArgumentException("Unknown type " + schema);
}
}
}