/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.common.util;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.avro.Conversions;
import org.apache.avro.LogicalType;
import org.apache.avro.LogicalTypes;
import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericData.StringType;
import org.apache.avro.util.Utf8;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.UnionColumnVector;
import org.apache.hadoop.hive.serde2.io.DateWritable;
import org.apache.orc.TypeDescription;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.nio.ByteBuffer;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Base64;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import static org.apache.avro.JsonProperties.NULL_VALUE;
import static org.apache.hudi.common.util.BinaryUtil.toBytes;
import static org.apache.hudi.common.util.StringUtils.getUTF8Bytes;
/**
* Methods including addToVector, addUnionValue, createOrcSchema are originally from
* https://github.com/streamsets/datacollector.
* Source classes:
* - com.streamsets.pipeline.lib.util.avroorc.AvroToOrcRecordConverter
* - com.streamsets.pipeline.lib.util.avroorc.AvroToOrcSchemaConverter
*
* Changes made:
* 1. Flatten nullable Avro schema type when the value is not null in `addToVector`.
* 2. Use getLogicalType(), constants from LogicalTypes instead of getJsonProp() to handle Avro logical types.
*/
public class AvroOrcUtils {
private static final int MICROS_PER_MILLI = 1000;
private static final int NANOS_PER_MICRO = 1000;
/**
* Add an object (of a given ORC type) to the column vector at a given position.
*
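* <p>Minimal usage sketch (the ORC type, Avro schema and value below are illustrative,
* not taken from this class):
* <pre>{@code
* TypeDescription orcType = TypeDescription.fromString("bigint");
* Schema avroType = Schema.create(Schema.Type.LONG);
* LongColumnVector vector = new LongColumnVector();
* AvroOrcUtils.addToVector(orcType, vector, avroType, 42L, 0);  // vector.vector[0] == 42
* }</pre>
*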
* @param type ORC schema of the value Object.
* @param colVector The column vector to store the value Object.
* @param avroSchema Avro schema of the value Object.
* Only used to check logical types for timestamp unit conversion.
* @param value Object to be added to the column vector
* @param vectorPos The position in the vector where value will be stored at.
*/
public static void addToVector(TypeDescription type, ColumnVector colVector, Schema avroSchema, Object value, int vectorPos) {
final int currentVecLength = colVector.isNull.length;
if (vectorPos >= currentVecLength) {
colVector.ensureSize(2 * currentVecLength, true);
}
if (value == null) {
colVector.isNull[vectorPos] = true;
colVector.noNulls = false;
return;
}
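// For a nullable field the Avro schema is a UNION such as ["null","long"]; value is
// non-null here, so resolve the union to its non-null branch first so that any logical
// type (date, decimal, timestamp-millis/micros) can be read off that branch.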
if (avroSchema.getType().equals(Schema.Type.UNION)) {
avroSchema = getActualSchemaType(avroSchema);
}
LogicalType logicalType = avroSchema != null ? avroSchema.getLogicalType() : null;
switch (type.getCategory()) {
case BOOLEAN:
LongColumnVector boolVec = (LongColumnVector) colVector;
boolVec.vector[vectorPos] = (boolean) value ? 1 : 0;
break;
case BYTE:
LongColumnVector byteColVec = (LongColumnVector) colVector;
byteColVec.vector[vectorPos] = (byte) value;
break;
case SHORT:
LongColumnVector shortColVec = (LongColumnVector) colVector;
shortColVec.vector[vectorPos] = (short) value;
break;
case INT:
// the Avro logical type could be time-millis (LogicalTypes.TimeMillis), but we ignore that here
// since ORC has no way to represent a time the way Avro defines it; we simply preserve the int value
LongColumnVector intColVec = (LongColumnVector) colVector;
intColVec.vector[vectorPos] = (int) value;
break;
case LONG:
// the Avro logical type could be time-micros (LogicalTypes.TimeMicros), but we ignore that here
// since ORC has no way to represent a time the way Avro defines it; we simply preserve the long value
LongColumnVector longColVec = (LongColumnVector) colVector;
longColVec.vector[vectorPos] = (long) value;
break;
case FLOAT:
DoubleColumnVector floatColVec = (DoubleColumnVector) colVector;
floatColVec.vector[vectorPos] = (float) value;
break;
case DOUBLE:
DoubleColumnVector doubleColVec = (DoubleColumnVector) colVector;
doubleColVec.vector[vectorPos] = (double) value;
break;
case VARCHAR:
case CHAR:
case STRING:
BytesColumnVector bytesColVec = (BytesColumnVector) colVector;
byte[] bytes = null;
if (value instanceof String) {
bytes = getUTF8Bytes((String) value);
} else if (value instanceof Utf8) {
final Utf8 utf8 = (Utf8) value;
bytes = utf8.getBytes();
} else if (value instanceof GenericData.EnumSymbol) {
bytes = getUTF8Bytes(((GenericData.EnumSymbol) value).toString());
} else {
throw new IllegalStateException(String.format(
"Unrecognized type for Avro %s field value, which has type %s, value %s",
type.getCategory().getName(),
value.getClass().getName(),
value.toString()
));
}
if (bytes == null) {
bytesColVec.isNull[vectorPos] = true;
bytesColVec.noNulls = false;
} else {
bytesColVec.setRef(vectorPos, bytes, 0, bytes.length);
}
break;
case DATE:
LongColumnVector dateColVec = (LongColumnVector) colVector;
int daysSinceEpoch;
if (logicalType instanceof LogicalTypes.Date) {
daysSinceEpoch = (int) value;
} else if (value instanceof java.sql.Date) {
daysSinceEpoch = DateWritable.dateToDays((java.sql.Date) value);
} else if (value instanceof Date) {
daysSinceEpoch = DateWritable.millisToDays(((Date) value).getTime());
} else {
throw new IllegalStateException(String.format(
"Unrecognized type for Avro DATE field value, which has type %s, value %s",
value.getClass().getName(),
value.toString()
));
}
dateColVec.vector[vectorPos] = daysSinceEpoch;
break;
case TIMESTAMP:
TimestampColumnVector tsColVec = (TimestampColumnVector) colVector;
long time;
int nanos = 0;
// The unit for Timestamp in ORC is millis, convert timestamp to millis if needed
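// e.g. a timestamp-micros value of 1_234_567 becomes time = 1_234 ms and nanos = 567_000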
if (logicalType instanceof LogicalTypes.TimestampMillis) {
time = (long) value;
} else if (logicalType instanceof LogicalTypes.TimestampMicros) {
final long logicalTsValue = (long) value;
time = logicalTsValue / MICROS_PER_MILLI;
nanos = NANOS_PER_MICRO * ((int) (logicalTsValue % MICROS_PER_MILLI));
} else if (value instanceof Timestamp) {
Timestamp tsValue = (Timestamp) value;
time = tsValue.getTime();
nanos = tsValue.getNanos();
} else if (value instanceof java.sql.Date) {
java.sql.Date sqlDateValue = (java.sql.Date) value;
time = sqlDateValue.getTime();
} else if (value instanceof Date) {
Date dateValue = (Date) value;
time = dateValue.getTime();
} else {
throw new IllegalStateException(String.format(
"Unrecognized type for Avro TIMESTAMP field value, which has type %s, value %s",
value.getClass().getName(),
value.toString()
));
}
tsColVec.time[vectorPos] = time;
tsColVec.nanos[vectorPos] = nanos;
break;
case BINARY:
BytesColumnVector binaryColVec = (BytesColumnVector) colVector;
byte[] binaryBytes;
if (value instanceof GenericData.Fixed) {
binaryBytes = ((GenericData.Fixed) value).bytes();
} else if (value instanceof ByteBuffer) {
final ByteBuffer byteBuffer = (ByteBuffer) value;
binaryBytes = toBytes(byteBuffer);
} else if (value instanceof byte[]) {
binaryBytes = (byte[]) value;
} else {
throw new IllegalStateException(String.format(
"Unrecognized type for Avro BINARY field value, which has type %s, value %s",
value.getClass().getName(),
value.toString()
));
}
binaryColVec.setRef(vectorPos, binaryBytes, 0, binaryBytes.length);
break;
case DECIMAL:
DecimalColumnVector decimalColVec = (DecimalColumnVector) colVector;
HiveDecimal decimalValue;
if (value instanceof BigDecimal) {
final BigDecimal decimal = (BigDecimal) value;
decimalValue = HiveDecimal.create(decimal);
} else if (value instanceof ByteBuffer) {
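// Avro's decimal-over-bytes encoding stores the unscaled value as a big-endian
// two's-complement integer; the scale comes from the ORC decimal type.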
final ByteBuffer byteBuffer = (ByteBuffer) value;
final byte[] decimalBytes = new byte[byteBuffer.remaining()];
byteBuffer.get(decimalBytes);
final BigInteger bigInt = new BigInteger(decimalBytes);
final int scale = type.getScale();
BigDecimal bigDecVal = new BigDecimal(bigInt, scale);
decimalValue = HiveDecimal.create(bigDecVal);
if (decimalValue == null && decimalBytes.length > 0) {
throw new IllegalStateException(
"Unexpected read null HiveDecimal from bytes (base-64 encoded): "
+ Base64.getEncoder().encodeToString(decimalBytes)
);
}
} else if (value instanceof GenericData.Fixed) {
final BigDecimal decimal = new Conversions.DecimalConversion()
.fromFixed((GenericData.Fixed) value, avroSchema, logicalType);
decimalValue = HiveDecimal.create(decimal);
} else {
throw new IllegalStateException(String.format(
"Unexpected type for decimal (%s), cannot convert from Avro value",
value.getClass().getCanonicalName()
));
}
if (decimalValue == null) {
decimalColVec.isNull[vectorPos] = true;
decimalColVec.noNulls = false;
} else {
decimalColVec.set(vectorPos, decimalValue);
}
break;
case LIST:
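// ORC lists keep the elements of all rows in a single child vector; offsets[row]
// points at the first element of this row and lengths[row] gives its element count.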
List<?> list = (List<?>) value;
ListColumnVector listColVec = (ListColumnVector) colVector;
listColVec.offsets[vectorPos] = listColVec.childCount;
listColVec.lengths[vectorPos] = list.size();
TypeDescription listType = type.getChildren().get(0);
for (Object listItem : list) {
addToVector(listType, listColVec.child, avroSchema.getElementType(), listItem, listColVec.childCount++);
}
break;
case MAP:
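// ORC maps use parallel keys/values child vectors; as with lists, offsets[row] and
// lengths[row] delimit the entries belonging to this row.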
Map<?, ?> mapValue = (Map<?, ?>) value;
MapColumnVector mapColumnVector = (MapColumnVector) colVector;
mapColumnVector.offsets[vectorPos] = mapColumnVector.childCount;
mapColumnVector.lengths[vectorPos] = mapValue.size();
// keys are always strings
Schema keySchema = Schema.create(Schema.Type.STRING);
for (Map.Entry<?, ?> entry : mapValue.entrySet()) {
addToVector(
type.getChildren().get(0),
mapColumnVector.keys,
keySchema,
entry.getKey(),
mapColumnVector.childCount
);
addToVector(
type.getChildren().get(1),
mapColumnVector.values,
avroSchema.getValueType(),
entry.getValue(),
mapColumnVector.childCount
);
mapColumnVector.childCount++;
}
break;
case STRUCT:
StructColumnVector structColVec = (StructColumnVector) colVector;
GenericData.Record record = (GenericData.Record) value;
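// ORC child i is paired with Avro field i; the field value is looked up in the record
// by the ORC field name and written at the same vectorPos as the parent struct.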
for (int i = 0; i < type.getFieldNames().size(); i++) {
String fieldName = type.getFieldNames().get(i);
Object fieldValue = record.get(fieldName);
TypeDescription fieldType = type.getChildren().get(i);
addToVector(fieldType, structColVec.fields[i], avroSchema.getFields().get(i).schema(), fieldValue, vectorPos);
}
break;
case UNION:
UnionColumnVector unionColVec = (UnionColumnVector) colVector;
List<TypeDescription> childTypes = type.getChildren();
boolean added = addUnionValue(unionColVec, childTypes, avroSchema, value, vectorPos);
if (!added) {
throw new IllegalStateException(String.format(
"Failed to add value %s to union with type %s",
value == null ? "null" : value.toString(),
type.toString()
));
}
break;
default:
throw new IllegalArgumentException("Invalid TypeDescription " + type.toString() + ".");
}
}
/**
* Match value with its ORC type and add to the union vector at a given position.
*
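* <p>Illustrative sketch for an Avro union of long and string mapped to an ORC
* {@code uniontype<bigint,string>} (the schemas and value are examples only):
* <pre>{@code
* Schema avroUnion = Schema.createUnion(
*     Schema.create(Schema.Type.LONG), Schema.create(Schema.Type.STRING));
* TypeDescription orcUnion = TypeDescription.fromString("uniontype<bigint,string>");
* UnionColumnVector vector = (UnionColumnVector) orcUnion.createRowBatch().cols[0];
* // matches the LONG branch: sets tags[0] = 0 and writes 42 into fields[0]
* boolean added = AvroOrcUtils.addUnionValue(vector, orcUnion.getChildren(), avroUnion, 42L, 0);
* }</pre>
*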
* @param unionVector The vector to store value.
* @param unionChildTypes All possible types for the value Object.
* @param avroSchema Avro union schema for the value Object.
* @param value Object to be added to the unionVector
* @param vectorPos The position in the vector where value will be stored at.
* @return succeeded or failed
*/
public static boolean addUnionValue(
UnionColumnVector unionVector,
List<TypeDescription> unionChildTypes,
Schema avroSchema,
Object value,
int vectorPos
) {
int matchIndex = -1;
TypeDescription matchType = null;
Object matchValue = null;
for (int t = 0; t < unionChildTypes.size(); t++) {
TypeDescription childType = unionChildTypes.get(t);
boolean matches = false;
switch (childType.getCategory()) {
case BOOLEAN:
matches = value instanceof Boolean;
break;
case BYTE:
matches = value instanceof Byte;
break;
case SHORT:
matches = value instanceof Short;
break;
case INT:
matches = value instanceof Integer;
break;
case LONG:
matches = value instanceof Long;
break;
case FLOAT:
matches = value instanceof Float;
break;
case DOUBLE:
matches = value instanceof Double;
break;
case STRING:
case VARCHAR:
case CHAR:
if (value instanceof String) {
matches = true;
matchValue = getUTF8Bytes((String) value);
} else if (value instanceof Utf8) {
matches = true;
matchValue = ((Utf8) value).getBytes();
}
break;
case DATE:
matches = value instanceof Date;
break;
case TIMESTAMP:
matches = value instanceof Timestamp;
break;
case BINARY:
matches = value instanceof byte[] || value instanceof GenericData.Fixed;
break;
case DECIMAL:
matches = value instanceof BigDecimal;
break;
case LIST:
matches = value instanceof List;
break;
case MAP:
matches = value instanceof Map;
break;
case STRUCT:
throw new UnsupportedOperationException("Cannot handle STRUCT within UNION.");
case UNION:
List<TypeDescription> children = childType.getChildren();
if (value == null) {
matches = children == null || children.size() == 0;
} else {
matches = addUnionValue(unionVector, children, avroSchema, value, vectorPos);
}
break;
default:
throw new IllegalArgumentException("Invalid TypeDescription " + childType.getCategory().toString() + ".");
}
if (matches) {
matchIndex = t;
matchType = childType;
break;
}
}
if (value == null && matchValue != null) {
value = matchValue;
}
if (matchIndex >= 0) {
unionVector.tags[vectorPos] = matchIndex;
if (value == null) {
unionVector.isNull[vectorPos] = true;
unionVector.noNulls = false;
} else {
addToVector(matchType, unionVector.fields[matchIndex], avroSchema.getTypes().get(matchIndex), value, vectorPos);
}
return true;
} else {
return false;
}
}
/**
* Read the Column vector at a given position conforming to a given ORC schema.
*
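* <p>Illustrative sketch, reading back a value written with {@code addToVector}
* (the ORC type, Avro schema and value are examples only):
* <pre>{@code
* TypeDescription orcType = TypeDescription.fromString("bigint");
* Schema avroType = Schema.create(Schema.Type.LONG);
* LongColumnVector vector = new LongColumnVector();
* AvroOrcUtils.addToVector(orcType, vector, avroType, 42L, 0);
* Object read = AvroOrcUtils.readFromVector(orcType, vector, avroType, 0);  // 42L
* }</pre>
*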
* @param type ORC schema of the object to read.
* @param colVector The column vector to read.
* @param avroSchema Avro schema of the object to read.
* Only used to check logical types for timestamp unit conversion.
* @param vectorPos The position in the vector where the value to read is stored at.
* @return The object being read.
*/
public static Object readFromVector(TypeDescription type, ColumnVector colVector, Schema avroSchema, int vectorPos) {
if (colVector.isRepeating) {
vectorPos = 0;
}
if (colVector.isNull[vectorPos]) {
return null;
}
if (avroSchema.getType().equals(Schema.Type.UNION)) {
avroSchema = getActualSchemaType(avroSchema);
}
LogicalType logicalType = avroSchema != null ? avroSchema.getLogicalType() : null;
switch (type.getCategory()) {
case BOOLEAN:
return ((LongColumnVector) colVector).vector[vectorPos] != 0;
case BYTE:
return (byte) ((LongColumnVector) colVector).vector[vectorPos];
case SHORT:
return (short) ((LongColumnVector) colVector).vector[vectorPos];
case INT:
return (int) ((LongColumnVector) colVector).vector[vectorPos];
case LONG:
return ((LongColumnVector) colVector).vector[vectorPos];
case FLOAT:
return (float) ((DoubleColumnVector) colVector).vector[vectorPos];
case DOUBLE:
return ((DoubleColumnVector) colVector).vector[vectorPos];
case VARCHAR:
case CHAR:
int maxLength = type.getMaxLength();
String result = ((BytesColumnVector) colVector).toString(vectorPos);
if (result.length() <= maxLength) {
return result;
} else {
throw new HoodieIOException("CHAR/VARCHAR has length " + result.length() + " greater than Max Length allowed");
}
case STRING:
String stringType = avroSchema.getProp(GenericData.STRING_PROP);
if (stringType == null || !stringType.equals(StringType.String)) {
int stringLength = ((BytesColumnVector) colVector).length[vectorPos];
int stringOffset = ((BytesColumnVector) colVector).start[vectorPos];
byte[] stringBytes = new byte[stringLength];
System.arraycopy(((BytesColumnVector) colVector).vector[vectorPos], stringOffset, stringBytes, 0, stringLength);
return new Utf8(stringBytes);
} else {
return ((BytesColumnVector) colVector).toString(vectorPos);
}
case DATE:
// convert to daysSinceEpoch for LogicalType.Date
return (int) ((LongColumnVector) colVector).vector[vectorPos];
case TIMESTAMP:
// The unit of time in ORC is millis. Convert (time,nanos) to the desired unit per logicalType
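// e.g. time = 1_234 ms with nanos = 567_000 reads back as 1_234_567 for timestamp-micros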
long time = ((TimestampColumnVector) colVector).time[vectorPos];
int nanos = ((TimestampColumnVector) colVector).nanos[vectorPos];
if (logicalType instanceof LogicalTypes.TimestampMillis) {
return time;
} else if (logicalType instanceof LogicalTypes.TimestampMicros) {
return time * MICROS_PER_MILLI + nanos / NANOS_PER_MICRO;
} else {
return ((TimestampColumnVector) colVector).getTimestampAsLong(vectorPos);
}
case BINARY:
int binaryLength = ((BytesColumnVector) colVector).length[vectorPos];
int binaryOffset = ((BytesColumnVector) colVector).start[vectorPos];
byte[] binaryBytes = new byte[binaryLength];
System.arraycopy(((BytesColumnVector) colVector).vector[vectorPos], binaryOffset, binaryBytes, 0, binaryLength);
// return a ByteBuffer to be consistent with AvroRecordConverter
return ByteBuffer.wrap(binaryBytes);
case DECIMAL:
// HiveDecimal always ignores trailing zeros, thus modifies the scale implicitly,
// therefore, the scale must be enforced here.
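// e.g. a decimal(10,2) value 1.20 comes back from HiveDecimal as 1.2; setScale(2) restores 1.20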
BigDecimal bigDecimal = ((DecimalColumnVector) colVector).vector[vectorPos]
.getHiveDecimal().bigDecimalValue()
.setScale(((LogicalTypes.Decimal) logicalType).getScale());
Schema.Type baseType = avroSchema.getType();
if (baseType.equals(Schema.Type.FIXED)) {
return new Conversions.DecimalConversion().toFixed(bigDecimal, avroSchema, logicalType);
} else if (baseType.equals(Schema.Type.BYTES)) {
return bigDecimal.unscaledValue().toByteArray();
} else {
throw new HoodieIOException(baseType.getName() + " is not a valid type for LogicalTypes.DECIMAL.");
}
case LIST:
ArrayList