// com.aliyun.datahub.client.impl.batch.arrow.ArrowUtils (Maven / Gradle / Ivy)
// The newest version!
package com.aliyun.datahub.client.impl.batch.arrow;
import com.aliyun.datahub.client.exception.DatahubClientException;
import com.aliyun.datahub.client.impl.batch.BatchConstants;
import com.aliyun.datahub.client.model.Field;
import com.aliyun.datahub.client.model.FieldType;
import com.aliyun.datahub.client.model.RecordSchema;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.types.FloatingPointPrecision;
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.Schema;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
/**
 * Static helpers that translate DataHub record schemas into Apache Arrow schemas and expose
 * the Arrow buffer allocator shared by callers of this class.
 *
 * <p>Every generated schema ends with an attribute column (a map of string keys to string
 * values) named {@link BatchConstants#ATTRIBUTE_COLUMN_NAME}.
 */
public class ArrowUtils {
    /** Single allocator instance handed out to all callers; created with a limit of {@code Integer.MAX_VALUE}. */
    private static final BufferAllocator BUFFER_ALLOCATOR = new RootAllocator(Integer.MAX_VALUE);

    /**
     * Returns the shared Arrow buffer allocator.
     *
     * @return the same allocator instance on every call
     */
    public static BufferAllocator getBufferAllocator() {
        return BUFFER_ALLOCATOR;
    }

    /**
     * Builds the Arrow schema for a topic.
     *
     * @param recordSchema the DataHub schema, or {@code null} to request the blob layout
     * @return a tuple schema derived from {@code recordSchema}, or the blob schema when
     *         {@code recordSchema} is {@code null}
     * @throws DatahubClientException if a field type has no Arrow mapping
     */
    public static Schema genArrowSchema(RecordSchema recordSchema) {
        return recordSchema != null ? genTupleSchema(recordSchema) : genBlobSchema();
    }

    /**
     * Maps each DataHub field, in declaration order, to an Arrow field with the same name and
     * nullability, then appends the attribute column.
     */
    private static Schema genTupleSchema(RecordSchema recordSchema) {
        List<org.apache.arrow.vector.types.pojo.Field> fieldList = new ArrayList<>();
        for (Field dhField : recordSchema.getFields()) {
            fieldList.add(getDataField(dhField.getName(), dhField.getType(), dhField.isAllowNull()));
        }
        fieldList.add(getAttributeField());
        return new Schema(fieldList);
    }

    /**
     * Builds the blob schema: one non-nullable binary column named
     * {@link BatchConstants#BLOB_COLUMN_NAME}, followed by the attribute column.
     */
    private static Schema genBlobSchema() {
        List<org.apache.arrow.vector.types.pojo.Field> fieldList = new ArrayList<>();
        fieldList.add(getDataField(BatchConstants.BLOB_COLUMN_NAME, new ArrowType.Binary(), false));
        fieldList.add(getAttributeField());
        return new Schema(fieldList);
    }

    /**
     * Creates a childless Arrow field from a DataHub field type.
     *
     * @throws DatahubClientException if {@code type} has no Arrow mapping
     */
    private static org.apache.arrow.vector.types.pojo.Field getDataField(String name, FieldType type, boolean allowNull) {
        return getDataField(name, getArrowType(type), allowNull);
    }

    /** Creates a childless Arrow field of the given Arrow type, without a dictionary encoding. */
    private static org.apache.arrow.vector.types.pojo.Field getDataField(String name, ArrowType type, boolean allowNull) {
        return new org.apache.arrow.vector.types.pojo.Field(
                name,
                new org.apache.arrow.vector.types.pojo.FieldType(allowNull, type, null),
                null);
    }

    /**
     * Builds the attribute column: a nullable map field whose single child, named
     * {@code "element"}, is a non-nullable struct holding non-nullable string fields
     * {@code "key"} and {@code "value"}.
     */
    private static org.apache.arrow.vector.types.pojo.Field getAttributeField() {
        List<org.apache.arrow.vector.types.pojo.Field> children = Collections.singletonList(
                new org.apache.arrow.vector.types.pojo.Field("element",
                        new org.apache.arrow.vector.types.pojo.FieldType(false, new ArrowType.Struct(), null, null),
                        Arrays.asList(
                                getDataField("key", FieldType.STRING, false),
                                getDataField("value", FieldType.STRING, false)
                        )
                )
        );
        return new org.apache.arrow.vector.types.pojo.Field(
                BatchConstants.ATTRIBUTE_COLUMN_NAME,
                new org.apache.arrow.vector.types.pojo.FieldType(true, new ArrowType.Map(false), null),
                children);
    }

    /**
     * Translates a DataHub field type into its Arrow counterpart.
     *
     * <p>{@code BIGINT} and {@code TIMESTAMP} both become signed 64-bit integers;
     * {@code STRING}, {@code DECIMAL} and {@code JSON} are all carried as UTF-8 text.
     *
     * @param type the DataHub field type
     * @return the matching Arrow type
     * @throws DatahubClientException if {@code type} is {@code null} or has no mapping
     */
    private static ArrowType getArrowType(FieldType type) {
        // Guard against null: switching on a null enum would raise a bare NullPointerException
        // instead of the descriptive client exception thrown below.
        if (type != null) {
            switch (type) {
                case BOOLEAN:
                    return new ArrowType.Bool();
                case TINYINT:
                    return new ArrowType.Int(8, true);
                case SMALLINT:
                    return new ArrowType.Int(16, true);
                case INTEGER:
                    return new ArrowType.Int(32, true);
                case BIGINT:
                case TIMESTAMP:
                    return new ArrowType.Int(64, true);
                case FLOAT:
                    return new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE);
                case DOUBLE:
                    return new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE);
                case STRING:
                case DECIMAL:
                case JSON:
                    return new ArrowType.Utf8();
            }
        }
        throw new DatahubClientException("Unknown DataHub type " + type);
    }
}