package com.aliyun.datahub.client.impl.batch.avro;

import com.aliyun.datahub.client.exception.DatahubClientException;
import com.aliyun.datahub.client.impl.batch.BatchConstants;
import com.aliyun.datahub.client.impl.batch.BatchDeserializer;
import com.aliyun.datahub.client.impl.batch.header.BatchHeader;
import com.aliyun.datahub.client.model.*;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.BinaryDecoder;
import org.apache.avro.io.DatumReader;
import org.apache.avro.io.DecoderFactory;
import org.apache.avro.util.Utf8;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.InputStream;
import java.math.BigDecimal;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

public class AvroDeserializer extends BatchDeserializer {
    private static final Logger LOGGER = LoggerFactory.getLogger(AvroDeserializer.class);

    @Override
    public List<RecordEntry> deserializeRecord(InputStream inputStream, BatchHeader header) {
        try {
            List<GenericRecord> genericRecordList = deserializeAvroRecord(inputStream, header);
            return convertRecord(genericRecordList, header.getSchemaVersion());
        } catch (Exception e) {
            LOGGER.error("Deserialize avro record failed", e);
            throw new DatahubClientException(e.getMessage());
        }
    }

    private List<GenericRecord> deserializeAvroRecord(InputStream inputStream, BatchHeader header) throws IOException {
        RecordSchema dhSchema = getSchema(header.getSchemaVersion());
        Schema avroSchema = AvroSchemaCache.getSchema(dhSchema);
        BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(inputStream, null);
        DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(avroSchema);
        List<GenericRecord> recordList = new ArrayList<>(header.getRecordCount());
        while (!decoder.isEnd()) {
            GenericRecord record = new GenericData.Record(avroSchema);
            datumReader.read(record, decoder);
            recordList.add(record);
        }
        return recordList;
    }
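
    /*
     * Illustrative sketch (not part of the SDK): the loop above follows the
     * standard Avro streaming pattern, where a single BinaryDecoder wraps the
     * input stream and isEnd() reports exhaustion. A minimal round trip with a
     * made-up one-field schema would look like:
     *
     *   Schema schema = SchemaBuilder.record("demo").fields()
     *           .requiredString("f0").endRecord();
     *   ByteArrayOutputStream bos = new ByteArrayOutputStream();
     *   BinaryEncoder encoder = EncoderFactory.get().binaryEncoder(bos, null);
     *   GenericRecord rec = new GenericData.Record(schema);
     *   rec.put("f0", "hello");
     *   new GenericDatumWriter<GenericRecord>(schema).write(rec, encoder);
     *   encoder.flush();
     *
     *   BinaryDecoder decoder = DecoderFactory.get()
     *           .binaryDecoder(new ByteArrayInputStream(bos.toByteArray()), null);
     *   DatumReader<GenericRecord> reader = new GenericDatumReader<>(schema);
     *   while (!decoder.isEnd()) {
     *       GenericRecord out = reader.read(null, decoder);
     *   }
     */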

    public List<RecordEntry> convertRecord(List<GenericRecord> genericRecordList, int schemaVersion) {
        RecordSchema dhSchema = getSchema(schemaVersion);
        List<RecordEntry> recordEntryList = new ArrayList<>(genericRecordList.size());
        for (int idx = 0; idx < genericRecordList.size(); ++idx) {
            RecordEntry entry = new RecordEntry();
            GenericRecord record = genericRecordList.get(idx);
            if (dhSchema != null) {
                TupleRecordData data = new TupleRecordData(dhSchema);
                setColumnValue(record, data);
                entry.setRecordData(data);
            } else {
                ByteBuffer buffer = (ByteBuffer) record.get(BatchConstants.BLOB_COLUMN_NAME);
                entry.setRecordData(new BlobRecordData(buffer.array()));
            }
            Map<Utf8, Utf8> attrs = (Map<Utf8, Utf8>) record.get(BatchConstants.ATTRIBUTE_COLUMN_NAME);
            if (attrs != null && !attrs.isEmpty()) {
                for (Map.Entry<Utf8, Utf8> me : attrs.entrySet()) {
                    entry.addAttribute(me.getKey().toString(), me.getValue().toString());
                }
            }
            // The schemaVersion of deserialized binary records may differ, so this must
            // stay in the subclass; if every record in the binary payload were guaranteed
            // to share one schemaVersion, this logic could move to the base class.
            entry.innerSetSegmentInfo(schemaVersion, 0, idx);
            recordEntryList.add(entry);
        }
        return recordEntryList;
    }

    private void setColumnValue(GenericRecord record, TupleRecordData data) {
        for (int idx = 0; idx < data.getRecordSchema().getFields().size(); ++idx) {
            Object obj = record.get(idx);
            if (obj == null) {
                continue;
            }
            FieldType type = data.getRecordSchema().getField(idx).getType();
            // Avro decimals require a fixed precision, while DataHub handles decimals
            // entirely as strings, so the value is carried as a string and parsed back here.
            if (type == FieldType.DECIMAL) {
                obj = new BigDecimal(obj.toString());
            } else if (type == FieldType.STRING || type == FieldType.JSON) {
                // org.apache.avro.util.Utf8 => String
                obj = obj.toString();
            } else if (type == FieldType.TINYINT) {
                obj = ((Integer) obj).byteValue();
            } else if (type == FieldType.SMALLINT) {
                obj = ((Integer) obj).shortValue();
            }
            data.setField(idx, obj);
        }
    }
}
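
/*
 * For reference, the coercions performed in setColumnValue above (Avro runtime
 * type on the left, value stored into TupleRecordData on the right):
 *
 *   FieldType.DECIMAL    Utf8/String -> BigDecimal (decimal travels as a string)
 *   FieldType.STRING     Utf8        -> String
 *   FieldType.JSON       Utf8        -> String
 *   FieldType.TINYINT    Integer     -> Byte
 *   FieldType.SMALLINT   Integer     -> Short
 *   all other types      passed through unchanged
 */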