com.aliyun.datahub.client.impl.batch.avro.AvroDeserializer
package com.aliyun.datahub.client.impl.batch.avro;

import com.aliyun.datahub.client.exception.DatahubClientException;
import com.aliyun.datahub.client.impl.batch.BatchConstants;
import com.aliyun.datahub.client.impl.batch.BatchDeserializer;
import com.aliyun.datahub.client.impl.batch.header.BatchHeader;
import com.aliyun.datahub.client.model.*;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.BinaryDecoder;
import org.apache.avro.io.DatumReader;
import org.apache.avro.io.DecoderFactory;
import org.apache.avro.util.Utf8;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.InputStream;
import java.math.BigDecimal;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

public class AvroDeserializer extends BatchDeserializer {
    private static final Logger LOGGER = LoggerFactory.getLogger(AvroDeserializer.class);

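    /**
     * Deserializes an Avro-encoded batch payload into DataHub records: decodes the
     * binary stream described by the batch header, then converts each decoded
     * GenericRecord into a RecordEntry.
     */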
    @Override
    public List<RecordEntry> deserializeRecord(InputStream inputStream, BatchHeader header) {
        try {
            List<GenericRecord> genericRecordList = deserializeAvroRecord(inputStream, header);
            return convertRecord(genericRecordList, header.getSchemaVersion());
        } catch (Exception e) {
            LOGGER.error("Deserialize avro record failed", e);
            throw new DatahubClientException(e.getMessage());
        }
    }

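    /**
     * Reads Avro binary records from the stream until the decoder is exhausted,
     * using the Avro schema cached for this header's schema version.
     */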
    private List<GenericRecord> deserializeAvroRecord(InputStream inputStream, BatchHeader header) throws IOException {
        RecordSchema dhSchema = getSchema(header.getSchemaVersion());
        Schema avroSchema = AvroSchemaCache.getSchema(dhSchema);

        BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(inputStream, null);
        DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(avroSchema);

        List<GenericRecord> recordList = new ArrayList<>(header.getRecordCount());
        while (!decoder.isEnd()) {
            GenericRecord record = new GenericData.Record(avroSchema);
            datumReader.read(record, decoder);
            recordList.add(record);
        }
        return recordList;
    }

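    /**
     * Converts decoded Avro records into RecordEntry objects: tuple data when a
     * DataHub schema exists for the schema version, blob data otherwise, plus any
     * attributes carried in the reserved attribute column.
     */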
    public List<RecordEntry> convertRecord(List<GenericRecord> genericRecordList, int schemaVersion) {
        RecordSchema dhSchema = getSchema(schemaVersion);
        List<RecordEntry> recordEntryList = new ArrayList<>(genericRecordList.size());
        for (int idx = 0; idx < genericRecordList.size(); ++idx) {
            RecordEntry entry = new RecordEntry();
            GenericRecord record = genericRecordList.get(idx);
            if (dhSchema != null) {
                TupleRecordData data = new TupleRecordData(dhSchema);
                setColumnValue(record, data);
                entry.setRecordData(data);
            } else {
                ByteBuffer buffer = (ByteBuffer) record.get(BatchConstants.BLOB_COLUMN_NAME);
                entry.setRecordData(new BlobRecordData(buffer.array()));
            }

            Map<Utf8, Utf8> attrs = (Map<Utf8, Utf8>) record.get(BatchConstants.ATTRIBUTE_COLUMN_NAME);

            if (attrs != null && !attrs.isEmpty()) {
                for (Map.Entry<Utf8, Utf8> me : attrs.entrySet()) {
                    entry.addAttribute(me.getKey().toString(), me.getValue().toString());
                }
            }

            // Because the schemaVersion may differ between binary payloads being deserialized,
            // this has to live in the subclass. If every schemaVersion in the binary data were
            // guaranteed to be identical, this logic could move up into the base class.
            entry.innerSetSegmentInfo(schemaVersion, 0, idx);
            recordEntryList.add(entry);
        }
        return recordEntryList;
    }

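    /**
     * Copies each non-null Avro field into the tuple data, converting Avro-side
     * representations to the DataHub field types: Utf8 to String, decimal via its
     * String form, and int narrowed back to TINYINT/SMALLINT.
     */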
    private void setColumnValue(GenericRecord record, TupleRecordData data) {
        for (int idx = 0; idx < data.getRecordSchema().getFields().size(); ++idx) {
            Object obj = record.get(idx);
            if (obj == null) {
                continue;
            }

            FieldType type = data.getRecordSchema().getField(idx).getType();
            // Avro decimals require a fixed precision, but DataHub treats them all as strings,
            // so the decimal is rebuilt from its String representation here.
            if (type == FieldType.DECIMAL) {
                obj = new BigDecimal(obj.toString());
            } else if (type == FieldType.STRING || type == FieldType.JSON) {
                // org.apache.avro.util.Utf8 => String
                obj = obj.toString();
            } else if (type == FieldType.TINYINT) {
                obj = ((Integer) obj).byteValue();
            } else if (type == FieldType.SMALLINT) {
                obj = ((Integer) obj).shortValue();
            }
            data.setField(idx, obj);
        }
    }
}
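
A minimal usage sketch follows (not part of the source above). It assumes the Avro-encoded
batch body and its already-parsed BatchHeader are in hand, and that AvroDeserializer can be
constructed directly; in the real client the deserializer is typically wired up by the batch
serialization layer, so treat this only as an illustration of the call shape.

import com.aliyun.datahub.client.impl.batch.avro.AvroDeserializer;
import com.aliyun.datahub.client.impl.batch.header.BatchHeader;
import com.aliyun.datahub.client.model.RecordEntry;
import com.aliyun.datahub.client.model.TupleRecordData;

import java.io.ByteArrayInputStream;
import java.util.List;

public class AvroDeserializerSketch {
    // batchBody: the Avro-encoded record block; header: its parsed BatchHeader.
    // How both are extracted from the wire format is outside this class and not shown.
    static void printRecords(byte[] batchBody, BatchHeader header) {
        AvroDeserializer deserializer = new AvroDeserializer();
        List<RecordEntry> entries =
                deserializer.deserializeRecord(new ByteArrayInputStream(batchBody), header);
        for (RecordEntry entry : entries) {
            if (entry.getRecordData() instanceof TupleRecordData) {
                TupleRecordData data = (TupleRecordData) entry.getRecordData();
                // Print the first column of each tuple record.
                System.out.println(data.getField(0));
            }
            System.out.println(entry.getAttributes());
        }
    }
}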



