com.antgroup.geaflow.dsl.connector.hive.HiveReader

/*
 * Copyright 2023 AntGroup CO., Ltd.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */

package com.antgroup.geaflow.dsl.connector.hive;

import com.antgroup.geaflow.common.utils.ClassUtil;
import com.antgroup.geaflow.dsl.common.data.Row;
import com.antgroup.geaflow.dsl.common.data.impl.ObjectRow;
import com.antgroup.geaflow.dsl.common.exception.GeaFlowDSLException;
import com.antgroup.geaflow.dsl.common.types.StructType;
import com.antgroup.geaflow.dsl.common.types.TableField;
import com.antgroup.geaflow.dsl.common.util.Windows;
import com.antgroup.geaflow.dsl.connector.api.FetchData;
import com.antgroup.geaflow.dsl.connector.hive.HiveTableSource.HiveOffset;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.stream.Collectors;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.serde2.Deserializer;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.RecordReader;

/**
 * Reads records from a Hive table split: each record fetched from the underlying Hadoop
 * {@link RecordReader} is deserialized with the table's SerDe and converted into a
 * GeaFlow {@link Row} according to the read schema.
 */
public class HiveReader {

    private final RecordReader<Writable, Writable> recordReader;
    private final StructType readSchema;
    private final Deserializer deserializer;


    public HiveReader(RecordReader<Writable, Writable> recordReader, StructType readSchema,
                      StorageDescriptor sd, Properties tableProps) {
        this.recordReader = recordReader;
        // Hive column names are case-insensitive; normalize the read schema to lower case
        // so fields can be matched against the SerDe's struct fields by name.
        this.readSchema = new StructType(readSchema.getFields().stream().map(
            f -> new TableField(f.getName().toLowerCase(Locale.ROOT), f.getType(), f.isNullable()))
            .collect(Collectors.toList()));
        // Instantiate the SerDe class declared in the table's storage descriptor.
        this.deserializer = ClassUtil.newInstance(sd.getSerdeInfo().getSerializationLib());
        try {
            org.apache.hadoop.conf.Configuration conf = new org.apache.hadoop.conf.Configuration();
            SerDeUtils.initializeSerDe(deserializer, conf, tableProps, null);

        } catch (SerDeException e) {
            throw new GeaFlowDSLException(e);
        }
    }

    public FetchData read(long windowSize, String[] partitionValues) {
        if (windowSize == Windows.SIZE_OF_ALL_WINDOW) {
            // Hive splits are only read as a single batch window.
            Iterator<Row> hiveIterator = new HiveIterator(recordReader, deserializer, partitionValues, readSchema);
            return FetchData.createBatchFetch(hiveIterator, new HiveOffset(-1L));
        } else {
            throw new GeaFlowDSLException("Stream read is not supported for Hive.");
        }
    }

    private static class HiveIterator implements Iterator<Row> {

        private final RecordReader<Writable, Writable> recordReader;
        private final Deserializer deserializer;
        private final String[] partitionValues;
        private final StructType readSchema;

        private final Map<String, StructField> name2Fields = new HashMap<>();

        private final Writable key;
        private final Writable value;


        public HiveIterator(RecordReader<Writable, Writable> recordReader,
                            Deserializer deserializer,
                            String[] partitionValues,
                            StructType readSchema) {
            this.recordReader = recordReader;
            this.deserializer = deserializer;
            this.partitionValues = partitionValues;
            this.readSchema = readSchema;
            key = recordReader.createKey();
            value = recordReader.createValue();

            try {
                StructObjectInspector structObjectInspector = (StructObjectInspector) deserializer.getObjectInspector();
                for (StructField field : structObjectInspector.getAllStructFieldRefs()) {
                    name2Fields.put(field.getFieldName(), field);
                }
            } catch (Exception e) {
                throw new GeaFlowDSLException(e);
            }
        }

        @Override
        public boolean hasNext() {
            try {
                // Note: this advances the underlying RecordReader; the record fetched into
                // key/value here is consumed by the following call to next().
                return recordReader.next(key, value);
            } catch (IOException e) {
                throw new GeaFlowDSLException(e);
            }
        }

        @Override
        public Row next() {
            try {
                Object hiveRowStruct = deserializer.deserialize(value);
                StructObjectInspector structObjectInspector = (StructObjectInspector) deserializer.getObjectInspector();
                Object[] values = convertHiveStructToRow(hiveRowStruct, structObjectInspector);
                if (partitionValues.length > 0) { // append partition values.
                    Object[] valueWithPartitions = new Object[values.length + partitionValues.length];
                    System.arraycopy(values, 0, valueWithPartitions, 0, values.length);
                    System.arraycopy(partitionValues, 0, valueWithPartitions,
                        values.length, partitionValues.length);
                    values = valueWithPartitions;
                }
                return ObjectRow.create(values);
            } catch (Exception e) {
                throw new GeaFlowDSLException(e);
            }
        }

        // Project the deserialized Hive struct onto the read schema; fields that are not
        // present in the Hive struct are filled with null.
        private Object[] convertHiveStructToRow(Object hiveRowStruct, StructObjectInspector structObjectInspector) {
            Object[] values = new Object[readSchema.size()];
            for (int i = 0; i < values.length; i++) {
                String fieldName = readSchema.getField(i).getName();
                StructField field = name2Fields.get(fieldName);
                if (field != null) {
                    values[i] = toSqlValue(
                        structObjectInspector.getStructFieldData(hiveRowStruct, field),
                        field.getFieldObjectInspector());
                } else {
                    values[i] = null;
                }
            }
            return values;
        }

        private Object toSqlValue(Object hiveValue, ObjectInspector fieldInspector) {
            // Only primitive Hive types are converted; complex types (array, map, struct)
            // are not supported yet.
            if (fieldInspector instanceof PrimitiveObjectInspector) {
                PrimitiveObjectInspector primitiveObjectInspector = (PrimitiveObjectInspector) fieldInspector;
                return primitiveObjectInspector.getPrimitiveJavaObject(hiveValue);
            }
            throw new GeaFlowDSLException("Complex type {} is not supported", fieldInspector.getTypeName());
        }
    }
}
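
A minimal usage sketch (not part of the original source): it assumes the caller has already opened a Hadoop RecordReader for a Hive split and fetched the table's StorageDescriptor and SerDe properties from the metastore, which is outside the scope of this class. The sketch class and method names below are hypothetical.

import java.util.Properties;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.RecordReader;
import com.antgroup.geaflow.dsl.common.types.StructType;
import com.antgroup.geaflow.dsl.common.util.Windows;
import com.antgroup.geaflow.dsl.connector.api.FetchData;
import com.antgroup.geaflow.dsl.connector.hive.HiveReader;

public class HiveReaderUsageSketch {

    // Reads an entire Hive split as one batch window and returns the fetched rows.
    // All inputs are assumed to come from the caller's Hive metastore / InputFormat setup.
    public static FetchData readWholeSplit(RecordReader<Writable, Writable> recordReader,
                                           StructType readSchema,
                                           StorageDescriptor sd,
                                           Properties tableProps,
                                           String[] partitionValues) {
        HiveReader reader = new HiveReader(recordReader, readSchema, sd, tableProps);
        // Only Windows.SIZE_OF_ALL_WINDOW is supported; any other window size throws.
        return reader.read(Windows.SIZE_OF_ALL_WINDOW, partitionValues);
    }
}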



