/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.druid.serde;
import com.fasterxml.jackson.core.type.TypeReference;
import com.google.common.base.Function;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import io.druid.query.Druids;
import io.druid.query.Druids.SegmentMetadataQueryBuilder;
import io.druid.query.metadata.metadata.ColumnAnalysis;
import io.druid.query.metadata.metadata.SegmentAnalysis;
import io.druid.query.metadata.metadata.SegmentMetadataQuery;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.common.type.HiveChar;
import org.apache.hadoop.hive.common.type.HiveVarchar;
import org.apache.hadoop.hive.common.type.TimestampTZ;
import org.apache.hadoop.hive.conf.Constants;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.druid.DruidStorageHandler;
import org.apache.hadoop.hive.druid.DruidStorageHandlerUtils;
import org.apache.hadoop.hive.metastore.utils.MetaStoreUtils;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeSpec;
import org.apache.hadoop.hive.serde2.SerDeStats;
import org.apache.hadoop.hive.serde2.io.ByteWritable;
import org.apache.hadoop.hive.serde2.io.DoubleWritable;
import org.apache.hadoop.hive.serde2.io.HiveCharWritable;
import org.apache.hadoop.hive.serde2.io.HiveVarcharWritable;
import org.apache.hadoop.hive.serde2.io.ShortWritable;
import org.apache.hadoop.hive.serde2.io.TimestampLocalTZWritable;
import org.apache.hadoop.hive.serde2.io.TimestampWritable;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveCharObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveVarcharObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampLocalTZObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TimestampLocalTZTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.InputStream;
import java.sql.Timestamp;
import java.time.Instant;
import java.time.ZonedDateTime;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Properties;
import java.util.stream.Collectors;
/**
 * DruidSerDe that is used to deserialize objects read from a Druid data source, and to serialize
 * Hive rows into {@link DruidWritable} objects when writing to Druid.
 */
@SerDeSpec(schemaProps = { Constants.DRUID_DATA_SOURCE })
public class DruidSerDe extends AbstractSerDe {

  protected static final Logger LOG = LoggerFactory.getLogger(DruidSerDe.class);

  private String[] columns;
  private PrimitiveTypeInfo[] types;
  private ObjectInspector inspector;
  private TimestampLocalTZTypeInfo tsTZTypeInfo;
  @Override
  public void initialize(Configuration configuration, Properties properties) throws SerDeException {
    tsTZTypeInfo = new TimestampLocalTZTypeInfo(
        configuration.get(HiveConf.ConfVars.HIVE_LOCAL_TIME_ZONE.varname));
    // Druid query
    final String druidQuery = properties.getProperty(Constants.DRUID_QUERY_JSON, null);
    if (druidQuery != null && !druidQuery.isEmpty()) {
      initFromDruidQueryPlan(properties, druidQuery);
    } else {
      // No query. Either it is a CTAS, or we need to issue a Druid segment metadata query.
      if (!org.apache.commons.lang3.StringUtils
          .isEmpty(properties.getProperty(serdeConstants.LIST_COLUMNS))
          && !org.apache.commons.lang3.StringUtils
          .isEmpty(properties.getProperty(serdeConstants.LIST_COLUMN_TYPES))) {
        // CASE CTAS statement
        initFromProperties(properties);
      } else {
        // Segment metadata query that retrieves all columns present in
        // the data source (dimensions and metrics).
        initFromMetaDataQuery(configuration, properties);
      }
    }
    if (LOG.isDebugEnabled()) {
      LOG.debug("DruidSerDe initialized with\n" + "\t columns: " + Arrays.toString(columns)
          + "\n\t types: " + Arrays.toString(types));
    }
  }
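  /**
   * Infers the schema by submitting a segment metadata query to the Druid broker configured via
   * {@code HiveConf.ConfVars.HIVE_DRUID_BROKER_DEFAULT_ADDRESS}. Used when neither a pushed-down
   * Druid query nor an explicit column list is available in the table properties.
   */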
  private void initFromMetaDataQuery(final Configuration configuration, final Properties properties)
      throws SerDeException {
    final List<String> columnNames = new ArrayList<>();
    final List<PrimitiveTypeInfo> columnTypes = new ArrayList<>();
    final List<ObjectInspector> inspectors = new ArrayList<>();

    String dataSource = properties.getProperty(Constants.DRUID_DATA_SOURCE);
    if (dataSource == null) {
      throw new SerDeException("Druid data source not specified; use " +
          Constants.DRUID_DATA_SOURCE + " in table properties");
    }
    SegmentMetadataQueryBuilder builder = new Druids.SegmentMetadataQueryBuilder();
    builder.dataSource(dataSource);
    builder.merge(true);
    builder.analysisTypes();
    SegmentMetadataQuery query = builder.build();

    // Execute query in Druid
    String address = HiveConf.getVar(configuration,
        HiveConf.ConfVars.HIVE_DRUID_BROKER_DEFAULT_ADDRESS);
    if (org.apache.commons.lang3.StringUtils.isEmpty(address)) {
      throw new SerDeException("Druid broker address not specified in configuration");
    }

    // Infer schema
    SegmentAnalysis schemaInfo;
    try {
      schemaInfo = submitMetadataRequest(address, query);
    } catch (IOException e) {
      throw new SerDeException(e);
    }
    for (Entry<String, ColumnAnalysis> columnInfo : schemaInfo.getColumns().entrySet()) {
      if (columnInfo.getKey().equals(DruidStorageHandlerUtils.DEFAULT_TIMESTAMP_COLUMN)) {
        // Special handling for timestamp column
        columnNames.add(columnInfo.getKey()); // field name
        PrimitiveTypeInfo type = tsTZTypeInfo; // field type
        columnTypes.add(type);
        inspectors
            .add(PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(type));
        continue;
      }
      columnNames.add(columnInfo.getKey()); // field name
      PrimitiveTypeInfo type = DruidSerDeUtils.convertDruidToHiveType(
          columnInfo.getValue().getType()); // field type
      columnTypes.add(type instanceof TimestampLocalTZTypeInfo ? tsTZTypeInfo : type);
      inspectors.add(PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(type));
    }
    columns = columnNames.toArray(new String[columnNames.size()]);
    types = columnTypes.toArray(new PrimitiveTypeInfo[columnTypes.size()]);
    inspector = ObjectInspectorFactory
        .getStandardStructObjectInspector(columnNames, inspectors);
  }
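  /**
   * Builds the schema from the column names and types declared in the table properties (the CTAS
   * path). The Druid timestamp column ({@code DruidStorageHandlerUtils.DEFAULT_TIMESTAMP_COLUMN})
   * must be part of the declared column list.
   */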
  private void initFromProperties(final Properties properties)
      throws SerDeException {
    final List<ObjectInspector> inspectors = new ArrayList<>();
    final List<String> columnNames = new ArrayList<>();
    final List<PrimitiveTypeInfo> columnTypes = new ArrayList<>();

    columnNames.addAll(Utilities.getColumnNames(properties));
    if (!columnNames.contains(DruidStorageHandlerUtils.DEFAULT_TIMESTAMP_COLUMN)) {
      throw new SerDeException("Timestamp column ('" + DruidStorageHandlerUtils.DEFAULT_TIMESTAMP_COLUMN +
          "') not specified in create table; list of columns is: " +
          properties.getProperty(serdeConstants.LIST_COLUMNS));
    }
    columnTypes.addAll(Lists.transform(
        Lists.transform(Utilities.getColumnTypes(properties), type -> TypeInfoFactory.getPrimitiveTypeInfo(type)),
        e -> e instanceof TimestampLocalTZTypeInfo ? tsTZTypeInfo : e));
    inspectors.addAll(Lists.transform(columnTypes,
        (Function<PrimitiveTypeInfo, ObjectInspector>) type -> PrimitiveObjectInspectorFactory
            .getPrimitiveWritableObjectInspector(type)));
    columns = columnNames.toArray(new String[columnNames.size()]);
    types = columnTypes.toArray(new PrimitiveTypeInfo[columnTypes.size()]);
    inspector = ObjectInspectorFactory
        .getStandardStructObjectInspector(columnNames, inspectors);
  }
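  /**
   * Builds the schema from the field names and types that accompany the pushed-down Druid query
   * ({@code Constants.DRUID_QUERY_FIELD_NAMES} and {@code Constants.DRUID_QUERY_FIELD_TYPES}).
   */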
  private void initFromDruidQueryPlan(Properties properties, String druidQuery) {
    Preconditions.checkNotNull(druidQuery, "Druid query is null");
    final List<ObjectInspector> inspectors = new ArrayList<>();
    final List<String> columnNames;
    final List<PrimitiveTypeInfo> columnTypes;
    final String fieldNamesProperty =
        Preconditions.checkNotNull(properties.getProperty(Constants.DRUID_QUERY_FIELD_NAMES, null));
    final String fieldTypesProperty =
        Preconditions.checkNotNull(properties.getProperty(Constants.DRUID_QUERY_FIELD_TYPES, null));
    if (fieldNamesProperty.isEmpty()) {
      // This might seem counter-intuitive, but a query such as
      //   SELECT YEAR(Calcs.date0) AS yr_date0_ok FROM druid_tableau.calcs Calcs
      //   WHERE (YEAR(Calcs.date0) IS NULL) LIMIT 1
      // is planned so that only the filter is pushed down to Druid, while the projection of NULL
      // is kept as a Hive project. Hence the list of columns can be empty.
      columnNames = Collections.emptyList();
      columnTypes = Collections.emptyList();
    } else {
      columnNames =
          Arrays.stream(fieldNamesProperty.trim().split(",")).collect(Collectors.toList());
      columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(fieldTypesProperty).stream()
          .map(e -> TypeInfoFactory.getPrimitiveTypeInfo(e.getTypeName()))
          .map(primitiveTypeInfo -> {
            if (primitiveTypeInfo instanceof TimestampLocalTZTypeInfo) {
              return tsTZTypeInfo;
            }
            return primitiveTypeInfo;
          }).collect(Collectors.toList());
    }
    columns = new String[columnNames.size()];
    types = new PrimitiveTypeInfo[columnNames.size()];
    for (int i = 0; i < columnTypes.size(); ++i) {
      columns[i] = columnNames.get(i);
      types[i] = columnTypes.get(i);
      inspectors.add(PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(types[i]));
    }
    inspector = ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, inspectors);
  }
  /**
   * Submits the segment metadata query to the Druid broker and returns the merged segment
   * analysis.
   */
  protected SegmentAnalysis submitMetadataRequest(String address, SegmentMetadataQuery query)
      throws SerDeException, IOException {
    InputStream response;
    try {
      response = DruidStorageHandlerUtils.submitRequest(DruidStorageHandler.getHttpClient(),
          DruidStorageHandlerUtils.createSmileRequest(address, query));
    } catch (Exception e) {
      throw new SerDeException(StringUtils.stringifyException(e));
    }

    // Retrieve results
    List<SegmentAnalysis> resultsList;
    try {
      // This will throw an exception if the response from Druid is not an array, e.g. when the
      // Druid query execution returns an error instead of an array of results.
      resultsList = DruidStorageHandlerUtils.SMILE_MAPPER.readValue(response,
          new TypeReference<List<SegmentAnalysis>>() {
          });
    } catch (Exception e) {
      response.close();
      throw new SerDeException(StringUtils.stringifyException(e));
    }
    if (resultsList == null || resultsList.isEmpty()) {
      throw new SerDeException("Connected to Druid but could not retrieve datasource information");
    }
    if (resultsList.size() != 1) {
      throw new SerDeException("Information about segments should have been merged");
    }
    return resultsList.get(0);
  }
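  /** The {@link Writable} implementation produced by {@link #serialize(Object, ObjectInspector)}. */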
  @Override
  public Class<? extends Writable> getSerializedClass() {
    return DruidWritable.class;
  }
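  /**
   * Serializes a Hive row, described by a struct {@link ObjectInspector}, into a
   * {@link DruidWritable} so that it can be written out to Druid.
   */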
  @Override
  public Writable serialize(Object o, ObjectInspector objectInspector) throws SerDeException {
    if (objectInspector.getCategory() != ObjectInspector.Category.STRUCT) {
      throw new SerDeException(getClass().toString()
          + " can only serialize struct types, but we got: "
          + objectInspector.getTypeName());
    }
    // Prepare the field ObjectInspectors
    StructObjectInspector soi = (StructObjectInspector) objectInspector;
    List<? extends StructField> fields = soi.getAllStructFieldRefs();
    List