/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.druid.serde;
import java.io.IOException;
import java.io.InputStream;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Properties;
import org.apache.calcite.adapter.druid.DruidTable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.conf.Constants;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.druid.DruidStorageHandlerUtils;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeSpec;
import org.apache.hadoop.hive.serde2.SerDeStats;
import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable;
import org.apache.hadoop.hive.serde2.io.TimestampWritable;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveDecimalObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.io.ByteWritable;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.ShortWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.StringUtils;
import org.joda.time.Period;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.core.type.TypeReference;
import com.google.common.base.Function;
import com.google.common.collect.Lists;
import com.metamx.common.lifecycle.Lifecycle;
import com.metamx.http.client.HttpClient;
import com.metamx.http.client.HttpClientConfig;
import com.metamx.http.client.HttpClientInit;
import io.druid.query.Druids;
import io.druid.query.Druids.SegmentMetadataQueryBuilder;
import io.druid.query.Query;
import io.druid.query.aggregation.AggregatorFactory;
import io.druid.query.aggregation.PostAggregator;
import io.druid.query.dimension.DimensionSpec;
import io.druid.query.groupby.GroupByQuery;
import io.druid.query.metadata.metadata.ColumnAnalysis;
import io.druid.query.metadata.metadata.SegmentAnalysis;
import io.druid.query.metadata.metadata.SegmentMetadataQuery;
import io.druid.query.select.SelectQuery;
import io.druid.query.timeseries.TimeseriesQuery;
import io.druid.query.topn.TopNQuery;
/**
 * DruidSerDe that is used to deserialize objects from a Druid data source,
 * and to serialize Hive rows into writables that can be stored in Druid.
 */
@SerDeSpec(schemaProps = { Constants.DRUID_DATA_SOURCE })
public class DruidSerDe extends AbstractSerDe {
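// Illustrative, hypothetical usage: this SerDe is normally wired in by the Druid
// storage handler rather than referenced directly in DDL. A table definition that
// would trigger the metadata-based schema inference below looks roughly like
// (table and datasource names are examples only):
//
//   CREATE EXTERNAL TABLE druid_wiki
//   STORED BY 'org.apache.hadoop.hive.druid.DruidStorageHandler'
//   TBLPROPERTIES ("druid.datasource" = "wikiticker");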
protected static final Logger LOG = LoggerFactory.getLogger(DruidSerDe.class);
// Connection settings used when issuing metadata queries against the Druid broker
private int numConnection;
private Period readTimeout;
// Resolved table schema: column names, their Hive primitive types, and the
// struct object inspector built from them
private String[] columns;
private PrimitiveTypeInfo[] types;
private ObjectInspector inspector;
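/**
 * Initializes the SerDe, resolving the table schema in one of three ways:
 * (1) from the column names and types declared in the table properties,
 * (2) by submitting a segment metadata query to the Druid broker when only the
 * datasource name is available, or
 * (3) by inferring the schema from the Druid query attached to the table scan.
 */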
@Override
public void initialize(Configuration configuration, Properties properties) throws SerDeException {
// Init connection properties
numConnection = HiveConf
.getIntVar(configuration, HiveConf.ConfVars.HIVE_DRUID_NUM_HTTP_CONNECTION);
readTimeout = new Period(
HiveConf.getVar(configuration, HiveConf.ConfVars.HIVE_DRUID_HTTP_READ_TIMEOUT));
final List<String> columnNames = new ArrayList<>();
final List<PrimitiveTypeInfo> columnTypes = new ArrayList<>();
List<ObjectInspector> inspectors = new ArrayList<>();
// Druid query
String druidQuery = properties.getProperty(Constants.DRUID_QUERY_JSON);
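// The query JSON is attached to the table properties by the planner when a Druid
// query can be pushed down; it is absent at table creation time or when only the
// datasource is known.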
if (druidQuery == null) {
// No query. Either it is a CTAS, or we need to create a Druid
// Segment Metadata query that retrieves all columns present in
// the data source (dimensions and metrics).
if (!org.apache.commons.lang3.StringUtils
.isEmpty(properties.getProperty(serdeConstants.LIST_COLUMNS))
&& !org.apache.commons.lang3.StringUtils
.isEmpty(properties.getProperty(serdeConstants.LIST_COLUMN_TYPES))) {
columnNames.addAll(Utilities.getColumnNames(properties));
if (!columnNames.contains(DruidTable.DEFAULT_TIMESTAMP_COLUMN)) {
throw new SerDeException("Timestamp column ('" + DruidTable.DEFAULT_TIMESTAMP_COLUMN +
"') not specified in create table; list of columns is: " +
properties.getProperty(serdeConstants.LIST_COLUMNS));
}
columnTypes.addAll(Lists.transform(Utilities.getColumnTypes(properties),
new Function<String, PrimitiveTypeInfo>() {
@Override
public PrimitiveTypeInfo apply(String type) {
return TypeInfoFactory.getPrimitiveTypeInfo(type);
}
}
));
inspectors.addAll(Lists.transform(columnTypes,
new Function<PrimitiveTypeInfo, ObjectInspector>() {
@Override
public ObjectInspector apply(PrimitiveTypeInfo type) {
return PrimitiveObjectInspectorFactory
.getPrimitiveWritableObjectInspector(type);
}
}
));
columns = columnNames.toArray(new String[columnNames.size()]);
types = columnTypes.toArray(new PrimitiveTypeInfo[columnTypes.size()]);
inspector = ObjectInspectorFactory
.getStandardStructObjectInspector(columnNames, inspectors);
} else {
String dataSource = properties.getProperty(Constants.DRUID_DATA_SOURCE);
if (dataSource == null) {
throw new SerDeException("Druid data source not specified; use " +
Constants.DRUID_DATA_SOURCE + " in table properties");
}
SegmentMetadataQueryBuilder builder = new Druids.SegmentMetadataQueryBuilder();
builder.dataSource(dataSource);
builder.merge(true);
builder.analysisTypes();
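// Empty analysis types keep the metadata query lightweight: column names and types
// are always returned, and nothing else is needed here. The built query is roughly
// equivalent to this JSON (illustrative):
//   {"queryType":"segmentMetadata","dataSource":"<name>","merge":true,"analysisTypes":[]}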
SegmentMetadataQuery query = builder.build();
// Execute query in Druid
String address = HiveConf.getVar(configuration,
HiveConf.ConfVars.HIVE_DRUID_BROKER_DEFAULT_ADDRESS
);
if (org.apache.commons.lang3.StringUtils.isEmpty(address)) {
throw new SerDeException("Druid broker address not specified in configuration");
}
// Infer schema
SegmentAnalysis schemaInfo;
try {
schemaInfo = submitMetadataRequest(address, query);
} catch (IOException e) {
throw new SerDeException(e);
}
for (Entry<String, ColumnAnalysis> columnInfo : schemaInfo.getColumns().entrySet()) {
if (columnInfo.getKey().equals(DruidTable.DEFAULT_TIMESTAMP_COLUMN)) {
// Special handling for timestamp column
columnNames.add(columnInfo.getKey()); // field name
PrimitiveTypeInfo type = TypeInfoFactory.timestampTypeInfo; // field type
columnTypes.add(type);
inspectors
.add(PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(type));
continue;
}
columnNames.add(columnInfo.getKey()); // field name
PrimitiveTypeInfo type = DruidSerDeUtils.convertDruidToHiveType(
columnInfo.getValue().getType()); // field type
columnTypes.add(type);
inspectors.add(PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(type));
}
columns = columnNames.toArray(new String[columnNames.size()]);
types = columnTypes.toArray(new PrimitiveTypeInfo[columnTypes.size()]);
inspector = ObjectInspectorFactory
.getStandardStructObjectInspector(columnNames, inspectors);
}
} else {
// Query is specified, we can extract the results schema from the query
Query<?> query;
try {
query = DruidStorageHandlerUtils.JSON_MAPPER.readValue(druidQuery, Query.class);
switch (query.getType()) {
case Query.TIMESERIES:
inferSchema((TimeseriesQuery) query, columnNames, columnTypes);
break;
case Query.TOPN:
inferSchema((TopNQuery) query, columnNames, columnTypes);
break;
case Query.SELECT:
String address = HiveConf.getVar(configuration,
HiveConf.ConfVars.HIVE_DRUID_BROKER_DEFAULT_ADDRESS);
if (org.apache.commons.lang3.StringUtils.isEmpty(address)) {
throw new SerDeException("Druid broker address not specified in configuration");
}
inferSchema((SelectQuery) query, columnNames, columnTypes, address);
break;
case Query.GROUP_BY:
inferSchema((GroupByQuery) query, columnNames, columnTypes);
break;
default:
throw new SerDeException("Unsupported Druid query type: " + query.getType());
}
} catch (Exception e) {
throw new SerDeException(e);
}
columns = new String[columnNames.size()];
types = new PrimitiveTypeInfo[columnNames.size()];
for (int i = 0; i < columnTypes.size(); ++i) {
columns[i] = columnNames.get(i);
types[i] = columnTypes.get(i);
inspectors
.add(PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector(types[i]));
}
inspector = ObjectInspectorFactory.getStandardStructObjectInspector(columnNames, inspectors);
}
if (LOG.isDebugEnabled()) {
LOG.debug("DruidSerDe initialized with\n"
+ "\t columns: " + columnNames
+ "\n\t types: " + columnTypes);
}
}
/* Submits the segment metadata query to the broker at the given address and
returns the merged segment analysis for the datasource */
protected SegmentAnalysis submitMetadataRequest(String address, SegmentMetadataQuery query)
throws SerDeException, IOException {
final Lifecycle lifecycle = new Lifecycle();
HttpClient client = HttpClientInit.createClient(
HttpClientConfig.builder().withNumConnections(numConnection)
.withReadTimeout(readTimeout.toStandardDuration()).build(), lifecycle);
InputStream response;
try {
lifecycle.start();
response = DruidStorageHandlerUtils.submitRequest(client,
DruidStorageHandlerUtils.createRequest(address, query)
);
} catch (Exception e) {
throw new SerDeException(StringUtils.stringifyException(e));
} finally {
lifecycle.stop();
}
// Retrieve results
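// The request built by DruidStorageHandlerUtils.createRequest asks the broker for a
// Smile (binary JSON) encoded response, hence the Smile mapper below rather than the
// plain JSON mapper.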
List<SegmentAnalysis> resultsList;
try {
resultsList = DruidStorageHandlerUtils.SMILE_MAPPER.readValue(response,
new TypeReference<List<SegmentAnalysis>>() {
}
);
} catch (Exception e) {
response.close();
throw new SerDeException(StringUtils.stringifyException(e));
}
if (resultsList == null || resultsList.isEmpty()) {
throw new SerDeException("Connected to Druid but could not retrieve datasource information");
}
if (resultsList.size() != 1) {
throw new SerDeException("Information about segments should have been merged");
}
return resultsList.get(0);
}
/* Timeseries query */
private void inferSchema(TimeseriesQuery query, List<String> columnNames,
List<PrimitiveTypeInfo> columnTypes) {
// Timestamp column
columnNames.add(DruidTable.DEFAULT_TIMESTAMP_COLUMN);
columnTypes.add(TypeInfoFactory.timestampTypeInfo);
// Aggregator columns
for (AggregatorFactory af : query.getAggregatorSpecs()) {
columnNames.add(af.getName());
columnTypes.add(DruidSerDeUtils.convertDruidToHiveType(af.getTypeName()));
}
// Post-aggregator columns
// TODO: Currently Calcite only infers avg for post-aggregate,
// but once we recognize other functions, we will need to infer
// different types for post-aggregation functions
for (PostAggregator pa : query.getPostAggregatorSpecs()) {
columnNames.add(pa.getName());
columnTypes.add(TypeInfoFactory.floatTypeInfo);
}
}
/* TopN query */
private void inferSchema(TopNQuery query, List<String> columnNames,
List<PrimitiveTypeInfo> columnTypes) {
// Timestamp column
columnNames.add(DruidTable.DEFAULT_TIMESTAMP_COLUMN);
columnTypes.add(TypeInfoFactory.timestampTypeInfo);
// Dimension column
columnNames.add(query.getDimensionSpec().getOutputName());
columnTypes.add(TypeInfoFactory.stringTypeInfo);
// Aggregator columns
for (AggregatorFactory af : query.getAggregatorSpecs()) {
columnNames.add(af.getName());
columnTypes.add(DruidSerDeUtils.convertDruidToHiveType(af.getTypeName()));
}
// Post-aggregator columns
// TODO: Currently Calcite only infers avg for post-aggregate,
// but once we recognize other functions, we will need to infer
// different types for post-aggregation functions
for (PostAggregator pa : query.getPostAggregatorSpecs()) {
columnNames.add(pa.getName());
columnTypes.add(TypeInfoFactory.floatTypeInfo);
}
}
/* Select query */
private void inferSchema(SelectQuery query, List<String> columnNames,
List<PrimitiveTypeInfo> columnTypes, String address) throws SerDeException {
// Timestamp column
columnNames.add(DruidTable.DEFAULT_TIMESTAMP_COLUMN);
columnTypes.add(TypeInfoFactory.timestampTypeInfo);
// Dimension columns
for (DimensionSpec ds : query.getDimensions()) {
columnNames.add(ds.getOutputName());
columnTypes.add(TypeInfoFactory.stringTypeInfo);
}
// The type for metric columns is not explicit in the query, thus in this case
// we need to emit a metadata query to know their type
SegmentMetadataQueryBuilder builder = new Druids.SegmentMetadataQueryBuilder();
builder.dataSource(query.getDataSource());
builder.merge(true);
builder.analysisTypes();
SegmentMetadataQuery metadataQuery = builder.build();
// Execute query in Druid
SegmentAnalysis schemaInfo;
try {
schemaInfo = submitMetadataRequest(address, metadataQuery);
} catch (IOException e) {
throw new SerDeException(e);
}
if (schemaInfo == null) {
throw new SerDeException("Connected to Druid but could not retrieve datasource information");
}
for (String metric : query.getMetrics()) {
columnNames.add(metric);
columnTypes.add(DruidSerDeUtils.convertDruidToHiveType(
schemaInfo.getColumns().get(metric).getType()));
}
}
/* GroupBy query */
private void inferSchema(GroupByQuery query, List<String> columnNames,
List<PrimitiveTypeInfo> columnTypes) {
// Timestamp column
columnNames.add(DruidTable.DEFAULT_TIMESTAMP_COLUMN);
columnTypes.add(TypeInfoFactory.timestampTypeInfo);
// Dimension columns
for (DimensionSpec ds : query.getDimensions()) {
columnNames.add(ds.getOutputName());
columnTypes.add(TypeInfoFactory.stringTypeInfo);
}
// Aggregator columns
for (AggregatorFactory af : query.getAggregatorSpecs()) {
columnNames.add(af.getName());
columnTypes.add(DruidSerDeUtils.convertDruidToHiveType(af.getTypeName()));
}
// Post-aggregator columns
// TODO: Currently Calcite only infers avg for post-aggregate,
// but once we recognize other functions, we will need to infer
// different types for post-aggregation functions
for (PostAggregator pa : query.getPostAggregatorSpecs()) {
columnNames.add(pa.getName());
columnTypes.add(TypeInfoFactory.floatTypeInfo);
}
}
@Override
public Class<? extends Writable> getSerializedClass() {
return DruidWritable.class;
}
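/**
 * Serializes a Hive row into a {@link DruidWritable}. Only struct object inspectors
 * are accepted, matching the flat, tabular shape of Druid rows.
 */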
@Override
public Writable serialize(Object o, ObjectInspector objectInspector) throws SerDeException {
if (objectInspector.getCategory() != ObjectInspector.Category.STRUCT) {
throw new SerDeException(getClass().toString()
+ " can only serialize struct types, but we got: "
+ objectInspector.getTypeName());
}
// Prepare the field ObjectInspectors
StructObjectInspector soi = (StructObjectInspector) objectInspector;
List<? extends StructField> fields = soi.getAllStructFieldRefs();
List<Object> values = soi.getStructFieldsDataAsList(o);