org.elasticsearch.hadoop.hive.EsSerDe Maven / Gradle / Ivy
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.hadoop.hive;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Properties;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeStats;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.elasticsearch.hadoop.EsHadoopIllegalStateException;
import org.elasticsearch.hadoop.cfg.CompositeSettings;
import org.elasticsearch.hadoop.cfg.HadoopSettingsManager;
import org.elasticsearch.hadoop.cfg.Settings;
import org.elasticsearch.hadoop.mr.security.HadoopUserProvider;
import org.elasticsearch.hadoop.rest.InitializationUtils;
import org.elasticsearch.hadoop.serialization.bulk.BulkCommand;
import org.elasticsearch.hadoop.serialization.bulk.BulkCommands;
import org.elasticsearch.hadoop.util.*;
public class EsSerDe extends AbstractSerDe {
private static Log log = LogFactory.getLog(EsSerDe.class);
private Configuration cfg;
private Settings settings;
private StructObjectInspector inspector;
// serialization artifacts
private final BytesArray scratchPad = new BytesArray(512);
private final HiveType hiveType = new HiveType(null, null);
private final HiveBytesArrayWritable result = new HiveBytesArrayWritable();
private StructTypeInfo structTypeInfo;
private FieldAlias alias;
private BulkCommand command;
private boolean writeInitialized = false;
private boolean trace = false;
private boolean outputJSON = false;
private Text jsonFieldName = null;
// introduced in Hive 0.14
// implemented to actually get access to the raw properties
@Override
public void initialize(Configuration conf, Properties tbl, Properties partitionProperties) throws SerDeException {
inspector = HiveUtils.structObjectInspector(tbl);
structTypeInfo = HiveUtils.typeInfo(inspector);
cfg = conf;
List settingSources = new ArrayList<>();
settingSources.add(HadoopSettingsManager.loadFrom(tbl));
if (cfg != null) {
settingSources.add(HadoopSettingsManager.loadFrom(cfg));
}
settings = new CompositeSettings(settingSources);
alias = HiveUtils.alias(settings);
HiveUtils.fixHive13InvalidComments(settings, tbl);
trace = log.isTraceEnabled();
outputJSON = settings.getOutputAsJson();
if (outputJSON) {
jsonFieldName = new Text(HiveUtils.discoverJsonFieldName(settings, alias));
}
}
@Override
public void initialize(Configuration conf, Properties tbl) throws SerDeException {
initialize(conf, tbl, new Properties());
}
@Override
public Object deserialize(Writable blob) throws SerDeException {
if (blob == null || blob instanceof NullWritable) {
return null;
}
Writable deserialize = blob;
if (outputJSON) {
deserialize = wrapJsonData(blob);
}
Object des = hiveFromWritable(structTypeInfo, deserialize, alias);
if (trace) {
log.trace(String.format("Deserialized [%s] to [%s]", blob, des));
}
return des;
}
private Writable wrapJsonData(Writable blob) {
Assert.isTrue(blob instanceof Text, "Property `es.output.json` is enabled, but returned data was not of type Text...");
switch (structTypeInfo.getCategory()) {
case STRUCT:
Map mapContainer = new MapWritable();
mapContainer.put(jsonFieldName, blob);
return (Writable) mapContainer;
default:
throw new EsHadoopIllegalStateException("Could not correctly wrap JSON data for structural type " + structTypeInfo.getCategory());
}
}
@Override
public ObjectInspector getObjectInspector() throws SerDeException {
return inspector;
}
@Override
public SerDeStats getSerDeStats() {
// TODO: can compute serialize stats but not deserialized ones
return null;
}
@Override
public Class extends Writable> getSerializedClass() {
return HiveBytesArrayWritable.class;
}
@Override
public Writable serialize(Object data, ObjectInspector objInspector) throws SerDeException {
lazyInitializeWrite();
// serialize the type directly to json (to avoid converting to Writable and then serializing)
scratchPad.reset();
hiveType.setObjectInspector(objInspector);
hiveType.setObject(data);
// We use the command directly instead of the bulk entry writer since there is no close() method on SerDes.
// See FileSinkOperator#process() for more info of how this is used with the output format.
command.write(hiveType).copyTo(scratchPad);
result.setContent(scratchPad);
return result;
}
private void lazyInitializeWrite() {
if (writeInitialized) {
return;
}
writeInitialized = true;
InitializationUtils.setUserProviderIfNotSet(settings, HadoopUserProvider.class, log);
ClusterInfo clusterInfo = InitializationUtils.discoverClusterInfo(settings, log);
InitializationUtils.setValueWriterIfNotSet(settings, HiveValueWriter.class, log);
InitializationUtils.setFieldExtractorIfNotSet(settings, HiveFieldExtractor.class, log);
InitializationUtils.setBytesConverterIfNeeded(settings, HiveBytesConverter.class, log);
this.command = BulkCommands.create(settings, null, clusterInfo.getMajorVersion());
}
@SuppressWarnings("unchecked")
static Object hiveFromWritable(TypeInfo type, Writable data, FieldAlias alias) {
if (data == null || data instanceof NullWritable) {
return null;
}
switch (type.getCategory()) {
case LIST: {// or ARRAY
ListTypeInfo listType = (ListTypeInfo) type;
TypeInfo listElementType = listType.getListElementTypeInfo();
ArrayWritable aw = (ArrayWritable) data;
List
© 2015 - 2025 Weber Informatics LLC | Privacy Policy