// org.apache.hadoop.hive.ql.exec.SerializationUtilities (Maven / Gradle / Ivy)
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.exec;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Serializable;
import java.io.UnsupportedEncodingException;
import java.lang.reflect.Array;
import java.lang.reflect.Field;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.sql.Timestamp;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import org.apache.commons.codec.binary.Base64;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.CompilationOpContext;
import org.apache.hadoop.hive.ql.exec.vector.VectorFileSinkOperator;
import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat;
import org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat;
import org.apache.hadoop.hive.ql.io.RCFileInputFormat;
import org.apache.hadoop.hive.ql.log.PerfLogger;
import org.apache.hadoop.hive.ql.plan.AbstractOperatorDesc;
import org.apache.hadoop.hive.ql.plan.BaseWork;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.ql.plan.MapredWork;
import org.apache.hadoop.hive.ql.plan.ReduceWork;
import org.apache.hadoop.hive.ql.plan.SparkEdgeProperty;
import org.apache.hadoop.hive.ql.plan.SparkWork;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.ql.session.SessionState;
import org.apache.hadoop.hive.serde2.Serializer;
import org.apache.hadoop.hive.serde2.objectinspector.StandardConstantListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StandardConstantMapObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StandardConstantStructObjectInspector;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.objenesis.strategy.StdInstantiatorStrategy;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.esotericsoftware.kryo.Kryo;
import com.esotericsoftware.kryo.Registration;
import com.esotericsoftware.kryo.io.Input;
import com.esotericsoftware.kryo.io.Output;
import com.esotericsoftware.kryo.pool.KryoFactory;
import com.esotericsoftware.kryo.pool.KryoPool;
import com.esotericsoftware.kryo.serializers.FieldSerializer;
/**
* Utilities related to serialization and deserialization.
*/
public class SerializationUtilities {
// Fully-qualified name of this class; used as the logger name and as the caller tag passed to
// PerfLogger begin/end calls in this file.
private static final String CLASS_NAME = SerializationUtilities.class.getName();
// SLF4J logger for this class.
private static final Logger LOG = LoggerFactory.getLogger(CLASS_NAME);
public static class Hook {
public boolean preRead(Class> type) {
return true;
}
public Object postRead(Object o) {
return o;
}
}
private static final Map, Hook> kryoTypeHooks = new HashMap<>();
private static Hook globalHook = null;
/**
* Must be called before any serialization takes place (e.g. in some static/service init)!
* Not thread safe.
*
* Will not work if different classes are added in different order on two sides of the
* communication, due to explicit class registration that we use causing class ID mismatch.
* Some processing might be added for this later (e.g. sorting the overrides here if the order
* is hard to enforce, and making sure they are added symmetrically everywhere, or just
* reverting back to hardcoding stuff if all else fails).
* For now, this seems to work, but Kryo seems pretty brittle. Seems to be ok to add class on
* read side but not write side, the other way doesn't work. Kryo needs a proper event system,
* otherwise this is all rather brittle.
*/
public static void addKryoTypeHook(Class> clazz, Hook hook) {
kryoTypeHooks.put(clazz, hook);
}
/**
 * Installs the hook that Kryo instances created afterwards consult for every type they read.
 *
 * Call this before any serialization happens (for example from static or service
 * initialization). Not thread safe.
 *
 * This is somewhat brittle because Kryo offers no way to add a proper superclass hook.
 * On the other hand, it doesn't suffer from the mismatch problems that register() causes!
 *
 * @param hook the hook to install
 */
public static void setGlobalHook(Hook hook) {
  SerializationUtilities.globalHook = hook;
}
/**
* Provides general-purpose hooks for specific types, as well as a global hook.
*/
private static class KryoWithHooks extends Kryo {
private Hook globalHook;
@SuppressWarnings({"unchecked", "rawtypes"})
private static final class SerializerWithHook extends com.esotericsoftware.kryo.Serializer {
private final com.esotericsoftware.kryo.Serializer old;
private final Hook hook;
private SerializerWithHook(com.esotericsoftware.kryo.Serializer old, Hook hook) {
this.old = old;
this.hook = hook;
}
@Override
public Object read(Kryo kryo, Input input, Class type) {
return hook.preRead(type)
? hook.postRead(old.read(kryo, input, type)) : old.read(kryo, input, type);
}
@Override
public void write(Kryo kryo, Output output, Object object) {
// Add write hooks if needed.
old.write(kryo, output, object);
}
}
public Kryo processHooks(Map, Hook> hooks, Hook globalHook) {
for (Map.Entry, Hook> e : hooks.entrySet()) {
register(e.getKey(), new SerializerWithHook(
newDefaultSerializer(e.getKey()), e.getValue()));
}
this.globalHook = globalHook;
return this; // To make it more explicit below that processHooks needs to be called last.
}
// The globalHook stuff. There's no proper way to insert this, so we add it everywhere.
private Hook ponderGlobalPreReadHook(Class> clazz) {
Hook globalHook = this.globalHook;
return (globalHook != null && globalHook.preRead(clazz)) ? globalHook : null;
}
@SuppressWarnings("unchecked")
private T ponderGlobalPostReadHook(Hook hook, T result) {
return (hook == null) ? result : (T)hook.postRead(result);
}
private Object ponderGlobalPostHook(Object result) {
Hook globalHook = this.globalHook;
return (globalHook != null) ? globalHook.postRead(result) : result;
}
@Override
public Object readClassAndObject(Input input) {
return ponderGlobalPostHook(super.readClassAndObject(input));
}
@Override
public Registration readClass(Input input) {
Registration reg = super.readClass(input);
if (reg != null) {
ponderGlobalPreReadHook(reg.getType()); // Needed to intercept readClassAndObject.
}
return reg;
}
@Override
public T readObjectOrNull(Input input, Class type) {
Hook hook = ponderGlobalPreReadHook(type);
T result = super.readObjectOrNull(input, type);
return ponderGlobalPostReadHook(hook, result);
}
@Override
public T readObjectOrNull(Input input, Class type,
@SuppressWarnings("rawtypes") com.esotericsoftware.kryo.Serializer serializer) {
Hook hook = ponderGlobalPreReadHook(type);
T result = super.readObjectOrNull(input, type, serializer);
return ponderGlobalPostReadHook(hook, result);
}
@Override
public T readObject(Input input, Class type) {
Hook hook = ponderGlobalPreReadHook(type);
T result = super.readObject(input, type);
return ponderGlobalPostReadHook(hook, result);
}
@Override
public T readObject(Input input, Class type,
@SuppressWarnings("rawtypes") com.esotericsoftware.kryo.Serializer serializer) {
Hook hook = ponderGlobalPreReadHook(type);
T result = super.readObject(input, type, serializer);
return ponderGlobalPostReadHook(hook, result);
}
}
// Factory used by the Kryo pool: builds a KryoWithHooks with this file's custom serializers,
// the explicit class registrations, and finally the configured type/global hooks.
// NOTE(review): the addKryoTypeHook Javadoc warns that explicit registration can cause class ID
// mismatches, so treat the order of the register calls below as significant.
private static KryoFactory factory = new KryoFactory() {
public Kryo create() {
KryoWithHooks kryo = new KryoWithHooks();
// Custom serializers defined further down in this class.
kryo.register(java.sql.Date.class, new SqlDateSerializer());
kryo.register(java.sql.Timestamp.class, new TimestampSerializer());
kryo.register(Path.class, new PathSerializer());
// Arrays.asList returns a private JDK list class; obtain it from a sample instance.
kryo.register(Arrays.asList("").getClass(), new ArraysAsListSerializer());
// Fall back to Objenesis-based instantiation when the default strategy cannot create a class.
((Kryo.DefaultInstantiatorStrategy) kryo.getInstantiatorStrategy())
.setFallbackInstantiatorStrategy(
new StdInstantiatorStrategy());
// Exclude these fields from serialization (see removeField).
removeField(kryo, Operator.class, "colExprMap");
removeField(kryo, AbstractOperatorDesc.class, "statistics");
kryo.register(MapWork.class);
kryo.register(ReduceWork.class);
kryo.register(TableDesc.class);
kryo.register(UnionOperator.class);
kryo.register(FileSinkOperator.class);
kryo.register(VectorFileSinkOperator.class);
kryo.register(HiveIgnoreKeyTextOutputFormat.class);
kryo.register(StandardConstantListObjectInspector.class);
kryo.register(StandardConstantMapObjectInspector.class);
kryo.register(StandardConstantStructObjectInspector.class);
kryo.register(SequenceFileInputFormat.class);
kryo.register(RCFileInputFormat.class);
kryo.register(HiveSequenceFileOutputFormat.class);
kryo.register(SparkEdgeProperty.class);
kryo.register(SparkWork.class);
kryo.register(Pair.class);
// This must be called after all the explicit register calls.
return kryo.processHooks(kryoTypeHooks, globalHook);
}
};
// Shared pool of Kryo instances created by the factory above; accessed through borrowKryo()
// and releaseKryo().
// Bounded queue could be specified here but that will lead to blocking.
// ConcurrentLinkedQueue is unbounded and will release soft referenced kryo instances under
// memory pressure.
private static KryoPool kryoPool = new KryoPool.Builder(factory).softReferences().build();
/**
 * Takes a Kryo instance from the shared pool and points its class loader at the calling
 * thread's context class loader. The pool is backed by an unbounded ConcurrentLinkedQueue by
 * default, so hand the instance back with releaseKryo() once done to allow reuse.
 *
 * @return kryo instance
 */
public static Kryo borrowKryo() {
  final Kryo borrowed = kryoPool.borrow();
  final ClassLoader contextLoader = Thread.currentThread().getContextClassLoader();
  borrowed.setClassLoader(contextLoader);
  return borrowed;
}
/**
 * Hands a previously borrowed Kryo instance back to the shared pool.
 *
 * @param kryo - kryo instance to be released
 */
public static void releaseKryo(Kryo kryo) {
  SerializationUtilities.kryoPool.release(kryo);
}
/**
 * Registers {@code type} with a FieldSerializer from which the named field has been removed.
 *
 * @param kryo the Kryo instance to register on
 * @param type the class whose field should be dropped
 * @param fieldName name of the field to remove from the serializer
 */
private static void removeField(Kryo kryo, Class type, String fieldName) {
  FieldSerializer trimmedSerializer = new FieldSerializer(kryo, type);
  trimmedSerializer.removeField(fieldName);
  kryo.register(type, trimmedSerializer);
}
/**
 * Kryo serializer for timestamp. Writes the epoch-millisecond value as a long followed by the
 * nanosecond field as an int, and restores both on read.
 */
private static class TimestampSerializer extends
    com.esotericsoftware.kryo.Serializer<Timestamp> {
  @Override
  public Timestamp read(Kryo kryo, Input input, Class<Timestamp> clazz) {
    Timestamp ts = new Timestamp(input.readLong());
    // Nanos are stored separately, so restore them after construction.
    ts.setNanos(input.readInt());
    return ts;
  }

  @Override
  public void write(Kryo kryo, Output output, Timestamp ts) {
    output.writeLong(ts.getTime());
    output.writeInt(ts.getNanos());
  }
}
/**
 * Custom Kryo serializer for sql date, otherwise Kryo gets confused between
 * java.sql.Date and java.util.Date while deserializing. The date is stored as the long
 * returned by {@code getTime()}.
 */
private static class SqlDateSerializer extends
    com.esotericsoftware.kryo.Serializer<java.sql.Date> {
  @Override
  public java.sql.Date read(Kryo kryo, Input input, Class<java.sql.Date> clazz) {
    return new java.sql.Date(input.readLong());
  }

  @Override
  public void write(Kryo kryo, Output output, java.sql.Date sqlDate) {
    output.writeLong(sqlDate.getTime());
  }
}
/**
 * Kryo serializer for Hadoop {@link Path}: writes the path's URI as a string and rebuilds the
 * path from that URI on read.
 */
private static class PathSerializer extends com.esotericsoftware.kryo.Serializer<Path> {
  @Override
  public void write(Kryo kryo, Output output, Path path) {
    output.writeString(path.toUri().toString());
  }

  @Override
  public Path read(Kryo kryo, Input input, Class<Path> type) {
    return new Path(URI.create(input.readString()));
  }
}
/**
* A kryo {@link Serializer} for lists created via {@link Arrays#asList(Object...)}.
*
* Note: This serializer does not support cyclic references, so if one of the objects
* gets set the list as attribute this might cause an error during deserialization.
*
*
* This is from kryo-serializers package. Added explicitly to avoid classpath issues.
*/
private static class ArraysAsListSerializer
extends com.esotericsoftware.kryo.Serializer> {
private Field _arrayField;
public ArraysAsListSerializer() {
try {
_arrayField = Class.forName("java.util.Arrays$ArrayList").getDeclaredField("a");
_arrayField.setAccessible(true);
} catch (final Exception e) {
throw new RuntimeException(e);
}
// Immutable causes #copy(obj) to return the original object
setImmutable(true);
}
@Override
public List> read(final Kryo kryo, final Input input, final Class> type) {
final int length = input.readInt(true);
Class> componentType = kryo.readClass(input).getType();
if (componentType.isPrimitive()) {
componentType = getPrimitiveWrapperClass(componentType);
}
try {
final Object items = Array.newInstance(componentType, length);
for (int i = 0; i < length; i++) {
Array.set(items, i, kryo.readClassAndObject(input));
}
return Arrays.asList((Object[]) items);
} catch (final Exception e) {
throw new RuntimeException(e);
}
}
@Override
public void write(final Kryo kryo, final Output output, final List> obj) {
try {
final Object[] array = (Object[]) _arrayField.get(obj);
output.writeInt(array.length, true);
final Class> componentType = array.getClass().getComponentType();
kryo.writeClass(output, componentType);
for (final Object item : array) {
kryo.writeClassAndObject(output, item);
}
} catch (final RuntimeException e) {
// Don't eat and wrap RuntimeExceptions because the ObjectBuffer.write...
// handles SerializationException specifically (resizing the buffer)...
throw e;
} catch (final Exception e) {
throw new RuntimeException(e);
}
}
private Class> getPrimitiveWrapperClass(final Class> c) {
if (c.isPrimitive()) {
if (c.equals(Long.TYPE)) {
return Long.class;
} else if (c.equals(Integer.TYPE)) {
return Integer.class;
} else if (c.equals(Double.TYPE)) {
return Double.class;
} else if (c.equals(Float.TYPE)) {
return Float.class;
} else if (c.equals(Boolean.TYPE)) {
return Boolean.class;
} else if (c.equals(Character.TYPE)) {
return Character.class;
} else if (c.equals(Short.TYPE)) {
return Short.class;
} else if (c.equals(Byte.TYPE)) {
return Byte.class;
}
}
return c;
}
}
/**
 * Serializes the plan using a Kryo instance borrowed from the pool.
 *
 * @param plan The plan, such as QueryPlan, MapredWork, etc.
 * @param out The stream to write to.
 */
public static void serializePlan(Object plan, OutputStream out) {
  serializePlan(plan, out, false);
}

/**
 * Serializes the plan with a caller-supplied Kryo instance.
 *
 * @param kryo The Kryo instance to use.
 * @param plan The plan, such as QueryPlan, MapredWork, etc.
 * @param out The stream to write to.
 */
public static void serializePlan(Kryo kryo, Object plan, OutputStream out) {
  serializePlan(kryo, plan, out, false);
}

// Borrows a pooled Kryo for the duration of the write and always returns it afterwards.
private static void serializePlan(Object plan, OutputStream out, boolean cloningPlan) {
  final Kryo pooled = borrowKryo();
  try {
    serializePlan(pooled, plan, out, cloningPlan);
  } finally {
    releaseKryo(pooled);
  }
}

// Performs the write, bracketed by SERIALIZE_PLAN perf-log markers.
private static void serializePlan(Kryo kryo, Object plan, OutputStream out, boolean cloningPlan) {
  final PerfLogger perf = SessionState.getPerfLogger();
  perf.PerfLogBegin(CLASS_NAME, PerfLogger.SERIALIZE_PLAN);
  LOG.info("Serializing " + plan.getClass().getSimpleName() + " using kryo");
  // cloningPlan currently makes no difference: both modes take the same Kryo write path.
  serializeObjectByKryo(kryo, plan, out);
  perf.PerfLogEnd(CLASS_NAME, PerfLogger.SERIALIZE_PLAN);
}
/**
 * Deserializes the plan using a Kryo instance borrowed from the pool.
 *
 * @param in The stream to read from.
 * @param planClass class of plan
 * @return The plan, such as QueryPlan, MapredWork, etc.
 */
public static <T> T deserializePlan(InputStream in, Class<T> planClass) {
  return deserializePlan(in, planClass, false);
}

/**
 * Deserializes the plan with a caller-supplied Kryo instance.
 *
 * @param kryo The Kryo instance to use.
 * @param in The stream to read from.
 * @param planClass class of plan
 * @return The plan, such as QueryPlan, MapredWork, etc.
 */
public static <T> T deserializePlan(Kryo kryo, InputStream in, Class<T> planClass) {
  return deserializePlan(kryo, in, planClass, false);
}

// Borrows a pooled Kryo for the duration of the read and always returns it afterwards.
private static <T> T deserializePlan(InputStream in, Class<T> planClass, boolean cloningPlan) {
  Kryo kryo = borrowKryo();
  try {
    return deserializePlan(kryo, in, planClass, cloningPlan);
  } finally {
    releaseKryo(kryo);
  }
}

// Performs the read, bracketed by DESERIALIZE_PLAN perf-log markers.
private static <T> T deserializePlan(Kryo kryo, InputStream in, Class<T> planClass,
    boolean cloningPlan) {
  PerfLogger perfLogger = SessionState.getPerfLogger();
  perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.DESERIALIZE_PLAN);
  LOG.info("Deserializing " + planClass.getSimpleName() + " using kryo");
  // cloningPlan currently makes no difference: both modes take the same Kryo read path.
  T plan = deserializeObjectByKryo(kryo, in, planClass);
  perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.DESERIALIZE_PLAN);
  return plan;
}
/**
* Clones using the powers of XML. Do not use unless necessary.
* @param plan The plan.
* @return The clone.
*/
public static MapredWork clonePlan(MapredWork plan) {
// TODO: need proper clone. Meanwhile, let's at least keep this horror in one place
PerfLogger perfLogger = SessionState.getPerfLogger();
perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.CLONE_PLAN);
Operator> op = plan.getAnyOperator();
CompilationOpContext ctx = (op == null) ? null : op.getCompilationOpContext();
ByteArrayOutputStream baos = new ByteArrayOutputStream(4096);
serializePlan(plan, baos, true);
MapredWork newPlan = deserializePlan(new ByteArrayInputStream(baos.toByteArray()),
MapredWork.class, true);
// Restore the context.
for (Operator> newOp : newPlan.getAllOperators()) {
newOp.setCompilationOpContext(ctx);
}
perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.CLONE_PLAN);
return newPlan;
}
/**
* Clones using the powers of XML. Do not use unless necessary.
* @param roots The roots.
* @return The clone.
*/
public static List> cloneOperatorTree(List> roots) {
ByteArrayOutputStream baos = new ByteArrayOutputStream(4096);
CompilationOpContext ctx = roots.isEmpty() ? null : roots.get(0).getCompilationOpContext();
serializePlan(roots, baos, true);
@SuppressWarnings("unchecked")
List> result =
deserializePlan(new ByteArrayInputStream(baos.toByteArray()),
roots.getClass(), true);
// Restore the context.
LinkedList> newOps = new LinkedList<>(result);
while (!newOps.isEmpty()) {
Operator> newOp = newOps.poll();
newOp.setCompilationOpContext(ctx);
List> children = newOp.getChildOperators();
if (children != null) {
newOps.addAll(children);
}
}
return result;
}
public static List> cloneOperatorTree(List> roots, int indexForTezUnion) {
ByteArrayOutputStream baos = new ByteArrayOutputStream(4096);
CompilationOpContext ctx = roots.isEmpty() ? null : roots.get(0).getCompilationOpContext();
serializePlan(roots, baos, true);
@SuppressWarnings("unchecked")
List> result =
deserializePlan(new ByteArrayInputStream(baos.toByteArray()),
roots.getClass(), true);
// Restore the context.
LinkedList> newOps = new LinkedList<>(result);
while (!newOps.isEmpty()) {
Operator> newOp = newOps.poll();
newOp.setIndexForTezUnion(indexForTezUnion);
newOp.setCompilationOpContext(ctx);
List> children = newOp.getChildOperators();
if (children != null) {
newOps.addAll(children);
}
}
return result;
}
/**
* Clones using the powers of XML. Do not use unless necessary.
* @param plan The plan.
* @return The clone.
*/
public static BaseWork cloneBaseWork(BaseWork plan) {
PerfLogger perfLogger = SessionState.getPerfLogger();
perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.CLONE_PLAN);
Operator> op = plan.getAnyRootOperator();
CompilationOpContext ctx = (op == null) ? null : op.getCompilationOpContext();
ByteArrayOutputStream baos = new ByteArrayOutputStream(4096);
serializePlan(plan, baos, true);
BaseWork newPlan = deserializePlan(new ByteArrayInputStream(baos.toByteArray()),
plan.getClass(), true);
// Restore the context.
for (Operator> newOp : newPlan.getAllOperators()) {
newOp.setCompilationOpContext(ctx);
}
perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.CLONE_PLAN);
return newPlan;
}
/**
 * Writes the plan to the stream with the given Kryo instance, using the session-specified
 * class loader. The Kryo output wrapper is closed when the method exits, including when the
 * write fails.
 *
 * @param kryo Kryo instance to write with
 * @param plan Usually of type MapredWork, MapredLocalWork etc.
 * @param out stream in which serialized plan is written into
 */
private static void serializeObjectByKryo(Kryo kryo, Object plan, OutputStream out) {
  // try-with-resources so the Output is closed even if writeObject throws.
  try (Output output = new Output(out)) {
    kryo.setClassLoader(Utilities.getSessionSpecifiedClassLoader());
    kryo.writeObject(output, plan);
  }
}
/**
 * Reads an object of the given class from the stream with the given Kryo instance, using the
 * session-specified class loader. The Kryo input wrapper is closed when the method exits,
 * including when the read fails.
 *
 * @param kryo Kryo instance to read with
 * @param in stream to read the serialized object from
 * @param clazz expected class of the object
 * @return the deserialized object
 */
private static <T> T deserializeObjectByKryo(Kryo kryo, InputStream in, Class<T> clazz) {
  // try-with-resources so the Input is closed even if readObject throws.
  try (Input inp = new Input(in)) {
    kryo.setClassLoader(Utilities.getSessionSpecifiedClassLoader());
    return kryo.readObject(inp, clazz);
  }
}
/**
 * Turns an expression into its Kryo byte representation.
 * @param expr Expression.
 * @return Bytes.
 */
public static byte[] serializeExpressionToKryo(ExprNodeGenericFuncDesc expr) {
  final byte[] encoded = serializeObjectToKryo(expr);
  return encoded;
}
/**
 * Rebuilds an expression from its Kryo byte representation.
 * @param bytes Bytes containing the expression.
 * @return The expression read from the bytes as an ExprNodeGenericFuncDesc.
 */
public static ExprNodeGenericFuncDesc deserializeExpressionFromKryo(byte[] bytes) {
  final ExprNodeGenericFuncDesc expr =
      deserializeObjectFromKryo(bytes, ExprNodeGenericFuncDesc.class);
  return expr;
}
/**
 * Serializes an expression with Kryo and returns the bytes as a Base64 string.
 *
 * @param expr Expression.
 * @return Base64 text of the Kryo-serialized expression.
 */
public static String serializeExpression(ExprNodeGenericFuncDesc expr) {
  // The Charset constant cannot fail, unlike a charset looked up by name.
  return new String(Base64.encodeBase64(serializeExpressionToKryo(expr)), StandardCharsets.UTF_8);
}
/**
 * Decodes a Base64 string produced by serializeExpression and deserializes the expression
 * from the resulting Kryo bytes.
 *
 * @param s Base64 text of a Kryo-serialized expression.
 * @return The expression.
 */
public static ExprNodeGenericFuncDesc deserializeExpression(String s) {
  // The Charset constant cannot fail, unlike a charset looked up by name.
  byte[] bytes = Base64.decodeBase64(s.getBytes(StandardCharsets.UTF_8));
  return deserializeExpressionFromKryo(bytes);
}
/**
 * Serializes the object into a byte array using a pooled Kryo instance.
 *
 * @param object the object to serialize
 * @return the Kryo-encoded bytes
 */
private static byte[] serializeObjectToKryo(Serializable object) {
  ByteArrayOutputStream baos = new ByteArrayOutputStream();
  // try-with-resources closes the Output even when writeObject throws; on success the close
  // happens before the bytes are taken from baos.
  try (Output output = new Output(baos)) {
    Kryo kryo = borrowKryo();
    try {
      kryo.writeObject(output, object);
    } finally {
      releaseKryo(kryo);
    }
  }
  return baos.toByteArray();
}
/**
 * Deserializes an object of the given class from Kryo bytes using a pooled Kryo instance.
 *
 * @param bytes the Kryo-encoded bytes
 * @param clazz expected class of the object
 * @return the deserialized object
 */
private static <T> T deserializeObjectFromKryo(byte[] bytes, Class<T> clazz) {
  // try-with-resources closes the Input even when readObject throws.
  try (Input inp = new Input(new ByteArrayInputStream(bytes))) {
    Kryo kryo = borrowKryo();
    try {
      return kryo.readObject(inp, clazz);
    } finally {
      releaseKryo(kryo);
    }
  }
}
/**
 * Serializes an object with Kryo and returns the bytes as a Base64 string.
 *
 * @param expr the object to serialize
 * @return Base64 text of the Kryo-serialized object
 */
public static String serializeObject(Serializable expr) {
  // The Charset constant cannot fail, unlike a charset looked up by name.
  return new String(Base64.encodeBase64(serializeObjectToKryo(expr)), StandardCharsets.UTF_8);
}
/**
 * Decodes a Base64 string produced by serializeObject and deserializes an object of the given
 * class from the resulting Kryo bytes.
 *
 * @param s Base64 text of a Kryo-serialized object
 * @param clazz expected class of the object
 * @return the deserialized object
 */
public static <T> T deserializeObject(String s, Class<T> clazz) {
  // The Charset constant cannot fail, unlike a charset looked up by name.
  return deserializeObjectFromKryo(Base64.decodeBase64(s.getBytes(StandardCharsets.UTF_8)), clazz);
}
}