
com.twitter.elephantbird.mapreduce.io.BinaryWritable Maven / Gradle / Ivy
package com.twitter.elephantbird.mapreduce.io;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Arrays;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.WritableComparable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* A Hadoop Writable wrapper around a serialized messages like Protocol buffers.
*/
public abstract class BinaryWritable implements WritableComparable> {
private static final Logger LOG = LoggerFactory.getLogger(BinaryWritable.class);
// NOTE: only one of message and messageBytes is non-null at any time so that
// message and messageBytes don't go out of sync (user could modify message).
private M message;
private byte[] messageBytes;
private BinaryConverter converter;
protected BinaryWritable(M message, BinaryConverter converter) {
this.message = message;
this.converter = converter;
}
/** throws an exception if the converter is not set */
private void checkConverter() {
if (converter == null) {
throw new IllegalStateException("Runtime parameterized Protobuf/Thrift class is unkonwn. " +
"This object was probably created with default constructor. " +
"Please use setConverter(Class).");
}
}
protected abstract BinaryConverter getConverterFor(Class clazz);
/**
* Sets the handler for serialization and deserialization based on the class.
* This converter is often set in constructor. But some times it might be
* impossible to know the the actual class during construction.
*
* E.g. when this writable is used as output value for a Mapper,
* MR creates writable on the Reducer using the default constructor,
* and there is no way for us to know the parameterized class.
* In this case, user invokes setConverter() before
* calling get() to supply parameterized class.
*
* The class name could be written as part of writable serialization, but we
* don't yet see a need to do that as it has many other disadvantages.
*/
public void setConverter(Class clazz) {
converter = getConverterFor(clazz);
}
/**
* sets converter. useful for reusing existing converter.
*/
public void setConverter(BinaryConverter converter) {
this.converter = converter;
}
/**
* Returns the current object. Subsequent calls to get() may not return the
* same object, but in stead might return a new object deserialized from same
* set of bytes. As a result, multiple calls to get() should be avoided, and
* modifications to an object returned by get() may not
* reflect even if this writable is serialized later.
* Please use set() to be certain of what object is serialized.
*
* The deserialization of the actual Protobuf/Thrift object is often delayed
* till the first call to this method.
* In some cases the the parameterized proto class may not be known yet
* ( in case of default construction. see {@link #setConverter(Class)} ),
* and this will throw an {@link IllegalStateException}.
*/
public M get() {
// may be we should rename this method. the contract would be less
// confusing with a different name.
if (message == null && messageBytes != null) {
checkConverter();
try {
return converter.fromBytes(messageBytes);
} catch (DecodeException e) {
throw new IllegalStateException("Converter failed to deserialize", e);
}
}
return message;
}
public void clear() {
message = null;
messageBytes = null;
}
public void set(M message) {
this.message = message;
this.messageBytes = null;
// should we serialize the object to messageBytes instead?
// that is the only way we can guarantee any subsequent modifications to
// message by the user don't affect serialization. Unlike Protobuf objects
// Thrift objects are mutable. For now we will delay deserialization until
// it is required.
}
@Override
public void write(DataOutput out) throws IOException {
byte[] bytes = serialize();
if (bytes != null) {
out.writeInt(bytes.length);
out.write(bytes, 0, bytes.length);
} else {
out.writeInt(0);
}
}
/**
* Converts the message to raw bytes, and caches the converted value.
* @return converted value, which may be null in case of null message or error.
*/
private byte[] serialize() {
if (messageBytes == null && message != null) {
checkConverter();
messageBytes = converter.toBytes(message);
if (messageBytes == null) {
// should we throw an IOException instead?
LOG.warn("Could not serialize " + message.getClass());
} else {
message = null; // so that message and messageBytes don't go out of
// sync.
}
}
return messageBytes;
}
@Override
public void readFields(DataInput in) throws IOException {
message = null;
messageBytes = null;
int size = in.readInt();
if (size > 0) {
byte[] buf = new byte[size];
in.readFully(buf, 0, size);
messageBytes = buf;
// messageBytes is deserialized in get()
}
}
@Override
public int compareTo(BinaryWritable other) {
byte[] thisBytes = serialize();
byte[] otherBytes = other.serialize();
int thisLen = thisBytes == null ? 0 : thisBytes.length;
int otherLen = otherBytes == null ? 0 : otherBytes.length;
return BytesWritable.Comparator.compareBytes(thisBytes, 0, thisLen,
otherBytes, 0, otherLen);
}
@SuppressWarnings("unchecked")
@Override
public boolean equals(Object obj) {
if (obj == null) {
return false;
}
BinaryWritable other;
try {
other = (BinaryWritable)obj;
} catch (ClassCastException e) {
return false;
}
return compareTo(other) == 0;
}
/**
* Returns a hashCode that is based on the serialized bytes.
* This makes the hash stable across multiple instances of JVMs.
* (hashCode()
is not required to return the same value in
* different instances of the same applications in Java, just in a
* single instance of the application; Hadoop imposes a more strict requirement.)
*
* In addition, it may not be feasible to create a deserialized object from
* the serialized bytes (see {@link #setConverter(Class)})
*/
@Override
public int hashCode() {
byte[] bytes = serialize();
return (bytes == null) ? 31 : Arrays.hashCode(bytes);
}
@Override
public String toString() {
M msgObj = null;
try {
msgObj = get();
} catch (IllegalStateException e) {
// It is ok. might not be able to avoid this case in some situations.
return super.toString() + "{could not be deserialized}";
}
if (msgObj == null) {
return super.toString();
}
return msgObj.toString();
}
}