// Many resources are needed to serve a project download. Please understand that we have to cover our server costs. Thank you in advance. The project price is only $1.
// You can buy this project and download/modify it as often as you want.
package com.twitter.elephantbird.pig.util;
import java.util.Collections;
import java.util.List;
import com.google.common.base.Joiner;
import com.google.common.collect.Lists;
import com.google.protobuf.Descriptors.Descriptor;
import com.google.protobuf.Descriptors.EnumValueDescriptor;
import com.google.protobuf.Descriptors.FieldDescriptor;
import com.google.protobuf.ByteString;
import com.google.protobuf.Message;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DataType;
import org.apache.pig.data.NonSpillableDataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* A class for turning codegen'd protos into Pig Tuples and Schemas
* for custom Pig LoadFuncs.
* @author Kevin Weil
*/
public class ProtobufToPig {
// Class-wide SLF4J logger; toTuple() uses it to warn when a message cannot be fully converted.
private static final Logger LOG = LoggerFactory.getLogger(ProtobufToPig.class);
// Tuple factory obtained once via TupleFactory.getInstance(); toTuple() allocates its result tuples from it.
private static final TupleFactory tupleFactory_ = TupleFactory.getInstance();
/** Creates a converter. The constructor body is empty. */
public ProtobufToPig() { }
/**
 * Converts a protobuf message into a Pig Tuple that has one slot per field
 * declared in the message's descriptor, in descriptor order.
 * <p>
 * Each field value is read with {@code msg.getField(...)} (per the protobuf
 * reflection API this yields the set value or the field's default). Fields of
 * type MESSAGE are converted by {@link #messageToTuple(FieldDescriptor, Object)};
 * every other field goes through {@link #singleFieldToTuple(FieldDescriptor, Object)},
 * which is expected to render enums as strings.
 * <p>
 * If storing a value in the tuple raises an {@link ExecException}, a warning is
 * logged, conversion stops, and the tuple is returned with the remaining slots
 * left as the factory created them.
 *
 * @param msg the protobuf message, may be null
 * @return a Pig tuple representing the message, or null if {@code msg} is null
 */
public Tuple toTuple(Message msg) {
  if (msg == null) {
    // Recursive callers may hand us null, and Pig copes with null tuples.
    return null;
  }

  List<FieldDescriptor> fields = msg.getDescriptorForType().getFields();
  Tuple result = tupleFactory_.newTuple(fields.size());

  try {
    // Visit every field the descriptor declares, set or not.
    for (int pos = 0; pos < fields.size(); pos++) {
      FieldDescriptor fd = fields.get(pos);
      Object raw = msg.getField(fd);
      boolean nested = fd.getType() == FieldDescriptor.Type.MESSAGE;
      Object pigValue = nested ? messageToTuple(fd, raw) : singleFieldToTuple(fd, raw);
      result.set(pos, pigValue);
    }
  } catch (ExecException e) {
    LOG.warn("Could not convert msg " + msg + " to tuple", e);
  }

  return result;
}
/**
 * Converts one protobuf field value to its Pig representation.
 * Dispatches to {@link #messageToTuple(FieldDescriptor, Object)} when the
 * field's type is MESSAGE and to
 * {@link #singleFieldToTuple(FieldDescriptor, Object)} otherwise.
 *
 * @param fieldDescriptor the descriptor of the field being converted
 * @param fieldValue the field's value; may be null
 * @return the converted value, or null when {@code fieldValue} is null
 */
public Object fieldToPig(FieldDescriptor fieldDescriptor, Object fieldValue) {
  Object converted = null;
  // Protobuf normally never yields null values; a null simply maps to null.
  if (fieldValue != null) {
    boolean nested = fieldDescriptor.getType() == FieldDescriptor.Type.MESSAGE;
    converted = nested
        ? messageToTuple(fieldDescriptor, fieldValue)
        : singleFieldToTuple(fieldDescriptor, fieldValue);
  }
  return converted;
}
/**
 * Translates a nested-message field into its Pig representation.
 * A repeated field becomes a bag holding one {@code ProtobufTuple} per
 * element; a non-repeated field becomes a single {@code ProtobufTuple}.
 *
 * @param fieldDescriptor the descriptor object for the given field; its type
 *        must be MESSAGE (checked with an assertion only).
 * @param fieldValue the object representing the value of this field, possibly null.
 *        For a repeated field this is cast to a List of messages, otherwise to a
 *        single Message.
 * @return null if {@code fieldValue} is null; otherwise a {@link DataBag} for a
 *         repeated field or a {@code ProtobufTuple} for a singular one.
 */
@SuppressWarnings("unchecked")
protected Object messageToTuple(FieldDescriptor fieldDescriptor, Object fieldValue) {
  if (fieldValue == null) {
    // protobufs unofficially ensures values are not null. just in case:
    return null;
  }
  assert fieldDescriptor.getType() == FieldDescriptor.Type.MESSAGE : "messageToTuple called with field of type " + fieldDescriptor.getType();

  if (!fieldDescriptor.isRepeated()) {
    return new ProtobufTuple((Message) fieldValue);
  }

  // The protobuf contract is that the value of a repeated field is a List of the
  // underlying datatype, which here is a nested message. The element type must be
  // spelled out: iterating a raw List as Message does not compile. fieldValue is
  // known to be non-null at this point, so no empty-list fallback is needed.
  List<Message> messageList = (List<Message>) fieldValue;
  DataBag bag = new NonSpillableDataBag(messageList.size());
  for (Message m : messageList) {
    bag.add(new ProtobufTuple(m));
  }
  return bag;
}
/**
* Translate a single field to a tuple. If the field is repeated, it walks the list and adds each to a bag.
* Otherwise, it just adds the given one.
* @param fieldDescriptor the descriptor object for the given field.
* @param fieldValue the object representing the value of this field, possibly null.
* @return the object representing fieldValue in Pig -- either a bag or a single field.
* @throws ExecException if Pig decides to. Shouldn't happen because we won't walk off the end of a tuple's field set.
*/
@SuppressWarnings("unchecked")
protected Object singleFieldToTuple(FieldDescriptor fieldDescriptor, Object fieldValue) {
assert fieldDescriptor.getType() != FieldDescriptor.Type.MESSAGE : "messageToFieldSchema called with field of type " + fieldDescriptor.getType();
if (fieldDescriptor.isRepeated()) {
// The protobuf contract is that if the field is repeated, then the object returned is actually a List
// of the underlying datatype, which in this case is a "primitive" like int, float, String, etc.
// We have to make a single-item tuple out of it to put it in the bag.
List