![JAR search and dependency download from the Maven repository](/logo.png)
com.twitter.elephantbird.pig.util.PigToProtobuf Maven / Gradle / Ivy
package com.twitter.elephantbird.pig.util;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import com.google.protobuf.ByteString;
import com.google.protobuf.Descriptors;
import com.google.protobuf.Descriptors.Descriptor;
import com.google.protobuf.Descriptors.EnumValueDescriptor;
import com.google.protobuf.Descriptors.FieldDescriptor;
import com.google.protobuf.Descriptors.DescriptorValidationException;
import com.google.protobuf.Message;
import com.google.protobuf.Message.Builder;
import com.google.protobuf.DescriptorProtos.DescriptorProto;
import com.google.protobuf.DescriptorProtos.FieldDescriptorProto;
import com.google.protobuf.DescriptorProtos.FieldDescriptorProto.Type;
import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceSchema.ResourceFieldSchema;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.util.Pair;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.twitter.elephantbird.util.Protobufs;
/**
* Converts a Pig Tuple into a Protobuf message. Tuple values should be ordered to match the natural
* order of Protobuf field ordinal values. For example, say we define the following Protobuf
* message:
*
* <pre>
* message MyProtobufType {
*   optional int32 f1 = 1;
*   optional int32 f2 = 3;
*   optional int32 f3 = 7;
* }
* </pre>
*
* Input Tuples are expected to contain field values in order {@code (f1, f2, f3)}. Tuples may
* contain fewer values than Protobuf message fields (e.g. only {@code (f1, f2)} in the prior
* example); Any remaining fields will be left unset.
*
* @author Vikram Oberoi
*/
public class PigToProtobuf {
// Class-wide SLF4J logger; used below to report unreadable tuple fields and schema mappings.
private static final Logger LOG = LoggerFactory.getLogger(PigToProtobuf.class);
/** Public no-argument constructor; the conversion methods visible in this class are static. */
public PigToProtobuf() {}
/**
 * Turn a Tuple into a Message of the given protobuf class.
 *
 * @param <M> the concrete Message type to produce
 * @param protoClass the generated protobuf class the tuple will be converted to
 * @param tuple the tuple; a {@code null} tuple yields a message with no fields set
 * @return a message of type {@code M} representing the given tuple
 */
@SuppressWarnings("unchecked")
public static <M extends Message> M tupleToMessage(Class<M> protoClass, Tuple tuple) {
  Builder builder = Protobufs.getMessageBuilder(protoClass);
  // The builder was obtained for protoClass, so the built message is expected to be an M;
  // the compiler cannot prove that, hence the unchecked cast.
  return (M) tupleToMessage(builder, tuple);
}
/**
* Turn a Tuple into a Message with the given type.
* <p>
* Convenience overload: obtains the field descriptors from
* {@code builder.getDescriptorForType().getFields()} and delegates to
* {@link #tupleToMessage(Builder, List, Tuple)}.
*
* @param builder a builder for the Message type the tuple will be converted to
* @param tuple the tuple
* @return a message representing the given tuple
*/
public static Message tupleToMessage(Builder builder, Tuple tuple) {
return tupleToMessage(builder, builder.getDescriptorForType().getFields(), tuple);
}
/**
 * Populates {@code builder} from {@code tuple} by position and builds the resulting message.
 * <p>
 * The i-th tuple value is assigned to the i-th entry of {@code fieldDescriptors}. Iteration
 * stops at the shorter of the two, so surplus protobuf fields stay unset and surplus tuple
 * values are ignored. {@code null} tuple values are skipped. Repeated fields are expected to
 * arrive as a {@link DataBag}, and non-repeated MESSAGE fields as a nested {@link Tuple}.
 *
 * @param builder a builder for the Message type the tuple will be converted to
 * @param fieldDescriptors should be same as builder.getDescriptorForType.getFields().
 *        Avoids overhead of getFields() which creates an array each time.
 * @param tuple the tuple; if {@code null}, the builder is built as-is
 * @return a message representing the given tuple
 * @throws RuntimeException if a tuple value cannot be converted to, or set on, its field;
 *         the original failure is preserved as the cause
 */
public static Message tupleToMessage(Builder builder, List<FieldDescriptor> fieldDescriptors, Tuple tuple) {
  if (tuple == null) {
    return builder.build();
  }

  // Neither the descriptor list nor the tuple is modified in the loop, so the bound is fixed.
  final int fieldCount = Math.min(fieldDescriptors.size(), tuple.size());
  for (int i = 0; i < fieldCount; i++) {
    FieldDescriptor fieldDescriptor = fieldDescriptors.get(i);

    Object tupleField;
    try {
      tupleField = tuple.get(i);
    } catch (ExecException e) {
      // Best effort: an unreadable tuple slot leaves the corresponding field unset.
      // The value is unavailable here, so report the index and keep the cause.
      LOG.warn("Could not read tuple field at index " + i
          + " for field with descriptor " + fieldDescriptor, e);
      continue;
    }

    if (tupleField == null) {
      continue;
    }

    try {
      if (fieldDescriptor.isRepeated()) {
        // Repeated fields are set with Lists containing objects of the fields' Java type.
        builder.setField(fieldDescriptor,
            dataBagToRepeatedField(builder, fieldDescriptor, (DataBag) tupleField));
      } else if (fieldDescriptor.getType() == FieldDescriptor.Type.MESSAGE) {
        Builder nestedMessageBuilder = builder.newBuilderForField(fieldDescriptor);
        builder.setField(fieldDescriptor,
            tupleToMessage(nestedMessageBuilder, (Tuple) tupleField));
      } else {
        builder.setField(fieldDescriptor,
            tupleFieldToSingleField(fieldDescriptor, tupleField));
      }
    } catch (Exception e) {
      // Keep the error message bounded: large bags/tuples can stringify to megabytes.
      final int maxLength = 100;
      String value = String.valueOf(tupleField);
      if (maxLength < value.length()) {
        value = value.substring(0, maxLength - 3) + "...";
      }
      // tupleField is known to be non-null here, so its class is always available.
      String type = tupleField.getClass().getName();
      throw new RuntimeException(String.format(
          "Failed to set field '%s' using tuple value '%s' of type '%s' at index %d",
          fieldDescriptor.getName(), value, type, i), e);
    }
  }
  return builder.build();
}
/**
* For a given {@code ResourceSchema}, generate a protobufs {@code Descriptor} with analogous
* field names and types.
* <p>
* Delegates to the two-argument overload, passing {@code null} for the extra fields.
*
* @param schema Pig schema.
* @return Protobufs Descriptor
* @throws Descriptors.DescriptorValidationException
*/
public static Descriptor schemaToProtoDescriptor(ResourceSchema schema)
throws DescriptorValidationException {
return schemaToProtoDescriptor(schema, null);
}
/**
* For a given ResourceSchema
, generate a protobufs Descriptor
with analogous field names
* and types.
*
* @param schema Pig schema.
* @param extraFields optionally pass a List of extra fields (Pairs of name:type) to be included.
* @return Protobufs Descriptor
* @throws Descriptors.DescriptorValidationException
*/
public static Descriptor schemaToProtoDescriptor(ResourceSchema schema, List> extraFields)
throws DescriptorValidationException {
// init protobufs
DescriptorProto.Builder desBuilder = DescriptorProto.newBuilder();
int count = 0;
for (ResourceFieldSchema fieldSchema : schema.getFields()) {
// Pig types
int position = ++count;
String fieldName = fieldSchema.getName();
byte dataTypeId = fieldSchema.getType();
// determine and add protobuf types
Type protoType = pigTypeToProtoType(dataTypeId);
LOG.info("Mapping Pig field " + fieldName + " of type " + dataTypeId + " to protobuf type: " + protoType);
addField(desBuilder, fieldName, position, protoType);
}
if (count == 0) {
throw new IllegalArgumentException("ResourceSchema does not have any fields");
}
// If extra fields are needed, let's add them
if (extraFields != null) {
for (Pair extraField : extraFields) {
addField(desBuilder, extraField.first, ++count, extraField.second);
}
}
desBuilder.setName("PigToProtobufDynamicBuilder");
return Protobufs.makeMessageDescriptor(desBuilder.build());
}
/**
* Converts a DataBag into a List of objects with the type in the given FieldDescriptor. DataBags
* don't map cleanly to repeated protobuf types, so each Tuple has to be unwrapped (by taking the
* first element if the type is primitive or by converting the Tuple to a Message if the type is
* MESSAGE), and the contents have to be appended to a List.
* @param containingMessageBuilder a Message builder for the Message that contains this repeated field
* @param fieldDescriptor a FieldDescriptor for this repeated field
* @param bag the DataBag being serialized
* @return a protobuf-friendly List of fieldDescriptor-type objects
*/
private static List
© 2015 - 2025 Weber Informatics LLC | Privacy Policy