All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.twitter.elephantbird.pig.util.ProtobufToPig Maven / Gradle / Ivy

There is a newer version: 4.17
Show newest version
package com.twitter.elephantbird.pig.util;

import java.util.Collections;
import java.util.List;

import com.google.common.base.Joiner;
import com.google.common.collect.Lists;
import com.google.protobuf.Descriptors.Descriptor;
import com.google.protobuf.Descriptors.EnumValueDescriptor;
import com.google.protobuf.Descriptors.FieldDescriptor;
import com.google.protobuf.ByteString;
import com.google.protobuf.Message;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DataType;
import org.apache.pig.data.NonSpillableDataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A class for turning codegen'd protos into Pig Tuples and Schemas
 * for custom Pig LoadFuncs.
 * @author Kevin Weil
 */
public class ProtobufToPig {
  private static final Logger LOG = LoggerFactory.getLogger(ProtobufToPig.class);

  private static final TupleFactory tupleFactory_ = TupleFactory.getInstance();

  public ProtobufToPig() { }

  /**
   * Turn a generic message into a Tuple.  Individual fields that are enums
   * are converted into their string equivalents.  Fields that are not filled
   * out in the protobuf are set to null, unless there is a default field value in
   * which case that is used instead.
   * @param msg the protobuf message
   * @return a pig tuple representing the message.
   */
  public Tuple toTuple(Message msg) {
    if (msg == null) {
      // Pig tuples deal gracefully with nulls.
      // Also, we can be called with null here in recursive calls.
      return null;
    }

    Descriptor msgDescriptor = msg.getDescriptorForType();
    Tuple tuple = tupleFactory_.newTuple(msgDescriptor.getFields().size());
    int curField = 0;
    try {
      // Walk through all the possible fields in the message.
      for (FieldDescriptor fieldDescriptor : msgDescriptor.getFields()) {
        // Get the set value, or the default value, or null.
        Object fieldValue = msg.getField(fieldDescriptor);

        if (fieldDescriptor.getType() == FieldDescriptor.Type.MESSAGE) {
          tuple.set(curField++, messageToTuple(fieldDescriptor, fieldValue));
        } else {
          tuple.set(curField++, singleFieldToTuple(fieldDescriptor, fieldValue));
        }
      }
    } catch (ExecException e) {
      LOG.warn("Could not convert msg " + msg + " to tuple", e);
    }

    return tuple;
  }

  /**
   * Returns either {@link #messageToTuple(FieldDescriptor, Object)}
   * or {@link #singleFieldToTuple(FieldDescriptor, Object)} depending
   * on whether the field is a Message or a simple field.
   */
  public Object fieldToPig(FieldDescriptor fieldDescriptor, Object fieldValue) {
    if (fieldValue == null) {
      // protobufs unofficially ensures values are not null. just in case:
      return null;
    }
    if (fieldDescriptor.getType() == FieldDescriptor.Type.MESSAGE) {
      return messageToTuple(fieldDescriptor, fieldValue);
    } else {
      return singleFieldToTuple(fieldDescriptor, fieldValue);
    }
  }

  /**
   * Translate a nested message to a tuple.  If the field is repeated, it walks the list and adds each to a bag.
   * Otherwise, it just adds the given one.
   * @param fieldDescriptor the descriptor object for the given field.
   * @param fieldValue the object representing the value of this field, possibly null.
   * @return the object representing fieldValue in Pig -- either a bag or a tuple.
   */
  @SuppressWarnings("unchecked")
  protected Object messageToTuple(FieldDescriptor fieldDescriptor, Object fieldValue) {
    if (fieldValue == null) {
      // protobufs unofficially ensures values are not null. just in case:
      return null;
    }
    assert fieldDescriptor.getType() == FieldDescriptor.Type.MESSAGE : "messageToTuple called with field of type " + fieldDescriptor.getType();

    if (fieldDescriptor.isRepeated()) {
      // The protobuf contract is that if the field is repeated, then the object returned is actually a List
      // of the underlying datatype, which in this case is a nested message.
      List messageList = (List) (fieldValue != null ? fieldValue : Lists.newArrayList());

      DataBag bag = new NonSpillableDataBag(messageList.size());
      for (Message m : messageList) {
        bag.add(new ProtobufTuple(m));
      }
      return bag;
    } else {
      return new ProtobufTuple((Message)fieldValue);
    }
  }

  /**
   * Translate a single field to a tuple.  If the field is repeated, it walks the list and adds each to a bag.
   * Otherwise, it just adds the given one.
   * @param fieldDescriptor the descriptor object for the given field.
   * @param fieldValue the object representing the value of this field, possibly null.
   * @return the object representing fieldValue in Pig -- either a bag or a single field.
   * @throws ExecException if Pig decides to.  Shouldn't happen because we won't walk off the end of a tuple's field set.
   */
  @SuppressWarnings("unchecked")
  protected Object singleFieldToTuple(FieldDescriptor fieldDescriptor, Object fieldValue) {
    assert fieldDescriptor.getType() != FieldDescriptor.Type.MESSAGE : "messageToFieldSchema called with field of type " + fieldDescriptor.getType();

    if (fieldDescriptor.isRepeated()) {
      // The protobuf contract is that if the field is repeated, then the object returned is actually a List
      // of the underlying datatype, which in this case is a "primitive" like int, float, String, etc.
      // We have to make a single-item tuple out of it to put it in the bag.
      List fieldValueList = (List) (fieldValue != null ? fieldValue : Collections.emptyList());
      DataBag bag = new NonSpillableDataBag(fieldValueList.size());
      for (Object singleFieldValue : fieldValueList) {
        Object nonEnumFieldValue = coerceToPigTypes(fieldDescriptor, singleFieldValue);
        Tuple innerTuple = tupleFactory_.newTuple(1);
        try {
          innerTuple.set(0, nonEnumFieldValue);
        } catch (ExecException e) { // not expected
          throw new RuntimeException(e);
        }
        bag.add(innerTuple);
      }
      return bag;
    } else {
      return coerceToPigTypes(fieldDescriptor, fieldValue);
    }
  }

  /**
   * If the given field value is an enum, translate it to the enum's name as a string, since Pig cannot handle enums.
   * Also, if the given field value is a bool, translate it to 0 or 1 to avoid Pig bools, which can be sketchy.
   * @param fieldDescriptor the descriptor object for the given field.
   * @param fieldValue the object representing the value of this field, possibly null.
   * @return the object, unless it was from an enum field, in which case we return the name of the enum field.
   */
  private Object coerceToPigTypes(FieldDescriptor fieldDescriptor, Object fieldValue) {
    if (fieldDescriptor.getType() == FieldDescriptor.Type.ENUM && fieldValue != null) {
      EnumValueDescriptor enumValueDescriptor = (EnumValueDescriptor)fieldValue;
      return enumValueDescriptor.getName();
    } else if (fieldDescriptor.getType() == FieldDescriptor.Type.BOOL && fieldValue != null) {
      Boolean boolValue = (Boolean)fieldValue;
      return new Integer(boolValue ? 1 : 0);
    } else if (fieldDescriptor.getType() == FieldDescriptor.Type.BYTES && fieldValue != null) {
      ByteString bsValue = (ByteString)fieldValue;
      return new DataByteArray(bsValue.toByteArray());
    }
    return fieldValue;
  }

  /**
   * Turn a generic message descriptor into a Schema.  Individual fields that are enums
   * are converted into their string equivalents.
   * @param msgDescriptor the descriptor for the given message type.
   * @return a pig schema representing the message.
   */
  public Schema toSchema(Descriptor msgDescriptor) {
    Schema schema = new Schema();

    try {
      // Walk through all the possible fields in the message.
      for (FieldDescriptor fieldDescriptor : msgDescriptor.getFields()) {
        if (fieldDescriptor.getType() == FieldDescriptor.Type.MESSAGE) {
          schema.add(messageToFieldSchema(fieldDescriptor));
        } else {
          schema.add(singleFieldToFieldSchema(fieldDescriptor));
        }
      }
    } catch (FrontendException e) {
      LOG.warn("Could not convert descriptor " + msgDescriptor + " to schema", e);
    }

    return schema;
  }

  /**
   * Turn a nested message into a Schema object.  For repeated nested messages, it generates a schema for a bag of
   * tuples.  For non-repeated nested messages, it just generates a schema for the tuple itself.
   * @param fieldDescriptor the descriptor object for the given field.
   * @return the Schema for the nested message.
   * @throws FrontendException if Pig decides to.
   */
  private FieldSchema messageToFieldSchema(FieldDescriptor fieldDescriptor) throws FrontendException {
    assert fieldDescriptor.getType() == FieldDescriptor.Type.MESSAGE : "messageToFieldSchema called with field of type " + fieldDescriptor.getType();

    Schema innerSchema = toSchema(fieldDescriptor.getMessageType());

    if (fieldDescriptor.isRepeated()) {
      Schema tupleSchema = new Schema();
      tupleSchema.add(new FieldSchema(fieldDescriptor.getName() + "_tuple", innerSchema, DataType.TUPLE));
      return new FieldSchema(fieldDescriptor.getName(), tupleSchema, DataType.BAG);
    } else {
      return new FieldSchema(fieldDescriptor.getName(), innerSchema, DataType.TUPLE);
    }
  }

  /**
   * Turn a single field into a Schema object.  For repeated single fields, it generates a schema for a bag of single-item tuples.
   * For non-repeated fields, it just generates a standard field schema.
   * @param fieldDescriptor the descriptor object for the given field.
   * @return the Schema for the nested message.
   * @throws FrontendException if Pig decides to.
   */
  private FieldSchema singleFieldToFieldSchema(FieldDescriptor fieldDescriptor) throws FrontendException {
    assert fieldDescriptor.getType() != FieldDescriptor.Type.MESSAGE : "singleFieldToFieldSchema called with field of type " + fieldDescriptor.getType();

    if (fieldDescriptor.isRepeated()) {
      Schema itemSchema = new Schema();
      itemSchema.add(new FieldSchema(fieldDescriptor.getName(), null, getPigDataType(fieldDescriptor)));
      Schema itemTupleSchema = new Schema();
      itemTupleSchema.add(new FieldSchema(fieldDescriptor.getName() + "_tuple", itemSchema, DataType.TUPLE));

      return new FieldSchema(fieldDescriptor.getName() + "_bag", itemTupleSchema, DataType.BAG);
    } else {
      return new FieldSchema(fieldDescriptor.getName(), null, getPigDataType(fieldDescriptor));
    }
  }

  /**
   * Translate between protobuf's datatype representation and Pig's datatype representation.
   * @param fieldDescriptor the descriptor object for the given field.
   * @return the byte representing the pig datatype for the given field type.
   */
  private byte getPigDataType(FieldDescriptor fieldDescriptor) {
    switch (fieldDescriptor.getType()) {
      case INT32:
      case UINT32:
      case SINT32:
      case FIXED32:
      case SFIXED32:
      case BOOL: // We convert booleans to ints for pig output.
        return DataType.INTEGER;
      case INT64:
      case UINT64:
      case SINT64:
      case FIXED64:
      case SFIXED64:
        return DataType.LONG;
      case FLOAT:
        return DataType.FLOAT;
      case DOUBLE:
        return DataType.DOUBLE;
      case STRING:
      case ENUM: // We convert enums to strings for pig output.
        return DataType.CHARARRAY;
      case BYTES:
        return DataType.BYTEARRAY;
      case MESSAGE:
        throw new IllegalArgumentException("getPigDataType called on field " + fieldDescriptor.getFullName() + " of type message.");
      default:
        throw new IllegalArgumentException("Unexpected field type. " + fieldDescriptor.toString() + " " + fieldDescriptor.getFullName() + " " + fieldDescriptor.getType());
    }
  }

  /**
   * Turn a generic message descriptor into a loading schema for a pig script.
   * @param msgDescriptor the descriptor for the given message type.
   * @param loaderClassName the fully qualified classname of the pig loader to use.  Not
   * passed a Class because in many situations that class
   * is being generated as well, and so doesn't exist in compiled form.
   * @return a pig script that can load the given message.
   */
  public String toPigScript(Descriptor msgDescriptor, String loaderClassName) {
    StringBuffer sb = new StringBuffer();
    final int initialTabOffset = 3;

    sb.append("raw_data = load '$INPUT_FILES' using " + loaderClassName + "()").append("\n");
    sb.append(tabs(initialTabOffset)).append("as (").append("\n");
    sb.append(toPigScriptInternal(msgDescriptor, initialTabOffset));
    sb.append(tabs(initialTabOffset)).append(");").append("\n").append("\n");

    return sb.toString();
  }

  /**
   * Same as toPigScript(Descriptor, String) but allows parameters for the loader.
   *
   * @param msgDescriptor
   * @param loaderClassName
   * @param params
   * @return a pig script that can load the given message.
   */
  public String toPigScript(Descriptor msgDescriptor, String loaderClassName, String... params) {
    StringBuffer sb = new StringBuffer();
    final int initialTabOffset = 3;

    sb.append("raw_data = load '$INPUT_FILES' using ")
    .append(loaderClassName)
    .append("(");
    String paramString = "";
    if (params.length > 0) {
      paramString = "'" + Joiner.on(",'").join(params) + "'";
    }
    sb.append(paramString).append(")").append("\n");
    sb.append("/**\n");
    sb.append(tabs(initialTabOffset)).append("as (").append("\n");
    sb.append(toPigScriptInternal(msgDescriptor, initialTabOffset));
    sb.append(tabs(initialTabOffset)).append(")").append("\n").append("\n");
    sb.append("**/\n;\n");
    return sb.toString();
  }

  /**
   * Turn a generic message descriptor into a loading schema for a pig script.  Individual fields that are enums
   * are converted into their string equivalents.
   * @param msgDescriptor the descriptor for the given message type.
   * @param numTabs the tab depth at the current point in the recursion, for pretty printing.
   * @return a pig schema representing the message.
   */
  private StringBuffer toPigScriptInternal(Descriptor msgDescriptor, int numTabs) {
    StringBuffer sb = new StringBuffer();
    try {
      // Walk through all the possible fields in the message.
      for (FieldDescriptor fieldDescriptor : msgDescriptor.getFields()) {
        // We have to add a comma after every line EXCEPT for the last, or Pig gets mad.
        boolean isLast = (fieldDescriptor == msgDescriptor.getFields().get(msgDescriptor.getFields().size() - 1));
        if (fieldDescriptor.getType() == FieldDescriptor.Type.MESSAGE) {
          sb.append(messageToPigScript(fieldDescriptor, numTabs + 1, isLast));
        } else {
          sb.append(singleFieldToPigScript(fieldDescriptor, numTabs + 1, isLast));
        }
      }
    } catch (FrontendException e) {
      LOG.warn("Could not convert descriptor " + msgDescriptor + " to pig script", e);
    }

    return sb;
  }

  /**
   * Turn a nested message into a pig script load string.  For repeated nested messages, it generates a string for a bag of
   * tuples.  For non-repeated nested messages, it just generates a string for the tuple itself.
   * @param fieldDescriptor the descriptor object for the given field.
   * @param numTabs the tab depth at the current point in the recursion, for pretty printing.
   * @return the pig script load schema for the nested message.
   * @throws FrontendException if Pig decides to.
   */
  private StringBuffer messageToPigScript(FieldDescriptor fieldDescriptor, int numTabs, boolean isLast) throws FrontendException {
    assert fieldDescriptor.getType() == FieldDescriptor.Type.MESSAGE : "messageToPigScript called with field of type " + fieldDescriptor.getType();

    if (fieldDescriptor.isRepeated()) {
      return new StringBuffer().append(tabs(numTabs)).append(fieldDescriptor.getName()).append(": bag {").append("\n")
          .append(tabs(numTabs + 1)).append(fieldDescriptor.getName()).append("_tuple: tuple (").append("\n")
          .append(toPigScriptInternal(fieldDescriptor.getMessageType(), numTabs + 2))
          .append(tabs(numTabs + 1)).append(")").append("\n")
          .append(tabs(numTabs)).append("}").append(isLast ? "" : ",").append("\n");
    } else {
      return new StringBuffer().append(tabs(numTabs)).append(fieldDescriptor.getName()).append(": tuple (").append("\n")
          .append(toPigScriptInternal(fieldDescriptor.getMessageType(), numTabs + 1))
          .append(tabs(numTabs)).append(")").append(isLast ? "" : ",").append("\n");
    }
  }

  /**
   * Turn a single field into a pig script load string.  For repeated single fields, it generates a string for a bag of single-item tuples.
   * For non-repeated fields, it just generates a standard single-element string.
   * @param fieldDescriptor the descriptor object for the given field.
   * @param numTabs the tab depth at the current point in the recursion, for pretty printing.
   * @return the pig script load string for the nested message.
   * @throws FrontendException if Pig decides to.
   */
  private StringBuffer singleFieldToPigScript(FieldDescriptor fieldDescriptor, int numTabs, boolean isLast) throws FrontendException {
    assert fieldDescriptor.getType() != FieldDescriptor.Type.MESSAGE : "singleFieldToPigScript called with field of type " + fieldDescriptor.getType();

    if (fieldDescriptor.isRepeated()) {
      return new StringBuffer().append(tabs(numTabs)).append(fieldDescriptor.getName()).append("_bag: bag {").append("\n")
          .append(tabs(numTabs + 1)).append(fieldDescriptor.getName()).append("_tuple: tuple (").append("\n")
          .append(tabs(numTabs + 2)).append(fieldDescriptor.getName()).append(": ").append(getPigScriptDataType(fieldDescriptor)).append("\n")
          .append(tabs(numTabs + 1)).append(")").append("\n")
          .append(tabs(numTabs)).append("}").append(isLast ? "" : ",").append("\n");
    } else {
      return new StringBuffer().append(tabs(numTabs)).append(fieldDescriptor.getName()).append(": ")
          .append(getPigScriptDataType(fieldDescriptor)).append(isLast ? "" : ",").append("\n");
    }
  }

  /**
   * Translate between protobuf's datatype representation and Pig's datatype representation.
   * @param fieldDescriptor the descriptor object for the given field.
   * @return the byte representing the pig datatype for the given field type.
   */
  private String getPigScriptDataType(FieldDescriptor fieldDescriptor) {
    switch (fieldDescriptor.getType()) {
      case INT32:
      case UINT32:
      case SINT32:
      case FIXED32:
      case SFIXED32:
      case BOOL: // We convert booleans to ints for pig output.
        return "int";
      case INT64:
      case UINT64:
      case SINT64:
      case FIXED64:
      case SFIXED64:
        return "long";
      case FLOAT:
        return "float";
      case DOUBLE:
        return "double";
      case STRING:
      case ENUM: // We convert enums to strings for pig output.
        return "chararray";
      case BYTES:
        return "bytearray";
      case MESSAGE:
        throw new IllegalArgumentException("getPigScriptDataType called on field " + fieldDescriptor.getFullName() + " of type message.");
      default:
        throw new IllegalArgumentException("Unexpected field type. " + fieldDescriptor.toString() + " " + fieldDescriptor.getFullName() + " " + fieldDescriptor.getType());
    }
  }

  private StringBuffer tabs(int numTabs) {
    StringBuffer sb = new StringBuffer();
    for (int i = 0; i < numTabs; i++) {
      sb.append("  ");
    }
    return sb;
  }

}