All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.twitter.elephantbird.pig.load.ProtobufPigLoader Maven / Gradle / Ivy

There is a newer version: 4.17
Show newest version
package com.twitter.elephantbird.pig.load;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.pig.ResourceSchema;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.protobuf.Message;
import com.twitter.elephantbird.mapreduce.input.LzoRecordReader;
import com.twitter.elephantbird.mapreduce.input.MultiInputFormat;
import com.twitter.elephantbird.mapreduce.io.BinaryWritable;
import com.twitter.elephantbird.pig.util.PigUtil;
import com.twitter.elephantbird.pig.util.ProjectedProtobufTupleFactory;
import com.twitter.elephantbird.pig.util.ProtobufToPig;
import com.twitter.elephantbird.util.Protobufs;
import com.twitter.elephantbird.util.TypeRef;

/**
 * Loader for Protobuf objects stored as base64 lines or as binary
 * blocks. This may support more file formats in future.
 * Initialize with a String argument that represents the full classpath of the protocol buffer class to be loaded.
*/ public class ProtobufPigLoader extends LzoBaseLoadFunc { static final Logger LOG = LoggerFactory.getLogger(ProtobufPigLoader.class); protected TypeRef typeRef = null; private final ProtobufToPig protoToPig = new ProtobufToPig(); private ProjectedProtobufTupleFactory tupleTemplate = null; /** * * @param protoClassName full classpath to the generated Protocol Buffer to be loaded. */ public ProtobufPigLoader(String protoClassName) { typeRef = PigUtil.getProtobufTypeRef(protoClassName); } @Override public RequiredFieldResponse pushProjection(RequiredFieldList requiredFieldList) throws FrontendException { return pushProjectionHelper(requiredFieldList); } /** * Return every non-null line as a single-element tuple to Pig. *

* A small fraction of bad records in input are tolerated. * See {@link LzoRecordReader} for more information on error handling. */ @Override public Tuple getNext() throws IOException { if (tupleTemplate == null) { tupleTemplate = new ProjectedProtobufTupleFactory(typeRef, requiredFieldList); } M value = getNextBinaryValue(typeRef); return value != null ? tupleTemplate.newTuple(value) : null; } @Override public ResourceSchema getSchema(String filename, Job job) throws IOException { return new ResourceSchema(protoToPig.toSchema(Protobufs.getMessageDescriptor(typeRef.getRawClass()))); } @Override public InputFormat> getInputFormat() throws IOException { if (typeRef == null) { LOG.error("Protobuf class must be specified before an InputFormat can be created. Do not use the no-argument constructor."); throw new IllegalArgumentException("Protobuf class must be specified before an InputFormat can be created. Do not use the no-argument constructor."); } return new MultiInputFormat(typeRef); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy