All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.twitter.elephantbird.pig.util.ThriftWritableConverter Maven / Gradle / Ivy

There is a newer version: 4.17
Show newest version
package com.twitter.elephantbird.pig.util;

import java.io.IOException;

import org.apache.hadoop.io.SequenceFile;
import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceSchema.ResourceFieldSchema;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
import org.apache.thrift.TBase;

import com.google.common.base.Preconditions;
import com.twitter.elephantbird.mapreduce.io.ThriftWritable;
import com.twitter.elephantbird.pig.store.SequenceFileStorage;
import com.twitter.elephantbird.util.TypeRef;

/**
 * Supports conversion between Pig {@link Tuple} and {@link ThriftWritable} types. For example, say
 * we have thrift type {@code Person}. We can use {@link ThriftWritableConverter} and
 * {@link SequenceFileStorage} to convert Tuple data to {@link ThriftWritable}{@code }
 * instances and store these as values in a {@link SequenceFile}:
 *
 * 
 * -- assume that we identify Person instances by integer id
 * people = LOAD '$data' AS (id: int, person: ());
 *
 * STORE people INTO '$output' USING com.twitter.elephantbird.pig.store.SequenceFileStorage (
 *   '-c com.twitter.elephantbird.pig.util.IntWritableConverter',
 *   '-c com.twitter.elephantbird.pig.util.ThriftWritableConverter Person'
 * );
 * 
* * Notice that we supply the name of our thrift {@code Person} class as an extra argument to * {@code -c com.twitter.elephantbird.pig.util.ThriftWritableConverter} above. This ensures the * ThriftWritableConverter instance created by the SequenceFileStorage instance knows what thrift * type it's dealing with. * * We can also load {@code ThriftWritable} data from a SequenceFile and convert back to * Tuples: * *
 * people = LOAD '$data' USING com.twitter.elephantbird.pig.load.SequenceFileLoader (
 *   '-c com.twitter.elephantbird.pig.util.IntWritableConverter',
 *   '-c com.twitter.elephantbird.pig.util.ThriftWritableConverter Person'
 * );
 * 
* * @author Andy Schlaikjer */ public class ThriftWritableConverter> extends AbstractWritableConverter> { protected final TypeRef typeRef; protected final ThriftToPig thriftToPig; protected final Schema expectedSchema; protected final PigToThrift pigToThrift; public ThriftWritableConverter(String thriftClassName) { super(new ThriftWritable()); Preconditions.checkNotNull(thriftClassName); typeRef = PigUtil.getThriftTypeRef(thriftClassName); thriftToPig = ThriftToPig.newInstance(typeRef); expectedSchema = thriftToPig.toSchema(); pigToThrift = PigToThrift.newInstance(typeRef); writable.setConverter(typeRef.getRawClass()); } @Override public void initialize(Class> writableClass) throws IOException { if (writableClass == null) { return; } super.initialize(writableClass); writable.setConverter(typeRef.getRawClass()); } @Override public ResourceFieldSchema getLoadSchema() throws IOException { return new ResourceFieldSchema(new FieldSchema(null, expectedSchema)); } @Override public void checkStoreSchema(ResourceFieldSchema schema) throws IOException { Preconditions.checkNotNull(schema, "Schema is null"); Preconditions.checkArgument(DataType.TUPLE == schema.getType(), "Expected schema type '%s' but found type '%s'", DataType.findTypeName(DataType.TUPLE), DataType.findTypeName(schema.getType())); ResourceSchema childSchema = schema.getSchema(); Preconditions.checkNotNull(childSchema, "Child schema is null"); Schema actualSchema = Schema.getPigSchema(childSchema); Preconditions.checkArgument(Schema.equals(expectedSchema, actualSchema, false, true), "Expected store schema '%s' but found schema '%s'", expectedSchema, actualSchema); } @Override public Object bytesToObject(DataByteArray dataByteArray) throws IOException { return bytesToTuple(dataByteArray.get(), null); } @Override protected Tuple toTuple(ThriftWritable writable, ResourceFieldSchema schema) throws IOException { return thriftToPig.getPigTuple(writable.get()); } @Override protected ThriftWritable toWritable(Tuple value) throws IOException { writable.set(pigToThrift.getThriftObject(value)); return writable; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy