All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.twitter.elephantbird.pig.load.LocationAsTuple Maven / Gradle / Ivy

There is a newer version: 4.17
Show newest version
package com.twitter.elephantbird.pig.load;

import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import java.util.Properties;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.pig.Expression;
import org.apache.pig.LoadFunc;
import org.apache.pig.LoadMetadata;
import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceStatistics;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
import org.apache.pig.impl.util.StorageUtil;
import org.apache.pig.impl.util.UDFContext;

import com.google.common.collect.Lists;
import com.twitter.elephantbird.mapreduce.input.IntegerListInputFormat;

/**
 * Parses the "location" into a tuple by splitting on a delimiter, and returns it.
 * Handy for turning scalars into relations. For example:
 * 
{@code
 * languages = load 'en,fr,jp' using LocationAsTuple(',');
 * -- languages is ('en', 'fr', 'jp')
 * language_bag = foreach languages generate flatten(TOBAG(*));
 * -- language_bag is a relation with three rows, ('en'), ('fr'), ('jp')
 * }
*/ public class LocationAsTuple extends LoadFunc implements LoadMetadata { public static final String DATA_PROP = "data"; final byte token; boolean returned = false; private String signature; public LocationAsTuple() { this("\t"); } public LocationAsTuple(String s) { token = StorageUtil.parseFieldDel(s); } @SuppressWarnings("rawtypes") @Override public InputFormat getInputFormat() throws IOException { return new IntegerListInputFormat(); } @Override public Tuple getNext() throws IOException { if (!returned) { returned = true; String dataString = getUDFProperties().getProperty(DATA_PROP); LinkedList tups = Lists.newLinkedList(); int offset = 0; for (int i = 0; i < dataString.length(); i++) { if (dataString.charAt(i) == token) { tups.add(dataString.substring(offset, i)); offset = i+1; } } tups.add(dataString.substring(offset, dataString.length())); return TupleFactory.getInstance().newTupleNoCopy(tups); } else { return null; } } @Override public void setUDFContextSignature(String signature) { this.signature = signature; } @Override public String relativeToAbsolutePath(String location, Path curDir) throws IOException { return location; } @Override public void setLocation(String location, Job job) throws IOException { Properties p = getUDFProperties(); p.setProperty(DATA_PROP, location); } /** * Returns UDFProperties based on signature. */ private Properties getUDFProperties() { return UDFContext.getUDFContext() .getUDFProperties(this.getClass(), new String[] {signature}); } @Override public void prepareToRead(RecordReader arg0, PigSplit arg1) throws IOException { // do nothing. } @Override public ResourceSchema getSchema(String filename, Job job) throws IOException { List fieldSchemas = Lists.newArrayList(); for (int i = 0; i < filename.length(); i++) { if (filename.charAt(i) == token) { FieldSchema fieldSchema = new FieldSchema(null, DataType.CHARARRAY); fieldSchemas.add(fieldSchema); } } // add last element after the token FieldSchema fieldSchema = new FieldSchema(null, DataType.CHARARRAY); fieldSchemas.add(fieldSchema); return new ResourceSchema(new Schema(fieldSchemas)); } @Override public String[] getPartitionKeys(String arg0, Job arg1) throws IOException { return null; } @Override public ResourceStatistics getStatistics(String arg0, Job arg1) throws IOException { return null; } @Override public void setPartitionFilter(Expression arg0) throws IOException { // not implemented } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy