com.twitter.elephantbird.pig.load.LocationAsTuple Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of elephant-bird-pig Show documentation
Pig utilities.
There is a newer version: 4.17
package com.twitter.elephantbird.pig.load;

import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import java.util.Properties;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.pig.Expression;
import org.apache.pig.LoadFunc;
import org.apache.pig.LoadMetadata;
import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceStatistics;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
import org.apache.pig.impl.util.StorageUtil;
import org.apache.pig.impl.util.UDFContext;

import com.google.common.collect.Lists;
import com.twitter.elephantbird.mapreduce.input.IntegerListInputFormat;

/**
 * Parses the "location" into a tuple by splitting on a delimiter, and returns it.
 * Handy for turning scalars into relations. For example:
 * {@code
 * languages = load 'en,fr,jp' using LocationAsTuple(',');
 * -- languages is ('en', 'fr', 'jp')
 * language_bag = foreach languages generate flatten(TOBAG(*));
 * -- language_bag is a relation with three rows, ('en'), ('fr'), ('jp')
 * }
 */
public class LocationAsTuple extends LoadFunc implements LoadMetadata {

  public static final String DATA_PROP = "data";
  final byte token;
  boolean returned = false;
  private String signature;

  public LocationAsTuple() {
    this("\t");
  }

  public LocationAsTuple(String s) {
     token = StorageUtil.parseFieldDel(s);
  }

  @SuppressWarnings("rawtypes")
  @Override
  public InputFormat getInputFormat() throws IOException {
    return new IntegerListInputFormat();
  }

  @Override
  public Tuple getNext() throws IOException {
    if (!returned) {
      returned = true;
      String dataString = getUDFProperties().getProperty(DATA_PROP);
      LinkedList tups = Lists.newLinkedList();
      int offset = 0;
      for (int i = 0; i < dataString.length(); i++) {
        if (dataString.charAt(i) == token) {
          tups.add(dataString.substring(offset, i));
          offset = i+1;
        }
      }
      tups.add(dataString.substring(offset, dataString.length()));
      return TupleFactory.getInstance().newTupleNoCopy(tups);
    } else {
      return null;
    }
  }

  @Override
  public void setUDFContextSignature(String signature) {
    this.signature = signature;
  }

  @Override
  public String relativeToAbsolutePath(String location, Path curDir)
  throws IOException {
      return location;
  }

  @Override
  public void setLocation(String location, Job job) throws IOException {
    Properties p = getUDFProperties();
    p.setProperty(DATA_PROP, location);
  }

  /**
   * Returns UDFProperties based on signature.
   */
  private Properties getUDFProperties() {
      return UDFContext.getUDFContext()
          .getUDFProperties(this.getClass(), new String[] {signature});
  }

  @Override
  public void prepareToRead(RecordReader arg0, PigSplit arg1) throws IOException {
    // do nothing.
  }

  @Override
  public ResourceSchema getSchema(String filename, Job job) throws IOException {
    List fieldSchemas = Lists.newArrayList();
    for (int i = 0; i < filename.length(); i++) {
      if (filename.charAt(i) == token) {
        FieldSchema fieldSchema = new FieldSchema(null, DataType.CHARARRAY);
        fieldSchemas.add(fieldSchema);
      }
    }
    // add last element after the token
    FieldSchema fieldSchema = new FieldSchema(null, DataType.CHARARRAY);
    fieldSchemas.add(fieldSchema);
    return new ResourceSchema(new Schema(fieldSchemas));
  }

  @Override
  public String[] getPartitionKeys(String arg0, Job arg1) throws IOException {
    return null;
  }

  @Override
  public ResourceStatistics getStatistics(String arg0, Job arg1) throws IOException {
    return null;
  }

  @Override
  public void setPartitionFilter(Expression arg0) throws IOException {
    // not implemented
  }

}