All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.twitter.elephantbird.pig.store.LzoPigStorage Maven / Gradle / Ivy

There is a newer version: 4.17
Show newest version
package com.twitter.elephantbird.pig.store;

import java.io.DataOutputStream;
import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.pig.builtin.PigStorage;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.util.StorageUtil;

import com.twitter.elephantbird.mapreduce.input.LzoTextInputFormat;
import com.twitter.elephantbird.mapreduce.output.LzoOutputFormat;

/**
 * A wrapper for {@link PigStorage} to enable LZO compression.
 * LzoTextInputFormat is used for loading since PigStorage
 * can not split lzo files.
 * LzoTextOutputFormat is used for storage so that lzo index files
 * can be written at the same time.
 *
 * This is similar to:
 * 
 *   set output.compression.enabled true;
 *   set output.compression.codec com.hadoop.compression.lzo.LzopCodec;
 *   store/load ... using PigStorage();
 * 
*/ public class LzoPigStorage extends PigStorage { private String delimiter = null; // temporary for outpupt format public LzoPigStorage() { super(); } public LzoPigStorage(String delimiter) { super(delimiter); this.delimiter = delimiter; } @Override public InputFormat getInputFormat() { // PigStorage can handle lzo files, but cannot split them. return new LzoTextInputFormat(); } @Override public OutputFormat getOutputFormat() { // LzoOutputFormat can write lzo index file. // LzoTextInputFormat can't be used here. return new TupleOutputFormat(delimiter); } // This is a temporary work around for PigStorage since // it writes a Tuple to outputformat rather than Text. // This may change soon and we can use LzoTextOutputFormat directly. protected static class TupleOutputFormat extends LzoOutputFormat { private byte fieldDel; public TupleOutputFormat(String delimiter) { this.fieldDel = delimiter == null ? (byte)'\t' : StorageUtil.parseFieldDel(delimiter); } @Override public RecordWriter getRecordWriter( TaskAttemptContext job) throws IOException, InterruptedException { final DataOutputStream out = getOutputStream(job); return new RecordWriter() { public void close(TaskAttemptContext context) throws IOException, InterruptedException { out.close(); } public void write(NullWritable key, Tuple value) throws IOException, InterruptedException { int sz = value.size(); for (int i = 0; i < sz; i++) { StorageUtil.putField(out, value.get(i)); if (i != sz - 1) { out.writeByte(fieldDel); } } out.write('\n'); } }; } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy