All downloads are free. Search and download functionality uses the official Maven repository.

com.twitter.elephantbird.pig.load.LzoW3CLogLoader Maven / Gradle / Ivy

There is a newer version: 4.17
Show newest version
package com.twitter.elephantbird.pig.load;

import java.io.IOException;
import java.util.Map;

import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.pig.Expression;
import org.apache.pig.LoadMetadata;
import org.apache.pig.PigException;
import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceStatistics;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.Maps;
import com.twitter.elephantbird.mapreduce.input.LzoW3CLogInputFormat;
import com.twitter.elephantbird.mapreduce.input.LzoW3CLogRecordReader;


/**
 * Pig loader for LZO-compressed W3C log files. Each record produced by the underlying
 * {@link LzoW3CLogRecordReader} is handed to Pig as a single-field tuple whose only field
 * is a map from field name to field value (both converted to strings).
 */
public class LzoW3CLogLoader extends LzoBaseLoadFunc implements LoadMetadata {
  protected static final Logger LOG = LoggerFactory.getLogger(LzoW3CLogLoader.class);

  protected static final TupleFactory tupleFactory_ = TupleFactory.getInstance();

  /** URI passed through to {@link LzoW3CLogInputFormat#newInstance(String)}. */
  protected final String fileURI;

  /** Counters reported while decoding records in {@link #getNext()}. */
  protected enum LzoW3CLogLoaderCounters { LinesW3CDecoded, UnparseableLines }

  /**
   * Constructor.
   * @param fileURI path to HDFS file that contains the CRC to column list mappings, one per line.
   * @throws IOException declared for compatibility with existing callers; this constructor
   *         itself performs no I/O.
   */
  public LzoW3CLogLoader(String fileURI) throws IOException {
    LOG.debug("Initialize LzoW3CLogLoader from {}", fileURI);
    this.fileURI = fileURI;
  }

  /**
   * Reads the next record and returns it as a single-element tuple containing a
   * {@code Map<String, String>} of the record's fields. A field whose value is null in the
   * underlying record is kept in the map with a null value.
   *
   * @return the next tuple, or {@code null} when there is no reader, the reader has no
   *         further records, or the reader yields a null value.
   * @throws IOException if reading fails; an interrupted read is reported as an
   *         {@link ExecException} (error code 6018) after the thread's interrupt flag has
   *         been restored.
   */
  @Override
  public Tuple getNext() throws IOException {
    LzoW3CLogRecordReader w3CLogRecordReader = (LzoW3CLogRecordReader) reader;
    if (w3CLogRecordReader == null) {
      return null;
    }

    try {
      if (!w3CLogRecordReader.nextKeyValue()) {
        return null;
      }
      MapWritable fields = w3CLogRecordReader.getCurrentValue();
      if (fields == null) {
        return null;
      }

      // Convert the Writable-keyed record into plain strings for Pig.
      Map<String, String> values = Maps.newHashMap();
      for (Map.Entry<Writable, Writable> field : fields.entrySet()) {
        Writable fieldValue = field.getValue();
        values.put(field.getKey().toString(),
            fieldValue != null ? fieldValue.toString() : null);
      }

      incrCounter(LzoW3CLogLoaderCounters.LinesW3CDecoded, 1L);
      // NOTE(review): this adds the reader's bad-record count on every returned record.
      // If getBadRecordsSkipped() is a running total rather than a per-call delta, the
      // counter overcounts -- confirm against LzoW3CLogRecordReader before relying on it.
      incrCounter(LzoW3CLogLoaderCounters.UnparseableLines,
          w3CLogRecordReader.getBadRecordsSkipped());

      // newTuple(Object) wraps the map as the tuple's single field.
      return tupleFactory_.newTuple(values);
    } catch (InterruptedException e) {
      // Restore the interrupt flag so code further up the stack can still observe it.
      Thread.currentThread().interrupt();
      int errCode = 6018;
      String errMsg = "Error while reading input";
      throw new ExecException(errMsg, errCode,
          PigException.REMOTE_ENVIRONMENT, e);
    }
  }

  /**
   * Registers {@code location} as the input path(s) of {@code job}.
   */
  @Override
  public void setLocation(String location, Job job)
  throws IOException {
    FileInputFormat.setInputPaths(job, location);
  }

  /**
   * Returns a {@link LzoW3CLogInputFormat} created from the URI given to the constructor.
   */
  @SuppressWarnings("rawtypes")
  @Override
  public InputFormat getInputFormat() {
    return LzoW3CLogInputFormat.newInstance(fileURI);
  }

  /**
   * NOT IMPLEMENTED; always returns {@code null}.
   */
  @Override
  public String[] getPartitionKeys(String arg0, Job arg1) throws IOException {
    return null;
  }

  /**
   * Returns a schema with a single field named {@code data} of type {@link DataType#MAP}.
   * Both arguments are ignored.
   */
  @Override
  public ResourceSchema getSchema(String arg0, Job arg1) throws IOException {
    return new ResourceSchema(new Schema(new FieldSchema("data", DataType.MAP)));
  }

  /**
   * NOT IMPLEMENTED; always returns {@code null}.
   */
  @Override
  public ResourceStatistics getStatistics(String arg0, Job arg1) throws IOException {
    return null;
  }

  /**
   * NOT IMPLEMENTED; the filter expression is ignored.
   */
  @Override
  public void setPartitionFilter(Expression arg0) throws IOException {

  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy