All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.hadoop.mapreduce.LzoTextInputFormat Maven / Gradle / Ivy

Go to download

This project is a set of plugins for Apache Hadoop that provide access to the GPL'ed compression codecs.

The newest version!
/*
 * This file is part of Hadoop-Gpl-Compression.
 *
 * Hadoop-Gpl-Compression is free software: you can redistribute it
 * and/or modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * Hadoop-Gpl-Compression is distributed in the hope that it will be
 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Hadoop-Gpl-Compression.  If not, see
 * <http://www.gnu.org/licenses/>.
 */
package com.hadoop.mapreduce;

import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import com.hadoop.compression.lzo.LzopCodec;
import com.hadoop.compression.lzo.LzopCodec.LzopDecompressor;

/**
 * An {@link InputFormat} for lzop compressed text files. Files are broken into
 * lines. Either linefeed or carriage-return are used to signal end of line.
 * Keys are the position in the file, and values are the line of text.
 */
public class LzoTextInputFormat extends FileInputFormat<LongWritable, Text> {

  /** Suffix appended to an lzo file name to locate its block index file. */
  public static final String LZO_INDEX_SUFFIX = ".index";

  /**
   * Block index per lzo input file, populated by {@link #listStatus(JobContext)}.
   * NOTE(review): assumes listStatus/isSplitable/getSplits run on the same
   * instance and thread, as the Hadoop job client does — not thread-safe.
   */
  private final Map<Path, LzoIndex> indexes = new HashMap<Path, LzoIndex>();

  /**
   * Lists the job's input files, keeping only lzop-compressed ones, and loads
   * the block index for each. Files without an index file on disk get an
   * empty {@link LzoIndex} and are later treated as unsplittable.
   */
  @Override
  protected List<FileStatus> listStatus(JobContext job) throws IOException {
    List<FileStatus> files = super.listStatus(job);

    FileSystem fs = FileSystem.get(job.getConfiguration());
    String fileExtension = new LzopCodec().getDefaultExtension();

    for (Iterator<FileStatus> iterator = files.iterator(); iterator.hasNext();) {
      FileStatus fileStatus = iterator.next();
      Path file = fileStatus.getPath();

      if (!file.toString().endsWith(fileExtension)) {
        // get rid of non lzo files
        iterator.remove();
      } else {
        // read the index file
        LzoIndex index = readIndex(file, fs);
        indexes.put(file, index);
      }
    }

    return files;
  }

  /**
   * A file can be split only if a non-empty block index was loaded for it;
   * without an index we cannot align splits to lzo block boundaries.
   *
   * @throws IllegalStateException if the file was never indexed by
   *           {@link #listStatus(JobContext)} (the previous code would have
   *           failed here with an uninformative NullPointerException).
   */
  @Override
  protected boolean isSplitable(JobContext context, Path filename) {
    LzoIndex index = indexes.get(filename);
    if (index == null) {
      throw new IllegalStateException("Index not found for " + filename);
    }
    return !index.isEmpty();
  }

  /**
   * Computes the input splits, then realigns each split's start and end to
   * the nearest following lzo block boundary so a mapper never starts in the
   * middle of a compressed block.
   */
  @Override
  public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> splits = super.getSplits(job);
    // find new start/ends of the filesplit that aligns
    // with the lzo blocks

    List<InputSplit> result = new ArrayList<InputSplit>();
    FileSystem fs = FileSystem.get(job.getConfiguration());

    for (InputSplit genericSplit : splits) {
      // load the index
      FileSplit fileSplit = (FileSplit) genericSplit;
      Path file = fileSplit.getPath();
      LzoIndex index = indexes.get(file);
      if (index == null) {
        throw new IOException("Index not found for " + file);
      }

      if (index.isEmpty()) {
        // empty index: the file is unsplittable, keep the split as is
        result.add(fileSplit);
        continue;
      }

      long start = fileSplit.getStart();
      long end = start + fileSplit.getLength();

      if (start != 0) {
        // find the next block position from the start of the split
        long newStart = index.findNextPosition(start);
        if (newStart == -1 || newStart >= end) {
          // no block begins inside this split; the data is covered by the
          // preceding split, so drop this one entirely
          continue;
        }
        start = newStart;
      }

      // extend the split to the start of the next block past 'end', so the
      // block straddling the boundary belongs to exactly one split
      long newEnd = index.findNextPosition(end);
      if (newEnd != -1) {
        end = newEnd;
      } else {
        // didn't find the next position: we have hit the end of the file
        end = fs.getFileStatus(file).getLen();
      }

      result.add(new FileSplit(file, start, end - start, fileSplit
          .getLocations()));
    }

    return result;
  }

  /**
   * Read the block index of an lzo file.
   *
   * @param file
   *          Read the index of this file.
   * @param fs
   *          The index file is on this file system.
   * @return the parsed index, or an empty index (unsplittable fallback) when
   *         no index file exists.
   * @throws IOException if the index file cannot be read.
   */
  private LzoIndex readIndex(Path file, FileSystem fs) throws IOException {
    FSDataInputStream indexIn = null;
    try {
      Path indexFile = new Path(file.toString() + LZO_INDEX_SUFFIX);
      if (!fs.exists(indexFile)) {
        // return empty index, fall back to the unsplittable mode
        return new LzoIndex();
      }

      // the index is a flat sequence of 8-byte longs, one per lzo block
      long indexLen = fs.getFileStatus(indexFile).getLen();
      int blocks = (int) (indexLen / 8);
      LzoIndex index = new LzoIndex(blocks);
      indexIn = fs.open(indexFile);
      for (int i = 0; i < blocks; i++) {
        index.set(i, indexIn.readLong());
      }
      return index;
    } finally {
      if (indexIn != null) {
        indexIn.close();
      }
    }
  }

  /**
   * Index an lzo file to allow the input format to split it into separate map
   * tasks. Writes one long per compressed block (the byte offset of the block
   * header) to {@code <file>.index}, via a temporary file that is renamed
   * into place on success.
   *
   * @param fs
   *          File system that contains the file.
   * @param lzoFile
   *          the lzo file to index.
   * @throws IOException if the file is not lzop-compressed or cannot be read.
   */
  public static void createIndex(FileSystem fs, Path lzoFile)
      throws IOException {

    Configuration conf = fs.getConf();
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    CompressionCodec codec = factory.getCodec(lzoFile);
    if (codec == null) {
      // fail with a clear message instead of an NPE on the cast below
      throw new IOException("Could not find codec for file " + lzoFile);
    }
    ((Configurable) codec).setConf(conf);

    FSDataInputStream is = null;
    InputStream lzoIs = null;
    FSDataOutputStream os = null;
    Path outputFile = new Path(lzoFile.toString()
        + LzoTextInputFormat.LZO_INDEX_SUFFIX);
    Path tmpOutputFile = outputFile.suffix(".tmp");

    try {
      is = fs.open(lzoFile);
      os = fs.create(tmpOutputFile);
      LzopDecompressor decompressor = (LzopDecompressor) codec
          .createDecompressor();
      // wrapping the stream consumes the lzop file header, positioning the
      // raw stream at the first block
      lzoIs = codec.createInputStream(is, decompressor);

      int numChecksums = decompressor.getChecksumsCount();

      while (true) {
        // read and ignore, we just want to get to the next int
        int uncompressedBlockSize = is.readInt();
        if (uncompressedBlockSize == 0) {
          // a zero uncompressed size marks the end of the lzop stream
          break;
        } else if (uncompressedBlockSize < 0) {
          throw new EOFException();
        }

        int compressedBlockSize = is.readInt();
        if (compressedBlockSize <= 0) {
          throw new IOException("Could not read compressed block size");
        }

        long pos = is.getPos();
        // write the pos of the block start (rewind past the two ints read)
        os.writeLong(pos - 8);
        // seek to the start of the next block, skip any checksums
        is.seek(pos + compressedBlockSize + (4 * numChecksums));
      }
    } finally {
      if (lzoIs != null) {
        // closing the wrapper also closes the underlying input stream
        lzoIs.close();
      } else if (is != null) {
        // wrapping failed before lzoIs was assigned; close the raw stream
        is.close();
      }

      if (os != null) {
        os.close();
      }
    }

    if (!fs.rename(tmpOutputFile, outputFile)) {
      throw new IOException("Could not rename " + tmpOutputFile + " to "
          + outputFile);
    }
  }

  /**
   * Represents the lzo block index: the byte offset of each compressed
   * block's header, in ascending file order.
   */
  static class LzoIndex {

    private long[] blockPositions;

    /** Creates an empty index (unsplittable file). */
    LzoIndex() {
    }

    /**
     * Creates an index with room for the given number of blocks.
     *
     * @param blocks
     *          Number of blocks in the lzo file.
     */
    LzoIndex(int blocks) {
      blockPositions = new long[blocks];
    }

    /**
     * Set the position for the block.
     *
     * @param blockNumber
     *          Block to set pos for.
     * @param pos
     *          Position.
     */
    public void set(int blockNumber, long pos) {
      blockPositions[blockNumber] = pos;
    }

    /**
     * Find the next lzo block start from the given position.
     *
     * @param pos
     *          The position to start looking from.
     * @return Either the start position of the block or -1 if it couldn't be
     *         found.
     */
    public long findNextPosition(long pos) {
      int block = Arrays.binarySearch(blockPositions, pos);

      if (block >= 0) {
        // direct hit on a block start position
        return blockPositions[block];
      } else {
        // binarySearch returned -(insertionPoint) - 1; recover the index of
        // the first block at or after pos
        block = Math.abs(block) - 1;
        if (block > blockPositions.length - 1) {
          return -1;
        }
        return blockPositions[block];
      }
    }

    /** @return true when no block positions are known for the file. */
    public boolean isEmpty() {
      return blockPositions == null || blockPositions.length == 0;
    }

  }

  /**
   * Creates the line-oriented record reader for an lzo split. Keys are byte
   * offsets in the file, values are lines of text.
   */
  @Override
  public RecordReader<LongWritable, Text> createRecordReader(InputSplit split,
      TaskAttemptContext taskAttempt) throws IOException, InterruptedException {

    return new LzoLineRecordReader();
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy