com.twitter.elephantbird.util.LzoUtils Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of elephant-bird-core Show documentation
Core utilities.
There is a newer version: 4.17
package com.twitter.elephantbird.util;

import java.io.DataOutputStream;
import java.io.IOException;
import java.io.OutputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.hadoop.compression.lzo.LzoIndex;
import com.hadoop.compression.lzo.LzoIndexer;
import com.hadoop.compression.lzo.LzopCodec;

/**
 * Miscellaneous lzo related utilities.
 */
public class LzoUtils {

  public static final Logger LOG = LoggerFactory.getLogger(LzoUtils.class);

  /**
   * Configuration key (long) for the data file size threshold used when
   * deciding whether to keep the index. A file whose length is not larger
   * than this value gets no index. When unset or negative, the block size
   * reported for the data file is used as the threshold.
   */
  public static final String LZO_OUTPUT_INDEXABLE_MINSIZE =
      "elephantbird.lzo.output.indexable.minsize";

  /**
   * Configuration key (boolean, default false) that enables writing an
   * lzo index alongside the data file.
   */
  public static final String LZO_OUTPUT_INDEX = "elephantbird.lzo.output.index";

  /**
   * A work-around to support environments with older versions of LzopCodec.
   * It might not be feasible to select the right version of hadoop-lzo
   * in some cases. This should be removed by EB-3.0 at the latest.
   */
  private static final boolean isLzopIndexSupported;
  static {
    boolean supported = false;
    try {
      // getMethod() never returns null: it either finds the method or throws.
      LzopCodec.class.getMethod("createIndexedOutputStream",
                                OutputStream.class,
                                DataOutputStream.class);
      supported = true;
    } catch (Exception e) {
      // older version of hadoop-lzo (or reflection is not permitted).
      LOG.debug("LzopCodec.createIndexedOutputStream is not available", e);
    }
    isLzopIndexSupported = supported;
  }

  /**
   * Creates an lzop output stream. The index for the lzop file is
   * also written to a separate temporary index file at the same time if
   * elephantbird.lzo.output.index is set in configuration.
   *
   * When the returned stream is closed, the temporary index file is either
   * renamed to its final name or, if the data file is not larger than the
   * indexable minimum size (by default a single block), deleted
   * (in line with {@link LzoIndexer} behavior). Failing to rename or delete
   * the index is logged but does not fail the close, since the data file
   * itself has already been written.
   *
   * If anything fails while the streams are being set up, the streams
   * opened so far are closed before the exception is propagated.
   *
   * @param conf configuration used to set up the codec and file system
   * @param path path of the lzop file to create; it must not exist yet
   * @return a stream that lzop-compresses everything written to it
   * @throws IOException if the data or index file can not be created
   */
  public static DataOutputStream
  getIndexedLzoOutputStream(Configuration conf, Path path) throws IOException {

    LzopCodec codec = new LzopCodec();
    codec.setConf(conf);

    final Path file = path;
    final FileSystem fs = file.getFileSystem(conf);
    // read before any file is created so that a malformed value
    // does not leave open streams behind.
    final long minIndexableSize = conf.getLong(LZO_OUTPUT_INDEXABLE_MINSIZE,
        -1L);

    FSDataOutputStream fileOut = null;
    FSDataOutputStream indexOut = null;
    OutputStream out = null;
    boolean success = false;
    try {
      fileOut = fs.create(file, false);

      if (conf.getBoolean(LZO_OUTPUT_INDEX, false)) {
        if ( isLzopIndexSupported ) {
          Path indexPath = file.suffix(LzoIndex.LZO_TMP_INDEX_SUFFIX);
          indexOut = fs.create(indexPath, false);
        } else {
          LOG.warn("elephantbird.lzo.output.index is enabled, but LzopCodec "
              + "does not have createIndexedOutputStream method. "
              + "Please upgrade hadoop-lzo.");
        }
      }

      out = ( indexOut != null ?
          codec.createIndexedOutputStream(fileOut, indexOut) :
          codec.createOutputStream(fileOut) );
      success = true;
    } finally {
      if ( !success ) {
        // do not leak the raw streams when setup fails half way.
        closeQuietly(indexOut);
        closeQuietly(fileOut);
      }
    }

    final boolean isIndexed = indexOut != null;

    return new DataOutputStream(out) {
      // override close() to handle renaming index file.

      // guards against handling the index more than once; closing an
      // already closed stream must have no effect.
      private boolean closed = false;

      @Override
      public void close() throws IOException {
        if ( closed ) {
          return;
        }
        closed = true;

        super.close();

        if ( isIndexed ) {
          // rename or remove the index file based on file size.

          Path tmpPath = file.suffix(LzoIndex.LZO_TMP_INDEX_SUFFIX);
          FileStatus stat = fs.getFileStatus(file);
          final long minSizeToIndex = minIndexableSize < 0
              ? stat.getBlockSize()
              : minIndexableSize;

          if (stat.getLen() <= minSizeToIndex) {
            if ( !fs.delete(tmpPath, false) ) {
              LOG.warn("Could not delete temporary lzo index {}", tmpPath);
            }
          } else {
            Path indexPath = file.suffix(LzoIndex.LZO_INDEX_SUFFIX);
            if ( !fs.rename(tmpPath, indexPath) ) {
              LOG.warn("Could not rename temporary lzo index {} to {}",
                  tmpPath, indexPath);
            }
          }
        }
      }
    };
  }

  /**
   * Closes the stream if it is not null, logging rather than throwing any
   * failure. Used only on error paths, where the original exception is the
   * one that should reach the caller.
   */
  private static void closeQuietly(OutputStream stream) {
    if (stream == null) {
      return;
    }
    try {
      stream.close();
    } catch (IOException e) {
      LOG.debug("Ignoring failure while closing stream after an error", e);
    }
  }

}