All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.twitter.elephantbird.util.LzoUtils Maven / Gradle / Ivy

There is a newer version: 4.17
Show newest version
package com.twitter.elephantbird.util;

import java.io.DataOutputStream;
import java.io.IOException;
import java.io.OutputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.hadoop.compression.lzo.LzoIndex;
import com.hadoop.compression.lzo.LzoIndexer;
import com.hadoop.compression.lzo.LzopCodec;

/**
 * Miscellaneous lzo related utilities.
 */
public class LzoUtils {

  public static final Logger LOG = LoggerFactory.getLogger(LzoUtils.class);
  public static final String LZO_OUTPUT_INDEXABLE_MINSIZE =
      "elephantbird.lzo.output.indexable.minsize";
  public static final String LZO_OUTPUT_INDEX = "elephantbird.lzo.output.index";

  /**
   * A work-around to support environments with older versions of LzopCodec.
   * It might not be feasible for to select right version of hadoop-lzo
   * in some cases. This should be removed latest by EB-3.0.
   */
  private static boolean isLzopIndexSupported = false;
  static {
    try {
      isLzopIndexSupported =
        null != LzopCodec.class.getMethod("createIndexedOutputStream",
                                          OutputStream.class,
                                          DataOutputStream.class);
    } catch (Exception e) {
      // older version of hadoop-lzo.
    }
  }

  /**
   * Creates an lzop output stream. The index for the lzop is
   * also written to another at the same time if
   * elephantbird.lzo.output.index is set in configuration. 

* * If the file size at closing is not larger than a single block, * the index file is deleted (in line with {@link LzoIndexer} behavior). */ public static DataOutputStream getIndexedLzoOutputStream(Configuration conf, Path path) throws IOException { LzopCodec codec = new LzopCodec(); codec.setConf(conf); final Path file = path; final FileSystem fs = file.getFileSystem(conf); FSDataOutputStream fileOut = fs.create(file, false); FSDataOutputStream indexOut = null; if (conf.getBoolean(LZO_OUTPUT_INDEX, false)) { if ( isLzopIndexSupported ) { Path indexPath = file.suffix(LzoIndex.LZO_TMP_INDEX_SUFFIX); indexOut = fs.create(indexPath, false); } else { LOG.warn("elephantbird.lzo.output.index is enabled, but LzopCodec " + "does not have createIndexedOutputStream method. " + "Please upgrade hadoop-lzo."); } } final boolean isIndexed = indexOut != null; final long minIndexableSize = conf.getLong(LZO_OUTPUT_INDEXABLE_MINSIZE, -1L); OutputStream out = ( isIndexed ? codec.createIndexedOutputStream(fileOut, indexOut) : codec.createOutputStream(fileOut) ); return new DataOutputStream(out) { // override close() to handle renaming index file. public void close() throws IOException { super.close(); if ( isIndexed ) { // rename or remove the index file based on file size. Path tmpPath = file.suffix(LzoIndex.LZO_TMP_INDEX_SUFFIX); FileStatus stat = fs.getFileStatus(file); final long minSizeToIndex = minIndexableSize < 0 ? stat.getBlockSize() : minIndexableSize; if (stat.getLen() <= minSizeToIndex) { fs.delete(tmpPath, false); } else { fs.rename(tmpPath, file.suffix(LzoIndex.LZO_INDEX_SUFFIX)); } } } }; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy