
// com.twitter.elephantbird.util.LzoUtils Maven / Gradle / Ivy
package com.twitter.elephantbird.util;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.hadoop.compression.lzo.LzoIndex;
import com.hadoop.compression.lzo.LzoIndexer;
import com.hadoop.compression.lzo.LzopCodec;
/**
* Miscellaneous lzo related utilities.
*/
public class LzoUtils {

  public static final Logger LOG = LoggerFactory.getLogger(LzoUtils.class);

  /**
   * Configuration key for the size threshold (in bytes) used when the output
   * stream is closed: a file whose length is not larger than this value has
   * its index discarded. When the key is unset or negative, the block size
   * reported for the file is used instead.
   */
  public static final String LZO_OUTPUT_INDEXABLE_MINSIZE =
      "elephantbird.lzo.output.indexable.minsize";

  /**
   * Configuration key (boolean, read with a default of {@code false}) that
   * enables writing an lzo index alongside the lzop output.
   */
  public static final String LZO_OUTPUT_INDEX = "elephantbird.lzo.output.index";

  /**
   * A work-around to support environments with older versions of LzopCodec.
   * It might not be feasible to select the right version of hadoop-lzo
   * in some cases. This should be removed by EB-3.0 at the latest.
   */
  private static boolean isLzopIndexSupported = false;
  static {
    try {
      // getMethod() never returns null; it throws when the method is absent,
      // so reaching the assignment means the method exists.
      LzopCodec.class.getMethod("createIndexedOutputStream",
                                OutputStream.class,
                                DataOutputStream.class);
      isLzopIndexSupported = true;
    } catch (Exception e) {
      // older version of hadoop-lzo: deliberately ignored, indexing stays off.
      LOG.debug("LzopCodec.createIndexedOutputStream is not available", e);
    }
  }

  /**
   * Creates an lzop output stream. The index for the lzop file is
   * also written to a temporary index file at the same time if
   * {@code elephantbird.lzo.output.index} is set in the configuration
   * (and the available LzopCodec supports indexed output).
   * <p>
   * When the returned stream is closed, the temporary index file is either
   * renamed to the final index name or deleted: it is deleted if the file
   * size at closing is not larger than
   * {@code elephantbird.lzo.output.indexable.minsize}, or, when that is
   * unset or negative, not larger than a single block
   * (in line with {@link LzoIndexer} behavior).
   * <p>
   * Closing the returned stream more than once has no further effect.
   *
   * @param conf configuration used to set up the codec and resolve the
   *             file system, and from which the index settings are read
   * @param path the lzop file to create; it is created without overwrite
   * @return a stream writing lzop-compressed data to {@code path}
   * @throws IOException if the file system cannot be resolved or an output
   *         stream cannot be created; any stream opened before the failure
   *         is closed
   */
  public static DataOutputStream
  getIndexedLzoOutputStream(Configuration conf, Path path) throws IOException {
    LzopCodec codec = new LzopCodec();
    codec.setConf(conf);

    final Path file = path;
    final FileSystem fs = file.getFileSystem(conf);
    final Path tmpIndexPath = file.suffix(LzoIndex.LZO_TMP_INDEX_SUFFIX);
    // Read before any file is created so a malformed value fails early.
    final long minIndexableSize = conf.getLong(LZO_OUTPUT_INDEXABLE_MINSIZE,
                                               -1L);

    FSDataOutputStream fileOut = fs.create(file, false);
    FSDataOutputStream indexOut = null;
    boolean success = false;
    try {
      if (conf.getBoolean(LZO_OUTPUT_INDEX, false)) {
        if (isLzopIndexSupported) {
          indexOut = fs.create(tmpIndexPath, false);
        } else {
          LOG.warn("elephantbird.lzo.output.index is enabled, but LzopCodec "
              + "does not have createIndexedOutputStream method. "
              + "Please upgrade hadoop-lzo.");
        }
      }

      final boolean isIndexed = indexOut != null;
      OutputStream out = isIndexed
          ? codec.createIndexedOutputStream(fileOut, indexOut)
          : codec.createOutputStream(fileOut);

      DataOutputStream result = new DataOutputStream(out) {
        // Guards against repeating the close and index handling.
        private boolean closed = false;

        // override close() to handle renaming index file.
        @Override
        public void close() throws IOException {
          if (closed) {
            return;
          }
          closed = true;
          super.close();
          if (!isIndexed) {
            return;
          }
          // rename or remove the index file based on file size.
          FileStatus stat = fs.getFileStatus(file);
          final long minSizeToIndex = minIndexableSize < 0
              ? stat.getBlockSize()
              : minIndexableSize;
          if (stat.getLen() <= minSizeToIndex) {
            if (!fs.delete(tmpIndexPath, false)) {
              LOG.warn("Could not delete temporary lzo index " + tmpIndexPath);
            }
          } else {
            Path indexPath = file.suffix(LzoIndex.LZO_INDEX_SUFFIX);
            if (!fs.rename(tmpIndexPath, indexPath)) {
              LOG.warn("Could not rename temporary lzo index " + tmpIndexPath
                  + " to " + indexPath);
            }
          }
        }
      };
      success = true;
      return result;
    } finally {
      if (!success) {
        // Construction failed part-way: do not leak the streams opened so far.
        closeQuietly(indexOut);
        closeQuietly(fileOut);
      }
    }
  }

  /** Closes the stream if non-null, logging rather than throwing on failure. */
  private static void closeQuietly(OutputStream stream) {
    if (stream == null) {
      return;
    }
    try {
      stream.close();
    } catch (IOException e) {
      LOG.warn("Failed to close stream during cleanup", e);
    }
  }
}