// org.apache.hadoop.mapred.gridmix.CompressionEmulationUtil Maven / Gradle / Ivy
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.mapred.gridmix;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.CompressionInputStream;
import org.apache.hadoop.io.compress.Decompressor;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Utils;
import org.apache.hadoop.mapred.gridmix.GenerateData.DataStatistics;
import org.apache.hadoop.mapred.gridmix.GenerateData.GenDataFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.StringUtils;
/**
* This is a utility class for all the compression related modules.
*/
class CompressionEmulationUtil {
/** Logger used by the static helpers of this class. */
static final Log LOG = LogFactory.getLog(CompressionEmulationUtil.class);
/**
* Configuration key that enables compression emulation in GridMix runs.
* {@link #isCompressionEmulationEnabled(Configuration)} reads it with a
* default of {@code true}.
*/
private static final String COMPRESSION_EMULATION_ENABLE =
"gridmix.compression-emulation.enable";
/**
* Configuration key that enables input data decompression emulation.
* {@link #isInputCompressionEmulationEnabled(Configuration)} reads it with a
* default of {@code false}.
*/
private static final String INPUT_DECOMPRESSION_EMULATION_ENABLE =
"gridmix.compression-emulation.input-decompression.enable";
/**
* Configuration property for setting the compression ratio for map input
* data.
*/
private static final String GRIDMIX_MAP_INPUT_COMPRESSION_RATIO =
"gridmix.compression-emulation.map-input.decompression-ratio";
/**
* Configuration property for setting the compression ratio of map output.
*/
private static final String GRIDMIX_MAP_OUTPUT_COMPRESSION_RATIO =
"gridmix.compression-emulation.map-output.compression-ratio";
/**
* Configuration property for setting the compression ratio of job output.
*/
private static final String GRIDMIX_JOB_OUTPUT_COMPRESSION_RATIO =
"gridmix.compression-emulation.job-output.compression-ratio";
/**
* Default compression ratio, returned by the ratio getters of this class when
* the corresponding property is not set in the configuration.
*/
static final float DEFAULT_COMPRESSION_RATIO = 0.5F;
/**
* Shared lookup table that maps a compression ratio to the word size used
* when constructing a {@link RandomTextDataGenerator}.
*/
private static final CompressionRatioLookupTable COMPRESSION_LOOKUP_TABLE =
new CompressionRatioLookupTable();
/**
* Charset used by {@link RandomTextDataMapper} to measure the byte length of
* the generated keys and values.
*/
private static final Charset charsetUTF8 = Charset.forName("UTF-8");
/**
 * A {@link Mapper} implementation that generates random text data.
 * It uses {@link RandomTextDataGenerator} to produce the words; the job set up
 * by {@code configure(Job)} writes the resulting output files compressed.
 *
 * <p>The input key is ignored; the input value carries the number of bytes of
 * text that a single {@code map} call should emit.
 */
public static class RandomTextDataMapper
    extends Mapper<NullWritable, LongWritable, Text, Text> {
  /** Word source, created in {@link #setup(Context)} from the job config. */
  private RandomTextDataGenerator rtg;

  /**
   * Builds the {@link RandomTextDataGenerator} using the list size and word
   * size stored in the task's configuration.
   */
  @Override
  protected void setup(Context context)
      throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    int listSize =
        RandomTextDataGenerator.getRandomTextDataGeneratorListSize(conf);
    int wordSize =
        RandomTextDataGenerator.getRandomTextDataGeneratorWordSize(conf);
    rtg = new RandomTextDataGenerator(listSize, wordSize);
  }

  /**
   * Emits a sequence of random word pairs of the desired total size. Note that
   * the desired output size is passed as the value parameter to this map.
   *
   * @param key ignored
   * @param value number of bytes of text to generate
   * @param context task context that receives the generated pairs
   */
  @Override
  public void map(NullWritable key, LongWritable value, Context context)
      throws IOException, InterruptedException {
    //TODO Control the extra data written ..
    //TODO Should the key\tvalue\n be considered for measuring size?
    //     Can counters like BYTES_WRITTEN be used? What will be the value of
    //     such counters in LocalJobRunner?
    for (long bytes = value.get(); bytes > 0;) {
      String randomKey = rtg.getRandomWord();
      String randomValue = rtg.getRandomWord();
      context.write(new Text(randomKey), new Text(randomValue));
      // only the UTF-8 payload of key and value counts against the budget;
      // separators added by the output format are not accounted for
      bytes -= (randomValue.getBytes(charsetUTF8).length +
                randomKey.getBytes(charsetUTF8).length);
    }
  }
}
/**
 * Prepares the given {@link Job} for generating compressed random text:
 * a map-only job running {@link RandomTextDataMapper} over
 * {@link GenDataFormat} with compressed {@link Text}/{@link Text} output.
 *
 * @param job the job to configure
 */
static void configure(final Job job) throws IOException, InterruptedException,
    ClassNotFoundException {
  // job packaging and input side
  job.setJarByClass(GenerateData.class);
  job.setInputFormatClass(GenDataFormat.class);

  // map-only job driven by the random text mapper
  job.setMapperClass(RandomTextDataMapper.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Text.class);
  job.setNumReduceTasks(0);

  // the generated files must be written compressed
  FileOutputFormat.setCompressOutput(job, true);

  // register a placeholder input path; a failure here is only logged
  final Path placeholderInput = new Path("ignored");
  try {
    FileInputFormat.addInputPath(job, placeholderInput);
  } catch (IOException e) {
    LOG.error("Error while adding input path ", e);
  }
}
/**
 * This is the lookup table for mapping compression ratio to the size of the
 * word in the {@link RandomTextDataGenerator}'s dictionary.
 *
 * Note that this table is computed (empirically) using a dictionary of
 * default length i.e {@value RandomTextDataGenerator#DEFAULT_LIST_SIZE}.
 */
private static class CompressionRatioLookupTable {
  /** Compression ratio (two decimal places) to dictionary word size. */
  private static final Map<Float, Integer> map =
      new HashMap<Float, Integer>(60);
  /** Smallest ratio that has an entry in the table. */
  private static final float MIN_RATIO = 0.07F;
  /** Largest ratio that has an entry in the table. */
  private static final float MAX_RATIO = 0.68F;

  // add the empirically obtained data points in the lookup table
  CompressionRatioLookupTable() {
    map.put(.07F,30);
    map.put(.08F,25);
    map.put(.09F,60);
    map.put(.10F,20);
    map.put(.11F,70);
    map.put(.12F,15);
    map.put(.13F,80);
    map.put(.14F,85);
    map.put(.15F,90);
    map.put(.16F,95);
    map.put(.17F,100);
    map.put(.18F,105);
    map.put(.19F,110);
    map.put(.20F,115);
    map.put(.21F,120);
    map.put(.22F,125);
    map.put(.23F,130);
    map.put(.24F,140);
    map.put(.25F,145);
    map.put(.26F,150);
    map.put(.27F,155);
    map.put(.28F,160);
    map.put(.29F,170);
    map.put(.30F,175);
    map.put(.31F,180);
    map.put(.32F,190);
    map.put(.33F,195);
    map.put(.34F,205);
    map.put(.35F,215);
    map.put(.36F,225);
    map.put(.37F,230);
    map.put(.38F,240);
    map.put(.39F,250);
    map.put(.40F,260);
    map.put(.41F,270);
    map.put(.42F,280);
    map.put(.43F,295);
    map.put(.44F,310);
    map.put(.45F,325);
    map.put(.46F,335);
    map.put(.47F,355);
    map.put(.48F,375);
    map.put(.49F,395);
    map.put(.50F,420);
    map.put(.51F,440);
    map.put(.52F,465);
    map.put(.53F,500);
    map.put(.54F,525);
    map.put(.55F,550);
    map.put(.56F,600);
    map.put(.57F,640);
    map.put(.58F,680);
    map.put(.59F,734);
    map.put(.60F,813);
    map.put(.61F,905);
    map.put(.62F,1000);
    map.put(.63F,1055);
    map.put(.64F,1160);
    map.put(.65F,1355);
    map.put(.66F,1510);
    map.put(.67F,1805);
    map.put(.68F,2170);
  }

  /**
   * Returns the size of the word in {@link RandomTextDataGenerator}'s
   * dictionary that can generate text with the desired compression ratio.
   * The ratio is first rounded to two decimal places, which is the
   * granularity of the table keys.
   *
   * @param ratio desired compression ratio
   * @return the word size registered for the rounded ratio
   * @throws RuntimeException If ratio is less than {@value #MIN_RATIO} or
   *                          greater than {@value #MAX_RATIO}.
   */
  int getWordSizeForRatio(float ratio) {
    ratio = standardizeCompressionRatio(ratio);
    if (ratio < MIN_RATIO || ratio > MAX_RATIO) {
      throw new RuntimeException("Compression ratio should be in the range ["
          + MIN_RATIO + "," + MAX_RATIO + "]. Configured compression ratio is "
          + ratio + ".");
    }
    Integer wordSize = map.get(ratio);
    if (wordSize == null) {
      // every two-decimal ratio in [MIN_RATIO, MAX_RATIO] is registered in the
      // constructor, so this only guards against a gap in the table instead
      // of failing with a bare NullPointerException on unboxing
      throw new RuntimeException("No word size is registered for compression"
          + " ratio " + ratio + ".");
    }
    return wordSize;
  }
}
/**
 * Configures the data generator so that it produces compressible random text
 * with the map input compression ratio set in {@code conf}. Does nothing when
 * compression emulation is disabled.
 *
 * <p>The word size is taken from the empirical lookup table for the
 * configured ratio. Because that table was computed with the default
 * dictionary size, the list size is reset to
 * {@value RandomTextDataGenerator#DEFAULT_LIST_SIZE}.
 *
 * @param conf configuration to read the ratio from and to write the
 *             generator settings to
 */
static void setupDataGeneratorConfig(Configuration conf) {
  if (!isCompressionEmulationEnabled(conf)) {
    return;
  }

  final float inputRatio = getMapInputCompressionEmulationRatio(conf);
  LOG.info("GridMix is configured to generate compressed input data with "
      + " a compression ratio of " + inputRatio);

  // word size that empirically yields the requested ratio
  final int dictionaryWordSize =
      COMPRESSION_LOOKUP_TABLE.getWordSizeForRatio(inputRatio);
  RandomTextDataGenerator.setRandomTextDataGeneratorWordSize(conf,
      dictionaryWordSize);

  // the lookup table is only valid for the default list size
  RandomTextDataGenerator.setRandomTextDataGeneratorListSize(conf,
      RandomTextDataGenerator.DEFAULT_LIST_SIZE);
}
/**
 * Creates a {@link RandomTextDataGenerator} whose output is random text that
 * compresses with the desired ratio. The generator uses the default list size
 * and the word size that the lookup table holds for {@code ratio}.
 *
 * @param ratio desired compression ratio
 * @param seed seed passed to the generator
 */
static RandomTextDataGenerator getRandomTextDataGenerator(float ratio,
    long seed) {
  return new RandomTextDataGenerator(
      RandomTextDataGenerator.DEFAULT_LIST_SIZE,
      seed,
      COMPRESSION_LOOKUP_TABLE.getWordSizeForRatio(ratio));
}
/**
 * Logs statistics about the compressed input data found in {@code inputDir}
 * and returns them. The following statistics are logged:
 * <ul>
 *   <li>Total compressed input data size</li>
 *   <li>Number of compressed input data files</li>
 *   <li>Compression ratio (only when {@code uncompressedDataSize} is
 *       positive)</li>
 * </ul>
 *
 * @param inputDir directory holding the input data files
 * @param conf configuration used to resolve the file system and the codecs
 * @param uncompressedDataSize size of the data before compression; a value
 *                             that is not positive suppresses the ratio
 * @return statistics built from the compressed size and the file count
 * @throws RuntimeException if the directory contains no compressed file
 */
static DataStatistics publishCompressedDataStatistics(Path inputDir,
    Configuration conf, long uncompressedDataSize)
    throws IOException {
  final FileSystem fs = inputDir.getFileSystem(conf);
  final CompressionCodecFactory codecFactory =
      new CompressionCodecFactory(conf);

  // walk the data files and account for those a codec recognizes
  long totalCompressedBytes = 0;
  int compressedFileCount = 0;
  final FileStatus[] dataFiles =
      fs.listStatus(inputDir, new Utils.OutputFileUtils.OutputFilesFilter());
  for (FileStatus dataFile : dataFiles) {
    if (codecFactory.getCodec(dataFile.getPath()) == null) {
      continue; // not a compressed file
    }
    compressedFileCount++;
    totalCompressedBytes += dataFile.getLen();
  }

  LOG.info("Gridmix is configured to use compressed input data.");
  LOG.info("Total size of compressed input data : "
      + StringUtils.humanReadableInt(totalCompressedBytes));
  LOG.info("Total number of compressed input data files : "
      + compressedFileCount);

  if (compressedFileCount == 0) {
    throw new RuntimeException("No compressed file found in the input"
        + " directory : " + inputDir.toString() + ". To enable compression"
        + " emulation, run Gridmix either with "
        + " an input directory containing compressed input file(s) or"
        + " use the -generate option to (re)generate it. If compression"
        + " emulation is not desired, disable it by setting '"
        + COMPRESSION_EMULATION_ENABLE + "' to 'false'.");
  }

  // the ratio is only meaningful when the data was generated in this run
  if (uncompressedDataSize > 0) {
    final double achievedRatio =
        ((double) totalCompressedBytes) / uncompressedDataSize;
    LOG.info("Input Data Compression Ratio : " + achievedRatio);
  }

  return new DataStatistics(totalCompressedBytes, compressedFileCount, true);
}
/**
* Enables/Disables compression emulation by storing {@code val} under
* {@value #COMPRESSION_EMULATION_ENABLE}.
* @param conf Target configuration where the parameter
* {@value #COMPRESSION_EMULATION_ENABLE} will be set.
* @param val {@code true} to enable compression emulation, {@code false} to
* disable it.
*/
static void setCompressionEmulationEnabled(Configuration conf, boolean val) {
conf.setBoolean(COMPRESSION_EMULATION_ENABLE, val);
}
/**
 * Tells whether compression emulation is switched on in {@code conf}.
 * When the property is absent the answer is {@code true}.
 *
 * @param conf configuration to inspect
 */
static boolean isCompressionEmulationEnabled(Configuration conf) {
  final boolean enabledByDefault = true;
  return conf.getBoolean(COMPRESSION_EMULATION_ENABLE, enabledByDefault);
}
/**
* Enables/Disables input decompression emulation by storing {@code val}
* under {@value #INPUT_DECOMPRESSION_EMULATION_ENABLE}.
* @param conf Target configuration where the parameter
* {@value #INPUT_DECOMPRESSION_EMULATION_ENABLE} will be set.
* @param val {@code true} to enable input decompression emulation,
* {@code false} to disable it.
*/
static void setInputCompressionEmulationEnabled(Configuration conf,
boolean val) {
conf.setBoolean(INPUT_DECOMPRESSION_EMULATION_ENABLE, val);
}
/**
 * Tells whether input decompression emulation is switched on in
 * {@code conf}. When the property is absent the answer is {@code false}.
 *
 * @param conf configuration to inspect
 */
static boolean isInputCompressionEmulationEnabled(Configuration conf) {
  final boolean enabledByDefault = false;
  return conf.getBoolean(INPUT_DECOMPRESSION_EMULATION_ENABLE,
      enabledByDefault);
}
/**
* Set the map input data compression ratio in the given conf.
* @param conf Target configuration where the parameter
* {@value #GRIDMIX_MAP_INPUT_COMPRESSION_RATIO} will be set.
* @param ratio The compression ratio to store; it is written as given,
* without any validation or rounding.
*/
static void setMapInputCompressionEmulationRatio(Configuration conf,
float ratio) {
conf.setFloat(GRIDMIX_MAP_INPUT_COMPRESSION_RATIO, ratio);
}
/**
 * Reads the map input data compression ratio from the given configuration.
 * Falls back to {@value #DEFAULT_COMPRESSION_RATIO} when the ratio has not
 * been configured.
 *
 * @param conf configuration to read from
 */
static float getMapInputCompressionEmulationRatio(Configuration conf) {
  final float configuredRatio =
      conf.getFloat(GRIDMIX_MAP_INPUT_COMPRESSION_RATIO,
          DEFAULT_COMPRESSION_RATIO);
  return configuredRatio;
}
/**
* Set the map output data compression ratio in the given configuration.
* @param conf Target configuration where the parameter
* {@value #GRIDMIX_MAP_OUTPUT_COMPRESSION_RATIO} will be set.
* @param ratio The compression ratio to store; it is written as given,
* without any validation or rounding.
*/
static void setMapOutputCompressionEmulationRatio(Configuration conf,
float ratio) {
conf.setFloat(GRIDMIX_MAP_OUTPUT_COMPRESSION_RATIO, ratio);
}
/**
 * Reads the map output data compression ratio from the given configuration.
 * Falls back to {@value #DEFAULT_COMPRESSION_RATIO} when the ratio has not
 * been configured.
 *
 * @param conf configuration to read from
 */
static float getMapOutputCompressionEmulationRatio(Configuration conf) {
  final float configuredRatio =
      conf.getFloat(GRIDMIX_MAP_OUTPUT_COMPRESSION_RATIO,
          DEFAULT_COMPRESSION_RATIO);
  return configuredRatio;
}
/**
* Set the job output data compression ratio in the given configuration.
* @param conf Target configuration where the parameter
* {@value #GRIDMIX_JOB_OUTPUT_COMPRESSION_RATIO} will be set.
* @param ratio The compression ratio to store; it is written as given,
* without any validation or rounding.
*/
static void setJobOutputCompressionEmulationRatio(Configuration conf,
float ratio) {
conf.setFloat(GRIDMIX_JOB_OUTPUT_COMPRESSION_RATIO, ratio);
}
/**
 * Reads the job output data compression ratio from the given configuration.
 * Falls back to {@value #DEFAULT_COMPRESSION_RATIO} when the ratio has not
 * been configured.
 *
 * @param conf configuration to read from
 */
static float getJobOutputCompressionEmulationRatio(Configuration conf) {
  final float configuredRatio =
      conf.getFloat(GRIDMIX_JOB_OUTPUT_COMPRESSION_RATIO,
          DEFAULT_COMPRESSION_RATIO);
  return configuredRatio;
}
/**
 * Standardizes a compression ratio by rounding it to two decimal places
 * (hundredths), e.g. {@code 0.123} becomes {@code 0.12}.
 *
 * @param ratio the ratio to standardize
 * @return the ratio rounded to the nearest hundredth
 */
static float standardizeCompressionRatio(float ratio) {
  // Math.round(float) already yields an int number of hundredths
  final int hundredths = Math.round(ratio * 100);
  return hundredths / 100F;
}
/**
 * Returns a {@link InputStream} for a file that might be compressed.
 *
 * <p>When both compression emulation and input decompression emulation are
 * enabled, a codec matches the file and a pooled {@link Decompressor} is
 * available, the returned stream decompresses the file from its beginning;
 * {@code offset} is not applied in that case. In every other case the raw
 * file stream is returned, positioned at {@code offset}.
 *
 * <p>If wrapping or positioning the stream fails, the underlying file stream
 * is closed and the decompressor is handed back to the {@link CodecPool}
 * before the failure propagates.
 *
 * @param file the file to open
 * @param conf configuration used to resolve the file system and the codecs
 * @param offset position to seek to when the file is read uncompressed
 * @return a stream over the (possibly decompressed) file contents
 * @throws IOException if the file cannot be opened, wrapped or positioned
 */
static InputStream getPossiblyDecompressedInputStream(Path file,
    Configuration conf, long offset) throws IOException {
  FileSystem fs = file.getFileSystem(conf);
  if (isCompressionEmulationEnabled(conf)
      && isInputCompressionEmulationEnabled(conf)) {
    CompressionCodecFactory compressionCodecs =
        new CompressionCodecFactory(conf);
    CompressionCodec codec = compressionCodecs.getCodec(file);
    if (codec != null) {
      Decompressor decompressor = CodecPool.getDecompressor(codec);
      // NOTE(review): with a null decompressor the file is read raw below;
      // confirm whether that fallback is intended
      if (decompressor != null) {
        FSDataInputStream raw = null;
        boolean wrapped = false;
        try {
          raw = fs.open(file);
          CompressionInputStream in =
              codec.createInputStream(raw, decompressor);
          //TODO Seek doesnt work with compressed input stream.
          //     Use SplittableCompressionCodec?
          wrapped = true;
          return in;
        } finally {
          if (!wrapped) {
            // do not leak the pooled decompressor or the open file
            CodecPool.returnDecompressor(decompressor);
            closeAfterFailedOpen(raw);
          }
        }
      }
    }
  }
  FSDataInputStream in = fs.open(file);
  boolean positioned = false;
  try {
    in.seek(offset);
    positioned = true;
    return in;
  } finally {
    if (!positioned) {
      closeAfterFailedOpen(in);
    }
  }
}

/**
 * Closes a stream whose setup failed. A failure to close is only logged so
 * that it does not mask the exception that is already propagating.
 */
private static void closeAfterFailedOpen(InputStream stream) {
  if (stream == null) {
    return;
  }
  try {
    stream.close();
  } catch (IOException e) {
    LOG.warn("Error while closing input stream after a failed open", e);
  }
}
/**
* Returns a {@link OutputStream} for a file that might need
* compression.
*/
static OutputStream getPossiblyCompressedOutputStream(Path file,
Configuration conf)
throws IOException {
FileSystem fs = file.getFileSystem(conf);
JobConf jConf = new JobConf(conf);
if (org.apache.hadoop.mapred.FileOutputFormat.getCompressOutput(jConf)) {
// get the codec class
Class extends CompressionCodec> codecClass =
org.apache.hadoop.mapred.FileOutputFormat
.getOutputCompressorClass(jConf,
GzipCodec.class);
// get the codec implementation
CompressionCodec codec = ReflectionUtils.newInstance(codecClass, conf);
// add the appropriate extension
file = file.suffix(codec.getDefaultExtension());
if (isCompressionEmulationEnabled(conf)) {
FSDataOutputStream fileOut = fs.create(file, false);
return new DataOutputStream(codec.createOutputStream(fileOut));
}
}
return fs.create(file, false);
}
/**
 * Copies the compression/decompression related settings of {@code source}
 * into {@code target}: the job output compression flag, codec and type, and
 * the map output compression flag and codec. Input decompression emulation
 * is enabled on {@code target} if a codec matches at least one input path of
 * {@code source}.
 *
 * @param source configuration to read the settings from
 * @param target configuration to write the settings to
 */
static void configureCompressionEmulation(Configuration source,
    Configuration target) {
  // job output compression: flag, codec and type
  target.setBoolean(FileOutputFormat.COMPRESS,
      source.getBoolean(FileOutputFormat.COMPRESS, false));
  copyIfConfigured(source, target, FileOutputFormat.COMPRESS_CODEC);
  copyIfConfigured(source, target, FileOutputFormat.COMPRESS_TYPE);

  // map output compression: flag and codec
  target.setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS,
      source.getBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, false));
  copyIfConfigured(source, target, MRJobConfig.MAP_OUTPUT_COMPRESS_CODEC);

  // input decompression is needed as soon as one input path is compressed
  //TODO replace with mapInputBytes and hdfsBytesRead
  final Path[] inputPaths =
      org.apache.hadoop.mapred.FileInputFormat
          .getInputPaths(new JobConf(source));
  final CompressionCodecFactory codecFactory =
      new CompressionCodecFactory(source);
  boolean anyCompressedInput = false;
  for (Path inputPath : inputPaths) {
    if (codecFactory.getCodec(inputPath) != null) {
      anyCompressedInput = true;
    }
  }
  setInputCompressionEmulationEnabled(target, anyCompressedInput);
}

/**
 * Copies the value stored under {@code key} from {@code source} to
 * {@code target}, leaving {@code target} untouched when the key is unset.
 */
private static void copyIfConfigured(Configuration source,
    Configuration target, String key) {
  final String configured = source.get(key);
  if (configured != null) {
    target.set(key, configured);
  }
}
/**
 * Get the uncompressed input bytes count from the given possibly compressed
 * input bytes count.
 *
 * <p>The division is carried out in double precision. Dividing the
 * {@code long} count by the {@code float} ratio directly would first narrow
 * the count to {@code float}, whose 24-bit mantissa cannot represent byte
 * counts above 2^24 exactly.
 *
 * @param possiblyCompressedInputBytes input bytes count. This is compressed
 *                                     input size if compression emulation is
 *                                     on.
 * @param conf configuration of the Gridmix simulated job
 * @return uncompressed input bytes count. This is computed from the map input
 *         compression ratio if input decompression emulation is enabled;
 *         otherwise the given count is returned unchanged.
 */
static long getUncompressedInputBytes(long possiblyCompressedInputBytes,
    Configuration conf) {
  if (!CompressionEmulationUtil.isInputCompressionEmulationEnabled(conf)) {
    return possiblyCompressedInputBytes;
  }
  float inputCompressionRatio =
      CompressionEmulationUtil.getMapInputCompressionEmulationRatio(conf);
  // widen both operands to double so large byte counts keep their low bits
  return (long) (possiblyCompressedInputBytes
      / (double) inputCompressionRatio);
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy