// hivemall.hcatalog.mapreduce.MultiOutputFormat Maven / Gradle / Ivy
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.hive.hcatalog.mapreduce;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.shims.HadoopShims;
import org.apache.hadoop.hive.shims.ShimLoader;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.JobStatus.State;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskInputOutputContext;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hive.hcatalog.common.HCatUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* The MultiOutputFormat class simplifies writing output data to multiple
* outputs.
*
* Multiple output formats can be defined, each with its own
* {@code OutputFormat} class, own key class and own value class. Any
* configuration on these output format classes can be done without interfering
* with other output format's configuration.
*
* Usage pattern for job submission:
*
*
*
* Job job = new Job();
*
* FileInputFormat.setInputPath(job, inDir);
*
* job.setMapperClass(WordCountMap.class);
* job.setReducerClass(WordCountReduce.class);
* job.setInputFormatClass(TextInputFormat.class);
* job.setOutputFormatClass(MultiOutputFormat.class);
* // Need not define OutputKeyClass and OutputValueClass. They default to
* // Writable.class
* job.setMapOutputKeyClass(Text.class);
* job.setMapOutputValueClass(IntWritable.class);
*
*
* // Create a JobConfigurer that will configure the job with the multiple
* // output format information.
* JobConfigurer configurer = MultiOutputFormat.createConfigurer(job);
*
* // Defines additional single text based output 'text' for the job.
* // Any configuration for the defined OutputFormat should be done with
* // the Job obtained with configurer.getJob() method.
* configurer.addOutputFormat("text", TextOutputFormat.class,
* IntWritable.class, Text.class);
* FileOutputFormat.setOutputPath(configurer.getJob("text"), textOutDir);
*
* // Defines additional sequence-file based output 'sequence' for the job
* configurer.addOutputFormat("sequence", SequenceFileOutputFormat.class,
* Text.class, IntWritable.class);
* FileOutputFormat.setOutputPath(configurer.getJob("sequence"), seqOutDir);
* ...
* // configure method to be called on the JobConfigurer once all the
* // output formats have been defined and configured.
* configurer.configure();
*
* job.waitForCompletion(true);
* ...
*
*
* Usage in Reducer:
*
*
* public class WordCountReduce extends
* Reducer<Text, IntWritable, Writable, Writable> {
*
* private IntWritable count = new IntWritable();
*
* public void reduce(Text word, Iterator<IntWritable> values,
* Context context)
* throws IOException {
* int sum = 0;
* for (IntWritable val : values) {
* sum += val.get();
* }
* count.set(sum);
* MultiOutputFormat.write("text", count, word, context);
* MultiOutputFormat.write("sequence", word, count, context);
* }
*
* }
*
*
*
* Map only jobs:
*
* MultiOutputFormat.write("output", key, value, context); can be called similar
* to a reducer in map only jobs.
*
*/
public class MultiOutputFormat extends OutputFormat {
private static final Logger LOGGER = LoggerFactory.getLogger(MultiOutputFormat.class.getName());
/** Job property that holds the comma separated list of configured aliases. */
private static final String MO_ALIASES = "mapreduce.multiout.aliases";
/** Prefix of the per-alias job property that carries the alias specific settings. */
private static final String MO_ALIAS = "mapreduce.multiout.alias";
/** Separates a key from its value inside the per-alias property. */
private static final String CONF_KEY_DELIM = "%%";
/** Separates consecutive key/value pairs inside the per-alias property. */
private static final String CONF_VALUE_DELIM = ";;";
private static final String COMMA_DELIM = ",";
/** Keys whose alias value is also written directly into the submitted job's configuration. */
private static final List<String> configsToOverride = new ArrayList<String>();
/** Keys whose alias value is merged into the job's value, mapped to the separator used for merging. */
private static final Map<String, String> configsToMerge = new HashMap<String, String>();
static {
  configsToOverride.add("mapred.output.dir");
  configsToOverride.add(ShimLoader.getHadoopShims().getHCatShim().getPropertyName(
    HadoopShims.HCatHadoopShims.PropertyName.CACHE_SYMLINK));

  configsToMerge.put(JobContext.JOB_NAMENODES, COMMA_DELIM);
  configsToMerge.put("tmpfiles", COMMA_DELIM);
  configsToMerge.put("tmpjars", COMMA_DELIM);
  configsToMerge.put("tmparchives", COMMA_DELIM);
  configsToMerge.put(ShimLoader.getHadoopShims().getHCatShim().getPropertyName(
    HadoopShims.HCatHadoopShims.PropertyName.CACHE_ARCHIVES), COMMA_DELIM);
  configsToMerge.put(ShimLoader.getHadoopShims().getHCatShim().getPropertyName(
    HadoopShims.HCatHadoopShims.PropertyName.CACHE_FILES), COMMA_DELIM);

  // The classpath entries use "," when HCatUtil reports Hadoop 0.23+, and the
  // platform path separator otherwise.
  String fileSep = HCatUtil.isHadoop23() ? "," : System.getProperty("path.separator");
  configsToMerge.put("mapred.job.classpath.archives", fileSep);
  configsToMerge.put("mapred.job.classpath.files", fileSep);
}
/**
 * Creates the {@link JobConfigurer} through which several output formats
 * can be registered on a single job.
 *
 * @param job the mapreduce job that is going to be submitted
 * @return a JobConfigurer bound to {@code job}
 */
public static JobConfigurer createConfigurer(Job job) {
  JobConfigurer configurer = JobConfigurer.create(job);
  return configurer;
}
/**
 * Builds a JobContext for one alias: a new context is created through the
 * HCat shim from the given context's configuration and job id, and the
 * settings stored for the alias are then applied to the new context's
 * configuration.
 *
 * @param alias the name given to the OutputFormat configuration
 * @param context the JobContext
 * @return a copy of the JobContext with the alias configuration populated
 */
public static JobContext getJobContext(String alias, JobContext context) {
  Configuration baseConf = context.getConfiguration();
  String aliasSettings = baseConf.get(getAliasConfName(alias));
  JobContext aliasContext = ShimLoader.getHadoopShims().getHCatShim()
    .createJobContext(baseConf, context.getJobID());
  addToConfig(aliasSettings, aliasContext.getConfiguration());
  return aliasContext;
}
/**
 * Builds a TaskAttemptContext for one alias: a new context is created through
 * the HCat shim from the given context's configuration and task attempt id,
 * and the settings stored for the alias are then applied to the new context's
 * configuration.
 *
 * @param alias the name given to the OutputFormat configuration
 * @param context the Mapper or Reducer Context
 * @return a copy of the TaskAttemptContext with the alias configuration populated
 */
public static TaskAttemptContext getTaskAttemptContext(String alias, TaskAttemptContext context) {
  Configuration baseConf = context.getConfiguration();
  String aliasSettings = baseConf.get(getAliasConfName(alias));
  TaskAttemptContext aliasContext = ShimLoader.getHadoopShims().getHCatShim()
    .createTaskAttemptContext(baseConf, context.getTaskAttemptID());
  addToConfig(aliasSettings, aliasContext.getConfiguration());
  return aliasContext;
}
/**
 * Write the output key and value using the OutputFormat defined by the
 * alias. The pair is wrapped in a {@code KeyValue} and handed to the context
 * with the alias as the record key.
 *
 * @param <K> type of the output key
 * @param <V> type of the output value
 * @param alias the name given to the OutputFormat configuration
 * @param key the output key to be written
 * @param value the output value to be written
 * @param context the Mapper or Reducer Context
 * @throws IOException if the context fails to write the record
 * @throws InterruptedException if the write is interrupted
 */
// The context is intentionally a raw type: its declared output types are not
// known here, so the call to write() is necessarily unchecked.
@SuppressWarnings({"rawtypes", "unchecked"})
public static <K, V> void write(String alias, K key, V value, TaskInputOutputContext context)
  throws IOException, InterruptedException {
  KeyValue<K, V> keyval = new KeyValue<K, V>(key, value);
  context.write(new Text(alias), keyval);
}
@Override
public void checkOutputSpecs(JobContext context) throws IOException, InterruptedException {
for (String alias : getOutputFormatAliases(context)) {
LOGGER.debug("Calling checkOutputSpecs for alias: " + alias);
JobContext aliasContext = getJobContext(alias, context);
OutputFormat, ?> outputFormat = getOutputFormatInstance(aliasContext);
outputFormat.checkOutputSpecs(aliasContext);
// Copy credentials and any new config added back to JobContext
context.getCredentials().addAll(aliasContext.getCredentials());
setAliasConf(alias, context, aliasContext);
}
}
/**
 * Creates the record writer for this task; the returned
 * {@code MultiRecordWriter} is constructed from the task context.
 *
 * @param context the task attempt context
 * @return a new {@code MultiRecordWriter}
 * @throws IOException if the writer cannot be created
 * @throws InterruptedException if the creation is interrupted
 */
@Override
public RecordWriter<Writable, Writable> getRecordWriter(TaskAttemptContext context)
  throws IOException, InterruptedException {
  return new MultiRecordWriter(context);
}
/**
 * Creates the committer for this task; the returned
 * {@code MultiOutputCommitter} is constructed from the task context.
 *
 * @param context the task attempt context
 * @return a new {@code MultiOutputCommitter}
 * @throws IOException if the committer cannot be created
 * @throws InterruptedException if the creation is interrupted
 */
@Override
public OutputCommitter getOutputCommitter(TaskAttemptContext context)
  throws IOException, InterruptedException {
  OutputCommitter committer = new MultiOutputCommitter(context);
  return committer;
}
private static OutputFormat, ?> getOutputFormatInstance(JobContext context) {
OutputFormat, ?> outputFormat;
try {
outputFormat = ReflectionUtils.newInstance(context.getOutputFormatClass(),
context.getConfiguration());
} catch (ClassNotFoundException e) {
throw new IllegalStateException(e);
}
return outputFormat;
}
/**
 * Reads the aliases stored under {@code MO_ALIASES} in the context's
 * configuration.
 *
 * @param context context whose configuration is read
 * @return the configured aliases; an empty array (never {@code null}) when
 *         none are configured, so callers can iterate the result directly
 */
private static String[] getOutputFormatAliases(JobContext context) {
  String[] aliases = context.getConfiguration().getStrings(MO_ALIASES);
  // getStrings yields null when the property is unset.
  return aliases == null ? new String[0] : aliases;
}
/**
 * Compare the aliasContext with userJob and add the differing configuration
 * as {@code mapreduce.multiout.alias.<alias>.conf} to the userJob.
 *
 * For every setting of the alias whose value differs from (or is missing in)
 * the user job:
 * - keys listed in {@code configsToMerge} (tmpjars, tmpfiles, tmparchives,
 *   mapreduce.job.hdfs-servers, ...) are merged into the user job's value
 *   and are not recorded in the per-alias property;
 * - keys listed in {@code configsToOverride} (e.g. mapred.output.dir) are
 *   set on the user job and are also recorded in the per-alias property;
 * - every other key is only recorded in the per-alias property.
 *
 * @param alias alias name associated with a OutputFormat
 * @param userJob reference to Job that the user is going to submit
 * @param aliasContext JobContext populated with OutputFormat related
 *        configuration.
 */
private static void setAliasConf(String alias, JobContext userJob, JobContext aliasContext) {
  Configuration userConf = userJob.getConfiguration();
  StringBuilder aliasSettings = new StringBuilder();
  for (Entry<String, String> setting : aliasContext.getConfiguration()) {
    String key = setting.getKey();
    String value = setting.getValue();
    String jobValue = userConf.getRaw(key);
    if (jobValue != null && jobValue.equals(value)) {
      // Same value in both configurations: nothing to record.
      continue;
    }
    if (configsToMerge.containsKey(key)) {
      userConf.set(key, getMergedConfValue(jobValue, value, configsToMerge.get(key)));
      continue;
    }
    if (configsToOverride.contains(key)) {
      userConf.set(key, value);
    }
    aliasSettings.append(key).append(CONF_KEY_DELIM).append(value)
      .append(CONF_VALUE_DELIM);
  }
  if (aliasSettings.length() > CONF_VALUE_DELIM.length()) {
    // Drop the trailing pair delimiter before storing the settings.
    aliasSettings.setLength(aliasSettings.length() - CONF_VALUE_DELIM.length());
    userConf.set(getAliasConfName(alias), aliasSettings.toString());
  }
}
/**
 * Merges two separator-delimited value lists, keeping the first occurrence
 * of every entry and preserving encounter order (original values first).
 *
 * @param originalValues values already present; when {@code null} the new
 *        values are returned unchanged
 * @param newValues values to merge in
 * @param separator separator between the entries of both lists
 * @return the merged list joined with {@code separator}; an empty string
 *         when neither list contains an entry
 */
private static String getMergedConfValue(String originalValues, String newValues, String separator) {
  if (originalValues == null) {
    return newValues;
  }
  Set<String> mergedValues = new LinkedHashSet<String>();
  mergedValues.addAll(Arrays.asList(StringUtils.split(originalValues, separator)));
  mergedValues.addAll(Arrays.asList(StringUtils.split(newValues, separator)));
  StringBuilder builder = new StringBuilder(originalValues.length() + newValues.length() + 2);
  boolean first = true;
  for (String value : mergedValues) {
    // Separator goes between entries only, so an empty set yields "" rather
    // than a negative substring index.
    if (!first) {
      builder.append(separator);
    }
    builder.append(value);
    first = false;
  }
  return builder.toString();
}
/**
 * Builds the name of the job property that stores the settings of an alias:
 * {@code MO_ALIAS}, a dot, the alias and the suffix ".conf".
 *
 * @param alias the alias name
 * @return the property name for the alias
 */
private static String getAliasConfName(String alias) {
  StringBuilder name = new StringBuilder(MO_ALIAS);
  name.append('.').append(alias).append(".conf");
  return name.toString();
}
/**
 * Applies the serialized alias settings to a configuration. The settings are
 * a sequence of {@code key CONF_KEY_DELIM value} pairs separated by
 * {@code CONF_VALUE_DELIM}.
 *
 * @param aliasConf the serialized settings; {@code null} or empty means there
 *        is nothing to apply
 * @param conf the configuration to update
 */
private static void addToConfig(String aliasConf, Configuration conf) {
  if (aliasConf == null || aliasConf.isEmpty()) {
    return;
  }
  // Split into pairs first and cut each pair at its FIRST key delimiter, so a
  // value that is empty or itself contains the key delimiter is restored
  // intact instead of shifting every following key/value.
  for (String pair : aliasConf.split(CONF_VALUE_DELIM, -1)) {
    int delimAt = pair.indexOf(CONF_KEY_DELIM);
    if (delimAt < 0) {
      LOGGER.warn("Ignoring malformed alias configuration entry: {}", pair);
      continue;
    }
    conf.set(pair.substring(0, delimAt), pair.substring(delimAt + CONF_KEY_DELIM.length()));
  }
}
/**
* Class that supports configuration of the job for multiple output formats.
*/
public static class JobConfigurer {
private final Job job;
private Map outputConfigs = new LinkedHashMap();
private JobConfigurer(Job job) {
this.job = job;
}
private static JobConfigurer create(Job job) {
JobConfigurer configurer = new JobConfigurer(job);
return configurer;
}
/**
* Add a OutputFormat configuration to the Job with a alias name.
*
* @param alias the name to be given to the OutputFormat configuration
* @param outputFormatClass OutputFormat class
* @param keyClass the key class for the output data
* @param valueClass the value class for the output data
* @throws IOException
*/
public void addOutputFormat(String alias,
Class extends OutputFormat> outputFormatClass,
Class> keyClass, Class> valueClass) throws IOException {
Job copy = new Job(this.job.getConfiguration());
outputConfigs.put(alias, copy);
copy.setOutputFormatClass(outputFormatClass);
copy.setOutputKeyClass(keyClass);
copy.setOutputValueClass(valueClass);
}
/**
* Get the Job configuration for a OutputFormat defined by the alias
* name. The job returned by this method should be passed to the
* OutputFormat for any configuration instead of the Job that will be
* submitted to the JobClient.
*
* @param alias the name used for the OutputFormat during
* addOutputFormat
* @return Job
*/
public Job getJob(String alias) {
Job copy = outputConfigs.get(alias);
if (copy == null) {
throw new IllegalArgumentException("OutputFormat with alias " + alias
+ " has not beed added");
}
return copy;
}
/**
* Configure the job with the multiple output formats added. This method
* should be called after all the output formats have been added and
* configured and before the job submission.
*/
public void configure() {
StringBuilder aliases = new StringBuilder();
Configuration jobConf = job.getConfiguration();
for (Entry entry : outputConfigs.entrySet()) {
// Copy credentials
job.getCredentials().addAll(entry.getValue().getCredentials());
String alias = entry.getKey();
aliases.append(alias).append(COMMA_DELIM);
// Store the differing configuration for each alias in the job
// as a setting.
setAliasConf(alias, job, entry.getValue());
}
aliases.delete(aliases.length() - COMMA_DELIM.length(), aliases.length());
jobConf.set(MO_ALIASES, aliases.toString());
}
}
/**
 * Immutable holder for one key/value pair. It implements Writable only to
 * satisfy the types involved; both serialization methods do nothing.
 *
 * @param <K> type of the wrapped key
 * @param <V> type of the wrapped value
 */
private static class KeyValue<K, V> implements Writable {
  private final K key;
  private final V value;

  public KeyValue(K key, V value) {
    this.key = key;
    this.value = value;
  }

  public K getKey() {
    return key;
  }

  public V getValue() {
    return value;
  }

  @Override
  public void write(DataOutput out) throws IOException {
    // Ignore. Not required as this will be never
    // serialized/deserialized.
  }

  @Override
  public void readFields(DataInput in) throws IOException {
    // Ignore. Not required as this will be never
    // serialized/deserialized.
  }
}
private static class MultiRecordWriter extends RecordWriter {
private final Map baseRecordWriters;
public MultiRecordWriter(TaskAttemptContext context) throws IOException,
InterruptedException {
baseRecordWriters = new LinkedHashMap();
String[] aliases = getOutputFormatAliases(context);
for (String alias : aliases) {
LOGGER.info("Creating record writer for alias: " + alias);
TaskAttemptContext aliasContext = getTaskAttemptContext(alias, context);
Configuration aliasConf = aliasContext.getConfiguration();
// Create output directory if not already created.
String outDir = aliasConf.get("mapred.output.dir");
if (outDir != null) {
Path outputDir = new Path(outDir);
FileSystem fs = outputDir.getFileSystem(aliasConf);
if (!fs.exists(outputDir)) {
fs.mkdirs(outputDir);
}
}
OutputFormat, ?> outputFormat = getOutputFormatInstance(aliasContext);
baseRecordWriters.put(alias,
new BaseRecordWriterContainer(outputFormat.getRecordWriter(aliasContext),
aliasContext));
}
}
@Override
public void write(Writable key, Writable value) throws IOException, InterruptedException {
Text _key = (Text) key;
KeyValue _value = (KeyValue) value;
String alias = new String(_key.getBytes(), 0, _key.getLength());
BaseRecordWriterContainer baseRWContainer = baseRecordWriters.get(alias);
if (baseRWContainer == null) {
throw new IllegalArgumentException("OutputFormat with alias " + alias
+ " has not been added");
}
baseRWContainer.getRecordWriter().write(_value.getKey(), _value.getValue());
}
@Override
public void close(TaskAttemptContext context) throws IOException, InterruptedException {
for (Entry entry : baseRecordWriters.entrySet()) {
BaseRecordWriterContainer baseRWContainer = entry.getValue();
LOGGER.info("Closing record writer for alias: " + entry.getKey());
baseRWContainer.getRecordWriter().close(baseRWContainer.getContext());
}
}
}
/**
 * Pairs a record writer with the task attempt context it is used with.
 */
private static class BaseRecordWriterContainer {
  private final RecordWriter writer;
  private final TaskAttemptContext taskContext;

  public BaseRecordWriterContainer(RecordWriter recordWriter, TaskAttemptContext context) {
    writer = recordWriter;
    taskContext = context;
  }

  /** @return the wrapped record writer */
  public RecordWriter getRecordWriter() {
    return writer;
  }

  /** @return the context stored alongside the writer */
  public TaskAttemptContext getContext() {
    return taskContext;
  }
}
/**
 * OutputCommitter that delegates every call to the committer of each
 * configured alias. Each delegate is invoked with the alias specific context
 * created in the constructor, not with the context passed to the method.
 */
public class MultiOutputCommitter extends OutputCommitter {
  private final Map<String, BaseOutputCommitterContainer> outputCommitters;

  /**
   * Creates the underlying committer of every configured alias.
   *
   * @param context the task attempt context of the running task
   * @throws IOException if a committer cannot be created
   * @throws InterruptedException if the creation of a committer is interrupted
   */
  public MultiOutputCommitter(TaskAttemptContext context) throws IOException,
    InterruptedException {
    outputCommitters = new LinkedHashMap<String, BaseOutputCommitterContainer>();
    for (String alias : getOutputFormatAliases(context)) {
      LOGGER.info("Creating output committer for alias: {}", alias);
      TaskAttemptContext aliasContext = getTaskAttemptContext(alias, context);
      OutputCommitter baseCommitter = getOutputFormatInstance(aliasContext)
        .getOutputCommitter(aliasContext);
      outputCommitters.put(alias,
        new BaseOutputCommitterContainer(baseCommitter, aliasContext));
    }
  }

  @Override
  public void setupJob(JobContext jobContext) throws IOException {
    for (Entry<String, BaseOutputCommitterContainer> entry : outputCommitters.entrySet()) {
      LOGGER.info("Calling setupJob for alias: {}", entry.getKey());
      BaseOutputCommitterContainer outputContainer = entry.getValue();
      outputContainer.getBaseCommitter().setupJob(outputContainer.getContext());
    }
  }

  @Override
  public void setupTask(TaskAttemptContext taskContext) throws IOException {
    for (Entry<String, BaseOutputCommitterContainer> entry : outputCommitters.entrySet()) {
      LOGGER.info("Calling setupTask for alias: {}", entry.getKey());
      BaseOutputCommitterContainer outputContainer = entry.getValue();
      outputContainer.getBaseCommitter().setupTask(outputContainer.getContext());
    }
  }

  /**
   * @return {@code true} as soon as one underlying committer needs a task
   *         commit; later committers are not asked once one answered true
   */
  @Override
  public boolean needsTaskCommit(TaskAttemptContext taskContext) throws IOException {
    for (BaseOutputCommitterContainer outputContainer : outputCommitters.values()) {
      if (outputContainer.getBaseCommitter().needsTaskCommit(outputContainer.getContext())) {
        return true;
      }
    }
    return false;
  }

  /** Commits the task only on the committers that report needing a task commit. */
  @Override
  public void commitTask(TaskAttemptContext taskContext) throws IOException {
    for (Entry<String, BaseOutputCommitterContainer> entry : outputCommitters.entrySet()) {
      BaseOutputCommitterContainer outputContainer = entry.getValue();
      OutputCommitter baseCommitter = outputContainer.getBaseCommitter();
      TaskAttemptContext committerContext = outputContainer.getContext();
      if (baseCommitter.needsTaskCommit(committerContext)) {
        LOGGER.info("Calling commitTask for alias: {}", entry.getKey());
        baseCommitter.commitTask(committerContext);
      }
    }
  }

  @Override
  public void abortTask(TaskAttemptContext taskContext) throws IOException {
    for (Entry<String, BaseOutputCommitterContainer> entry : outputCommitters.entrySet()) {
      LOGGER.info("Calling abortTask for alias: {}", entry.getKey());
      BaseOutputCommitterContainer outputContainer = entry.getValue();
      outputContainer.getBaseCommitter().abortTask(outputContainer.getContext());
    }
  }

  @Override
  public void commitJob(JobContext jobContext) throws IOException {
    for (Entry<String, BaseOutputCommitterContainer> entry : outputCommitters.entrySet()) {
      LOGGER.info("Calling commitJob for alias: {}", entry.getKey());
      BaseOutputCommitterContainer outputContainer = entry.getValue();
      outputContainer.getBaseCommitter().commitJob(outputContainer.getContext());
    }
  }

  @Override
  public void abortJob(JobContext jobContext, State state) throws IOException {
    for (Entry<String, BaseOutputCommitterContainer> entry : outputCommitters.entrySet()) {
      LOGGER.info("Calling abortJob for alias: {}", entry.getKey());
      BaseOutputCommitterContainer outputContainer = entry.getValue();
      outputContainer.getBaseCommitter().abortJob(outputContainer.getContext(), state);
    }
  }
}
/**
 * Pairs an output committer with the task attempt context it is used with.
 */
private static class BaseOutputCommitterContainer {
  private final OutputCommitter committer;
  private final TaskAttemptContext taskContext;

  public BaseOutputCommitterContainer(OutputCommitter outputCommitter,
    TaskAttemptContext context) {
    committer = outputCommitter;
    taskContext = context;
  }

  /** @return the wrapped output committer */
  public OutputCommitter getBaseCommitter() {
    return committer;
  }

  /** @return the context stored alongside the committer */
  public TaskAttemptContext getContext() {
    return taskContext;
  }
}
}