org.apache.hadoop.mapred.lib.MultipleOutputFormat (hadoop-apache)
Shaded version of Apache Hadoop for Presto
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.mapred.lib;
import java.io.IOException;
import java.util.Iterator;
import java.util.TreeMap;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.util.Progressable;
/**
* This abstract class extends the FileOutputFormat, allowing the output data
* to be written to different output files. There are three basic use cases for
* this class.
*
* Case one: This class is used for a map reduce job with at least one reducer.
* The reducer wants to write data to different files depending on the actual
* keys. It is assumed that a key (or value) encodes the actual key (value)
* and the desired location for the actual key (value).
*
* Case two: This class is used for a map only job. The job wants to use an
* output file name that is either a part of the input file name of the input
* data, or some derivation of it.
*
* Case three: This class is used for a map only job. The job wants to use an
* output file name that depends on both the keys and the input file name.
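*
* A minimal usage sketch with the old "mapred" API, assuming a hypothetical
* user-defined subclass named RouteByKeyTextOutputFormat (only the JobConf,
* FileOutputFormat and JobClient calls are part of Hadoop; the class and
* path names are illustrative):
*
* <pre>{@code
* JobConf conf = new JobConf(MyJob.class);
* conf.setOutputFormat(RouteByKeyTextOutputFormat.class);
* FileOutputFormat.setOutputPath(conf, new Path("/output"));
* JobClient.runJob(conf);
* }</pre>
*
* Each record is then routed to a file under /output whose name is chosen
* by the subclass's overridden hooks (see the protected methods below).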
*/
@InterfaceAudience.Public
@InterfaceStability.Stable
public abstract class MultipleOutputFormat<K, V>
extends FileOutputFormat<K, V> {
/**
* Create a composite record writer that can write key/value data to different
* output files
*
* @param fs
* the file system to use
* @param job
* the job conf for the job
* @param name
* the leaf file name for the output file (such as part-00000")
* @param arg3
* a progressable for reporting progress.
* @return a composite record writer
* @throws IOException
*/
public RecordWriter<K, V> getRecordWriter(FileSystem fs, JobConf job,
String name, Progressable arg3) throws IOException {
final FileSystem myFS = fs;
final String myName = generateLeafFileName(name);
final JobConf myJob = job;
final Progressable myProgressable = arg3;
return new RecordWriter<K, V>() {
// a cache storing the record writers for different output files.
TreeMap<String, RecordWriter<K, V>> recordWriters = new TreeMap<String, RecordWriter<K, V>>();
public void write(K key, V value) throws IOException {
// get the file name based on the key
String keyBasedPath = generateFileNameForKeyValue(key, value, myName);
// get the file name based on the input file name
String finalPath = getInputFileBasedOutputFileName(myJob, keyBasedPath);
// get the actual key
K actualKey = generateActualKey(key, value);
V actualValue = generateActualValue(key, value);
RecordWriter<K, V> rw = this.recordWriters.get(finalPath);
if (rw == null) {
// if we don't have the record writer yet for the final path,
// create one and add it to the cache
rw = getBaseRecordWriter(myFS, myJob, finalPath, myProgressable);
this.recordWriters.put(finalPath, rw);
}
rw.write(actualKey, actualValue);
};
public void close(Reporter reporter) throws IOException {
Iterator<String> keys = this.recordWriters.keySet().iterator();
while (keys.hasNext()) {
RecordWriter<K, V> rw = this.recordWriters.get(keys.next());
rw.close(reporter);
}
this.recordWriters.clear();
};
};
}
/**
* Generate the leaf name for the output file name. The default behavior does
* not change the leaf file name (such as "part-00000")
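*
* For example, a subclass might append a suffix to every part file (a
* minimal sketch; the ".txt" suffix is purely illustrative):
*
* <pre>{@code
* protected String generateLeafFileName(String name) {
*   // "part-00000" -> "part-00000.txt"
*   return name + ".txt";
* }
* }</pre>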
*
* @param name
* the leaf file name for the output file
* @return the given leaf file name
*/
protected String generateLeafFileName(String name) {
return name;
}
/**
* Generate the output file name based on the given key/value pair and the
* leaf file name. The default behavior is that the file name does not depend
* on the key/value pair.
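*
* A minimal sketch of an override for "case one", written for a subclass of
* {@code MultipleTextOutputFormat<Text, Text>} and assuming keys are encoded
* as "target,actualKey", where the part before the comma names the output
* directory (the encoding is purely illustrative):
*
* <pre>{@code
* protected String generateFileNameForKeyValue(Text key, Text value, String name) {
*   // e.g. key "us,alice" and leaf name "part-00000" -> "us/part-00000"
*   String target = key.toString().split(",", 2)[0];
*   return target + "/" + name;
* }
* }</pre>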
*
* @param key
* the key of the output data
* @param value
* the value of the output data
* @param name
* the leaf file name
* @return generated file name
*/
protected String generateFileNameForKeyValue(K key, V value, String name) {
return name;
}
/**
* Generate the actual key from the given key/value. The default behavior is that
* the actual key is equal to the given key
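*
* Continuing the sketch from {@link #generateFileNameForKeyValue}, the
* routing prefix can be stripped so it does not appear in the written
* records (assumes the same illustrative "target,actualKey" encoding and
* Text keys):
*
* <pre>{@code
* protected Text generateActualKey(Text key, Text value) {
*   // "us,alice" -> "alice"
*   String[] parts = key.toString().split(",", 2);
*   return new Text(parts.length > 1 ? parts[1] : parts[0]);
* }
* }</pre>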
*
* @param key
* the key of the output data
* @param value
* the value of the output data
* @return the actual key derived from the given key/value
*/
protected K generateActualKey(K key, V value) {
return key;
}
/**
* Generate the actual value from the given key and value. The default behavior is that
* the actual value is equal to the given value
*
* @param key
* the key of the output data
* @param value
* the value of the output data
* @return the actual value derived from the given key/value
*/
protected V generateActualValue(K key, V value) {
return value;
}
/**
* Generate the output file name based on a given name and the input file name.
* If {@link MRJobConfig#MAP_INPUT_FILE} is not set (i.e. this is not a map only
* job), the given name is returned unchanged. If the config value for
* "mapred.outputformat.numOfTrailingLegs" is not set, or is set to 0 or a
* negative value, the given name is returned unchanged. Otherwise, a file name
* consisting of the N trailing legs of the input file name is returned, where N
* is the config value for "mapred.outputformat.numOfTrailingLegs".
*
* @param job
* the job config
* @param name
* the output file name
* @return the output file name based on the given name and the input file name.
*/
protected String getInputFileBasedOutputFileName(JobConf job, String name) {
String infilepath = job.get(MRJobConfig.MAP_INPUT_FILE);
if (infilepath == null) {
// if MRJobConfig.MAP_INPUT_FILE is not set,
// then return the given name
return name;
}
int numOfTrailingLegsToUse = job.getInt("mapred.outputformat.numOfTrailingLegs", 0);
if (numOfTrailingLegsToUse <= 0) {
return name;
}
Path infile = new Path(infilepath);
Path parent = infile.getParent();
String midName = infile.getName();
Path outPath = new Path(midName);
for (int i = 1; i < numOfTrailingLegsToUse; i++) {
if (parent == null) break;
midName = parent.getName();
if (midName.length() == 0) break;
parent = parent.getParent();
outPath = new Path(midName, outPath);
}
return outPath.toString();
}
/**
* Create a record writer for the given output file. Concrete subclasses
* implement this to choose the underlying record format.
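*
* A minimal sketch of an implementation that delegates to Hadoop's
* TextOutputFormat, along the lines of what MultipleTextOutputFormat does
* (the cached field name is illustrative):
*
* <pre>{@code
* private TextOutputFormat<K, V> textOutput;
*
* protected RecordWriter<K, V> getBaseRecordWriter(FileSystem fs, JobConf job,
*     String name, Progressable progress) throws IOException {
*   if (textOutput == null) {
*     textOutput = new TextOutputFormat<K, V>();
*   }
*   return textOutput.getRecordWriter(fs, job, name, progress);
* }
* }</pre>
*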
* @param fs
* the file system to use
* @param job
* a job conf object
* @param name
* the name of the file over which a record writer object will be
* constructed
* @param arg3
* a progressable object
* @return A RecordWriter object over the given file
* @throws IOException
*/
abstract protected RecordWriter<K, V> getBaseRecordWriter(FileSystem fs,
JobConf job, String name, Progressable arg3) throws IOException;
}