All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.archive.hadoop.PerMapOutputFormat Maven / Gradle / Ivy

The newest version!
/**
 * Copyright 2012 Internet Archive
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You
 * may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package org.archive.hadoop;

import java.io.*;
import java.util.*;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileAlreadyExistsException;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.InvalidJobConfException;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputFormat;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.util.Progressable;
import org.apache.hadoop.util.ReflectionUtils;

/**
 * OutputFormat that directs the output to a file named according to
 * the input file.  For instance, if the input file is "foo", then the
 * output file is also named "foo".  A suffix can be easily added, or
 * a regex+replace applied to the input filename to produce an output
 * filename.
 *
 * This class can be used in conjunction with FilenameInputFormat in a
 * map-reduce job that only does a map() function, no reduce.  By
 * combining these input and output formats, it's easy to read from a
 * large set of input files, process each one in a separate map task,
 * and write the output to a file with a name based on the input.
 *
 * For example, suppose you had 1000 WARC files and your map() task
 * just reads a single WARC file and outputs the number of records in
 * it.  Use the FilenameInputFormat and the PerMapOutputFormat,
 * setting the "permap.suffix" property to ".count" and for each WARC
 * input file (e.g. "foo.warc.gz") you'll get a corresponding ".count"
 * file in the output (e.g. "foo.warc.gz.count").
 *
 * The nice thing about using this class as the OutputFormat is that
 * Hadoop will manage the temporary file for you.  This means that if
 * the map task fails (suppose the task node kernel panics), Hadoop
 * will automatically delete the temp file from the failed task and
 * re-schedule it.
 *
 * This class assumes the actual OutputFormat is a SequenceFile.  If
 * not -- suppose you want to output a MapFile or plain text -- then
 * specify the output format in the "permap.output.format.class"
 * property.
 *
 * Configuration properties read by this class:
 * <ul>
 *   <li>"map.input.file" -- the input file of the current map task;
 *       its last path component is the basis of the output name.</li>
 *   <li>"permap.regex" and "permap.replace" -- when BOTH are set, the
 *       output name is the input name with every match of the regex
 *       replaced.</li>
 *   <li>"permap.suffix" -- appended to the input name, but only when
 *       the regex/replace pair is not in effect.</li>
 *   <li>"permap.output.format.class" -- the OutputFormat that actually
 *       writes the records; defaults to SequenceFileOutputFormat.</li>
 * </ul>
 *
 * This class was inspired by Hadoop's
 * {@code org.apache.hadoop.mapred.lib.MultipleOutputFormat}.
 */
public class PerMapOutputFormat extends FileOutputFormat
{
  /**
   * Derives the output file name from the name of the file being
   * processed by the current map task.
   *
   * @param job the job configuration to read the properties from
   * @return the non-empty output file name
   * @throws IOException if "map.input.file" is not set, or if the
   *         configured regex/replace reduces the name to an empty
   *         string
   */
  private String getOutputFilename( JobConf job )
    throws IOException
  {
    String regex   = job.get( "permap.regex"  , null );
    String replace = job.get( "permap.replace", null );
    String suffix  = job.get( "permap.suffix" , null );

    String inputFilename = job.get( "map.input.file" );

    if ( inputFilename == null )
      {
        throw new IOException( "map.input.file is null, not running in map task?" );
      }

    // Only the last path component is used; the directory part of the
    // input path is dropped.
    String outputFilename = (new Path( inputFilename )).getName();

    // The regex/replace pair takes precedence over the suffix; a regex
    // without a replacement (or vice versa) is ignored.
    if ( regex != null && replace != null )
      {
        outputFilename = outputFilename.replaceAll( regex, replace );
      }
    else if ( suffix != null )
      {
        outputFilename += suffix;
      }

    // The name can never be null at this point, but a regex/replace
    // pair can reduce it to nothing.  An empty name cannot identify an
    // output file, so reject it here with a message that points at
    // the configuration responsible.
    if ( outputFilename.isEmpty() )
      {
        throw new IOException( "outputFilename is empty for input file: " + inputFilename
                               + " (permap.regex=" + regex + ", permap.replace=" + replace + ")" );
      }

    return outputFilename;
  }

  /**
   * Instantiates the OutputFormat that does the actual writing, as
   * named by the "permap.output.format.class" property.
   *
   * @param job the job configuration; also handed to the new instance
   * @return a new instance of the configured OutputFormat, or of
   *         SequenceFileOutputFormat when the property is not set
   */
  private OutputFormat getOutputFormat( JobConf job )
  {
    return ReflectionUtils.newInstance( job.getClass( "permap.output.format.class",
                                                      SequenceFileOutputFormat.class,
                                                      OutputFormat.class ),
                                        job );
  }

  /**
   * Returns the RecordWriter of the configured OutputFormat, writing
   * to a file named after the map task's input file.
   *
   * @param fs       passed through to the delegate OutputFormat
   * @param job      the job configuration
   * @param name     ignored; the output name is derived from the
   *                 input file name instead
   * @param progress passed through to the delegate OutputFormat
   * @return the delegate's RecordWriter for the derived file name
   * @throws IOException if the output file name cannot be derived, or
   *         if the delegate fails to create the writer
   */
  public RecordWriter getRecordWriter( FileSystem fs,
                                       JobConf job,
                                       String name,
                                       Progressable progress )
    throws IOException
  {
    String outputFilename = getOutputFilename( job );

    OutputFormat of = getOutputFormat( job );

    return of.getRecordWriter( fs, job, outputFilename, progress );
  }

  /**
   * Over-ride the default FileOutputFormat's checkOutputSpecs() to
   * allow for the target directory to already exist.  This
   * implementation intentionally performs no checks at all.
   *
   * @param ignored not used
   * @param job     not used
   */
  public void checkOutputSpecs( FileSystem ignored, JobConf job )
    throws FileAlreadyExistsException, InvalidJobConfException, IOException
  {
    // Deliberately empty: an existing output directory is acceptable.
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy