/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.mahout.common;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.concurrent.atomic.AtomicInteger;
import com.google.common.base.Preconditions;
import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.Option;
import org.apache.commons.cli2.OptionException;
import org.apache.commons.cli2.builder.ArgumentBuilder;
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.common.lucene.AnalyzerUtils;
import org.apache.mahout.math.VectorWritable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * <p>Superclass of many Mahout Hadoop "jobs". A job drives configuration and launch of one or
 * more maps and reduces in order to accomplish some task.</p>
 *
 * <p>Command line arguments available to all subclasses are:</p>
 *
 * <ul>
 *   <li>--tempDir (path): Specifies a directory where the job may place temp files
 *   (default "temp")</li>
 *   <li>--help: Show help message</li>
 * </ul>
 *
 * <p>In addition, note some key command line parameters that are parsed by Hadoop, which jobs
 * may need to set:</p>
 *
 * <ul>
 *   <li>-Dmapred.job.name=(name): Sets the Hadoop task names. It will be suffixed by
 *   the mapper and reducer class names</li>
 *   <li>-Dmapred.output.compress={true,false}: Compress final output (default true)</li>
 *   <li>-Dmapred.input.dir=(path): input file, or directory containing input files (required)</li>
 *   <li>-Dmapred.output.dir=(path): path to write output files (required)</li>
 * </ul>
 *
 * <p>Note that because of how Hadoop parses arguments, all "-D" arguments must appear before all other
 * arguments.</p>
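 *
 * <p>A minimal usage sketch (hypothetical; {@code ExampleJob} is not part of Mahout, and the
 * identity {@code Mapper}/{@code Reducer} plus Hadoop's {@code ToolRunner} and {@code Text}
 * are assumed to be imported) of how a subclass typically wires things together:</p>
 *
 * <pre>{@code
 * public class ExampleJob extends AbstractJob {
 *
 *   public static void main(String[] args) throws Exception {
 *     ToolRunner.run(new Configuration(), new ExampleJob(), args);
 *   }
 *
 *   public int run(String[] args) throws Exception {
 *     addInputOption();
 *     addOutputOption();
 *     if (parseArguments(args) == null) {
 *       return -1; // help was requested or the arguments could not be parsed
 *     }
 *     // identity map-reduce over SequenceFiles, purely for illustration
 *     Job job = prepareJob(getInputPath(), getOutputPath(), Mapper.class, Text.class,
 *         VectorWritable.class, Reducer.class, Text.class, VectorWritable.class);
 *     return job.waitForCompletion(true) ? 0 : -1;
 *   }
 * }
 * }</pre>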
*/
public abstract class AbstractJob extends Configured implements Tool {
private static final Logger log = LoggerFactory.getLogger(AbstractJob.class);
/** option used to specify the input path */
private Option inputOption;
/** option used to specify the output path */
private Option outputOption;
/** input path, populated by {@link #parseArguments(String[])} */
protected Path inputPath;
protected File inputFile; //the input represented as a file
/** output path, populated by {@link #parseArguments(String[])} */
protected Path outputPath;
protected File outputFile; //the output represented as a file
/** temp path, populated by {@link #parseArguments(String[])} */
protected Path tempPath;
protected Map<String, List<String>> argMap;
/** internal list of options that have been added */
private final List<Option> options;
private Group group;
protected AbstractJob() {
options = new LinkedList<>();
}
/** Returns the input path established by a call to {@link #parseArguments(String[])}.
* The source of the path may be an input option added using {@link #addInputOption()}
* or it may be the value of the {@code mapred.input.dir} configuration
* property.
*/
protected Path getInputPath() {
return inputPath;
}
/** Returns the output path established by a call to {@link #parseArguments(String[])}.
* The source of the path may be an output option added using {@link #addOutputOption()}
 * or it may be the value of the {@code mapred.output.dir} configuration
* property.
*/
protected Path getOutputPath() {
return outputPath;
}
protected Path getOutputPath(String path) {
return new Path(outputPath, path);
}
protected File getInputFile() {
return inputFile;
}
protected File getOutputFile() {
return outputFile;
}
protected Path getTempPath() {
return tempPath;
}
protected Path getTempPath(String directory) {
return new Path(tempPath, directory);
}
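/**
 * Returns the {@link Configuration} passed to {@link #setConf(Configuration)}, or a newly
 * created {@link Configuration} if none has been set, so callers never see {@code null}.
 */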
@Override
public Configuration getConf() {
Configuration result = super.getConf();
if (result == null) {
return new Configuration();
}
return result;
}
/** Add an option with no argument whose presence can be checked for using the
 * {@code containsKey} method on the map returned by {@link #parseArguments(String[])}.
*/
protected void addFlag(String name, String shortName, String description) {
options.add(buildOption(name, shortName, description, false, false, null));
}
/** Add an option to the set of options this job will parse when
 * {@link #parseArguments(String[])} is called. This option has an argument
 * with null as its default value.
*/
protected void addOption(String name, String shortName, String description) {
options.add(buildOption(name, shortName, description, true, false, null));
}
/** Add an option to the set of options this job will parse when
* {@link #parseArguments(String[])} is called.
*
 * @param required if true, {@link #parseArguments(String[])} will fail with an error
 * and usage message if this option is not specified on the command line.
*/
protected void addOption(String name, String shortName, String description, boolean required) {
options.add(buildOption(name, shortName, description, true, required, null));
}
/** Add an option to the set of options this job will parse when
* {@link #parseArguments(String[])} is called. If this option is not
* specified on the command line the default value will be
* used.
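 *
 * <p>For example (an illustrative sketch; the option name is hypothetical), a subclass could
 * declare a numeric option with a default inside its {@code run} method and read it back
 * after a successful {@link #parseArguments(String[])}:</p>
 * <pre>{@code
 * addOption("maxIterations", "x", "maximum number of iterations to run", "10");
 * Map<String, List<String>> parsedArgs = parseArguments(args);
 * if (parsedArgs != null) {
 *   int maxIterations = getInt("maxIterations"); // 10 unless -x/--maxIterations was given
 * }
 * }</pre>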
*
* @param defaultValue the default argument value if this argument is not
* found on the command-line. null is allowed.
*/
protected void addOption(String name, String shortName, String description, String defaultValue) {
options.add(buildOption(name, shortName, description, true, false, defaultValue));
}
/** Add an arbitrary option to the set of options this job will parse when
* {@link #parseArguments(String[])} is called. If this option has no
* argument, use {@code containsKey} on the map returned by
* {@code parseArguments} to check for its presence. Otherwise, the
* string value of the option will be placed in the map using a key
 * equal to this option's long name preceded by '--'.
* @return the option added.
*/
protected Option addOption(Option option) {
options.add(option);
return option;
}
protected Group getGroup() {
return group;
}
/** Add the default input directory option, '-i', which takes a directory
* name as an argument. When {@link #parseArguments(String[])} is
* called, the inputPath will be set based upon the value for this option.
* If this method is called, the input is required.
*/
protected void addInputOption() {
this.inputOption = addOption(DefaultOptionCreator.inputOption().create());
}
/** Add the default output directory option, '-o', which takes a directory
* name as an argument. When {@link #parseArguments(String[])} is
* called, the outputPath will be set based upon the value for this option.
* If this method is called, the output is required.
*/
protected void addOutputOption() {
this.outputOption = addOption(DefaultOptionCreator.outputOption().create());
}
/** Build an option with the given parameters. Name and description are
* required.
*
* @param name the long name of the option prefixed with '--' on the command-line
* @param shortName the short name of the option, prefixed with '-' on the command-line
* @param description description of the option displayed in help method
* @param hasArg true if the option has an argument.
* @param required true if the option is required.
* @param defaultValue default argument value, can be null.
* @return the option.
*/
protected static Option buildOption(String name,
String shortName,
String description,
boolean hasArg,
boolean required,
String defaultValue) {
return buildOption(name, shortName, description, hasArg, 1, 1, required, defaultValue);
}
protected static Option buildOption(String name,
String shortName,
String description,
boolean hasArg, int min, int max,
boolean required,
String defaultValue) {
DefaultOptionBuilder optBuilder = new DefaultOptionBuilder().withLongName(name).withDescription(description)
.withRequired(required);
if (shortName != null) {
optBuilder.withShortName(shortName);
}
if (hasArg) {
ArgumentBuilder argBuilder = new ArgumentBuilder().withName(name).withMinimum(min).withMaximum(max);
if (defaultValue != null) {
argBuilder = argBuilder.withDefault(defaultValue);
}
optBuilder.withArgument(argBuilder.create());
}
return optBuilder.create();
}
/**
* @param name The name of the option
* @return the {@link org.apache.commons.cli2.Option} with the name, else null
*/
protected Option getCLIOption(String name) {
for (Option option : options) {
if (option.getPreferredName().equals(name)) {
return option;
}
}
return null;
}
/** Parse the arguments specified based on the options defined using the
 * various {@code addOption} methods. If {@code -h} is specified or an
 * exception is encountered, print help and return null. Has the
 * side effect of setting {@code inputPath} and {@code outputPath}
 * if {@code addInputOption} or {@code addOutputOption} was called,
 * or if {@code mapred.input.dir} or {@code mapred.output.dir}
 * is present in the Configuration.
*
* @return a {@code Map} containing options and their argument values.
* The presence of a flag can be tested using {@code containsKey}, while
* argument values can be retrieved using {@code get(optionName)}. The
* names used for keys are the option name parameter prefixed by '--'.
*
* @see #parseArguments(String[], boolean, boolean) -- passes in false, false for the optional args.
*/
public Map<String, List<String>> parseArguments(String[] args) throws IOException {
return parseArguments(args, false, false);
}
/**
*
* @param args The args to parse
 * @param inputOptional if true, the input option (when one has been added) need not be
 * present on the command line; if false, a missing input is an error
 * @param outputOptional if true, the output option (when one has been added) need not be
 * present on the command line; if false, a missing output is an error
* @return the args parsed into a map.
*/
public Map<String, List<String>> parseArguments(String[] args, boolean inputOptional, boolean outputOptional)
throws IOException {
Option helpOpt = addOption(DefaultOptionCreator.helpOption());
addOption("tempDir", null, "Intermediate output directory", "temp");
addOption("startPhase", null, "First phase to run", "0");
addOption("endPhase", null, "Last phase to run", String.valueOf(Integer.MAX_VALUE));
GroupBuilder gBuilder = new GroupBuilder().withName("Job-Specific Options:");
for (Option opt : options) {
gBuilder = gBuilder.withOption(opt);
}
group = gBuilder.create();
CommandLine cmdLine;
try {
Parser parser = new Parser();
parser.setGroup(group);
parser.setHelpOption(helpOpt);
cmdLine = parser.parse(args);
} catch (OptionException e) {
log.error(e.getMessage());
CommandLineUtil.printHelpWithGenericOptions(group, e);
return null;
}
if (cmdLine.hasOption(helpOpt)) {
CommandLineUtil.printHelpWithGenericOptions(group);
return null;
}
try {
parseDirectories(cmdLine, inputOptional, outputOptional);
} catch (IllegalArgumentException e) {
log.error(e.getMessage());
CommandLineUtil.printHelpWithGenericOptions(group);
return null;
}
argMap = new TreeMap<>();
maybePut(argMap, cmdLine, this.options.toArray(new Option[this.options.size()]));
this.tempPath = new Path(getOption("tempDir"));
if (!hasOption("quiet")) {
log.info("Command line arguments: {}", argMap);
}
return argMap;
}
/**
* Build the option key (--name) from the option name
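 * (for example, {@code keyFor("input")} returns {@code "--input"})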
*/
public static String keyFor(String optionName) {
return "--" + optionName;
}
/**
* @return the requested option, or null if it has not been specified
*/
public String getOption(String optionName) {
List<String> list = argMap.get(keyFor(optionName));
if (list != null && !list.isEmpty()) {
return list.get(0);
}
return null;
}
/**
* Get the option, else the default
* @param optionName The name of the option to look up, without the --
* @param defaultVal The default value.
* @return The requested option, else the default value if it doesn't exist
*/
public String getOption(String optionName, String defaultVal) {
String res = getOption(optionName);
if (res == null) {
res = defaultVal;
}
return res;
}
public int getInt(String optionName) {
return Integer.parseInt(getOption(optionName));
}
public int getInt(String optionName, int defaultVal) {
return Integer.parseInt(getOption(optionName, String.valueOf(defaultVal)));
}
public float getFloat(String optionName) {
return Float.parseFloat(getOption(optionName));
}
public float getFloat(String optionName, float defaultVal) {
return Float.parseFloat(getOption(optionName, String.valueOf(defaultVal)));
}
/**
* Options can occur multiple times, so return the list
* @param optionName The unadorned (no "--" prefixing it) option name
* @return The values, else null. If the option is present, but has no values, then the result will be an
* empty list (Collections.emptyList())
*/
public List<String> getOptions(String optionName) {
return argMap.get(keyFor(optionName));
}
/**
* @return if the requested option has been specified
*/
public boolean hasOption(String optionName) {
return argMap.containsKey(keyFor(optionName));
}
/**
* Get the cardinality of the input vectors
*
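 * <p>A usage sketch, assuming {@code getInputPath()} points at a single sequence file whose
 * values are {@link VectorWritable}s:</p>
 * <pre>{@code
 * int dimensions = getDimensions(getInputPath());
 * }</pre>
 *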
 * @param matrix path to a {@code SequenceFile} whose values are {@link VectorWritable}s
* @return the cardinality of the vector
*/
public int getDimensions(Path matrix) throws IOException {
try (SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.get(getConf()), matrix, getConf())){
Writable row = ClassUtils.instantiateAs(reader.getKeyClass().asSubclass(Writable.class), Writable.class);
Preconditions.checkArgument(reader.getValueClass().equals(VectorWritable.class),
"value type of sequencefile must be a VectorWritable");
VectorWritable vectorWritable = new VectorWritable();
boolean hasAtLeastOneRow = reader.next(row, vectorWritable);
Preconditions.checkState(hasAtLeastOneRow, "matrix must have at least one row");
return vectorWritable.get().size();
}
}
/**
 * Obtain input and output directories from command-line options or hadoop
 * properties. If {@code addInputOption} or {@code addOutputOption}
 * has been called, this method will throw an {@code IllegalArgumentException} if
 * no source (command-line or property) for that value is present.
 * Otherwise, {@code inputPath} or {@code outputPath} will be
 * non-null only if specified as a hadoop property. Command-line options
 * take precedence over hadoop properties.
 *
 * @throws IllegalArgumentException if the input option was added, input is not optional, and
 * neither {@code --input} nor {@code -Dmapred.input.dir} is specified; or if the output option
 * was added, output is not optional, and neither {@code --output} nor
 * {@code -Dmapred.output.dir} is specified.
*/
protected void parseDirectories(CommandLine cmdLine, boolean inputOptional, boolean outputOptional) {
Configuration conf = getConf();
if (inputOption != null && cmdLine.hasOption(inputOption)) {
this.inputPath = new Path(cmdLine.getValue(inputOption).toString());
this.inputFile = new File(cmdLine.getValue(inputOption).toString());
}
if (inputPath == null && conf.get("mapred.input.dir") != null) {
this.inputPath = new Path(conf.get("mapred.input.dir"));
}
if (outputOption != null && cmdLine.hasOption(outputOption)) {
this.outputPath = new Path(cmdLine.getValue(outputOption).toString());
this.outputFile = new File(cmdLine.getValue(outputOption).toString());
}
if (outputPath == null && conf.get("mapred.output.dir") != null) {
this.outputPath = new Path(conf.get("mapred.output.dir"));
}
Preconditions.checkArgument(inputOptional || inputOption == null || inputPath != null,
"No input specified or -Dmapred.input.dir must be provided to specify input directory");
Preconditions.checkArgument(outputOptional || outputOption == null || outputPath != null,
"No output specified: or -Dmapred.output.dir must be provided to specify output directory");
}
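/**
 * Copies the values of the given options from the parsed command line into the argument map,
 * keyed by each option's preferred (long) name, e.g. {@code --input}. Options that did not
 * appear and carry no default are skipped; options present without values (flags) are stored
 * with a {@code null} value.
 */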
protected static void maybePut(Map<String, List<String>> args, CommandLine cmdLine, Option... opt) {
for (Option o : opt) {
// the option appeared on the command-line, or it has a value
// (which is likely a default value).
if (cmdLine.hasOption(o) || cmdLine.getValue(o) != null
|| (cmdLine.getValues(o) != null && !cmdLine.getValues(o).isEmpty())) {
// nulls are ok, for cases where options are simple flags.
List<?> vo = cmdLine.getValues(o);
if (vo != null && !vo.isEmpty()) {
List<String> vals = new ArrayList<>();
for (Object o1 : vo) {
vals.add(o1.toString());
}
args.put(o.getPreferredName(), vals);
} else {
args.put(o.getPreferredName(), null);
}
}
}
}
/**
*
* @param args The input argument map
* @param optName The adorned (including "--") option name
* @return The first value in the match, else null
*/
public static String getOption(Map<String, List<String>> args, String optName) {
List<String> res = args.get(optName);
if (res != null && !res.isEmpty()) {
return res.get(0);
}
return null;
}
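/**
 * Determines whether the phase numbered by {@code currentPhase} falls inside the range given
 * by the {@code --startPhase}/{@code --endPhase} arguments, logging and skipping it otherwise;
 * the counter is incremented as a side effect. A typical usage sketch inside a subclass's
 * {@code run} method, where {@code parsedArgs} is the map returned by
 * {@link #parseArguments(String[])} (the phase bodies are placeholders):
 *
 * <pre>{@code
 * AtomicInteger currentPhase = new AtomicInteger();
 * if (shouldRunNextPhase(parsedArgs, currentPhase)) {
 *   // run the job for phase 0
 * }
 * if (shouldRunNextPhase(parsedArgs, currentPhase)) {
 *   // run the job for phase 1
 * }
 * }</pre>
 */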
protected static boolean shouldRunNextPhase(Map<String, List<String>> args, AtomicInteger currentPhase) {
int phase = currentPhase.getAndIncrement();
String startPhase = getOption(args, "--startPhase");
String endPhase = getOption(args, "--endPhase");
boolean phaseSkipped = (startPhase != null && phase < Integer.parseInt(startPhase))
|| (endPhase != null && phase > Integer.parseInt(endPhase));
if (phaseSkipped) {
log.info("Skipping phase {}", phase);
}
return !phaseSkipped;
}
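/**
 * Convenience factory for a map-only {@link Job} with the given input/output formats and mapper
 * output types, configured with this job's {@link Configuration}; the job name is derived from
 * the subclass and mapper class names.
 */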
protected Job prepareJob(Path inputPath,
Path outputPath,
Class<? extends InputFormat> inputFormat,
Class<? extends Mapper> mapper,
Class<? extends Writable> mapperKey,
Class<? extends Writable> mapperValue,
Class<? extends OutputFormat> outputFormat) throws IOException {
return prepareJob(inputPath, outputPath, inputFormat, mapper, mapperKey, mapperValue, outputFormat, null);
}
protected Job prepareJob(Path inputPath,
Path outputPath,
Class<? extends InputFormat> inputFormat,
Class<? extends Mapper> mapper,
Class<? extends Writable> mapperKey,
Class<? extends Writable> mapperValue,
Class<? extends OutputFormat> outputFormat,
String jobname) throws IOException {
Job job = HadoopUtil.prepareJob(inputPath, outputPath,
inputFormat, mapper, mapperKey, mapperValue, outputFormat, getConf());
String name =
jobname != null ? jobname : HadoopUtil.getCustomJobName(getClass().getSimpleName(), job, mapper, Reducer.class);
job.setJobName(name);
return job;
}
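/**
 * Convenience factory for a map-reduce {@link Job} that reads and writes sequence files,
 * wiring up the given mapper, reducer and their output types with this job's
 * {@link Configuration}. A usage sketch ({@code MyMapper} and {@code MyReducer} are
 * hypothetical placeholders for a subclass's own classes):
 *
 * <pre>{@code
 * Job job = prepareJob(getInputPath(), getOutputPath(),
 *     MyMapper.class, Text.class, VectorWritable.class,
 *     MyReducer.class, Text.class, VectorWritable.class);
 * if (!job.waitForCompletion(true)) {
 *   return -1;
 * }
 * }</pre>
 */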
protected Job prepareJob(Path inputPath, Path outputPath, Class<? extends Mapper> mapper,
Class<? extends Writable> mapperKey, Class<? extends Writable> mapperValue, Class<? extends Reducer> reducer,
Class<? extends Writable> reducerKey, Class<? extends Writable> reducerValue) throws IOException {
return prepareJob(inputPath, outputPath, SequenceFileInputFormat.class, mapper, mapperKey, mapperValue, reducer,
reducerKey, reducerValue, SequenceFileOutputFormat.class);
}
protected Job prepareJob(Path inputPath,
Path outputPath,
Class<? extends InputFormat> inputFormat,
Class<? extends Mapper> mapper,
Class<? extends Writable> mapperKey,
Class<? extends Writable> mapperValue,
Class<? extends Reducer> reducer,
Class<? extends Writable> reducerKey,
Class<? extends Writable> reducerValue,
Class<? extends OutputFormat> outputFormat) throws IOException {
Job job = HadoopUtil.prepareJob(inputPath, outputPath,
inputFormat, mapper, mapperKey, mapperValue, reducer, reducerKey, reducerValue, outputFormat, getConf());
job.setJobName(HadoopUtil.getCustomJobName(getClass().getSimpleName(), job, mapper, Reducer.class));
return job;
}
/**
 * Necessary to make a job with a combined input path work on Amazon S3; hopefully this becomes
 * obsolete once MultipleInputs is available again.
*/
public static void setS3SafeCombinedInputPath(Job job, Path referencePath, Path inputPathOne, Path inputPathTwo)
throws IOException {
FileSystem fs = FileSystem.get(referencePath.toUri(), job.getConfiguration());
FileInputFormat.setInputPaths(job, inputPathOne.makeQualified(fs), inputPathTwo.makeQualified(fs));
}
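/**
 * Resolves the Lucene {@link Analyzer} class selected via the
 * {@link DefaultOptionCreator#ANALYZER_NAME_OPTION} option, defaulting to
 * {@link StandardAnalyzer} when the option is absent; the analyzer is instantiated once so
 * that an unusable class is rejected immediately.
 *
 * @throws ClassNotFoundException if the named analyzer class cannot be loaded
 */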
protected Class<? extends Analyzer> getAnalyzerClassFromOption() throws ClassNotFoundException {
Class<? extends Analyzer> analyzerClass = StandardAnalyzer.class;
if (hasOption(DefaultOptionCreator.ANALYZER_NAME_OPTION)) {
String className = getOption(DefaultOptionCreator.ANALYZER_NAME_OPTION);
analyzerClass = Class.forName(className).asSubclass(Analyzer.class);
// try instantiating it, b/c there isn't any point in setting it if
// you can't instantiate it
//ClassUtils.instantiateAs(analyzerClass, Analyzer.class);
AnalyzerUtils.createAnalyzer(analyzerClass);
}
return analyzerClass;
}
/**
* Overrides the base implementation to install the Oozie action configuration resource
* into the provided Configuration object; note that ToolRunner calls setConf on the Tool
* before it invokes run.
*/
@Override
public void setConf(Configuration conf) {
super.setConf(conf);
// If running in an Oozie workflow as a Java action, need to add the
// Configuration resource provided by Oozie to this job's config.
String oozieActionConfXml = System.getProperty("oozie.action.conf.xml");
if (oozieActionConfXml != null && conf != null) {
conf.addResource(new Path("file:///", oozieActionConfXml));
log.info("Added Oozie action Configuration resource {} to the Hadoop Configuration", oozieActionConfXml);
}
}
}