com.mongodb.hadoop.util.MongoTool (mongo-hadoop-core)

The MongoDB Connector for Hadoop is a plugin for Hadoop that provides the ability to use MongoDB as an input source and/or an output destination.
/*
* Copyright 2010-2013 10gen Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.mongodb.hadoop.util;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.Tool;
import java.io.IOException;
import java.util.Map.Entry;
/**
 * Tool for simplifying the setup and usage of Mongo Hadoop jobs, using the
 * Tool / Configured interfaces for use with a ToolRunner. Primarily useful
 * where jobs are configured via XML config files.
 *
 * A {@code main} method is required to run the job; suggested implementation
 * template:
 *
 * <pre>{@code
 * public static void main(String[] args) throws Exception {
 *     System.exit(ToolRunner.run(new YourClass(), args));
 * }
 * }</pre>
 *
 * @author Brendan W. McAdams
 */
public class MongoTool extends Configured implements Tool {
private static final Log LOG = LogFactory.getLog(MongoTool.class);
    /**
     * The name of the job on the cluster. Left non-final to allow tweaking
     * with serial numbers, etc. Defaults to {@code getClass().getSimpleName()}.
     */
private String jobName = getClass().getSimpleName();
public String getJobName() {
return jobName;
}
public void setJobName(final String name) {
jobName = name;
}
@SuppressWarnings("deprecation")
public int run(final String[] args) throws Exception {
        /*
         * ToolRunner has already configured/processed/set up the config, so
         * grab the class-level one here; it is initialized with any loaded
         * XML files and -D property=value parameters.
         */
final Configuration conf = getConf();
boolean mapredJob;
LOG.info(String.format("Created a conf: '%s' on {%s} as job named '%s'", conf, getClass(), getJobName()));
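        /*
         * Probe via reflection for FileSystem.DEFAULT_FS: if the field is
         * present, run the job through the org.apache.hadoop.mapreduce API;
         * if it is missing (older Hadoop releases), fall back to the legacy
         * org.apache.hadoop.mapred API.
         */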
try {
FileSystem.class.getDeclaredField("DEFAULT_FS").get(null);
mapredJob = false;
} catch (NoSuchFieldException e) {
mapredJob = true;
}
if (LOG.isTraceEnabled()) {
            for (final Entry<String, String> entry : conf) {
LOG.trace(String.format("%s=%s\n", entry.getKey(), entry.getValue()));
}
}
if (mapredJob) {
return runMapredJob(conf);
} else {
return runMapReduceJob(conf);
}
}
private int runMapredJob(final Configuration conf) {
final JobConf job = new JobConf(conf, getClass());
        /*
         * Any arguments specified with -D property=value on the CLI will be
         * picked up and set here; they override any XML-level values. Note
         * that the space after -D matters: -Dproperty=value (no space) is
         * consumed by the JVM itself as a system property and never reaches
         * the Hadoop Configuration.
         */
// TODO - Do we need to set job name somehow more specifically?
// This may or may not be correct/sane
job.setJarByClass(getClass());
        final Class<? extends org.apache.hadoop.mapred.Mapper> mapper = MapredMongoConfigUtil.getMapper(conf);
LOG.debug("Mapper Class: " + mapper);
LOG.debug("Input URI: " + conf.get(MapredMongoConfigUtil.INPUT_URI));
job.setMapperClass(mapper);
        Class<? extends org.apache.hadoop.mapred.Reducer> combiner = MapredMongoConfigUtil.getCombiner(conf);
if (combiner != null) {
job.setCombinerClass(combiner);
}
job.setReducerClass(MapredMongoConfigUtil.getReducer(conf));
job.setOutputFormat(MapredMongoConfigUtil.getOutputFormat(conf));
job.setOutputKeyClass(MapredMongoConfigUtil.getOutputKey(conf));
job.setOutputValueClass(MapredMongoConfigUtil.getOutputValue(conf));
job.setInputFormat(MapredMongoConfigUtil.getInputFormat(conf));
        final Class<?> mapOutputKeyClass = MapredMongoConfigUtil.getMapperOutputKey(conf);
        final Class<?> mapOutputValueClass = MapredMongoConfigUtil.getMapperOutputValue(conf);
if (mapOutputKeyClass != null) {
job.setMapOutputKeyClass(mapOutputKeyClass);
}
if (mapOutputValueClass != null) {
job.setMapOutputValueClass(mapOutputValueClass);
}
        /*
         * Determines whether the job runs verbosely, i.e. prints debug
         * output. Only applies to foreground jobs.
         */
final boolean verbose = MapredMongoConfigUtil.isJobVerbose(conf);
        /*
         * Run the job in the foreground (wait for completion) or in the
         * background?
         */
final boolean background = MapredMongoConfigUtil.isJobBackground(conf);
        try {
            if (background) {
                LOG.info("Setting up and running MapReduce job in background.");
                // JobClient.runJob() blocks until the job completes, so
                // background mode must use submitJob(), which returns
                // immediately after submission.
                final JobClient client = new JobClient(job);
                client.submitJob(job);
                return 0;
            } else {
                LOG.info("Setting up and running MapReduce job in foreground, will wait for results. {Verbose? "
                         + verbose + "}");
                final RunningJob runningJob = JobClient.runJob(job);
                return runningJob.isSuccessful() ? 0 : 1;
            }
        } catch (final Exception e) {
            LOG.error("Exception while executing job... ", e);
            return 1;
        }
}
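    /*
     * Illustrative only: a hypothetical CLI invocation of a MongoTool-based
     * job, overriding XML-configured values as described above. The jar,
     * class, and URIs are placeholders; the property keys mirror
     * MongoConfigUtil constants such as INPUT_URI ("mongo.input.uri").
     *
     *   hadoop jar my-job.jar com.example.ExampleMongoJob \
     *       -D mongo.input.uri=mongodb://localhost:27017/demo.input \
     *       -D mongo.output.uri=mongodb://localhost:27017/demo.output \
     *       -D mongo.job.verbose=true
     */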
private int runMapReduceJob(final Configuration conf) throws IOException {
final Job job = Job.getInstance(conf, getJobName());
        /*
         * Any arguments specified with -D property=value on the CLI will be
         * picked up and set here; they override any XML-level values. Note
         * that the space after -D matters: -Dproperty=value (no space) is
         * consumed by the JVM itself as a system property and never reaches
         * the Hadoop Configuration.
         */
// TODO - Do we need to set job name somehow more specifically?
// This may or may not be correct/sane
job.setJarByClass(getClass());
        final Class<? extends Mapper> mapper = MongoConfigUtil.getMapper(conf);
LOG.debug("Mapper Class: " + mapper);
LOG.debug("Input URI: " + conf.get(MongoConfigUtil.INPUT_URI));
job.setMapperClass(mapper);
        Class<? extends Reducer> combiner = MongoConfigUtil.getCombiner(conf);
if (combiner != null) {
job.setCombinerClass(combiner);
}
job.setReducerClass(MongoConfigUtil.getReducer(conf));
job.setOutputFormatClass(MongoConfigUtil.getOutputFormat(conf));
job.setOutputKeyClass(MongoConfigUtil.getOutputKey(conf));
job.setOutputValueClass(MongoConfigUtil.getOutputValue(conf));
job.setInputFormatClass(MongoConfigUtil.getInputFormat(conf));
        final Class<?> mapOutputKeyClass = MongoConfigUtil.getMapperOutputKey(conf);
        final Class<?> mapOutputValueClass = MongoConfigUtil.getMapperOutputValue(conf);
if (mapOutputKeyClass != null) {
job.setMapOutputKeyClass(mapOutputKeyClass);
}
if (mapOutputValueClass != null) {
job.setMapOutputValueClass(mapOutputValueClass);
}
        /*
         * Determines whether the job runs verbosely, i.e. prints debug
         * output. Only applies to foreground jobs.
         */
final boolean verbose = MongoConfigUtil.isJobVerbose(conf);
        /*
         * Run the job in the foreground (wait for completion) or in the
         * background?
         */
final boolean background = MongoConfigUtil.isJobBackground(conf);
try {
if (background) {
LOG.info("Setting up and running MapReduce job in background.");
job.submit();
return 0;
} else {
LOG.info("Setting up and running MapReduce job in foreground, will wait for results. {Verbose? "
+ verbose + "}");
return job.waitForCompletion(true) ? 0 : 1;
}
} catch (final Exception e) {
LOG.error("Exception while executing job... ", e);
return 1;
}
}
}
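For reference, a minimal sketch of a concrete job class built on MongoTool, following the template in the class Javadoc above. The package, class name, and job name are hypothetical placeholders introduced for illustration, not part of mongo-hadoop-core:

package com.example;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

import com.mongodb.hadoop.util.MongoTool;

// Hypothetical example class; only MongoTool and ToolRunner come from the
// libraries discussed above.
public class ExampleMongoJob extends MongoTool {

    public ExampleMongoJob() {
        // Replace the default getClass().getSimpleName() job name.
        setJobName("example-mongo-job");
    }

    public static void main(final String[] args) throws Exception {
        // ToolRunner parses generic options (-D, -conf, -fs, ...) into the
        // Configuration before MongoTool.run() is invoked.
        System.exit(ToolRunner.run(new Configuration(), new ExampleMongoJob(), args));
    }
}

The mapper, reducer, and input/output formats would then be supplied through XML config files or -D properties, as the comments in run(), runMapredJob(), and runMapReduceJob() describe.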