
com.thinkaurelius.titan.hadoop.scan.HadoopScanRunner

package com.thinkaurelius.titan.hadoop.scan;

import com.google.common.base.Preconditions;
import com.thinkaurelius.titan.diskstorage.configuration.*;
import com.thinkaurelius.titan.diskstorage.configuration.Configuration;
import com.thinkaurelius.titan.diskstorage.keycolumnvalue.scan.ScanJob;
import com.thinkaurelius.titan.diskstorage.keycolumnvalue.scan.ScanMetrics;
import com.thinkaurelius.titan.graphdb.olap.VertexScanJob;
import com.thinkaurelius.titan.hadoop.compat.HadoopCompatLoader;
import com.thinkaurelius.titan.hadoop.config.ModifiableHadoopConfiguration;
import com.thinkaurelius.titan.hadoop.config.TitanHadoopConfiguration;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.Map;

import static com.thinkaurelius.titan.hadoop.compat.HadoopCompatLoader.DEFAULT_COMPAT;

/**
 * Utility class to construct and submit Hadoop Jobs that execute a {@link HadoopScanMapper}.
 */
public class HadoopScanRunner {

    private static final Logger log =
            LoggerFactory.getLogger(HadoopScanRunner.class);

    /**
     * Run a ScanJob on Hadoop MapReduce.
     * <p>
     * The {@code confRootField} parameter must be a string in the format
     * {@code package.package...class#fieldname}, where {@code fieldname} is the
     * name of a public static field on the class specified by the portion of the
     * string before the {@code #}. The {@code #} itself is just a separator and
     * is discarded.
     * <p>
     * When a MapReduce task process prepares to execute the {@code ScanJob}, it will
     * read the public static field named by {@code confRootField} and cast it to a
     * {@link ConfigNamespace}. This namespace object becomes the root of a
     * {@link Configuration} that is instantiated, populated with the key-value pairs
     * from the {@code conf} parameter, and then passed into the {@code ScanJob}.
     * <p>
     * This method blocks until the ScanJob completes, then returns the metrics
     * generated by the job during its execution. It does not time out.
     *
     * @param conf configuration settings for the ScanJob
     * @param confRootField the root of the ScanJob's configuration
     * @param hadoopConf the Configuration passed to the MapReduce Job
     * @param inputFormat the {@code InputFormat<StaticBuffer, Iterable<Entry>>}
     *        that reads (row, columns) pairs out of a Titan edgestore
     * @param jobName the name assigned to the MapReduce job
     * @param mapperClass the Mapper implementation to run, e.g. {@link HadoopScanMapper}
     * @return metrics generated by the ScanJob
     * @throws IOException if the job fails for any reason
     * @throws ClassNotFoundException if the {@code ScanJob} class cannot be loaded by
     *         name, or if Hadoop MapReduce's internal job-submission-related
     *         reflection fails
     * @throws InterruptedException if interrupted while waiting for the Hadoop
     *         MapReduce job to complete
     */
    public static ScanMetrics runJob(Configuration conf, String confRootField,
                                     org.apache.hadoop.conf.Configuration hadoopConf,
                                     Class<? extends InputFormat> inputFormat, String jobName,
                                     Class<? extends Mapper> mapperClass)
            throws IOException, InterruptedException, ClassNotFoundException {

        Preconditions.checkArgument(null != hadoopConf);
        Preconditions.checkArgument(null != inputFormat);

        if (null != conf) {
            Preconditions.checkArgument(null != confRootField,
                    "Configuration root field must be provided when configuration instance is provided");
        }

        ModifiableHadoopConfiguration scanConf =
                ModifiableHadoopConfiguration.of(TitanHadoopConfiguration.MAPRED_NS, hadoopConf);

        if (null != confRootField) {
            // Set the scanjob configuration root
            scanConf.set(TitanHadoopConfiguration.SCAN_JOB_CONFIG_ROOT, confRootField);

            // Instantiate scanjob configuration root
            ConfigNamespace confRoot = HadoopScanMapper.getJobRoot(confRootField);

            // Create writable view of scanjob configuration atop the Hadoop Configuration
            // instance, where all keys are prefixed with SCAN_JOB_CONFIG_KEYS
            ModifiableConfiguration hadoopJobConf = ModifiableHadoopConfiguration.prefixView(confRoot,
                    TitanHadoopConfiguration.SCAN_JOB_CONFIG_KEYS, scanConf);

            // Copy scanjob settings from the Titan Configuration instance to the Hadoop Configuration instance
            Map<String, Object> jobConfMap = conf.getSubset(confRoot);
            for (Map.Entry<String, Object> jobConfEntry : jobConfMap.entrySet()) {
                hadoopJobConf.set(
                        (ConfigOption) ConfigElement.parse(confRoot, jobConfEntry.getKey()).element,
                        jobConfEntry.getValue());
            }
        }

        return runJob(scanConf.getHadoopConfiguration(), inputFormat, jobName, mapperClass);
    }

    public static ScanMetrics runJob(org.apache.hadoop.conf.Configuration hadoopConf,
                                     Class<? extends InputFormat> inputFormat,
                                     String jobName, Class<? extends Mapper> mapperClass)
            throws IOException, InterruptedException, ClassNotFoundException {

        Job job = Job.getInstance(hadoopConf);

        //job.setJarByClass(HadoopScanMapper.class);
        job.setJarByClass(mapperClass);
        //job.setJobName(HadoopScanMapper.class.getSimpleName() + "[" + scanJob + "]");
        job.setJobName(jobName);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(NullWritable.class);
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setNumReduceTasks(0);
        //job.setMapperClass(HadoopScanMapper.class);
        job.setMapperClass(mapperClass);
        job.setOutputFormatClass(NullOutputFormat.class);
        job.setInputFormatClass(inputFormat);

        boolean success = job.waitForCompletion(true);

        if (!success) {
            String f;
            try {
                // Just in case one of Job's methods throws an exception
                f = String.format("MapReduce JobID %s terminated abnormally: %s",
                        job.getJobID().toString(),
                        HadoopCompatLoader.DEFAULT_COMPAT.getJobFailureString(job));
            } catch (RuntimeException e) {
                f = "Job failed (unable to read job status programmatically -- see MapReduce logs for information)";
            }
            throw new IOException(f);
        } else {
            return DEFAULT_COMPAT.getMetrics(job.getCounters());
        }
    }

    public static ScanMetrics runScanJob(ScanJob scanJob, Configuration conf, String confRootField,
                                         org.apache.hadoop.conf.Configuration hadoopConf,
                                         Class<? extends InputFormat> inputFormat)
            throws IOException, InterruptedException, ClassNotFoundException {

        ModifiableHadoopConfiguration scanConf =
                ModifiableHadoopConfiguration.of(TitanHadoopConfiguration.MAPRED_NS, hadoopConf);

        tryToLoadClassByName(scanJob);

        // Set the ScanJob class
        scanConf.set(TitanHadoopConfiguration.SCAN_JOB_CLASS, scanJob.getClass().getName());

        String jobName = HadoopScanMapper.class.getSimpleName() + "[" + scanJob + "]";

        return runJob(conf, confRootField, hadoopConf, inputFormat, jobName, HadoopScanMapper.class);
    }

    public static ScanMetrics runVertexScanJob(VertexScanJob vertexScanJob, Configuration conf, String confRootField,
                                               org.apache.hadoop.conf.Configuration hadoopConf,
                                               Class<? extends InputFormat> inputFormat)
            throws IOException, InterruptedException, ClassNotFoundException {

        ModifiableHadoopConfiguration scanConf =
                ModifiableHadoopConfiguration.of(TitanHadoopConfiguration.MAPRED_NS, hadoopConf);

        tryToLoadClassByName(vertexScanJob);

        // Set the VertexScanJob class
        scanConf.set(TitanHadoopConfiguration.SCAN_JOB_CLASS, vertexScanJob.getClass().getName());

        String jobName = HadoopScanMapper.class.getSimpleName() + "[" + vertexScanJob + "]";

        return runJob(conf, confRootField, hadoopConf, inputFormat, jobName, HadoopVertexScanMapper.class);
    }

    private static void tryToLoadClassByName(Object o) throws ClassNotFoundException {
        // Test that we can find this ScanJob class by its name;
        // better to detect a problem here than in the mappers
        String scanJobClassname = o.getClass().getName();
        try {
            Class.forName(scanJobClassname);
        } catch (ClassNotFoundException e) {
            log.error("Unable to locate class with name {}", scanJobClassname, e);
            throw e;
        }
    }
}





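The package...class#fieldname contract for confRootField can be made concrete with a sketch. DegreeCountJob and its namespace below are hypothetical and abbreviated (ScanJob's required methods are omitted); the only real requirement, per the Javadoc above, is a public static field castable to ConfigNamespace, which HadoopScanMapper.getJobRoot resolves reflectively inside each mapper task.

// Hypothetical ScanJob exposing a public static configuration root (sketch only).
package com.example;

import com.thinkaurelius.titan.diskstorage.configuration.ConfigNamespace;
import com.thinkaurelius.titan.diskstorage.keycolumnvalue.scan.ScanJob;

public class DegreeCountJob implements ScanJob {
    // The field that a confRootField string points at: public, static,
    // and castable to ConfigNamespace.
    public static final ConfigNamespace DEGREE_COUNT_NS =
            new ConfigNamespace(null, "degreecount", "Settings for the degree-count scan job");

    // ScanJob's required methods (e.g. process, getQueries) are omitted from this sketch.
}

// The matching confRootField value would be:
//   "com.example.DegreeCountJob#DEGREE_COUNT_NS"
// The '#' separates the fully-qualified class name from the field name and is discarded.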