org.apache.avro.mapred.tether.TetherJob Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of avro-mapred Show documentation
Show all versions of avro-mapred Show documentation
An org.apache.hadoop.mapred compatible API for using Avro Serialization in Hadoop
The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.avro.mapred.tether;
import java.io.File;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.net.URI;
import java.net.URISyntaxException;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RunningJob;
/**
 * Constructs and submits tether jobs. This may be used as an API-based method
 * to launch tether jobs.
 *
 * <p>
 * Every method is static and operates on the {@link JobConf} it is handed; the
 * settings are stored under the {@code avro.tether.*} configuration keys
 * declared below.
 */
@SuppressWarnings("deprecation")
public class TetherJob extends Configured {

  /** Configuration key holding the location of the executable to run. */
  public static final String TETHER_EXEC = "avro.tether.executable";
  /** Configuration key holding the newline-terminated argument list. */
  public static final String TETHER_EXEC_ARGS = "avro.tether.executable_args";
  /** Configuration key holding "true"/"false": add the executable to the cache? */
  public static final String TETHER_EXEC_CACHED = "avro.tether.executable_cached";
  /** Configuration key holding the transport protocol name. */
  public static final String TETHER_PROTOCOL = "avro.tether.protocol";

  /**
   * Get the URI of the application's executable.
   *
   * @param job job configuration to read {@link #TETHER_EXEC} from
   * @return the configured value parsed as a {@link URI}
   * @throws RuntimeException wrapping the {@link URISyntaxException} when the
   *                          configured value is not a valid URI
   */
  public static URI getExecutable(JobConf job) {
    try {
      // Use the shared constant so the getter and setter cannot drift apart.
      return new URI(job.get(TETHER_EXEC));
    } catch (URISyntaxException e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * Set the application's executable with no extra arguments and without
   * caching. Normally this is in HDFS.
   *
   * @param job        job configuration to update
   * @param executable the executable to run
   */
  public static void setExecutable(JobConf job, File executable) {
    setExecutable(job, executable, Collections.<String>emptyList(), false);
  }

  /**
   * Set the application's executable (i.e. the program to run in a subprocess
   * and that provides the mapper/reducer).
   *
   * @param job        job configuration to update
   * @param executable the executable; its {@code toString()} form is stored
   *                   under {@link #TETHER_EXEC}
   * @param args       additional arguments, or {@code null} for none. When
   *                   non-null, each argument is stored followed by a newline
   *                   under {@link #TETHER_EXEC_ARGS}; when {@code null} that
   *                   key is left untouched
   * @param cached     if true, the executable is added to the
   *                   DistributedCache when the job is set up; if false it is
   *                   not cached, e.g. because the file is already stored on
   *                   each local file system or is on an NFS share
   */
  public static void setExecutable(JobConf job, File executable, List<String> args, boolean cached) {
    job.set(TETHER_EXEC, executable.toString());
    if (args != null) {
      StringBuilder sb = new StringBuilder();
      for (String a : args) {
        // newline acts as the terminator of each argument
        sb.append(a).append('\n');
      }
      job.set(TETHER_EXEC_ARGS, sb.toString());
    }
    job.set(TETHER_EXEC_CACHED, Boolean.toString(cached));
  }

  /**
   * Extract from the job configuration the {@link TetheredProcess.Protocol} to
   * use for the communication between the parent and the subprocess.
   *
   * @param job job configuration to read {@link #TETHER_PROTOCOL} from
   * @return {@code NONE} when the key is unset, {@code HTTP} for "http",
   *         {@code SASL} for "sasl"
   * @throws RuntimeException if the key holds any other value
   */
  public static TetheredProcess.Protocol getProtocol(JobConf job) {
    // Read the setting once instead of re-querying the configuration per branch.
    String proto = job.get(TETHER_PROTOCOL);
    if (proto == null) {
      return TetheredProcess.Protocol.NONE;
    } else if (proto.equals("http")) {
      return TetheredProcess.Protocol.HTTP;
    } else if (proto.equals("sasl")) {
      return TetheredProcess.Protocol.SASL;
    } else {
      throw new RuntimeException("Unknown value for protocol: " + proto);
    }
  }

  /**
   * Submit a job to the map/reduce cluster via {@code JobClient.runJob}. All of
   * the necessary modifications to the job to run under tether are made to the
   * configuration.
   *
   * @param job job configuration; modified in place
   * @return the handle returned by {@code JobClient.runJob}
   * @throws IOException if setup or submission fails
   */
  public static RunningJob runJob(JobConf job) throws IOException {
    setupTetherJob(job);
    return JobClient.runJob(job);
  }

  /**
   * Submit a job to the Map-Reduce framework through a new {@link JobClient}.
   * The configuration receives the same tether modifications as in
   * {@link #runJob(JobConf)}.
   *
   * @param conf job configuration; modified in place
   * @return the handle returned by {@code JobClient.submitJob}
   * @throws IOException if setup or submission fails
   */
  public static RunningJob submitJob(JobConf conf) throws IOException {
    setupTetherJob(conf);
    return new JobClient(conf).submitJob(conf);
  }

  /**
   * Sets which transport protocol (http or sasl) is used to communicate between
   * the parent and subprocess.
   *
   * @param job   job configuration to update
   * @param proto protocol name; must not be null. It is trimmed and lower-cased
   *              before being validated and stored
   * @throws IOException if the normalized value is neither "http" nor "sasl"
   */
  public static void setProtocol(JobConf job, String proto) throws IOException {
    proto = proto.trim().toLowerCase();
    if (!(proto.equals("http") || proto.equals("sasl"))) {
      throw new IOException("protocol must be 'http' or 'sasl'");
    }
    job.set(TETHER_PROTOCOL, proto);
  }

  /**
   * Applies to the configuration everything a job needs to run under tether:
   * the tether runner/partitioner/reducer/format classes, the key and value
   * classes, a default protocol, the key serialization, and (optionally) the
   * cache entry for the executable.
   *
   * @param job job configuration; modified in place
   * @throws IOException declared for the cache registration call
   */
  private static void setupTetherJob(JobConf job) throws IOException {
    job.setMapRunnerClass(TetherMapRunner.class);
    job.setPartitionerClass(TetherPartitioner.class);
    job.setReducerClass(TetherReducer.class);

    job.setInputFormat(TetherInputFormat.class);
    job.setOutputFormat(TetherOutputFormat.class);

    job.setOutputKeyClass(TetherData.class);
    job.setOutputKeyComparatorClass(TetherKeyComparator.class);
    job.setMapOutputValueClass(NullWritable.class);

    // set the map output key class to TetherData
    job.setMapOutputKeyClass(TetherData.class);

    // if protocol isn't set, fall back to sasl
    if (job.getStrings(TETHER_PROTOCOL) == null) {
      job.set(TETHER_PROTOCOL, "sasl");
    }

    // add TetherKeySerialization to io.serializations, unless already listed
    Collection<String> serializations = job.getStringCollection("io.serializations");
    String tetherSerialization = TetherKeySerialization.class.getName();
    if (!serializations.contains(tetherSerialization)) {
      serializations.add(tetherSerialization);
      job.setStrings("io.serializations", serializations.toArray(new String[0]));
    }

    // determine whether the executable should be added to the cache.
    if (job.getBoolean(TETHER_EXEC_CACHED, false)) {
      DistributedCache.addCacheFile(getExecutable(job), job);
    }
  }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy