All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.avro.mapred.tether.TetherJob Maven / Gradle / Ivy

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.avro.mapred.tether;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.net.URI;
import java.net.URISyntaxException;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RunningJob;

/**
 * Constructs and submits tether jobs. This may be used as an API-based
 * method to launch tether jobs.
 *
 * <p>All settings are stored as plain string properties on the supplied
 * {@link JobConf}; the property names are exposed as the {@code TETHER_*}
 * constants below.
 */
@SuppressWarnings("deprecation")
public class TetherJob extends Configured {

  /** Property holding the location of the executable to run. */
  public static final String TETHER_EXEC="avro.tether.executable";
  /** Property holding the newline-separated arguments for the executable. */
  public static final String TETHER_EXEC_ARGS="avro.tether.executable_args";
  /** Property holding "true"/"false": whether to add the executable to the DistributedCache. */
  public static final String TETHER_EXEC_CACHED="avro.tether.executable_cached";
  /** Property holding the transport protocol name ("http" or "sasl"). */
  public static final String TETHER_PROTOCOL="avro.tether.protocol";

  /**
   * Get the URI of the application's executable.
   *
   * @param job the job configuration to read {@link #TETHER_EXEC} from
   * @return the configured value parsed as a {@link URI}
   * @throws RuntimeException wrapping the {@link URISyntaxException} if the
   *         configured value is not a valid URI
   */
  public static URI getExecutable(JobConf job) {
    try {
      // Use the shared constant so getter and setter can never drift apart.
      return new URI(job.get(TETHER_EXEC));
    } catch (URISyntaxException e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * Set the application's executable with no additional arguments and
   * without caching it. Normally this is in HDFS.
   *
   * @param job the job configuration to update
   * @param executable the program to run
   */
  public static void setExecutable(JobConf job, File executable) {
    setExecutable(job, executable, new ArrayList<String>(), false);
  }

  /**
   * Set the application's executable (i.e. the program to run in a subprocess
   * and which provides the mapper/reducer).
   *
   * @param job the job configuration to update
   * @param executable the program to run; stored via {@link File#toString()}
   * @param args list of additional arguments; {@code null} if there are none,
   *        in which case {@link #TETHER_EXEC_ARGS} is left untouched
   * @param cached if true, the executable is added to the DistributedCache
   *        when the job is set up; if false it is not cached (e.g. if the
   *        file is already stored on each local file system or is on an NFS
   *        share)
   */
  public static void setExecutable(JobConf job, File executable, List<String> args, boolean cached) {
    job.set(TETHER_EXEC, executable.toString());
    if (args != null) {
      // Arguments are serialized one per line; every argument, including the
      // last, is followed by '\n'.
      StringBuilder sb = new StringBuilder();
      for (String a : args) {
        sb.append(a);
        sb.append('\n');
      }
      job.set(TETHER_EXEC_ARGS, sb.toString());
    }
    job.set(TETHER_EXEC_CACHED, Boolean.toString(cached));
  }

  /**
   * Extract from the job configuration the {@link TetheredProcess.Protocol}
   * to use for the communication between parent and subprocess.
   *
   * @param job the job configuration to read {@link #TETHER_PROTOCOL} from
   * @return {@code NONE} if the property is unset, {@code HTTP} for "http",
   *         {@code SASL} for "sasl"
   * @throws RuntimeException if the property holds any other value
   */
  public static TetheredProcess.Protocol getProtocol(JobConf job) {
    // Read the property once rather than on every comparison.
    String proto = job.get(TETHER_PROTOCOL);

    if (proto == null) {
      return TetheredProcess.Protocol.NONE;
    } else if (proto.equals("http")) {
      return TetheredProcess.Protocol.HTTP;
    } else if (proto.equals("sasl")) {
      return TetheredProcess.Protocol.SASL;
    } else {
      throw new RuntimeException("Unknown value for protocol: " + proto);
    }
  }

  /**
   * Submit a job to the map/reduce cluster. All of the necessary
   * modifications to the job to run under tether are made to the
   * configuration.
   *
   * @param job the job configuration; modified in place
   * @return the handle returned by {@link JobClient#runJob(JobConf)}
   * @throws IOException if setting up or running the job fails
   */
  public static RunningJob runJob(JobConf job) throws IOException {
    setupTetherJob(job);
    return JobClient.runJob(job);
  }

  /**
   * Submit a job to the Map-Reduce framework. The same tether-specific
   * modifications as in {@link #runJob(JobConf)} are applied first.
   *
   * @param conf the job configuration; modified in place
   * @return the handle returned by {@code JobClient.submitJob}
   * @throws IOException if setting up or submitting the job fails
   */
  public static RunningJob submitJob(JobConf conf) throws IOException {
    setupTetherJob(conf);
    return new JobClient(conf).submitJob(conf);
  }

  /**
   * Determines which transport protocol (e.g. http or sasl) is used to
   * communicate between the parent and subprocess.
   *
   * @param job job configuration to update
   * @param proto string identifying the protocol, currently "http" or "sasl";
   *        surrounding whitespace and letter case are ignored
   * @throws IOException if {@code proto} is neither "http" nor "sasl"
   */
  public static void setProtocol(JobConf job, String proto) throws IOException {
    proto=proto.trim().toLowerCase();

    if (!(proto.equals("http") || proto.equals("sasl"))) {
      throw new IOException("protocol must be 'http' or 'sasl'");
    }

    job.set(TETHER_PROTOCOL,proto);
  }

  /**
   * Applies every tether-specific setting to the job: runner, partitioner,
   * reducer, input/output formats, key/value classes, default protocol,
   * key serialization and (optionally) the distributed-cache entry for the
   * executable.
   *
   * @param job the job configuration; modified in place
   * @throws IOException declared for callers; raised by the underlying
   *         Hadoop calls if they fail
   */
  private static void setupTetherJob(JobConf job) throws IOException {
    job.setMapRunnerClass(TetherMapRunner.class);
    job.setPartitionerClass(TetherPartitioner.class);
    job.setReducerClass(TetherReducer.class);

    job.setInputFormat(TetherInputFormat.class);
    job.setOutputFormat(TetherOutputFormat.class);

    job.setOutputKeyClass(TetherData.class);
    job.setOutputKeyComparatorClass(TetherKeyComparator.class);
    job.setMapOutputValueClass(NullWritable.class);

    // set the map output key class to TetherData
    job.setMapOutputKeyClass(TetherData.class);

    // if protocol isn't set, fall back to sasl
    if (job.getStrings(TETHER_PROTOCOL)==null) {
      job.set(TETHER_PROTOCOL, "sasl");
    }

    // add TetherKeySerialization to io.serializations, unless already listed
    Collection<String> serializations =
      job.getStringCollection("io.serializations");
    if (!serializations.contains(TetherKeySerialization.class.getName())) {
      serializations.add(TetherKeySerialization.class.getName());
      job.setStrings("io.serializations",
                     serializations.toArray(new String[0]));
    }

    // determine whether the executable should be added to the cache.
    if (job.getBoolean(TETHER_EXEC_CACHED,false)){
      DistributedCache.addCacheFile(getExecutable(job), job);
    }
  }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy