All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.twitter.scalding.ReplImplicits.scala Maven / Gradle / Ivy

There is a newer version: 0.16.1-RC3
Show newest version
/*  Copyright 2013 Twitter, inc.
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package com.twitter.scalding

import cascading.flow.FlowDef
import cascading.pipe.Pipe
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{ FsShell, FileSystem }
import typed.KeyedListLike
import scala.util.{ Failure, Success }
import scala.concurrent.{ Future, ExecutionContext => ConcurrentExecutionContext }

/**
 * Mutable state backing a Scalding REPL session: the current flow definition, the active
 * [[Mode]], user configuration overrides, and helpers for switching modes, invoking the
 * Hadoop FsShell and running flows or [[Execution]]s.
 */
trait BaseReplState {

  /** Hadoop configuration keys (and value) overridden when switching to hdfs local mode. */
  private val mr1Key = "mapred.job.tracker"
  private val mr2Key = "mapreduce.framework.name"
  private val mrLocal = "local"

  /** Implicit flowDef for this Scalding shell session. */
  var flowDef: FlowDef = getEmptyFlowDef

  /** Defaults to running in local mode if no mode is specified. */
  var mode: Mode = Local(false)

  /**
   * If the repl is started in Hdfs mode, this field is used to preserve the settings
   * when switching Modes.
   */
  var storedHdfsMode: Option[Hdfs] = None

  /** Switch to Local mode (non-strict) and print the mode banner. */
  def useLocalMode(): Unit = {
    mode = Local(false)
    printModeBanner()
  }

  /** Switch to Local mode with the strict flag set and print the mode banner. */
  def useStrictLocalMode(): Unit = {
    mode = Local(true)
    printModeBanner()
  }

  /**
   * Restores the stored Hdfs mode, if there is one.
   *
   * When no Hdfs mode was stored, `mode` is left unchanged and a hint is printed instead.
   */
  private def useHdfsMode_(): Unit =
    storedHdfsMode match {
      case Some(hdfsMode) => mode = hdfsMode
      case None => println("To use HDFS/Hadoop mode, you must *start* the repl in hadoop mode to get the hadoop configuration from the hadoop command.")
    }

  /**
   * Switch to Hdfs mode.
   *
   * Removes the "local" overrides from [[customConfig]] and prints the mode banner. The
   * config update and the banner happen even when no Hdfs mode was stored.
   */
  def useHdfsMode(): Unit = {
    useHdfsMode_()
    customConfig -= mr1Key
    customConfig -= mr2Key
    printModeBanner()
  }

  /**
   * Switch to Hdfs mode, but with both job-tracker / framework keys forced to "local"
   * in [[customConfig]]. Prints the mode banner afterwards.
   */
  def useHdfsLocalMode(): Unit = {
    useHdfsMode_()
    customConfig += mr1Key -> mrLocal
    customConfig += mr2Key -> mrLocal
    printModeBanner()
  }

  /**
   * Prints the current mode and the directory reported for it: `user.dir` for Local mode,
   * the working directory of the default FileSystem for Hdfs mode.
   */
  private[scalding] def printModeBanner(): Unit = {
    val (modeString, homeDir) = mode match {
      case localMode: Local =>
        (localMode.toString, System.getProperty("user.dir"))
      case hdfsMode: Hdfs =>
        val defaultFs = FileSystem.get(hdfsMode.jobConf)
        // Only the MR2 key is consulted to decide whether to label the mode as "...Local".
        val label = customConfig.get(mr2Key) match {
          case Some(`mrLocal`) =>
            s"${hdfsMode.getClass.getSimpleName}Local(${hdfsMode.strict})"
          case _ =>
            s"${hdfsMode.getClass.getSimpleName}(${hdfsMode.strict})"
        }
        (label, defaultFs.getWorkingDirectory.toString)
      case other =>
        // `mode` is a public var of type Mode; any other implementation would otherwise
        // make the banner fail with a MatchError.
        (other.toString, System.getProperty("user.dir"))
    }
    println(s"${Console.GREEN}#### Scalding mode: ${modeString}")
    println(s"#### User home: ${homeDir}${Console.RESET}")
  }

  /** Hadoop configuration of the current mode: the jobConf in Hdfs mode, otherwise an empty one. */
  private def modeHadoopConf: Configuration =
    mode match {
      case hdfsMode: Hdfs => hdfsMode.jobConf
      case _ => new Configuration(false)
    }

  /**
   * Access to Hadoop FsShell
   *
   * @param cmdArgs list of command line parameters for FsShell, one per method argument
   * @return the value returned by `FsShell.run`
   */
  def fsShellExp(cmdArgs: String*): Int =
    new FsShell(modeHadoopConf).run(cmdArgs.toArray)

  /**
   * Access to Hadoop FsShell
   *
   * The command line is split on runs of whitespace; empty tokens (from repeated spaces,
   * tabs or an empty command line) are dropped rather than passed on as arguments.
   * No quoting or escaping is interpreted; use [[fsShellExp]] for arguments containing spaces.
   *
   * @param cmdLine command line parameters for FsShell as a single string
   * @return the value returned by `FsShell.run`
   */
  def fsShell(cmdLine: String): Int = {
    val args = cmdLine.trim.split("\\s+").filter(_.nonEmpty)
    new FsShell(modeHadoopConf).run(args)
  }

  /**
   * Configuration overrides to use for REPL executions; combined with the mode defaults
   * by [[config]].
   *
   * To make changes, don't forget to assign back to this var:
   * customConfig += "mapred.reduce.tasks" -> "2"
   */
  var customConfig: Config = Config.empty

  /* Using getter/setter here lets us get the correct defaults set by the mode
     (and read from command-line, etc) while still allowing the user to customize it */
  def config: Config = Config.defaultFrom(mode) ++ customConfig

  /** Shell used to build the jar of REPL-compiled code; overridable by implementations. */
  protected def shell: BaseScaldingShell = ScaldingShell

  /** Create config for execution. Tacks on a new jar for each execution. */
  private[scalding] def executionConfig: Config = {
    // Create a jar to hold compiled code for this REPL session in addition to
    // "tempjars" which can be passed in from the command line, allowing code
    // in the repl to be distributed for the Hadoop job to run.
    val replCodeJar: Option[java.io.File] = shell.createReplCodeJar()
    val tmpJarsConfig: Map[String, String] =
      replCodeJar match {
        case Some(jar) =>
          // Keep tmpjars already in the configuration, then append the REPL code jar.
          val existing = config.get("tmpjars").map(_ + ",").getOrElse("")
          Map("tmpjars" -> existing.concat("file://" + jar.getAbsolutePath))
        case None =>
          // No need to add the tmpjars to the configuration
          Map.empty
      }
    config ++ tmpJarsConfig
  }

  /**
   * Sets the flow definition in implicit scope to an empty flow definition.
   */
  def resetFlowDef(): Unit = {
    flowDef = getEmptyFlowDef
  }

  /**
   * Gets a new, empty, flow definition.
   *
   * @return a new, empty flow definition named "ScaldingShell".
   */
  def getEmptyFlowDef: FlowDef = {
    val fd = new FlowDef
    fd.setName("ScaldingShell")
    fd
  }

  /**
   * Runs the given flow definition as a Scalding job and blocks until it finishes.
   *
   * NOTE(review): the original note says the flowDef is cleaned up to include only sources
   * upstream from tails; that happens (if at all) inside ExecutionContext, not here.
   *
   * @param fd flow definition to run
   * @param md mode to run it in
   * @return the job stats on success; None on failure, after printing the stack trace
   */
  def run(implicit fd: FlowDef, md: Mode): Option[JobStats] =
    ExecutionContext.newContext(executionConfig)(fd, md).waitFor match {
      case Success(stats) => Some(stats)
      case Failure(e) =>
        println("Flow execution failed!")
        e.printStackTrace()
        None
    }

  /**
   * Starts the Execution, but does not wait for the result
   */
  def asyncExecute[T](execution: Execution[T])(implicit ec: ConcurrentExecutionContext): Future[T] =
    execution.run(executionConfig, mode)

  /**
   * This runs the Execution[T] and waits for the result.
   * A failed execution is rethrown by the `.get` on the returned Try.
   */
  def execute[T](execution: Execution[T]): T =
    execution.waitFor(executionConfig, mode).get
}

/**
 * Implicit conversions used to build Scalding flows interactively in the REPL:
 * Pipe/RichPipe/Source/Iterable conversions and shell enrichments for typed pipes.
 */
object ReplImplicits extends FieldConversions {

  /**
   * Converts a Cascading Pipe to a Scalding RichPipe. This method permits implicit conversions from
   * Pipe to RichPipe.
   *
   * @param pipe to convert to a RichPipe.
   * @return a RichPipe wrapping the specified Pipe.
   */
  implicit def pipeToRichPipe(pipe: Pipe): RichPipe = new RichPipe(pipe)

  /**
   * Converts a Scalding RichPipe to a Cascading Pipe. This method permits implicit conversions from
   * RichPipe to Pipe.
   *
   * @param richPipe to convert to a Pipe.
   * @return the Pipe wrapped by the specified RichPipe.
   */
  implicit def richPipeToPipe(richPipe: RichPipe): Pipe = richPipe.pipe

  /**
   * Converts a Source to a RichPipe. This method permits implicit conversions from Source to
   * RichPipe.
   *
   * @param source to convert to a RichPipe.
   * @param flowDef flow definition the source is read into.
   * @param mode mode the source is read with.
   * @return a RichPipe wrapping the result of reading the specified Source.
   */
  implicit def sourceToRichPipe(source: Source)(implicit flowDef: FlowDef, mode: Mode): RichPipe =
    RichPipe(source.read(flowDef, mode))

  /**
   * Converts an iterable into a Source with index (int-based) fields.
   *
   * @param iterable to convert into a Source.
   * @param setter implicitly retrieved and used to convert the specified iterable into a Source.
   * @param converter implicitly retrieved and used to convert the specified iterable into a Source.
   * @return a Source backed by the specified iterable.
   */
  implicit def iterableToSource[T](iterable: Iterable[T])(
    implicit setter: TupleSetter[T],
    converter: TupleConverter[T]): Source =
    IterableSource[T](iterable)(setter, converter)

  /**
   * Converts an iterable into a Pipe with index (int-based) fields.
   *
   * @param iterable to convert into a Pipe.
   * @param setter implicitly retrieved and used to convert the specified iterable into a Pipe.
   * @param converter implicitly retrieved and used to convert the specified iterable into a Pipe.
   * @param flowDef flow definition the intermediate Source is read into.
   * @param mode mode the intermediate Source is read with.
   * @return a Pipe backed by the specified iterable.
   */
  implicit def iterableToPipe[T](iterable: Iterable[T])(
    implicit setter: TupleSetter[T],
    converter: TupleConverter[T],
    flowDef: FlowDef,
    mode: Mode): Pipe =
    iterableToSource(iterable)(setter, converter).read(flowDef, mode)

  /**
   * Converts an iterable into a RichPipe with index (int-based) fields.
   *
   * @param iterable to convert into a RichPipe.
   * @param setter implicitly retrieved and used to convert the specified iterable into a RichPipe.
   * @param converter implicitly retrieved and used to convert the specified iterable into a
   *     RichPipe.
   * @param flowDef flow definition the intermediate Source is read into.
   * @param mode mode the intermediate Source is read with.
   * @return a RichPipe backed by the specified iterable.
   */
  implicit def iterableToRichPipe[T](iterable: Iterable[T])(
    implicit setter: TupleSetter[T],
    converter: TupleConverter[T],
    flowDef: FlowDef,
    mode: Mode): RichPipe =
    RichPipe(iterableToPipe(iterable)(setter, converter, flowDef, mode))

  /**
   * Convert KeyedListLike to enriched ShellTypedPipe
   * (e.g. allows .snapshot to be called on Grouped, CoGrouped, etc)
   *
   * The result type is declared explicitly so the conversion does not depend on
   * type inference of an implicit definition.
   */
  implicit def keyedListLikeToShellTypedPipe[K, V, T[K, +V] <: KeyedListLike[K, V, T]](
    kll: KeyedListLike[K, V, T])(implicit state: BaseReplState): ShellTypedPipe[(K, V)] =
    new ShellTypedPipe(kll.toTypedPipe)(state)

  /**
   * Enrich TypedPipe for the shell
   * (e.g. allows .snapshot to be called on it)
   */
  implicit def typedPipeToShellTypedPipe[T](pipe: TypedPipe[T])(
    implicit state: BaseReplState): ShellTypedPipe[T] =
    new ShellTypedPipe[T](pipe)(state)

  /**
   * Enrich ValuePipe for the shell
   * (e.g. allows .toOption to be called on it)
   */
  implicit def valuePipeToShellValuePipe[T](pipe: ValuePipe[T])(
    implicit state: BaseReplState): ShellValuePipe[T] =
    new ShellValuePipe[T](pipe)(state)

}

/**
 * Default REPL state: a singleton [[BaseReplState]] with no members overridden.
 * [[ReplImplicitContext]] takes its implicit state, flowDef, mode and config from this object.
 */
object ReplState extends BaseReplState

/**
 * Implicit FlowDef and Mode, import in the REPL to have the global context implicitly
 * used everywhere.
 *
 * All members carry explicit types (the same types that were previously inferred), so
 * implicit resolution does not depend on inference of these definitions.
 */
object ReplImplicitContext {
  /** Implicit execution context for using the Execution monad */
  implicit val executionContext: scala.concurrent.ExecutionContextExecutor =
    ConcurrentExecutionContext.global

  /** Implicit repl state used for ShellPipes */
  implicit def stateImpl: ReplState.type = ReplState

  /** Implicit flowDef for this Scalding shell session; re-read from ReplState on every use. */
  implicit def flowDefImpl: FlowDef = ReplState.flowDef

  /** Current mode of the REPL state; defaults to local mode if no mode is specified. */
  implicit def modeImpl: Mode = ReplState.mode

  /** Current effective configuration of the REPL state (ReplState.config). */
  implicit def configImpl: Config = ReplState.config
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy