com.twitter.scalding.Job.scala
/*
Copyright 2012 Twitter, Inc.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package com.twitter.scalding

import cascading.flow.{Flow, FlowDef, FlowProps}
import cascading.pipe.Pipe


//For java -> scala implicits on collections
import scala.collection.JavaConversions._
import java.util.{Map => JMap}

object Job {
  // Uses reflection to create a job by name
  def apply(jobName : String, args : Args) : Job =
    Class.forName(jobName).
      getConstructor(classOf[Args]).
      newInstance(args).
      asInstanceOf[Job]
}
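// Usage sketch for Job.apply; the job class name and argument string are
// placeholders, not part of this file:
//   val myJob = Job("com.example.WordCountJob", Args("--input in.tsv --output out.tsv"))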

@serializable
class Job(val args : Args) extends TupleConversions with FieldConversions {

  /**
  * You should never call these directly; they are here to make
  * the DSL work.  Just know that you can treat a Pipe as a RichPipe
  * and vice-versa within a Job.
  */
  implicit def p2rp(pipe : Pipe) = new RichPipe(pipe)
  implicit def rp2p(rp : RichPipe) = rp.pipe
  implicit def source2rp(src : Source) : RichPipe = RichPipe(src.read)

  // This converts an iterable into a Source with index (int-based) fields
  implicit def iterToSource[T](iter : Iterable[T])(implicit set: TupleSetter[T]) : Source = {
    IterableSource[T](iter)(set)
  }
  // Same conversion, but all the way to a Pipe:
  implicit def iterToPipe[T](iter : Iterable[T])(implicit set: TupleSetter[T]) : Pipe = {
    iterToSource(iter)(set).read
  }
  implicit def iterToRichPipe[T](iter : Iterable[T])(implicit set: TupleSetter[T]) : RichPipe = {
    RichPipe(iterToPipe(iter)(set))
  }
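
  /* Example (sketch) of these implicits at work inside a Job subclass: an
   * Iterable is implicitly turned into a RichPipe, so it can be written to a
   * Source directly (the Tsv path is a placeholder):
   *
   *   List("hadoop", "scalding").write(Tsv("words.tsv"))
   */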

  // Override this if you want to change how the mapred.job.name is written in Hadoop
  def name : String = getClass.getCanonicalName
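  // e.g. a subclass could set a fixed Hadoop job name (sketch; the name is a
  // placeholder):
  //   override def name : String = "my-nightly-aggregation"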

  //This is the FlowDef used by all Sources this job creates
  @transient
  implicit val flowDef = {
    val fd = new FlowDef
    fd.setName(name)
    fd
  }

  // Use reflection to copy this job:
  def clone(nextargs : Args) : Job = {
    this.getClass
    .getConstructor(classOf[Args])
    .newInstance(nextargs)
    .asInstanceOf[Job]
  }
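  // Usage sketch: rerun the same job class with different arguments, e.g.
  //   val tomorrowsJob = thisJob.clone(Args("--date 2011-10-03"))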

  /**
  * Override this method if you want some other jobs to run after the current
  * job. These will not execute until the current job has run successfully.
  */
  def next : Option[Job] = None
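  /* Sketch: run a hypothetical FollowUpJob (not defined here) with the same
   * args once this job succeeds:
   *
   *   override def next : Option[Job] = Some(new FollowUpJob(args))
   */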

  // Only very different styles of Jobs should override this.
  def buildFlow(implicit mode : Mode) = {
    validateSources(mode)
    // Sources are good, now connect the flow:
    mode.newFlowConnector(config).connect(flowDef)
  }

  /**
   * By default we set these keys:
   * io.serializations
   * cascading.tuple.element.comparator.default
   * cascading.spill.threshold / cascading.spillmap.threshold
   * To set more options, override this method, call super.config,
   * and ++ your additional map.
   */
  def config : Map[AnyRef,AnyRef] = {
    val ioserVals = (ioSerializations ++
      List("com.twitter.scalding.KryoHadoopSerialization")).mkString(",")
    Map("io.serializations" -> ioserVals) ++
      (defaultComparator match {
        case Some(defcomp) => Map(FlowProps.DEFAULT_ELEMENT_COMPARATOR -> defcomp)
        case None => Map[String,String]()
      }) ++
    Map("cascading.spill.threshold" -> "100000", //Tune these for better performance
        "cascading.spillmap.threshold" -> "100000")
  }
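
  /* Sketch of extending (not replacing) the base config, as the comment above
   * suggests; the extra key and value are just an illustration:
   *
   *   override def config : Map[AnyRef,AnyRef] =
   *     super.config ++ Map("mapred.reduce.tasks" -> "10")
   */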

  // Override this if you need to do some extra processing beyond completing the flow
  def run(implicit mode : Mode) = {
    val flow = buildFlow(mode)
    flow.complete
    flow.getFlowStats.isSuccessful
  }
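  /* Sketch of such "extra processing": wrap the default run and report the
   * outcome (the println is just an illustration):
   *
   *   override def run(implicit mode : Mode) = {
   *     val ok = super.run(mode)
   *     println(name + " finished, success = " + ok)
   *     ok
   *   }
   */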
  // Add any serializations you need here (they are appended after these defaults)
  def ioSerializations = List[String](
    "org.apache.hadoop.io.serializer.WritableSerialization",
    "cascading.tuple.hadoop.TupleSerialization"
  )
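  // Sketch: append a custom serialization after the defaults (the class name
  // is a placeholder):
  //   override def ioSerializations = super.ioSerializations :+ "com.example.MySerialization"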
  // Override this if you want to customize comparisons/hashing for your job
  def defaultComparator : Option[String] = {
    Some("com.twitter.scalding.IntegralComparator")
  }
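  // e.g. to fall back to Cascading's default comparisons (sketch):
  //   override def defaultComparator : Option[String] = None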

  //Largely for the benefit of Java jobs
  implicit def read(src : Source) : Pipe = src.read
  def write(pipe : Pipe, src : Source) {src.write(pipe)}

  def validateSources(mode : Mode) {
    flowDef.getSources()
      .asInstanceOf[JMap[String,AnyRef]]
      // this is a map of (name, Tap)
      .foreach { nameTap =>
        // Each named source must be present:
        mode.getSourceNamed(nameTap._1)
          .get
          // This can throw an InvalidSourceException
          .validateTaps(mode)
      }
  }
}

/**
* Sets up an implicit dateRange to use in your sources and an implicit
* timezone.
* Example args: --date 2011-10-02 2011-10-04 --tz UTC
* If no timezone is given, Pacific is assumed.
*/
trait DefaultDateRangeJob extends Job {
  //Get date implicits and PACIFIC and UTC vals.
  import DateOps._

  // Optionally take --tz argument, or use Pacific time.  Derived classes may
  // override defaultTimeZone to change the default.
  def defaultTimeZone = PACIFIC
  implicit val tz = args.optional("tz") match {
                      case Some(tzn) => java.util.TimeZone.getTimeZone(tzn)
                      case None => defaultTimeZone
                    }

  val (start, end) = args.list("date") match {
    case List(s, e) => (RichDate(s), RichDate.upperBound(e))
    case List(o) => (RichDate(o), RichDate.upperBound(o))
    case x => error("--date must have exactly one or two date[time]s. Got: " + x.toString)
  }
  //Make sure the end is not before the beginning:
  assert(start <= end, "end of date range must occur after the start")
  implicit val dateRange = DateRange(start, end)
}
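
// Illustration (hypothetical subclass): mixing in the trait builds the
// implicit dateRange and tz from args such as:
//   --date 2011-10-02 2011-10-04 --tz UTC
class ExampleDailyJob(args : Args) extends Job(args) with DefaultDateRangeJob {
  // dateRange and tz are now in implicit scope for any sources created here
}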

// DefaultDateRangeJob with default time zone as UTC instead of Pacific.
trait UtcDateRangeJob extends DefaultDateRangeJob {
  override def defaultTimeZone = DateOps.UTC
}

/*
 * Run a list of shell commands through bash in the given order. Returns success
 * when all commands succeed. Execution stops after the first failure, and the
 * failing command is printed to stdout.
 */
class ScriptJob(cmds: Iterable[String]) extends Job(Args("")) {
  override def run(implicit mode : Mode) = {
    try {
      // dropWhile evaluates its predicate on each command in turn, which runs
      // the command as a side effect; it stops at the first non-zero exit, so
      // an empty result means every command succeeded.
      cmds.dropWhile {
        cmd: String => {
          new java.lang.ProcessBuilder("bash", "-c", cmd).start().waitFor() match {
            case x if x != 0 =>
              println(cmd + " failed, exitStatus: " + x)
              false
            case 0 => true
          }
        }
      }.isEmpty
    } catch {
      case e : Exception => {
        e.printStackTrace
        false
      }
    }
  }
}
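
// Illustration (hypothetical subclass): runs the two placeholder commands in
// order; the job succeeds only if both exit 0.
class ExampleScriptJob extends ScriptJob(List(
  "echo starting",
  "test -d /tmp"))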



