/*
Copyright 2012 Twitter, Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package com.twitter.scalding
import com.twitter.algebird.monad.Reader
import com.twitter.algebird.Semigroup
import cascading.flow.{ Flow, FlowDef, FlowListener, FlowStep, FlowStepListener, FlowSkipStrategy, FlowStepStrategy }
import cascading.pipe.Pipe
import cascading.property.AppProps
import cascading.stats.CascadingStats
import org.apache.hadoop.io.serializer.{ Serialization => HSerialization }
import scala.concurrent.{ Future, Promise }
import scala.util.Try
import java.io.{ BufferedWriter, FileOutputStream, OutputStreamWriter }
import java.util.{ List => JList }
import java.util.concurrent.{ Executors, TimeUnit, ThreadFactory, Callable, TimeoutException }
import java.util.concurrent.atomic.AtomicInteger
object Job {
/**
* Use reflection to create the job by name. We use the thread's
* context classloader so that classes in the submitted jar and any
* jars included via -libjars can be found.
*/
def apply(jobName: String, args: Args): Job = {
Class.forName(jobName, true, Thread.currentThread().getContextClassLoader)
.getConstructor(classOf[Args])
.newInstance(args)
.asInstanceOf[Job]
}
}
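/*
* Usage sketch (the job class name and arguments below are hypothetical):
*
*   val args = Args(List("--input", "input.tsv", "--output", "output.tsv"))
*   val job = Job("com.example.WordCountJob", args)
*   job.run()
*/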
/**
* Job is a convenience class to make using Scalding easier.
* Subclasses of Job automatically have a number of nice implicits to enable more concise
* syntax, including:
* conversion from Pipe, Source or Iterable to RichPipe
* conversion from Source or Iterable to Pipe
* conversion from collections or Tuple[1-22] to cascading.tuple.Fields
*
* Additionally, the job provides an implicit Mode and FlowDef so that functions that
* register starts or ends of a flow graph, specifically anything that reads or writes data
* on Hadoop, have the needed implicits available.
*
* If you want to write code outside of a Job, you will want to either:
*
* make all methods that may read or write data accept implicit FlowDef and Mode parameters.
*
* OR:
*
* write code that, rather than returning values, returns a (FlowDef, Mode) => T;
* these functions can be combined monadically using algebird.monad.Reader
* (see the sketch after this comment).
*/
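/*
* A minimal sketch of the second approach above: code outside a Job written against an
* implicit (FlowDef, Mode). The source and field names are hypothetical and assume the
* input exposes a 'line field (e.g. a TextLine); Dsl._ brings the same implicit
* conversions a Job provides into scope.
*
*   import com.twitter.scalding.Dsl._
*
*   def wordCount(input: Source, output: Source)(implicit fd: FlowDef, mode: Mode): Pipe =
*     input.read
*       .flatMapTo('line -> 'word) { line: String => line.split("\\s+") }
*       .groupBy('word) { _.size }
*       .write(output)
*/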
class Job(val args: Args) extends FieldConversions with java.io.Serializable {
Tracing.init()
// Set specific Mode
implicit def mode: Mode = Mode.getMode(args).getOrElse(sys.error("No Mode defined"))
/**
* Use this if a map or reduce phase takes a while before emitting tuples.
*/
def keepAlive(): Unit = {
val flowProcess = RuntimeStats.getFlowProcessForUniqueId(uniqueId)
flowProcess.keepAlive()
}
/**
* You should never call this directly; it is here to make
* the DSL work. Just know that you can treat a Pipe as a RichPipe
* within a Job.
*/
implicit def pipeToRichPipe(pipe: Pipe): RichPipe = new RichPipe(pipe)
/**
* This implicit is to enable RichPipe methods directly on Source
* objects, such as map/flatMap, etc...
*
* Note that Mappable is a subclass of Source, and Mappable already
* has mapTo and flatMapTo BUT WITHOUT incoming fields used (see
* the Mappable trait). This creates some confusion when using these methods
* (this is an unfortunate mistake in our design that was not noticed until later).
* To remove ambiguity, explicitly call .read on any Source on which you begin
* operating with a mapTo/flatMapTo (see the sketch below).
*/
implicit def sourceToRichPipe(src: Source): RichPipe = new RichPipe(src.read)
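/*
* Disambiguation sketch for the note above (the source and field names are hypothetical):
*
*   // Potentially ambiguous: Mappable.flatMapTo (no incoming fields) vs RichPipe.flatMapTo
*   // TextLine(args("input")).flatMapTo('word) { line: String => line.split("\\s+") }
*
*   // Unambiguous: call .read first, then use the RichPipe method with explicit fields
*   TextLine(args("input")).read
*     .flatMapTo('line -> 'word) { line: String => line.split("\\s+") }
*/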
// This converts an Iterable into a Pipe or RichPipe with index (int-based) fields
implicit def toPipe[T](iter: Iterable[T])(implicit set: TupleSetter[T], conv: TupleConverter[T]): Pipe =
IterableSource[T](iter)(set, conv).read
implicit def iterableToRichPipe[T](iter: Iterable[T])(implicit set: TupleSetter[T], conv: TupleConverter[T]): RichPipe =
RichPipe(toPipe(iter)(set, conv))
// Provide args as an implicit val for extensions such as the Checkpoint extension.
implicit protected def _implicitJobArgs: Args = args
// Override this if you want to change how the mapred.job.name is written in Hadoop
def name: String = Config.defaultFrom(mode).toMap.getOrElse("mapred.job.name", getClass.getName)
//This is the FlowDef used by all Sources this job creates
@transient
implicit protected val flowDef = {
val fd = new FlowDef
fd.setName(name)
fd
}
// Do this before the job is submitted, because the flowDef is transient
private[this] val uniqueId = UniqueID.getIDFor(flowDef)
/**
* Copy this job
* By default, this uses reflection and the single argument Args constructor
*/
def clone(nextargs: Args): Job =
this.getClass
.getConstructor(classOf[Args])
.newInstance(Mode.putMode(mode, nextargs))
.asInstanceOf[Job]
/**
* Implement this method if you want some other jobs to run after the current
* job. These will not execute until the current job has run successfully.
*/
def next: Option[Job] = None
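/*
* Chaining sketch (CleanupJob is a hypothetical follow-up job):
*
*   override def next: Option[Job] = Some(new CleanupJob(args))
*/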
/**
* Keep 100k tuples in memory by default before spilling.
* Turn this up as high as you can without getting OOM.
*
* This is ignored if there is a value set in the incoming jobConf on Hadoop
*/
def defaultSpillThreshold: Int = 100 * 1000
/** Override this to control how dates are parsed */
implicit def dateParser: DateParser = DateParser.default
// Generates the MD5 hex of the bytes in the job classfile
def classIdentifier: String = Config.md5Identifier(getClass)
/**
* This is the exact config that is passed to the Cascading FlowConnector.
* By default:
* if there are no spill thresholds in mode.config, we replace with defaultSpillThreshold
* we overwrite io.serializations with ioSerializations
* we overwrite cascading.tuple.element.comparator.default to defaultComparator
* we add some scalding keys for debugging/logging
*
* Tip: override this method, call super, and ++ your additional
* map to add or overwrite more options (see the sketch after this method)
*
* This returns Map[AnyRef, AnyRef] for compatibility with older code
*/
def config: Map[AnyRef, AnyRef] = {
val base = Config.empty
.setListSpillThreshold(defaultSpillThreshold)
.setMapSpillThreshold(defaultSpillThreshold)
.setMapSideAggregationThreshold(defaultSpillThreshold)
// This is setting a property for cascading/driven
AppProps.addApplicationFramework(null,
String.format("scalding:%s", scaldingVersion))
val modeConf = mode match {
case h: HadoopMode => Config.fromHadoop(h.jobConf)
case _ => Config.empty
}
val init = base ++ modeConf
defaultComparator.map(init.setDefaultComparator)
.getOrElse(init)
.setSerialization(Right(classOf[serialization.KryoHadoop]), ioSerializations)
.setScaldingVersion
.setCascadingAppName(name)
.setCascadingAppId(name)
.setScaldingFlowClass(getClass)
.setArgs(args)
.maybeSetSubmittedTimestamp()._2
.toMap.toMap[AnyRef, AnyRef] // linter:ignore the second one is to lift from String -> AnyRef
}
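/*
* Override sketch, per the tip above (the extra key/value is illustrative, not required):
*
*   override def config: Map[AnyRef, AnyRef] =
*     super.config ++ Map("mapreduce.job.queuename" -> "my-queue")
*/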
/**
* This is here so that Mappable.toIterator can find an implicit config
*/
implicit protected def scaldingConfig: Config = Config.tryFrom(config).get
def skipStrategy: Option[FlowSkipStrategy] = None
/**
* Specify a callback to run before the start of each flow step.
*
* Defaults to what Config.getReducerEstimator specifies.
* @see ExecutionContext.buildFlow
*/
def stepStrategy: Option[FlowStepStrategy[_]] = None
private def executionContext: Try[ExecutionContext] =
Config.tryFrom(config).map { conf =>
ExecutionContext.newContext(conf)(flowDef, mode)
}
/**
* combine the config, flowDef and the Mode to produce a flow
*/
def buildFlow: Flow[_] =
executionContext
.flatMap(_.buildFlow)
.map { flow =>
listeners.foreach { flow.addListener(_) }
stepListeners.foreach { flow.addStepListener(_) }
skipStrategy.foreach { flow.setFlowSkipStrategy(_) }
stepStrategy.foreach { strategy =>
val existing = flow.getFlowStepStrategy
val composed =
if (existing == null)
strategy
else
FlowStepStrategies[Any].plus(
existing.asInstanceOf[FlowStepStrategy[Any]],
strategy.asInstanceOf[FlowStepStrategy[Any]])
flow.setFlowStepStrategy(composed)
}
flow
}
.get
// called before run
// only override if you do not use flowDef
def validate(): Unit = {
FlowStateMap.validateSources(flowDef, mode)
}
// called after a successful run
// only override if you do not use flowDef
def clear(): Unit = {
FlowStateMap.clear(flowDef)
}
protected def handleStats(statsData: CascadingStats): Unit = {
scaldingCascadingStats = Some(statsData)
// TODO: Why the two ways to do stats? Answer: jank-den.
if (args.boolean("scalding.flowstats")) {
val statsFilename = args.getOrElse("scalding.flowstats", name + "._flowstats.json")
val br = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(statsFilename), "utf-8"))
br.write(JobStats(statsData).toJson)
br.close()
}
// Print custom counters unless --scalding.nocounters is used or there are no custom stats
if (!args.boolean("scalding.nocounters")) {
implicit val statProvider = statsData
val jobStats = Stats.getAllCustomCounters
if (!jobStats.isEmpty) {
println("Dumping custom counters:")
jobStats.foreach {
case (counter, value) =>
println("%s\t%s".format(counter, value))
}
}
}
}
// TODO design a better way to test stats.
// This awful name is designed to avoid collision
// with subclasses
@transient
private[scalding] var scaldingCascadingStats: Option[CascadingStats] = None
/**
* Save the Flow object after a run to allow clients to inspect the job.
* @see HadoopPlatformJobTest
*/
@transient
private[scalding] var completedFlow: Option[Flow[_]] = None
//Override this if you need to do some extra processing other than completing the flow
def run(): Boolean = {
val flow = buildFlow
flow.complete()
val statsData = flow.getFlowStats
handleStats(statsData)
completedFlow = Some(flow)
statsData.isSuccessful
}
//override these to add any listeners you need
def listeners: List[FlowListener] = Nil
def stepListeners: List[FlowStepListener] = Nil
/**
* These are user-defined serializations IN ADDITION to (and deduplicated
* against) the required serializations
*/
def ioSerializations: List[Class[_ <: HSerialization[_]]] = Nil
/**
* Override this if you want to customize comparisons/hashing for your job;
* the config method overwrites using this before sending to Cascading.
* The one we use by default is needed to make joins in the
* Fields-API more robust to Long vs Int differences.
* If you only use the Typed-API, consider changing this to return None (see the sketch below)
*/
def defaultComparator: Option[Class[_ <: java.util.Comparator[_]]] =
Some(classOf[IntegralComparator])
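/*
* Typed-API-only sketch, per the note above:
*
*   override def defaultComparator: Option[Class[_ <: java.util.Comparator[_]]] = None
*/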
/**
* This is implicit so that a Source can be used as the argument
* to a join or other method that accepts Pipe.
*/
implicit def read(src: Source): Pipe = src.read
/**
* This is only here for Java jobs, which cannot automatically
* access the implicit Pipe => RichPipe that makes pipe.write( )
* convenient
*/
def write(pipe: Pipe, src: Source): Unit = { src.writeFrom(pipe) }
/*
* Needs to be lazy to be usable within pipes.
*/
private lazy val timeoutExecutor =
Executors.newSingleThreadExecutor(new NamedPoolThreadFactory("job-timer", true))
/*
* Safely execute some operation within a deadline.
*
* TODO: once we have a mechanism to access FlowProcess from user functions, we can use this
* function to allow long running jobs by notifying Cascading of progress.
*/
def timeout[T](timeout: AbsoluteDuration)(t: => T): Option[T] = {
val f = timeoutExecutor.submit(new Callable[Option[T]] {
def call(): Option[T] = Some(t)
});
try {
f.get(timeout.toMillisecs, TimeUnit.MILLISECONDS)
} catch {
case _: TimeoutException =>
f.cancel(true)
None
}
}
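/*
* Usage sketch for timeout (slowExternalLookup(key) stands for a hypothetical
* long-running call):
*
*   val result: Option[String] = timeout(Seconds(30)) {
*     slowExternalLookup(key)
*   }
*/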
}
/*
* NamedPoolThreadFactory is copied from util.core to avoid a dependency.
*/
class NamedPoolThreadFactory(name: String, makeDaemons: Boolean) extends ThreadFactory {
def this(name: String) = this(name, false)
val group = new ThreadGroup(Thread.currentThread().getThreadGroup(), name)
val threadNumber = new AtomicInteger(1)
def newThread(r: Runnable) = {
val thread = new Thread(group, r, name + "-" + threadNumber.getAndIncrement())
thread.setDaemon(makeDaemons)
if (thread.getPriority != Thread.NORM_PRIORITY) {
thread.setPriority(Thread.NORM_PRIORITY)
}
thread
}
}
/**
* Sets up an implicit dateRange to use in your sources and an implicit
* timezone.
* Example args: --date 2011-10-02 2011-10-04 --tz UTC
* If no timezone is given, Pacific is assumed.
*/
trait DefaultDateRangeJob extends Job {
//Get date implicits and PACIFIC and UTC vals.
import DateOps._
// Optionally take --tz argument, or use Pacific time. Derived classes may
// override defaultTimeZone to change the default.
def defaultTimeZone = PACIFIC
implicit lazy val tz = args.optional("tz") match {
case Some(tzn) => java.util.TimeZone.getTimeZone(tzn)
case None => defaultTimeZone
}
// Optionally take a --period, which determines how many days each job runs over (rather
// than over the whole date range)
// --daily and --weekly are aliases for --period 1 and --period 7 respectively
val period =
if (args.boolean("daily"))
1
else if (args.boolean("weekly"))
7
else
args.getOrElse("period", "0").toInt
lazy val (startDate, endDate) = {
val DateRange(s, e) = DateRange.parse(args.list("date"))
(s, e)
}
implicit lazy val dateRange = DateRange(startDate, if (period > 0) startDate + Days(period) - Millisecs(1) else endDate)
override def next: Option[Job] =
if (period > 0) {
val nextStartDate = startDate + Days(period)
if (nextStartDate + Days(period - 1) > endDate)
None // we're done
else // return a new job with the new startDate
Some(clone(args + ("date" -> List(nextStartDate.toString("yyyy-MM-dd"), endDate.toString("yyyy-MM-dd")))))
} else
None
}
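/*
* A minimal sketch of a job built on DefaultDateRangeJob; the paths, fields and the
* DailySuffixTsv source (from com.twitter.scalding.source) are illustrative only.
* Invoked e.g. with: --date 2011-10-02 2011-10-04 --tz UTC --period 1
*
*   class DailyClickCounts(args: Args) extends DefaultDateRangeJob {
*     // dateRange and tz are implicitly in scope here
*     DailySuffixTsv("/logs/clicks", ('userId, 'url))
*       .read
*       .groupBy('userId) { _.size }
*       .write(Tsv("/output/click_counts"))
*   }
*/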
// DefaultDateRangeJob with default time zone as UTC instead of Pacific.
trait UtcDateRangeJob extends DefaultDateRangeJob {
override def defaultTimeZone = DateOps.UTC
}
/**
* This is a simple job that allows you to launch Execution[T]
* instances using scalding.Tool and scald.rb. You cannot print
* the graph.
*/
abstract class ExecutionJob[+T](args: Args) extends Job(args) {
import scala.concurrent.{ Await, ExecutionContext => scEC }
/**
* To avoid serialization issues, this should be a def rather than a val,
* and as much as possible should be kept inside the method.
*/
def execution: Execution[T]
/*
* Override this to control the execution context used
* to execute futures
*/
protected def concurrentExecutionContext: scEC = scEC.global
@transient private[this] val resultPromise: Promise[T] = Promise[T]()
def result: Future[T] = resultPromise.future
override def buildFlow: Flow[_] =
sys.error("ExecutionJobs do not have a single accessible flow. " +
"You cannot print the graph as it may be dynamically built or recurrent")
final override def run = {
val r = Config.tryFrom(config)
.map { conf =>
Await.result(execution.run(conf, mode)(concurrentExecutionContext),
scala.concurrent.duration.Duration.Inf)
}
if (!resultPromise.tryComplete(r)) {
// The test framework can call this more than once.
println("Warning: run called more than once, should not happen in production")
}
// Force an exception if the run failed
r.get
true
}
}
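/*
* A minimal sketch of an ExecutionJob using the Typed API (paths are hypothetical):
*
*   class WordCountExecutionJob(args: Args) extends ExecutionJob[Unit](args) {
*     override def execution: Execution[Unit] =
*       TypedPipe.from(TextLine(args("input")))
*         .flatMap(_.split("\\s+"))
*         .map((_, 1L))
*         .sumByKey
*         .toTypedPipe
*         .writeExecution(TypedTsv[(String, Long)](args("output")))
*   }
*/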
/*
* Run a list of shell commands through bash in the given order. Return success
* when all commands succeed. Execution stops after the first failure. The
* failing command is printed to stdout.
*/
class ScriptJob(cmds: Iterable[String]) extends Job(Args("")) {
override def run = {
try {
cmds.dropWhile {
cmd: String =>
{
new java.lang.ProcessBuilder("bash", "-c", cmd).start().waitFor() match {
case x if x != 0 =>
println(cmd + " failed, exitStatus: " + x)
false
case 0 => true
}
}
}.isEmpty
} catch {
case e: Exception => {
e.printStackTrace
false
}
}
}
}
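/*
* Usage sketch (the shell commands are hypothetical):
*
*   class BackupJob(args: Args) extends ScriptJob(Seq(
*     "hadoop fs -test -e /data/input",
*     "hadoop distcp /data/input /backup/input"))
*/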
/**
* Allows custom counter verification logic when the job completes.
*/
trait CounterVerification extends Job {
/**
* Verify counter values. The job will fail if this returns false or throws an exception.
*/
def verifyCounters(counters: Map[StatKey, Long]): Try[Unit]
/**
* Override this to false to skip counter verification in tests.
*/
def verifyCountersInTest: Boolean = true
override def listeners: List[FlowListener] = {
if (this.mode.isInstanceOf[TestMode] && !this.verifyCountersInTest) {
super.listeners
} else {
super.listeners :+ new StatsFlowListener(this.verifyCounters)
}
}
}
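/*
* A minimal sketch of counter verification (the "parse_errors" counter is hypothetical):
*
*   class ValidatedJob(args: Args) extends Job(args) with CounterVerification {
*     // ... the job's pipeline goes here ...
*     override def verifyCounters(counters: Map[StatKey, Long]): Try[Unit] = Try {
*       val parseErrors = counters.collect { case (k, v) if k.counter == "parse_errors" => v }.sum
*       require(parseErrors == 0L, "Found " + parseErrors + " parse errors")
*     }
*   }
*/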
private[scalding] case class FlowStepStrategies[A]() extends Semigroup[FlowStepStrategy[A]] {
/**
* Returns a new FlowStepStrategy that runs both strategies in sequence.
*/
def plus(l: FlowStepStrategy[A], r: FlowStepStrategy[A]): FlowStepStrategy[A] =
new FlowStepStrategy[A] {
override def apply(
flow: Flow[A],
predecessorSteps: JList[FlowStep[A]],
flowStep: FlowStep[A]): Unit = {
l.apply(flow, predecessorSteps, flowStep)
r.apply(flow, predecessorSteps, flowStep)
}
}
}
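/*
* Composition sketch: this Semigroup is how buildFlow chains an existing FlowStepStrategy
* with the one returned by stepStrategy (both run, in order). The two strategy values
* below are hypothetical:
*
*   val combined: FlowStepStrategy[Any] =
*     FlowStepStrategies[Any].plus(existingStrategy, additionalStrategy)
*/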