com.twitter.scalding.JobTest.scala
/*
Copyright 2014 Twitter, Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package com.twitter.scalding
import scala.collection.mutable.{ Buffer, ListBuffer }
import scala.collection.JavaConverters._
import scala.annotation.tailrec
import cascading.tuple.Tuple
import cascading.tuple.TupleEntry
import cascading.stats.CascadingStats
import org.apache.hadoop.mapred.JobConf
import scala.util.Try
object JobTest {
  def apply(jobName: String) = {
    new JobTest((args: Args) => Job(jobName, args))
  }
  def apply(cons: (Args) => Job) = {
    new JobTest(cons)
  }
  def apply[T <: Job: Manifest] = {
    val cons = { (args: Args) =>
      manifest[T].runtimeClass
        .getConstructor(classOf[Args])
        .newInstance(args)
        .asInstanceOf[Job]
    }
    new JobTest(cons)
  }
}
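
/* A JobTest can be constructed in three equivalent ways; a minimal sketch,
 * assuming a hypothetical job class com.example.MyJob(args: Args):
 *
 *   JobTest("com.example.MyJob")             // reflectively, by class name
 *   JobTest((args: Args) => new MyJob(args)) // from a constructor function
 *   JobTest[MyJob]                           // by type, via the Manifest
 */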
object CascadeTest {
  def apply(jobName: String) = {
    new CascadeTest((args: Args) => Job(jobName, args))
  }
}
/**
 * This class is used to construct unit tests for scalding jobs.
 * You should not use it unless you are writing tests.
 * For examples of how to do that, see the tests included in the
 * main scalding repository:
 * https://github.com/twitter/scalding/tree/master/scalding-core/src/test/scala/com/twitter/scalding
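 *
 * A minimal sketch of typical usage (assuming a hypothetical WordCountJob
 * that reads TextLine("inputFile") and writes (word, count) pairs to
 * Tsv("outputFile")):
 * {{{
 * JobTest(new WordCountJob(_))
 *   .arg("input", "inputFile")
 *   .arg("output", "outputFile")
 *   .source(TextLine("inputFile"), List((0, "hack hack hack and hack")))
 *   .sink[(String, Int)](Tsv("outputFile")) { buf =>
 *     assert(buf.toMap.apply("hack") == 4)
 *   }
 *   .run
 *   .finish()
 * }}}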
 */
class JobTest(cons: (Args) => Job) {
  private var argsMap = Map[String, List[String]]()
  private val callbacks = Buffer[() => Unit]()
  private val statsCallbacks = Buffer[(CascadingStats) => Unit]()
  // TODO: Switch the following maps and sets from Source to String keys
  // to guard against Scala equality bugs
  private var sourceMap: (Source) => Option[Buffer[Tuple]] = { _ => None }
  private var sinkSet = Set[Source]()
  private var fileSet = Set[String]()
  private var validateJob = false
  def arg(inArg: String, value: List[String]) = {
    argsMap += inArg -> value
    this
  }
  def arg(inArg: String, value: String) = {
    argsMap += inArg -> List(value)
    this
  }
  private def sourceBuffer[T: TupleSetter](s: Source, tups: Iterable[T]): JobTest = {
    source { src => if (src == s) Some(tups) else None }
    this
  }
  /** Add a function to produce a mock when a certain source is requested */
  def source[T](fn: Source => Option[Iterable[T]])(implicit setter: TupleSetter[T]): JobTest = {
    val oldSm = sourceMap
    val bufferTupFn = fn.andThen { optItT => optItT.map { _.map(t => setter(t)).toBuffer } }
    // We have to memoize to return the same buffer each time
    val memo = scala.collection.mutable.Map[Source, Option[Buffer[Tuple]]]()
    sourceMap = { (src: Source) => memo.getOrElseUpdate(src, bufferTupFn(src)).orElse(oldSm(src)) }
    this
  }
  /**
   * Enables syntax like:
   * .ifSource { case Tsv("in") => List(1, 2, 3) }
   * We need a different function name from source to help the compiler
   */
  def ifSource[T](fn: PartialFunction[Source, Iterable[T]])(implicit setter: TupleSetter[T]): JobTest =
    source(fn.lift)
  def source[T](s: Source, iTuple: Iterable[T])(implicit setter: TupleSetter[T]): JobTest =
    sourceBuffer(s, iTuple)
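
  /* The three mocking styles above are interchangeable; a minimal sketch,
   * assuming hypothetical Tsv sources named "in1" and "in2":
   *
   *   .source(Tsv("in1"), List(("a", 1), ("b", 2)))   // mock an exact source
   *   .ifSource { case Tsv("in2") => List(("c", 3)) } // via partial function
   *   .source { src => if (src == Tsv("in1")) Some(List(("a", 1))) else None }
   */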
  // This use of `_.get` is probably safe, but difficult to prove correct
  @SuppressWarnings(Array("org.brianmckenna.wartremover.warts.OptionPartial"))
  def sink[A](s: Source)(op: Buffer[A] => Unit)(implicit conv: TupleConverter[A]) = {
    if (sourceMap(s).isEmpty) {
      // if s is also used as a source, we shouldn't reset its buffer
      source(s, new ListBuffer[Tuple])
    }
    val buffer = sourceMap(s).get
    /* NOTE: `HadoopTest.finalize` depends on `sinkSet` matching the set of
     * "keys" in the `sourceMap`. Do not change the following line unless
     * you also modify the `finalize` function accordingly.
     */
    sinkSet += s
    callbacks += (() => op(buffer.map { tup => conv(new TupleEntry(tup)) }))
    this
  }
  def typedSink[A](s: Source with TypedSink[A])(op: Buffer[A] => Unit)(implicit conv: TupleConverter[A]) =
    sink[A](s)(op)
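
  /* A sink callback receives everything the job wrote to that source,
   * converted to A; a minimal sketch, assuming a job that writes
   * (String, Int) rows to a hypothetical Tsv("out"):
   *
   *   .sink[(String, Int)](Tsv("out")) { buf =>
   *     assert(buf.toList == List(("hack", 4)))
   *   }
   *
   * typedSink is the same, but requires s to be a TypedSink[A] so the
   * compiler can check the element type.
   */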
  // Used to check an assertion about a counter defined by the given group and name.
  // If this test is checking multiple jobs chained by next, this only checks
  // the counters in the final job's FlowStats.
  def counter(counter: String, group: String = Stats.ScaldingGroup)(op: Long => Unit) = {
    statsCallbacks += ((stats: CascadingStats) => op(Stats.getCounterValue(counter, group)(stats)))
    this
  }
  // Used to check an assertion on all custom counters of a given scalding job.
  def counters(op: Map[String, Long] => Unit) = {
    statsCallbacks += ((stats: CascadingStats) => op(Stats.getAllCustomCounters()(stats)))
    this
  }
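
  /* A sketch of counter assertions, assuming a hypothetical job that
   * increments a custom counter (e.g. via Stat("rows_processed")) twice:
   *
   *   .counter("rows_processed") { value => assert(value == 2L) }
   *   .counters { all => assert(all("rows_processed") == 2L) }
   */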
  // Simulates the existence of a file so that mode.fileExists returns true. We
  // do not simulate the file contents; that should be done through mock
  // sources.
  def registerFile(filename: String) = {
    fileSet += filename
    this
  }
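
  /* A sketch, assuming a job that branches on mode.fileExists with a
   * hypothetical path:
   *
   *   .registerFile("/tmp/flag") // makes mode.fileExists("/tmp/flag") true
   */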
  def run = {
    runJob(initJob(false), true)
    this
  }
  def runWithoutNext(useHadoop: Boolean = false) = {
    runJob(initJob(useHadoop), false)
    this
  }
  def runHadoop = {
    runJob(initJob(true), true)
    this
  }
  def runHadoopWithConf(conf: JobConf) = {
    runJob(initJob(true, Some(conf)), true)
    this
  }
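
  /* The run variants differ only in mode and in whether job.next is followed;
   * a sketch (conf is a caller-supplied JobConf):
   *
   *   .run                     // in-memory Test mode, follows job.next chains
   *   .runHadoop               // HadoopTest mode (local hadoop), follows job.next
   *   .runHadoopWithConf(conf) // HadoopTest mode with the given JobConf
   *   .runWithoutNext()        // runs only this job; pass true to use hadoop
   */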
  // This no-op is unfortunately needed to get around Specs
  def finish(): Unit = { () }
  def validate(v: Boolean) = {
    validateJob = v
    this
  }
  // Registers test files, initializes the global mode, and creates a job.
  private def initJob(useHadoop: Boolean, optConf: Option[JobConf] = None): Job = {
    // Create a global mode to use for testing.
    val testMode: TestMode =
      if (useHadoop) {
        val conf = optConf.getOrElse(new JobConf)
        // Set the polling to a lower value to speed up tests:
        conf.set("jobclient.completion.poll.interval", "100")
        conf.set("cascading.flow.job.pollinginterval", "5")
        // Workaround for a local hadoop race
        conf.set("mapred.local.dir", "/tmp/hadoop/%s/mapred/local".format(java.util.UUID.randomUUID))
        HadoopTest(conf, sourceMap)
      } else {
        Test(sourceMap)
      }
    testMode.registerTestFiles(fileSet)
    val args = new Args(argsMap)
    // Construct a job.
    cons(Mode.putMode(testMode, args))
  }
  @tailrec
  private final def runJob(job: Job, runNext: Boolean): Unit = {
    // Disable automatic cascading update
    System.setProperty("cascading.update.skip", "true")
    // Create cascading 3.0 planner trace files during tests
    if (System.getenv.asScala.getOrElse("SCALDING_CASCADING3_DEBUG", "0") == "1") {
      System.setProperty("cascading.planner.plan.path", "target/test/cascading/traceplan/" + job.name)
      System.setProperty("cascading.planner.plan.transforms.path", "target/test/cascading/traceplan/" + job.name + "/transform")
      System.setProperty("cascading.planner.stats.path", "target/test/cascading/traceplan/" + job.name + "/stats")
    }
    if (validateJob) {
      job.validate()
    }
    job.run()
    // Make sure to clean the state:
    job.clear()
    val next: Option[Job] = if (runNext) { job.next } else { None }
    next match {
      case Some(nextjob) => runJob(nextjob, runNext)
      case None => {
        job.mode match {
          case hadoopTest @ HadoopTest(_, _) => {
            /* NOTE: `HadoopTest.finalize` depends on `sinkSet` matching the set of
             * "keys" in the `sourceMap`. Do not change the following line unless
             * you also modify the `finalize` function accordingly.
             */
            // The sinks are written to disk, we need to clean them up:
            sinkSet.foreach { hadoopTest.finalize(_) }
          }
          case _ => ()
        }
        // Now it is time to check the test conditions:
        callbacks.foreach { cb => cb() }
        statsCallbacks.foreach { cb => cb(job.scaldingCascadingStats.get) }
      }
    }
  }
}
class CascadeTest(cons: (Args) => Job) extends JobTest(cons) {}