com.holdenkarau.spark.testing.StreamingSuiteBase.scala Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of spark-testing-base_2.12 Show documentation
spark-testing-base
The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.holdenkarau.spark.testing

import scala.collection.mutable
import scala.reflect.ClassTag

import org.apache.spark.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.DStream
import org.scalactic.Equality
import org.scalatest.{BeforeAndAfterAll, Suite}

/**
 * This is the base trait for Spark Streaming test suites. It provides basic
 * functionality to run a user-defined set of inputs through user-defined stream
 * operations and verify the output.
 */
trait StreamingSuiteBase extends BeforeAndAfterAll with Logging
  with StreamingSuiteCommon with SharedSparkContext {

  self: Suite =>

  /**
   * Default suite-level setup for any streaming test suite: calls `setupClock()`
   * and then delegates to `super.beforeAll()`.
   *
   * Override this if you want to add your own setup to "before"
   * (i.e., don't call before { } ).
   */
  override def beforeAll(): Unit = {
    setupClock()
    super.beforeAll()
  }

  /**
   * Default suite-level teardown for any streaming test suite: clears the
   * `spark.streaming.clock` system property and then delegates to
   * `super.afterAll()`.
   *
   * Override this if you want to add your own teardown to "after"
   * (i.e., don't call after { } ).
   */
  override def afterAll(): Unit = {
    System.clearProperty("spark.streaming.clock")
    super.afterAll()
  }

  /**
   * Verify whether the output values after running a DStream operation
   * are the same as the expected output values, by comparing the output
   * collections batch by batch, either as lists (order matters) or as
   * multisets (order does not matter).
   *
   * Both collections are logged before the comparison. The check fails if the
   * number of batches differs or if any pair of batches does not match.
   *
   * @param output         Batches actually produced by the operation
   * @param expectedOutput Batches the operation is expected to produce
   * @param ordered        Compare output values with expected output values
   *                       within the same output batch ordered or unordered.
   *                       Comparing doubles may not work well in case of unordered.
   * @param equality       Equality used to compare individual elements
   */
  def verifyOutput[V: ClassTag](
      output: Seq[Seq[V]],
      expectedOutput: Seq[Seq[V]],
      ordered: Boolean
    ) (implicit equality: Equality[V]): Unit = {

    logInfo("--------------------------------")
    logInfo(s"output.size = ${output.size}")
    logInfo("output")
    output.foreach(batch => logInfo(batch.mkString("[", ",", "]")))
    logInfo(s"expected output.size = ${expectedOutput.size}")
    logInfo("expected output")
    expectedOutput.foreach(batch => logInfo(batch.mkString("[", ",", "]")))
    logInfo("--------------------------------")

    // Match the output with the expected output
    assert(output.size === expectedOutput.size, "Number of outputs do not match")

    // Sizes are equal at this point, so zipping pairs up every batch.
    output.zip(expectedOutput).foreach { case (actualBatch, expectedBatch) =>
      if (ordered) {
        equalsOrdered(actualBatch, expectedBatch)
      } else {
        equalsUnordered(actualBatch, expectedBatch)
      }
    }

    logInfo("Output verified successfully")
  }

  /**
   * Asserts that `output` and `expected` contain the same elements regardless of
   * order. Every output element is paired with the first not-yet-used expected
   * element that the supplied `equality` considers equal to it.
   */
  private def equalsUnordered[V](
    output: Seq[V], expected: Seq[V])(implicit equality: Equality[V]): Unit = {

    assert(output.length === expected.length)

    // Indexed copy so the candidate scan does not re-traverse a linear Seq.
    val candidates = expected.toIndexedSeq
    // Indices of expected elements that have already been paired.
    val matched = new mutable.BitSet(candidates.length)

    output.foreach { actual =>
      val partner = candidates.indices.find { idx =>
        !matched.contains(idx) && actual === candidates(idx)
      }

      partner match {
        case Some(idx) =>
          matched += idx
        case None =>
          // only to show the two unequal lists to user
          assert(output === expected)
          // Reached only when the default Seq equality accepts the two batches
          // although the supplied element Equality found no partner. Report a
          // proper test failure instead of crashing on a missing index.
          val actualBatch = output.mkString("[", ",", "]")
          val expectedBatch = expected.mkString("[", ",", "]")
          fail(s"No unmatched expected element equals '$actual' under the supplied " +
            s"Equality; output: $actualBatch, expected: $expectedBatch")
      }
    }
  }

  /**
   * Asserts that `output` and `expected` have the same length and are
   * element-wise equal, position by position, under the supplied `equality`.
   */
  private def equalsOrdered[V](
    output: Seq[V], expected: Seq[V])(implicit equality: Equality[V]): Unit = {
    assert(output.length === expected.length)
    // Lengths are equal here, so zipping visits every position exactly once.
    output.zip(expected).foreach { case (actual, wanted) =>
      assert(actual === wanted)
    }
  }

  // Wrappers with ordered = false

  /**
   * Test unary DStream operation, comparing each output batch with the expected
   * batch without regard to element order.
   *
   * @param input          Sequence of input collections
   * @param operation      Unary DStream operation to be applied to the input
   * @param expectedOutput Sequence of expected output collections
   */
  def testOperation[U: ClassTag, V: ClassTag](
      input: Seq[Seq[U]],
      operation: DStream[U] => DStream[V],
      expectedOutput: Seq[Seq[V]]
  ) (implicit equality: Equality[V]): Unit = {
    testOperation(input, operation, expectedOutput, ordered = false)
  }

  /**
   * Test binary DStream operation, comparing each output batch with the expected
   * batch without regard to element order.
   *
   * @param input1         First sequence of input collections
   * @param input2         Second sequence of input collections
   * @param operation      Binary DStream operation to be applied to the 2 inputs
   * @param expectedOutput Sequence of expected output collections
   */
  def testOperation[U: ClassTag, V: ClassTag, W: ClassTag](
      input1: Seq[Seq[U]],
      input2: Seq[Seq[V]],
      operation: (DStream[U], DStream[V]) => DStream[W],
      expectedOutput: Seq[Seq[W]]
  ) (implicit equality: Equality[W]): Unit = {
    testOperation(input1, input2, operation, expectedOutput, ordered = false)
  }

  /**
   * Test unary DStream operation with a list of inputs, with number of
   * batches to run same as the number of input values.
   *
   * Each input micro-batch is a list of values or null to simulate an empty batch.
   *
   * @param input          Sequence of input collections
   * @param operation      Unary DStream operation to be applied to the input
   * @param expectedOutput Sequence of expected output collections
   * @param ordered        Compare output values with expected output values
   *                       within the same output batch ordered or unordered.
   *                       Comparing doubles may not work well in case of unordered.
   */
  def testOperation[U: ClassTag, V: ClassTag](
      input: Seq[Seq[U]],
      operation: DStream[U] => DStream[V],
      expectedOutput: Seq[Seq[V]],
      ordered: Boolean
    ) (implicit equality: Equality[V]): Unit = {
    val numBatches = input.size

    withOutputAndStreamingContext(setupStreams[U, V](input, operation)) {
      (outputStream, ssc) =>

      val output: Seq[Seq[V]] = runStreams[V](
        outputStream, ssc, numBatches, expectedOutput.size)
      verifyOutput[V](output, expectedOutput, ordered)
    }
  }

  /**
   * Test binary DStream operation with two lists of inputs, with number of
   * batches to run same as the number of input values. The size of the two input
   * lists should be the same; the test fails up front if they differ.
   *
   * Each input micro-batch is a list of values or null to simulate an empty batch.
   *
   * @param input1         First sequence of input collections
   * @param input2         Second sequence of input collections
   * @param operation      Binary DStream operation to be applied to the 2 inputs
   * @param expectedOutput Sequence of expected output collections
   * @param ordered        Compare output values with expected output values
   *                       within the same output batch ordered or unordered.
   *                       Comparing doubles may not work well in case of unordered.
   */
  def testOperation[U: ClassTag, V: ClassTag, W: ClassTag](
      input1: Seq[Seq[U]],
      input2: Seq[Seq[V]],
      operation: (DStream[U], DStream[V]) => DStream[W],
      expectedOutput: Seq[Seq[W]],
      ordered: Boolean
    ) (implicit equality: Equality[W]): Unit = {
    assert(input1.length === input2.length,
      "Length of the input lists are not equal")

    val numBatches = input1.size

    withOutputAndStreamingContext(setupStreams[U, V, W](input1, input2, operation)) {
      (outputStream, ssc) =>
      val output = runStreams[W](
        outputStream, ssc, numBatches, expectedOutput.size)
      verifyOutput[W](output, expectedOutput, ordered)
    }
  }

  /**
   * Test binary DStream and RDD operation with two lists of inputs, with number of
   * batches to run same as the number of input values corresponding to the DStream.
   *
   * Each input micro-batch is a list of values or null to simulate an empty batch.
   *
   * @param input1         Sequence of input collections corresponding to the DStream
   * @param input2         Sequence of input values corresponding to the RDD
   * @param operation      Binary DStream and RDD operation to be applied to the
   *                       2 inputs
   * @param expectedOutput Sequence of expected output collections
   * @param ordered        Compare output values with expected output values
   *                       within the same output batch ordered or unordered.
   *                       Comparing doubles may not work well in case of unordered.
   */
  def testOperationWithRDD[U: ClassTag, V: ClassTag, W: ClassTag](
      input1: Seq[Seq[U]],
      input2: Seq[V],
      operation: (DStream[U], RDD[V]) => DStream[W],
      expectedOutput: Seq[Seq[W]],
      ordered: Boolean
    ) (implicit equality: Equality[W]): Unit = {

    val numBatches = input1.size

    withOutputAndStreamingContext(
      setupStreamAndRDD[U, V, W](input1, input2, operation)) {
      (outputStream, ssc) =>

      val output = runStreams[W](
        outputStream, ssc, numBatches, expectedOutput.size)
      verifyOutput[W](output, expectedOutput, ordered)
    }
  }
}