All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.holdenkarau.spark.testing.JavaStreamingSuitebase.scala Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.holdenkarau.spark.testing

import org.apache.spark.SparkConf
import org.apache.spark.api.java.function.{Function => JFunction, Function2 => JFunction2}
import org.apache.spark.streaming.api.java._
import org.apache.spark.streaming.dstream.DStream
import org.junit.Assert._

import java.util.{List => JList}
import scala.reflect.ClassTag

/**
 * This is the base class for Spark Streaming test suites. It provides basic
 * functionality to run a user-defined set of inputs through user-defined stream
 * operations and verify that the output matches the expected output.
 *
 * This implementation is designed to work with JUnit for Java users.
 *
 * Note: this always uses the manual clock to control Spark Streaming's batches.
 */
class JavaStreamingSuiteBase extends JavaSuiteBase with StreamingSuiteCommon {

  import scala.jdk.CollectionConverters._

  /**
   * Configuration used by the suite: the parent configuration with
   * `spark.streaming.clock` set to the `TestManualClock` class.
   */
  override def conf: SparkConf = super.conf
    .set("spark.streaming.clock", "org.apache.spark.streaming.util.TestManualClock")

  /**
   * Verify that the output of a DStream operation equals the expected output.
   *
   * The number of batches must match. Each batch is then compared with the
   * expected batch at the same position, either element by element (order
   * matters) or as a multiset (order does not matter, duplicates do).
   *
   * @param output         Batches actually produced by the operation
   * @param expectedOutput Batches the operation is expected to produce
   * @param ordered        Whether the order of the values inside a batch matters
   * @tparam V             Element type of the output batches
   */
  def verifyOutput[V: ClassTag](
      output: Seq[Seq[V]],
      expectedOutput: Seq[Seq[V]],
      ordered: Boolean): Unit = {

    logInfo("--------------------------------")
    logInfo("output.size = " + output.size)
    logInfo("output")
    output.foreach(batch => logInfo(batch.mkString("[", ",", "]")))

    logInfo("expected output.size = " + expectedOutput.size)
    logInfo("expected output")
    expectedOutput.foreach(batch => logInfo(batch.mkString("[", ",", "]")))
    logInfo("--------------------------------")

    // Match the output with the expected output
    assertEquals("Number of outputs do not match", expectedOutput.size, output.size)

    // Both sequences have the same length at this point, so zipping pairs every
    // expected batch with the actual batch at the same position without relying
    // on indexed access (which is linear for List-backed sequences).
    val batchPairs = expectedOutput.zip(output)

    if (ordered) {
      batchPairs.foreach { case (expected, actual) =>
        compareArrays[V](expected.toArray, actual.toArray)
      }
    } else {
      // Order does not matter which makes our life harder.
      // Sorting by hash code could give a false negative on a hash collision,
      // so instead every batch is reduced to a value -> occurrence-count map
      // and the maps are compared.
      def occurrenceCounts(batch: Seq[V]): java.util.Map[V, Int] =
        batch.groupMapReduce(identity)(_ => 1)(_ + _).asJava

      batchPairs.zipWithIndex.foreach { case ((expected, actual), batchIndex) =>
        assertEquals(
          s"Contents of output batch $batchIndex do not match",
          occurrenceCounts(expected),
          occurrenceCounts(actual)
        )
      }
    }

    logInfo("Output verified successfully")
  }

  /**
   * Test unary DStream operation with a list of inputs, with number of
   * batches to run same as the number of input values. Values inside an
   * output batch are compared without regard to order.
   *
   * Each input micro-batch is a list of values, or null to simulate an empty batch.
   *
   * @param input          Sequence of input collections
   * @param operation      Unary DStream operation to be applied to the input
   * @param expectedOutput Sequence of expected output collections
   * @tparam U             Element type of the input stream
   * @tparam V             Element type of the output stream
   */
  def testOperation[U, V](
      input: JList[JList[U]],
      operation: JFunction[JavaDStream[U], JavaDStream[V]],
      expectedOutput: JList[JList[V]]): Unit = {
    testOperation[U, V](input, operation, expectedOutput, ordered = false)
  }

  /**
   * Test unary DStream operation with a list of inputs, with number of
   * batches to run same as the number of input values.
   *
   * Each input micro-batch is a list of values, or null to simulate an empty batch.
   *
   * @param input          Sequence of input collections
   * @param operation      Unary DStream operation to be applied to the input
   * @param expectedOutput Sequence of expected output collections
   * @param ordered        Compare output values with expected output values
   *                       within the same output batch ordered or unordered.
   *                       Comparing doubles may not work well in case of unordered.
   * @tparam U             Element type of the input stream
   * @tparam V             Element type of the output stream
   */
  def testOperation[U, V](
      input: JList[JList[U]],
      operation: JFunction[JavaDStream[U], JavaDStream[V]],
      expectedOutput: JList[JList[V]],
      ordered: Boolean): Unit = {

    val numBatches = input.size

    // Java callers cannot supply ClassTags, so stand-in tags are obtained from Utils.
    implicit val ctagU: ClassTag[U] = Utils.fakeClassTag[U]
    implicit val ctagV: ClassTag[V] = Utils.fakeClassTag[V]

    val sInput = toSeq(input)
    val sExpectedOutput = toSeq(expectedOutput)

    // Adapts the Java-facing operation to the Scala DStream API.
    def wrappedOperation(input: DStream[U]): DStream[V] = {
      operation.call(new JavaDStream[U](input)).dstream
    }

    withOutputAndStreamingContext(setupStreams[U, V](sInput, wrappedOperation)) {
      (outputStream, ssc) =>
        val output: Seq[Seq[V]] =
          runStreams[V](outputStream, ssc, numBatches, expectedOutput.size)
        verifyOutput[V](output, sExpectedOutput, ordered)
    }
  }


  /**
   * Test binary DStream operation with two lists of inputs, with number of
   * batches to run same as the number of input values. The size of the two input
   * lists should be equal. Values inside an output batch are compared without
   * regard to order.
   *
   * Each input micro-batch is a list of values, or null to simulate an empty batch.
   *
   * @param input1         First sequence of input collections
   * @param input2         Second sequence of input collections
   * @param operation      Binary DStream operation to be applied to the 2 inputs
   * @param expectedOutput Sequence of expected output collections
   * @tparam U             Element type of the first input stream
   * @tparam V             Element type of the second input stream
   * @tparam W             Element type of the output stream
   */
  def testOperation[U, V, W](
      input1: JList[JList[U]],
      input2: JList[JList[V]],
      operation: JFunction2[JavaDStream[U], JavaDStream[V], JavaDStream[W]],
      expectedOutput: JList[JList[W]]): Unit = {
    testOperation[U, V, W](input1, input2, operation, expectedOutput, ordered = false)
  }

  /**
   * Test binary DStream operation with two lists of inputs, with number of
   * batches to run same as the number of input values. The size of the two input
   * lists should be equal.
   *
   * Each input micro-batch is a list of values, or null to simulate an empty batch.
   *
   * @param input1         First sequence of input collections
   * @param input2         Second sequence of input collections
   * @param operation      Binary DStream operation to be applied to the 2 inputs
   * @param expectedOutput Sequence of expected output collections
   * @param ordered        Compare output values with expected output values
   *                       within the same output batch ordered or unordered.
   *                       Comparing doubles may not work well in case of unordered.
   * @tparam U             Element type of the first input stream
   * @tparam V             Element type of the second input stream
   * @tparam W             Element type of the output stream
   */
  def testOperation[U, V, W](
      input1: JList[JList[U]],
      input2: JList[JList[V]],
      operation: JFunction2[JavaDStream[U], JavaDStream[V], JavaDStream[W]],
      expectedOutput: JList[JList[W]],
      ordered: Boolean): Unit = {

    assertEquals("Length of the input lists are not equal",
      input1.size, input2.size)
    val numBatches = input1.size

    // Java callers cannot supply ClassTags, so stand-in tags are obtained from Utils.
    implicit val ctagU: ClassTag[U] = Utils.fakeClassTag[U]
    implicit val ctagV: ClassTag[V] = Utils.fakeClassTag[V]
    implicit val ctagW: ClassTag[W] = Utils.fakeClassTag[W]

    val sInput1 = toSeq(input1)
    val sInput2 = toSeq(input2)
    val sExpectedOutput = toSeq(expectedOutput)

    // Adapts the Java-facing operation to the Scala DStream API.
    def wrappedOperation(input1: DStream[U], input2: DStream[V]): DStream[W] = {
      operation.call(new JavaDStream[U](input1), new JavaDStream[V](input2)).dstream
    }

    withOutputAndStreamingContext(
      setupStreams[U, V, W](sInput1, sInput2, wrappedOperation)) {
      (outputStream, ssc) =>
        val output = runStreams[W](
          outputStream, ssc, numBatches, expectedOutput.size)
        verifyOutput[W](output, sExpectedOutput, ordered)
    }
  }

  /**
   * Convert a Java list of batches into immutable Scala sequences.
   *
   * A null batch is converted to an empty sequence, so that the documented
   * "null simulates an empty batch" contract of `testOperation` holds instead
   * of failing with a NullPointerException during conversion.
   *
   * @param input Batches as supplied by the Java caller
   * @return One Scala sequence per batch, in the original order
   */
  private def toSeq[U](input: JList[JList[U]]): Seq[Seq[U]] =
    input.asScala.map { batch =>
      Option(batch).fold(Seq.empty[U])(_.asScala.toSeq)
    }.toSeq
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy