All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.holdenkarau.spark.testing.RDDComparisons.scala Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * This is a subset of the Spark Utils object that we use for testing
 */
package com.holdenkarau.spark.testing

import org.apache.spark.rdd._
import org.scalatest.Suite

import scala.reflect.ClassTag


trait RDDComparisons extends RDDComparisonsLike with TestSuite { self: Suite =>
}

trait RDDComparisonsLike extends TestSuiteLike {
  // tag::PANDA_ORDERED[]
  /**
   * Asserts two RDDs are equal (with the same order).
   * If they are equal assertion succeeds, otherwise assertion fails.
   */
  def assertRDDEqualsWithOrder[T: ClassTag](
    expected: RDD[T], result: RDD[T]): Unit = {
    assertTrue(compareRDDWithOrder(expected, result).isEmpty)
  }

  /**
   * Compare two RDDs with order (e.g. [1,2,3] != [3,2,1])
   * If the partitioners are not the same this requires multiple passes
   * on the input.
   * If they are equal returns None, otherwise returns Some with the first mismatch.
   * If the lengths are not equal, one of the two components may be None.
   */
  def compareRDDWithOrder[T: ClassTag](
    expected: RDD[T], result: RDD[T]): Option[(Option[T], Option[T])] = {
    // If there is a known partitioner just zip
    if (result.partitioner.map(_ == expected.partitioner.get).getOrElse(false)) {
      compareRDDWithOrderSamePartitioner(expected, result)
    } else {
      // Otherwise index every element
      def indexRDD[T](rdd: RDD[T]): RDD[(Long, T)] = {
        rdd.zipWithIndex.map { case (x, y) => (y, x) }
      }
      val indexedExpected = indexRDD(expected)
      val indexedResult = indexRDD(result)
      indexedExpected.cogroup(indexedResult).filter { case (_, (i1, i2)) =>
        i1.isEmpty || i2.isEmpty || i1.head != i2.head
      }.take(1).headOption.
        map { case (_, (i1, i2)) =>
          (i1.headOption, i2.headOption) }.take(1).headOption
    }
  }

  /**
   * Compare two RDDs. If they are equal returns None, otherwise
   * returns Some with the first mismatch. Assumes we have the same partitioner.
   */
  def compareRDDWithOrderSamePartitioner[T: ClassTag](
    expected: RDD[T], result: RDD[T]): Option[(Option[T], Option[T])] = {
    // Handle mismatched lengths by converting into options and padding with Nones
    expected.zipPartitions(result) {
      (thisIter, otherIter) =>
        new Iterator[(Option[T], Option[T])] {
          def hasNext: Boolean = (thisIter.hasNext || otherIter.hasNext)

          def next(): (Option[T], Option[T]) = {
            (thisIter.hasNext, otherIter.hasNext) match {
              case (false, true) => (Option.empty[T], Some(otherIter.next()))
              case (true, false) => (Some(thisIter.next()), Option.empty[T])
              case (true, true) => (Some(thisIter.next()), Some(otherIter.next()))
              case _ => throw new Exception("next called when elements consumed")
            }
          }
        }
    }.filter { case (v1, v2) => v1 != v2 }.take(1).headOption
  }

  // end::PANDA_ORDERED[]

  // tag::PANDA_UNORDERED[]
  /**
   * Asserts two RDDs are equal (un ordered).
   * If they are equal assertion succeeds, otherwise assertion fails.
   */
  def assertRDDEquals[T: ClassTag](expected: RDD[T], result: RDD[T]): Unit = {
    assertTrue(compareRDD(expected, result).isEmpty)
  }

  /**
   * Compare two RDDs where we do not require the order to be equal.
   * If they are equal returns None, otherwise returns Some with the first mismatch.
   *
   * @return None if the two RDDs are equal, or Some containing
   *              the first mismatch information.
   *              The mismatch information will be Tuple3 of:
   *              (key, number of times this key occur in expected RDD,
   *              number of times this key occur in result RDD)
   */
  def compareRDD[T: ClassTag](expected: RDD[T], result: RDD[T]):
      Option[(T, Int, Int)] = {
    // Key the values and count the number of each unique element
    val expectedKeyed = expected.map(x => (x, 1)).reduceByKey(_ + _)
    val resultKeyed = result.map(x => (x, 1)).reduceByKey(_ + _)
    // Group them together and filter for difference
    expectedKeyed.cogroup(resultKeyed).filter { case (_, (i1, i2)) =>
      i1.isEmpty || i2.isEmpty || i1.head != i2.head
    }
      .take(1).headOption.
      map { case (v, (i1, i2)) =>
        (v, i1.headOption.getOrElse(0), i2.headOption.getOrElse(0)) }
  }

  // end::PANDA_UNORDERED[]
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy