All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.holdenkarau.spark.testing.DatasetGenerator.scala Maven / Gradle / Ivy

The newest version!
package com.holdenkarau.spark.testing

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, Encoder, SQLContext}
import org.scalacheck.{Arbitrary, Gen}

import scala.reflect.ClassTag

object DatasetGenerator {

  /**
   * Produces a ScalaCheck generator of Datasets whose elements come from
   * `generator`.
   *
   * Thin wrapper: returns the `Gen` held by the `Arbitrary` that
   * [[arbitraryDataset]] builds from the same arguments.
   *
   * @param sqlCtx        SQL context; its SparkContext backs the generated RDDs and
   *                      the context itself converts each RDD into a Dataset
   * @param minPartitions forwarded to RDDGenerator; defaults to 1. Pick a larger
   *                      value when the generated data is too large for a single
   *                      machine.
   * @param generator     by-name element generator, so it can be evaluated as many
   *                      times as required
   * @return a generator of Datasets of `T`
   */
  def genDataset[T: ClassTag : Encoder]
    (sqlCtx: SQLContext, minPartitions: Int = 1)
    (generator: => Gen[T]): Gen[Dataset[T]] =
    arbitraryDataset(sqlCtx, minPartitions)(generator).arbitrary

  /**
   * Produces a ScalaCheck generator of Datasets whose element generator is
   * derived from an Int.
   *
   * Thin wrapper: returns the `Gen` held by the `Arbitrary` that
   * [[arbitrarySizedDataset]] builds from the same arguments.
   *
   * @param sqlCtx        SQL context; its SparkContext backs the generated RDDs and
   *                      the context itself converts each RDD into a Dataset
   * @param minPartitions forwarded to RDDGenerator; defaults to 1. Pick a larger
   *                      value when the generated data is too large for a single
   *                      machine.
   * @param generator     function from an Int to an element generator. The Int is
   *                      supplied by RDDGenerator.genSizedRDD (presumably the
   *                      generation size -- confirm in RDDGenerator).
   * @return a generator of Datasets of `T`
   */
  def genSizedDataset[T: ClassTag : Encoder]
    (sqlCtx: SQLContext, minPartitions: Int = 1)
    (generator: Int => Gen[T]): Gen[Dataset[T]] =
    arbitrarySizedDataset(sqlCtx, minPartitions)(generator).arbitrary

  /**
   * Builds an `Arbitrary` of Datasets on top of RDDGenerator.genRDD.
   *
   * An RDD generator is obtained from RDDGenerator, each generated RDD is
   * converted with `sqlCtx.createDataset`, and the resulting generator is wrapped
   * in an `Arbitrary`. How the RDDs are partitioned is decided by RDDGenerator;
   * the intent is to vary the number of partitions so that problems with empty
   * partitions, etc. are caught.
   *
   * @param sqlCtx        SQL context; its SparkContext backs the generated RDDs and
   *                      the context itself converts each RDD into a Dataset
   * @param minPartitions forwarded to RDDGenerator; defaults to 1. Pick a larger
   *                      value when the generated data is too large for a single
   *                      machine.
   * @param generator     by-name element generator, so it can be evaluated as many
   *                      times as required
   * @return an `Arbitrary` whose generator yields Datasets of `T`
   */
  def arbitraryDataset[T: ClassTag : Encoder]
    (sqlCtx: SQLContext, minPartitions: Int = 1)
    (generator: => Gen[T]): Arbitrary[Dataset[T]] = {
    // Both generators are built eagerly, before being handed to Arbitrary.
    val rdds: Gen[RDD[T]] =
      RDDGenerator.genRDD[T](sqlCtx.sparkContext, minPartitions)(generator)
    val datasets: Gen[Dataset[T]] = datasetsFrom(sqlCtx, rdds)

    Arbitrary(datasets)
  }

  /**
   * Builds an `Arbitrary` of Datasets on top of RDDGenerator.genSizedRDD.
   *
   * Same pipeline as [[arbitraryDataset]], except that the element generator is
   * a function of an Int that RDDGenerator.genSizedRDD provides.
   *
   * @param sqlCtx        SQL context; its SparkContext backs the generated RDDs and
   *                      the context itself converts each RDD into a Dataset
   * @param minPartitions forwarded to RDDGenerator; defaults to 1. Pick a larger
   *                      value when the generated data is too large for a single
   *                      machine.
   * @param generator     function from an Int to an element generator. The Int is
   *                      supplied by RDDGenerator.genSizedRDD (presumably the
   *                      generation size -- confirm in RDDGenerator).
   * @return an `Arbitrary` whose generator yields Datasets of `T`
   */
  def arbitrarySizedDataset[T: ClassTag : Encoder]
    (sqlCtx: SQLContext, minPartitions: Int = 1)
    (generator: Int => Gen[T]): Arbitrary[Dataset[T]] = {
    // Both generators are built eagerly, before being handed to Arbitrary.
    val rdds: Gen[RDD[T]] =
      RDDGenerator.genSizedRDD[T](sqlCtx.sparkContext, minPartitions)(generator)
    val datasets: Gen[Dataset[T]] = datasetsFrom(sqlCtx, rdds)

    Arbitrary(datasets)
  }

  /** Turns a generator of RDDs into a generator of Datasets via `sqlCtx.createDataset`. */
  private def datasetsFrom[T: Encoder](
      sqlCtx: SQLContext,
      rdds: Gen[RDD[T]]): Gen[Dataset[T]] =
    for (rdd <- rdds) yield sqlCtx.createDataset(rdd)
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy