/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.mllib.random

import scala.reflect.ClassTag

import org.apache.spark.SparkContext
import org.apache.spark.annotation.Since
import org.apache.spark.api.java.{JavaDoubleRDD, JavaRDD, JavaSparkContext}
import org.apache.spark.api.java.JavaSparkContext.fakeClassTag
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.rdd.{RandomRDD, RandomVectorRDD}
import org.apache.spark.rdd.RDD
import org.apache.spark.util.Utils

/**
 * Generator methods for creating RDDs comprised of `i.i.d.` samples from some distribution.
 */
@Since("1.1.0")
object RandomRDDs {

  /**
   * Generates an RDD comprised of `i.i.d.` samples from the uniform distribution `U(0.0, 1.0)`.
   *
   * To transform the distribution in the generated RDD from `U(0.0, 1.0)` to `U(a, b)`, use
   * `RandomRDDs.uniformRDD(sc, n, p, seed).map(v => a + (b - a) * v)`.
   *
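   * A brief sketch of that transform (assuming an active `SparkContext` named `sc`):
   * {{{
   *   val u = RandomRDDs.uniformRDD(sc, 100L, 4)    // 100 samples ~ U(0.0, 1.0)
   *   val u25 = u.map(v => 2.0 + (5.0 - 2.0) * v)   // shifted to U(2.0, 5.0)
   * }}}
   *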
   * @param sc SparkContext used to create the RDD.
   * @param size Size of the RDD.
   * @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`).
   * @param seed Random seed (default: a random long integer).
   * @return RDD[Double] comprised of `i.i.d.` samples ~ `U(0.0, 1.0)`.
   */
  @Since("1.1.0")
  def uniformRDD(
      sc: SparkContext,
      size: Long,
      numPartitions: Int = 0,
      seed: Long = Utils.random.nextLong()): RDD[Double] = {
    val uniform = new UniformGenerator()
    randomRDD(sc, uniform, size, numPartitionsOrDefault(sc, numPartitions), seed)
  }

  /**
   * Java-friendly version of `RandomRDDs.uniformRDD`.
   */
  @Since("1.1.0")
  def uniformJavaRDD(
      jsc: JavaSparkContext,
      size: Long,
      numPartitions: Int,
      seed: Long): JavaDoubleRDD = {
    JavaDoubleRDD.fromRDD(uniformRDD(jsc.sc, size, numPartitions, seed))
  }

  /**
   * `RandomRDDs.uniformJavaRDD` with the default seed.
   */
  @Since("1.1.0")
  def uniformJavaRDD(jsc: JavaSparkContext, size: Long, numPartitions: Int): JavaDoubleRDD = {
    JavaDoubleRDD.fromRDD(uniformRDD(jsc.sc, size, numPartitions))
  }

  /**
   * `RandomRDDs.uniformJavaRDD` with the default number of partitions and the default seed.
   */
  @Since("1.1.0")
  def uniformJavaRDD(jsc: JavaSparkContext, size: Long): JavaDoubleRDD = {
    JavaDoubleRDD.fromRDD(uniformRDD(jsc.sc, size))
  }

  /**
   * Generates an RDD comprised of `i.i.d.` samples from the standard normal distribution.
   *
   * To transform the distribution in the generated RDD from standard normal to some other normal
   * `N(mean, sigma^2^)`, use `RandomRDDs.normalRDD(sc, n, p, seed).map(v => mean + sigma * v)`.
   *
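   * A brief sketch of that transform (assuming an active `SparkContext` named `sc`):
   * {{{
   *   val z = RandomRDDs.normalRDD(sc, 100L, 4)   // 100 samples ~ N(0.0, 1.0)
   *   val x = z.map(v => 1.0 + 2.0 * v)           // samples ~ N(1.0, 4.0)
   * }}}
   *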
   * @param sc SparkContext used to create the RDD.
   * @param size Size of the RDD.
   * @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`).
   * @param seed Random seed (default: a random long integer).
   * @return RDD[Double] comprised of `i.i.d.` samples ~ N(0.0, 1.0).
   */
  @Since("1.1.0")
  def normalRDD(
      sc: SparkContext,
      size: Long,
      numPartitions: Int = 0,
      seed: Long = Utils.random.nextLong()): RDD[Double] = {
    val normal = new StandardNormalGenerator()
    randomRDD(sc, normal, size, numPartitionsOrDefault(sc, numPartitions), seed)
  }

  /**
   * Java-friendly version of `RandomRDDs.normalRDD`.
   */
  @Since("1.1.0")
  def normalJavaRDD(
      jsc: JavaSparkContext,
      size: Long,
      numPartitions: Int,
      seed: Long): JavaDoubleRDD = {
    JavaDoubleRDD.fromRDD(normalRDD(jsc.sc, size, numPartitions, seed))
  }

  /**
   * `RandomRDDs.normalJavaRDD` with the default seed.
   */
  @Since("1.1.0")
  def normalJavaRDD(jsc: JavaSparkContext, size: Long, numPartitions: Int): JavaDoubleRDD = {
    JavaDoubleRDD.fromRDD(normalRDD(jsc.sc, size, numPartitions))
  }

  /**
   * `RandomRDDs.normalJavaRDD` with the default number of partitions and the default seed.
   */
  @Since("1.1.0")
  def normalJavaRDD(jsc: JavaSparkContext, size: Long): JavaDoubleRDD = {
    JavaDoubleRDD.fromRDD(normalRDD(jsc.sc, size))
  }

  /**
   * Generates an RDD comprised of `i.i.d.` samples from the Poisson distribution with the input
   * mean.
   *
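   * A usage sketch (assuming an active `SparkContext` named `sc`; the mean is illustrative):
   * {{{
   *   val counts = RandomRDDs.poissonRDD(sc, mean = 3.0, size = 100L)  // samples ~ Pois(3.0)
   * }}}
   *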
   * @param sc SparkContext used to create the RDD.
   * @param mean Mean, or lambda, for the Poisson distribution.
   * @param size Size of the RDD.
   * @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`).
   * @param seed Random seed (default: a random long integer).
   * @return RDD[Double] comprised of `i.i.d.` samples ~ Pois(mean).
   */
  @Since("1.1.0")
  def poissonRDD(
      sc: SparkContext,
      mean: Double,
      size: Long,
      numPartitions: Int = 0,
      seed: Long = Utils.random.nextLong()): RDD[Double] = {
    val poisson = new PoissonGenerator(mean)
    randomRDD(sc, poisson, size, numPartitionsOrDefault(sc, numPartitions), seed)
  }

  /**
   * Java-friendly version of `RandomRDDs.poissonRDD`.
   */
  @Since("1.1.0")
  def poissonJavaRDD(
      jsc: JavaSparkContext,
      mean: Double,
      size: Long,
      numPartitions: Int,
      seed: Long): JavaDoubleRDD = {
    JavaDoubleRDD.fromRDD(poissonRDD(jsc.sc, mean, size, numPartitions, seed))
  }

  /**
   * `RandomRDDs.poissonJavaRDD` with the default seed.
   */
  @Since("1.1.0")
  def poissonJavaRDD(
      jsc: JavaSparkContext,
      mean: Double,
      size: Long,
      numPartitions: Int): JavaDoubleRDD = {
    JavaDoubleRDD.fromRDD(poissonRDD(jsc.sc, mean, size, numPartitions))
  }

  /**
   * `RandomRDDs.poissonJavaRDD` with the default number of partitions and the default seed.
   */
  @Since("1.1.0")
  def poissonJavaRDD(jsc: JavaSparkContext, mean: Double, size: Long): JavaDoubleRDD = {
    JavaDoubleRDD.fromRDD(poissonRDD(jsc.sc, mean, size))
  }

  /**
   * Generates an RDD comprised of `i.i.d.` samples from the exponential distribution with
   * the input mean.
   *
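   * A usage sketch (assuming an active `SparkContext` named `sc`; the mean is illustrative):
   * {{{
   *   // samples with mean 2.0, i.e. rate lambda = 0.5
   *   val waits = RandomRDDs.exponentialRDD(sc, mean = 2.0, size = 100L)
   * }}}
   *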
   * @param sc SparkContext used to create the RDD.
   * @param mean Mean, or 1 / lambda, for the exponential distribution.
   * @param size Size of the RDD.
   * @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`).
   * @param seed Random seed (default: a random long integer).
   * @return RDD[Double] comprised of `i.i.d.` samples ~ Exp(mean).
   */
  @Since("1.3.0")
  def exponentialRDD(
      sc: SparkContext,
      mean: Double,
      size: Long,
      numPartitions: Int = 0,
      seed: Long = Utils.random.nextLong()): RDD[Double] = {
    val exponential = new ExponentialGenerator(mean)
    randomRDD(sc, exponential, size, numPartitionsOrDefault(sc, numPartitions), seed)
  }

  /**
   * Java-friendly version of `RandomRDDs.exponentialRDD`.
   */
  @Since("1.3.0")
  def exponentialJavaRDD(
      jsc: JavaSparkContext,
      mean: Double,
      size: Long,
      numPartitions: Int,
      seed: Long): JavaDoubleRDD = {
    JavaDoubleRDD.fromRDD(exponentialRDD(jsc.sc, mean, size, numPartitions, seed))
  }

  /**
   * `RandomRDDs.exponentialJavaRDD` with the default seed.
   */
  @Since("1.3.0")
  def exponentialJavaRDD(
      jsc: JavaSparkContext,
      mean: Double,
      size: Long,
      numPartitions: Int): JavaDoubleRDD = {
    JavaDoubleRDD.fromRDD(exponentialRDD(jsc.sc, mean, size, numPartitions))
  }

  /**
   * `RandomRDDs.exponentialJavaRDD` with the default number of partitions and the default seed.
   */
  @Since("1.3.0")
  def exponentialJavaRDD(jsc: JavaSparkContext, mean: Double, size: Long): JavaDoubleRDD = {
    JavaDoubleRDD.fromRDD(exponentialRDD(jsc.sc, mean, size))
  }

  /**
   * Generates an RDD comprised of `i.i.d.` samples from the gamma distribution with the input
   * shape and scale.
   *
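   * A usage sketch (assuming an active `SparkContext` named `sc`; the parameters are
   * illustrative):
   * {{{
   *   val g = RandomRDDs.gammaRDD(sc, shape = 2.0, scale = 1.5, size = 100L)
   * }}}
   *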
   * @param sc SparkContext used to create the RDD.
   * @param shape Shape parameter (greater than 0) for the gamma distribution.
   * @param scale Scale parameter (greater than 0) for the gamma distribution.
   * @param size Size of the RDD.
   * @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`).
   * @param seed Random seed (default: a random long integer).
   * @return RDD[Double] comprised of `i.i.d.` samples ~ Gamma(shape, scale).
   */
  @Since("1.3.0")
  def gammaRDD(
      sc: SparkContext,
      shape: Double,
      scale: Double,
      size: Long,
      numPartitions: Int = 0,
      seed: Long = Utils.random.nextLong()): RDD[Double] = {
    val gamma = new GammaGenerator(shape, scale)
    randomRDD(sc, gamma, size, numPartitionsOrDefault(sc, numPartitions), seed)
  }

  /**
   * Java-friendly version of `RandomRDDs.gammaRDD`.
   */
  @Since("1.3.0")
  def gammaJavaRDD(
      jsc: JavaSparkContext,
      shape: Double,
      scale: Double,
      size: Long,
      numPartitions: Int,
      seed: Long): JavaDoubleRDD = {
    JavaDoubleRDD.fromRDD(gammaRDD(jsc.sc, shape, scale, size, numPartitions, seed))
  }

  /**
   * `RandomRDDs.gammaJavaRDD` with the default seed.
   */
  @Since("1.3.0")
  def gammaJavaRDD(
      jsc: JavaSparkContext,
      shape: Double,
      scale: Double,
      size: Long,
      numPartitions: Int): JavaDoubleRDD = {
    JavaDoubleRDD.fromRDD(gammaRDD(jsc.sc, shape, scale, size, numPartitions))
  }

  /**
   * `RandomRDDs.gammaJavaRDD` with the default number of partitions and the default seed.
   */
  @Since("1.3.0")
  def gammaJavaRDD(
      jsc: JavaSparkContext,
      shape: Double,
      scale: Double,
      size: Long): JavaDoubleRDD = {
    JavaDoubleRDD.fromRDD(gammaRDD(jsc.sc, shape, scale, size))
  }

  /**
   * Generates an RDD comprised of `i.i.d.` samples from the log normal distribution with the
   * input mean and standard deviation.
   *
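   * A usage sketch (assuming an active `SparkContext` named `sc`; the parameters are
   * illustrative):
   * {{{
   *   val ln = RandomRDDs.logNormalRDD(sc, mean = 0.0, std = 1.0, size = 100L)
   * }}}
   *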
   * @param sc SparkContext used to create the RDD.
   * @param mean Mean for the log normal distribution.
   * @param std Standard deviation for the log normal distribution.
   * @param size Size of the RDD.
   * @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`).
   * @param seed Random seed (default: a random long integer).
   * @return RDD[Double] comprised of `i.i.d.` samples ~ LogNormal(mean, std).
   */
  @Since("1.3.0")
  def logNormalRDD(
      sc: SparkContext,
      mean: Double,
      std: Double,
      size: Long,
      numPartitions: Int = 0,
      seed: Long = Utils.random.nextLong()): RDD[Double] = {
    val logNormal = new LogNormalGenerator(mean, std)
    randomRDD(sc, logNormal, size, numPartitionsOrDefault(sc, numPartitions), seed)
  }

  /**
   * Java-friendly version of `RandomRDDs.logNormalRDD`.
   */
  @Since("1.3.0")
  def logNormalJavaRDD(
      jsc: JavaSparkContext,
      mean: Double,
      std: Double,
      size: Long,
      numPartitions: Int,
      seed: Long): JavaDoubleRDD = {
    JavaDoubleRDD.fromRDD(logNormalRDD(jsc.sc, mean, std, size, numPartitions, seed))
  }

  /**
   * `RandomRDDs.logNormalJavaRDD` with the default seed.
   */
  @Since("1.3.0")
  def logNormalJavaRDD(
      jsc: JavaSparkContext,
      mean: Double,
      std: Double,
      size: Long,
      numPartitions: Int): JavaDoubleRDD = {
    JavaDoubleRDD.fromRDD(logNormalRDD(jsc.sc, mean, std, size, numPartitions))
  }

  /**
   * `RandomRDDs.logNormalJavaRDD` with the default number of partitions and the default seed.
   */
  @Since("1.3.0")
  def logNormalJavaRDD(
      jsc: JavaSparkContext,
      mean: Double,
      std: Double,
      size: Long): JavaDoubleRDD = {
    JavaDoubleRDD.fromRDD(logNormalRDD(jsc.sc, mean, std, size))
  }

  /**
   * Generates an RDD comprised of `i.i.d.` samples produced by the input RandomDataGenerator.
   *
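   * A usage sketch with one of the built-in generators (assuming an active `SparkContext`
   * named `sc`; any `RandomDataGenerator` implementation can be passed instead):
   * {{{
   *   val data = RandomRDDs.randomRDD(sc, new PoissonGenerator(3.0), 100L)
   * }}}
   *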
   * @param sc SparkContext used to create the RDD.
   * @param generator RandomDataGenerator used to populate the RDD.
   * @param size Size of the RDD.
   * @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`).
   * @param seed Random seed (default: a random long integer).
   * @return RDD[T] comprised of `i.i.d.` samples produced by generator.
   */
  @Since("1.1.0")
  def randomRDD[T: ClassTag](
      sc: SparkContext,
      generator: RandomDataGenerator[T],
      size: Long,
      numPartitions: Int = 0,
      seed: Long = Utils.random.nextLong()): RDD[T] = {
    new RandomRDD[T](sc, size, numPartitionsOrDefault(sc, numPartitions), generator, seed)
  }

  /**
   * Generates an RDD comprised of `i.i.d.` samples produced by the input RandomDataGenerator.
   *
   * @param jsc JavaSparkContext used to create the RDD.
   * @param generator RandomDataGenerator used to populate the RDD.
   * @param size Size of the RDD.
   * @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`).
   * @param seed Random seed (default: a random long integer).
   * @return RDD[T] comprised of `i.i.d.` samples produced by generator.
   */
  @Since("1.6.0")
  def randomJavaRDD[T](
      jsc: JavaSparkContext,
      generator: RandomDataGenerator[T],
      size: Long,
      numPartitions: Int,
      seed: Long): JavaRDD[T] = {
    implicit val ctag: ClassTag[T] = fakeClassTag
    val rdd = randomRDD(jsc.sc, generator, size, numPartitions, seed)
    JavaRDD.fromRDD(rdd)
  }

  /**
   * `RandomRDDs.randomJavaRDD` with the default seed.
   */
  @Since("1.6.0")
  def randomJavaRDD[T](
      jsc: JavaSparkContext,
      generator: RandomDataGenerator[T],
      size: Long,
      numPartitions: Int): JavaRDD[T] = {
    randomJavaRDD(jsc, generator, size, numPartitions, Utils.random.nextLong())
  }

  /**
   * `RandomRDDs.randomJavaRDD` with the default number of partitions and the default seed.
   */
  @Since("1.6.0")
  def randomJavaRDD[T](
      jsc: JavaSparkContext,
      generator: RandomDataGenerator[T],
      size: Long): JavaRDD[T] = {
    randomJavaRDD(jsc, generator, size, 0)
  }

  // TODO Generate RDD[Vector] from multivariate distributions.

  /**
   * Generates an RDD[Vector] with vectors containing `i.i.d.` samples drawn from the
   * uniform distribution `U(0.0, 1.0)`.
   *
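   * A usage sketch (assuming an active `SparkContext` named `sc`):
   * {{{
   *   val mat = RandomRDDs.uniformVectorRDD(sc, numRows = 100L, numCols = 10)
   * }}}
   *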
   * @param sc SparkContext used to create the RDD.
   * @param numRows Number of Vectors in the RDD.
   * @param numCols Number of elements in each Vector.
   * @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`).
   * @param seed Seed for the RNG that generates the seed for the generator in each partition.
   * @return RDD[Vector] with vectors containing `i.i.d.` samples ~ `U(0.0, 1.0)`.
   */
  @Since("1.1.0")
  def uniformVectorRDD(
      sc: SparkContext,
      numRows: Long,
      numCols: Int,
      numPartitions: Int = 0,
      seed: Long = Utils.random.nextLong()): RDD[Vector] = {
    val uniform = new UniformGenerator()
    randomVectorRDD(sc, uniform, numRows, numCols, numPartitionsOrDefault(sc, numPartitions), seed)
  }

  /**
   * Java-friendly version of `RandomRDDs.uniformVectorRDD`.
   */
  @Since("1.1.0")
  def uniformJavaVectorRDD(
      jsc: JavaSparkContext,
      numRows: Long,
      numCols: Int,
      numPartitions: Int,
      seed: Long): JavaRDD[Vector] = {
    uniformVectorRDD(jsc.sc, numRows, numCols, numPartitions, seed).toJavaRDD()
  }

  /**
   * `RandomRDDs.uniformJavaVectorRDD` with the default seed.
   */
  @Since("1.1.0")
  def uniformJavaVectorRDD(
      jsc: JavaSparkContext,
      numRows: Long,
      numCols: Int,
      numPartitions: Int): JavaRDD[Vector] = {
    uniformVectorRDD(jsc.sc, numRows, numCols, numPartitions).toJavaRDD()
  }

  /**
   * `RandomRDDs.uniformJavaVectorRDD` with the default number of partitions and the default seed.
   */
  @Since("1.1.0")
  def uniformJavaVectorRDD(
      jsc: JavaSparkContext,
      numRows: Long,
      numCols: Int): JavaRDD[Vector] = {
    uniformVectorRDD(jsc.sc, numRows, numCols).toJavaRDD()
  }

  /**
   * Generates an RDD[Vector] with vectors containing `i.i.d.` samples drawn from the
   * standard normal distribution.
   *
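   * A usage sketch (assuming an active `SparkContext` named `sc`); as with `normalRDD`, the
   * samples can be shifted and scaled after generation:
   * {{{
   *   import org.apache.spark.mllib.linalg.Vectors
   *   val z = RandomRDDs.normalVectorRDD(sc, numRows = 100L, numCols = 10)
   *   // shift and scale each element to N(1.0, 4.0)
   *   val x = z.map(v => Vectors.dense(v.toArray.map(e => 1.0 + 2.0 * e)))
   * }}}
   *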
   * @param sc SparkContext used to create the RDD.
   * @param numRows Number of Vectors in the RDD.
   * @param numCols Number of elements in each Vector.
   * @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`).
   * @param seed Random seed (default: a random long integer).
   * @return RDD[Vector] with vectors containing `i.i.d.` samples ~ `N(0.0, 1.0)`.
   */
  @Since("1.1.0")
  def normalVectorRDD(
      sc: SparkContext,
      numRows: Long,
      numCols: Int,
      numPartitions: Int = 0,
      seed: Long = Utils.random.nextLong()): RDD[Vector] = {
    val normal = new StandardNormalGenerator()
    randomVectorRDD(sc, normal, numRows, numCols, numPartitionsOrDefault(sc, numPartitions), seed)
  }

  /**
   * Java-friendly version of `RandomRDDs.normalVectorRDD`.
   */
  @Since("1.1.0")
  def normalJavaVectorRDD(
      jsc: JavaSparkContext,
      numRows: Long,
      numCols: Int,
      numPartitions: Int,
      seed: Long): JavaRDD[Vector] = {
    normalVectorRDD(jsc.sc, numRows, numCols, numPartitions, seed).toJavaRDD()
  }

  /**
   * `RandomRDDs.normalJavaVectorRDD` with the default seed.
   */
  @Since("1.1.0")
  def normalJavaVectorRDD(
      jsc: JavaSparkContext,
      numRows: Long,
      numCols: Int,
      numPartitions: Int): JavaRDD[Vector] = {
    normalVectorRDD(jsc.sc, numRows, numCols, numPartitions).toJavaRDD()
  }

  /**
   * `RandomRDDs.normalJavaVectorRDD` with the default number of partitions and the default seed.
   */
  @Since("1.1.0")
  def normalJavaVectorRDD(
      jsc: JavaSparkContext,
      numRows: Long,
      numCols: Int): JavaRDD[Vector] = {
    normalVectorRDD(jsc.sc, numRows, numCols).toJavaRDD()
  }

  /**
   * Generates an RDD[Vector] with vectors containing `i.i.d.` samples drawn from a
   * log normal distribution.
   *
   * @param sc SparkContext used to create the RDD.
   * @param mean Mean of the log normal distribution.
   * @param std Standard deviation of the log normal distribution.
   * @param numRows Number of Vectors in the RDD.
   * @param numCols Number of elements in each Vector.
   * @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`).
   * @param seed Random seed (default: a random long integer).
   * @return RDD[Vector] with vectors containing `i.i.d.` samples.
   */
  @Since("1.3.0")
  def logNormalVectorRDD(
      sc: SparkContext,
      mean: Double,
      std: Double,
      numRows: Long,
      numCols: Int,
      numPartitions: Int = 0,
      seed: Long = Utils.random.nextLong()): RDD[Vector] = {
    val logNormal = new LogNormalGenerator(mean, std)
    randomVectorRDD(sc, logNormal, numRows, numCols,
      numPartitionsOrDefault(sc, numPartitions), seed)
  }

  /**
   * Java-friendly version of `RandomRDDs.logNormalVectorRDD`.
   */
  @Since("1.3.0")
  def logNormalJavaVectorRDD(
      jsc: JavaSparkContext,
      mean: Double,
      std: Double,
      numRows: Long,
      numCols: Int,
      numPartitions: Int,
      seed: Long): JavaRDD[Vector] = {
    logNormalVectorRDD(jsc.sc, mean, std, numRows, numCols, numPartitions, seed).toJavaRDD()
  }

  /**
   * `RandomRDDs.logNormalJavaVectorRDD` with the default seed.
   */
  @Since("1.3.0")
  def logNormalJavaVectorRDD(
      jsc: JavaSparkContext,
      mean: Double,
      std: Double,
      numRows: Long,
      numCols: Int,
      numPartitions: Int): JavaRDD[Vector] = {
    logNormalVectorRDD(jsc.sc, mean, std, numRows, numCols, numPartitions).toJavaRDD()
  }

  /**
   * `RandomRDDs.logNormalJavaVectorRDD` with the default number of partitions and
   * the default seed.
   */
  @Since("1.3.0")
  def logNormalJavaVectorRDD(
      jsc: JavaSparkContext,
      mean: Double,
      std: Double,
      numRows: Long,
      numCols: Int): JavaRDD[Vector] = {
    logNormalVectorRDD(jsc.sc, mean, std, numRows, numCols).toJavaRDD()
  }

  /**
   * Generates an RDD[Vector] with vectors containing `i.i.d.` samples drawn from the
   * Poisson distribution with the input mean.
   *
   * @param sc SparkContext used to create the RDD.
   * @param mean Mean, or lambda, for the Poisson distribution.
   * @param numRows Number of Vectors in the RDD.
   * @param numCols Number of elements in each Vector.
   * @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`).
   * @param seed Random seed (default: a random long integer).
   * @return RDD[Vector] with vectors containing `i.i.d.` samples ~ Pois(mean).
   */
  @Since("1.1.0")
  def poissonVectorRDD(
      sc: SparkContext,
      mean: Double,
      numRows: Long,
      numCols: Int,
      numPartitions: Int = 0,
      seed: Long = Utils.random.nextLong()): RDD[Vector] = {
    val poisson = new PoissonGenerator(mean)
    randomVectorRDD(sc, poisson, numRows, numCols, numPartitionsOrDefault(sc, numPartitions), seed)
  }

  /**
   * Java-friendly version of `RandomRDDs.poissonVectorRDD`.
   */
  @Since("1.1.0")
  def poissonJavaVectorRDD(
      jsc: JavaSparkContext,
      mean: Double,
      numRows: Long,
      numCols: Int,
      numPartitions: Int,
      seed: Long): JavaRDD[Vector] = {
    poissonVectorRDD(jsc.sc, mean, numRows, numCols, numPartitions, seed).toJavaRDD()
  }

  /**
   * `RandomRDDs.poissonJavaVectorRDD` with the default seed.
   */
  @Since("1.1.0")
  def poissonJavaVectorRDD(
      jsc: JavaSparkContext,
      mean: Double,
      numRows: Long,
      numCols: Int,
      numPartitions: Int): JavaRDD[Vector] = {
    poissonVectorRDD(jsc.sc, mean, numRows, numCols, numPartitions).toJavaRDD()
  }

  /**
   * `RandomRDDs.poissonJavaVectorRDD` with the default number of partitions and the default seed.
   */
  @Since("1.1.0")
  def poissonJavaVectorRDD(
      jsc: JavaSparkContext,
      mean: Double,
      numRows: Long,
      numCols: Int): JavaRDD[Vector] = {
    poissonVectorRDD(jsc.sc, mean, numRows, numCols).toJavaRDD()
  }

  /**
   * Generates an RDD[Vector] with vectors containing `i.i.d.` samples drawn from the
   * exponential distribution with the input mean.
   *
   * @param sc SparkContext used to create the RDD.
   * @param mean Mean, or 1 / lambda, for the exponential distribution.
   * @param numRows Number of Vectors in the RDD.
   * @param numCols Number of elements in each Vector.
   * @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`).
   * @param seed Random seed (default: a random long integer).
   * @return RDD[Vector] with vectors containing `i.i.d.` samples ~ Exp(mean).
   */
  @Since("1.3.0")
  def exponentialVectorRDD(
      sc: SparkContext,
      mean: Double,
      numRows: Long,
      numCols: Int,
      numPartitions: Int = 0,
      seed: Long = Utils.random.nextLong()): RDD[Vector] = {
    val exponential = new ExponentialGenerator(mean)
    randomVectorRDD(sc, exponential, numRows, numCols,
      numPartitionsOrDefault(sc, numPartitions), seed)
  }

  /**
   * Java-friendly version of `RandomRDDs.exponentialVectorRDD`.
   */
  @Since("1.3.0")
  def exponentialJavaVectorRDD(
      jsc: JavaSparkContext,
      mean: Double,
      numRows: Long,
      numCols: Int,
      numPartitions: Int,
      seed: Long): JavaRDD[Vector] = {
    exponentialVectorRDD(jsc.sc, mean, numRows, numCols, numPartitions, seed).toJavaRDD()
  }

  /**
   * `RandomRDDs.exponentialJavaVectorRDD` with the default seed.
   */
  @Since("1.3.0")
  def exponentialJavaVectorRDD(
      jsc: JavaSparkContext,
      mean: Double,
      numRows: Long,
      numCols: Int,
      numPartitions: Int): JavaRDD[Vector] = {
    exponentialVectorRDD(jsc.sc, mean, numRows, numCols, numPartitions).toJavaRDD()
  }

  /**
   * `RandomRDDs.exponentialJavaVectorRDD` with the default number of partitions
   * and the default seed.
   */
  @Since("1.3.0")
  def exponentialJavaVectorRDD(
      jsc: JavaSparkContext,
      mean: Double,
      numRows: Long,
      numCols: Int): JavaRDD[Vector] = {
    exponentialVectorRDD(jsc.sc, mean, numRows, numCols).toJavaRDD()
  }

  /**
   * Generates an RDD[Vector] with vectors containing `i.i.d.` samples drawn from the
   * gamma distribution with the input shape and scale.
   *
   * @param sc SparkContext used to create the RDD.
   * @param shape Shape parameter (greater than 0) for the gamma distribution.
   * @param scale Scale parameter (greater than 0) for the gamma distribution.
   * @param numRows Number of Vectors in the RDD.
   * @param numCols Number of elements in each Vector.
   * @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`).
   * @param seed Random seed (default: a random long integer).
   * @return RDD[Vector] with vectors containing `i.i.d.` samples ~ Gamma(shape, scale).
   */
  @Since("1.3.0")
  def gammaVectorRDD(
      sc: SparkContext,
      shape: Double,
      scale: Double,
      numRows: Long,
      numCols: Int,
      numPartitions: Int = 0,
      seed: Long = Utils.random.nextLong()): RDD[Vector] = {
    val gamma = new GammaGenerator(shape, scale)
    randomVectorRDD(sc, gamma, numRows, numCols, numPartitionsOrDefault(sc, numPartitions), seed)
  }

  /**
   * Java-friendly version of `RandomRDDs.gammaVectorRDD`.
   */
  @Since("1.3.0")
  def gammaJavaVectorRDD(
      jsc: JavaSparkContext,
      shape: Double,
      scale: Double,
      numRows: Long,
      numCols: Int,
      numPartitions: Int,
      seed: Long): JavaRDD[Vector] = {
    gammaVectorRDD(jsc.sc, shape, scale, numRows, numCols, numPartitions, seed).toJavaRDD()
  }

  /**
   * `RandomRDDs.gammaJavaVectorRDD` with the default seed.
   */
  @Since("1.3.0")
  def gammaJavaVectorRDD(
      jsc: JavaSparkContext,
      shape: Double,
      scale: Double,
      numRows: Long,
      numCols: Int,
      numPartitions: Int): JavaRDD[Vector] = {
    gammaVectorRDD(jsc.sc, shape, scale, numRows, numCols, numPartitions).toJavaRDD()
  }

  /**
   * `RandomRDDs.gammaJavaVectorRDD` with the default number of partitions and the default seed.
   */
  @Since("1.3.0")
  def gammaJavaVectorRDD(
      jsc: JavaSparkContext,
      shape: Double,
      scale: Double,
      numRows: Long,
      numCols: Int): JavaRDD[Vector] = {
    gammaVectorRDD(jsc.sc, shape, scale, numRows, numCols).toJavaRDD()
  }

  /**
   * Generates an RDD[Vector] with vectors containing `i.i.d.` samples produced by the
   * input RandomDataGenerator.
   *
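   * A usage sketch with a custom generator (assuming an active `SparkContext` named `sc`;
   * `ExponentialGenerator` stands in for any `RandomDataGenerator[Double]`):
   * {{{
   *   val vecs = RandomRDDs.randomVectorRDD(sc, new ExponentialGenerator(1.0), 100L, 10)
   * }}}
   *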
   * @param sc SparkContext used to create the RDD.
   * @param generator RandomDataGenerator used to populate the RDD.
   * @param numRows Number of Vectors in the RDD.
   * @param numCols Number of elements in each Vector.
   * @param numPartitions Number of partitions in the RDD (default: `sc.defaultParallelism`).
   * @param seed Random seed (default: a random long integer).
   * @return RDD[Vector] with vectors containing `i.i.d.` samples produced by generator.
   */
  @Since("1.1.0")
  def randomVectorRDD(
      sc: SparkContext,
      generator: RandomDataGenerator[Double],
      numRows: Long,
      numCols: Int,
      numPartitions: Int = 0,
      seed: Long = Utils.random.nextLong()): RDD[Vector] = {
    new RandomVectorRDD(
      sc, numRows, numCols, numPartitionsOrDefault(sc, numPartitions), generator, seed)
  }

  /**
   * Java-friendly version of `RandomRDDs.randomVectorRDD`.
   */
  @Since("1.6.0")
  def randomJavaVectorRDD(
      jsc: JavaSparkContext,
      generator: RandomDataGenerator[Double],
      numRows: Long,
      numCols: Int,
      numPartitions: Int,
      seed: Long): JavaRDD[Vector] = {
    randomVectorRDD(jsc.sc, generator, numRows, numCols, numPartitions, seed).toJavaRDD()
  }

  /**
   * `RandomRDDs.randomJavaVectorRDD` with the default seed.
   */
  @Since("1.6.0")
  def randomJavaVectorRDD(
      jsc: JavaSparkContext,
      generator: RandomDataGenerator[Double],
      numRows: Long,
      numCols: Int,
      numPartitions: Int): JavaRDD[Vector] = {
    randomVectorRDD(jsc.sc, generator, numRows, numCols, numPartitions).toJavaRDD()
  }

  /**
   * `RandomRDDs.randomJavaVectorRDD` with the default number of partitions and the default seed.
   */
  @Since("1.6.0")
  def randomJavaVectorRDD(
      jsc: JavaSparkContext,
      generator: RandomDataGenerator[Double],
      numRows: Long,
      numCols: Int): JavaRDD[Vector] = {
    randomVectorRDD(jsc.sc, generator, numRows, numCols).toJavaRDD()
  }

  /**
   * Returns `numPartitions` if it is positive, or `sc.defaultParallelism` otherwise.
   */
  private def numPartitionsOrDefault(sc: SparkContext, numPartitions: Int): Int = {
    if (numPartitions > 0) numPartitions else sc.defaultParallelism
  }
}