All Downloads are FREE. Search and download functionalities are using the official Maven repository.

spark.streaming.examples.RawNetworkGrep.scala Maven / Gradle / Ivy

package spark.streaming.examples

import spark.util.IntParam
import spark.storage.StorageLevel

import spark.streaming._
import spark.streaming.util.RawTextHelper

/**
 * Receives text from multiple rawNetworkStreams and counts how many '\n' delimited
 * lines have the word 'the' in them. This is useful for benchmarking purposes. This
 * will only work with spark.streaming.util.RawTextSender running on all worker nodes
 * and with Spark using Kryo serialization (set Java property "spark.serializer" to
 * "spark.KryoSerializer").
 * Usage: RawNetworkGrep     
 *    is the Spark master URL
 *    is the number rawNetworkStreams, which should be same as number
 *               of work nodes in the cluster
 *    is "localhost".
 *    is the port on which RawTextSender is running in the worker nodes.
 *    is the Spark Streaming batch duration in milliseconds.
 */

object RawNetworkGrep {
  def main(args: Array[String]) {
    if (args.length != 5) {
      System.err.println("Usage: RawNetworkGrep     ")
      System.exit(1)
    }

    val Array(master, IntParam(numStreams), host, IntParam(port), IntParam(batchMillis)) = args

    // Create the context
    val ssc = new StreamingContext(master, "RawNetworkGrep", Milliseconds(batchMillis),
      System.getenv("SPARK_HOME"), Seq(System.getenv("SPARK_EXAMPLES_JAR")))

    // Warm up the JVMs on master and slave for JIT compilation to kick in
    RawTextHelper.warmUp(ssc.sparkContext)

    val rawStreams = (1 to numStreams).map(_ =>
      ssc.rawSocketStream[String](host, port, StorageLevel.MEMORY_ONLY_SER_2)).toArray
    val union = ssc.union(rawStreams)
    union.filter(_.contains("the")).count().foreach(r =>
      println("Grep count: " + r.collect().mkString))
    ssc.start()
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy