All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.lucidworks.spark.example.events.EventsimIndexer.scala Maven / Gradle / Ivy

package com.lucidworks.spark.example.events

import java.net.URL
import java.util.{Calendar, TimeZone}
import com.lucidworks.spark.SparkApp.RDDProcessor
import com.lucidworks.spark.fusion.FusionPipelineClient
import org.apache.commons.cli.{CommandLine, Option}
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Row, SparkSession}

import scala.collection.JavaConverters.bufferAsJavaList
import scala.collection.mutable.ListBuffer

class EventsimIndexer extends RDDProcessor {
  val DEFAULT_ENDPOINT =
    "http://localhost:8764/api/apollo/index-pipelines/eventsim-default/collections/eventsim/index"

  def getName: String = "eventsim"

  def getOptions: Array[Option] = {
    Array(
      Option.builder()
        .hasArg().required(true)
        .desc("Path to an eventsim JSON file")
        .longOpt("eventsimJson").build,
      Option.builder()
        .hasArg()
        .desc("Fusion endpoint(s); default is " + DEFAULT_ENDPOINT)
        .longOpt("fusion").build,
      Option.builder()
        .hasArg()
        .desc("Fusion username; default is admin")
        .longOpt("fusionUser").build,
      Option.builder()
        .hasArg()
        .desc("Fusion password; required if fusionAuthEnbled=true")
        .longOpt("fusionPass").build,
      Option.builder()
        .hasArg()
        .desc("Fusion security realm; default is native")
        .longOpt("fusionRealm").build,
      Option.builder()
        .hasArg()
        .desc("Fusion authentication enabled; default is true")
        .longOpt("fusionAuthEnabled").build,
      Option.builder()
        .hasArg()
        .desc("Fusion indexing batch size; default is 100")
        .longOpt("fusionBatchSize").build
    )
  }

  def run(conf: SparkConf, cli: CommandLine): Int = {
    val fusionEndpoints: String = cli.getOptionValue("fusion", DEFAULT_ENDPOINT)
    val fusionAuthEnabled: Boolean =
      "true".equalsIgnoreCase(cli.getOptionValue("fusionAuthEnabled", "true"))
    val fusionUser: String = cli.getOptionValue("fusionUser", "admin")

    val fusionPass: String = cli.getOptionValue("fusionPass")
    if (fusionAuthEnabled && (fusionPass == null || fusionPass.isEmpty))
      throw new IllegalArgumentException("Fusion password is required when authentication is enabled!")

    val fusionRealm: String = cli.getOptionValue("fusionRealm", "native")
    val fusionBatchSize: Int = cli.getOptionValue("fusionBatchSize", "100").toInt

    val urls = fusionEndpoints.split(",").distinct
    val url = new URL(urls(0))
    val pipelinePath = url.getPath

    val sparkSession: SparkSession = SparkSession.builder().config(conf).getOrCreate()

    sparkSession.read.json(cli.getOptionValue("eventsimJson")).foreachPartition((rows: Iterator[Row]) => {
      val fusion: FusionPipelineClient =
        if (fusionAuthEnabled) new FusionPipelineClient(fusionEndpoints, fusionUser, fusionPass, fusionRealm)
        else new FusionPipelineClient(fusionEndpoints)

      val batch = new ListBuffer[Map[String, _]]()
      rows.foreach(next => {
        var userId: String = ""
        var sessionId: String = ""
        var ts: Long = 0

        val fields = new ListBuffer[Map[String, _]]()
        for (c <- 0 until next.length) {
          val obj = next.get(c)
          if (obj != null) {
            var colValue = obj
            val fieldName = next.schema.fieldNames(c)
            if ("ts" == fieldName || "registration" == fieldName) {
              ts = obj.asInstanceOf[Long]
              val cal = Calendar.getInstance(TimeZone.getTimeZone("UTC"))
              cal.setTimeInMillis(ts)
              colValue = cal.getTime.toInstant.toString
            } else if ("userId" == fieldName) {
              userId = obj.toString
            } else if ("sessionId" == fieldName) {
              sessionId = obj.toString
            }
            fields += Map("name" -> fieldName, "value" -> colValue)
          }
        }

        batch += Map("id" -> s"$userId-$sessionId-$ts", "fields" -> fields)

        if (batch.size == fusionBatchSize) {
          fusion.postBatchToPipeline(pipelinePath, bufferAsJavaList(batch))
          batch.clear
        }
      })

      // post the final batch if any left over
      if (batch.nonEmpty) {
        fusion.postBatchToPipeline(pipelinePath, bufferAsJavaList(batch))
        batch.clear
      }
    })

    sparkSession.stop()

    0
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy