// org.apache.spark.sql.MetricsCollector.scala
package org.apache.spark.sql

import org.apache.spark.annotation.Py4JWhitelist
import org.apache.spark.sql.prophecy.{JobMetricsMetadata, ProphecyEventSendingListener}
import org.slf4j.{Logger, LoggerFactory}

import java.lang.System.currentTimeMillis
import java.util.UUID
import scala.collection.concurrent.TrieMap

object MetricsCollector extends Serializable {
  private lazy val LOGGER: Logger = LoggerFactory.getLogger(this.getClass)
  private val sparkSessionToIdMap          = new TrieMap[SparkSession, String]
  private val sessionIdToListener          = new TrieMap[String, ProphecyEventSendingListener]
  private val sparkSessionIdToSparkSession = new TrieMap[String, SparkSession]()

  // We can't use hashCode here because it doesn't guarantee uniqueness unless coupled with equals.
  // Since MetricsCollector lives on the driver, mapping each session to a UUID is generally sufficient.
  private val groupIdKey              = "spark.jobGroup.id"
  private val sparkConfPipelineUriKey = "prophecy.metadata.pipeline.uri"
  private val sparkConfFabricIdKey    = "prophecy.metadata.fabric.id"
  private val sparkConfServiceUrlKey  = "prophecy.execution.service.url"
  private val sparkConfRunTypeKey     = "prophecy.metadata.is.interactive.run"
  private val sparkConfJobUriKey      = "prophecy.metadata.job.uri"
  private val sparkConfUserIdKey      = "prophecy.metadata.user.id"
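  // These keys are read from the Spark conf at runtime. The example values below are illustrative
  // assumptions only, e.g. supplied via spark-submit:
  //   --conf prophecy.metadata.pipeline.uri=pipelines/customer_360
  //   --conf prophecy.metadata.fabric.id=1234
  //   --conf prophecy.execution.service.url=https://prophecy.example.com/execution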

  def getUniqueSessionId(session: SparkSession): Option[String] =
    sparkSessionToIdMap.get(session)

  def getSession(sessionForInteractive: String): String =
    if (sessionForInteractive.isEmpty) UUID.randomUUID().toString else sessionForInteractive

  def getTaskIdFromGroup(spark: SparkSession): String = {
    // getLocalProperty can return null when no job group is set, so guard with Option.
    val jobGroup = Option(spark.sparkContext.getLocalProperty(groupIdKey)).getOrElse("")
    // E.g. job group: 337054160472983689_5254029628661214643_job-512883596425071-run-32306365
    val split     = jobGroup.split('_')
    val jobAndRun = split.find(str ⇒ str.contains("job") && str.contains("run"))
    jobAndRun
      .map { str ⇒
        str.split('-')(3) // extract the task run id
      }
      .getOrElse(UUID.randomUUID().toString)
  }
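  // Worked example (using the sample group id from the comment above):
  //   "337054160472983689_5254029628661214643_job-512883596425071-run-32306365"
  //   split('_') isolates "job-512883596425071-run-32306365" (the segment containing both "job" and "run");
  //   split('-')(3) on that segment yields "32306365", which is returned as the task run id.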

  @Py4JWhitelist
  def initializeMetrics(spark: SparkSession): Unit = {
    spark.experimental.extraStrategies = InterimStrategy(spark) :: Nil
    InterimStore.reset(spark)
  }

  private def addSparkListener(
    spark:        SparkSession,
    executionUrl: String,
    session:      String
  ): Unit = {
    sessionIdToListener(session) = new ProphecyEventSendingListener(executionUrl, session)

    /**
      * SparkContext is generally one per JVM, so this call simply adds yet another listener to that global context object.
      */
    spark.sparkContext.addSparkListener(sessionIdToListener(session))
  }

  @Py4JWhitelist
  def start(spark: SparkSession, pipelineId: String, sessionForInteractive: String = ""): Unit = {
    val session = getSession(sessionForInteractive)
    val pipelineUri = if (ProphecyListener.isBlank(pipelineId)) {
      // For older Python libs, pipelineId will be an empty string;
      // in that case fall back to the value from the Spark conf.
      spark.conf.getOption(sparkConfPipelineUriKey).getOrElse(pipelineId)
    } else {
      pipelineId
    }

    LOGGER.info(s"""
                   | MetricsCollector.start called with session ${session}, pipelineId ${pipelineUri} and sessionForInteractive ${sessionForInteractive}
                   |""".stripMargin)

    sparkSessionIdToSparkSession.put(session, spark) // a session id maps to exactly one SparkSession
    sparkSessionToIdMap.put(spark, session)
    val executionUrlOption = getSparkExecutionUrl(spark, session)
    for (executionUrl ← executionUrlOption) {
      LOGGER.info(s"Execution ServiceURL: $executionUrl")
      val isDatabricksJob: Boolean = sessionForInteractive.isEmpty // for one-time job runs and scheduled runs
      synchronized {

        /**
          * TODO
          * Needs some optimization for interactive execution: with the current logic we try to attach a listener on every run.
          */
        if (!sessionIdToListener.contains(session)) {
          LOGGER.info(s"Creating new EventSendingListener for session: $session, isDatabricksJob: $isDatabricksJob")
          addSparkListener(spark, executionUrl, session)
        } else if (!sessionIdToListener.get(session).exists(_.isActive)) {
          LOGGER.info(
            s"EventSendingListener is inactive for session: $session, isDatabricksJob: $isDatabricksJob. " +
              s"Creating new EventSendingListener"
          )
          addSparkListener(spark, executionUrl, session)
        }
      }

      if (isDatabricksJob) {
        val taskRunId = getTaskIdFromGroup(spark)
        val jobMetricsMetadata: JobMetricsMetadata = JobMetricsMetadata(
          jobURI = spark.conf.getOption(sparkConfJobUriKey),
          fabricId = spark.conf.getOption(sparkConfFabricIdKey),
          timeStarted = currentTimeMillis(),
          isInteractive = spark.conf.getOption(sparkConfRunTypeKey).map(_.toBoolean),
          taskRunId = taskRunId,
          pipelineUri = Some(pipelineUri),
          userId = spark.conf.getOption(sparkConfUserIdKey)
        )
        LOGGER.info(s"JobsMetricsEvent ${jobMetricsMetadata} for session ${session} with pipelineUri ${pipelineUri}")
        sessionIdToListener.get(session).foreach(_.sendJobMetricsMetadata(jobMetricsMetadata))
      }
    }
  }

  @Py4JWhitelist
  def end(spark: SparkSession): Unit = {
    sparkSessionToIdMap
      .get(spark)
      .flatMap(sessionId ⇒ sessionIdToListener.get(sessionId))
      .foreach(_.sendJobPipelineEndEvent("Succeeded"))
    // remove the mappings; any stale state will simply get overwritten on its own.
    sparkSessionToIdMap.remove(spark).foreach(sessionId ⇒ sparkSessionIdToSparkSession.remove(sessionId))
    InterimStore.reset(spark)
  }

  def end(): Unit = {
    if (sparkSessionToIdMap.nonEmpty) {
      sessionIdToListener.headOption.foreach(_._2.sendJobPipelineEndEvent("Succeeded"))
      sparkSessionToIdMap.remove(sparkSessionToIdMap.head._1) // drop the session mapping used for job metrics
    }
  }

  private def getSparkExecutionUrl(spark: SparkSession, session: String): Option[String] =
    spark.conf.getOption(sparkConfServiceUrlKey).map(_ + s"/$session")
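    // i.e. the configured base URL with "/<session>" appended; illustrative (assumed) example:
    //   "https://prophecy.example.com/execution" + "/" + "<session-uuid>"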
}
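
// Illustrative usage sketch (not part of the library source): how generated pipeline code might
// drive this collector. The SparkSession setup and the pipeline URI below are assumptions.
//
//   val spark = SparkSession.builder().getOrCreate()
//   MetricsCollector.initializeMetrics(spark)                // install InterimStrategy and reset InterimStore
//   MetricsCollector.start(spark, "pipelines/customer_360")  // register the listener and emit job metadata
//   // ... run the pipeline's transformations ...
//   MetricsCollector.end(spark)                              // send the "Succeeded" end event and clean up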