package org.apache.spark.sql
import org.apache.spark.annotation.Py4JWhitelist
import org.apache.spark.sql.prophecy.{JobMetricsMetadata, ProphecyEventSendingListener}
import org.slf4j.{Logger, LoggerFactory}
import java.lang.System.currentTimeMillis
import java.util.UUID
import scala.collection.concurrent.TrieMap
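/**
 * Driver-side collector that wires Prophecy metrics reporting into a SparkSession: it tracks a
 * unique id per session, registers a ProphecyEventSendingListener on the SparkContext, and
 * forwards job/pipeline metadata to the configured execution service.
 */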
object MetricsCollector extends Serializable {
private lazy val LOGGER: Logger = LoggerFactory.getLogger(this.getClass)
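  // Bookkeeping for active sessions: SparkSession <-> generated session id <-> event-sending listener.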
private val sparkSessionToIdMap = new TrieMap[SparkSession, String]
private val sessionIdToListener = new TrieMap[String, ProphecyEventSendingListener]
private val sparkSessionIdToSparkSession = new TrieMap[String, SparkSession]()
  // We can't rely on hashCode for uniqueness, since hashCode only guarantees uniqueness when paired with equals.
  // MetricsCollector lives on the driver, so mapping each session to a random UUID is good enough.
private val groupIdKey = "spark.jobGroup.id"
private val sparkConfPipelineUriKey = "prophecy.metadata.pipeline.uri"
private val sparkConfFabricIdKey = "prophecy.metadata.fabric.id"
private val sparkConfServiceUrlKey = "prophecy.execution.service.url"
private val sparkConfRunTypeKey = "prophecy.metadata.is.interactive.run"
private val sparkConfJobUriKey = "prophecy.metadata.job.uri"
private val sparkConfUserIdKey = "prophecy.metadata.user.id"
def getUniqueSessionId(session: SparkSession): Option[String] =
sparkSessionToIdMap.get(session)
def getSession(sessionForInteractive: String): String =
if (sessionForInteractive.isEmpty) UUID.randomUUID().toString else sessionForInteractive
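  /**
   * Derives the task run id from the job group property ("spark.jobGroup.id"), which is assumed to be set.
   * The group id is expected to look like "<id>_<id>_job-<jobId>-run-<runId>"; the segment after "run"
   * is returned. Falls back to a random UUID when the pattern is not present.
   */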
def getTaskIdFromGroup(spark: SparkSession): String = {
val jobGroup = spark.sparkContext.getLocalProperty(groupIdKey)
    // E.g. job group: 337054160472983689_5254029628661214643_job-512883596425071-run-32306365
    val split = jobGroup.split('_')
    val jobAndRun = split.find(str => str.contains("job") && str.contains("run"))
    jobAndRun
      .map { str =>
        str.split('-')(3) // extract the task run id (the segment after "run")
      }
      .getOrElse(UUID.randomUUID().toString)
}
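  /** Installs the InterimStrategy planner strategy on the session and resets its InterimStore. */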
@Py4JWhitelist
def initializeMetrics(spark: SparkSession): Unit = {
spark.experimental.extraStrategies = InterimStrategy(spark) :: Nil
InterimStore.reset(spark)
}
private def addSparkListener(
spark: SparkSession,
executionUrl: String,
session: String
): Unit = {
sessionIdToListener(session) = new ProphecyEventSendingListener(executionUrl, session)
    /**
     * SparkContext is generally one per JVM, so this simply registers yet another listener on the global context object.
     */
spark.sparkContext.addSparkListener(sessionIdToListener(session))
}
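  /**
   * Registers metrics collection for the given session: resolves the pipeline URI (falling back to the
   * "prophecy.metadata.pipeline.uri" Spark conf for older Python clients), attaches a
   * ProphecyEventSendingListener when an execution service URL is configured, and, for one-time and
   * scheduled job runs, sends the initial JobMetricsMetadata event.
   *
   * A rough usage sketch (the pipeline id value below is purely illustrative):
   * {{{
   *   MetricsCollector.initializeMetrics(spark)
   *   MetricsCollector.start(spark, pipelineId = "pipelines/customer_orders")
   *   // ... run the pipeline ...
   *   MetricsCollector.end(spark)
   * }}}
   */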
@Py4JWhitelist
def start(spark: SparkSession, pipelineId: String, sessionForInteractive: String = ""): Unit = {
val session = getSession(sessionForInteractive)
val pipelineUri = if (ProphecyListener.isBlank(pipelineId)) {
      // For older Python libs, pipelineId arrives as an empty string; fall back to the
      // pipeline URI recorded in the Spark conf.
spark.conf.getOption(sparkConfPipelineUriKey).getOrElse(pipelineId)
} else {
pipelineId
}
LOGGER.info(s"""
| MetricsCollector.start method with spark ${session} pipelineId ${pipelineUri} and sessionForInteractive ${sessionForInteractive}
|""".stripMargin)
    sparkSessionIdToSparkSession.put(session, spark) // each session id maps to exactly one SparkSession
sparkSessionToIdMap.put(spark, session)
val executionUrlOption = getSparkExecutionUrl(spark, session)
    for (executionUrl <- executionUrlOption) {
      LOGGER.info(s"Execution ServiceURL: $executionUrl")
      val isDatabricksJob: Boolean = sessionForInteractive.isEmpty // for one-time job runs and scheduled runs
      synchronized {
        /**
         * TODO
         * Needs optimization for interactive execution: the current logic attempts to attach a listener on every run.
         */
        if (!sessionIdToListener.contains(session)) {
          LOGGER.info(s"Creating new EventSendingListener for session: $session, isDatabricksJob: $isDatabricksJob")
          addSparkListener(spark, executionUrl, session)
        } else if (!sessionIdToListener.get(session).exists(_.isActive)) {
          LOGGER.info(
            s"EventSendingListener is inactive for session: $session, isDatabricksJob: $isDatabricksJob. " +
              s"Creating new EventSendingListener"
          )
          addSparkListener(spark, executionUrl, session)
        }
      }
if (isDatabricksJob) {
val taskRunId = getTaskIdFromGroup(spark)
val jobMetricsMetadata: JobMetricsMetadata = JobMetricsMetadata(
jobURI = spark.conf.getOption(sparkConfJobUriKey),
fabricId = spark.conf.getOption(sparkConfFabricIdKey),
timeStarted = currentTimeMillis(),
isInteractive = spark.conf.getOption(sparkConfRunTypeKey).map(_.toBoolean),
taskRunId = taskRunId,
pipelineUri = Some(pipelineUri),
userId = spark.conf.getOption(sparkConfUserIdKey)
)
LOGGER.info(s"JobsMetricsEvent ${jobMetricsMetadata} spark ${session} with pipelineUri ${pipelineUri}")
sessionIdToListener.get(session).foreach(_.sendJobMetricsMetadata(jobMetricsMetadata))
}
}
}
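  /**
   * Sends a "Succeeded" pipeline-end event for the session associated with the given SparkSession,
   * removes its bookkeeping entries, and resets the InterimStore.
   */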
@Py4JWhitelist
def end(spark: SparkSession): Unit = {
    sparkSessionToIdMap
      .get(spark)
      .flatMap(sessionId => sessionIdToListener.get(sessionId))
      .foreach(_.sendJobPipelineEndEvent("Succeeded"))
    // Drop the session mappings so they can be re-created on the next run.
    sparkSessionToIdMap.remove(spark).foreach(sessionId => sparkSessionIdToSparkSession.remove(sessionId))
InterimStore.reset(spark)
}
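  /**
   * Session-less variant of end(): signals completion on a registered listener (if any) and drops
   * the first tracked session mapping, presumably for callers without a SparkSession reference.
   */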
def end(): Unit = {
if (sparkSessionToIdMap.nonEmpty) {
      // Signal completion on a registered listener, if any; guards against an empty listener map.
      sessionIdToListener.headOption.foreach(_._2.sendJobPipelineEndEvent("Succeeded"))
      sparkSessionToIdMap.remove(sparkSessionToIdMap.head._1) // remove the session mapping used for job metrics
}
}
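  /** Builds the per-session execution service URL from "prophecy.execution.service.url", if configured. */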
private def getSparkExecutionUrl(spark: SparkSession, session: String): Option[String] =
spark.conf.getOption(sparkConfServiceUrlKey).map(_ + s"/$session")
}