ch.cern.sparkmeasure.InfluxDBSink.scala (spark-measure_2.12)
sparkMeasure is a tool for performance troubleshooting of Apache Spark workloads.
package ch.cern.sparkmeasure
import org.apache.spark.scheduler.{SparkListener, SparkListenerApplicationEnd, SparkListenerApplicationStart, SparkListenerEvent, SparkListenerExecutorAdded, SparkListenerJobEnd, SparkListenerJobStart, SparkListenerStageCompleted, SparkListenerStageSubmitted, SparkListenerTaskEnd, SparkListenerTaskStart}
import org.apache.spark.sql.execution.ui.{SparkListenerSQLExecutionEnd, SparkListenerSQLExecutionStart}
import org.apache.spark.sql.SparkSession
import org.apache.spark.SparkConf
import org.influxdb.InfluxDBFactory
import org.influxdb.BatchOptions
import org.influxdb.dto.Point
import java.util.concurrent.TimeUnit
import org.slf4j.LoggerFactory
/**
 * InfluxDBSink: writes Spark metrics and application info in near real-time to InfluxDB v1.x.
 * Use this mode to monitor Spark execution workloads,
 * for example to feed Grafana dashboards and analytics of job execution.
 * How to use: attach the InfluxDBSink to a Spark Context using the extra-listener infrastructure.
 * Note: this is for InfluxDB v1.x.
 * Example:
 * --conf spark.extraListeners=ch.cern.sparkmeasure.InfluxDBSink
 *
 * Configuration for InfluxDBSink is handled with Spark conf parameters:
 *
 * spark.sparkmeasure.influxdbURL (default "http://localhost:8086")
 * spark.sparkmeasure.influxdbUsername (default "", can be left empty if InfluxDB runs without authentication)
 * spark.sparkmeasure.influxdbPassword (default "")
 * spark.sparkmeasure.influxdbName (default "sparkmeasure")
 * spark.sparkmeasure.influxdbStagemetrics (boolean, default false)
 * spark.sparkmeasure.influxdbEnableBatch (boolean, default true)
 * Note: batching improves write performance,
 * but it requires explicitly stopping the Spark Session for a clean exit: spark.stop()
 * Consider setting it to false if this is an issue.
 * A sketch of setting these parameters programmatically follows this comment.
 *
 * This code depends on "influxdb-java"; you may need to add the dependency:
 * --packages org.influxdb:influxdb-java:2.14
 * Note: currently version 2.14 is needed, as newer versions generate jar conflicts (tested up to Spark 3.3.0).
 *
 * InfluxDBSinkExtended: provides additional, more verbose info on task execution.
 * Use: --conf spark.extraListeners=ch.cern.sparkmeasure.InfluxDBSinkExtended
 *
 * InfluxDBSink: the amount of data generated is relatively small in most applications: O(number_of_stages).
 * InfluxDBSinkExtended can generate a large amount of data, O(number_of_tasks); use with care.
 */
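/*
 * A minimal sketch, assuming a local InfluxDB v1.x at the default URL and the default
 * database name, of setting the configuration parameters above programmatically when
 * building a SparkSession. The application name and parameter values are illustrative only:
 *
 *   val spark = org.apache.spark.sql.SparkSession.builder()
 *     .appName("myApp")
 *     .config("spark.extraListeners", "ch.cern.sparkmeasure.InfluxDBSink")
 *     .config("spark.sparkmeasure.influxdbURL", "http://localhost:8086")
 *     .config("spark.sparkmeasure.influxdbName", "sparkmeasure")
 *     .config("spark.sparkmeasure.influxdbStagemetrics", "true")
 *     .getOrCreate()
 */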
class InfluxDBSink(conf: SparkConf) extends SparkListener {
lazy val logger = LoggerFactory.getLogger(this.getClass.getName)
logger.warn("Custom monitoring listener with InfluxDB sink initializing. Now attempting to connect to InfluxDB")
// Initialize InfluxDB connection
val url = Utils.parseInfluxDBURL(conf, logger)
val (username, password) = Utils.parseInfluxDBCredentials(conf, logger)
// Tries to connect to InfluxDB, using the given URL and credentials
val influxDB = username match {
case username if username.isEmpty =>
// no username and password, InfluxDB must be running with auth-enabled=false
InfluxDBFactory.connect(url)
case _ => InfluxDBFactory.connect(url, username, password)
}
val dbName = Utils.parseInfluxDBName(conf, logger)
if (!influxDB.databaseExists(dbName)) {
influxDB.createDatabase(dbName)
}
val database = influxDB.setDatabase(dbName)
logger.info(s"using InfluxDB database $dbName")
val logStageMetrics = Utils.parseInfluxDBStagemetrics(conf, logger)
val enableBatch = conf.getBoolean("spark.sparkmeasure.influxdbEnableBatch", true)
if (enableBatch) {
// Flush every 1000 Points, at least every 1000ms
influxDB.enableBatch(BatchOptions.DEFAULTS.actions(1000).flushDuration(1000))
}
var appId = "noAppId"
var appName = "noAppName"
var dynamicAllocationEnabled = conf.get("spark.dynamicAllocation.enabled", "false")
appId = SparkSession.getActiveSession match {
case Some(sparkSession) => sparkSession.sparkContext.applicationId
case _ => "noAppId"
}
override def onExecutorAdded(executorAdded: SparkListenerExecutorAdded): Unit = {
val executorId = executorAdded.executorId
val executorInfo = executorAdded.executorInfo
val startTime = executorAdded.time
val point = Point.measurement("executors_started")
.tag("applicationId", appId)
.tag("spark.app.name", appName)
.tag("spark.dynamicAllocation.enabled", dynamicAllocationEnabled)
.addField("executorId", executorId)
.addField("executorHost", executorInfo.executorHost)
.addField("totalCores", executorInfo.totalCores)
.time(startTime, TimeUnit.MILLISECONDS)
.build()
database.write(point)
}
override def onStageSubmitted(stageSubmitted: SparkListenerStageSubmitted): Unit = {
val submissionTime = stageSubmitted.stageInfo.submissionTime.getOrElse(0L)
val attemptNumber = stageSubmitted.stageInfo.attemptNumber()
val stageId = stageSubmitted.stageInfo.stageId
val point = Point.measurement("stages_started")
.tag("applicationId", appId)
.tag("spark.app.name", appName)
.tag("spark.dynamicAllocation.enabled", dynamicAllocationEnabled)
.addField("stageId", stageId)
.addField("attemptNUmber", attemptNumber)
.time(submissionTime, TimeUnit.MILLISECONDS)
.build()
database.write(point)
}
override def onStageCompleted(stageCompleted: SparkListenerStageCompleted): Unit = {
val stageId = stageCompleted.stageInfo.stageId
val submissionTime = stageCompleted.stageInfo.submissionTime.getOrElse(0L)
val completionTime = stageCompleted.stageInfo.completionTime.getOrElse(0L)
val attemptNumber = stageCompleted.stageInfo.attemptNumber()
val point1 = Point.measurement("stages_ended")
.tag("applicationId", appId)
.tag("spark.app.name", appName)
.tag("spark.dynamicAllocation.enabled", dynamicAllocationEnabled)
.time(completionTime, TimeUnit.MILLISECONDS)
.addField("stageId", stageId)
.addField("attemptNumber", attemptNumber)
.addField("submissionTime", submissionTime)
.build()
database.write(point1)
if (logStageMetrics) {
val taskmetrics = stageCompleted.stageInfo.taskMetrics
val point2 = Point.measurement("stage_metrics")
.tag("applicationId", appId)
.tag("spark.app.name", appName)
.tag("spark.dynamicAllocation.enabled", dynamicAllocationEnabled)
.time(completionTime, TimeUnit.MILLISECONDS)
.addField("stageId", stageId)
.addField("attemptNumber", attemptNumber)
.addField("failureReason", stageCompleted.stageInfo.failureReason.getOrElse(""))
.addField("submissionTime", submissionTime)
.addField("completionTime", completionTime)
.addField("executorRunTime", taskmetrics.executorRunTime)
.addField("executorCpuTime", taskmetrics.executorCpuTime)
.addField("executorDeserializeCpuTime", taskmetrics.executorDeserializeCpuTime)
.addField("executorDeserializeTime", taskmetrics.executorDeserializeTime)
.addField("jvmGCTime", taskmetrics.jvmGCTime)
.addField("memoryBytesSpilled", taskmetrics.memoryBytesSpilled)
.addField("peakExecutionMemory", taskmetrics.peakExecutionMemory)
.addField("resultSerializationTime", taskmetrics.resultSerializationTime)
.addField("resultSize", taskmetrics.resultSize)
.addField("bytesRead", taskmetrics.inputMetrics.bytesRead)
.addField("recordsRead", taskmetrics.inputMetrics.recordsRead)
.addField("bytesWritten", taskmetrics.outputMetrics.bytesWritten)
.addField("recordsWritten", taskmetrics.outputMetrics.recordsWritten)
.addField("shuffleTotalBytesRead", taskmetrics.shuffleReadMetrics.totalBytesRead)
.addField("shuffleRemoteBytesRead", taskmetrics.shuffleReadMetrics.remoteBytesRead)
.addField("shuffleRemoteBytesReadToDisk", taskmetrics.shuffleReadMetrics.remoteBytesReadToDisk)
.addField("shuffleLocalBytesRead", taskmetrics.shuffleReadMetrics.localBytesRead)
.addField("shuffleTotalBlocksFetched", taskmetrics.shuffleReadMetrics.totalBlocksFetched)
.addField("shuffleLocalBlocksFetched", taskmetrics.shuffleReadMetrics.localBlocksFetched)
.addField("shuffleRemoteBlocksFetched", taskmetrics.shuffleReadMetrics.remoteBlocksFetched)
.addField("shuffleRecordsRead", taskmetrics.shuffleReadMetrics.recordsRead)
.addField("shuffleFetchWaitTime", taskmetrics.shuffleReadMetrics.fetchWaitTime)
.addField("shuffleBytesWritten", taskmetrics.shuffleWriteMetrics.bytesWritten)
.addField("shuffleRecordsWritten", taskmetrics.shuffleWriteMetrics.recordsWritten)
.addField("shuffleWriteTime", taskmetrics.shuffleWriteMetrics.writeTime)
.build()
database.write(point2)
}
}
override def onOtherEvent(event: SparkListenerEvent): Unit = {
event match {
case e: SparkListenerSQLExecutionStart => {
val startTime = e.time
val queryId = e.executionId
val description = e.description
// val details = e.details
val point = Point.measurement("queries_started")
.tag("applicationId", appId)
.tag("spark.app.name", appName)
.tag("spark.dynamicAllocation.enabled", dynamicAllocationEnabled)
.time(startTime, TimeUnit.MILLISECONDS)
.addField("description", description)
.addField("queryId", queryId)
.build()
database.write(point)
}
case e: SparkListenerSQLExecutionEnd => {
val endTime = e.time
val queryId = e.executionId
val point = Point.measurement("queries_ended")
.tag("applicationId", appId)
.tag("spark.app.name", appName)
.tag("spark.dynamicAllocation.enabled", dynamicAllocationEnabled)
.time(endTime, TimeUnit.MILLISECONDS)
.addField("queryId", queryId)
.build()
database.write(point)
}
case _ => // ignore other events
}
}
override def onJobStart(jobStart: SparkListenerJobStart): Unit = {
val startTime = jobStart.time
val jobId = jobStart.jobId
val point = Point.measurement("jobs_started")
.tag("applicationId", appId)
.tag("spark.app.name", appName)
.tag("spark.dynamicAllocation.enabled", dynamicAllocationEnabled)
.time(startTime, TimeUnit.MILLISECONDS)
.addField("jobID", jobId)
.build()
database.write(point)
}
override def onJobEnd(jobEnd: SparkListenerJobEnd): Unit = {
val completionTime = jobEnd.time
val jobId = jobEnd.jobId
val point = Point.measurement("jobs_ended")
.tag("applicationId", appId)
.tag("spark.app.name", appName)
.tag("spark.dynamicAllocation.enabled", dynamicAllocationEnabled)
.time(completionTime, TimeUnit.MILLISECONDS)
.addField("jobID", jobId)
.build()
database.write(point)
}
override def onApplicationStart(applicationStart: SparkListenerApplicationStart): Unit = {
appId = applicationStart.appId.getOrElse("noAppId")
appName = applicationStart.appName
val point = Point.measurement("applications_started")
.tag("applicationId", appId)
.tag("spark.app.name", appName)
.tag("spark.dynamicAllocation.enabled", dynamicAllocationEnabled)
.time(applicationStart.time, TimeUnit.MILLISECONDS)
.field("startTime", conf.getLong("spark.app.startTime", 0))
.field("submitTime", conf.getLong("spark.app.submitTime", 0))
.field("totalCoresRequested", conf.getLong("spark.cores.max", 0))
.field("sparkDriverHost", conf.get("spark.driver.host", ""))
.field("sparkDriverPort", conf.getInt("spark.driver.port", 0))
.field("deployMode", conf.get("spark.submit.deployMode", ""))
.build()
database.write(point)
}
override def onApplicationEnd(applicationEnd: SparkListenerApplicationEnd): Unit = {
logger.info(s"Spark application ended, timestamp = ${applicationEnd.time}, closing InfluxDB connection.")
val point = Point.measurement("applications_ended")
.tag("applicationId", appId)
.tag("spark.app.name", appName)
.tag("spark.dynamicAllocation.enabled", dynamicAllocationEnabled)
.time(applicationEnd.time, TimeUnit.MILLISECONDS)
.field("duration", (applicationEnd.time - conf.getLong("spark.app.startTime", 0))/1000)
.build()
database.write(point)
influxDB.close()
}
}
/**
 * InfluxDBSinkExtended extends the basic InfluxDBSink functionality with a verbose dump of
 * task metrics and task info into InfluxDB.
 * Note: this can generate a large amount of data, O(number_of_tasks).
 * Configuration parameters and usage: see InfluxDBSink.
 */
class InfluxDBSinkExtended(conf: SparkConf) extends InfluxDBSink(conf) {
override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = {
val taskInfo = taskStart.taskInfo
val point = Point.measurement("tasks_started")
.tag("applicationId", appId)
.tag("spark.app.name", appName)
.tag("spark.dynamicAllocation.enabled", dynamicAllocationEnabled)
.time(taskInfo.launchTime, TimeUnit.MILLISECONDS)
.addField("taskId", taskInfo.taskId)
.addField("attemptNumber", taskInfo.attemptNumber)
.addField("stageId", taskStart.stageId)
.build()
database.write(point)
}
override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = {
val taskInfo = taskEnd.taskInfo
val taskmetrics = taskEnd.taskMetrics
val point1 = Point.measurement("tasks_ended")
.tag("applicationId", appId)
.tag("spark.app.name", appName)
.tag("spark.dynamicAllocation.enabled", dynamicAllocationEnabled)
.time(taskInfo.finishTime, TimeUnit.MILLISECONDS)
.addField("taskId", taskInfo.taskId)
.addField("attemptNumber", taskInfo.attemptNumber)
.addField("launchTime", taskInfo.launchTime)
.addField("stageId", taskEnd.stageId)
.build()
database.write(point1)
val point2 = Point.measurement("task_metrics")
.tag("applicationId", appId)
.tag("spark.app.name", appName)
.tag("spark.dynamicAllocation.enabled", dynamicAllocationEnabled)
.time(taskInfo.finishTime, TimeUnit.MILLISECONDS)
// task info
.addField("taskId", taskInfo.taskId)
.addField("attemptNumber", taskInfo.attemptNumber)
.addField("stageId", taskEnd.stageId)
.addField("launchTime", taskInfo.launchTime)
.addField("completionTime", taskInfo.finishTime)
.addField("failed", taskInfo.failed)
.addField("speculative", taskInfo.speculative)
.addField("killed", taskInfo.killed)
.addField("finished", taskInfo.finished)
.addField("executorId", taskInfo.executorId)
.addField("duration", taskInfo.duration)
.addField("successful", taskInfo.successful)
.addField("host", taskInfo.host)
.addField("taskLocality", Utils.encodeTaskLocality(taskInfo.taskLocality))
// task metrics
.addField("executorRunTime", taskmetrics.executorRunTime)
.addField("executorCpuTime", taskmetrics.executorCpuTime)
.addField("executorDeserializeCpuTime", taskmetrics.executorDeserializeCpuTime)
.addField("executorDeserializeTime", taskmetrics.executorDeserializeTime)
.addField("jvmGCTime", taskmetrics.jvmGCTime)
.addField("memoryBytesSpilled", taskmetrics.memoryBytesSpilled)
.addField("peakExecutionMemory", taskmetrics.peakExecutionMemory)
.addField("resultSerializationTime", taskmetrics.resultSerializationTime)
.addField("resultSize", taskmetrics.resultSize)
.addField("bytesRead", taskmetrics.inputMetrics.bytesRead)
.addField("recordsRead", taskmetrics.inputMetrics.recordsRead)
.addField("bytesWritten", taskmetrics.outputMetrics.bytesWritten)
.addField("recordsWritten", taskmetrics.outputMetrics.recordsWritten)
.addField("shuffleTotalBytesRead", taskmetrics.shuffleReadMetrics.totalBytesRead)
.addField("shuffleRemoteBytesRead", taskmetrics.shuffleReadMetrics.remoteBytesRead)
.addField("shuffleLocalBytesRead", taskmetrics.shuffleReadMetrics.localBytesRead)
.addField("shuffleTotalBlocksFetched", taskmetrics.shuffleReadMetrics.totalBlocksFetched)
.addField("shuffleLocalBlocksFetched", taskmetrics.shuffleReadMetrics.localBlocksFetched)
.addField("shuffleRemoteBlocksFetched", taskmetrics.shuffleReadMetrics.remoteBlocksFetched)
.addField("shuffleRecordsRead", taskmetrics.shuffleReadMetrics.recordsRead)
// this requires spark2.3 and above .addField("remoteBytesReadToDisk", taskmetrics.shuffleReadMetrics.remoteBytesReadToDisk)
.addField("shuffleFetchWaitTime", taskmetrics.shuffleReadMetrics.fetchWaitTime)
.addField("shuffleBytesWritten", taskmetrics.shuffleWriteMetrics.bytesWritten)
.addField("shuffleRecordsWritten", taskmetrics.shuffleWriteMetrics.recordsWritten)
.addField("shuffleWriteTime", taskmetrics.shuffleWriteMetrics.writeTime)
.build()
database.write(point2)
}
}
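For reference, a minimal sketch of how the points written by the sink can be read back with the same influxdb-java client, for example to verify the setup from a test or a notebook. It assumes an InfluxDB v1.x instance at the default URL, no authentication, and the default database name "sparkmeasure"; the object name and the query are illustrative and not part of sparkMeasure:
object InfluxDBSinkReadExample {
  import org.influxdb.InfluxDBFactory
  import org.influxdb.dto.Query

  def main(args: Array[String]): Unit = {
    // Connect without credentials, as when InfluxDB runs with auth-enabled=false
    val influxDB = InfluxDBFactory.connect("http://localhost:8086")
    // Read back a few of the stage-level points written by InfluxDBSink (measurement "stages_ended")
    val result = influxDB.query(new Query("SELECT * FROM stages_ended LIMIT 10", "sparkmeasure"))
    result.getResults.forEach(r => println(r))
    influxDB.close()
  }
}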