ch.cern.sparkmeasure.Utils.scala
sparkMeasure is a tool for performance troubleshooting of Apache Spark workloads.
package ch.cern.sparkmeasure
import org.apache.spark.SparkConf
import org.apache.spark.scheduler.TaskLocality
import org.slf4j.Logger
import scala.collection.mutable.LinkedHashMap
/**
 * Configuration for sending metrics to a Prometheus Pushgateway.
 *
 * @param serverIPnPort String with the Prometheus Pushgateway hostIP:Port
 * @param jobName the name of the Spark job
 * @param connectionTimeoutMs connection timeout for the HTTP client, default is 5000 ms
 * @param readTimeoutMs read timeout for the HTTP client, default is 5000 ms
 */
case class PushgatewayConfig(serverIPnPort: String, jobName: String, connectionTimeoutMs: Int = 5000, readTimeoutMs: Int = 5000)
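// Illustrative example (not part of the original source): constructing a PushgatewayConfig,
// assuming a Pushgateway listening on localhost:9091; the job name is a placeholder.
//   val pgConfig = PushgatewayConfig(
//     serverIPnPort = "localhost:9091",
//     jobName = "sparkMeasureDemo",
//     connectionTimeoutMs = 2000,
//     readTimeoutMs = 2000)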
/**
 * The object Utils contains helper code for the sparkMeasure package.
 * The methods formatDuration and formatBytes are used when printing metrics reports.
*/
object Utils {
/** boilerplate code for pretty printing, formatDuration code borrowed from Spark UIUtils */
def formatDuration(milliseconds: Long): String = {
if (milliseconds < 100) {
return "%d ms".format(milliseconds)
}
val seconds = milliseconds.toDouble / 1000
if (seconds < 1) {
return "%.1f s".format(seconds)
}
if (seconds < 60) {
return "%.0f s".format(seconds)
}
val minutes = seconds / 60
if (minutes < 10) {
return "%.1f min".format(minutes)
} else if (minutes < 60) {
return "%.0f min".format(minutes)
}
val hours = minutes / 60
"%.1f h".format(hours)
}
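// Illustrative examples (not part of the original source) of formatDuration output:
//   formatDuration(42L)       // "42 ms"
//   formatDuration(500L)      // "0.5 s"
//   formatDuration(42000L)    // "42 s"
//   formatDuration(300000L)   // "5.0 min"
//   formatDuration(7200000L)  // "2.0 h"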
def formatBytes(bytes: Long): String = {
val trillion = 1024L * 1024L * 1024L * 1024L
val billion = 1024L * 1024L * 1024L
val million = 1024L * 1024L
val thousand = 1024L
val bytesDouble = bytes.toDouble
val (value, unit): (Double, String) = {
if (bytesDouble >= 2 * trillion) {
(bytesDouble / trillion, " TB")
} else if (bytes >= 2 * billion) {
(bytesDouble / billion, " GB")
} else if (bytes >= 2 * million) {
(bytesDouble / million, " MB")
} else if (bytes >= 2 * thousand) {
(bytesDouble / thousand, " KB")
} else {
(bytesDouble, " Bytes")
}
}
if (unit == " Bytes") {
"%d%s".format(value.toInt, unit)
} else {
"%.1f%s".format(value, unit)
}
}
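// Illustrative examples (not part of the original source) of formatBytes output;
// note that the thresholds and divisors are 1024-based:
//   formatBytes(512L)         // "512 Bytes"
//   formatBytes(10240L)       // "10.0 KB"
//   formatBytes(5242880L)     // "5.0 MB"
//   formatBytes(3221225472L)  // "3.0 GB"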
// Return the zeroed data structure used to compute metrics reports for StageMetrics
def zeroMetricsStage(): LinkedHashMap[String, Long] = {
val zeroedMetrics = LinkedHashMap(
"numStages" -> 0L,
"numTasks" -> 0L,
"elapsedTime" -> 0L,
"stageDuration" -> 0L,
"executorRunTime" -> 0L,
"executorCpuTime" -> 0L,
"executorDeserializeTime" -> 0L,
"executorDeserializeCpuTime" -> 0L,
"resultSerializationTime" -> 0L,
"jvmGCTime" -> 0L,
"shuffleFetchWaitTime" -> 0L,
"shuffleWriteTime" -> 0L,
"resultSize" -> 0L,
"diskBytesSpilled" -> 0L,
"memoryBytesSpilled" -> 0L,
"peakExecutionMemory" -> 0L,
"recordsRead" -> 0L,
"bytesRead" -> 0L,
"recordsWritten" -> 0L,
"bytesWritten" -> 0L,
"shuffleRecordsRead" -> 0L,
"shuffleTotalBlocksFetched" -> 0L,
"shuffleLocalBlocksFetched" -> 0L,
"shuffleRemoteBlocksFetched" -> 0L,
"shuffleTotalBytesRead" -> 0L,
"shuffleLocalBytesRead" -> 0L,
"shuffleRemoteBytesRead" -> 0L,
"shuffleRemoteBytesReadToDisk" -> 0L,
"shuffleBytesWritten" -> 0L,
"shuffleRecordsWritten" -> 0L
)
zeroedMetrics
}
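// Illustrative sketch (not part of the original source) of how the zeroed map can serve
// as an accumulator for stage metrics; the metric values below are hypothetical:
//   val stageMetricsData = zeroMetricsStage()
//   stageMetricsData("numStages") += 1L
//   stageMetricsData("executorRunTime") += 1234L
//   println(prettyPrintValues("executorRunTime", stageMetricsData("executorRunTime")))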
// Return the zeroed data structure used to compute metrics reports for TaskMetrics
def zeroMetricsTask(): LinkedHashMap[String, Long] = {
val zeroedMetrics = LinkedHashMap(
"numTasks" -> 0L,
"successful tasks" -> 0L,
"speculative tasks" -> 0L,
"taskDuration" -> 0L,
"schedulerDelayTime" -> 0L,
"executorRunTime" -> 0L,
"executorCpuTime" -> 0L,
"executorDeserializeTime" -> 0L,
"executorDeserializeCpuTime" -> 0L,
"resultSerializationTime" -> 0L,
"jvmGCTime" -> 0L,
"shuffleFetchWaitTime" -> 0L,
"shuffleWriteTime" -> 0L,
"gettingResultTime" -> 0L,
"resultSize" -> 0L,
"diskBytesSpilled" -> 0L,
"memoryBytesSpilled" -> 0L,
"peakExecutionMemory" -> 0L,
"recordsRead" -> 0L,
"bytesRead" -> 0L,
"recordsWritten" -> 0L,
"bytesWritten" -> 0L,
"shuffleRecordsRead" -> 0L,
"shuffleTotalBlocksFetched" -> 0L,
"shuffleLocalBlocksFetched" -> 0L,
"shuffleRemoteBlocksFetched" -> 0L,
"shuffleTotalBytesRead" -> 0L,
"shuffleLocalBytesRead" -> 0L,
"shuffleRemoteBytesRead" -> 0L,
"shuffleRemoteBytesReadToDisk" -> 0L,
"shuffleBytesWritten" -> 0L,
"shuffleRecordsWritten" -> 0L
)
zeroedMetrics
}
def prettyPrintValues(metric: String, value: Long): String = {
val name = metric.toLowerCase()
val basicValue = value.toString
val optionalValueWithUnits = {
if (name.contains("time") || name.contains("duration")) {
" (" + formatDuration(value) + ")"
}
else if (name.contains("bytes") || name.contains("size")) {
" (" + formatBytes(value) + ")"
}
else ""
}
metric + " => " + basicValue + optionalValueWithUnits
}
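// Illustrative examples (not part of the original source) of prettyPrintValues output:
//   prettyPrintValues("numTasks", 100L)           // "numTasks => 100"
//   prettyPrintValues("executorRunTime", 42000L)  // "executorRunTime => 42000 (42 s)"
//   prettyPrintValues("bytesRead", 5242880L)      // "bytesRead => 5242880 (5.0 MB)"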
def encodeTaskLocality(taskLocality: TaskLocality.TaskLocality): Int = {
taskLocality match {
case TaskLocality.PROCESS_LOCAL => 0
case TaskLocality.NODE_LOCAL => 1
case TaskLocality.RACK_LOCAL => 2
case TaskLocality.NO_PREF => 3
case TaskLocality.ANY => 4
case _ => -1 // Flag an unknown situation
}
}
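// Illustrative examples (not part of the original source): the scheduler's task locality
// values are encoded as small integers, suitable for storing in numeric metrics output:
//   encodeTaskLocality(TaskLocality.PROCESS_LOCAL)  // 0
//   encodeTaskLocality(TaskLocality.ANY)            // 4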
// handle metrics format parameter
def parseMetricsFormat(conf: SparkConf, logger: Logger, defaultFormat: String): String = {
val metricsFormat = conf.get("spark.sparkmeasure.outputFormat", defaultFormat)
metricsFormat match {
case "json" | "java" | "json_to_hadoop" =>
logger.info(s"Using $metricsFormat as serialization format.")
case _ => logger.warn(s"Invalid serialization format: $metricsFormat." +
" Configure with: spark.sparkmeasure.outputFormat=json|javaser|json_to_hadoop")
}
metricsFormat
}
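// Illustrative example (not part of the original source), assuming an org.slf4j Logger
// instance named logger is in scope:
//   val conf = new SparkConf().set("spark.sparkmeasure.outputFormat", "json_to_hadoop")
//   parseMetricsFormat(conf, logger, defaultFormat = "json")  // returns "json_to_hadoop"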
def parsePrintToStdout(conf: SparkConf, logger: Logger, defaultVal: Boolean): Boolean = {
val printToStdout = conf.getBoolean("spark.sparkmeasure.printToStdout", defaultVal)
if (printToStdout) {
logger.info("Will print metrics output to stdout in JSON format")
}
printToStdout
}
// handle metrics file name parameter except if writing to string / stdout
def parseMetricsFilename(conf: SparkConf, logger: Logger, defaultFileName: String): String = {
val metricsFileName = conf.get("spark.sparkmeasure.outputFilename", defaultFileName)
if (metricsFileName.isEmpty) {
logger.warn("No output file will be written. If you want to write the output to a file, " +
"configure with spark.sparkmeasure.outputFilename= © 2015 - 2025 Weber Informatics LLC | Privacy Policy