/*
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.nvidia.spark.rapids.tool

import java.io.FileNotFoundException
import java.time.LocalDateTime
import java.util.zip.ZipOutputStream

import scala.collection.mutable
import scala.collection.mutable.{ArrayBuffer, LinkedHashMap}
import scala.util.control.NonFatal

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileContext, FileStatus, FileSystem, Path, PathFilter}

import org.apache.spark.deploy.history.{EventLogFileReader, EventLogFileWriter}
import org.apache.spark.internal.Logging
import org.apache.spark.sql.rapids.tool.AppFilterImpl
import org.apache.spark.sql.rapids.tool.profiling.ApplicationInfo
import org.apache.spark.sql.rapids.tool.util.FSUtils
import org.apache.spark.sql.rapids.tool.util.StringUtils

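/**
 * Base type for event logs discovered by the path processor. The eventLog path points either
 * to a single Spark event log file or to a directory of rolled event log files.
 */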
sealed trait EventLogInfo {
def eventLog: Path
}
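
/**
 * Filesystem metadata for an event log: the modification timestamp (epoch milliseconds) and
 * the size in bytes, as reported by the FileStatus of the path.
 */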
case class EventLogFileSystemInfo(timestamp: Long, size: Long)
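/** An Apache Spark event log: either a single file or an "eventlog_v2_" rolling directory. */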
case class ApacheSparkEventLog(override val eventLog: Path) extends EventLogInfo
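/** A Databricks event log directory containing rolled files whose names start with "eventlog". */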
case class DatabricksEventLog(override val eventLog: Path) extends EventLogInfo
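/** An input path that could not be processed as an event log, together with the reason. */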
case class FailedEventLog(override val eventLog: Path,
private val reason: String) extends EventLogInfo {
def getReason: String = {
StringUtils.renderStr(reason, doEscapeMetaCharacters = true, maxLength = 0)
}
}

object EventLogPathProcessor extends Logging {
// Apache Spark rolling event log directory name prefix
private val EVENT_LOG_DIR_NAME_PREFIX = "eventlog_v2_"
// Databricks event log file name prefix
private val DB_EVENT_LOG_FILE_NAME_PREFIX = "eventlog"
private def isEventLogDir(status: FileStatus): Boolean = {
status.isDirectory && isEventLogDir(status.getPath.getName)
}
// This only checks the name of the path
private def isEventLogDir(path: String): Boolean = {
path.startsWith(EVENT_LOG_DIR_NAME_PREFIX)
}
private def isDBEventLogFile(fileName: String): Boolean = {
fileName.startsWith(DB_EVENT_LOG_FILE_NAME_PREFIX)
}
def isDBEventLogFile(status: FileStatus): Boolean = {
status.isFile && isDBEventLogFile(status.getPath.getName)
}
// scalastyle:off line.size.limit
// https://github.com/apache/spark/blob/0494dc90af48ce7da0625485a4dc6917a244d580/core/src/main/scala/org/apache/spark/io/CompressionCodec.scala#L67
// scalastyle:on line.size.limit
private val SPARK_SHORT_COMPRESSION_CODEC_NAMES = Set("lz4", "lzf", "snappy", "zstd")
// Apache Spark ones plus gzip
private val SPARK_SHORT_COMPRESSION_CODEC_NAMES_FOR_FILTER =
SPARK_SHORT_COMPRESSION_CODEC_NAMES ++ Set("gz")
// Files having these keywords are not considered as event logs
private val EXCLUDED_EVENTLOG_NAME_KEYWORDS = Set("stdout", "stderr", "log4j", ".log.")
/**
* Filter to identify valid event log files based on the criteria:
* - File should either not have any suffix or have a supported compression codec suffix
* - File should not contain any of the EXCLUDED_EVENTLOG_NAME_KEYWORDS keywords in its name
* @param logFile File to be filtered.
* @return True if the file is a valid event log, false otherwise.
*/
private def eventLogNameFilter(logFile: Path): Boolean = {
val hasValidSuffix = EventLogFileWriter.codecName(logFile)
.forall(suffix => SPARK_SHORT_COMPRESSION_CODEC_NAMES_FOR_FILTER.contains(suffix))
val hasExcludedKeyword = EXCLUDED_EVENTLOG_NAME_KEYWORDS.exists(logFile.getName.contains)
hasValidSuffix && !hasExcludedKeyword
}
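// Illustrative examples of eventLogNameFilter (assuming EventLogFileWriter.codecName derives
// the codec from the file name extension):
//   "application_123"       -> accepted (no suffix)
//   "application_123.zstd"  -> accepted (supported compression codec suffix)
//   "application_123.txt"   -> rejected (unsupported suffix)
//   "container_1_stderr"    -> rejected (excluded keyword "stderr")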
// Databricks keeps the latest events in a file named "eventlog" and rolls older events into
// files named like eventlog-2021-06-14--20-00.gz. We treat a directory as a Databricks event
// log directory when it contains more than one file whose name starts with "eventlog".
private def isDatabricksEventLogDir(dir: FileStatus, fs: FileSystem): Boolean = {
lazy val dbLogFiles = fs.listStatus(dir.getPath, new PathFilter {
override def accept(path: Path): Boolean = {
isDBEventLogFile(path.getName)
}
})
dir.isDirectory && dbLogFiles.size > 1
}
/**
* Identifies if the input file or directory is a valid event log.
*
* TODO - Need to handle size of files in a directory; for now, document that it's not supported.
* Reference: https://github.com/NVIDIA/spark-rapids-tools/pull/1275
*
* @param s FileStatus to be identified.
* @param fs FileSystem instance for file system operations.
* @return Option[EventLogInfo] if valid, None otherwise.
*/
private def identifyEventLog(s: FileStatus, fs: FileSystem): Option[EventLogInfo] = {
if (s.isFile && eventLogNameFilter(s.getPath)) {
// Regular event log file. See function `eventLogNameFilter` for criteria.
Some(ApacheSparkEventLog(s.getPath))
} else if (isEventLogDir(s)) {
// Apache Spark event log directory (starting with "eventlog_v2_").
Some(ApacheSparkEventLog(s.getPath))
} else if (isDatabricksEventLogDir(s, fs)) {
// Databricks event log directory (files starting with "eventlog").
Some(DatabricksEventLog(s.getPath))
} else {
// Ignore other types of files.
None
}
}
/**
* Retrieves all event logs from the input path.
*
* Note:
* - If the input path is a directory and recursive search is enabled, this function will
* search for event logs in all subdirectories.
* - This is done using a queue to avoid stack overflow.
*
* @param inputPath Root path to search for event logs.
* @param fs FileSystem instance for file system operations.
* @param recursiveSearchEnabled If enabled, search for event logs in all subdirectories.
* @return List of (EventLogInfo, EventLogFileSystemInfo) tuples.
*/
private def getEventLogInfoInternal(inputPath: Path, fs: FileSystem,
recursiveSearchEnabled: Boolean): List[(EventLogInfo, EventLogFileSystemInfo)] = {
val results = ArrayBuffer[(EventLogInfo, EventLogFileSystemInfo)]()
val queue = mutable.Queue[FileStatus]()
queue.enqueue(fs.getFileStatus(inputPath))
while (queue.nonEmpty) {
// Note: currentEntry can be a file, directory, or a symlink.
val currentEntry = queue.dequeue()
identifyEventLog(currentEntry, fs) match {
case Some(eventLogInfo) =>
// If currentEntry is an event log (either a file or a directory), add it to the results
// with its modification time and size.
results += eventLogInfo -> EventLogFileSystemInfo(
currentEntry.getModificationTime, currentEntry.getLen)
case None if currentEntry.isDirectory =>
// If currentEntry is a directory (but not an event log) enqueue all contained files for
// further processing. Include subdirectories only if recursive search is enabled.
fs.listStatus(currentEntry.getPath)
.filter(child => child.isFile || (child.isDirectory && recursiveSearchEnabled))
.foreach(queue.enqueue(_))
case _ =>
// Use a debug log (instead of warn) to avoid excessive logging due to the recursive
// nature of the search.
val supportedTypes = SPARK_SHORT_COMPRESSION_CODEC_NAMES_FOR_FILTER.mkString(", ")
logDebug(s"File: ${currentEntry.getPath} is not a supported file type. " +
s"Supported compression types are: $supportedTypes. Skipping this file.")
}
}
results.toList
}
/**
 * If the user provides a wildcard in the event log paths, the path processor needs to
 * expand the path as a glob pattern. Otherwise, HDFS throws a misleading exception that
 * no such file exists.
 * Once the glob is expanded, the resulting list of event logs is the flat-mapped result
 * of all the processed paths.
 *
 * @return List of (rawPath, List[processedPaths])
 */
private def processWildcardsLogs(eventLogsPaths: List[String],
hadoopConf: Configuration): List[(String, List[String])] = {
eventLogsPaths.map { rawPath =>
if (!rawPath.contains("*")) {
(rawPath, List(rawPath))
} else {
try {
val globPath = new Path(rawPath)
val fileContext = FileContext.getFileContext(globPath.toUri, hadoopConf)
val fileStatuses = fileContext.util().globStatus(globPath)
val paths = fileStatuses.map(_.getPath.toString).toList
(rawPath, paths)
} catch {
case _: Throwable =>
// Do not fail in this block. Instead, ignore the error and return an empty list of
// processed paths; the caller will then report the raw path as a failed event log.
// This keeps error handling consistent during the analysis.
logWarning(s"Processing event log path with wildcard failed: $rawPath")
(rawPath, List.empty)
}
}
}
}
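/**
 * Resolves a single input path into the event logs it contains. Returns a map from each
 * discovered EventLogInfo to its filesystem info (modification time and size), or a single
 * FailedEventLog mapped to None when the path cannot be read or holds no valid event logs.
 *
 * Illustrative usage (hypothetical path):
 * {{{
 *   val logs = getEventLogInfo("hdfs:///spark-logs/app-123", new Configuration())
 *   logs.keys.foreach(info => println(info.eventLog))
 * }}}
 */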
def getEventLogInfo(pathString: String, hadoopConf: Configuration,
recursiveSearchEnabled: Boolean = true): Map[EventLogInfo, Option[EventLogFileSystemInfo]] = {
val inputPath = new Path(pathString)
try {
// Note that some cloud storage APIs may throw FileNotFoundException when the pathPrefix
// (i.e., bucketName) is not visible to the current configuration instance.
val fs = FSUtils.getFSForPath(inputPath, hadoopConf)
val eventLogInfos = getEventLogInfoInternal(inputPath, fs, recursiveSearchEnabled)
if (eventLogInfos.isEmpty) {
val message = "No valid event logs found in the path. Check the path or set the " +
"log level to DEBUG for more details."
Map(FailedEventLog(inputPath, message) -> None)
} else {
eventLogInfos.toMap.mapValues(Some(_))
}
} catch {
case fnfEx: FileNotFoundException =>
logWarning(s"$pathString not found, skipping!")
Map(FailedEventLog(new Path(pathString), fnfEx.getMessage) -> None)
case NonFatal(e) =>
logWarning(s"Unexpected exception occurred reading $pathString, skipping!", e)
Map(FailedEventLog(new Path(pathString), e.getMessage) -> None)
}
}
/**
* Function to process event log paths passed in and evaluate which ones are really event
* logs and filter based on user options.
*
* @param filterNLogs Filter criteria specifying the number of event logs to select and the
* ordering, e.g. "10-newest-filesystem" or "10-oldest-filesystem"
* @param matchlogs Keyword to match file names in the directory
* @param eventLogsPaths Array of event log paths
* @param hadoopConf Hadoop Configuration
* @param recursiveSearchEnabled If enabled, search for event logs in all subdirectories.
* Enabled by default.
* @param maxEventLogSize Optional maximum event log size (a memory size string; a plain
* number is treated as MB)
* @param minEventLogSize Optional minimum event log size (a memory size string; a plain
* number is treated as MB)
* @param fsStartTime Filesystem date and time for filtering event logs based on
* start time
* @param fsEndTime Filesystem date and time for filtering event logs based on
* end time
* @return (Seq[EventLogInfo], Seq[EventLogInfo]) - Tuple indicating paths of event logs in
* filesystem. First element contains paths of event logs after applying filters and
* second element contains paths of all event logs.
*/
def processAllPaths(
filterNLogs: Option[String],
matchlogs: Option[String],
eventLogsPaths: List[String],
hadoopConf: Configuration,
recursiveSearchEnabled: Boolean = true,
maxEventLogSize: Option[String] = None,
minEventLogSize: Option[String] = None,
fsStartTime: Option[String] = None,
fsEndTime: Option[String] = None): (Seq[EventLogInfo], Seq[EventLogInfo]) = {
val logsPathNoWildCards = processWildcardsLogs(eventLogsPaths, hadoopConf)
val logsWithTimestamp = logsPathNoWildCards.flatMap {
case (rawPath, processedPaths) if processedPaths.isEmpty =>
// If no event logs are found for the path after wildcard expansion, return a failed event
// log entry for the raw path.
Map(FailedEventLog(new Path(rawPath), s"No event logs found in $rawPath") -> None)
case (_, processedPaths) =>
processedPaths.flatMap(getEventLogInfo(_, hadoopConf, recursiveSearchEnabled))
}.toMap
logDebug("Paths after stringToPath: " + logsWithTimestamp)
// Filter the event logs to be processed based on the given criteria. If no criteria were
// provided on the command line, return all the event logs processed above.
val matchedLogs = matchlogs.map { strMatch =>
logsWithTimestamp.filterKeys(_.eventLog.getName.contains(strMatch))
}.getOrElse(logsWithTimestamp)
val filteredLogs = if ((filterNLogs.nonEmpty && !filterByAppCriteria(filterNLogs)) ||
maxEventLogSize.isDefined || minEventLogSize.isDefined ||
fsStartTime.isDefined || fsEndTime.isDefined) {
val validMatchedLogs = matchedLogs.collect {
case (info, Some(ts)) => info -> ts
}
val filteredByMinSize = if (minEventLogSize.isDefined) {
val minSizeInBytes = if (StringUtils.isMemorySize(minEventLogSize.get)) {
// if it is a valid memory size string, convert it to bytes
StringUtils.convertMemorySizeToBytes(minEventLogSize.get)
} else {
// otherwise the size is assumed to be in MB
StringUtils.convertMemorySizeToBytes(minEventLogSize.get + "m")
}
val (matched, filtered) = validMatchedLogs.partition(info => info._2.size >= minSizeInBytes)
logInfo(s"Filtering eventlogs by size, minimum size is ${minSizeInBytes}b. The logs " +
s"filtered out include: ${filtered.keys.map(_.eventLog.toString).mkString(",")}")
matched
} else {
validMatchedLogs
}
val filteredByMaxSize = if (maxEventLogSize.isDefined) {
val maxSizeInBytes = if (StringUtils.isMemorySize(maxEventLogSize.get)) {
// if it is a valid memory size string, convert it to bytes
StringUtils.convertMemorySizeToBytes(maxEventLogSize.get)
} else {
// otherwise the size is assumed to be in MB
StringUtils.convertMemorySizeToBytes(maxEventLogSize.get + "m")
}
val (matched, filtered) =
filteredByMinSize.partition(info => info._2.size <= maxSizeInBytes)
logInfo(s"Filtering eventlogs by size, max size is ${maxSizeInBytes}b. The logs filtered " +
s"out include: ${filtered.keys.map(_.eventLog.toString).mkString(",")}")
matched
} else {
filteredByMinSize
}
// filter by the filesystem date-time range
val filteredByDateTime = if (fsStartTime.isDefined || fsEndTime.isDefined) {
if (fsStartTime.isDefined && fsEndTime.isDefined) {
val startTimeMs = AppFilterImpl.parseDateTimePeriod(fsStartTime.get).get
val endTimeMs = AppFilterImpl.parseDateTimePeriod(fsEndTime.get).get
val (matched, filtered) = filteredByMaxSize.partition { case (_, v) =>
(v.timestamp >= startTimeMs) && (v.timestamp <= endTimeMs)
}
logInfo(s"Filtered out event logs based on both fs start time: ${fsStartTime.get} " +
s"and fs end time: ${fsEndTime.get}. The logs filtered out include: " +
s"${filtered.keys.map(_.eventLog.toString).mkString(",")}")
matched
} else if (fsStartTime.isDefined) {
val startTimeMs = AppFilterImpl.parseDateTimePeriod(fsStartTime.get).get
val (matched, filtered) =
filteredByMaxSize.partition { case (_, v) => v.timestamp >= startTimeMs }
logInfo(s"Filtered out event logs based on fs start time: ${fsStartTime.get} " +
s"The logs filtered out include: " +
s"${filtered.keys.map(_.eventLog.toString).mkString(",")}")
matched
} else {
val endTimeMs = AppFilterImpl.parseDateTimePeriod(fsEndTime.get).get
val (matched, filtered) =
filteredByMaxSize.partition { case (_, v) => v.timestamp <= endTimeMs }
logInfo(s"Filtered out event logs based on fs end time: ${fsEndTime.get} " +
s"The logs filtered out include: " +
s"${filtered.keys.map(_.eventLog.toString).mkString(",")}")
matched
}
} else {
filteredByMaxSize
}
if (filterNLogs.nonEmpty && !filterByAppCriteria(filterNLogs)) {
val filteredInfo = filterNLogs.get.split("-")
val numberofEventLogs = filteredInfo(0).toInt
val criteria = filteredInfo(1)
// Failed event logs (i.e. logs without a timestamp) have already been removed above, so
// sort the remaining logs by timestamp according to the requested criteria.
val matched = if (criteria.equals("newest")) {
LinkedHashMap(filteredByDateTime.toSeq.sortWith(_._2.timestamp > _._2.timestamp): _*)
} else if (criteria.equals("oldest")) {
LinkedHashMap(filteredByDateTime.toSeq.sortWith(_._2.timestamp < _._2.timestamp): _*)
} else {
logError("Criteria should be either newest-filesystem or oldest-filesystem")
Map.empty[EventLogInfo, Long]
}
matched.take(numberofEventLogs)
} else {
filteredByDateTime
}
} else {
matchedLogs
}
(filteredLogs.keys.toSeq, logsWithTimestamp.keys.toSeq)
}
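
// Illustrative call to processAllPaths (hypothetical values): select the 10 newest event logs
// (by filesystem timestamp) among those whose names contain "app":
//   val (filtered, all) = processAllPaths(
//     filterNLogs = Some("10-newest-filesystem"),
//     matchlogs = Some("app"),
//     eventLogsPaths = List("hdfs:///spark-logs/*"),
//     hadoopConf = new Configuration())

/**
 * Returns true when the filter criteria is application based (ends with "-oldest", "-newest",
 * or "per-app-name") rather than filesystem based ("<n>-newest-filesystem" or
 * "<n>-oldest-filesystem").
 */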
def filterByAppCriteria(filterNLogs: Option[String]): Boolean = {
filterNLogs.get.endsWith("-oldest") || filterNLogs.get.endsWith("-newest") ||
filterNLogs.get.endsWith("per-app-name")
}
def logApplicationInfo(app: ApplicationInfo): Unit = {
logInfo(s"============== ${app.appId} (index=${app.index}) ==============")
}
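/**
 * Extracts the date and time encoded in a rolled Databricks event log file name. For example,
 * "eventlog-2021-06-14--20-00.gz" parses to 2021-06-14T20:00. The base "eventlog" file has no
 * date suffix and maps to LocalDateTime.now() so that it sorts (and is read) last.
 */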
def getDBEventLogFileDate(eventLogFileName: String): LocalDateTime = {
if (!isDBEventLogFile(eventLogFileName)) {
logError(s"$eventLogFileName Not an event log file!")
}
val fileParts = eventLogFileName.split("--")
if (fileParts.size < 2) {
// assume this is the current log and we want that one to be read last
LocalDateTime.now()
} else {
val date = fileParts(0).split("-")
val day = Integer.parseInt(date(3))
val month = Integer.parseInt(date(2))
val year = Integer.parseInt(date(1))
val time = fileParts(1).split("-")
val minParse = time(1).split('.')
val hour = Integer.parseInt(time(0))
val min = Integer.parseInt(minParse(0))
LocalDateTime.of(year, month, day, hour, min)
}
}
}

/**
 * A reader for a Databricks event log directory that contains multiple rolled event log files.
 */
class DatabricksRollingEventLogFilesFileReader(
fs: FileSystem,
path: Path) extends EventLogFileReader(fs, path) with Logging {
private lazy val files: Seq[FileStatus] = {
val ret = fs.listStatus(rootPath).toSeq
if (!ret.exists(EventLogPathProcessor.isDBEventLogFile)) {
Seq.empty[FileStatus]
} else {
ret
}
}
private lazy val eventLogFiles: Seq[FileStatus] = {
files.filter(EventLogPathProcessor.isDBEventLogFile).sortWith { (status1, status2) =>
val dateTime = EventLogPathProcessor.getDBEventLogFileDate(status1.getPath.getName)
val dateTime2 = EventLogPathProcessor.getDBEventLogFileDate(status2.getPath.getName)
dateTime.isBefore(dateTime2)
}
}
override def completed: Boolean = true
override def modificationTime: Long = lastEventLogFile.getModificationTime
private def lastEventLogFile: FileStatus = eventLogFiles.last
override def listEventLogFiles: Seq[FileStatus] = eventLogFiles
// unused functions
override def compressionCodec: Option[String] = None
override def totalSize: Long = 0
override def zipEventLogFiles(zipStream: ZipOutputStream): Unit = {}
override def fileSizeForLastIndexForDFS: Option[Long] = None
override def fileSizeForLastIndex: Long = lastEventLogFile.getLen
override def lastIndex: Option[Long] = None
}
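
// Illustrative usage of the reader (hypothetical filesystem and path):
//   val reader = new DatabricksRollingEventLogFilesFileReader(fs, new Path("/logs/db_eventlog_dir"))
//   reader.listEventLogFiles.foreach(f => println(f.getPath))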