// org.apache.spark.sql.v2.YtInMemoryFileIndex.scala
package org.apache.spark.sql.v2
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs._
import org.apache.hadoop.hdfs.DistributedFileSystem
import org.apache.hadoop.mapred.{FileInputFormat, JobConf}
import org.apache.spark.SparkContext
import org.apache.spark.internal.Logging
import org.apache.spark.metrics.source.HiveCatalogMetrics
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
import org.apache.spark.sql.execution.datasources.{FileStatusCache, NoopCache, PartitionSpec, PartitioningAwareFileIndex}
import org.apache.spark.sql.execution.streaming.FileStreamSink
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.v2.YtInMemoryFileIndex.bulkListLeafFiles
import org.apache.spark.util.SerializableConfiguration
import java.io.FileNotFoundException
import scala.annotation.tailrec
import scala.collection.mutable
/**
* Copy-paste from [[org.apache.spark.sql.execution.datasources.InMemoryFileIndex]]
* with changes to look up nodes with `@` attributes (transactions, timestamps, etc.).
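*
* For example (illustrative; the attribute name is hypothetical), a root path such as
* {{{
*   /home/example_table/@transaction_123
* }}}
* is looked up under its normalized form `/home/example_table`, while the trailing
* `@`-prefixed component is kept as an attribute that the parent directory of each returned
* file must also carry (see [[allFiles]], `normalizedPath` and `containsPathAttrs`).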
*/
class YtInMemoryFileIndex(
sparkSession: SparkSession,
rootPathsSpecified: Seq[Path],
parameters: Map[String, String],
userSpecifiedSchema: Option[StructType],
fileStatusCache: FileStatusCache = NoopCache,
userSpecifiedPartitionSpec: Option[PartitionSpec] = None,
override val metadataOpsTimeNs: Option[Long] = None)
extends PartitioningAwareFileIndex(
sparkSession, parameters, userSpecifiedSchema, fileStatusCache) {
// Filter out streaming metadata dirs or files such as "/.../_spark_metadata" (the metadata dir)
// or "/.../_spark_metadata/0" (a file in the metadata dir). `rootPathsSpecified` might contain
// such streaming metadata dir or files, e.g. when after globbing "basePath/*" where "basePath"
// is the output of a streaming query.
override val rootPaths =
rootPathsSpecified.filterNot(FileStreamSink.ancestorIsMetadataDirectory(_, hadoopConf))
@volatile private var cachedLeafFiles: mutable.LinkedHashMap[Path, FileStatus] = _
@volatile private var cachedLeafDirToChildrenFiles: Map[Path, Array[FileStatus]] = _
@volatile private var cachedPartitionSpec: PartitionSpec = _
refresh0()
override def partitionSpec(): PartitionSpec = {
if (cachedPartitionSpec == null) {
if (userSpecifiedPartitionSpec.isDefined) {
cachedPartitionSpec = userSpecifiedPartitionSpec.get
} else {
cachedPartitionSpec = inferPartitioning()
}
}
logTrace(s"Partition spec: $cachedPartitionSpec")
cachedPartitionSpec
}
override protected def leafFiles: mutable.LinkedHashMap[Path, FileStatus] = {
cachedLeafFiles
}
override protected def leafDirToChildrenFiles: Map[Path, Array[FileStatus]] = {
cachedLeafDirToChildrenFiles
}
override def refresh(): Unit = {
fileStatusCache.invalidateAll()
refresh0()
}
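/**
* Strips trailing path components whose name starts with `@` (YT attribute markers) so that a
* path with attributes maps back to the underlying node. A minimal sketch of the behaviour,
* using a hypothetical attribute name:
* {{{
*   normalizedPath(new Path("/home/table/@transaction_123")) == new Path("/home/table")
*   normalizedPath(new Path("/home/table"))                  == new Path("/home/table")
* }}}
*/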
@tailrec
private def normalizedPath(file: Path): Path =
if (file.getName.startsWith("@"))
normalizedPath(file.getParent)
else
file
private def refresh0(): Unit = {
val files = listLeafFiles(rootPaths)
cachedLeafFiles = new mutable.LinkedHashMap[Path, FileStatus]() ++= files.map(f => f.getPath -> f)
cachedLeafDirToChildrenFiles = files.toArray
.map(f => normalizedPath(f.getPath.getParent) -> f)
.groupBy(_._1)
.mapValues(_.map(_._2).toSet.toArray)
cachedPartitionSpec = null
}
override def equals(other: Any): Boolean = other match {
case hdfs: YtInMemoryFileIndex => rootPaths.toSet == hdfs.rootPaths.toSet
case _ => false
}
override def hashCode(): Int = rootPaths.toSet.hashCode()
/**
* List leaf files of given paths. This method will submit a Spark job to do parallel
* listing whenever there is a path having more files than the parallel partition
* discovery threshold.
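*
* The threshold and the listing-task cap are the standard Spark SQL settings, read in
* [[YtInMemoryFileIndex.bulkListLeafFiles]]; a configuration sketch:
* {{{
*   // switch to a distributed listing job once more than 64 paths or subdirectories
*   // have to be listed at one level
*   spark.conf.set("spark.sql.sources.parallelPartitionDiscovery.threshold", 64)
*   // cap the number of tasks used by that listing job
*   spark.conf.set("spark.sql.sources.parallelPartitionDiscovery.parallelism", 100)
* }}}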
*
* This is publicly visible for testing.
*/
def listLeafFiles(paths: Seq[Path]): mutable.LinkedHashSet[FileStatus] = {
val startTime = System.nanoTime()
val output = mutable.LinkedHashSet[FileStatus]()
val pathsToFetch = mutable.ArrayBuffer[Path]()
for (path <- paths) {
fileStatusCache.getLeafFiles(path) match {
case Some(files) =>
HiveCatalogMetrics.incrementFileCacheHits(files.length)
output ++= files
case None =>
pathsToFetch += path
}
() // for some reason scalac 2.12 needs this; the return type doesn't matter
}
val filter = FileInputFormat.getInputPathFilter(new JobConf(hadoopConf, this.getClass))
val discovered = bulkListLeafFiles(pathsToFetch, hadoopConf, filter, sparkSession, areRootPaths = true)
discovered.foreach { case (path, leafFiles) =>
HiveCatalogMetrics.incrementFilesDiscovered(leafFiles.size)
fileStatusCache.putLeafFiles(path, leafFiles.toArray)
output ++= leafFiles
}
logInfo(s"It took ${(System.nanoTime() - startTime) / (1000 * 1000)} ms to list leaf files" +
s" for ${paths.length} paths.")
output
}
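/**
* Collects the trailing `@`-prefixed components of a path (the YT attribute suffix), e.g.
* (hypothetical attribute names):
* {{{
*   getPathAttrsSuffix(new Path("/home/table/@transaction_1/@timestamp_2")) == Set("@transaction_1", "@timestamp_2")
*   getPathAttrsSuffix(new Path("/home/table"))                             == Set.empty
* }}}
*/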
def getPathAttrsSuffix(path: Path): Set[String] = {
@tailrec
def get(p: Path, acc: Set[String]): Set[String] =
if (p.getName.startsWith("@"))
get(p.getParent, acc + p.getName)
else
acc
get(path, Set.empty)
}
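/**
* Returns true when the attribute suffix of `path` contains every attribute in `requiredAttrs`,
* i.e. when `requiredAttrs` is a subset of [[getPathAttrsSuffix]] of `path`.
*/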
def containsPathAttrs(path: Path, requiredAttrs: Set[String]): Boolean =
getPathAttrsSuffix(path).intersect(requiredAttrs).size == requiredAttrs.size
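// The standard `pathGlobFilter` data source option is applied in `matchGlobPattern` below.
// Reflection is used here presumably because `caseInsensitiveMap` is a private field of
// PartitioningAwareFileIndex and is not otherwise reachable from this subclass.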
protected lazy val pathGlobFilter: Option[GlobFilter] = {
val baseClass = this.getClass.getSuperclass
val field = baseClass.getDeclaredField("caseInsensitiveMap")
field.setAccessible(true)
val caseInsensitiveMap = field.get(this).asInstanceOf[CaseInsensitiveMap[String]]
val value = caseInsensitiveMap.get("pathGlobFilter").map(new GlobFilter(_))
field.setAccessible(false)
value
}
protected def matchGlobPattern(file: FileStatus): Boolean = {
pathGlobFilter.forall(_.accept(file.getPath))
}
override def allFiles(): Seq[FileStatus] = {
val files = if (partitionSpec().partitionColumns.isEmpty && !recursiveFileLookup) {
// For each of the root input paths, get the list of files inside them
rootPaths.flatMap { path =>
// Make the path qualified (consistent with listLeafFiles and bulkListLeafFiles).
val fs = path.getFileSystem(hadoopConf)
val qualifiedPathPre = fs.makeQualified(path)
val qualifiedPath: Path = if (qualifiedPathPre.isRoot && !qualifiedPathPre.isAbsolute) {
// SPARK-17613: Always append `Path.SEPARATOR` to the end of parent directories,
// because the `leafFile.getParent` would have returned an absolute path with the
// separator at the end.
new Path(qualifiedPathPre, Path.SEPARATOR)
} else {
qualifiedPathPre
}
// There are three possible cases for each path:
// 1. The path is a directory and has children files in it. Then it must be present in
// leafDirToChildrenFiles as those children files will have been found as leaf files.
// Find its children files from leafDirToChildrenFiles and include them.
// 2. The path is a file, then it will be present in leafFiles. Include this path.
// 3. The path is a directory, but has no children files. Do not include this path.
leafDirToChildrenFiles.get(normalizedPath(qualifiedPath))
.orElse { leafFiles.get(normalizedPath(qualifiedPath)).map(Array(_)) }
.map(_.filter(s => containsPathAttrs(s.getPath.getParent, getPathAttrsSuffix(qualifiedPath))))
.getOrElse(Array.empty)
}
} else {
leafFiles.values.toSeq
}
files.filter(matchGlobPattern)
}
}
object YtInMemoryFileIndex extends Logging {
/** A serializable variant of HDFS's BlockLocation. */
private case class SerializableBlockLocation(
names: Array[String],
hosts: Array[String],
offset: Long,
length: Long)
/** A serializable variant of HDFS's FileStatus. */
private case class SerializableFileStatus(
path: String,
length: Long,
isDir: Boolean,
blockReplication: Short,
blockSize: Long,
modificationTime: Long,
accessTime: Long,
blockLocations: Array[SerializableBlockLocation])
/**
* Lists a collection of paths recursively. Picks the listing strategy adaptively depending
* on the number of paths to list.
*
* This may only be called on the driver.
*
* @return for each input path, the set of discovered files for the path
*/
private[sql] def bulkListLeafFiles(
paths: Seq[Path],
hadoopConf: Configuration,
filter: PathFilter,
sparkSession: SparkSession,
areRootPaths: Boolean): Seq[(Path, Seq[FileStatus])] = {
val ignoreMissingFiles = sparkSession.sessionState.conf.ignoreMissingFiles
val ignoreLocality = sparkSession.sessionState.conf.ignoreDataLocality
// Short-circuits parallel listing when serial listing is likely to be faster.
if (paths.size <= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) {
return paths.map { path =>
val leafFiles = listLeafFiles(
path,
hadoopConf,
filter,
Some(sparkSession),
ignoreMissingFiles = ignoreMissingFiles,
ignoreLocality = ignoreLocality,
isRootPath = areRootPaths)
(path, leafFiles)
}
}
logInfo(s"Listing leaf files and directories in parallel under ${paths.length} paths." +
s" The first several paths are: ${paths.take(10).mkString(", ")}.")
HiveCatalogMetrics.incrementParallelListingJobCount(1)
val sparkContext = sparkSession.sparkContext
val serializableConfiguration = new SerializableConfiguration(hadoopConf)
val serializedPaths = paths.map(_.toString)
val parallelPartitionDiscoveryParallelism =
sparkSession.sessionState.conf.parallelPartitionDiscoveryParallelism
// Cap the parallelism so that the file listing below does not generate too many tasks
// when defaultParallelism is large.
val numParallelism = Math.min(paths.size, parallelPartitionDiscoveryParallelism)
val previousJobDescription = sparkContext.getLocalProperty(SparkContext.SPARK_JOB_DESCRIPTION)
val statusMap = try {
val description = paths.size match {
case 0 =>
s"Listing leaf files and directories for 0 paths"
case 1 =>
s"Listing leaf files and directories for 1 path:<br/>${paths(0)}"
case s =>
s"Listing leaf files and directories for $s paths:<br/>${paths(0)}, ..."
}
sparkContext.setJobDescription(description)
sparkContext
.parallelize(serializedPaths, numParallelism)
.mapPartitions { pathStrings =>
val hadoopConf = serializableConfiguration.value
pathStrings.map(new Path(_)).toSeq.map { path =>
val leafFiles = listLeafFiles(
path,
hadoopConf,
filter,
None,
ignoreMissingFiles = ignoreMissingFiles,
ignoreLocality = ignoreLocality,
isRootPath = areRootPaths)
(path, leafFiles)
}.iterator
}.map { case (path, statuses) =>
val serializableStatuses = statuses.map { status =>
// Turn FileStatus into SerializableFileStatus so we can send it back to the driver
val blockLocations = status match {
case f: LocatedFileStatus =>
f.getBlockLocations.map { loc =>
SerializableBlockLocation(
loc.getNames,
loc.getHosts,
loc.getOffset,
loc.getLength)
}
case _ =>
Array.empty[SerializableBlockLocation]
}
SerializableFileStatus(
status.getPath.toString,
status.getLen,
status.isDirectory,
status.getReplication,
status.getBlockSize,
status.getModificationTime,
status.getAccessTime,
blockLocations)
}
(path.toString, serializableStatuses)
}.collect()
} finally {
sparkContext.setJobDescription(previousJobDescription)
}
// Turn SerializableFileStatus back into FileStatus
statusMap.map { case (path, serializableStatuses) =>
val statuses = serializableStatuses.map { f =>
val blockLocations = f.blockLocations.map { loc =>
new BlockLocation(loc.names, loc.hosts, loc.offset, loc.length)
}
new LocatedFileStatus(
new FileStatus(
f.length, f.isDir, f.blockReplication, f.blockSize, f.modificationTime,
new Path(f.path)),
blockLocations)
}
(new Path(path), statuses)
}
}
/**
* Lists a single filesystem path recursively. If a SparkSession object is specified, this
* function may launch Spark jobs to parallelize listing.
*
* If sessionOpt is None, this may be called on executors.
*
* @return all children of path that match the specified filter.
*/
private def listLeafFiles(
path: Path,
hadoopConf: Configuration,
filter: PathFilter,
sessionOpt: Option[SparkSession],
ignoreMissingFiles: Boolean,
ignoreLocality: Boolean,
isRootPath: Boolean): Seq[FileStatus] = {
logTrace(s"Listing $path")
val fs = path.getFileSystem(hadoopConf)
// Note that statuses only include FileStatus for the files and dirs directly under path,
// and does not include anything else recursively.
val statuses: Array[FileStatus] = try {
fs match {
// DistributedFileSystem overrides listLocatedStatus to make a single call to the namenode
// to retrieve the file status with the file block location. The reason to still fall back
// to listStatus is that the default implementation would potentially throw a
// FileNotFoundException which is better handled by doing the lookups manually below.
case _: DistributedFileSystem if !ignoreLocality =>
val remoteIter = fs.listLocatedStatus(path)
new Iterator[LocatedFileStatus]() {
def next(): LocatedFileStatus = remoteIter.next
def hasNext(): Boolean = remoteIter.hasNext
}.toArray
case _ => fs.listStatus(path)
}
} catch {
// If we are listing a root path (e.g. a top level directory of a table), we need to
// ignore FileNotFoundExceptions during this root level of the listing because
//
// (a) certain code paths might construct an InMemoryFileIndex with root paths that
// might not exist (i.e. not all callers are guaranteed to have checked
// path existence prior to constructing InMemoryFileIndex) and,
// (b) we need to ignore deleted root paths during REFRESH TABLE, otherwise we break
// existing behavior and break the ability to drop SessionCatalog tables when tables'
// root directories have been deleted (which breaks a number of Spark's own tests).
//
// If we are NOT listing a root path then a FileNotFoundException here means that the
// directory was present in a previous level of file listing but is absent in this
// listing, likely indicating a race condition (e.g. concurrent table overwrite or S3
// list inconsistency).
//
// The trade-off in supporting existing behaviors / use-cases is that we won't be
// able to detect race conditions involving root paths being deleted during
// InMemoryFileIndex construction. However, it's still a net improvement to detect and
// fail-fast on the non-root cases. For more info see the SPARK-27676 review discussion.
case _: FileNotFoundException if isRootPath || ignoreMissingFiles =>
logWarning(s"The directory $path was not found. Was it deleted very recently?")
Array.empty[FileStatus]
}
val filteredStatuses = statuses.filterNot(status => shouldFilterOut(status.getPath.getName))
val allLeafStatuses = {
val (dirs, topLevelFiles) = filteredStatuses.partition(_.isDirectory)
val nestedFiles: Seq[FileStatus] = sessionOpt match {
case Some(session) =>
bulkListLeafFiles(
dirs.map(_.getPath),
hadoopConf,
filter,
session,
areRootPaths = false
).flatMap(_._2)
case _ =>
dirs.flatMap { dir =>
listLeafFiles(
dir.getPath,
hadoopConf,
filter,
sessionOpt,
ignoreMissingFiles = ignoreMissingFiles,
ignoreLocality = ignoreLocality,
isRootPath = false)
}
}
val allFiles = topLevelFiles ++ nestedFiles
if (filter != null) allFiles.filter(f => filter.accept(f.getPath)) else allFiles
}
val missingFiles = mutable.ArrayBuffer.empty[String]
val filteredLeafStatuses = allLeafStatuses.filterNot(
status => shouldFilterOut(status.getPath.getName))
val resolvedLeafStatuses = filteredLeafStatuses.flatMap {
case f: LocatedFileStatus =>
Some(f)
// NOTE:
//
// - Although S3/S3A/S3N file system can be quite slow for remote file metadata
// operations, calling `getFileBlockLocations` does no harm here since these file system
// implementations don't actually issue RPC for this method.
//
// - Here we are calling `getFileBlockLocations` in a sequential manner, but it should not
// be a big deal since we always use `bulkListLeafFiles` when the number of
// paths exceeds the threshold.
case f if !ignoreLocality =>
// The other constructor of LocatedFileStatus will call FileStatus.getPermission(),
// which is very slow on some file systems (e.g. RawLocalFileSystem, which launches a
// subprocess and parses the stdout).
try {
val locations = fs.getFileBlockLocations(f, 0, f.getLen).map { loc =>
// Store BlockLocation objects to consume less memory
if (loc.getClass == classOf[BlockLocation]) {
loc
} else {
new BlockLocation(loc.getNames, loc.getHosts, loc.getOffset, loc.getLength)
}
}
val lfs = new LocatedFileStatus(f.getLen, f.isDirectory, f.getReplication, f.getBlockSize,
f.getModificationTime, 0, null, null, null, null, f.getPath, locations)
if (f.isSymlink) {
lfs.setSymlink(f.getSymlink)
}
Some(lfs)
} catch {
case _: FileNotFoundException if ignoreMissingFiles =>
missingFiles += f.getPath.toString
None
}
case f => Some(f)
}
if (missingFiles.nonEmpty) {
logWarning(
s"the following files were missing during file scan:\n ${missingFiles.mkString("\n ")}")
}
resolvedLeafStatuses
}
/** Checks if we should filter out this path name. */
def shouldFilterOut(pathName: String): Boolean = {
// We filter out the following paths:
// 1. everything that starts with _ or ., except _common_metadata and _metadata,
// because Parquet needs to find those metadata files among the leaf files returned by this
// method. We should refactor this logic to not mix metadata files with data files.
// 2. everything that ends with `._COPYING_`, because it is an intermediate state of a file;
// we should skip such files to avoid reading them twice.
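// Illustrative evaluations of the rules above:
//   shouldFilterOut("_SUCCESS")           == true   // starts with "_" and is not a metadata file
//   shouldFilterOut("_metadata")          == false  // kept so Parquet can find its metadata
//   shouldFilterOut("_col1=0")            == false  // underscore-prefixed partition dir is kept
//   shouldFilterOut(".part-0.crc")        == true   // hidden file
//   shouldFilterOut("part-0._COPYING_")   == true   // file is still being copied
//   shouldFilterOut("part-00000.parquet") == false  // regular data file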
val exclude = (pathName.startsWith("_") && !pathName.contains("=")) ||
pathName.startsWith(".") || pathName.endsWith("._COPYING_")
val include = pathName.startsWith("_common_metadata") || pathName.startsWith("_metadata")
exclude && !include
}
}
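/**
* Illustrative usage sketch, not part of the original file: assumes a SparkSession with the YT
* connector configured; the object name and the table path below are hypothetical.
*/
private object YtInMemoryFileIndexUsageSketch {
  def listAllFiles(spark: SparkSession): Seq[FileStatus] = {
    val index = new YtInMemoryFileIndex(
      sparkSession = spark,
      rootPathsSpecified = Seq(new Path("/home/example_table/@transaction_123")), // hypothetical path
      parameters = Map.empty,
      userSpecifiedSchema = None)
    // Files are grouped under the normalized directory and filtered by the `@` attribute suffix.
    index.allFiles()
  }
}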