/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs._
import org.apache.hadoop.fs.viewfs.ViewFileSystem
import org.apache.hadoop.hdfs.DistributedFileSystem
import org.apache.spark.internal.Logging
import org.apache.spark.metrics.source.HiveCatalogMetrics
import org.apache.spark.util.SerializableConfiguration
import java.io.FileNotFoundException
import scala.collection.mutable
/**
* NOTE: This class is a replica of HadoopFSUtils from Spark 3.2.1, with the following adjustment:
*
* - Filtering of the listed files is adjusted to include files starting with "." (so that Hoodie Delta Log
* files are picked up)
*/
object HoodieHadoopFSUtils extends Logging {
/**
* Lists a collection of paths recursively. Picks the listing strategy adaptively depending
* on the number of paths to list.
*
* This may only be called on the driver.
*
* @param sc Spark context used to run parallel listing.
* @param paths Input paths to list
* @param hadoopConf Hadoop configuration
* @param filter Path filter used to exclude leaf files from result
* @param ignoreMissingFiles Ignore missing files that happen during recursive listing
* (e.g., due to race conditions)
* @param ignoreLocality Whether to fetch data locality info when listing leaf files. If false,
* this will return `FileStatus` without `BlockLocation` info.
* @param parallelismThreshold The threshold to enable parallelism. If the number of input paths
* is smaller than this value, this will fall back to
* sequential listing.
* @param parallelismMax The maximum parallelism for listing. If the number of input paths is
* larger than this value, parallelism will be throttled to this value
* to avoid generating too many tasks.
* @return for each input path, the set of discovered files for the path
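*
* A minimal usage sketch (the `spark` session, the table path, and the parameter values below
* are illustrative assumptions, not taken from Hudi itself):
* {{{
*   val hadoopConf = spark.sparkContext.hadoopConfiguration
*   val listed: Seq[(Path, Seq[FileStatus])] =
*     HoodieHadoopFSUtils.parallelListLeafFiles(
*       sc = spark.sparkContext,
*       paths = Seq(new Path("/tmp/hoodie/some_table")),
*       hadoopConf = hadoopConf,
*       filter = null, // no extra PathFilter
*       ignoreMissingFiles = true,
*       ignoreLocality = true,
*       parallelismThreshold = 32,
*       parallelismMax = 10000)
* }}}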
*/
def parallelListLeafFiles(sc: SparkContext,
paths: Seq[Path],
hadoopConf: Configuration,
filter: PathFilter,
ignoreMissingFiles: Boolean,
ignoreLocality: Boolean,
parallelismThreshold: Int,
parallelismMax: Int): Seq[(Path, Seq[FileStatus])] = {
parallelListLeafFilesInternal(sc, paths, hadoopConf, filter, isRootLevel = true,
ignoreMissingFiles, ignoreLocality, parallelismThreshold, parallelismMax)
}
// scalastyle:off parameter.number
private def parallelListLeafFilesInternal(sc: SparkContext,
paths: Seq[Path],
hadoopConf: Configuration,
filter: PathFilter,
isRootLevel: Boolean,
ignoreMissingFiles: Boolean,
ignoreLocality: Boolean,
parallelismThreshold: Int,
parallelismMax: Int): Seq[(Path, Seq[FileStatus])] = {
// Short-circuits parallel listing when serial listing is likely to be faster.
if (paths.size <= parallelismThreshold) {
// scalastyle:off return
return paths.map { path =>
val leafFiles = listLeafFiles(
path,
hadoopConf,
filter,
Some(sc),
ignoreMissingFiles = ignoreMissingFiles,
ignoreLocality = ignoreLocality,
isRootPath = isRootLevel,
parallelismThreshold = parallelismThreshold,
parallelismMax = parallelismMax)
(path, leafFiles)
}
// scalastyle:on return
}
logInfo(s"Listing leaf files and directories in parallel under ${paths.length} paths." +
s" The first several paths are: ${paths.take(10).mkString(", ")}.")
HiveCatalogMetrics.incrementParallelListingJobCount(1)
val serializableConfiguration = new SerializableConfiguration(hadoopConf)
val serializedPaths = paths.map(_.toString)
// Cap the parallelism to prevent the following file listing from generating too many
// tasks when #defaultParallelism is large.
val numParallelism = Math.min(paths.size, parallelismMax)
val previousJobDescription = sc.getLocalProperty(SparkContext.SPARK_JOB_DESCRIPTION)
val statusMap = try {
val description = paths.size match {
case 0 =>
"Listing leaf files and directories 0 paths"
case 1 =>
s"Listing leaf files and directories for 1 path:
${paths(0)}"
case s =>
s"Listing leaf files and directories for $s paths:
${paths(0)}, ..."
}
sc.setJobDescription(description)
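// Run the actual listing as a distributed Spark job: each partition lists its paths
// sequentially on an executor and ships the results back to the driver in serializable
// form (see SerializableFileStatus below).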
sc
.parallelize(serializedPaths, numParallelism)
.mapPartitions { pathStrings =>
val hadoopConf = serializableConfiguration.value
pathStrings.map(new Path(_)).toSeq.map { path =>
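// NOTE: With contextOpt = None the recursive listing below always runs sequentially on
// the executor, so the parallelism settings passed here are effectively inert.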
val leafFiles = listLeafFiles(
path = path,
hadoopConf = hadoopConf,
filter = filter,
contextOpt = None, // Can't execute parallel scans on workers
ignoreMissingFiles = ignoreMissingFiles,
ignoreLocality = ignoreLocality,
isRootPath = isRootLevel,
parallelismThreshold = Int.MaxValue,
parallelismMax = 0)
(path, leafFiles)
}.iterator
}.map { case (path, statuses) =>
val serializableStatuses = statuses.map { status =>
// Turn FileStatus into SerializableFileStatus so we can send it back to the driver
val blockLocations = status match {
case f: LocatedFileStatus =>
f.getBlockLocations.map { loc =>
SerializableBlockLocation(
loc.getNames,
loc.getHosts,
loc.getOffset,
loc.getLength)
}
case _ =>
Array.empty[SerializableBlockLocation]
}
SerializableFileStatus(
status.getPath.toString,
status.getLen,
status.isDirectory,
status.getReplication,
status.getBlockSize,
status.getModificationTime,
status.getAccessTime,
blockLocations)
}
(path.toString, serializableStatuses)
}.collect()
} finally {
sc.setJobDescription(previousJobDescription)
}
// Turn the collected SerializableFileStatus values back into (Located)FileStatus
statusMap.map { case (path, serializableStatuses) =>
val statuses = serializableStatuses.map { f =>
val blockLocations = f.blockLocations.map { loc =>
new BlockLocation(loc.names, loc.hosts, loc.offset, loc.length)
}
new LocatedFileStatus(
new FileStatus(
f.length, f.isDir, f.blockReplication, f.blockSize, f.modificationTime,
new Path(f.path)),
blockLocations)
}
(new Path(path), statuses)
}
}
// scalastyle:on parameter.number
// scalastyle:off parameter.number
/**
* Lists a single filesystem path recursively. If a `SparkContext` object is specified, this
* function may launch Spark jobs to parallelize listing based on `parallelismThreshold`.
*
* If `contextOpt` is None, this may be called on executors.
*
* @return all children of path that match the specified filter.
*/
private def listLeafFiles(path: Path,
hadoopConf: Configuration,
filter: PathFilter,
contextOpt: Option[SparkContext],
ignoreMissingFiles: Boolean,
ignoreLocality: Boolean,
isRootPath: Boolean,
parallelismThreshold: Int,
parallelismMax: Int): Seq[FileStatus] = {
logTrace(s"Listing $path")
val fs = path.getFileSystem(hadoopConf)
// Note that statuses only include FileStatus for the files and dirs directly under path,
// and does not include anything else recursively.
val statuses: Array[FileStatus] = try {
fs match {
// DistributedFileSystem overrides listLocatedStatus to make 1 single call to namenode
// to retrieve the file status with the file block location. We still fall back to
// listStatus in the other cases because the default listLocatedStatus implementation could
// throw a FileNotFoundException, which is better handled by doing the lookups manually below.
case (_: DistributedFileSystem | _: ViewFileSystem) if !ignoreLocality =>
val remoteIter = fs.listLocatedStatus(path)
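// Materialize Hadoop's RemoteIterator through a Scala Iterator so the located statuses
// can be collected into an array.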
new Iterator[LocatedFileStatus]() {
def next(): LocatedFileStatus = remoteIter.next
def hasNext(): Boolean = remoteIter.hasNext
}.toArray
case _ => fs.listStatus(path)
}
} catch {
// If we are listing a root path for SQL (e.g. a top level directory of a table), we need to
// ignore FileNotFoundExceptions during this root level of the listing because
//
// (a) certain code paths might construct an InMemoryFileIndex with root paths that
// might not exist (i.e. not all callers are guaranteed to have checked
// path existence prior to constructing InMemoryFileIndex) and,
// (b) we need to ignore deleted root paths during REFRESH TABLE, otherwise we break
// existing behavior and break the ability to drop SessionCatalog tables when tables'
// root directories have been deleted (which breaks a number of Spark's own tests).
//
// If we are NOT listing a root path then a FileNotFoundException here means that the
// directory was present in a previous level of file listing but is absent in this
// listing, likely indicating a race condition (e.g. concurrent table overwrite or S3
// list inconsistency).
//
// The trade-off in supporting existing behaviors / use-cases is that we won't be
// able to detect race conditions involving root paths being deleted during
// InMemoryFileIndex construction. However, it's still a net improvement to detect and
// fail-fast on the non-root cases. For more info see the SPARK-27676 review discussion.
case _: FileNotFoundException if isRootPath || ignoreMissingFiles =>
logWarning(s"The directory $path was not found. Was it deleted very recently?")
Array.empty[FileStatus]
}
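// NOTE: Unlike Spark's own HadoopFSUtils, shouldFilterOutPathName does not drop paths
// starting with ".", so Hoodie Delta Log files survive this filter.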
val filteredStatuses =
statuses.filterNot(status => shouldFilterOutPathName(status.getPath.getName))
val allLeafStatuses = {
val (dirs, topLevelFiles) = filteredStatuses.partition(_.isDirectory)
val nestedFiles: Seq[FileStatus] = contextOpt match {
case Some(context) if dirs.size > parallelismThreshold =>
parallelListLeafFilesInternal(
context,
dirs.map(_.getPath),
hadoopConf = hadoopConf,
filter = filter,
isRootLevel = false,
ignoreMissingFiles = ignoreMissingFiles,
ignoreLocality = ignoreLocality,
parallelismThreshold = parallelismThreshold,
parallelismMax = parallelismMax
).flatMap(_._2)
case _ =>
dirs.flatMap { dir =>
listLeafFiles(
path = dir.getPath,
hadoopConf = hadoopConf,
filter = filter,
contextOpt = contextOpt,
ignoreMissingFiles = ignoreMissingFiles,
ignoreLocality = ignoreLocality,
isRootPath = false,
parallelismThreshold = parallelismThreshold,
parallelismMax = parallelismMax)
}
}
val allFiles = topLevelFiles ++ nestedFiles
if (filter != null) allFiles.filter(f => filter.accept(f.getPath)) else allFiles
}
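// Resolve block locations for the leaf files: statuses that already carry locality info
// (LocatedFileStatus) pass through unchanged, other statuses have their block locations
// fetched unless ignoreLocality is set. With ignoreMissingFiles, files deleted between
// listing and lookup are collected and logged below.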
val missingFiles = mutable.ArrayBuffer.empty[String]
val resolvedLeafStatuses = allLeafStatuses.flatMap {
case f: LocatedFileStatus =>
Some(f)
// NOTE:
//
// - Although the S3/S3A/S3N file systems can be quite slow for remote file metadata
// operations, calling `getFileBlockLocations` does no harm here since these file system
// implementations don't actually issue RPC for this method.
//
// - Here we are calling `getFileBlockLocations` in a sequential manner, but it should not
// be a big deal since we always switch to `parallelListLeafFiles` when the number of
// paths exceeds the threshold.
case f if !ignoreLocality =>
// The other constructor of LocatedFileStatus will call FileStatus.getPermission(),
// which is very slow on some file systems (e.g. RawLocalFileSystem, which launches a
// subprocess and parses the stdout).
try {
val locations = fs.getFileBlockLocations(f, 0, f.getLen).map { loc =>
// Store BlockLocation objects to consume less memory
if (loc.getClass == classOf[BlockLocation]) {
loc
} else {
new BlockLocation(loc.getNames, loc.getHosts, loc.getOffset, loc.getLength)
}
}
val lfs = new LocatedFileStatus(f.getLen, f.isDirectory, f.getReplication, f.getBlockSize,
f.getModificationTime, 0, null, null, null, null, f.getPath, locations)
if (f.isSymlink) {
lfs.setSymlink(f.getSymlink)
}
Some(lfs)
} catch {
case _: FileNotFoundException if ignoreMissingFiles =>
missingFiles += f.getPath.toString
None
}
case f => Some(f)
}
if (missingFiles.nonEmpty) {
logWarning(
s"the following files were missing during file scan:\n ${missingFiles.mkString("\n ")}")
}
resolvedLeafStatuses
}
// scalastyle:on parameter.number
/** A serializable variant of HDFS's BlockLocation. This is required by Hadoop 2.7. */
private case class SerializableBlockLocation(names: Array[String],
hosts: Array[String],
offset: Long,
length: Long)
/** A serializable variant of HDFS's FileStatus. This is required by Hadoop 2.7. */
private case class SerializableFileStatus(path: String,
length: Long,
isDir: Boolean,
blockReplication: Short,
blockSize: Long,
modificationTime: Long,
accessTime: Long,
blockLocations: Array[SerializableBlockLocation])
/** Checks if we should filter out this path name. */
def shouldFilterOutPathName(pathName: String): Boolean = {
// We filter out the following paths:
// 1. everything that starts with _ (unless the name contains "="), except _common_metadata and
// _metadata, because Parquet needs to find those metadata files among the leaf files returned by
// this method. We should refactor this logic to not mix metadata files with data files.
// 2. everything that ends with `._COPYING_`, because it is an intermediate state of a file that we
// should skip to avoid double reading.
// Unlike Spark's HadoopFSUtils, paths starting with "." are NOT filtered out, so that Hoodie
// Delta Log files are included.
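// Illustrative expectations (hypothetical file names, for documentation only):
//   shouldFilterOutPathName("_temporary")                   == true   // underscore-prefixed work dir
//   shouldFilterOutPathName("_metadata")                    == false  // Parquet summary file is kept
//   shouldFilterOutPathName("part-00000.parquet._COPYING_") == true   // in-flight copy
//   shouldFilterOutPathName(".hoodie_partition_metadata")   == false  // dot files are kept here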
val exclude = (pathName.startsWith("_") && !pathName.contains("=")) || pathName.endsWith("._COPYING_")
val include = pathName.startsWith("_common_metadata") || pathName.startsWith("_metadata")
exclude && !include
}
}