org.apache.spark.deploy.SparkHadoopUtil.scala (spark-core)
Shaded version of Apache Spark 2.x.x for Presto
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.deploy
import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream, File, IOException}
import java.security.PrivilegedExceptionAction
import java.text.DateFormat
import java.util.{Arrays, Comparator, Date, Locale}
import scala.collection.JavaConverters._
import scala.collection.immutable.Map
import scala.collection.mutable
import scala.collection.mutable.HashMap
import scala.util.control.NonFatal
import com.google.common.primitives.Longs
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, FileSystem, Path, PathFilter}
import org.apache.hadoop.mapred.JobConf
import org.apache.hadoop.security.{Credentials, UserGroupInformation}
import org.apache.hadoop.security.token.{Token, TokenIdentifier}
import org.apache.hadoop.security.token.delegation.AbstractDelegationTokenIdentifier
import org.apache.spark.{SparkConf, SparkException}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.internal.Logging
import org.apache.spark.internal.config._
import org.apache.spark.util.Utils
/**
* :: DeveloperApi ::
* Contains utility methods to interact with Hadoop from Spark.
*/
@DeveloperApi
class SparkHadoopUtil extends Logging {
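// The instance is initialized eagerly: a SparkConf is populated from JVM system properties,
// a Hadoop Configuration is derived from it, and that configuration is registered with
// UserGroupInformation as soon as this class is instantiated.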
private val sparkConf = new SparkConf(false).loadFromSystemProperties(true)
val conf: Configuration = newConfiguration(sparkConf)
UserGroupInformation.setConfiguration(conf)
/**
* Runs the given function with a Hadoop UserGroupInformation as a thread local variable
* (distributed to child threads), used for authenticating HDFS and YARN calls.
*
* IMPORTANT NOTE: If this function is going to be called repeatedly in the same process,
* you need to look at https://issues.apache.org/jira/browse/HDFS-3545 and possibly
* call FileSystem.closeAllForUGI in order to avoid leaking FileSystems.
*/
def runAsSparkUser(func: () => Unit) {
createSparkUser().doAs(new PrivilegedExceptionAction[Unit] {
def run: Unit = func()
})
}
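// Illustrative call site (hypothetical, shown only as a comment): an executor backend would
// typically wrap its main work as
//   SparkHadoopUtil.get.runAsSparkUser { () =>
//     // ... code that accesses HDFS/YARN as the Spark user ...
//   }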
def createSparkUser(): UserGroupInformation = {
val user = Utils.getCurrentUserName()
logDebug("creating UGI for user: " + user)
val ugi = UserGroupInformation.createRemoteUser(user)
transferCredentials(UserGroupInformation.getCurrentUser(), ugi)
ugi
}
def transferCredentials(source: UserGroupInformation, dest: UserGroupInformation) {
dest.addCredentials(source.getCredentials())
}
/**
* Appends S3-specific, spark.hadoop.*, and spark.buffer.size configurations to a Hadoop
* configuration.
*/
def appendS3AndSparkHadoopConfigurations(conf: SparkConf, hadoopConf: Configuration): Unit = {
SparkHadoopUtil.appendS3AndSparkHadoopConfigurations(conf, hadoopConf)
}
/**
* Appends spark.hadoop.* configurations from a [[SparkConf]] to a Hadoop
* configuration without the spark.hadoop. prefix.
*/
def appendSparkHadoopConfigs(conf: SparkConf, hadoopConf: Configuration): Unit = {
SparkHadoopUtil.appendSparkHadoopConfigs(conf, hadoopConf)
}
/**
* Appends spark.hadoop.* configurations from a Map to another without the spark.hadoop. prefix.
*/
def appendSparkHadoopConfigs(
srcMap: Map[String, String],
destMap: HashMap[String, String]): Unit = {
// Copy any "spark.hadoop.foo=bar" system properties into destMap as "foo=bar"
for ((key, value) <- srcMap if key.startsWith("spark.hadoop.")) {
destMap.put(key.substring("spark.hadoop.".length), value)
}
}
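// For example (illustrative entry, not from the source): "spark.hadoop.fs.defaultFS" -> "hdfs://nn:8020"
// in srcMap is copied into destMap as "fs.defaultFS" -> "hdfs://nn:8020".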
/**
* Return an appropriate (subclass of) Configuration. Creating a config can initialize some
* Hadoop subsystems.
*/
def newConfiguration(conf: SparkConf): Configuration = {
val hadoopConf = SparkHadoopUtil.newConfiguration(conf)
hadoopConf.addResource(SparkHadoopUtil.SPARK_HADOOP_CONF_FILE)
hadoopConf
}
/**
* Add any user credentials to the job conf which are necessary for running on a secure Hadoop
* cluster.
*/
def addCredentials(conf: JobConf): Unit = {
val jobCreds = conf.getCredentials()
jobCreds.mergeAll(UserGroupInformation.getCurrentUser().getCredentials())
}
def addCurrentUserCredentials(creds: Credentials): Unit = {
UserGroupInformation.getCurrentUser.addCredentials(creds)
}
def loginUserFromKeytab(principalName: String, keytabFilename: String): Unit = {
if (!new File(keytabFilename).exists()) {
throw new SparkException(s"Keytab file: ${keytabFilename} does not exist")
} else {
logInfo("Attempting to login to Kerberos " +
s"using principal: ${principalName} and keytab: ${keytabFilename}")
UserGroupInformation.loginUserFromKeytab(principalName, keytabFilename)
}
}
/**
* Add or overwrite the current user's credentials with the given serialized delegation tokens,
* and also confirm that the correct Hadoop configuration is set.
*/
private[spark] def addDelegationTokens(tokens: Array[Byte], sparkConf: SparkConf) {
UserGroupInformation.setConfiguration(newConfiguration(sparkConf))
val creds = deserialize(tokens)
logInfo("Updating delegation tokens for current user.")
logDebug(s"Adding/updating delegation tokens ${dumpTokens(creds)}")
addCurrentUserCredentials(creds)
}
/**
* Returns a function that can be called to find Hadoop FileSystem bytes read. If
* getFSBytesReadOnThreadCallback is called from thread r at time t, the returned callback will
* return the bytes read on r since t.
*/
private[spark] def getFSBytesReadOnThreadCallback(): () => Long = {
val f = () => FileSystem.getAllStatistics.asScala.map(_.getThreadStatistics.getBytesRead).sum
val baseline = (Thread.currentThread().getId, f())
/**
* This function may be called in both spawned child threads and parent task thread (in
* PythonRDD), and Hadoop FileSystem uses thread local variables to track the statistics.
* So we need a map to track the bytes read from the child threads and parent thread,
* summing them together to get the bytes read of this task.
*/
new Function0[Long] {
private val bytesReadMap = new mutable.HashMap[Long, Long]()
override def apply(): Long = {
bytesReadMap.synchronized {
bytesReadMap.put(Thread.currentThread().getId, f())
bytesReadMap.map { case (k, v) =>
v - (if (k == baseline._1) baseline._2 else 0)
}.sum
}
}
}
}
/**
* Returns a function that can be called to find Hadoop FileSystem bytes written. If
* getFSBytesWrittenOnThreadCallback is called from thread r at time t, the returned callback will
* return the bytes written on r since t.
*
* @return a callback that returns the bytes written on the calling thread since this method
* was invoked.
*/
private[spark] def getFSBytesWrittenOnThreadCallback(): () => Long = {
val threadStats = FileSystem.getAllStatistics.asScala.map(_.getThreadStatistics)
val f = () => threadStats.map(_.getBytesWritten).sum
val baselineBytesWritten = f()
() => f() - baselineBytesWritten
}
/**
* Get [[FileStatus]] objects for all leaf children (files) under the given base path. If the
* given path points to a file, return a single-element collection containing [[FileStatus]] of
* that file.
*/
def listLeafStatuses(fs: FileSystem, basePath: Path): Seq[FileStatus] = {
listLeafStatuses(fs, fs.getFileStatus(basePath))
}
/**
* Get [[FileStatus]] objects for all leaf children (files) under the given base path. If the
* given path points to a file, return a single-element collection containing [[FileStatus]] of
* that file.
*/
def listLeafStatuses(fs: FileSystem, baseStatus: FileStatus): Seq[FileStatus] = {
def recurse(status: FileStatus): Seq[FileStatus] = {
val (directories, leaves) = fs.listStatus(status.getPath).partition(_.isDirectory)
leaves ++ directories.flatMap(f => listLeafStatuses(fs, f))
}
if (baseStatus.isDirectory) recurse(baseStatus) else Seq(baseStatus)
}
def listLeafDirStatuses(fs: FileSystem, basePath: Path): Seq[FileStatus] = {
listLeafDirStatuses(fs, fs.getFileStatus(basePath))
}
def listLeafDirStatuses(fs: FileSystem, baseStatus: FileStatus): Seq[FileStatus] = {
def recurse(status: FileStatus): Seq[FileStatus] = {
val (directories, files) = fs.listStatus(status.getPath).partition(_.isDirectory)
val leaves = if (directories.isEmpty) Seq(status) else Seq.empty[FileStatus]
leaves ++ directories.flatMap(dir => listLeafDirStatuses(fs, dir))
}
assert(baseStatus.isDirectory)
recurse(baseStatus)
}
def isGlobPath(pattern: Path): Boolean = {
pattern.toString.exists("{}[]*?\\".toSet.contains)
}
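// For example (illustrative paths): isGlobPath(new Path("/data/2018-*/part-?")) is true because the
// path contains glob metacharacters, while isGlobPath(new Path("/data/file.txt")) is false.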
def globPath(pattern: Path): Seq[Path] = {
val fs = pattern.getFileSystem(conf)
globPath(fs, pattern)
}
def globPath(fs: FileSystem, pattern: Path): Seq[Path] = {
Option(fs.globStatus(pattern)).map { statuses =>
statuses.map(_.getPath.makeQualified(fs.getUri, fs.getWorkingDirectory)).toSeq
}.getOrElse(Seq.empty[Path])
}
def globPathIfNecessary(pattern: Path): Seq[Path] = {
if (isGlobPath(pattern)) globPath(pattern) else Seq(pattern)
}
def globPathIfNecessary(fs: FileSystem, pattern: Path): Seq[Path] = {
if (isGlobPath(pattern)) globPath(fs, pattern) else Seq(pattern)
}
/**
* Lists all files in a directory that start with the specified prefix and do not end with the
* given suffix. The returned [[FileStatus]] instances are sorted by the modification times of
* the respective files.
*/
def listFilesSorted(
remoteFs: FileSystem,
dir: Path,
prefix: String,
exclusionSuffix: String): Array[FileStatus] = {
try {
val fileStatuses = remoteFs.listStatus(dir,
new PathFilter {
override def accept(path: Path): Boolean = {
val name = path.getName
name.startsWith(prefix) && !name.endsWith(exclusionSuffix)
}
})
Arrays.sort(fileStatuses, new Comparator[FileStatus] {
override def compare(o1: FileStatus, o2: FileStatus): Int = {
Longs.compare(o1.getModificationTime, o2.getModificationTime)
}
})
fileStatuses
} catch {
case NonFatal(e) =>
logWarning("Error while attempting to list files from application staging dir", e)
Array.empty
}
}
private[spark] def getSuffixForCredentialsPath(credentialsPath: Path): Int = {
val fileName = credentialsPath.getName
fileName.substring(
fileName.lastIndexOf(SparkHadoopUtil.SPARK_YARN_CREDS_COUNTER_DELIM) + 1).toInt
}
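// For example (hypothetical file name): for a credentials file named "credentials-12", everything
// after the last "-" delimiter is parsed as an Int, so this returns 12.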
private val HADOOP_CONF_PATTERN = "(\\$\\{hadoopconf-[^\\}\\$\\s]+\\})".r.unanchored
/**
* Substitute variables by looking them up in Hadoop configs. Only variables that match the
* ${hadoopconf- .. } pattern are substituted.
*/
def substituteHadoopVariables(text: String, hadoopConf: Configuration): String = {
text match {
case HADOOP_CONF_PATTERN(matched) =>
logDebug(text + " matched " + HADOOP_CONF_PATTERN)
val key = matched.substring(13, matched.length() - 1) // remove ${hadoopconf- .. }
val eval = Option[String](hadoopConf.get(key))
.map { value =>
logDebug("Substituted " + matched + " with " + value)
text.replace(matched, value)
}
if (eval.isEmpty) {
// The variable was not found in Hadoop configs, so return text as is.
text
} else {
// Continue to substitute more variables.
substituteHadoopVariables(eval.get, hadoopConf)
}
case _ =>
logDebug(text + " didn't match " + HADOOP_CONF_PATTERN)
text
}
}
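// Illustrative substitution (assumed config value): if hadoopConf contains
// fs.defaultFS=hdfs://nn:8020, then "${hadoopconf-fs.defaultFS}/spark-logs" becomes
// "hdfs://nn:8020/spark-logs"; a key missing from hadoopConf leaves the text unchanged.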
/**
* Dump the credentials' tokens to string values.
*
* @param credentials credentials
* @return an iterable of string values; if no credentials are passed in, an empty collection
*/
private[spark] def dumpTokens(credentials: Credentials): Iterable[String] = {
if (credentials != null) {
credentials.getAllTokens.asScala.map(tokenToString)
} else {
Seq.empty
}
}
/**
* Convert a token to a string for logging.
* If it's an abstract delegation token, attempt to unmarshal it and then
* print more details, including timestamps in human-readable form.
*
* @param token token to convert to a string
* @return a printable string value.
*/
private[spark] def tokenToString(token: Token[_ <: TokenIdentifier]): String = {
val df = DateFormat.getDateTimeInstance(DateFormat.SHORT, DateFormat.SHORT, Locale.US)
val buffer = new StringBuilder(128)
buffer.append(token.toString)
try {
val ti = token.decodeIdentifier
buffer.append("; ").append(ti)
ti match {
case dt: AbstractDelegationTokenIdentifier =>
// include human times and the renewer, which the HDFS tokens toString omits
buffer.append("; Renewer: ").append(dt.getRenewer)
buffer.append("; Issued: ").append(df.format(new Date(dt.getIssueDate)))
buffer.append("; Max Date: ").append(df.format(new Date(dt.getMaxDate)))
case _ =>
}
} catch {
case e: IOException =>
logDebug(s"Failed to decode $token: $e", e)
}
buffer.toString
}
def serialize(creds: Credentials): Array[Byte] = {
val byteStream = new ByteArrayOutputStream
val dataStream = new DataOutputStream(byteStream)
creds.writeTokenStorageToStream(dataStream)
byteStream.toByteArray
}
def deserialize(tokenBytes: Array[Byte]): Credentials = {
val tokensBuf = new ByteArrayInputStream(tokenBytes)
val creds = new Credentials()
creds.readTokenStorageStream(new DataInputStream(tokensBuf))
creds
}
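// serialize/deserialize round-trip a Credentials object through Hadoop's token storage stream
// format, so (as a sketch) deserialize(serialize(creds)) reconstructs the same tokens; this is
// how delegation tokens arrive as Array[Byte] in addDelegationTokens above.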
def isProxyUser(ugi: UserGroupInformation): Boolean = {
ugi.getAuthenticationMethod() == UserGroupInformation.AuthenticationMethod.PROXY
}
}
object SparkHadoopUtil {
private lazy val instance = new SparkHadoopUtil
val SPARK_YARN_CREDS_TEMP_EXTENSION = ".tmp"
val SPARK_YARN_CREDS_COUNTER_DELIM = "-"
/**
* Number of records between updates of the input metrics when reading from a HadoopRDD.
*
* Each update is potentially expensive because we need to use reflection to access the
* Hadoop FileSystem API of interest (only available in Hadoop 2.5+), so we should do this sparingly.
*/
private[spark] val UPDATE_INPUT_METRICS_INTERVAL_RECORDS = 1000
/**
* Name of the file containing the gateway's Hadoop configuration, to be overlaid on top of the
* cluster's Hadoop config. It is up to the Spark code launching the application to create
* this file if it's desired. If the file doesn't exist, it will just be ignored.
*/
private[spark] val SPARK_HADOOP_CONF_FILE = "__spark_hadoop_conf__.xml"
def get: SparkHadoopUtil = instance
/**
* Given an expiration date for the current set of credentials, calculate the time when new
* credentials should be created.
*
* @param expirationDate Drop-dead expiration date
* @param conf Spark configuration
* @return Timestamp when new credentials should be created.
*/
private[spark] def nextCredentialRenewalTime(expirationDate: Long, conf: SparkConf): Long = {
val ct = System.currentTimeMillis
val ratio = conf.get(CREDENTIALS_RENEWAL_INTERVAL_RATIO)
(ct + (ratio * (expirationDate - ct))).toLong
}
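// Worked example (ratio value assumed for illustration): with credentials expiring 10 hours from
// now and a renewal ratio of 0.75, this schedules renewal 7.5 hours from now,
// i.e. ct + 0.75 * (expirationDate - ct).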
/**
* Returns a Configuration object with Spark configuration applied on top. Unlike
* the instance method, this will always return a Configuration instance, and not a
* cluster manager-specific type.
*/
private[spark] def newConfiguration(conf: SparkConf): Configuration = {
val hadoopConf = new Configuration()
appendS3AndSparkHadoopConfigurations(conf, hadoopConf)
hadoopConf
}
private def appendS3AndSparkHadoopConfigurations(
conf: SparkConf,
hadoopConf: Configuration): Unit = {
// Note: this null check is around more than just access to the "conf" object to maintain
// the behavior of the old implementation of this code, for backwards compatibility.
if (conf != null) {
// Explicitly check for S3 environment variables
val keyId = System.getenv("AWS_ACCESS_KEY_ID")
val accessKey = System.getenv("AWS_SECRET_ACCESS_KEY")
if (keyId != null && accessKey != null) {
hadoopConf.set("fs.s3.awsAccessKeyId", keyId)
hadoopConf.set("fs.s3n.awsAccessKeyId", keyId)
hadoopConf.set("fs.s3a.access.key", keyId)
hadoopConf.set("fs.s3.awsSecretAccessKey", accessKey)
hadoopConf.set("fs.s3n.awsSecretAccessKey", accessKey)
hadoopConf.set("fs.s3a.secret.key", accessKey)
val sessionToken = System.getenv("AWS_SESSION_TOKEN")
if (sessionToken != null) {
hadoopConf.set("fs.s3a.session.token", sessionToken)
}
}
appendSparkHadoopConfigs(conf, hadoopConf)
val bufferSize = conf.get("spark.buffer.size", "65536")
hadoopConf.set("io.file.buffer.size", bufferSize)
}
}
private def appendSparkHadoopConfigs(conf: SparkConf, hadoopConf: Configuration): Unit = {
// Copy any "spark.hadoop.foo=bar" spark properties into conf as "foo=bar"
for ((key, value) <- conf.getAll if key.startsWith("spark.hadoop.")) {
hadoopConf.set(key.substring("spark.hadoop.".length), value)
}
}
}