
org.apache.spark.sql.artifact.ArtifactManager.scala

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.artifact

import java.io.File
import java.net.{URI, URL, URLClassLoader}
import java.nio.file.{CopyOption, Files, Path, Paths, StandardCopyOption}
import java.util.concurrent.CopyOnWriteArrayList

import scala.jdk.CollectionConverters._
import scala.reflect.ClassTag

import org.apache.commons.io.{FilenameUtils, FileUtils}
import org.apache.hadoop.fs.{LocalFileSystem, Path => FSPath}

import org.apache.spark.{JobArtifactSet, JobArtifactState, SparkEnv, SparkException, SparkUnsupportedOperationException}
import org.apache.spark.internal.Logging
import org.apache.spark.internal.config.{CONNECT_SCALA_UDF_STUB_PREFIXES, EXECUTOR_USER_CLASS_PATH_FIRST}
import org.apache.spark.sql.{Artifact, SparkSession}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.util.ArtifactUtils
import org.apache.spark.storage.{CacheId, StorageLevel}
import org.apache.spark.util.{ChildFirstURLClassLoader, StubClassLoader, Utils}

/**
 * This class handles the storage of artifacts as well as preparing the artifacts for use.
 *
 * Artifacts belonging to different SparkSessions are isolated from each other with the help of the
 * `sessionUUID`.
 *
 * Jar, class file and Python file artifacts are stored under the "jars", "classes" and
 * "pyfiles" sub-directories respectively, while other types of artifacts are stored under the
 * root directory for that particular SparkSession.
 *
 * @param session The object used to hold the Spark Connect session state.
 */
class ArtifactManager(session: SparkSession) extends Logging {
  import ArtifactManager._

  // The base directory where all artifacts are stored.
  protected def artifactRootPath: Path = artifactRootDirectory

  private[artifact] lazy val artifactRootURI: String = SparkEnv
    .get
    .rpcEnv
    .fileServer
    .addDirectoryIfAbsent(ARTIFACT_DIRECTORY_PREFIX, artifactRootPath.toFile)

  // The base directory/URI where all artifacts are stored for this `sessionUUID`.
  protected[artifact] val (artifactPath, artifactURI): (Path, String) =
    (ArtifactUtils.concatenatePaths(artifactRootPath, session.sessionUUID),
      s"$artifactRootURI${File.separator}${session.sessionUUID}")

  // The base directory/URI where all class file artifacts are stored for this `sessionUUID`.
  protected[artifact] val (classDir, classURI): (Path, String) =
    (ArtifactUtils.concatenatePaths(artifactPath, "classes"),
      s"$artifactURI${File.separator}classes${File.separator}")

  protected[artifact] val state: JobArtifactState =
    JobArtifactState(session.sessionUUID, Option(classURI))

  def withResources[T](f: => T): T = {
    Utils.withContextClassLoader(classloader) {
      JobArtifactSet.withActiveJobArtifactState(state) {
        f
      }
    }
  }
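
  // Illustrative sketch (not part of the original source): `withResources` is how callers run
  // user code with this session's artifacts visible, combining the session-isolated
  // `classloader` with the active `JobArtifactState`. Assuming an `ArtifactManager` instance
  // named `manager` and a client-uploaded class `com.example.MyUdf` (both hypothetical):
  //
  //   manager.withResources {
  //     // Resolved through the session-specific classloader built from "jars/" and "classes/".
  //     Utils.classForName("com.example.MyUdf")
  //   }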

  protected val jarsList = new CopyOnWriteArrayList[Path]
  protected val pythonIncludeList = new CopyOnWriteArrayList[String]

  /**
   * Get the URLs of all jar artifacts.
   */
  def getAddedJars: Seq[URL] = jarsList.asScala.map(_.toUri.toURL).toSeq

  /**
   * Get the py-file names added to this SparkSession.
   *
   * @return the file names of the added py-file artifacts (".zip", ".egg" or ".jar" files).
   */
  def getPythonIncludes: Seq[String] = pythonIncludeList.asScala.toSeq

  private def transferFile(
      source: Path,
      target: Path,
      allowOverwrite: Boolean = false,
      deleteSource: Boolean = true): Unit = {
    def execute(s: Path, t: Path, opt: CopyOption*): Path =
      if (deleteSource) Files.move(s, t, opt: _*) else Files.copy(s, t, opt: _*)

    Files.createDirectories(target.getParent)
    if (allowOverwrite) {
      execute(source, target, StandardCopyOption.REPLACE_EXISTING)
    } else {
      execute(source, target)
    }
  }

  private def normalizePath(path: Path): Path = {
    // Convert the path to a string with the current system's separator
    val normalizedPathString = path.toString
      .replace('/', File.separatorChar)
      .replace('\\', File.separatorChar)
    // Convert the normalized string back to a Path object
    Paths.get(normalizedPathString).normalize()
  }
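
  // Worked example (illustrative, not in the original source): on a Windows JVM,
  // `normalizePath(Paths.get("jars/my-lib.jar"))` yields "jars\my-lib.jar", while on Linux or
  // macOS `normalizePath(Paths.get("jars\\my-lib.jar"))` yields "jars/my-lib.jar". This is why
  // the prefix checks in `addArtifact` below can safely use `File.separator`.
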
  /**
   * Add and prepare a staged artifact (i.e. an artifact that has been rebuilt locally from bytes
   * transferred over the wire) for use.
   *
   * @param remoteRelativePath the artifact's path relative to the remote (client-side) root,
   *                           e.g. under "jars", "classes", "pyfiles", "archives", "files" or
   *                           "cache".
   * @param serverLocalStagingPath the local path where the artifact's bytes have been staged.
   * @param fragment an optional URI fragment, currently only used for archive artifacts.
   * @param deleteStagedFile whether the staged file should be deleted after it is transferred.
   */
  def addArtifact(
      remoteRelativePath: Path,
      serverLocalStagingPath: Path,
      fragment: Option[String],
      deleteStagedFile: Boolean = true
  ): Unit = JobArtifactSet.withActiveJobArtifactState(state) {
    require(!remoteRelativePath.isAbsolute)
    val normalizedRemoteRelativePath = normalizePath(remoteRelativePath)
    if (normalizedRemoteRelativePath.startsWith(s"cache${File.separator}")) {
      val tmpFile = serverLocalStagingPath.toFile
      Utils.tryWithSafeFinallyAndFailureCallbacks {
        val blockManager = session.sparkContext.env.blockManager
        val blockId = CacheId(
          sessionUUID = session.sessionUUID,
          hash = normalizedRemoteRelativePath.toString.stripPrefix(s"cache${File.separator}"))
        val updater = blockManager.TempFileBasedBlockStoreUpdater(
          blockId = blockId,
          level = StorageLevel.MEMORY_AND_DISK_SER,
          classTag = implicitly[ClassTag[Array[Byte]]],
          tmpFile = tmpFile,
          blockSize = tmpFile.length(),
          tellMaster = false)
        updater.save()
      }(catchBlock = { tmpFile.delete() })
    } else if (normalizedRemoteRelativePath.startsWith(s"classes${File.separator}")) {
      // Move class files to the right directory.
      val target = ArtifactUtils.concatenatePaths(
        classDir,
        normalizedRemoteRelativePath.toString.stripPrefix(s"classes${File.separator}"))
      // Allow overwriting class files to capture updates to classes.
      // This is required because the client currently sends all the class files in each class file
      // transfer.
      transferFile(
        serverLocalStagingPath,
        target,
        allowOverwrite = true,
        deleteSource = deleteStagedFile)
    } else {
      val target = ArtifactUtils.concatenatePaths(artifactPath, normalizedRemoteRelativePath)
      // Disallow overwriting an existing artifact with a modified version.
      if (Files.exists(target)) {
        // If the content is identical, return early so that re-adding the artifact is idempotent.
        if (FileUtils.contentEquals(target.toFile, serverLocalStagingPath.toFile)) {
          return
        }

        throw new RuntimeException(s"Duplicate Artifact: $normalizedRemoteRelativePath. " +
            "Artifacts cannot be overwritten.")
      }
      transferFile(serverLocalStagingPath, target, deleteSource = deleteStagedFile)

      // This URI is for Spark file server that starts with "spark://".
      val uri = s"$artifactURI/${Utils.encodeRelativeUnixPathToURIRawPath(
          FilenameUtils.separatorsToUnix(normalizedRemoteRelativePath.toString))}"

      if (normalizedRemoteRelativePath.startsWith(s"jars${File.separator}")) {
        session.sparkContext.addJar(uri)
        jarsList.add(target)
      } else if (normalizedRemoteRelativePath.startsWith(s"pyfiles${File.separator}")) {
        session.sparkContext.addFile(uri)
        val stringRemotePath = normalizedRemoteRelativePath.toString
        if (stringRemotePath.endsWith(".zip") || stringRemotePath.endsWith(
            ".egg") || stringRemotePath.endsWith(".jar")) {
          pythonIncludeList.add(target.getFileName.toString)
        }
      } else if (normalizedRemoteRelativePath.startsWith(s"archives${File.separator}")) {
        val canonicalUri =
          fragment.map(Utils.getUriBuilder(new URI(uri)).fragment).getOrElse(new URI(uri))
        session.sparkContext.addArchive(canonicalUri.toString)
      } else if (normalizedRemoteRelativePath.startsWith(s"files${File.separator}")) {
        session.sparkContext.addFile(uri)
      }
    }
  }
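
  // Summary of the prefix handling above (illustrative sketch, not in the original source),
  // assuming a staged file at a hypothetical path `staged`:
  //
  //   addArtifact(Paths.get("jars/udfs.jar"), staged, None)      // addJar + jarsList
  //   addArtifact(Paths.get("classes/Foo.class"), staged, None)  // copied under `classDir`
  //   addArtifact(Paths.get("pyfiles/dep.zip"), staged, None)    // addFile + pythonIncludeList
  //   addArtifact(Paths.get("archives/env.tgz"), staged, Some("env"))  // addArchive with fragment
  //   addArtifact(Paths.get("files/data.csv"), staged, None)     // addFile
  //   addArtifact(Paths.get("cache/<hash>"), staged, None)       // stored in the block manager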

  /**
   * Add locally-stored artifacts to the session. These artifacts come from a user-provided
   * permanent path that is directly accessible by the driver.
   *
   * Unlike the [[addArtifact]] method, this method does not delete the staged artifacts since
   * they reside in a permanent location.
   */
  private[sql] def addLocalArtifacts(artifacts: Seq[Artifact]): Unit = {
    artifacts.foreach { artifact =>
      artifact.storage match {
        case d: Artifact.LocalFile =>
          addArtifact(
            artifact.path,
            d.path,
            fragment = None,
            deleteStagedFile = false)
        case d: Artifact.InMemory =>
          val tempDir = Utils.createTempDir().toPath
          val tempFile = tempDir.resolve(artifact.path.getFileName)
          val outStream = Files.newOutputStream(tempFile)
          Utils.tryWithSafeFinallyAndFailureCallbacks {
            d.stream.transferTo(outStream)
            addArtifact(artifact.path, tempFile, fragment = None)
          }(finallyBlock = {
            outStream.close()
          })
        case _ =>
          throw SparkException.internalError(s"Unsupported artifact storage: ${artifact.storage}")
      }
    }
  }
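
  // Illustrative sketch (not part of the original source): registering a jar that already lives
  // at a permanent, driver-accessible location. The `Artifact` factory API is not shown in this
  // file, so `Artifact.newJarArtifact(...)` below is an assumed, possibly inexact, constructor.
  //
  //   val jar = Artifact.newJarArtifact(Paths.get("/opt/libs/my-udfs.jar"))
  //   addLocalArtifacts(Seq(jar))  // the source file is kept (deleteStagedFile = false)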

  /**
   * Returns a [[ClassLoader]] for session-specific jar/class file resources.
   */
  def classloader: ClassLoader = {
    val urls = getAddedJars :+ classDir.toUri.toURL
    val prefixes = SparkEnv.get.conf.get(CONNECT_SCALA_UDF_STUB_PREFIXES)
    val userClasspathFirst = SparkEnv.get.conf.get(EXECUTOR_USER_CLASS_PATH_FIRST)
    val loader = if (prefixes.nonEmpty) {
      // Two things you need to know about classloaders for all of this to make sense:
      // 1. A classloader needs to be able to fully define a class.
      // 2. Classes are loaded lazily. Only when a class is used are the classes it references
      //    loaded.
      // This makes stubbing a bit more complicated than you'd expect. We cannot put the stubbing
      // classloader as a fallback at the end of the loading process, because then classes that
      // have been found in one of the parent classloaders and that contain a reference to a
      // missing, to-be-stubbed class will still fail with classloading errors later on.
      // The way we currently fix this is by making the stubbing classloader the last classloader
      // the resulting loader delegates to.
      if (userClasspathFirst) {
        // USER -> SYSTEM -> STUB
        new ChildFirstURLClassLoader(
          urls.toArray,
          StubClassLoader(Utils.getContextOrSparkClassLoader, prefixes))
      } else {
        // SYSTEM -> USER -> STUB
        new ChildFirstURLClassLoader(
          urls.toArray,
          StubClassLoader(null, prefixes),
          Utils.getContextOrSparkClassLoader)
      }
    } else {
      if (userClasspathFirst) {
        new ChildFirstURLClassLoader(urls.toArray, Utils.getContextOrSparkClassLoader)
      } else {
        new URLClassLoader(urls.toArray, Utils.getContextOrSparkClassLoader)
      }
    }

    logDebug(s"Using class loader: $loader, containing urls: $urls")
    loader
  }
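
  // Delegation order of the loaders built above (illustrative summary, not in the original
  // source), where USER = session jars/classes, SYSTEM = the context/Spark classloader and
  // STUB = `StubClassLoader` for the configured prefixes:
  //
  //   stubbing on,  userClasspathFirst = true   =>  USER -> SYSTEM -> STUB
  //   stubbing on,  userClasspathFirst = false  =>  SYSTEM -> USER -> STUB
  //   stubbing off, userClasspathFirst = true   =>  USER -> SYSTEM
  //   stubbing off, userClasspathFirst = false  =>  SYSTEM -> USER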

  /**
   * Cleans up all resources specific to this `session`.
   */
  private[sql] def cleanUpResources(): Unit = {
    logDebug(
      s"Cleaning up resources for session with sessionUUID ${session.sessionUUID}")

    // Clean up added files
    val fileserver = SparkEnv.get.rpcEnv.fileServer
    val sparkContext = session.sparkContext
    val shouldUpdateEnv = sparkContext.addedFiles.contains(state.uuid) ||
      sparkContext.addedArchives.contains(state.uuid) ||
      sparkContext.addedJars.contains(state.uuid)
    if (shouldUpdateEnv) {
      sparkContext.addedFiles.remove(state.uuid).foreach(_.keys.foreach(fileserver.removeFile))
      sparkContext.addedArchives.remove(state.uuid).foreach(_.keys.foreach(fileserver.removeFile))
      sparkContext.addedJars.remove(state.uuid).foreach(_.keys.foreach(fileserver.removeJar))
      sparkContext.postEnvironmentUpdate()
    }

    // Clean up cached relations
    val blockManager = sparkContext.env.blockManager
    blockManager.removeCache(session.sessionUUID)

    // Clean up artifacts folder
    FileUtils.deleteDirectory(artifactPath.toFile)
  }

  def uploadArtifactToFs(
      remoteRelativePath: Path,
      serverLocalStagingPath: Path): Unit = {
    val normalizedRemoteRelativePath = normalizePath(remoteRelativePath)
    val hadoopConf = session.sparkContext.hadoopConfiguration
    assert(
      normalizedRemoteRelativePath.startsWith(
        ArtifactManager.forwardToFSPrefix + File.separator))
    val destFSPath = new FSPath(
      Paths
        .get(File.separator)
        .resolve(normalizedRemoteRelativePath.subpath(1, normalizedRemoteRelativePath.getNameCount))
        .toString)
    val localPath = serverLocalStagingPath
    val fs = destFSPath.getFileSystem(hadoopConf)
    if (fs.isInstanceOf[LocalFileSystem]) {
      val allowDestLocalConf =
        session.conf.get(SQLConf.ARTIFACT_COPY_FROM_LOCAL_TO_FS_ALLOW_DEST_LOCAL)
          .getOrElse(
            session.conf.get("spark.connect.copyFromLocalToFs.allowDestLocal").contains("true"))

      if (!allowDestLocalConf) {
        // For security reasons, uploading a file to a local file system destination path is
        // not supported by default; otherwise a user would be able to overwrite arbitrary
        // files on the Spark driver node.
        // The behavior can be temporarily allowed by setting the Spark config
        // `spark.sql.artifact.copyFromLocalToFs.allowDestLocal` to `true` when starting the
        // Spark driver; it should only be enabled for testing purposes.
        throw new SparkUnsupportedOperationException("_LEGACY_ERROR_TEMP_3161")
      }
    }
    fs.copyFromLocalFile(false, true, new FSPath(localPath.toString), destFSPath)
  }
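
  // Worked example (illustrative, not in the original source): an incoming relative path of
  // "forward_to_fs/user/alice/data.parquet" is mapped to the destination filesystem path
  // "/user/alice/data.parquet" (the "forward_to_fs" prefix is dropped and the remainder is
  // resolved against the root), after which the staged local file is copied there via
  // `FileSystem.copyFromLocalFile`.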
}

object ArtifactManager extends Logging {

  val forwardToFSPrefix = "forward_to_fs"

  val ARTIFACT_DIRECTORY_PREFIX = "artifacts"

  private[artifact] lazy val artifactRootDirectory =
    Utils.createTempDir(ARTIFACT_DIRECTORY_PREFIX).toPath
}
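
// Illustrative usage sketch (not part of the original Spark source). It assumes an active
// `SparkSession`; a session normally owns its own `ArtifactManager`, so constructing one
// directly here is purely for illustration, and the jar path and class name are hypothetical.
private[artifact] object ArtifactManagerUsageExample {

  def run(spark: SparkSession): Unit = {
    val manager = new ArtifactManager(spark)

    // Register a jar under the "jars/" prefix; the staged copy is kept because
    // `deleteStagedFile = false`.
    manager.addArtifact(
      remoteRelativePath = Paths.get("jars", "my-udfs.jar"),
      serverLocalStagingPath = Paths.get("/tmp/staging/my-udfs.jar"), // hypothetical staging file
      fragment = None,
      deleteStagedFile = false)

    // Run code with the session's artifacts visible on the context classloader.
    manager.withResources {
      val loader = Thread.currentThread().getContextClassLoader
      loader.loadClass("com.example.MyUdf") // hypothetical class provided by the uploaded jar
    }

    // Release all per-session artifacts once the session is shut down.
    manager.cleanUpResources()
  }
}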



