All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.cloudera.livy.utils.SparkYarnApp.scala Maven / Gradle / Ivy

/*
 * Licensed to Cloudera, Inc. under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  Cloudera, Inc. licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.cloudera.livy.utils

import java.util.concurrent.TimeoutException

import scala.annotation.tailrec
import scala.collection.JavaConverters._
import scala.collection.mutable.ArrayBuffer
import scala.concurrent._
import scala.concurrent.duration._
import scala.language.postfixOps
import scala.util.Try

import org.apache.hadoop.yarn.api.records.{ApplicationId, ApplicationReport, FinalApplicationStatus, YarnApplicationState}
import org.apache.hadoop.yarn.client.api.YarnClient
import org.apache.hadoop.yarn.conf.YarnConfiguration
import org.apache.hadoop.yarn.exceptions.ApplicationAttemptNotFoundException
import org.apache.hadoop.yarn.util.ConverterUtils

import com.cloudera.livy.{LivyConf, Logging, Utils}
import com.cloudera.livy.util.LineBufferedProcess

object SparkYarnApp extends Logging {

  def init(livyConf: LivyConf): Unit = {
    sessionLeakageCheckInterval = livyConf.getTimeAsMs(LivyConf.YARN_APP_LEAKAGE_CHECK_INTERVAL)
    sessionLeakageCheckTimeout = livyConf.getTimeAsMs(LivyConf.YARN_APP_LEAKAGE_CHECK_TIMEOUT)
    leakedAppsGCThread.setDaemon(true)
    leakedAppsGCThread.setName("LeakedAppsGCThread")
    leakedAppsGCThread.start()
  }

  // YarnClient is thread safe. Create once, share it across threads.
  lazy val yarnClient = {
    val c = YarnClient.createYarnClient()
    c.init(new YarnConfiguration())
    c.start()
    c
  }

  private def getYarnTagToAppIdTimeout(livyConf: LivyConf): FiniteDuration =
    livyConf.getTimeAsMs(LivyConf.YARN_APP_LOOKUP_TIMEOUT) milliseconds

  private def getYarnPollInterval(livyConf: LivyConf): FiniteDuration =
    livyConf.getTimeAsMs(LivyConf.YARN_POLL_INTERVAL) milliseconds

  private val appType = Set("SPARK").asJava

  private val leakedAppTags = new java.util.concurrent.ConcurrentHashMap[String, Long]()

  private var sessionLeakageCheckTimeout: Long = _

  private var sessionLeakageCheckInterval: Long = _

  private val leakedAppsGCThread = new Thread() {
    override def run(): Unit = {
      while (true) {
        if (!leakedAppTags.isEmpty) {
          // kill the app if found it and remove it if exceeding a threashold
          val iter = leakedAppTags.entrySet().iterator()
          var isRemoved = false
          val now = System.currentTimeMillis()
          val apps = yarnClient.getApplications(appType).asScala
          while(iter.hasNext) {
            val entry = iter.next()
            apps.find(_.getApplicationTags.contains(entry.getKey))
              .foreach({ e =>
                info(s"Kill leaked app ${e.getApplicationId}")
                yarnClient.killApplication(e.getApplicationId)
                iter.remove()
                isRemoved = true
              })
            if (!isRemoved) {
              if ((entry.getValue - now) > sessionLeakageCheckTimeout) {
                iter.remove()
                info(s"Remove leaked yarn app tag ${entry.getKey}")
              }
            }
          }
        }
        Thread.sleep(sessionLeakageCheckInterval)
      }
    }
  }


}

/**
 * Provide a class to control a Spark application using YARN API.
 *
 * @param appTag An app tag that can unique identify the YARN app.
 * @param appIdOption The appId of the YARN app. If this's None, SparkYarnApp will find it
 *                    using appTag.
 * @param process The spark-submit process launched the YARN application. This is optional.
 *                If it's provided, SparkYarnApp.log() will include its log.
 * @param listener Optional listener for notification of appId discovery and app state changes.
 */
class SparkYarnApp private[utils] (
    appTag: String,
    appIdOption: Option[String],
    process: Option[LineBufferedProcess],
    listener: Option[SparkAppListener],
    livyConf: LivyConf,
    yarnClient: => YarnClient = SparkYarnApp.yarnClient) // For unit test.
  extends SparkApp
  with Logging {
  import SparkYarnApp._

  private val appIdPromise: Promise[ApplicationId] = Promise()
  private[utils] var state: SparkApp.State = SparkApp.State.STARTING
  private var yarnDiagnostics: IndexedSeq[String] = IndexedSeq.empty[String]

  override def log(): IndexedSeq[String] =
    process.map(_.inputLines).getOrElse(ArrayBuffer.empty[String]) ++ yarnDiagnostics

  override def kill(): Unit = synchronized {
    if (isRunning) {
      try {
        val timeout = SparkYarnApp.getYarnTagToAppIdTimeout(livyConf)
        yarnClient.killApplication(Await.result(appIdPromise.future, timeout))
      } catch {
        // We cannot kill the YARN app without the app id.
        // There's a chance the YARN app hasn't been submitted during a livy-server failure.
        // We don't want a stuck session that can't be deleted. Emit a warning and move on.
        case _: TimeoutException | _: InterruptedException =>
          warn("Deleting a session while its YARN application is not found.")
          yarnAppMonitorThread.interrupt()
      } finally {
        process.foreach(_.destroy())
      }
    }
  }

  private def changeState(newState: SparkApp.State.Value): Unit = {
    if (state != newState) {
      listener.foreach(_.stateChanged(state, newState))
      state = newState
    }
  }

  /**
   * Find the corresponding YARN application id from an application tag.
   *
   * @param appTag The application tag tagged on the target application.
   *               If the tag is not unique, it returns the first application it found.
   *               It will be converted to lower case to match YARN's behaviour.
   * @return ApplicationId or the failure.
   */
  @tailrec
  private def getAppIdFromTag(
      appTag: String,
      pollInterval: Duration,
      deadline: Deadline): ApplicationId = {
    val appTagLowerCase = appTag.toLowerCase()

    // FIXME Should not loop thru all YARN applications but YarnClient doesn't offer an API.
    // Consider calling rmClient in YarnClient directly.
    yarnClient.getApplications(appType).asScala.find(_.getApplicationTags.contains(appTagLowerCase))
    match {
      case Some(app) => app.getApplicationId
      case None =>
        if (deadline.isOverdue) {
          process.foreach(_.destroy())
          leakedAppTags.put(appTag, System.currentTimeMillis())
          throw new Exception(s"No YARN application is found with tag $appTagLowerCase in " +
            livyConf.getTimeAsMs(LivyConf.YARN_APP_LOOKUP_TIMEOUT)/1000 + " seconds. " +
            "Please check your cluster status, it is may be very busy.")
        } else {
          Clock.sleep(pollInterval.toMillis)
          getAppIdFromTag(appTagLowerCase, pollInterval, deadline)
        }
    }
  }

  private def getYarnDiagnostics(appReport: ApplicationReport): IndexedSeq[String] = {
    Option(appReport.getDiagnostics)
      .filter(_.nonEmpty)
      .map[IndexedSeq[String]]("YARN Diagnostics:" +: _.split("\n"))
      .getOrElse(IndexedSeq.empty)
  }

  private def isRunning: Boolean = {
    state != SparkApp.State.FAILED && state != SparkApp.State.FINISHED &&
      state != SparkApp.State.KILLED
  }

  // Exposed for unit test.
  private[utils] def mapYarnState(
      appId: ApplicationId,
      yarnAppState: YarnApplicationState,
      finalAppStatus: FinalApplicationStatus): SparkApp.State.Value = {
    yarnAppState match {
      case (YarnApplicationState.NEW |
            YarnApplicationState.NEW_SAVING |
            YarnApplicationState.SUBMITTED |
            YarnApplicationState.ACCEPTED) => SparkApp.State.STARTING
      case YarnApplicationState.RUNNING => SparkApp.State.RUNNING
      case YarnApplicationState.FINISHED =>
        finalAppStatus match {
          case FinalApplicationStatus.SUCCEEDED => SparkApp.State.FINISHED
          case FinalApplicationStatus.FAILED => SparkApp.State.FAILED
          case FinalApplicationStatus.KILLED => SparkApp.State.KILLED
          case s =>
            error(s"Unknown YARN final status $appId $s")
            SparkApp.State.FAILED
        }
      case YarnApplicationState.FAILED => SparkApp.State.FAILED
      case YarnApplicationState.KILLED => SparkApp.State.KILLED
    }
  }

  // Exposed for unit test.
  // TODO Instead of spawning a thread for every session, create a centralized thread and
  // batch YARN queries.
  private[utils] val yarnAppMonitorThread = Utils.startDaemonThread(s"yarnAppMonitorThread-$this") {
    try {
      // Wait for spark-submit to finish submitting the app to YARN.
      process.foreach { p =>
        val exitCode = p.waitFor()
        if (exitCode != 0) {
          throw new Exception(s"spark-submit exited with code $exitCode}.\n" +
            s"${process.get.inputLines.mkString("\n")}")
        }
      }

      // If appId is not known, query YARN by appTag to get it.
      val appId = try {
        appIdOption.map(ConverterUtils.toApplicationId).getOrElse {
          val pollInterval = getYarnPollInterval(livyConf)
          val deadline = getYarnTagToAppIdTimeout(livyConf).fromNow
          getAppIdFromTag(appTag, pollInterval, deadline)
        }
      } catch {
        case e: Exception =>
          appIdPromise.failure(e)
          throw e
      }
      appIdPromise.success(appId)

      Thread.currentThread().setName(s"yarnAppMonitorThread-$appId")
      listener.foreach(_.appIdKnown(appId.toString))

      val pollInterval = SparkYarnApp.getYarnPollInterval(livyConf)
      var appInfo = AppInfo()
      while (isRunning) {
        try {
          Clock.sleep(pollInterval.toMillis)

          // Refresh application state
          val appReport = yarnClient.getApplicationReport(appId)
          yarnDiagnostics = getYarnDiagnostics(appReport)
          changeState(mapYarnState(
            appReport.getApplicationId,
            appReport.getYarnApplicationState,
            appReport.getFinalApplicationStatus))

          val latestAppInfo = {
            val attempt =
              yarnClient.getApplicationAttemptReport(appReport.getCurrentApplicationAttemptId)
            val driverLogUrl =
              Try(yarnClient.getContainerReport(attempt.getAMContainerId).getLogUrl)
                .toOption
            AppInfo(driverLogUrl, Option(appReport.getTrackingUrl))
          }

          if (appInfo != latestAppInfo) {
            listener.foreach(_.infoChanged(latestAppInfo))
            appInfo = latestAppInfo
          }
        } catch {
          // This exception might be thrown during app is starting up. It's transient.
          case e: ApplicationAttemptNotFoundException =>
          // Workaround YARN-4411: No enum constant FINAL_SAVING from getApplicationAttemptReport()
          case e: IllegalArgumentException =>
            if (e.getMessage.contains("FINAL_SAVING")) {
              debug("Encountered YARN-4411.")
            } else {
              throw e
            }
        }
      }

      debug(s"$appId $state ${yarnDiagnostics.mkString(" ")}")
    } catch {
      case e: InterruptedException =>
        yarnDiagnostics = ArrayBuffer("Session stopped by user.")
        changeState(SparkApp.State.KILLED)
      case e: Throwable =>
        error(s"Error whiling refreshing YARN state: $e")
        yarnDiagnostics = ArrayBuffer(e.toString, e.getStackTrace().mkString(" "))
        changeState(SparkApp.State.FAILED)
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy