package gridscale.condor

import gridscale.*
import gridscale.cluster.BatchScheduler.BatchJob
import gridscale.cluster.{ BatchScheduler, HeadNode, Requirement }
import squants._
import gridscale.tools._
import squants.information._

import scala.language.{ higherKinds, postfixOps }


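/**
 * Description of a job to be submitted to an HTCondor scheduler.
 *
 * @param executable    path of the executable to run on the execution node
 * @param arguments     arguments passed to the executable
 * @param workDirectory directory used as initialdir for the job
 * @param memory        optional request_memory for the job
 * @param nodes         optional number of machines; when set, the parallel universe is used
 * @param coreByNode    optional number of cpus requested per node
 * @param requirements  optional HTCondor requirements expression
 */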
case class CondorJobDescription(
  executable: String,
  arguments: String,
  workDirectory: String,
  memory: Option[Information] = None,
  nodes: Option[Int] = None,
  coreByNode: Option[Int] = None,
  requirements: Option[CondorRequirement] = None)

object impl {

  import Requirement._
  import BatchScheduler.{ BatchJob, output, error }

  def toScript(description: CondorJobDescription)(uniqId: String) = {

    import description._

    val header = "#!/bin/bash\n"

    val core = Seq(
      "output = " -> Some(output(uniqId)),
      "error = " -> Some(error(uniqId)),
      "request_memory = " -> memory.map(m ⇒ s"${m.toMBString} MB"),

      "initialdir = " -> Some(workDirectory),
      "executable = " -> Some(executable),
      "arguments = " -> Some(s""""$arguments""""))

    // TODO: are these features available in Condor?
    //    queue match {
    //      case Some(q) ⇒ buffer += "#PBS -q " + q
    //      case None ⇒
    //    }
    //
    //    wallTime match {
    //      case Some(t) ⇒
    //        val df = new java.text.SimpleDateFormat("HH:mm:ss")
    //        df.setTimeZone(java.util.TimeZone.getTimeZone("GMT"))
    //        buffer += "#PBS -lwalltime=" + df.format(t * 60 * 1000)
    //      case None ⇒
    //    }

    val universe = nodes match {
      case Some(n) ⇒
        s"""universe = parallel
          |machine_count = $n
          |
          |request_cpus = ${coreByNode.getOrElse(1)}
          """.stripMargin
      case None ⇒ "universe = vanilla"
    }

    val reqAll = Seq(requirements, coreByNode.map(c ⇒ CondorRequirement(c.toString)))
    val reqList = reqAll.foldLeft(Seq.empty[CondorRequirement])(_ ++ _)

    // 'queue N' submits N instances of the job; we always submit a single instance with 'queue 1'
    s"""$header
         |$universe
         |${requirementsString(core)}
         |
         |${if (reqList.nonEmpty) "requirements = " + CondorRequirement(reqList.mkString).toCondor else ""}
         |
         |getenv = True
         |
         |queue 1
         |""".stripMargin
  }
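
  // For illustration, a description with memory = Some(Megabytes(512)), no nodes and no
  // requirements would render roughly as follows (assuming BatchScheduler.output/error
  // name the files "<uniqId>.out" / "<uniqId>.err"):
  //
  //   #!/bin/bash
  //
  //   universe = vanilla
  //   output = <uniqId>.out
  //   error = <uniqId>.err
  //   request_memory = 512 MB
  //   initialdir = <workDirectory>
  //   executable = <executable>
  //   arguments = "<arguments>"
  //
  //   getenv = True
  //
  //   queue 1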

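  // condor_submit typically ends its output with a line such as
  // "1 job(s) submitted to cluster 42." (exact wording is an assumption);
  // drop the trailing dot and keep the last whitespace-delimited token ("42")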
  def retrieveJobId(out: String) = out.trim.reverse.tail.takeWhile(_ != ' ').reverse

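  // HTCondor JobStatus codes: 0 = Unexpanded, 1 = Idle, 2 = Running, 3 = Removed,
  // 4 = Completed, 5 = Held, 6 = Submission error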
  def translateStatus(status: String, command: String): JobState =
    status match {
      case "3" | "4"       ⇒ JobState.Done
      case "2"             ⇒ JobState.Running
      // held jobs (status = 5) are deliberately reported as Submitted rather than Running
      case "0" | "1" | "5" ⇒ JobState.Submitted
      case "6"             ⇒ JobState.Failed
      case _               ⇒ throw new RuntimeException(s"Unrecognized state $status retrieved from $command")
    }

  def formatError(command: String, cmdRet: ExecutionResult): RuntimeException =
    new RuntimeException(s"Could not retrieve job state from $command [output: ${cmdRet.stdOut}] [error: ${cmdRet.stdErr}]")

  /**
   * The queried attribute is printed as "JobStatus = <n>"; once split on '=',
   * the actual state is the last element.
   *
   * @param output state command output to parse
   * @return parsed state, ready to be translated
   */
  def parseStateInQueue(output: String) = output.split('=').map(_ trim).last

  /**
   * The JobStatus line could not be matched with a regex on the whole output
   * for some reason, hence this one-liner: keep only the "JobStatus = ..." line
   * of the condor_history output and extract its value.
   *
   * @param output state command output to parse
   * @return parsed state, ready to be translated
   */
  def parseStateFinished(output: String) =
    output.split("\n").filter(_ matches "^JobStatus = .*").head.split('=').map(_ trim).last

  // FIXME fails to compile if the second parameter list is made implicit
  def queryState(server: HeadNode, job: BatchJob): JobState =

    // if the job is still in the queue, its state is returned by condor_q...
    val queryInQueueCommand = s"condor_q ${job.jobId} -long -attributes JobStatus"
    // ...but if the job has already completed, its state is returned by condor_history...
    val queryFinishedCommand = s"condor_history ${job.jobId} -long"

    val cmdRet = server.execute(queryInQueueCommand)
    val ExecutionResult(ret, stdOut, _) = cmdRet
    if (ret != 0) throw formatError(queryInQueueCommand, cmdRet)

    val state: JobState =
      if stdOut.nonEmpty
      then
        val state = parseStateInQueue(stdOut)
        translateStatus(state, queryInQueueCommand)
      else
        val cmdRet = server.execute(queryFinishedCommand)
        val state = parseStateFinished(cmdRet.stdOut)
        translateStatus(state, queryFinishedCommand)
    state

}

import impl._
import BatchScheduler._
import shell.BashShell._

val scriptSuffix = ".condor"

def submit(server: HeadNode, jobDescription: CondorJobDescription): BatchJob =
  BatchScheduler.submit(
    jobDescription.workDirectory,
    toScript(jobDescription),
    scriptSuffix,
    (f, _) ⇒ s"condor_submit $f",
    impl.retrieveJobId,
    server)

def state(server: HeadNode, job: BatchJob): JobState = queryState(server, job)

def clean(server: HeadNode, job: BatchJob): Unit =
  BatchScheduler.clean(
    s"condor_rm ${job.jobId}",
    scriptSuffix)(server, job)

def stdOut(server: HeadNode, job: BatchJob): String = BatchScheduler.stdOut(server, job)
def stdErr(server: HeadNode, job: BatchJob): String = BatchScheduler.stdErr(server, job)
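
/*
 * A minimal end-to-end usage sketch. Assumptions: a HeadNode instance is obtained
 * elsewhere (e.g. via gridscale.ssh or gridscale.local, construction not shown), and
 * the executable, arguments and paths below are purely illustrative.
 */
def exampleRun(headNode: HeadNode): Unit =
  val description = CondorJobDescription(
    executable = "/bin/echo",
    arguments = "hello condor",
    workDirectory = "/tmp/condor-example",
    memory = Some(Megabytes(512)))

  // submit the job and keep its BatchJob handle
  val job = submit(headNode, description)

  // poll the scheduler every 5 seconds until the job leaves the Submitted/Running states
  @annotation.tailrec
  def waitUntilFinished(): JobState =
    state(headNode, job) match
      case JobState.Submitted | JobState.Running ⇒
        Thread.sleep(5000)
        waitUntilFinished()
      case terminal ⇒ terminal

  val finalState = waitUntilFinished()
  println(s"job ${job.jobId} finished in state $finalState")
  println(stdOut(headNode, job))

  // remove the job from the scheduler and clean up the generated submission files
  clean(headNode, job)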




