All Downloads are FREE. Search and download functionalities are using the official Maven repository.

givers.moonlight.v2.JobDispatcher.scala Maven / Gradle / Ivy

The newest version!
package givers.moonlight.v2

import akka.actor.Cancellable
import akka.actor.typed.Scheduler
import com.codahale.metrics.MetricRegistry
import com.google.inject.Singleton
import givers.moonlight.util.Metrics.MetricRegistryOps
import givers.moonlight.{BackgroundJob, JobExecutor}
import givers.moonlight.util.{DateTimeFactory, Metrics}
import givers.moonlight.util.RichDate._
import givers.moonlight.util.RichFuture.TimeoutAwareFuture
import givers.moonlight.v2.repository.BackgroundJobRepository
import play.api.Logger
import play.api.inject.Injector

import java.time.Instant
import java.util.concurrent.atomic.{AtomicBoolean, AtomicInteger}
import javax.inject.Inject
import scala.concurrent.duration.{DurationInt, FiniteDuration}
import scala.concurrent.{ExecutionContext, Future, Promise}
import scala.util.control.NoStackTrace
import scala.util.{Failure, Random, Success, Try}

case class JobExecutionError(realCause: Throwable) extends Exception("Job execution error", realCause)
case class JobTypeExecutorAlreadyExists(typeId: String) extends Exception(s"Job executor for $typeId already exists")

case class JobExecutorNotFound(typeId: String)
    extends Exception(s"Job executor for $typeId not found")
    with NoStackTrace

@Singleton
/**
 * Dispatches jobs
 *
 * @param bgJobRepo
 *   background jobs repository
 * @param settings
 *   moonlight settings
 * @param dateTimeFactory
 *   date/time factory
 * @param executionContext
 *   execution context
 * @param injector
 *   DI system injector
 * @param scheduler
 *   task scheduler
 * @param random
 *   randomizer for duration randomization
 */
class JobDispatcher @Inject() (
  bgJobRepo: BackgroundJobRepository,
  settings: MoonlightSettings,
  dateTimeFactory: DateTimeFactory,
  metricRegistry: MetricRegistry
)(implicit executionContext: ExecutionContext, injector: Injector, scheduler: Scheduler, random: Random) {
  private[this] val logger = Logger(this.getClass)

  private val activeExecutorsCount = metricRegistry.settableGauge[Int](Metrics.jobDispatcher.activeExecutorsCount)
  private val readyToStartCount = metricRegistry.settableGauge[Int](Metrics.jobDispatcher.jobsReadyToStart)
  private val jobsOverall = metricRegistry.settableGauge[Int](Metrics.jobDispatcher.jobsOverall)

  private val maintenanceDeleteOldTimer = metricRegistry.timer(Metrics.jobDispatcher.maintenanceOldJobs)
  private val maintenanceDeleteOldErrorCount = metricRegistry.counter(Metrics.jobDispatcher.maintenanceOldJobsErrors)

  private val maintenanceUnstuckTimer = metricRegistry.timer(Metrics.jobDispatcher.maintenanceUnstuck)
  private val maintenanceUnstuckErrorCount = metricRegistry.counter(Metrics.jobDispatcher.maintenanceUnstuckErrors)

  private val concurrentFailCount = metricRegistry.counter(Metrics.jobDispatcher.concurrentFail)

  val typeExecutors: Map[String, JobExecutor[_]] = {
    settings.executors.foldLeft(Map.empty[String, JobExecutor[_]]) { case (current, executor) =>
      current.updatedWith(executor.jobType.id) {
        case Some(_) => throw JobTypeExecutorAlreadyExists(executor.jobType.id)
        case _ => Some(executor)
      }
    }
  }

  /**
   * Schedules 2 maintenance tasks
   *   - first will delete old jobs
   *   - second will "unstuck" jobs
   *
   * @return
   */
  private def scheduleMaintenance(): Cancellable = {
    scheduler.scheduleAtFixedRate(0.minutes, settings.maintenanceInterval)(() => {
      val oldJobsDeleteTimer = maintenanceDeleteOldTimer.time()
      bgJobRepo
        .deleteJobsSucceededOrFailedBefore(dateTimeFactory.now.sub(settings.completedJobsTtl).midnight)
        .onComplete {
          case Success(0) =>
            oldJobsDeleteTimer.stop()
          case Success(numberOfDeletedJobs) =>
            oldJobsDeleteTimer.stop()
            logger.warn(s"$numberOfDeletedJobs succeeded/failed jobs were removed")
          case Failure(e) =>
            oldJobsDeleteTimer.stop()
            maintenanceDeleteOldErrorCount.inc()
            logger.error("maintenance (delete task) error", e)
        }

      val unstuckTimer = maintenanceUnstuckTimer.time()
      val _ = bgJobRepo.maintainExpiredJobs(settings.jobRunTimeout, dateTimeFactory.now).onComplete {
        case Success(0) =>
          unstuckTimer.stop()
        case Success(numberOfMaintainedJobs) =>
          unstuckTimer.stop()
          logger.warn(s"$numberOfMaintainedJobs expired jobs were maintained")
        case Failure(e) =>
          unstuckTimer.stop()
          maintenanceUnstuckErrorCount.inc()
          logger.error("maintenance (unstuck task) error", e)
      }
    })
  }

  private def scheduleMetrics(): Cancellable = {
    scheduler.scheduleAtFixedRate(0.minutes, settings.countMetricsCollectionInterval)(() => {
      bgJobRepo
        .countPendingJobReadyForStart(dateTimeFactory.now)
        .onComplete {
          case Success(count) =>
            readyToStartCount.setValue(count)
          case Failure(e) =>
            readyToStartCount.setValue(-1)
            logger.error("maintenance (delete task) error", e)
        }

      bgJobRepo.count
        .onComplete {
          case Success(count) =>
            jobsOverall.setValue(count)
          case Failure(e) =>
            jobsOverall.setValue(-1)
            logger.error("maintenance (delete task) error", e)
        }
    })
  }

  /**
   * Get job that can be started now. First checks pending jobs, if there is no any then checks failed jobs that can be
   * retried
   *
   * @return
   */
  private def getJobReadyForStart: Future[Option[BackgroundJob]] = {
    val now = dateTimeFactory.now
    val lastAttemptAfter = now.sub(settings.betweenRunAttemptInterval)

    bgJobRepo.getPendingJobReadyForStart(now).flatMap {
      case None => bgJobRepo.getFailedJobReadyForRetry(settings.maxJobRetries, lastAttemptAfter)
      case res => Future.successful(res)
    }
  }

  /**
   * Dispatcher main loop Picks tasks and runs it, pauses if no tasks
   *
   * @return
   *   loop cancel control, future that will be finished after cancel
   */
  def runLoop(): (Cancellable, Future[Unit]) = {
    val runPromise = Promise[Unit]
    val untilRunning = new AtomicBoolean(true)

    // the idea is to complete runPromise only when last executor is closed
    val executorsLeft = new AtomicInteger(settings.parallelism)
    val activeExecutors = new AtomicInteger(0)

    def runForever(threadSeqId: Int): Unit = {
      Option.when(untilRunning.get())(()) match {
        case Some(_) =>
          getJobReadyForStart
            .onComplete {
              case Success(Some(job)) =>
                activeExecutorsCount.setValue(activeExecutors.incrementAndGet())
                runJob(job, threadSeqId)
                  .foreach { _ =>
                    activeExecutorsCount.setValue(activeExecutors.decrementAndGet())
                    runForever(threadSeqId)
                  }
              // maybe db error
              case Failure(e) =>
                logger.error("can't get job", e)
                scheduler.scheduleOnce(settings.pauseDurationWhenNoJobsRandomized, () => runForever(threadSeqId))
              // no jobs for start
              case _ =>
                scheduler.scheduleOnce(settings.pauseDurationWhenNoJobsRandomized, () => runForever(threadSeqId))
            }
        case _ =>
          logger.info(s"$threadSeqId closing executor")
          if (executorsLeft.decrementAndGet() == 0) {
            logger.info(s"$threadSeqId finishing run loop")
            runPromise.trySuccess(())
          }
      }
    }

    // respect required parallelism
    (1 to settings.parallelism)
      .foreach { threadSeqId =>
        scheduler.scheduleOnce(JobDispatcher.jobStartDelay, () => runForever(threadSeqId))
      }

    val maintenanceCancelControl = scheduleMaintenance()
    val metricsCancelControl = scheduleMetrics()

    val loopCancelControl = new Cancellable {
      override def cancel(): Boolean = {
        metricsCancelControl.cancel()
        maintenanceCancelControl.cancel()
        untilRunning.set(false)

        true
      }

      override def isCancelled: Boolean = !untilRunning.get() && maintenanceCancelControl.isCancelled
    }

    (loopCancelControl, runPromise.future)
  }

  /**
   * Runs specified background job
   *
   * @param job
   *   background job
   * @param threadSeqId
   *   number from 1 to 
   * @return
   */
  private def runJob(job: BackgroundJob, threadSeqId: Int): Future[Unit] = {
    val startInMillis = Instant.now().toEpochMilli
    def duration: Long = Instant.now().toEpochMilli - startInMillis

    metricRegistry.counter(Metrics.executor.started(job.jobType)).inc()

    val startResult = for {
      executor <- Future.fromTry(
        Try(
          typeExecutors.getOrElse(
            job.jobType,
            throw JobExecutorNotFound(job.jobType)
          )
        )
      )
      isStarted <- bgJobRepo.tryMarkJobAsStarted(
        job.id,
        job.tryCount + 1,
        dateTimeFactory.now,
        settings.maxJobRetries,
        dateTimeFactory.now,
        settings.betweenRunAttemptInterval
      )
    } yield (isStarted, executor)

    startResult
      .flatMap {
        case (false, _) =>
          logger.warn(s"#$threadSeqId job ${job.id} can't be started because it's started by someone else")
          concurrentFailCount.inc()
          Future.successful(())
        case (_, executor) =>
          logger.info(s"#$threadSeqId starting executor ${executor.getClass.getSimpleName} for job ${job.id}")
          val timer = metricRegistry.timer(Metrics.executor.duration(job.jobType)).time()

          val runFuture = for {
            _ <- executor.run(job.paramsInJsonString).withTimeout(settings.jobRunTimeout)
            _ <- bgJobRepo.markJobAsSucceed(job.id, dateTimeFactory.now)
          } yield {
            logger.info(
              s"#$threadSeqId executor ${executor.getClass.getSimpleName} with job " +
                s"${job.id} finished successfully. Took $duration millis"
            )
          }

          runFuture.onComplete { res =>
            timer.stop()

            res.foreach(_ => metricRegistry.counter(Metrics.executor.succeeded(job.jobType)).inc())
            res.failed.foreach(_ => metricRegistry.counter(Metrics.executor.failed(job.jobType)).inc())
          }

          runFuture.recover(e => throw JobExecutionError(e))
      }
      // this "recover" will try to mark job as failed
      .recoverWith {
        case JobExecutionError(e) =>
          logger.error(
            s"#$threadSeqId error occurred while running the job " +
              s"(id=${job.id}, type=${job.jobType}, params=${job.paramsInJsonString}, tryCount=${job.tryCount}). " +
              s"Took $duration millis",
            e
          )
          bgJobRepo.markJobAsFailed(job.id, e, dateTimeFactory.now)
        case e: JobExecutorNotFound =>
          logger.error(
            s"#$threadSeqId error occurred while searching for executor " +
              s"(id=${job.id}, type=${job.jobType}, params=${job.paramsInJsonString}, tryCount=${job.tryCount}). " +
              s"Took $duration millis",
            e
          )
          bgJobRepo.markJobAsFailed(job.id, e, dateTimeFactory.now)
      }
      // in case of other error like db failure
      .recover { case e =>
        logger.error(s"#$threadSeqId critical failure job $job failure. Took $duration millis", e)
        ()
      }
  }
}

object JobDispatcher {
  val jobStartDelay: FiniteDuration = 1.milli
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy