// poapsis.ortserver.transport.kubernetes-jobmonitor.0.1.0-RC2.source-code.JobHandler.kt Maven / Gradle / Ivy
//
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of kubernetes-jobmonitor Show documentation
// Part of the ORT Server, the reference implementation of Eclipse Apoapsis.
// There is a newer version: 0.1.0-RC1
// Show newest version
/*
 * Copyright (C) 2023 The ORT Server Authors (See <https://github.com/eclipse-apoapsis/ort-server/blob/main/NOTICE>)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-License-Identifier: Apache-2.0
 * License-Filename: LICENSE
 */

package org.eclipse.apoapsis.ortserver.transport.kubernetes.jobmonitor

import io.kubernetes.client.openapi.apis.BatchV1Api
import io.kubernetes.client.openapi.apis.CoreV1Api
import io.kubernetes.client.openapi.models.V1Job
import io.kubernetes.client.openapi.models.V1Pod

import java.time.OffsetDateTime
import java.util.TreeMap

import kotlinx.coroutines.sync.Mutex
import kotlinx.coroutines.sync.withLock
import kotlinx.datetime.Clock
import kotlinx.datetime.Instant

import org.eclipse.apoapsis.ortserver.transport.Endpoint
import org.eclipse.apoapsis.ortserver.utils.logging.withMdcContext

import org.slf4j.LoggerFactory

/**
 * An internal helper class providing functionality to deal with jobs.
 *
 * The class lists, inspects, and deletes jobs (together with the pods created for them) in the namespace defined by
 * the passed in [MonitorConfig]. To avoid handling the same job multiple times, the names of processed jobs are
 * remembered for the duration of [MonitorConfig.recentlyProcessedInterval]. Access to this bookkeeping is guarded by
 * a [Mutex].
 */
internal class JobHandler(
    /** The API to access job objects. */
    private val jobApi: BatchV1Api,

    /** The core API. */
    private val api: CoreV1Api,

    /** The object to send notifications about failed jobs. */
    private val notifier: FailedJobNotifier,

    /** The configuration. */
    private val config: MonitorConfig,
) {
    companion object {
        private val logger = LoggerFactory.getLogger(JobHandler::class.java)

        /** Constant for a condition type that indicates that a job has failed. */
        private const val FAILED_CONDITION = "Failed"

        /** Constant for a condition type that indicates a normal completion of a job. */
        private const val COMPLETE_CONDITION = "Complete"

        /** A set with the condition types that indicate that a job is completed. */
        private val COMPLETED_CONDITIONS = setOf(COMPLETE_CONDITION, FAILED_CONDITION)

        /** The label which stores the ORT run ID. */
        private const val RUN_ID_LABEL = "run-id"

        /** A prefix for the name of a label storing a part of the trace ID. */
        private const val TRACE_LABEL_PREFIX = "trace-id-"

        /**
         * A label selector to find only jobs for ORT Server components. Only those are handled when looking for
         * completed or failed jobs.
         */
        private val workerJobsLabelSelector = "ort-worker in " +
                Endpoint.entries().joinToString(",", prefix = "(", postfix = ")") { it.configPrefix }

        /**
         * Return a flag whether this job has failed. For jobs that are still running the result is *false*.
         */
        fun V1Job.isFailed(): Boolean = status?.conditions.orEmpty().any { it.type == FAILED_CONDITION }

        /**
         * Return a flag whether this job has completed, either successfully or in failure state.
         * See https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.27/#jobstatus-v1-batch.
         */
        fun V1Job.isCompleted(): Boolean =
            status?.completionTime != null ||
                    status?.conditions.orEmpty().any { it.type in COMPLETED_CONDITIONS }

        /**
         * Return a flag whether this job has run into a timeout according to the given [threshold]. This means that
         * the job was started before the given date and is not yet completed. A job without a start time is never
         * considered to be timed out.
         */
        fun V1Job.isTimeout(threshold: OffsetDateTime): Boolean =
            !isCompleted() && status?.startTime?.isBefore(threshold) == true

        /**
         * Obtain the ID of the ORT run from this job from the label used for this purpose. If the label is not set
         * or its value is not a valid number, return *null*.
         */
        val V1Job.ortRunId: Long?
            get() = metadata?.labels?.get(RUN_ID_LABEL)?.toLongOrNull()

        /**
         * Return the trace ID from this job for the labels used for this purpose. The values of all labels whose
         * names consist of [TRACE_LABEL_PREFIX] followed by a numeric index are concatenated in the order of their
         * indices. Labels with the prefix but without a numeric index are skipped, so that a malformed label cannot
         * prevent the job from being handled. If there are no such labels, the result is an empty string.
         */
        fun V1Job.traceId(): String =
            metadata?.labels.orEmpty()
                .mapNotNull { (labelName, labelValue) ->
                    labelName.takeIf { it.startsWith(TRACE_LABEL_PREFIX) }
                        ?.removePrefix(TRACE_LABEL_PREFIX)
                        ?.toIntOrNull()
                        ?.let { index -> index to labelValue }
                }
                .sortedBy { it.first }
                .joinToString(separator = "") { it.second }

        /**
         * Return a flag whether this job has completed before the given [time]. Note that a completion time is only
         * available for jobs that have been completed normally; in case of failed jobs, it is undefined. For such
         * jobs, this function returns *true*, since failed jobs need to be handled immediately.
         */
        private fun V1Job.completedBefore(time: OffsetDateTime): Boolean {
            if (!isCompleted()) return false

            val completionTime = status?.completionTime
            return completionTime == null || completionTime.isBefore(time)
        }
    }

    /** A set with the names of the jobs that have been processed recently. */
    private val recentJobNames = mutableSetOf<String>()

    /**
     * Stores the times when jobs have been processed. Each timestamp maps to all job names recorded at that instant,
     * so that jobs sharing a timestamp do not overwrite each other.
     */
    private val processingTimes = TreeMap<Instant, MutableSet<String>>()

    /** A mutex for controlling access to the data structures for recent jobs. */
    private val recentJobsMutex = Mutex()

    /**
     * Return a list with all currently existing jobs that have been completed before the given [time]. Only jobs
     * matching the label selector for ORT Server workers are taken into account.
     */
    fun findJobsCompletedBefore(time: OffsetDateTime): List<V1Job> =
        listJobs(workerJobsLabelSelector).filter { it.completedBefore(time) }

    /**
     * Return a list with all currently existing jobs for the worker defined by the given [endpoint]. The jobs are
     * selected via the `ort-worker` label; no filtering by job state is applied here.
     */
    fun findJobsForWorker(endpoint: Endpoint<*>): List<V1Job> {
        val labelSelector = "ort-worker=${endpoint.configPrefix}"

        return listJobs(labelSelector)
    }

    /**
     * Delete the given [job]. Check whether it is a failed job. If so, try sending a corresponding notification
     * using [notifier] and delete the job only if this is successful. This operation is needed by both the reaper
     * and the monitor components when they detect a completed job.
     *
     * Jobs without a name and jobs that have already been processed within the configured time window are ignored.
     */
    suspend fun deleteAndNotifyIfFailed(job: V1Job) {
        val jobName = job.metadata?.name ?: return

        // This check also records the job as processed, so it must be done exactly once per invocation.
        if (!canProcess(jobName)) return

        withMdcContext(
            "traceId" to (job.traceId().takeIf { it.isNotEmpty() } ?: "unknown"),
            "ortRunId" to (job.ortRunId?.toString() ?: "unknown")
        ) {
            runCatching {
                if (job.isFailed()) {
                    logger.info("Detected a failed job '{}'.", jobName)
                    logger.debug("Details of the failed job: {}", job)

                    notifier.sendFailedJobNotification(job)
                }
            }.onFailure { exception ->
                // Keep the job, so that the notification can be retried when the job is processed again.
                logger.error("Failed to notify about failed job: '{}'.", jobName, exception)
            }.onSuccess {
                deleteJob(jobName)
            }
        }
    }

    /**
     * Delete the job with the given [jobName] and the pods that have been created for it. Log occurring exceptions,
     * but ignore them otherwise. This includes failures when looking up the pods of the job; in this case, no pods
     * are deleted.
     */
    fun deleteJob(jobName: String) {
        runCatching {
            jobApi.deleteNamespacedJob(jobName, config.namespace, null, null, null, null, null, null)
        }.onFailure { e ->
            logger.error("Could not remove job '$jobName': $e.")
        }

        // The pods are cleaned up even if the deletion of the job itself has failed.
        runCatching {
            findPodsForJob(jobName)
        }.onFailure { e ->
            logger.error("Could not list pods for job '$jobName': $e.")
        }.getOrDefault(emptyList()).forEach(this::deletePod)
    }

    /**
     * Find all pods that have been created for the job with the specified [jobName]. The pods are selected via
     * their `job-name` label in the configured namespace.
     */
    private fun findPodsForJob(jobName: String): List<V1Pod> {
        val selector = "job-name=$jobName"

        return api.listNamespacedPod(
            config.namespace,
            null,
            null,
            null,
            null,
            selector,
            null,
            null,
            null,
            null,
            false
        ).items
    }

    /**
     * Delete the given [pod]. Kubernetes does not automatically remove completed pods. Therefore, this class does the
     * removal when the associated jobs are completed. Pods without a name are ignored; exceptions are logged, but
     * not propagated.
     */
    private fun deletePod(pod: V1Pod) {
        pod.metadata?.name?.let { podName ->
            logger.info("Deleting pod $podName.")
            runCatching {
                api.deleteNamespacedPod(podName, config.namespace, null, null, null, null, null, null)
            }.onFailure { e ->
                logger.error("Could not remove pod '$podName': $e.")
            }
        }
    }

    /**
     * Check whether the job with the given [jobName] can be processed. Return *false* if this job has already been
     * processed in the configured time window. Also update the data structures for the recently processed jobs:
     * expired entries are dropped, and a job that can be processed is recorded with the current time.
     */
    private suspend fun canProcess(jobName: String): Boolean {
        val now = Clock.System.now()
        val recentThreshold = now - config.recentlyProcessedInterval

        return recentJobsMutex.withLock {
            // headMap() returns a live view on all entries with a key strictly before the threshold. Clearing the
            // view therefore removes the expired entries from the underlying map.
            val expiredEntries = processingTimes.headMap(recentThreshold)
            expiredEntries.values.forEach { expiredNames -> recentJobNames.removeAll(expiredNames) }
            expiredEntries.clear()

            val isNewJob = recentJobNames.add(jobName)
            if (isNewJob) {
                // Several jobs may be recorded for the same instant; collect them instead of overwriting.
                processingTimes.getOrPut(now) { mutableSetOf() }.add(jobName)
            }

            isNewJob
        }
    }

    /**
     * Return a list with the jobs in the configured namespace. Apply the given [labelSelector] filter.
     */
    private fun listJobs(labelSelector: String?): List<V1Job> =
        jobApi.listNamespacedJob(
            config.namespace,
            null,
            null,
            null,
            null,
            labelSelector,
            null,
            null,
            null,
            null,
            false
        ).items
}