All downloads are free. The search and download features use the official Maven repository.

nextflow.executor.GridTaskHandler.groovy Maven / Gradle / Ivy

Go to download

A DSL modelled around the UNIX pipe concept, that simplifies writing parallel and scalable pipelines in a portable manner

The newest version!
/*
 * Copyright 2013-2024, Seqera Labs
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package nextflow.executor

import static nextflow.processor.TaskStatus.*

import java.nio.file.Path
import java.nio.file.attribute.BasicFileAttributes
import java.time.temporal.ChronoUnit
import java.util.function.Predicate
import java.util.regex.Pattern

import dev.failsafe.Failsafe
import dev.failsafe.RetryPolicy
import dev.failsafe.event.EventListener
import dev.failsafe.event.ExecutionAttemptedEvent
import dev.failsafe.function.CheckedSupplier
import groovy.transform.CompileStatic
import groovy.transform.Memoized
import groovy.util.logging.Slf4j
import nextflow.exception.ProcessException
import nextflow.exception.ProcessFailedException
import nextflow.exception.ProcessNonZeroExitStatusException
import nextflow.file.FileHelper
import nextflow.fusion.FusionAwareTask
import nextflow.fusion.FusionHelper
import nextflow.processor.TaskArrayRun
import nextflow.processor.TaskHandler
import nextflow.processor.TaskRun
import nextflow.trace.TraceRecord
import nextflow.util.CmdLineHelper
import nextflow.util.Duration
import nextflow.util.Throttle
/**
 * Handles a job execution in the underlying grid platform
 */
@Slf4j
@CompileStatic
class GridTaskHandler extends TaskHandler implements FusionAwareTask {

    /** The target executor platform */
    final AbstractGridExecutor executor

    /** Location of the file created when the job is started */
    final Path startFile

    /** Location of the file created when the job is terminated */
    final Path exitFile

    /** Location of the file holding the task std output */
    final Path outputFile

    /** Location of the file holding the task std error */
    final Path errorFile

    /** The wrapper file used to execute the user script */
    final Path wrapperFile

    /** The unique job ID as provided by the underlying grid platform */
    private jobId

    /** The queue taken from the task configuration ({@code task.config?.queue}) by the constructor */
    private queue

    /** The exit-read timeout expressed in milliseconds; set by the constructor from the session setting or {@link #READ_TIMEOUT} */
    private long exitStatusReadTimeoutMillis

    /** The interval passed to {@code Throttle.after} by {@link #passSanityCheck()}; the constructor sets it to the exit-read timeout */
    private Duration sanityCheckInterval

    /** Fallback exit-read timeout used when the session does not provide one */
    static private final Duration READ_TIMEOUT = Duration.of('270sec') // 4.5 minutes

    /** Optional batch collector: when set, {@link #kill()} hands the job ID to it instead of calling {@code executor.killTask} */
    BatchCleanup batch

    /** No-argument constructor for testing purposes only; it performs no initialization of its own */
    protected GridTaskHandler() {}

    /**
     * Creates the handler for the given task and resolves the control files
     * (start, exit, stdout, stderr and wrapper) in the task work directory.
     *
     * @param task The task to be handled
     * @param executor The grid executor the task is submitted with
     */
    GridTaskHandler( TaskRun task, AbstractGridExecutor executor ) {
        super(task)
        this.executor = executor
        this.queue = task.config?.queue

        // -- all the control files live in the task work directory
        final workDir = task.workDir
        this.startFile = workDir.resolve(TaskRun.CMD_START)
        this.exitFile = workDir.resolve(TaskRun.CMD_EXIT)
        this.outputFile = workDir.resolve(TaskRun.CMD_OUTFILE)
        this.errorFile = workDir.resolve(TaskRun.CMD_ERRFILE)
        this.wrapperFile = workDir.resolve(TaskRun.CMD_RUN)

        // -- the same timeout drives both the exit-file read and the sanity check interval;
        //    fall back to the default when the session does not provide one
        final readTimeout = executor.session?.getExitReadTimeout(executor.name, READ_TIMEOUT) ?: READ_TIMEOUT
        this.sanityCheckInterval = readTimeout
        this.exitStatusReadTimeoutMillis = readTimeout.toMillis()
    }

    /**
     * Creates the task wrapper builder and builds the wrapper script.
     */
    @Override
    void prepareLauncher() {
        final wrapper = createTaskWrapper(task)
        wrapper.build()
    }

    /**
     * Creates the {@link ProcessBuilder} that runs the executor submit command.
     *
     * @return A process builder configured with the submit command line, with stderr
     *      merged into stdout and, when fusion is not enabled, the task work directory
     *      as its working directory
     */
    protected ProcessBuilder createProcessBuilder() {
        // -- resolve and log the submit command line
        final submitCmd = executor.getSubmitCommandLine(task, wrapperFile)
        log.trace "start process ${task.name} > cli: ${submitCmd}"

        // -- configure the submit process, merging stderr into stdout
        final ProcessBuilder result = new ProcessBuilder()
        result.command( submitCmd as String[] )
        result.redirectErrorStream(true)

        // -- with fusion enabled the working directory is left untouched
        if( fusionEnabled() )
            return result

        result.directory(task.workDir.toFile())
        return result
    }

    /**
     * Creates the predicate deciding whether a failed submission should be retried.
     *
     * The predicate accepts only {@link ProcessNonZeroExitStatusException} failures whose
     * reason contains a match for the given regular expression. The method is annotated
     * with {@code @Memoized}, so the result is cached per pattern string.
     *
     * @param reasonPattern The regular expression searched in the failure reason
     * @return The predicate to be used as the retry condition
     */
    @Memoized
    protected Predicate<? extends Throwable> retryCondition(String reasonPattern) {
        final pattern = Pattern.compile(reasonPattern)
        // the explicit type argument is needed for `test(Throwable)` to override the interface method
        return new Predicate<Throwable>() {
            @Override
            boolean test(Throwable failure) {
                if( failure instanceof ProcessNonZeroExitStatusException ) {
                    final reason = failure.reason
                    // a missing or empty reason never triggers a retry
                    return reason ? pattern.matcher(reason).find() : false
                }
                return false
            }
        }
    }

    /**
     * Creates the retry policy applied to the job submission.
     *
     * The settings are read from the session configuration:
     * {@code executor.retry.delay}, {@code executor.retry.maxDelay}, {@code executor.retry.jitter},
     * {@code executor.retry.maxAttempts} and {@code executor.submit.retry.reason}. The last one
     * is the pattern passed to {@link #retryCondition(String)}.
     *
     * @return The {@link RetryPolicy} with backoff, jitter, max attempts and a listener
     *      logging each failed attempt
     */
    protected <T> RetryPolicy<T> retryPolicy() {

        final delay = executor.session.getConfigAttribute("executor.retry.delay", '500ms') as Duration
        final maxDelay = executor.session.getConfigAttribute("executor.retry.maxDelay", '30s') as Duration
        final jitter = executor.session.getConfigAttribute("executor.retry.jitter", '0.25') as double
        final maxAttempts = executor.session.getConfigAttribute("executor.retry.maxAttempts", '3') as int
        final reason = executor.session.getConfigAttribute("executor.submit.retry.reason", 'Socket timed out') as String

        // the explicit type argument is needed for `accept` to override the interface method
        final listener = new EventListener<ExecutionAttemptedEvent<T>>() {
            @Override
            void accept(ExecutionAttemptedEvent event) throws Throwable {
                final failure = event.getLastFailure()
                if( failure instanceof ProcessNonZeroExitStatusException ) {
                    final failure0 = (ProcessNonZeroExitStatusException)failure
                    final msg = """\
                        Failed to submit process '${task.name}'
                         - attempt : ${event.attemptCount}
                         - command : ${failure0.command}
                         - reason  : ${failure0.reason}
                        """.stripIndent(true)
                    log.warn msg

                } else {
                    // any other failure type is only reported in the debug log
                    log.debug("Unexpected retry failure: ${failure?.message}", failure)
                }
            }
        }

        return RetryPolicy.<T>builder()
                .handleIf(retryCondition(reason))
                .withBackoff(delay.toMillis(), maxDelay.toMillis(), ChronoUnit.MILLIS)
                .withMaxAttempts(maxAttempts)
                .withJitter(jitter)
                .onFailedAttempt(listener)
                .build()
    }

    /**
     * Runs the given action applying the policy returned by {@link #retryPolicy()}.
     *
     * @param action The action to execute
     * @return The value returned by the action
     */
    protected <T> T safeExecute(CheckedSupplier<T> action) {
        final policy = retryPolicy()
        return Failsafe.with(policy).get(action)
    }

    /**
     * Starts the submit process, optionally piping the launcher script to its stdin,
     * and waits for its completion.
     *
     * @param builder The process builder holding the submit command
     * @param pipeScript The script to write to the process stdin, or a null/empty value to skip it
     * @return The text produced by the submit process
     * @throws ProcessNonZeroExitStatusException when the process exits with a non-zero status
     */
    protected String processStart(ProcessBuilder builder, String pipeScript) {
        final proc = builder.start()

        try {
            // -- when a launcher script is given, feed it to the submit command stdin
            if( pipeScript ) {
                log.trace "[${executor.name.toUpperCase()}] Submit STDIN command ${task.name} >\n${pipeScript.indent()}"
                proc.out << pipeScript
                proc.out.close()
            }

            // -- wait for the process to complete
            final output = proc.text
            final exitCode = proc.waitFor()
            final launchCmd = launchCmd0(builder,pipeScript)

            // -- a zero exit status means success: return the process stdout
            if( !exitCode )
                return output

            throw new ProcessNonZeroExitStatusException("Failed to submit process to grid scheduler for execution", output, exitCode, launchCmd)
        }
        finally {
            // release all the process resources, whatever the outcome
            proc.in.closeQuietly()
            proc.out.closeQuietly()
            proc.err.closeQuietly()
            proc.destroy()
        }
    }

    /**
     * Selects the wrapper builder for the given task.
     *
     * @param task The task to create the wrapper for
     * @return The fusion launcher when fusion is enabled, otherwise the builder
     *      created by the executor
     */
    protected BashWrapperBuilder createTaskWrapper(TaskRun task) {
        if( fusionEnabled() )
            return fusionLauncher()
        return executor.createBashWrapperBuilder(task)
    }

    /**
     * @return The script to pipe to the submit command stdin: the fusion inline wrapper
     *      when fusion is enabled, otherwise the content of the wrapper file
     */
    protected String stdinLauncherScript() {
        if( fusionEnabled() )
            return fusionStdinWrapper()
        return wrapperFile.text
    }

    /**
     * Creates the inline bash script used to launch the job when fusion is enabled.
     *
     * @return A script made of the bash shebang, the submit directives and the
     *      container command returned by {@code FusionHelper.runWithContainer}
     */
    protected String fusionStdinWrapper() {
        final submitCli = fusionSubmitCli()
        final fusionLauncher = fusionLauncher()
        final containerConfig = task.getContainerConfig()
        final options = task.config.getContainerOptions()
        final image = task.getContainer()
        final command = FusionHelper.runWithContainer(fusionLauncher, containerConfig, image, options, submitCli)
        // assemble the inline script: shebang, scheduler directives, then the command itself
        final directives = submitDirective(task)
        return '#!/bin/bash\n' + directives + command + '\n'
    }

    /**
     * Returns the executor submit headers for the given task, with every occurrence
     * of the task log file path replaced by {@code /dev/null}.
     *
     * @param task The task to create the submit directives for
     * @return The headers returned by {@code executor.getHeaders(task)} after the replacement
     */
    protected String submitDirective(TaskRun task) {
        final remoteLog = task.workDir.resolve(TaskRun.CMD_LOG).toString()
        // replaces the log file with a null file because the cluster submit tool
        // cannot write to a file hosted in a remote object storage
        // NOTE: a literal replacement is used because the path is not a regular expression;
        // regex metacharacters in the path must not alter the match or raise a syntax error
        final result = executor
                .getHeaders(task)
                .replace(remoteLog, '/dev/null')
        return result
    }

    /**
     * Renders the submit command as a single command line string.
     *
     * @param builder The process builder holding the submit command
     * @param pipeScript The script piped to the command stdin, if any
     * @return The command line; when a script is given it is rendered as a
     *      here-document piped into the command
     */
    protected String launchCmd0(ProcessBuilder builder, String pipeScript) {
        final cmdLine = CmdLineHelper.toLine(builder.command())
        if( !pipeScript )
            return cmdLine

        // render the stdin script as a quoted here-document piped into the command
        return "cat << 'LAUNCH_COMMAND_EOF' | ${cmdLine}\n" + pipeScript.trim() + '\n' + 'LAUNCH_COMMAND_EOF\n'
    }

    /**
     * {@inheritDoc}
     *
     * Runs the executor submit command with the retry strategy, parses the job ID
     * from its output and updates the handler status. On any failure the task is
     * marked as completed and a {@link ProcessFailedException} is thrown.
     */
    @Override
    void submit() {
        ProcessBuilder builder = null
        try {
            // -- create the process running the submit command
            builder = createProcessBuilder()
            // -- the launcher script is piped via stdin only when the executor requires it
            final stdinScript = executor.pipeLauncherScript() ? stdinLauncherScript() : null
            // -- run the submission with the retry strategy
            final submitOutput = safeExecute( () -> processStart(builder, stdinScript) )
            // -- parse and record the job id
            final submittedId = (String)executor.parseJobId(submitOutput)
            updateStatus(submittedId)
            log.debug "[${executor.name.toUpperCase()}] submitted process ${task.name} > jobId: $submittedId; workDir: ${task.workDir}"
        }
        catch( Exception e ) {
            if( e instanceof ProcessNonZeroExitStatusException ) {
                // the submit command failed: report its exit status, output and command line
                final failure = (ProcessNonZeroExitStatusException) e
                task.exitStatus = failure.getExitStatus()
                task.stdout = failure.getReason()
                task.script = failure.getCommand()
            }
            else {
                // any other error: report the command line only, when available
                task.script = builder ? CmdLineHelper.toLine(builder.command()) : null
            }
            status = COMPLETED
            throw new ProcessFailedException("Error submitting process '${task.name}' for execution", e )
        }
    }

    // NOTE(review): the text of this method appears truncated in this copy of the file: the
    // `for` statement below breaks off mid-expression and the lines that follow seem to belong
    // to a different method that reads the task exit status from `exitFile`. Restore the
    // missing lines from the upstream source before compiling; nothing is reconstructed here.
    private void updateStatus(String jobId) {
        if( task instanceof TaskArrayRun ) {
            for( int i=0; i ')
            // -- dump directory listing
            errMessage << "Content of workDir: ${task.workDir}"
            errMessage << workDirList?.indent('> ')
            log.debug errMessage.join('\n')

            // the exit status could not be determined: report an invalid value
            return Integer.MAX_VALUE
        }

        /*
         * read the exit file, it should contain the executed process exit status
         */
        def status = exitFile.text?.trim()
        if( status ) {
            try {
                return status.toInteger()
            }
            catch( Exception e ) {
                // the file content is not a valid integer: report an invalid exit status
                log.warn "Unable to parse process exit file: ${exitFile.toUriString()} -- bad value: '$status'"
                return Integer.MAX_VALUE
            }
        }

        else {
            /*
             * Since working with NFS it may happen that the file exists BUT it is empty due to network latencies,
             * before returning an invalid exit code, wait for a while.
             *
             * More in detail:
             * 1) the very first time that arrive here initialize the 'exitTimestampMillis2' to the current timestamp
             * 2) when the file is empty but less than 'exitStatusReadTimeoutMillis' is spent from the first check, return null
             *    this will force the monitor to continue to wait for job termination
             * 3) if 'exitStatusReadTimeoutMillis' or more is spent, and the file is empty return -1 as an invalid exit status
             *
             */
            if( !exitTimestampMillis2 ) {
                log.debug "File is returning empty content: $this -- Try to wait a while... and pray."
                exitTimestampMillis2 = System.currentTimeMillis()
            }

            // time elapsed since the exit file was first found empty
            def delta = System.currentTimeMillis() - exitTimestampMillis2
            if( delta < exitStatusReadTimeoutMillis ) {
                return null
            }
            log.warn "Unable to read command status from: ${exitFile.toUriString()} after $delta ms"
            return -1
        }
    }

    /**
     * Checks whether the submitted job has started and, if so, marks the task as running.
     *
     * @return {@code true} when the task is submitted and found started, {@code false} otherwise
     */
    @Override
    boolean checkIfRunning() {
        if( !isSubmitted() || !isStarted() )
            return false

        status = RUNNING
        // the start time is taken from the local clock because the files are created
        // on remote nodes which may not have a synchronized clock
        startedMillis = System.currentTimeMillis()
        return true
    }

    /**
     * Determines whether the job has started, checking in order: the start file,
     * the executor queue status and, on a time-periodic basis, the exit file.
     *
     * @return {@code true} when any of the checks succeeds
     */
    private boolean isStarted() {

        // -- the start file exists and carries a valid modification time
        final BasicFileAttributes startAttrs = startFile ? FileHelper.readAttributes(startFile) : null
        if( startAttrs && startAttrs.lastModifiedTime()?.toMillis() > 0  )
            return true

        // -- the jobId is tracked as started in the queue status
        if( executor.checkStartedStatus(jobId, queue) )
            return true

        // -- to avoid unnecessary pressure on the file system the exit file
        //    is checked only once per read-timeout period
        final current = System.currentTimeMillis()
        if( current - exitTimestampMillis0 <= exitStatusReadTimeoutMillis )
            return false

        exitTimestampMillis0 = current
        // fix issue #268
        final BasicFileAttributes exitAttrs = exitFile ? FileHelper.readAttributes(exitFile) : null
        if( exitAttrs && exitAttrs.lastModifiedTime()?.toMillis() > 0  )
            return true

        return false
    }

    /**
     * Checks whether the task has terminated.
     *
     * The task is finalized either when it is running and an exit status can be read,
     * or when the sanity check fails.
     *
     * @return {@code true} when the task has been marked as completed, {@code false} otherwise
     */
    @Override
    boolean checkIfCompleted() {

        // -- a running task is complete as soon as its exit status is available
        if( isRunning() ) {
            final Integer exit = readExitStatus()
            if( exit != null ) {
                task.exitStatus = exit
                task.stdout = outputFile
                task.stderr = errorFile
                status = COMPLETED
                return true
            }
        }

        // -- otherwise the task keeps waiting as long as the sanity check passes
        if( passSanityCheck() )
            return false

        log.debug "Task sanity check failed > $task"
        task.stdout = outputFile
        task.stderr = errorFile
        status = COMPLETED
        return true
    }

    /**
     * Verifies, through {@code Throttle.after} with the sanity check interval,
     * that the task work directory still exists while the task is not completed.
     *
     * When the check fails the task error is set to a {@link ProcessException}.
     *
     * @return The result of the throttled check
     */
    protected boolean passSanityCheck() {
        return Throttle.after(sanityCheckInterval, true) {
            if( isCompleted() || task.workDir.exists() )
                return true

            // the task is not complete (ie submitted or running)
            // AND the work-dir does not exist ==> something is wrong
            task.error = new ProcessException("Task work directory is missing (!)")
            return false
        }
    }

    /**
     * Terminates the job: the job ID is handed to the batch collector when one is set,
     * otherwise the executor is asked to kill the job directly.
     */
    @Override
    void kill() {
        if( !batch ) {
            executor.killTask(jobId)
            return
        }
        batch.collect(executor, jobId)
    }

    /**
     * Appends the handler description to the given builder: the job ID, the
     * parent class description, the start timestamp and the exit file modification time.
     *
     * @param builder The builder to append to
     * @return The same builder instance
     */
    protected StringBuilder toStringBuilder( StringBuilder builder ) {
        builder << "jobId: $jobId; "
        super.toStringBuilder(builder)

        // a dash is printed in place of any missing timestamp
        final exitAttrs = FileHelper.readAttributes(exitFile)
        final startedText = startedMillis ? startedMillis : '-'
        final exitedText = exitAttrs ? exitAttrs.lastModifiedTime() : '-'

        builder << " started: " << startedText << ';'
        builder << " exited: " << exitedText << '; '
        return builder
    }

    /**
     * Adds the grid job ID, as {@code native_id}, to the trace record of the parent class.
     *
     * @return An {@link nextflow.trace.TraceRecord} instance holding task runtime information
     */
    @Override
    TraceRecord getTraceRecord() {
        final record = super.getTraceRecord()
        record.put('native_id', jobId)
        return record
    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy