All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.powsybl.computation.slurm.AbstractTask Maven / Gradle / Ivy

/**
 * Copyright (c) 2020, RTE (http://www.rte-france.com)
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 * SPDX-License-Identifier: MPL-2.0
 */
package com.powsybl.computation.slurm;

import com.powsybl.commons.io.WorkingDirectory;
import com.powsybl.computation.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;
import java.util.stream.Collectors;

import static com.powsybl.computation.slurm.SlurmConstants.BATCH_EXT;

/**
 * Base class for {@link SlurmTask} implementations.
 *
 * <p>It holds the state shared by a task (working directory, command executor, submitted jobs,
 * completion future) and provides helpers to submit sbatch commands, write batch scripts into the
 * working directory, aggregate job completion and build the final {@link ExecutionReport}.
 *
 * @author Yichen TANG
 */
public abstract class AbstractTask implements SlurmTask {

    private static final Logger LOGGER = LoggerFactory.getLogger(AbstractTask.class);

    protected static final String UNZIP_INPUTS_COMMAND_ID = "unzip_inputs_command";
    private static final String CLOSE_START_NO_MORE_SEND_INFO = "SCM close started and no more send sbatch to slurm";

    /** Working directory of the task, resolved from the {@link WorkingDirectory} given at construction. */
    protected final Path workingDir;
    /** Flag directory obtained from the computation manager. */
    protected final Path flagDir;
    /** Executor used to send every Slurm command (sbatch, scontrol, scancel). */
    protected final CommandExecutor commandExecutor;
    // NOTE(review): the element type is not recoverable from this file (presumably the command
    // executions of the task) - confirm against subclasses before parameterizing this list.
    protected final List executions;
    protected final ComputationParameters parameters;
    protected final ExecutionEnvironment environment;

    /** Jobs registered for this task; used for completion aggregation, pending-job listing and cancellation. */
    protected final List<CompletableMonitoredJob> jobs = new ArrayList<>();
    /** Completed when the task ends, whether normally, through a job failure or through interruption. */
    protected final CompletableFuture<Void> taskCompletion = new CompletableFuture<>();
    // NOTE(review): key/value types are not recoverable from this file (presumably job id -> command)
    // - confirm against subclasses before parameterizing this map.
    protected Map commandByJobId;

    private final SlurmComputationManager scm;

    /**
     * Creates a task bound to the given computation manager and working directory.
     *
     * @param scm         computation manager providing the command runner and the flag directory
     * @param directory   working directory of the task
     * @param executions  executions to be run by the task
     * @param parameters  computation parameters (deadline, timeout, Slurm extension)
     * @param environment execution environment
     * @throws NullPointerException if any argument, or any value derived from {@code scm} or
     *                              {@code directory}, is {@code null}
     */
    AbstractTask(SlurmComputationManager scm, WorkingDirectory directory,
                        List executions, ComputationParameters parameters, ExecutionEnvironment environment) {
        this.scm = Objects.requireNonNull(scm);
        Objects.requireNonNull(directory);
        this.workingDir = Objects.requireNonNull(directory.toPath());
        this.commandExecutor = Objects.requireNonNull(scm.getCommandRunner());
        this.flagDir = Objects.requireNonNull(scm.getFlagDir());
        this.executions = Objects.requireNonNull(executions);
        this.parameters = Objects.requireNonNull(parameters);
        this.environment = Objects.requireNonNull(environment);
    }

    /**
     * Check if task has already been completed, or if computation manager is closing.
     *
     * @return {@code true} if no further job must be submitted
     */
    protected boolean cannotSubmit() {
        if (scm.isCloseStarted()) {
            LOGGER.info(CLOSE_START_NO_MORE_SEND_INFO);
            return true;
        }
        if (isCompleted()) {
            LOGGER.info("Stopping jobs submission for task in {}: task has been interrupted.", workingDir);
            return true;
        }
        return false;
    }

    /**
     * Sends the given sbatch command through the command executor.
     *
     * @param cmd the sbatch command to send
     * @return the job id reported by the sbatch result
     * @throws SlurmException wrapping the {@link SlurmCmdNonZeroException} raised when sending fails
     */
    protected long launchSbatch(SbatchCmd cmd) {
        try {
            SbatchCmdResult sbatchResult = cmd.send(commandExecutor);
            long submittedJobId = sbatchResult.getSubmittedJobId();
            LOGGER.debug("Submitted with jobId:{}", submittedJobId);
            return submittedJobId;
        } catch (SlurmCmdNonZeroException e) {
            throw new SlurmException(e);
        }
    }

    /**
     * Writes the given shell lines, each terminated by {@code '\n'}, to the file
     * {@code batchName + BATCH_EXT} in the working directory.
     *
     * <p>The content is encoded in UTF-8 so that the generated script does not depend on the
     * default charset of the JVM submitting the task.
     *
     * @param shell     lines of the batch script
     * @param batchName name of the batch file, without extension
     * @throws IOException if the file cannot be written; {@link Files#copy(InputStream, Path, java.nio.file.CopyOption...)}
     *                     is called without options, so an already existing target is an error
     */
    protected void copyShellToRemoteWorkingDir(List<String> shell, String batchName) throws IOException {
        StringBuilder sb = new StringBuilder();
        shell.forEach(line -> sb.append(line).append('\n'));
        byte[] content = sb.toString().getBytes(StandardCharsets.UTF_8);
        Path batch = workingDir.resolve(batchName + BATCH_EXT);
        try (InputStream targetStream = new ByteArrayInputStream(content)) {
            Files.copy(targetStream, batch);
        }
    }

    /**
     * @return the working directory of this task
     */
    Path getWorkingDirPath() {
        return workingDir;
    }

    /**
     * Applies the computation parameters to an sbatch command builder: qos and mem from the
     * {@link SlurmComputationParameters} extension when it is present, then the deadline and
     * timeout registered for the given command id, when defined.
     *
     * @param builder   the builder to configure
     * @param commandId id of the command whose deadline and timeout are looked up
     */
    void addParameters(SbatchCmdBuilder builder, String commandId) {
        SlurmComputationParameters extension = parameters.getExtension(SlurmComputationParameters.class);
        if (extension != null) {
            extension.getQos().ifPresent(builder::qos);
            extension.getMem().ifPresent(builder::mem);
        }
        parameters.getDeadline(commandId).ifPresent(builder::deadline);
        parameters.getTimeout(commandId).ifPresent(builder::timeout);
    }

    /**
     * Completes {@link #taskCompletion} once every job currently registered in {@link #jobs}
     * and flagged as completion-required has completed.
     */
    final void aggregateMonitoredJobs() {
        CompletableFuture[] monitoredJobsFutures = jobs.stream()
                .filter(CompletableMonitoredJob::isCompletionRequired)
                .map(CompletableMonitoredJob::getCompletableFuture)
                .toArray(CompletableFuture[]::new);
        CompletableFuture.allOf(monitoredJobsFutures)
                .thenRun(() -> {
                    LOGGER.debug("Slurm task completed in {}.", workingDir);
                    taskCompletion.complete(null);
                });
    }

    /**
     * Blocks until the task completion future is done, then builds the execution report.
     *
     * @return the report generated by {@link #generateReport()}
     * @throws InterruptedException if the waiting thread is interrupted
     * @throws ExecutionException   if the task completed exceptionally
     */
    @Override
    public ExecutionReport await() throws InterruptedException, ExecutionException {
        taskCompletion.get();
        return generateReport();
    }

    /**
     * Builds the execution report by running {@code scontrol show job} for every job id of the task
     * and converting each result entry with a non-zero exit code into an {@link ExecutionError}.
     *
     * <p>If an scontrol command fails, a warning is logged and the report only contains the errors
     * collected up to that point.
     *
     * @return the execution report for the working directory
     */
    ExecutionReport generateReport() {
        List<ExecutionError> errors = new ArrayList<>();
        try {
            for (Long id : getAllJobIds()) {
                final ScontrolCmd.ScontrolResult scontrolResult = ScontrolCmdFactory.showJob(id).send(commandExecutor);
                for (ScontrolCmd.ScontrolResultBean bean : scontrolResult.getResultBeanList()) {
                    if (bean.getExitCode() != 0) {
                        final ExecutionError error = convertScontrolResult2Error(bean);
                        errors.add(error);
                        LOGGER.debug("{} error added ", error);
                    }
                }
            }
        } catch (SlurmCmdNonZeroException e) {
            LOGGER.warn("Scontrol non zero:", e);
        }
        return new DefaultExecutionReport(workingDir, errors);
    }

    /**
     * @return the ids of the jobs to be inspected when generating the execution report
     */
    abstract Collection<Long> getAllJobIds();

    /**
     * Converts an scontrol result entry with a non-zero exit code into an execution error.
     *
     * @param scontrolResultBean the scontrol result entry
     * @return the corresponding execution error
     */
    abstract ExecutionError convertScontrolResult2Error(ScontrolCmd.ScontrolResultBean scontrolResultBean);

    /**
     * The list of jobs for which status must be monitored.
     *
     * @return the registered jobs which are not completed yet
     */
    @Override
    public List<MonitoredJob> getPendingJobs() {
        return jobs.stream().filter(job -> !job.isCompleted())
                .collect(Collectors.toList());
    }

    /**
     * Cancels the task completion future, then asks for cancellation of the submitted jobs.
     */
    @Override
    public void interrupt() {
        taskCompletion.cancel(true);
        cancelSubmittedJobs();
    }

    /**
     * Asks for cancellation of submitted jobs to Slurm infrastructure.
     */
    protected void cancelSubmittedJobs() {
        jobs.forEach(CompletableMonitoredJob::interruptJob);
    }

    /**
     * {@code true} if the task is already considered completed, be it through normal completion or interruption.
     */
    private boolean isCompleted() {
        return taskCompletion.isDone();
    }

    /**
     * A Slurm job tracked by the enclosing task, whose end is exposed through a {@link CompletableFuture}.
     */
    public class CompletableMonitoredJob implements MonitoredJob {

        private final long jobId;
        private final CompletableFuture<Void> completed;
        private final boolean completionRequired;
        private boolean interrupted = false;

        // Number of done() notifications still expected before the job is considered completed.
        private int counter = 1;

        CompletableMonitoredJob(long jobId) {
            this(jobId, true);
        }

        /**
         * Some jobs (see unzip) are not monitored for completion because we already monitor dependent jobs.
         * However if they fail, we still want to interrupt the task.
         *
         * @param jobId              the job id in Slurm
         * @param completionRequired whether the completion of this job is required for the task to complete
         */
        CompletableMonitoredJob(long jobId, boolean completionRequired) {
            this.jobId = jobId;
            this.completed = new CompletableFuture<>();
            this.completionRequired = completionRequired;
        }

        /**
         * Sets the number of {@link #done()} notifications required before this job is completed.
         *
         * @param counter the expected number of notifications
         */
        public void setCounter(int counter) {
            this.counter = counter;
        }

        boolean isCompleted() {
            return completed.isDone();
        }

        boolean isCompletionRequired() {
            return completionRequired;
        }

        CompletableFuture<Void> getCompletableFuture() {
            return this.completed;
        }

        /**
         * This job ID in slurm
         */
        @Override
        public long getJobId() {
            return jobId;
        }

        /**
         * To be called by a monitor when the job has ended successfully.
         *
         * <p>Each call decrements the counter; the job is completed when the counter reaches zero.
         */
        @Override
        public void done() {
            counter--;
            if (counter == 0) {
                LOGGER.debug("Slurm job {} done.", jobId);
                completed.complete(null);
            } else {
                LOGGER.debug("Slurm array job {} done. Rest: {}", jobId, counter);
            }
        }

        /**
         * Asks for cancellation of this job to Slurm infrastructure,
         * if not already interrupted.
         */
        void interruptJob() {
            // Only the first caller goes past this guard, so scancel is sent at most once per job.
            synchronized (this) {
                if (interrupted) {
                    return;
                }
                interrupted = true;
            }

            completed.complete(null);
            LOGGER.debug("Scancel slurm job {}.", jobId);
            commandExecutor.execute("scancel " + jobId);
        }

        /**
         * To be called by a monitor when the job has failed.
         *
         * <p>The implementation asks for the interruption of all other jobs,
         * but the task will complete normally and generate an execution report.
         */
        @Override
        public void failed() {
            LOGGER.debug("Slurm job {} failed.", jobId);
            taskCompletion.complete(null);
            cancelSubmittedJobs();
        }

        /**
         * To be called if the job is detected to have been killed
         * before completing.
         *
         * <p>The implementation completes the task with an exception,
         * and asks for interruption of all jobs.
         */
        @Override
        public void interrupted() {
            taskCompletion.completeExceptionally(new SlurmException("Job " + jobId + " execution has been interrupted on slurm infrastructure."));
            cancelSubmittedJobs();
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy