package automately.core.services.job;

import automately.core.data.Job;
import automately.core.data.Meta;
import automately.core.data.User;
import automately.core.data.predicates.JsonQueryPredicate;
import automately.core.file.VirtualFile;
import automately.core.file.VirtualFileSystem;
import automately.core.services.core.AutomatelyService;
import automately.core.data.UserData;
import automately.core.services.job.script.ScriptContext;
import automately.core.services.job.script.ScriptContextFactory;
import com.hazelcast.core.ICountDownLatch;
import com.hazelcast.core.ILock;
import com.hazelcast.core.IMap;
import com.hazelcast.core.ISet;
import com.hazelcast.query.EntryObject;
import com.hazelcast.query.Predicate;
import com.hazelcast.query.PredicateBuilder;
import com.hazelcast.query.Predicates;
import io.jsync.app.core.Cluster;
import io.jsync.app.core.Config;
import io.jsync.app.core.Logger;
import io.jsync.Async;
import io.jsync.Handler;
import io.jsync.buffer.Buffer;
import io.jsync.eventbus.EventBus;
import io.jsync.eventbus.Message;
import io.jsync.json.JsonArray;
import io.jsync.json.JsonObject;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

import java.io.*;
import java.util.*;
import java.util.concurrent.*;

import static automately.core.services.job.JobUtil.isStale;
import static automately.core.services.job.JobUtil.updateStatus;

/**
 * JobServer handles all jobs. This is used to submit jobs to the cluster
 * for execution.
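 * <p>
 * A minimal usage sketch (illustrative only; it assumes a {@code JobServer} instance
 * {@code jobServer} obtained from the running cluster, an existing {@code user}, and a
 * hypothetical inline script):
 * <pre>{@code
 * JsonObject scriptConfig = new JsonObject();
 * scriptConfig.putString("scriptData", "print('hello');"); // hypothetical inline script
 *
 * Job job = new Job();
 * job.config = new JsonObject().putObject("script", scriptConfig);
 * job.lite = false;
 * job.service = false;
 * job.userToken = user.token();
 *
 * Job submitted = jobServer.submit(job); // null means the job could not be submitted
 * }</pre>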
 */
public class JobServer extends AutomatelyService {

    // TODO Definitely complete javadocs

    private Cluster cluster;
    private Logger logger;
    private Async async;
    private EventBus eventBus;

    private static ScriptContextFactory scriptContextFactory;
    private ExecutorService jobExecutorService;

    private IMap registeredJobServers;
    private IMap registeredServices;

    private ISet jobsBeingExecuted;

    private String nodeId = "";
    private Handler jobEventBusHandler = null;

    private IMap jobExecutionNodes;
    private long staleJobTimer = 0;

    /**
     * This allows you to define a custom ScriptContextFactory within the JobServer.
     *
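     * A sketch of installing a custom factory (the subclass {@code MyScriptContextFactory}
     * is hypothetical):
     * <pre>{@code
     * JobServer.setScriptContextFactory(new MyScriptContextFactory(cluster));
     * }</pre>
     *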
     * @param scriptContextFactory the factory used to create {@link ScriptContext} instances for job execution;
     *                             if none is set, a default factory is created when the server starts
     */
    public static void setScriptContextFactory(ScriptContextFactory scriptContextFactory){
        JobServer.scriptContextFactory = scriptContextFactory;
    }

    @Override
    public void start(Cluster owner) {

        this.cluster = owner;
        this.logger = cluster.logger();
        this.async = cluster.async();
        this.eventBus = cluster.eventBus();

        Config config = cluster.config();

        JsonObject jobServerConfig = coreConfig().getObject("job", new JsonObject());

        if (!jobServerConfig.containsField("max_jobs")) {
            jobServerConfig.putNumber("max_jobs", 100);
        }

        if (!jobServerConfig.containsField("execute_on_least_jobs")) {
            // This is true by default
            jobServerConfig.putBoolean("execute_on_least_jobs", true);
        }

        if (!jobServerConfig.containsField("lite_jobs_enabled")) {
            jobServerConfig.putBoolean("lite_jobs_enabled", false);
        }

        coreConfig().putObject("job", jobServerConfig);
        config.save();

        // END configuration settings.
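
        // For reference, the resulting "job" section of the core config looks roughly like this
        // (the values shown are the defaults written above; "startup_scripts" is optional and
        // "clientMode"/"nodeId" are added further below):
        //
        //   "job": {
        //     "max_jobs": 100,
        //     "execute_on_least_jobs": true,
        //     "lite_jobs_enabled": false,
        //     "startup_scripts": ["someUser:someScript"]
        //   }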

        int maxJobs = jobServerConfig.getInteger("max_jobs");

        // Distributed structures used to track executing jobs, the nodes executing them,
        // and the registered job servers and user services.

        jobsBeingExecuted = cluster.data().getSet("jobs.executing");
        jobExecutionNodes = cluster.data().getMap("jobs.executing.nodes");
        registeredJobServers = cluster.data().getMap("job.server.nodes");
        registeredServices = cluster.data().persistentMap("job.server.user.services");

        // If we are not a job server or our role isn't configured for all
        // then we do not need to continue.
        if (!config.isRole("job") && !config.isAll()) return;

        logger.info("Max jobs set to " + maxJobs);

        // TODO finish implementation
        jobServerConfig.putBoolean("clientMode", cluster.manager().clientMode());
        jobServerConfig.putString("nodeId", cluster.manager().nodeId());

        this.nodeId = cluster.manager().nodeId();

        // Ensure that we put our id in the registeredJobServers
        registeredJobServers.set(this.nodeId, jobServerConfig);

        jobExecutorService = Executors.newFixedThreadPool(maxJobs + 5); // Sized to maxJobs plus a buffer of 5 threads as a fail-safe

        if(scriptContextFactory == null){
            // Create the ScriptContextFactory so we can execute jobs
            scriptContextFactory = new ScriptContextFactory(cluster);
        }

        // Job handler - this is an event bus handler that actually handles our job execution
        jobEventBusHandler = event -> {
            if(event.body() != null){
                if(event.body() instanceof String &&
                        jobs().containsKey(event.body().toString())){

                    jobExecutorService.submit(new Runnable() {
                        @Override
                        public void run() {

                            // Here we will handle the actual processing of the job.
                            Job job = jobs().get(event.body().toString());

                            updateStatus(job, "processing");

                            // Set the name of the thread so we know that this thread is solely used for executing this job
                            Thread.currentThread().setName("job-execution-thread-" + job.token());

                            // Create an ICountDownLatch so we can let the cluster know that we are not finished running this job.
                            ICountDownLatch globalJobFinishLatch = cluster.hazelcast().getCountDownLatch(job.token() + "_job_finish_latch");
                            globalJobFinishLatch.trySetCount(1);

                            // Store the current job token so other nodes can know that this job is being executed with a simple check
                            jobsBeingExecuted.add(job.token());
                            // Store the current node handling the execution of this job
                            jobExecutionNodes.set(job.token(), cluster.manager().nodeId());

                            // Begin timeout handling - This ensures jobs are not running forever

                            // By default all jobs are timed out at 15 minutes unless they are a service job
                            long defaultTimeout = TimeUnit.MINUTES.toMillis(15);

                            if (job.lite) {
                                defaultTimeout = TimeUnit.MINUTES.toMillis(1); // Lite jobs always have a timeout of 1 minute
                            } else if (job.service) {
                                defaultTimeout = 0; // Service jobs do not have a timeout
                            }

                            long timeoutTimer = 0;
                            // If defaultTimeout is 0 we will not enforce a timeout at all. This can be dangerous; use at your own risk.
                            if (defaultTimeout > 0) {
                                timeoutTimer = async.setTimer(defaultTimeout, aLong -> eventBus.publish("job.server." + job.token() + ".execution", "timeout"));
                            }

                            boolean jobHalted = false;

                            // if script is null we will pull script data from job..
                            final ScriptContext context = scriptContextFactory.create(job);
                            Job completedJob = job;

                            if (context != null) {

                                // TODO Make ScriptContext rely not so much on the job at all.

                                // Begin print stream handler - allows the ability to retrieve job output anywhere in the cluster
                                Handler printStreamBufferHandler = event -> event.reply(context.getPrintStreamBuffer());
                                cluster.eventBus().registerHandler("job.server." + job.token() + ".printStreamBuffer", printStreamBufferHandler);
                                // End print stream handler

                                // Begin actual script execution

                                // Create a CountDownLatch so we can wait for the job to finish executing before preparing results.
                                CountDownLatch finishLatch = new CountDownLatch(1);


                                // Begin execution thread creation - this executes the job using the JavaScript Engine

                                // This will help let us know that the job is being handled.
                                ILock executionLock = cluster.hazelcast().getLock("_job_lock_execution_" + job.token());
                                executionLock.lock();

                                final Future finalExecFuture = Executors.newSingleThreadExecutor().submit((() -> {

                                    updateStatus(job, "running");

                                    // IMPORTANT - this must not be changed.
                                    Thread.currentThread().setName("job-script-execution-thread-" + job.token());
                                    context.execute();

                                    finishLatch.countDown();

                                    cluster.eventBus().unregisterHandler("job.server." + job.token() + ".printStreamBuffer", printStreamBufferHandler);

                                }));

                                // End execution thread

                                final long finalTimeoutTimer = timeoutTimer;

                                // This address is used to tell the job's script execution to error, stop, time out, or halt
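                                // For example, any node in the cluster can stop this job by publishing:
                                //   eventBus.publish("job.server." + job.token() + ".execution", "stop");
                                // which is what stop() does below when this node shuts down.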
                                String jobExecutionIdentifier = "job.server." + job.token() + ".execution";

                                // Begin handler for job execution control
                                eventBus.registerHandler(jobExecutionIdentifier, new Handler() {
                                    @Override
                                    public void handle(io.jsync.eventbus.Message message) {
                                        if (!(message.body() instanceof String)) return;

                                        String method = (String) message.body();

                                        if (method.equals("halt") || method.equals("stop") || method.equals("timeout") || method.equals("error")) {
                                            // IMPORTANT
                                            // Something outside the script's own scope stopped execution, so update the
                                            // job status accordingly, mark it as halted if needed, cancel the running
                                            // script, and then remove this handler.
                                            switch (method) {
                                                case "error":
                                                    // Handle async error
                                                    // We don't need to do anything
                                                    break;
                                                case "stop":
                                                    updateStatus(job, "stopped");
                                                    break;
                                                case "timeout":
                                                    updateStatus(job, "timeout");
                                                    break;
                                            }
                                            if (!finalExecFuture.isCancelled()) {
                                                finalExecFuture.cancel(true);
                                            }

                                            if (method.equals("halt")) {
                                                job.results.putBoolean("_halted", true);
                                            }

                                            eventBus.unregisterHandler(jobExecutionIdentifier, this);
                                        } else if (method.equals("cancel_timeout")) {
                                            async.cancelTimer(finalTimeoutTimer);
                                            logger.info("Canceling timeout for job " + job.token());
                                        }
                                    }
                                });
                                // End handler for job execution control

                                // Begin finish latch - this will wait for the job to finish executing the job script
                                try {
                                    finishLatch.await();
                                } catch (InterruptedException e) {
                                    logger.info("The job " + job.token() + " was interrupted.");
                                } finally {

                                    // We call getJob just so we have the latest manipulated job
                                    completedJob = context.getJob();

                                    // Simple way to check if the job was halted from the script
                                    if (completedJob.results.containsField("_halted")) {
                                        jobHalted = true;
                                    }
                                }
                                // End finish latch
                            }

                            final Job finalJob = completedJob;

                            if (jobHalted) {

                                // Even if the job was halted, errors are handled differently
                                if (!finalJob.results.containsField("error")) {
                                    if (!finalJob.status.equals("stopped") && !finalJob.status.equals("timeout")) {
                                        // This is a regular HALT
                                        updateStatus(finalJob, "halted");
                                    }
                                } else if (finalJob.results.containsField("_halted")) {
                                    // This would probably be an async halt
                                    updateStatus(finalJob, "halted");
                                } else {
                                    // Throw the completed status.
                                    updateStatus(finalJob, "complete");
                                }

                                finalJob.results.removeField("_halted");
                            }
                            // END EXECUTION

                            // Cancel the timeout timer so it cannot fire after the job has already finished
                            async.cancelTimer(timeoutTimer);

                            // Remove all reserved (underscore-prefixed) config fields.
                            // For some reason leaving these fields in causes the job to stay open.
                            for (String key : finalJob.config.toMap().keySet()) {
                                if (key.startsWith("_")) {
                                    finalJob.config.removeField(key);
                                }
                            }

                            // Log the error message when running in debug mode
                            if (cluster.config().isDebug() && finalJob.results.containsField("error")) {
                                logger.error(finalJob.results.getObject("error").getString("message"));
                            }

                            // Only mark the job complete if it was never halted
                            if (!jobHalted) {
                                updateStatus(finalJob, "complete");
                            }
                            // Ensure it gets stored

                            jobs().set(finalJob.token(), finalJob);

                            jobsBeingExecuted.remove(finalJob.token());
                            jobExecutionNodes.remove(finalJob.token());

                            if (finalJob.config.containsField("callbackUrl")) {
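                                // The callback URL receives a JSON body shaped roughly like this
                                // (the "error" object is only present when the results contain one):
                                //   { "token": "...", "created": ..., "updated": ...,
                                //     "status": "complete", "results": { "success": true } }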
                                async.runOnContext(event -> {
                                    CloseableHttpClient httpClient = HttpClients.createDefault();

                                    try {

                                        JsonObject formatted = new JsonObject();
                                        formatted.putString("token", finalJob.token());
                                        formatted.putValue("created", finalJob.created);
                                        formatted.putValue("updated", finalJob.updated);
                                        formatted.putString("status", finalJob.status);
                                        JsonObject formattedResults = new JsonObject();
                                        formattedResults.putBoolean("success", finalJob.results.getBoolean("success", false));

                                        if (finalJob.results.containsField("error")) {
                                            formattedResults.putObject("error", finalJob.results.getObject("error"));
                                        }
                                        formatted.putObject("results", formattedResults);

                                        HttpPost post = new HttpPost(finalJob.config.getString("callbackUrl"));

                                        StringEntity postingString = new StringEntity(formatted.encode());
                                        post.setEntity(postingString);
                                        post.setHeader("Content-type", "application/json");
                                        post.setHeader("User-Agent", "Automately-Job-Callback");
                                        // We really don't care about the response
                                        httpClient.execute(post);

                                    } catch (IOException ignored) {
                                    } finally {
                                        try {
                                            httpClient.close();
                                        } catch (IOException ignored) {
                                        }
                                    }
                                });
                            }

                            // Count down the global finish latch after a short delay
                            async.setTimer(1500, event -> globalJobFinishLatch.countDown());
                            //globalJobFinishLatch.countDown();

                            // Just in case
                            eventBus.publish("job.server." + job.token() + ".finished", "finished");

                        }
                    });
                }
            }
        };

        // We register a handler so we have a place that receives events for jobs
        cluster.eventBus().registerHandler("job.server." + this.nodeId, jobEventBusHandler);

        // This exists so we can ensure that all the data for the dataBus is pre-loaded for
        // anything accessing it such as the DataBusObject
        cluster.data().persistentMap("dataBus");

        if(!cluster.hazelcast().getPartitionService().isClusterSafe()){
            // Wait for the local member's partitions to be safe before continuing.
            // If the cluster is big this may take up to 10 minutes.
            cluster.hazelcast().getPartitionService().forceLocalMemberToBeSafe(10, TimeUnit.MINUTES);
        }



        // TODO implement automatically loaded native modules


        // Startup scripts are run right before any other job when the JobServer first starts.
        // This allows you to have scripts running on the server that can handle many things.
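        // Each entry is expected to look like "<username>:<scriptPath>"
        // (e.g. "someUser:/scripts/startup.js", an illustrative value).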
        JsonArray scriptsToStart = jobServerConfig.getArray("startup_scripts", new JsonArray());

        // Begin Startup Scripts
        for(Object value : scriptsToStart){
            if(value instanceof String && value.toString().split(":").length > 1){
                String newVal = (String) value;
                String user = newVal.split(":")[0];
                String script = newVal.split(":")[1];
                User mUser = UserData.getUserByUsername(user);
                if(mUser != null){
                    if(VirtualFileSystem.containsUserFile(mUser, script)){
                        VirtualFile file = VirtualFileSystem.getUserFile(mUser, script);

                        JsonObject scriptConfig = new JsonObject();

                        logger.info("Attempting to start a job for the startup script " + script + " in the path " + VirtualFileSystem.getPathAlias(script));

                        scriptConfig.putString("scriptPath", file.pathAlias);
                        scriptConfig.putString("scriptData", VirtualFileSystem.readFileData(file).toString());

                        Job newJob = new Job();
                        newJob.config = new JsonObject().putObject("script", scriptConfig);
                        // we make sure service is false because the script will handle itself if it is a service
                        newJob.service = false;
                        newJob.lite = false;
                        newJob.fileToken = null;
                        newJob.serviceConfig = new JsonObject();
                        newJob.serviceName = ""; // Make it empty by default
                        newJob.userToken = mUser.token();

                        try {
                            newJob = submit(newJob);
                            logger.info("Started new startup job " + newJob.token() + " for the script " + script);
                        } catch (Exception e) {
                            logger.error("Failed to start new startup job " + newJob.token() + " for the script " + script);
                        }
                    } else {
                        logger.error("Failed to start \"" + newVal + "\". The file " + script + " does not exist.");
                    }
                } else {
                    logger.error("Failed to start \"" + newVal + "\". The user " + user + " does not exist.");
                }
            }
        }

        // End Startup Scripts

        CountDownLatch waitLatch = new CountDownLatch(1);
        Timer startupTimer = new Timer();

        startupTimer.schedule(new TimerTask() {
            @Override
            public void run() {
                waitLatch.countDown();
            }
        }, 15000);

        try {
            waitLatch.await(2, TimeUnit.MINUTES);
        } catch (InterruptedException e) {
            logger.warn("Interrupted while waiting for the startup delay to finish.");
        }

        // Begin services..
        for(Job job : registeredServices.values()){

            // Clone the registered service job before submitting it
            Job newJob = new Job();
            newJob.config = job.config;
            newJob.service = false; // We set this to false because services will call initService
            newJob.lite = false;
            newJob.fileToken = job.fileToken;
            newJob.serviceConfig = job.serviceConfig;
            newJob.serviceName = job.serviceName;
            newJob.userToken = job.userToken;

            // Ensure that we do not start up a service when there has already been a job started for one.
            Collection existingServices = jobs().values(Predicates.and(Predicates.equal("userToken", newJob.userToken),
                    Predicates.equal("serviceName", newJob.serviceName),
                    Predicates.or(Predicates.equal("status", "running"),
                            Predicates.equal("status", "queued"),
                            Predicates.equal("status", "processing")
                    )));

            if(!existingServices.isEmpty()){
                boolean alreadyRunning = true;
                for(Job existing : existingServices){
                    if(isStale(existing)){
                        alreadyRunning = false;

                        logger.debug("The job " + existing.token() + " went stale.");
                        // Notify anything waiting on this job that it has finished
                        cluster.eventBus().publish("job.server." + existing.token() + ".finished", "finished");
                    } else {
                        // Set this back to true
                        alreadyRunning = true;
                    }
                }
                if(alreadyRunning){
                    logger.error("Failed to start new service job " + newJob.token() + " for the service " + newJob.serviceName + " for the user " + newJob.userToken + " because a service already has been started.");
                    return;
                }
            }

            try {
                submit(newJob);
                logger.debug("Started new service job " + newJob.token() + " for the service " + newJob.serviceName + " for the user " + newJob.userToken);
            } catch (Exception e) {
                logger.error("Failed to start new service job " + newJob.token() + " for the service " + newJob.serviceName + " for the user " + newJob.userToken);
            }
        }

        if(!cluster.manager().clientMode()){

            Runnable staleJobHandler = () -> {
                logger.debug("Processing old jobs.");
                // TODO it might be less resource intensive to use size()
                for (Job job : jobs().values()) {
                    if (isStale(job)) {
                        logger.debug("The job " + job.token() + " went stale.");
                        // Notify anything waiting on this job that it has finished
                        cluster.eventBus().publish("job.server." + job.token() + ".finished", "finished");
                    } else if(isJobExpired(job, 14)){
                        logger.debug("Removing the job " + job.token() + " because it has expired. (over 14 days old)");
                        jobs().remove(job.token());
                    } else if(isJobExpired(job, 5)){
                        logger.info("Scrubbing the job " + job.token() + " because it is over 5 days old.");
                        try {
                            if(job.results != null && job.results.containsField("output")){
                                job.results.putString("output", "Output Scrubbed");
                            }
                            job.config = new JsonObject();
                            job.updated = new Date();
                            jobs().set(job.token(), job);
                        } catch (Exception e){
                            e.printStackTrace();
                        }
                    }
                }
            };
            staleJobTimer = cluster().async().setPeriodic(TimeUnit.MINUTES.toMillis(30), event -> Executors.newSingleThreadExecutor().submit(staleJobHandler));

            // Run an initial check shortly after startup (after a 15 second delay)
            cluster().async().setTimer(15000, event -> Executors.newSingleThreadExecutor().submit(staleJobHandler));
        } else {
            logger.warn("Not checking for stale jobs since we are in client mode.");
        }

    }

    /**
     * This method is used to submit a job to the cluster.
     *
     * @param job the Job you wish to send to the Cluster
     * @return the Job after it has been submitted, or null if it could not be submitted
     * @throws NullPointerException if the job is null
     * @throws IllegalArgumentException if the job is both a lite job and a service job, or is a service job with a null service config
     * @throws RuntimeException if there are no registered job servers
     */
    public Job submit(final Job job) {

        // TODO Validate job

        if (job == null) {
            throw new NullPointerException("Your job cannot be null.");
        }

        if (job.lite && job.service) {
            throw new IllegalArgumentException("A job cannot be a lite job and a service job at the same time");
        }

        if (job.service && job.serviceConfig == null) {
            throw new IllegalArgumentException("Cannot start a new service job with an empty service config");
        }

        if(registeredJobServers.size() < 1){
            throw new RuntimeException("Cannot submit a job when there are no registered job servers.");
        }

        // We set the job as queued so other people know that
        // the job is going to be processed in the cluster.
        job.status = "queued";

        // We must store this job inside the cluster
        // so we can access it across multiple nodes.
        jobs().set(job.token(), job);

        if (!jobsBeingExecuted.contains(job.token())) {

            final ILock handleLock = cluster().hazelcast().getLock("_job_lock_" + job.token());

            if (!handleLock.isLocked()) {

                try {

                    // We must get a lock for at least 5 minutes so we
                    // don't handle the job multiple times in the server
                    if (handleLock.tryLock(5, TimeUnit.MINUTES)) {

                        User jobUser = UserData.getUserByToken(job.userToken);

                        // TODO handle quota errors better by storing an actual script error

                        if (jobUser != null) {
                            if (job.lite) {
                                if (!coreConfig().getObject("job", new JsonObject()).getBoolean("lite_jobs_enabled", false)) {
                                    logger.error("Lite jobs are disabled so we are not running the job " + job.token());

                                    JsonObject newResults = new JsonObject();
                                    newResults.putBoolean("success", false);

                                    JsonObject error = new JsonObject();
                                    error.putString("code", "System Error");
                                    error.putString("message", "Lite jobs are currently disabled.");
                                    newResults.putObject("error", error);

                                    job.status = "completed";
                                    job.results = newResults;

                                    jobs().set(job.token(), job);
                                    return job;
                                }
                                // Check for the Maximum Concurrent Allowed Lite Jobs Per User
                                Meta maxLiteJobs = UserData.getMeta(jobUser, "max_lite_jobs");
                                if (maxLiteJobs != null) {
                                    if (maxLiteJobs.value instanceof Number) {
                                        Number max = (Number) maxLiteJobs.value;
                                        EntryObject e = new PredicateBuilder().getEntryObject();
                                        Predicate p = e.get("userToken").equal(jobUser.token())
                                                .and(e.get("lite").equal(true))
                                                .and(e.get("service").equal(false))
                                                .and(e.get("status").equal("running"));
                                        if (jobs().values(p).size() > max.intValue()) {
                                            JsonObject newResults = new JsonObject();
                                            newResults.putBoolean("success", false);

                                            JsonObject error = new JsonObject();
                                            error.putString("code", "System Error");
                                            error.putString("message", "You have reached your maximum amount of lite jobs you can run at the same time.");
                                            newResults.putObject("error", error);

                                            job.status = "quota_reached";
                                            job.results = newResults;

                                            jobs().set(job.token(), job);
                                            return job;
                                        }
                                    }
                                }
                            } else if (job.service) {
                                Meta maxServiceJobs = UserData.getMeta(jobUser, "max_service_jobs");
                                if (maxServiceJobs != null) {
                                    if (maxServiceJobs.value instanceof Number) {
                                        Number max = (Number) maxServiceJobs.value;
                                        // Check for jobs owned by the user that are not lite jobs but are service and are running
                                        EntryObject e = new PredicateBuilder().getEntryObject();
                                        Predicate p = e.get("userToken").equal(jobUser.token())
                                                .and(e.get("lite").equal(false))
                                                .and(e.get("service").equal(true))
                                                .and(e.get("status").equal("running"));

                                        if (jobs().values(p).size() > max.intValue()) {

                                            JsonObject newResults = new JsonObject();
                                            newResults.putBoolean("success", false);

                                            JsonObject error = new JsonObject();
                                            error.putString("code", "System Error");
                                            error.putString("message", "You have reached your maximum amount of service jobs you can run at the same time.");
                                            newResults.putObject("error", error);

                                            job.status = "quota_reached";
                                            job.results = newResults;

                                            jobs().set(job.token(), job);
                                            return job;
                                        }
                                    }
                                }
                            } else {
                                // Check for the Maximum Concurrent Allowed Jobs Per User
                                Meta maxConcurrentJobs = UserData.getMeta(jobUser, "max_jobs");
                                if (maxConcurrentJobs != null) {
                                    if (maxConcurrentJobs.value instanceof Number) {
                                        Number max = (Number) maxConcurrentJobs.value;

                                        // Check for jobs owned by the user that are not lite jobs and are not service and are running
                                        EntryObject e = new PredicateBuilder().getEntryObject();
                                        Predicate p = e.get("userToken").equal(jobUser.token())
                                                .and(e.get("lite").equal(false))
                                                .and(e.get("service").equal(false))
                                                .and(e.get("status").equal("running"));

                                        if (jobs().values(p).size() > max.intValue()) {

                                            JsonObject newResults = new JsonObject();
                                            newResults.putBoolean("success", false);

                                            JsonObject error = new JsonObject();
                                            error.putString("code", "System Error");
                                            error.putString("message", "You have reached your maximum amount of jobs you can run at the same time.");
                                            newResults.putObject("error", error);

                                            job.status = "quota_reached";
                                            job.results = newResults;

                                            jobs().set(job.token(), job);
                                            return job;
                                        }
                                    }
                                }
                            }

                            String jobServerToUse = null;

                            // This code cannot be used if hazelcast is in client mode for jCluster
                            if (coreConfig().getObject("job", new JsonObject()).getBoolean("execute_on_least_jobs", true)) {
                                JsonObject leastMemberConfig = null;
                                Set keys;
                                if(job.serverTag != null && jobUser.admin){
                                    // Right now only admin users can use server tags
                                    // TODO change this later
                                    // This is where server tag queries happen: the tag filters the
                                    // registered job servers, which makes it possible to target
                                    // specific types of job server nodes.
                                    keys = registeredJobServers.keySet(new JsonQueryPredicate(job.serverTag));
                                } else {
                                    keys = registeredJobServers.keySet();
                                }
                                for(String nodeId : keys){
                                    JsonObject memberConfig = registeredJobServers.get(nodeId);
                                    if (leastMemberConfig != null) {
                                        int memberSize = jobExecutionNodes.values(Predicates.equal("toString", nodeId)).size();
                                        int leastMemberSize = jobExecutionNodes.values(Predicates.equal("toString", leastMemberConfig.getString("nodeId"))).size();
                                        if (memberSize < leastMemberSize) {
                                            leastMemberConfig = memberConfig;
                                        }
                                    } else {
                                        leastMemberConfig = memberConfig;
                                    }
                                }

                                if(leastMemberConfig != null){
                                    jobServerToUse = leastMemberConfig.getString("nodeId");
                                }

                            }

                            if(jobServerToUse == null){
                                // TODO make sure properly random
                                List nList = new ArrayList<>(registeredJobServers.keySet());
                                Collections.shuffle(nList);
                                jobServerToUse =  nList.iterator().next();
                            }

                            // Now let's actually execute this job by publishing it to the cluster.

                            String serverId = "job.server." + jobServerToUse;

                            logger.info("Submitting the job " + job.token() + " to \"" + serverId + "\"");

                            cluster.eventBus().publish(serverId, job.token());

                            return job;
                        }
                    }
                } catch (Exception e) {
                    e.printStackTrace();
                } finally {
                    handleLock.unlock();
                }
                return job;
            }
        }

        // We return null if the job cannot be submitted for some reason
        return null;
    }

    /**
     * This is a simple utility to check if a job has expired.
     *
     * @param job the job to check
     * @param days the age in days after which the job is considered expired
     * @return true if the job is no longer active and was last updated at least the given number of days ago
     */
    private boolean isJobExpired(Job job, int days) {
        if (job == null) {
            throw new NullPointerException();
        }
        String status = job.status;
        // Jobs that are still being processed never count as expired
        if (status.equals("running") || status.equals("queued") || status.equals("processing")) {
            return false;
        }
        long howManyDays = TimeUnit.MILLISECONDS.toDays(((new Date())).getTime() - job.updated.getTime());
        return howManyDays >= days;
    }

    @Override
    public void stop() {
        // Since the nodeId is not empty we know this is an actual jobserver node
        if(!nodeId.isEmpty() && jobEventBusHandler != null){

            logger.info("Shutting down the JobServer for the node " + nodeId);

            // Unregister the handler that receives job events for this node
            cluster.eventBus().unregisterHandler("job.server." + nodeId, jobEventBusHandler);

            registeredJobServers.remove(this.nodeId);

            if(staleJobTimer > -1){
                cluster().async().cancelTimer(staleJobTimer);
            }

            Collection handlingJobs = jobExecutionNodes.keySet(Predicates.equal("toString", nodeId));

            logger.debug("There are " + handlingJobs.size() + " jobs being handled by the node " + this.nodeId);

            // Let's automatically handle jobs for this node
            for(String jobToken : handlingJobs){
                logger.debug("Attempting to cleanup the job " + jobToken);

                Job job = jobs().get(jobToken);

                if(job != null){
                    // We only resubmit service jobs
                    if(job.service && registeredJobServers.size() > 0){

                        // Send a direct hook to the job to tell it to stop
                        cluster.eventBus().publish("job.server." + job.token() + ".execution", "stop");

                        // Use the global finish latch to wait for whichever handler is executing the job to finish
                        ICountDownLatch globalJobFinishLatch = cluster.hazelcast().getCountDownLatch(job.token() + "_job_finish_latch");

                        logger.debug("Waiting for the job " + job.token() + " to finish.");
                        try {
                            globalJobFinishLatch.await(30, TimeUnit.SECONDS);
                        } catch (InterruptedException e) {
                            e.printStackTrace();
                        }

                        jobExecutionNodes.remove(jobToken);
                        jobsBeingExecuted.remove(jobToken);

                        job = jobs().get(jobToken);

                        // Clone the job before submitting it again
                        Job newJob = new Job();
                        newJob.config = job.config;
                        newJob.service = false;
                        newJob.lite = false;
                        newJob.fileToken = job.fileToken;
                        newJob.serviceConfig = job.serviceConfig;
                        newJob.serviceName = job.serviceName;
                        newJob.userToken = job.userToken;

                        // Ensure that we do not start up a service when there has already been a job started for one.
                        Collection existingServices = jobs().values(Predicates.and(Predicates.equal("userToken", newJob.userToken),
                                Predicates.equal("serviceName", newJob.serviceName),
                                Predicates.or(Predicates.equal("status", "running"),
                                        Predicates.equal("status", "queued"),
                                        Predicates.equal("status", "processing")
                                )));

                        if(!existingServices.isEmpty()){
                            boolean alreadyRunning = true;
                            for(Job existing : existingServices){
                                if(isStale(existing)){
                                    alreadyRunning = false;

                                    logger.debug("The job " + existing.token() + " went stale.");
                                    // Notify anything waiting on this job that it has finished
                                    cluster.eventBus().publish("job.server." + existing.token() + ".finished", "finished");
                                } else {
                                    // Set this back to true
                                    alreadyRunning = true;
                                }
                            }
                            if(alreadyRunning){
                                logger.error("Failed to start new service job " + newJob.token() + " for the service " + newJob.serviceName + " for the user " + newJob.userToken + " because a service already has been started.");
                                return;
                            }
                        }

                        try {
                            newJob = submit(newJob);
                            logger.debug("Started new service job " + newJob.token() + " for the service " + newJob.serviceName + " for the user " + newJob.userToken);
                            try {
                                // Let's try to wait for the service to be ready
                                ICountDownLatch serviceReadyLatch = cluster.hazelcast().getCountDownLatch(newJob.token() + "_service_ready_latch");
                                serviceReadyLatch.await(15, TimeUnit.SECONDS);
                            } catch (InterruptedException ignored){
                            }
                        } catch (Exception e) {
                            logger.error("Failed to start new service job " + newJob.token() + " for the service " + newJob.serviceName + " for the user " + newJob.userToken);
                        }
                    } else {

                        // Send a direct hook to the job to tell it to stop
                        cluster.eventBus().publish("job.server." + job.token() + ".execution", "stop");

                        // Use the global finish latch to wait for whichever handler is executing the job to finish
                        ICountDownLatch globalJobFinishLatch = cluster.hazelcast().getCountDownLatch(job.token() + "_job_finish_latch");

                        logger.debug("Waiting for the job " + job.token() + " to finish.");
                        try {
                            globalJobFinishLatch.await(30, TimeUnit.SECONDS);
                        } catch (InterruptedException e) {
                            e.printStackTrace();
                        }

                        jobExecutionNodes.remove(jobToken);
                        jobsBeingExecuted.remove(jobToken);

                    }
                }
            }

            CountDownLatch waitLatch = new CountDownLatch(1);

            Timer timer = new Timer();
            timer.schedule(new TimerTask() {
                @Override
                public void run() {
                    waitLatch.countDown();
                }
            }, 10000);

            try {
                waitLatch.await(30, TimeUnit.SECONDS);
                jobExecutorService.shutdownNow();
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
    }

    @Override
    public String name() {
        return getClass().getCanonicalName();
    }

}