
package automately.core.services.job;
import automately.core.data.Job;
import automately.core.data.Meta;
import automately.core.data.User;
import automately.core.data.predicates.JsonQueryPredicate;
import automately.core.file.VirtualFile;
import automately.core.file.VirtualFileSystem;
import automately.core.services.core.AutomatelyService;
import automately.core.data.UserData;
import automately.core.services.job.script.ScriptContext;
import automately.core.services.job.script.ScriptContextFactory;
import com.hazelcast.core.ICountDownLatch;
import com.hazelcast.core.ILock;
import com.hazelcast.core.IMap;
import com.hazelcast.core.ISet;
import com.hazelcast.query.EntryObject;
import com.hazelcast.query.Predicate;
import com.hazelcast.query.PredicateBuilder;
import com.hazelcast.query.Predicates;
import io.jsync.app.core.Cluster;
import io.jsync.app.core.Config;
import io.jsync.app.core.Logger;
import io.jsync.Async;
import io.jsync.Handler;
import io.jsync.buffer.Buffer;
import io.jsync.eventbus.EventBus;
import io.jsync.eventbus.Message;
import io.jsync.json.JsonArray;
import io.jsync.json.JsonObject;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import java.io.*;
import java.util.*;
import java.util.concurrent.*;
import static automately.core.services.job.JobUtil.isStale;
import static automately.core.services.job.JobUtil.updateStatus;
/**
* JobServer handles all jobs. This is used to submit jobs to the cluster
* for execution.
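*
* <p>A minimal usage sketch (not taken verbatim from this codebase): it assumes {@code jobServer}
* is a reference to the running JobServer, {@code user} is an existing {@link User}, and
* {@code file} is a {@link VirtualFile} holding the script to run. The field names mirror the
* startup-script handling in {@link #start(Cluster)}.</p>
*
* <pre>{@code
* JsonObject scriptConfig = new JsonObject()
*         .putString("scriptPath", file.pathAlias)
*         .putString("scriptData", VirtualFileSystem.readFileData(file).toString());
*
* Job job = new Job();
* job.config = new JsonObject().putObject("script", scriptConfig);
* job.service = false;
* job.lite = false;
* job.userToken = user.token();
*
* Job submitted = jobServer.submit(job); // queued for execution, or null if it could not be submitted
* }</pre>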
*/
public class JobServer extends AutomatelyService {
// TODO Definitely complete javadocs
private Cluster cluster;
private Logger logger;
private Async async;
private EventBus eventBus;
private static ScriptContextFactory scriptContextFactory;
private ExecutorService jobExecutorService;
private IMap<String, JsonObject> registeredJobServers;
private IMap<String, Job> registeredServices;
private ISet<String> jobsBeingExecuted;
private String nodeId = "";
private Handler<Message> jobEventBusHandler = null;
private IMap<String, String> jobExecutionNodes;
private long staleJobTimer = 0;
/**
* This allows you to define a custom ScriptContextFactory within the JobServer.
*
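* <p>A minimal sketch, assuming {@code MyScriptContextFactory} is your own
* {@link ScriptContextFactory} subclass and {@code cluster} is your {@link Cluster} instance.
* This must be called before the JobServer starts, otherwise a default factory is created
* in {@link #start(Cluster)}.</p>
*
* <pre>{@code
* JobServer.setScriptContextFactory(new MyScriptContextFactory(cluster));
* }</pre>
*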
* @param scriptContextFactory the factory used to create {@link ScriptContext} instances for submitted jobs
*/
public static void setScriptContextFactory(ScriptContextFactory scriptContextFactory){
JobServer.scriptContextFactory = scriptContextFactory;
}
@Override
public void start(Cluster owner) {
this.cluster = owner;
this.logger = cluster.logger();
this.async = cluster.async();
this.eventBus = cluster.eventBus();
Config config = cluster.config();
JsonObject jobServerConfig = coreConfig().getObject("job", new JsonObject());
if (!jobServerConfig.containsField("max_jobs")) {
jobServerConfig.putNumber("max_jobs", 100);
}
if (!jobServerConfig.containsField("execute_on_least_jobs")) {
// This is true by default
jobServerConfig.putBoolean("execute_on_least_jobs", true);
}
if (!jobServerConfig.containsField("lite_jobs_enabled")) {
jobServerConfig.putBoolean("lite_jobs_enabled", false);
}
coreConfig().putObject("job", jobServerConfig);
config.save();
// END configuration settings.
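// For reference, a sketch of what the "job" section of the core config ends up looking like.
// The values shown are illustrative; startup_scripts is optional and is read further below,
// and its entries use a "username:filePath" format:
// {
//     "max_jobs": 100,
//     "execute_on_least_jobs": true,
//     "lite_jobs_enabled": false,
//     "startup_scripts": ["someUser:scripts/startup.js"]
// }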
int maxJobs = jobServerConfig.getInteger("max_jobs");
// We use this so we can queue up jobs that don't get processed due to load
jobsBeingExecuted = cluster.data().getSet("jobs.executing");
jobExecutionNodes = cluster.data().getMap("jobs.executing.nodes");
registeredJobServers = cluster.data().getMap("job.server.nodes");
registeredServices = cluster.data().persistentMap("job.server.user.services");
// If we are not a job server or our role isn't configured for all
// then we do not need to continue.
if (!config.isRole("job") && !config.isAll()) return;
logger.info("Max jobs set to " + maxJobs);
// TODO finish implementation
jobServerConfig.putBoolean("clientMode", cluster.manager().clientMode());
jobServerConfig.putString("nodeId", cluster.manager().nodeId());
this.nodeId = cluster.manager().nodeId();
// Ensure that we put our id in the registeredJobServers
registeredJobServers.set(this.nodeId, jobServerConfig);
jobExecutorService = Executors.newFixedThreadPool(maxJobs + 5); // Pool sized at max jobs plus 5 extra threads as a safety margin
if(scriptContextFactory == null){
// Create the ScriptContextFactory so we can execute jobs
scriptContextFactory = new ScriptContextFactory(cluster);
}
// Job handler - this is an event bus handler that actually handles our job execution
jobEventBusHandler = event -> {
if(event.body() != null){
if(event.body() instanceof String &&
jobs().containsKey(event.body().toString())){
jobExecutorService.submit(new Runnable() {
@Override
public void run() {
// Here we will handle the actual processing of the job.
Job job = jobs().get(event.body().toString());
updateStatus(job, "processing");
// Set the name of the thread so we know that this thread is solely used for executing this job
Thread.currentThread().setName("job-execution-thread-" + job.token());
// Create an ICountDownLatch so we can let the cluster know when we have finished running this job.
ICountDownLatch globalJobFinishLatch = cluster.hazelcast().getCountDownLatch(job.token() + "_job_finish_latch");
globalJobFinishLatch.trySetCount(1);
// Store the current job token so other nodes can know that this job is being executed with a simple check
jobsBeingExecuted.add(job.token());
// Store the current node handling the execution of this job
jobExecutionNodes.set(job.token(), cluster.manager().nodeId());
// Begin timeout handling - This ensures jobs are not running forever
// By default all jobs are timed out at 15 minutes unless they are a service job
long defaultTimeout = TimeUnit.MINUTES.toMillis(15);
if (job.lite) {
defaultTimeout = TimeUnit.MINUTES.toMillis(1); // Lite jobs always have a timeout of 1 minute
} else if (job.service) {
defaultTimeout = 0; // Service jobs do not have a timeout
}
long timeoutTimer = 0;
// If the defaultTimeout is set to 0 then we will not enforce a timeout. This can be dangerous; use at your own risk.
if (defaultTimeout > 0) {
timeoutTimer = async.setTimer(defaultTimeout, aLong -> eventBus.publish("job.server." + job.token() + ".execution", "timeout"));
}
boolean jobHalted = false;
// if script is null we will pull script data from job..
final ScriptContext context = scriptContextFactory.create(job);
Job completedJob = job;
if (context != null) {
// TODO Make ScriptContext rely not so much on the job at all.
// Begin print stream handler - allows the ability to retrieve job output anywhere in the cluster
Handler<Message> printStreamBufferHandler = event -> event.reply(context.getPrintStreamBuffer());
cluster.eventBus().registerHandler("job.server." + job.token() + ".printStreamBuffer", printStreamBufferHandler);
// End print stream handler
// Begin actual script execution
// Create a CountDownLatch so we can wait for the job to finish executing before preparing results.
CountDownLatch finishLatch = new CountDownLatch(1);
// Begin execution thread creation - this executes the job using the JavaScript Engine
// This will help let us know that the job is being handled.
ILock executionLock = cluster.hazelcast().getLock("_job_lock_execution_" + job.token());
executionLock.lock();
final Future<?> finalExecFuture = Executors.newSingleThreadExecutor().submit((() -> {
updateStatus(job, "running");
// IMPORTANT - this must not be changed.
Thread.currentThread().setName("job-script-execution-thread-" + job.token());
context.execute();
finishLatch.countDown();
cluster.eventBus().unregisterHandler("job.server." + job.token() + ".printStreamBuffer", printStreamBufferHandler);
}));
// End execution thread
final long finalTimeoutTimer = timeoutTimer;
// This is used so we can tell the job script execution to error, stop, timeout, or halt
String jobExecutionIdentifier = "job.server." + job.token() + ".execution";
// Begin handler for job execution control
eventBus.registerHandler(jobExecutionIdentifier, new Handler<Message>() {
@Override
public void handle(io.jsync.eventbus.Message message) {
if (!(message.body() instanceof String)) return;
String method = (String) message.body();
if (method.equals("halt") || method.equals("stop") || method.equals("timeout") || method.equals("error")) {
// IMPORTANT
// We want to record that execution was halted, since something outside our scope stopped it.
// The value can be empty because we only care that it exists.
// The handler is unregistered below since it is no longer needed.
switch (method) {
case "error":
// Handle async error
// We don't need to do anything
break;
case "stop":
updateStatus(job, "stopped");
break;
case "timeout":
updateStatus(job, "timeout");
break;
}
if (!finalExecFuture.isCancelled()) {
finalExecFuture.cancel(true);
}
if (method.equals("halt")) {
job.results.putBoolean("_halted", true);
}
eventBus.unregisterHandler(jobExecutionIdentifier, this);
} else if (method.equals("cancel_timeout")) {
async.cancelTimer(finalTimeoutTimer);
logger.info("Canceling timeout for job " + job.token());
}
}
});
// End handler for job execution control
// Begin finish latch - this will wait for the job to finish executing the job script
try {
finishLatch.await();
} catch (InterruptedException e) {
logger.info("The job " + job.token() + " was interrupted.");
} finally {
// We call getJob just so we have the latest manipulated job
completedJob = context.getJob();
// Simple way to check if the job was halted from the script
if (completedJob.results.containsField("_halted")) {
jobHalted = true;
}
}
// End finish latch
}
final Job finalJob = completedJob;
if (jobHalted) {
// Even if the job was halted errors are handled differently
if (!finalJob.results.containsField("error")) {
if (!finalJob.status.equals("stopped") && !finalJob.status.equals("timeout")) {
// This is a regular HALT
updateStatus(finalJob, "halted");
}
} else if (finalJob.results.containsField("_halted")) {
// This would probably be an async halt
updateStatus(finalJob, "halted");
} else {
// Throw the completed status.
updateStatus(finalJob, "complete");
}
finalJob.results.removeField("_halted");
}
// END EXECUTION
// The timeout timer must be canceled here
// so it does not fire after the job has already finished.
async.cancelTimer(timeoutTimer);
// We want to remove all reserved (underscore-prefixed) variables.
// For some reason leaving these field names in causes it to stay open.
for (String key : finalJob.config.toMap().keySet()) {
if (key.startsWith("_")) {
finalJob.config.removeField(key);
}
}
// When debugging, log the error message if one is present
if (cluster.config().isDebug() && finalJob.results.containsField("error")) {
logger.error(finalJob.results.getObject("error").getString("message"));
}
// If the job was never halted, we can mark it as complete.
if (!jobHalted) {
updateStatus(finalJob, "complete");
}
// Ensure it gets stored
jobs().set(finalJob.token(), finalJob);
jobsBeingExecuted.remove(finalJob.token());
jobExecutionNodes.remove(finalJob.token());
if (finalJob.config.containsField("callbackUrl")) {
async.runOnContext(event -> {
CloseableHttpClient httpClient = HttpClients.createDefault();
try {
JsonObject formatted = new JsonObject();
formatted.putString("token", finalJob.token());
formatted.putValue("created", finalJob.created);
formatted.putValue("updated", finalJob.updated);
formatted.putString("status", finalJob.status);
JsonObject formattedResults = new JsonObject();
formattedResults.putBoolean("success", finalJob.results.getBoolean("success", false));
if (finalJob.results.containsField("error")) {
formattedResults.putObject("error", finalJob.results.getObject("error"));
}
formatted.putObject("results", formattedResults);
HttpPost post = new HttpPost(finalJob.config.getString("callbackUrl"));
StringEntity postingString = new StringEntity(formatted.encode());
post.setEntity(postingString);
post.setHeader("Content-type", "application/json");
post.setHeader("User-Agent", "Automately-Job-Callback");
// We really don't care about the response
httpClient.execute(post);
} catch (IOException ignored) {
} finally {
try {
httpClient.close();
} catch (IOException ignored) {
}
}
});
}
// We count the latch down on a timer so there is a short delay
async.setTimer(1500, event -> globalJobFinishLatch.countDown());
//globalJobFinishLatch.countDown();
// Just in case
eventBus.publish("job.server." + job.token() + ".finished", "finished");
}
});
}
}
};
// We register a handler so we have a place that receives events for jobs
cluster.eventBus().registerHandler("job.server." + this.nodeId, jobEventBusHandler);
// This exists so we can ensure that all the data for the dataBus is pre-loaded for
// anything accessing it such as the DataBusObject
cluster.data().persistentMap("dataBus");
if(!cluster.hazelcast().getPartitionService().isClusterSafe()){
// Let's go ahead and start up some stuff
// If the cluster is big it may take up to 10 minutes for it to be ready
cluster.hazelcast().getPartitionService().forceLocalMemberToBeSafe(10, TimeUnit.MINUTES);
}
// TODO implement automatically loaded native modules
// Startup Scripts are called right before any other job gets started when the JobServer first starts.
// This allows you to have scripts running on the server that can handle many things
JsonArray scriptsToStart = jobServerConfig.getArray("startup_scripts", new JsonArray());
// Begin Startup Scripts
for(Object value : scriptsToStart){
if(value instanceof String && value.toString().split(":").length > 1){
String newVal = (String) value;
String user = newVal.split(":")[0];
String script = newVal.split(":")[1];
User mUser = UserData.getUserByUsername(user);
if(mUser != null){
if(VirtualFileSystem.containsUserFile(mUser, script)){
VirtualFile file = VirtualFileSystem.getUserFile(mUser, script);
JsonObject scriptConfig = new JsonObject();
logger.info("Attempting to start a job for the startup script " + script + " in the path " + VirtualFileSystem.getPathAlias(script));
scriptConfig.putString("scriptPath", file.pathAlias);
scriptConfig.putString("scriptData", VirtualFileSystem.readFileData(file).toString());
Job newJob = new Job();
newJob.config = new JsonObject().putObject("script", scriptConfig);
// we make sure service is false because the script will handle itself if it is a service
newJob.service = false;
newJob.lite = false;
newJob.fileToken = null;
newJob.serviceConfig = new JsonObject();
newJob.serviceName = ""; // Make it empty by default
newJob.userToken = mUser.token();
try {
newJob = submit(newJob);
logger.info("Started new startup job " + newJob.token() + " for the script " + script);
} catch (Exception e) {
logger.error("Failed to start new startup job " + newJob.token() + " for the script " + script);
}
} else {
logger.error("Failed to start \"" + newVal + "\". The file " + script + " does not exist.");
}
} else {
logger.error("Failed to start \"" + newVal + "\". The user " + user + " does not exist.");
}
}
}
// End Startup Scripts
CountDownLatch waitLatch = new CountDownLatch(1);
Timer startupTimer = new Timer();
startupTimer.schedule(new TimerTask() {
@Override
public void run() {
waitLatch.countDown();
}
}, 15000);
try {
waitLatch.await(2, TimeUnit.MINUTES);
} catch (InterruptedException e) {
logger.warn("Interrupted while waiting for the startup delay to finish.");
}
// Begin services..
for(Job job : registeredServices.values()){
// We can go ahead and clone the job, then submit it
Job newJob = new Job();
newJob.config = job.config;
newJob.service = false; // We set this to false because services will call initService
newJob.lite = false;
newJob.fileToken = job.fileToken;
newJob.serviceConfig = job.serviceConfig;
newJob.serviceName = job.serviceName;
newJob.userToken = job.userToken;
// Ensure that we do not start up a service when there has already been a job started for one.
Collection<Job> existingServices = jobs().values(Predicates.and(Predicates.equal("userToken", newJob.userToken),
Predicates.equal("serviceName", newJob.serviceName),
Predicates.or(Predicates.equal("status", "running"),
Predicates.equal("status", "queued"),
Predicates.equal("status", "processing")
)));
if(!existingServices.isEmpty()){
boolean alreadyRunning = true;
for(Job existing : existingServices){
if(isStale(existing)){
alreadyRunning = false;
logger.debug("The job " + existing.token() + " went stale.");
// Just to tell other things waiting to finish it
cluster.eventBus().publish("job.server." + existing.token() + ".finished", "finished");
} else {
// Set this back to true
alreadyRunning = true;
}
}
if(alreadyRunning){
logger.error("Failed to start new service job " + newJob.token() + " for the service " + newJob.serviceName + " for the user " + newJob.userToken + " because a service job has already been started.");
continue;
}
}
try {
submit(newJob);
logger.debug("Started new service job " + newJob.token() + " for the service " + newJob.serviceName + " for the user " + newJob.userToken);
} catch (Exception e) {
logger.error("Failed to start new service job " + newJob.token() + " for the service " + newJob.serviceName + " for the user " + newJob.userToken);
}
}
if(!cluster.manager().clientMode()){
Runnable staleJobHandler = () -> {
logger.debug("Processing old jobs.");
// TODO it might be less resource intensive to use size()
for (Job job : jobs().values()) {
if (isStale(job)) {
logger.debug("The job " + job.token() + " went stale.");
// Just to tell other things waiting to finish it
cluster.eventBus().publish("job.server." + job.token() + ".finished", "finished");
} else if(isJobExpired(job, 14)){
logger.debug("Removing the job " + job.token() + " because it has expired. (over 14 days old)");
jobs().remove(job.token());
} else if(isJobExpired(job, 5)){
logger.info("Scrubbing the job " + job.token() + " because it is over 5 days old.");
try {
if(job.results != null && job.results.containsField("output")){
job.results.putString("output", "Output Scrubbed");
}
job.config = new JsonObject();
job.updated = new Date();
jobs().set(job.token(), job);
} catch (Exception e){
e.printStackTrace();
}
}
}
};
staleJobTimer = cluster().async().setPeriodic(TimeUnit.MINUTES.toMillis(30), event -> Executors.newSingleThreadExecutor().submit(staleJobHandler));
// Run it immediately just to check at startup
// Adding timeout for first check
cluster().async().setTimer(15000, event -> Executors.newSingleThreadExecutor().submit(staleJobHandler));
} else {
logger.warn("Not checking for stale jobs since we are in client mode.");
}
}
/**
* This method is used to submit a job to the cluster.
*
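* <p>A small sketch of handling the result, assuming {@code jobServer} is the running
* JobServer instance; the statuses shown ("queued", "quota_reached") are the ones this
* method assigns below:</p>
*
* <pre>{@code
* Job submitted = jobServer.submit(job);
* if (submitted == null) {
*     // the job could not be submitted (e.g. it is already locked by another node)
* } else if ("quota_reached".equals(submitted.status)) {
*     // the user's concurrency quota was hit; details are in submitted.results
* }
* }</pre>
*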
* @param job the Job you wish to send to the Cluster
* @return the submitted Job (possibly already carrying an updated status and results), or {@code null} if the job could not be submitted
*/
public Job submit(final Job job) {
// TODO Validate job
if (job == null) {
throw new NullPointerException("Your job cannot be null.");
}
if (job.lite && job.service) {
throw new IllegalArgumentException("A job cannot be a lite job and a service job at the same time");
}
if (job.service && job.serviceConfig == null) {
throw new IllegalArgumentException("Cannot start a new service job with an empty service config");
}
if(registeredJobServers.size() < 1){
throw new RuntimeException("Cannot submit a job when there are no registered job servers.");
}
// We set the job as queued so other people know that
// the job is going to be processed in the cluster.
job.status = "queued";
// We must store this job inside the cluster
// so we can access it across multiple nodes.
jobs().set(job.token(), job);
if (!jobsBeingExecuted.contains(job.token())) {
final ILock handleLock = cluster().hazelcast().getLock("_job_lock_" + job.token());
if (!handleLock.isLocked()) {
try {
// Try to acquire the lock, waiting up to 5 minutes,
// so we don't handle the job multiple times across the cluster
if (handleLock.tryLock(5, TimeUnit.MINUTES)) {
User jobUser = UserData.getUserByToken(job.userToken);
// TODO handle quota errors better by storing an actual script error
if (jobUser != null) {
if (job.lite) {
if (!coreConfig().getObject("job", new JsonObject()).getBoolean("lite_jobs_enabled", false)) {
logger.error("Lite jobs are disabled so we are not running the job " + job.token());
JsonObject newResults = new JsonObject();
newResults.putBoolean("success", false);
JsonObject error = new JsonObject();
error.putString("code", "System Error");
error.putString("message", "Lite jobs are currently disabled.");
newResults.putObject("error", error);
job.status = "completed";
job.results = newResults;
jobs().set(job.token(), job);
return job;
}
// Check for the Maximum Concurrent Allowed Lite Jobs Per User
Meta maxLiteJobs = UserData.getMeta(jobUser, "max_lite_jobs");
if (maxLiteJobs != null) {
if (maxLiteJobs.value instanceof Number) {
Number max = (Number) maxLiteJobs.value;
EntryObject e = new PredicateBuilder().getEntryObject();
Predicate p = e.get("userToken").equal(jobUser.token())
.and(e.get("lite").equal(true))
.and(e.get("service").equal(false))
.and(e.get("status").equal("running"));
if (jobs().values(p).size() > max.intValue()) {
JsonObject newResults = new JsonObject();
newResults.putBoolean("success", false);
JsonObject error = new JsonObject();
error.putString("code", "System Error");
error.putString("message", "You have reached your maximum amount of lite jobs you can run at the same time.");
newResults.putObject("error", error);
job.status = "quota_reached";
job.results = newResults;
jobs().set(job.token(), job);
return job;
}
}
}
} else if (job.service) {
Meta maxServiceJobs = UserData.getMeta(jobUser, "max_service_jobs");
if (maxServiceJobs != null) {
if (maxServiceJobs.value instanceof Number) {
Number max = (Number) maxServiceJobs.value;
// Check for jobs owned by the user that are not lite jobs but are service and are running
EntryObject e = new PredicateBuilder().getEntryObject();
Predicate p = e.get("userToken").equal(jobUser.token())
.and(e.get("lite").equal(false))
.and(e.get("service").equal(true))
.and(e.get("status").equal("running"));
if (jobs().values(p).size() > max.intValue()) {
JsonObject newResults = new JsonObject();
newResults.putBoolean("success", false);
JsonObject error = new JsonObject();
error.putString("code", "System Error");
error.putString("message", "You have reached your maximum amount of service jobs you can run at the same time.");
newResults.putObject("error", error);
job.status = "quota_reached";
job.results = newResults;
jobs().set(job.token(), job);
return job;
}
}
}
} else {
// Check for the Maximum Concurrent Allowed Jobs Per User
Meta maxConcurrentJobs = UserData.getMeta(jobUser, "max_jobs");
if (maxConcurrentJobs != null) {
if (maxConcurrentJobs.value instanceof Number) {
Number max = (Number) maxConcurrentJobs.value;
// Check for jobs owned by the user that are not lite jobs and are not service and are running
EntryObject e = new PredicateBuilder().getEntryObject();
Predicate p = e.get("userToken").equal(jobUser.token())
.and(e.get("lite").equal(false))
.and(e.get("service").equal(false))
.and(e.get("status").equal("running"));
if (jobs().values(p).size() > max.intValue()) {
JsonObject newResults = new JsonObject();
newResults.putBoolean("success", false);
JsonObject error = new JsonObject();
error.putString("code", "System Error");
error.putString("message", "You have reached your maximum amount of jobs you can run at the same time.");
newResults.putObject("error", error);
job.status = "quota_reached";
job.results = newResults;
jobs().set(job.token(), job);
return job;
}
}
}
}
String jobServerToUse = null;
// This code cannot be used if hazelcast is in client mode for jCluster
if (coreConfig().getObject("job", new JsonObject()).getBoolean("execute_on_least_jobs", true)) {
JsonObject leastMemberConfig = null;
Set<String> keys;
if(job.serverTag != null && jobUser.admin){
// Right now only admin users can use server tags
// TODO change this later
// This is where server tag queries happen.
// It basically allows you to filter job servers by type/tag,
// which makes a lot of things possible.
keys = registeredJobServers.keySet(new JsonQueryPredicate(job.serverTag));
} else {
keys = registeredJobServers.keySet();
}
for(String nodeId : keys){
JsonObject memberConfig = registeredJobServers.get(nodeId);
if (leastMemberConfig != null) {
int memberSize = jobExecutionNodes.values(Predicates.equal("toString", nodeId)).size();
int leastMemberSize = jobExecutionNodes.values(Predicates.equal("toString", leastMemberConfig.getString("nodeId"))).size();
if (memberSize < leastMemberSize) {
leastMemberConfig = memberConfig;
}
} else {
leastMemberConfig = memberConfig;
}
}
if(leastMemberConfig != null){
jobServerToUse = leastMemberConfig.getString("nodeId");
}
}
if(jobServerToUse == null){
// TODO make sure properly random
List<String> nList = new ArrayList<>(registeredJobServers.keySet());
Collections.shuffle(nList);
jobServerToUse = nList.iterator().next();
}
// Now let's actually execute this job by publishing it to the cluster.
String serverId = "job.server." + jobServerToUse;
logger.info("Submitting the job " + job.token() + " to \"" + serverId + "\"");
cluster.eventBus().publish(serverId, job.token());
return job;
}
}
} catch (Exception e) {
e.printStackTrace();
} finally {
handleLock.unlock();
}
return job;
}
}
// We return null if the job cannot be submitted for some reason
return null;
}
/**
* This is a simple utility to check if a job has expired.
*
* @param job the Job to check
* @param days the number of days after which a job is considered expired
* @return true if the job is not currently active and has not been updated within the given number of days
*/
private boolean isJobExpired(Job job, int days) {
if (job == null) {
throw new NullPointerException();
}
String status = job.status;
// This means we are already processing it.
if (status.equals("running") || status.equals("queued") || status.equals("processing")) {
return false;
}
long howManyDays = TimeUnit.MILLISECONDS.toDays(((new Date())).getTime() - job.updated.getTime());
return howManyDays >= days;
}
@Override
public void stop() {
// Since the nodeId is not empty we know this is an actual jobserver node
if(!nodeId.isEmpty() && jobEventBusHandler != null){
logger.info("Shutting down the JobServer for the node " + nodeId);
// Unregister the handler that was receiving job events for this node
cluster.eventBus().unregisterHandler("job.server." + nodeId, jobEventBusHandler);
registeredJobServers.remove(this.nodeId);
if(staleJobTimer > -1){
cluster().async().cancelTimer(staleJobTimer);
}
Collection<String> handlingJobs = jobExecutionNodes.keySet(Predicates.equal("toString", nodeId));
logger.debug("There are " + handlingJobs.size() + " jobs being handled by the node " + this.nodeId);
// Let's automatically handle jobs for this node
for(String jobToken : handlingJobs){
logger.debug("Attempting to cleanup the job " + jobToken);
Job job = jobs().get(jobToken);
if(job != null){
// We only resubmit service jobs
if(job.service && registeredJobServers.size() > 0){
// We are going to send a direct hook to the job to tell it to stop
cluster.eventBus().publish("job.server." + job.token() + ".execution", "stop");
// We can tell whatever handler to let the job finish.
ICountDownLatch globalJobFinishLatch = cluster.hazelcast().getCountDownLatch(job.token() + "_job_finish_latch");
logger.debug("Waiting for the job " + job.token() + " to finish.");
try {
globalJobFinishLatch.await(30, TimeUnit.SECONDS);
} catch (InterruptedException e) {
e.printStackTrace();
}
jobExecutionNodes.remove(jobToken);
jobsBeingExecuted.remove(jobToken);
job = jobs().get(jobToken);
// We can go ahead and clone the job, then submit it
Job newJob = new Job();
newJob.config = job.config;
newJob.service = false;
newJob.lite = false;
newJob.fileToken = job.fileToken;
newJob.serviceConfig = job.serviceConfig;
newJob.serviceName = job.serviceName;
newJob.userToken = job.userToken;
// Ensure that we do not start up a service when there has already been a job started for one.
Collection<Job> existingServices = jobs().values(Predicates.and(Predicates.equal("userToken", newJob.userToken),
Predicates.equal("serviceName", newJob.serviceName),
Predicates.or(Predicates.equal("status", "running"),
Predicates.equal("status", "queued"),
Predicates.equal("status", "processing")
)));
if(!existingServices.isEmpty()){
boolean alreadyRunning = true;
for(Job existing : existingServices){
if(isStale(existing)){
alreadyRunning = false;
logger.debug("The job " + existing.token() + " went stale.");
// Just to tell other things waiting to finish it
cluster.eventBus().publish("job.server." + existing.token() + ".finished", "finished");
} else {
// Set this back to true
alreadyRunning = true;
}
}
if(alreadyRunning){
logger.error("Failed to start new service job " + newJob.token() + " for the service " + newJob.serviceName + " for the user " + newJob.userToken + " because a service job has already been started.");
continue;
}
}
try {
newJob = submit(newJob);
logger.debug("Started new service job " + newJob.token() + " for the service " + newJob.serviceName + " for the user " + newJob.userToken);
try {
// Let's try to wait for the service to be ready
ICountDownLatch serviceReadyLatch = cluster.hazelcast().getCountDownLatch(newJob.token() + "_service_ready_latch");
serviceReadyLatch.await(15, TimeUnit.SECONDS);
} catch (InterruptedException ignored){
}
} catch (Exception e) {
logger.error("Failed to start new service job " + newJob.token() + " for the service " + newJob.serviceName + " for the user " + newJob.userToken);
}
} else {
// We are going to send a direct hook to the job to tell it to stop
cluster.eventBus().publish("job.server." + job.token() + ".execution", "stop");
// We can tell whatever handler to let the job finish.
ICountDownLatch globalJobFinishLatch = cluster.hazelcast().getCountDownLatch(job.token() + "_job_finish_latch");
logger.debug("Waiting for the job " + job.token() + " to finish.");
try {
globalJobFinishLatch.await(30, TimeUnit.SECONDS);
} catch (InterruptedException e) {
e.printStackTrace();
}
jobExecutionNodes.remove(jobToken);
jobsBeingExecuted.remove(jobToken);
}
}
}
CountDownLatch waitLatch = new CountDownLatch(1);
Timer timer = new Timer();
timer.schedule(new TimerTask() {
@Override
public void run() {
waitLatch.countDown();
}
}, 10000);
try {
waitLatch.await(30, TimeUnit.SECONDS);
jobExecutorService.shutdownNow();
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
@Override
public String name() {
return getClass().getCanonicalName();
}
}