
package automately.core.services.job;
import automately.core.data.Job;
import automately.core.data.Meta;
import automately.core.data.User;
import automately.core.data.predicates.JsonQueryPredicate;
import automately.core.file.VirtualFile;
import automately.core.file.VirtualFileSystem;
import automately.core.services.core.AutomatelyService;
import automately.core.data.UserData;
import com.hazelcast.core.ICountDownLatch;
import com.hazelcast.core.ILock;
import com.hazelcast.core.IMap;
import com.hazelcast.core.ISet;
import com.hazelcast.query.EntryObject;
import com.hazelcast.query.Predicate;
import com.hazelcast.query.PredicateBuilder;
import com.hazelcast.query.Predicates;
import io.jcluster.core.Cluster;
import io.jcluster.core.Config;
import io.jcluster.core.Logger;
import io.jsync.Handler;
import io.jsync.buffer.Buffer;
import io.jsync.eventbus.Message;
import io.jsync.json.JsonArray;
import io.jsync.json.JsonObject;
import java.io.*;
import java.util.*;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import static automately.core.services.job.JobUtil.isStale;
/**
* JobServer handles all jobs. This is used to submit jobs to the cluster
* for execution.
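* <p>
* Jobs are stored in the cluster-wide jobs map and then dispatched over the event bus
* (address {@code "job.server." + nodeId}) to a registered job server node, by default the
* one currently executing the fewest jobs. See {@link #submit(Job)}.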
*/
public class JobServer extends AutomatelyService {
// TODO Definitely complete javadocs
private Cluster cluster;
private Logger logger;
/**
* This is the local ExecutorService that jobs are executed on.
*/
private ExecutorService jobExecutorService;
/**
* This IMap stores all the JobServers in the cluster that can handle jobs.
*/
private IMap<String, JsonObject> registeredJobServers;
/**
* This IMap is where all the users' registered services are located.
* We use this so we can start up scripts more quickly.
*/
private IMap<String, Job> registeredServices;
/**
* These are the jobs that are currently being executed
*/
private ISet<String> jobsBeingExecuted;
private ISet<String> jobsInQueue;
private String nodeId = "";
private Handler<Message> jobEventBusHandler = null;
private IMap<String, String> jobExecutionNodes;
private long staleJobTimer = 0;
// TODO We want to properly start services when we start the cluster,
// and add a delay before the initial stale-job check
@Override
public void start(Cluster owner) {
this.cluster = owner;
this.logger = cluster.logger();
// BEGIN configuration settings.
Config config = cluster.config();
JsonObject jobServerConfig = coreConfig().getObject("job", new JsonObject());
/**
* This is the maximum number of jobs this server will
* handle before we have to start queueing them up.
*/
if (!jobServerConfig.containsField("max_jobs")) {
jobServerConfig.putNumber("max_jobs", 100);
}
if (!jobServerConfig.containsField("execute_on_least_jobs")) {
// This is true by default
jobServerConfig.putBoolean("execute_on_least_jobs", true);
}
if (!jobServerConfig.containsField("lite_jobs_enabled")) {
jobServerConfig.putBoolean("lite_jobs_enabled", false);
}
coreConfig().putObject("job", jobServerConfig);
config.save();
// END configuration settings.
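// After applying the defaults above, the "job" section looks roughly like this
// (the values shown are the defaults; "startup_scripts" is optional and the entry is purely illustrative):
// { "max_jobs": 100, "execute_on_least_jobs": true, "lite_jobs_enabled": false,
//   "startup_scripts": ["someUser:scripts/startup.js"] }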
int maxJobs = jobServerConfig.getInteger("max_jobs");
// We use this so we can queue up jobs that don't get processed due to load
jobsBeingExecuted = cluster.data().getSet("jobs.executing");
jobsInQueue = cluster.data().getSet("jobs.executing.queue");
jobExecutionNodes = cluster.data().getMap("jobs.executing.nodes");
registeredJobServers = cluster.data().getMap("job.server.nodes");
// Handles user services
registeredServices = cluster.data().persistentMap("job.server.user.services");
// Return early unless this node has the "job" role or is an all-in-one node
if (!config.isRole("job") && !config.isAll()) return;
logger.info("Max jobs set to " + maxJobs);
// This makes it easier for other servers
// to figure out where to send jobs
jobServerConfig.putBoolean("clientMode", cluster.manager().clientMode());
jobServerConfig.putString("nodeId", cluster.manager().nodeId());
this.nodeId = cluster.manager().nodeId();
// Ensure that we put our id in the registeredJobServers
registeredJobServers.set(this.nodeId, jobServerConfig);
// Register handlers so an API server (or any other node) can send us jobs...
jobExecutorService = Executors.newFixedThreadPool(maxJobs + 5); // Sized slightly above max_jobs to leave headroom
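// Incoming jobs arrive on the event bus as Buffers containing a serialized JobExecutor
// (see submit(), which writes the executor with an ObjectOutputStream before publishing it)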
jobEventBusHandler = event -> {
if(event.body() != null){
if(event.body() instanceof Buffer){
Buffer buff = (Buffer) event.body();
ByteArrayInputStream b = new ByteArrayInputStream(buff.getBytes());
try {
ObjectInputStream o = new ObjectInputStream(b);
JobExecutor executor = (JobExecutor) o.readObject();
jobExecutorService.submit(executor);
} catch (IOException | ClassNotFoundException e) {
e.printStackTrace();
}
}
}
};
// We register a handler so we have a place that receives events for jobs
cluster.eventBus().registerHandler("job.server." + this.nodeId, jobEventBusHandler);
Executors.newSingleThreadExecutor().submit(() -> {
logger.info("Attempting to read the dataBus so we can load all the data early.");
// Touching this persistent map tells the cluster to start loading its data now
cluster.data().persistentMap("dataBus");
});
if(!cluster.hazelcast().getPartitionService().isClusterSafe()){
// Wait for the local member to be safe before continuing.
// If the cluster is big it may take up to 10 minutes for it to be ready.
cluster.hazelcast().getPartitionService().forceLocalMemberToBeSafe(10, TimeUnit.MINUTES);
}
// Startup Scripts are called right before any other job gets started when the JobServer first starts.
// This allows you to have scripts running on the server that can handle many things
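// Each entry is expected in the form "username:scriptPath",
// e.g. "someUser:scripts/startup.js" (illustrative values)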
JsonArray scriptsToStart = jobServerConfig.getArray("startup_scripts", new JsonArray());
// Begin Startup Scripts
for(Object value : scriptsToStart){
if(value instanceof String && value.toString().split(":").length > 1){
String newVal = (String) value;
String user = newVal.split(":")[0];
String script = newVal.split(":")[1];
User mUser = UserData.getUserByUsername(user);
if(mUser != null){
if(VirtualFileSystem.containsUserFile(mUser, script)){
VirtualFile file = VirtualFileSystem.getUserFile(mUser, script);
JsonObject scriptConfig = new JsonObject();
logger.info("Attempting to start a job for the startup script " + script + " in the path " + VirtualFileSystem.getPathAlias(script));
scriptConfig.putString("scriptPath", file.pathAlias);
scriptConfig.putString("scriptData", VirtualFileSystem.readFileData(file).toString());
Job newJob = new Job();
newJob.config = new JsonObject().putObject("script", scriptConfig);
// we make sure service is false because the script will handle itself if it is a service
newJob.service = false;
newJob.lite = false;
newJob.fileToken = null;
newJob.serviceConfig = new JsonObject();
newJob.serviceName = ""; // Make it empty by default
newJob.userToken = mUser.token();
try {
newJob = submit(newJob);
cluster.logger().info("Started new startup job " + newJob.token() + " for the script " + script);
} catch (Exception e) {
cluster.logger().error("Failed to start new startup job " + newJob.token() + " for the script " + script);
}
} else {
logger.error("Failed to to start \"" + newVal + "\". The file " + script + " does not exist.");
}
} else {
logger.error("Failed to to start \"" + newVal + "\". The user " + user + " does not exist.");
}
}
}
// End Startup Scripts
CountDownLatch waitLatch = new CountDownLatch(1);
Timer startupTimer = new Timer();
startupTimer.schedule(new TimerTask() {
@Override
public void run() {
waitLatch.countDown();
}
}, 15000);
try {
waitLatch.await(2, TimeUnit.MINUTES);
} catch (InterruptedException e) {
logger.warn("Timeout reached while waiting for the startup script timer to finish.");
}
// Begin services..
for(Job job : registeredServices.values()){
// Clone the registered service job before submitting it
Job newJob = new Job();
newJob.config = job.config;
newJob.service = false; // We set this to false because services will call initService
newJob.lite = false;
newJob.fileToken = job.fileToken;
newJob.serviceConfig = job.serviceConfig;
newJob.serviceName = job.serviceName;
newJob.userToken = job.userToken;
// Ensure that we do not start up a service when there has already been a job started for one.
Collection<Job> existingServices = jobs().values(Predicates.and(Predicates.equal("userToken", newJob.userToken),
Predicates.equal("serviceName", newJob.serviceName),
Predicates.or(Predicates.equal("status", "running"),
Predicates.equal("status", "queued"),
Predicates.equal("status", "processing")
)));
if(!existingServices.isEmpty()){
boolean alreadyRunning = true;
for(Job existing : existingServices){
if(isStale(existing)){
alreadyRunning = false;
cluster().logger().info("The job " + existing.token() + " went stale.");
// Just to tell other things waiting to finish it
cluster.eventBus().publish("job.server." + existing.token() + ".finished", "finished");
} else {
// Set this back to true
alreadyRunning = true;
}
}
if(alreadyRunning){
cluster.logger().error("Failed to start new service job " + newJob.token() + " for the service " + newJob.serviceName + " for the user " + newJob.userToken + " because a service already has been started.");
return;
}
}
try {
submit(newJob);
cluster.logger().info("Started new service job " + newJob.token() + " for the service " + newJob.serviceName + " for the user " + newJob.userToken);
} catch (Exception e) {
cluster.logger().error("Failed to start new service job " + newJob.token() + " for the service " + newJob.serviceName + " for the user " + newJob.userToken);
}
}
if(!cluster.manager().clientMode()){
Runnable staleJobHandler = () -> {
cluster().logger().info("Processing old jobs.");
// TODO it might be less resource intensive to use size()
for (Job job : jobs().values()) {
if (isStale(job)) {
cluster().logger().info("The job " + job.token() + " went stale.");
// Just to tell other things waiting to finish it
cluster.eventBus().publish("job.server." + job.token() + ".finished", "finished");
} else if(isJobExpired(job, 14)){
cluster().logger().info("Removing the job " + job.token() + " because it has expired. (over 14 days old)");
jobs().remove(job.token());
} else if(isJobExpired(job, 5)){
cluster().logger().info("Scrubbing the job " + job.token() + " because it over 5 days old.");
try {
if(job.results != null && job.results.containsField("output")){
job.results.putString("output", "Output Scrubbed");
}
job.config = new JsonObject();
job.updated = new Date();
jobs().set(job.token(), job);
} catch (Exception e){
e.printStackTrace();
}
}
}
};
staleJobTimer = cluster().async().setPeriodic(TimeUnit.MINUTES.toMillis(30), event -> Executors.newSingleThreadExecutor().submit(staleJobHandler));
// Schedule the first check to run shortly after startup
cluster().async().setTimer(15000, event -> Executors.newSingleThreadExecutor().submit(staleJobHandler));
} else {
logger.warn("Not checking for stale jobs since we are in client mode.");
}
}
/**
* This is a simple utility to check if a job has expired.
*
* @param job  the Job to check; must not be null
* @param days the number of days since the last update after which the job is considered expired
* @return true if the job is not running, queued, or processing and was last updated at least {@code days} days ago
*/
private boolean isJobExpired(Job job, int days) {
if (job == null) {
throw new NullPointerException();
}
String status = job.status;
// This means we are already processing it.
if (status.equals("running") || status.equals("queued") || status.equals("processing")) {
return false;
}
long howManyDays = TimeUnit.MILLISECONDS.toDays(((new Date())).getTime() - job.updated.getTime());
return howManyDays >= days;
}
@Override
public void stop() {
// Since the nodeId is not empty we know this is an actual jobserver node
if(!nodeId.isEmpty() && jobEventBusHandler != null){
logger.info("Shutting down the JobServer for the node " + nodeId);
// Unregister the handler that received job events for this node
cluster.eventBus().unregisterHandler("job.server." + nodeId, jobEventBusHandler);
registeredJobServers.remove(this.nodeId);
if(staleJobTimer > -1){
cluster().async().cancelTimer(staleJobTimer);
}
Collection<String> handlingJobs = jobExecutionNodes.keySet(new Predicates.EqualPredicate("toString", nodeId));
logger.info("There are " + handlingJobs.size() + " being handled by the node " + this.nodeId);
// Let's automatically handle jobs for this node
for(String jobToken : handlingJobs){
logger.info("Attempting to cleanup the job " + jobToken);
Job job = jobs().get(jobToken);
if(job != null){
// We only resubmit service jobs
if(job.service && registeredJobServers.size() > 0){
// We are going to send a direct hook to the job to tell it to halt
cluster.eventBus().publish("job.server." + job.token() + ".execution", "stop");
// We can tell whatever handler to let the job finish.
ICountDownLatch globalJobFinishLatch = cluster.hazelcast().getCountDownLatch(job.token() + "_job_finish_latch");
logger.info("Waiting for the job " + job.token() + " to finish.");
try {
globalJobFinishLatch.await(30, TimeUnit.SECONDS);
} catch (InterruptedException e) {
e.printStackTrace();
}
jobExecutionNodes.remove(jobToken);
jobsBeingExecuted.remove(jobToken);
jobsInQueue.remove(jobToken);
job = jobs().get(jobToken);
// Clone the job before resubmitting it
Job newJob = new Job();
newJob.config = job.config;
newJob.service = false;
newJob.lite = false;
newJob.fileToken = job.fileToken;
newJob.serviceConfig = job.serviceConfig;
newJob.serviceName = job.serviceName;
newJob.userToken = job.userToken;
// Ensure that we do not start up a service when there has already been a job started for one.
Collection<Job> existingServices = jobs().values(Predicates.and(Predicates.equal("userToken", newJob.userToken),
Predicates.equal("serviceName", newJob.serviceName),
Predicates.or(Predicates.equal("status", "running"),
Predicates.equal("status", "queued"),
Predicates.equal("status", "processing")
)));
if(!existingServices.isEmpty()){
boolean alreadyRunning = true;
for(Job existing : existingServices){
if(isStale(existing)){
alreadyRunning = false;
cluster().logger().info("The job " + existing.token() + " went stale.");
// Just to tell other things waiting to finish it
cluster.eventBus().publish("job.server." + existing.token() + ".finished", "finished");
} else {
// Set this back to true
alreadyRunning = true;
}
}
if(alreadyRunning){
cluster.logger().error("Failed to start new service job " + newJob.token() + " for the service " + newJob.serviceName + " for the user " + newJob.userToken + " because a service already has been started.");
return;
}
}
try {
newJob = submit(newJob);
cluster.logger().info("Started new service job " + newJob.token() + " for the service " + newJob.serviceName + " for the user " + newJob.userToken);
try {
// Let's try to wait for the service to be ready
ICountDownLatch serviceReadyLatch = cluster.hazelcast().getCountDownLatch(newJob.token() + "_service_ready_latch");
serviceReadyLatch.await(15, TimeUnit.SECONDS);
} catch (InterruptedException ignored){
}
} catch (Exception e) {
cluster.logger().error("Failed to start new service job " + newJob.token() + " for the service " + newJob.serviceName + " for the user " + newJob.userToken);
}
} else {
// We are going to send a direct hook to the job to tell it to halt
cluster.eventBus().publish("job.server." + job.token() + ".execution", "stop");
// We can tell whatever handler to let the job finish.
ICountDownLatch globalJobFinishLatch = cluster.hazelcast().getCountDownLatch(job.token() + "_job_finish_latch");
logger.info("Waiting for the job " + job.token() + " to finish.");
try {
globalJobFinishLatch.await(30, TimeUnit.SECONDS);
} catch (InterruptedException e) {
e.printStackTrace();
}
jobExecutionNodes.remove(jobToken);
jobsBeingExecuted.remove(jobToken);
jobsInQueue.remove(jobToken);
}
}
}
CountDownLatch waitLatch = new CountDownLatch(1);
logger.info("Waiting 10 seconds for all jobs from this node to finish cleaning up...");
// Wait 10 seconds before we let the service continue.
Timer timer = new Timer();
timer.schedule(new TimerTask() {
@Override
public void run() {
waitLatch.countDown();
}
}, 10000);
try {
waitLatch.await(30, TimeUnit.SECONDS);
jobExecutorService.shutdownNow();
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
@Override
public String name() {
return getClass().getCanonicalName();
}
/**
* This method is used to submit a job to the cluster.
*
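* <p>A minimal usage sketch (the field values are illustrative, not requirements;
* {@code user} is assumed to be an existing User and {@code jobServer} a reference to this service):
* <pre>{@code
* Job job = new Job();
* job.config = new JsonObject().putObject("script",
*         new JsonObject().putString("scriptPath", "scripts/example.js")); // illustrative script config
* job.userToken = user.token();
* job.service = false;
* job.lite = false;
* Job submitted = jobServer.submit(job);
* }</pre>
*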
* @param job the Job you wish to send to the Cluster
* @return the submitted Job (the actual dispatch to a job server happens asynchronously); validation failures throw an exception rather than returning null
*/
public Job submit(final Job job) {
// TODO Validate job
if (job == null || job.config == null || job.userToken == null) {
throw new NullPointerException("Your job cannot be null.");
}
if (job.lite && job.service) {
throw new IllegalArgumentException("A job cannot be a lite job and a service job at the same time");
}
if (job.service && job.serviceConfig == null) {
throw new IllegalArgumentException("Cannot start a new service job with an empty service config");
}
if(registeredJobServers.size() < 1){
throw new RuntimeException("Cannot submit a job when there are no registered job servers.");
}
jobs().set(job.token(), job);
// TODO Check if this is the most efficient way
Executors.newSingleThreadExecutor().submit(() -> {
logger.info("Processing the job " + job.token());
String token = job.token();
// 1. Does the job even exist?
if (!jobs().containsKey(token)) {
logger.error("Job with the token " + token + " does not exist.");
return;
}
if (!jobsInQueue.contains(job.token()) && !jobsBeingExecuted.contains(job.token())) {
// We are checking if it is locked in the cluster so we do not submit this job multiple times
final ILock handleLock = cluster().hazelcast().getLock("_job_lock_" + token);
if (!handleLock.isLocked()) {
try {
logger.info("Attempting to get a lock for the job " + job.token());
// Wait up to five minutes to acquire the lock
if (handleLock.tryLock(5, TimeUnit.MINUTES)) {
logger.info("Lock successfully created for the job " + job.token());
User jobUser = UserData.getUserByToken(job.userToken);
// TODO handle quota errors better by storing an actual script error
if (jobUser != null) {
if (job.lite) {
if (!coreConfig().getObject("job", new JsonObject()).getBoolean("lite_jobs_enabled", false)) {
logger.error("Lite jobs are disabled so we are not running the job " + job.token());
JsonObject newResults = new JsonObject();
newResults.putBoolean("success", false);
JsonObject error = new JsonObject();
error.putString("code", "System Error");
error.putString("message", "Lite jobs are currently disabled");
newResults.putObject("error", error);
job.status = "completed";
job.results = newResults;
jobs().set(job.token(), job);
return;
}
// Check for the Maximum Concurrent Allowed Lite Jobs Per User
Meta maxLiteJobs = UserData.getMeta(jobUser, "max_lite_jobs");
if (maxLiteJobs != null) {
if (maxLiteJobs.value instanceof Number) {
Number max = (Number) maxLiteJobs.value;
EntryObject e = new PredicateBuilder().getEntryObject();
Predicate p = e.get("userToken").equal(jobUser.token())
.and(e.get("lite").equal(true))
.and(e.get("service").equal(false))
.and(e.get("status").equal("running"));
if (jobs().values(p).size() > max.intValue()) {
JsonObject newResults = new JsonObject();
newResults.putBoolean("success", false);
JsonObject error = new JsonObject();
error.putString("code", "System Error");
error.putString("message", "You have reached your maximum amount of lite jobs you can run at the same time.");
newResults.putObject("error", error);
job.status = "quota_reached";
job.results = newResults;
jobs().set(job.token(), job);
return;
}
}
}
} else if (job.service) {
Meta maxServiceJobs = UserData.getMeta(jobUser, "max_service_jobs");
if (maxServiceJobs != null) {
if (maxServiceJobs.value instanceof Number) {
Number max = (Number) maxServiceJobs.value;
// Check for jobs owned by the user that are not lite jobs but are service and are running
EntryObject e = new PredicateBuilder().getEntryObject();
Predicate p = e.get("userToken").equal(jobUser.token())
.and(e.get("lite").equal(false))
.and(e.get("service").equal(true))
.and(e.get("status").equal("running"));
if (jobs().values(p).size() > max.intValue()) {
JsonObject newResults = new JsonObject();
newResults.putBoolean("success", false);
JsonObject error = new JsonObject();
error.putString("code", "System Error");
error.putString("message", "You have reached your maximum amount of service jobs you can run at the same time.");
newResults.putObject("error", error);
job.status = "quota_reached";
job.results = newResults;
jobs().set(job.token(), job);
return;
}
}
}
} else {
// Check for the Maximum Concurrent Allowed Jobs Per User
Meta maxConcurrentJobs = UserData.getMeta(jobUser, "max_jobs");
if (maxConcurrentJobs != null) {
if (maxConcurrentJobs.value instanceof Number) {
Number max = (Number) maxConcurrentJobs.value;
// Check for jobs owned by the user that are not lite jobs and are not service and are running
EntryObject e = new PredicateBuilder().getEntryObject();
Predicate p = e.get("userToken").equal(jobUser.token())
.and(e.get("lite").equal(false))
.and(e.get("service").equal(false))
.and(e.get("status").equal("running"));
if (jobs().values(p).size() > max.intValue()) {
JsonObject newResults = new JsonObject();
newResults.putBoolean("success", false);
JsonObject error = new JsonObject();
error.putString("code", "System Error");
error.putString("message", "You have reached your maximum amount of jobs you can run at the same time.");
newResults.putObject("error", error);
job.status = "quota_reached";
job.results = newResults;
jobs().set(job.token(), job);
return;
}
}
}
}
logger.info("Submitting the job " + job.token() + " for execution.");
String jobServerToUse = null;
// This code cannot be used if hazelcast is in client mode for jCluster
if (coreConfig().getObject("job", new JsonObject()).getBoolean("execute_on_least_jobs", true)) {
JsonObject leastMemberConfig = null;
Set<String> keys = null;
if(job.serverTag != null && jobUser.admin){
// Right now only admin users can use server tags
// TODO change this later
// This is where server tag queries happen. They allow you to filter
// which registered job server nodes are eligible to run the job.
keys = registeredJobServers.keySet(new JsonQueryPredicate(job.serverTag));
} else {
keys = registeredJobServers.keySet();
}
for(String nodeId : keys){
JsonObject memberConfig = registeredJobServers.get(nodeId);
if (leastMemberConfig != null) {
int memberSize = jobExecutionNodes.values(new Predicates.EqualPredicate("toString", nodeId)).size();
int leastMemberSize = jobExecutionNodes.values(new Predicates.EqualPredicate("toString", leastMemberConfig.getString("nodeId"))).size();
if (memberSize < leastMemberSize) {
leastMemberConfig = memberConfig;
}
} else {
leastMemberConfig = memberConfig;
}
}
if(leastMemberConfig != null){
jobServerToUse = leastMemberConfig.getString("nodeId");
}
}
if(jobServerToUse == null){
// TODO Make random
// Get the first one in the cluster
jobServerToUse = registeredJobServers.keySet().iterator().next();
}
// We add it to the Queue so we can retrieve it later if it does not get executed
jobsInQueue.add(job.token());
JobExecutor executor = new JobExecutor(job);
ByteArrayOutputStream byteStream = new ByteArrayOutputStream();
ObjectOutputStream objStream = new ObjectOutputStream(byteStream);
objStream.writeObject(executor);
Buffer objectData = new Buffer(byteStream.toByteArray());
String serverId = "job.server." + jobServerToUse;
logger.info("Submitting the job " + job.token() + " to \"" + serverId + "\"");
cluster.eventBus().publish(serverId, objectData);
}
}
} catch (Exception e) {
e.printStackTrace();
} finally {
handleLock.unlock();
}
return;
}
}
logger.info("The job " + token + " was already handled by another server.");
});
logger.info("The job " + job.token() + " has been submitted");
return job; // We return the job with the new token
}
}