org.graylog.scheduler.JobExecutionEngine Maven / Gradle / Ivy
/*
* Copyright (C) 2020 Graylog, Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the Server Side Public License, version 1,
* as published by MongoDB, Inc.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* Server Side Public License for more details.
*
* You should have received a copy of the Server Side Public License
* along with this program. If not, see
* .
*/
package org.graylog.scheduler;
import com.codahale.metrics.Counter;
import com.codahale.metrics.MetricRegistry;
import com.codahale.metrics.Timer;
import com.google.inject.assistedinject.Assisted;
import io.opentelemetry.api.trace.Span;
import io.opentelemetry.instrumentation.annotations.WithSpan;
import org.graylog.scheduler.eventbus.JobCompletedEvent;
import org.graylog.scheduler.eventbus.JobSchedulerEventBus;
import org.graylog.scheduler.worker.JobWorkerPool;
import org.joda.time.DateTime;
import org.joda.time.DateTimeZone;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.inject.Inject;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.atomic.AtomicBoolean;
import static org.graylog.tracing.GraylogSemanticAttributes.SCHEDULER_JOB_CLASS;
import static org.graylog.tracing.GraylogSemanticAttributes.SCHEDULER_JOB_DEFINITION_ID;
import static org.graylog.tracing.GraylogSemanticAttributes.SCHEDULER_JOB_DEFINITION_TITLE;
import static org.graylog.tracing.GraylogSemanticAttributes.SCHEDULER_JOB_DEFINITION_TYPE;
/**
* The job execution engine checks runnable triggers and starts job execution in the given worker pool.
*/
public class JobExecutionEngine {
public interface Factory {
JobExecutionEngine create(JobWorkerPool workerPool);
}
private static final Logger LOG = LoggerFactory.getLogger(JobExecutionEngine.class);
private final DBJobTriggerService jobTriggerService;
private final DBJobDefinitionService jobDefinitionService;
private final JobSchedulerEventBus eventBus;
private final JobScheduleStrategies scheduleStrategies;
private final JobTriggerUpdates.Factory jobTriggerUpdatesFactory;
private final Map jobFactory;
private final JobWorkerPool workerPool;
private Counter executionSuccessful;
private Counter executionFailed;
private Timer executionTime;
private final AtomicBoolean isRunning = new AtomicBoolean(true);
private final AtomicBoolean shouldCleanup = new AtomicBoolean(true);
@Inject
public JobExecutionEngine(DBJobTriggerService jobTriggerService,
DBJobDefinitionService jobDefinitionService,
JobSchedulerEventBus eventBus,
JobScheduleStrategies scheduleStrategies,
JobTriggerUpdates.Factory jobTriggerUpdatesFactory,
Map jobFactory,
@Assisted JobWorkerPool workerPool, MetricRegistry metricRegistry) {
this.jobTriggerService = jobTriggerService;
this.jobDefinitionService = jobDefinitionService;
this.eventBus = eventBus;
this.scheduleStrategies = scheduleStrategies;
this.jobTriggerUpdatesFactory = jobTriggerUpdatesFactory;
this.jobFactory = jobFactory;
this.workerPool = workerPool;
this.executionSuccessful = metricRegistry.counter(MetricRegistry.name(getClass(), "executions", "successful"));
this.executionFailed = metricRegistry.counter(MetricRegistry.name(getClass(), "executions", "failed"));
this.executionTime = metricRegistry.timer(MetricRegistry.name(getClass(), "executions", "time"));
}
/**
* Signal shutdown to the engine.
*/
public void shutdown() {
// TODO this should indicate that running jobs have been aborted
isRunning.set(false);
}
private void cleanup() {
// The cleanup should only run once before starting job processing to avoid damaging any state.
if (shouldCleanup.getAndSet(false)) {
final int releasedTriggers = jobTriggerService.forceReleaseOwnedTriggers();
if (releasedTriggers > 0) {
LOG.warn("Force-released {} stale job triggers after an unclean job scheduler shutdown", releasedTriggers);
}
}
}
/**
* Execute the engine. This will try to lock a trigger and execute the job if there are free slots in the
* worker pool and the engine is not shutting down.
*
* @return true if a job trigger has been locked and the related job has been triggered, false otherwise
*/
public boolean execute() {
// Cleanup stale scheduler state *before* processing any triggers for the first time.
// This is a no-op after the first invocation.
if (shouldCleanup.get()) {
cleanup();
}
// We want to avoid a call to the database if there are no free slots in the pool or the engine is shutting down
if (isRunning.get() && workerPool.hasFreeSlots()) {
final Optional triggerOptional = jobTriggerService.nextRunnableTrigger();
if (triggerOptional.isPresent()) {
final JobTriggerDto trigger = triggerOptional.get();
if (!workerPool.execute(() -> handleTrigger(trigger))) {
// The job couldn't be executed so we have to release the trigger again with the same nextTime
jobTriggerService.releaseTrigger(trigger, JobTriggerUpdate.withNextTime(trigger.nextTime()));
return false;
}
return true;
}
}
return false;
}
public void updateLockedJobs() {
if (workerPool.anySlotsUsed()) {
jobTriggerService.updateLockedJobTriggers();
}
}
private void handleTrigger(JobTriggerDto trigger) {
LOG.trace("Locked trigger {} (owner={})", trigger.id(), trigger.lock().owner());
try {
final JobDefinitionDto jobDefinition = jobDefinitionService.get(trigger.jobDefinitionId())
.orElseThrow(() -> new IllegalStateException("Couldn't find job definition " + trigger.jobDefinitionId()));
final Job job = jobFactory.get(jobDefinition.config().type()).create(jobDefinition);
if (job == null) {
throw new IllegalStateException("Couldn't find job factory for type " + jobDefinition.config().type());
}
executionTime.time(() -> executeJob(trigger, jobDefinition, job));
} catch (IllegalStateException e) {
// The trigger cannot be handled because of a permanent error so we mark the trigger as defective
LOG.error("Couldn't handle trigger due to a permanent error {} - trigger won't be retried", trigger.id(), e);
jobTriggerService.setTriggerError(trigger);
} catch (Exception e) {
// The trigger cannot be handled because of an unknown error, retry in a few seconds
// TODO: Check if we need to implement a max-retry after which the trigger is set to ERROR
final DateTime nextTime = DateTime.now(DateTimeZone.UTC).plusSeconds(5);
LOG.error("Couldn't handle trigger {} - retrying at {}", trigger.id(), nextTime, e);
jobTriggerService.releaseTrigger(trigger, JobTriggerUpdate.withNextTime(nextTime));
} finally {
eventBus.post(JobCompletedEvent.INSTANCE);
}
}
@WithSpan
private void executeJob(JobTriggerDto trigger, JobDefinitionDto jobDefinition, Job job) {
Span.current().setAttribute(SCHEDULER_JOB_CLASS, job.getClass().getSimpleName())
.setAttribute(SCHEDULER_JOB_DEFINITION_TYPE, jobDefinition.config().type())
.setAttribute(SCHEDULER_JOB_DEFINITION_TITLE, jobDefinition.title())
.setAttribute(SCHEDULER_JOB_DEFINITION_ID, String.valueOf(jobDefinition.id()));
try {
if (LOG.isDebugEnabled()) {
LOG.debug("Execute job: {}/{}/{} (job-class={} trigger={} config={})", jobDefinition.title(), jobDefinition.id(),
jobDefinition.config().type(), job.getClass().getSimpleName(), trigger.id(), jobDefinition.config());
}
final JobTriggerUpdate triggerUpdate = job.execute(JobExecutionContext.create(trigger, jobDefinition, jobTriggerUpdatesFactory.create(trigger), isRunning, jobTriggerService));
if (triggerUpdate == null) {
executionFailed.inc();
throw new IllegalStateException("Job#execute() must not return null - this is a bug in the job class");
}
executionSuccessful.inc();
LOG.trace("Update trigger: trigger={} update={}", trigger.id(), triggerUpdate);
jobTriggerService.releaseTrigger(trigger, triggerUpdate);
} catch (JobExecutionException e) {
LOG.error("Job execution error - trigger={} job={}", trigger.id(), jobDefinition.id(), e);
executionFailed.inc();
jobTriggerService.releaseTrigger(e.getTrigger(), e.getUpdate());
} catch (Exception e) {
executionFailed.inc();
// This is an unhandled job execution error so we mark the trigger as defective
LOG.error("Unhandled job execution error - trigger={} job={}", trigger.id(), jobDefinition.id(), e);
// Calculate the next time in the future based on the trigger schedule. We cannot do much else because we
// don't know what happened and we also got no instructions from the job. (no JobExecutionException)
final DateTime nextFutureTime = scheduleStrategies.nextFutureTime(trigger).orElse(null);
jobTriggerService.releaseTrigger(trigger, JobTriggerUpdate.withNextTime(nextFutureTime));
}
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy