
/*
* Copyright 2014-2015 Red Hat, Inc. and/or its affiliates
* and other contributors as indicated by the @author tags.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.hawkular.metrics.tasks.impl;
import java.util.Collections;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import org.hawkular.metrics.tasks.DateTimeService;
import org.hawkular.metrics.tasks.api.RepeatingTrigger;
import org.hawkular.metrics.tasks.api.SingleExecutionTrigger;
import org.hawkular.metrics.tasks.api.Task2;
import org.hawkular.metrics.tasks.api.TaskScheduler;
import org.hawkular.metrics.tasks.api.Trigger;
import org.hawkular.metrics.tasks.log.TaskQueueLogger;
import org.hawkular.metrics.tasks.log.TaskQueueLogging;
import org.hawkular.rx.cassandra.driver.RxSession;
import org.joda.time.DateTime;
import org.joda.time.Duration;
import com.datastax.driver.core.KeyspaceMetadata;
import com.datastax.driver.core.ResultSet;
import com.datastax.driver.core.UDTValue;
import com.datastax.driver.core.UserType;
import com.google.common.hash.HashCode;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import rx.Observable;
import rx.Scheduler;
import rx.Subscriber;
import rx.Subscription;
import rx.functions.Action1;
import rx.schedulers.Schedulers;
import rx.subjects.PublishSubject;
/**
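* A Cassandra-backed {@link TaskScheduler} implementation that partitions queued tasks
* into shards, acquires per-shard leases for each time slice, and publishes tasks to
* subscribers for execution.
* <p>
* A rough usage sketch (hypothetical client code; it assumes an already configured
* {@code RxSession} and {@code Queries} instance plus a parameters map and a
* {@code Trigger} built elsewhere; {@code executeTask} is a hypothetical callback):
* <pre>{@code
* TaskScheduler scheduler = new TaskSchedulerImpl(rxSession, queries);
* // The subscriber performs the actual task execution.
* scheduler.subscribe(task -> executeTask(task));
* scheduler.start();
* scheduler.scheduleTask("generate-rollups", "group-1", 0, parameters, trigger)
*         .subscribe(scheduled -> System.out.println("Scheduled " + scheduled));
* }</pre>
*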
* @author jsanda
*/
public class TaskSchedulerImpl implements TaskScheduler {
private static TaskQueueLogger log = TaskQueueLogging.getTaskQueueLogger(TaskSchedulerImpl.class);
public static final int DEFAULT_LEASE_TTL = 180;
private int numShards = Integer.parseInt(System.getProperty("hawkular.scheduler.shards", "10"));
private HashFunction hashFunction = Hashing.murmur3_128();
private RxSession session;
private Queries queries;
/**
* Runs a single job on a tight loop. The sole purpose of that job is to emit a tick on
* the leases thread pool. Each tick represents a time slice to be processed. Emission
* of ticks is very fast as it just involves queueing up the tick on the leases thread
* pool. It needs to be fast and non-blocking to ensure that we do not skip a time slice.
*/
private ScheduledExecutorService tickExecutor;
private Scheduler tickScheduler;
/**
* When a tick is emitted, a job is submitted onto the leases thread pool to process
* leases for the corresponding time slice. The leases thread pool contains only a
* single thread to ensure we process leases/tasks in order with respect to time.
*/
private ExecutorService leaseExecutor;
/**
* The thread pool in which task execution is performed. An instance of
* TaskSchedulerImpl will execute multiple tasks in parallel provided they all share
* the same lease.
*/
private ExecutorService tasksExecutor;
private Scheduler tasksScheduler;
private Scheduler leaseScheduler;
private DateTimeService dateTimeService;
private boolean running;
/**
* A subject to broadcast tasks that are to be executed. Other task scheduling libraries
* and frameworks have a tighter coupling with the objects that perform the actual
* task execution. The pub/sub style makes things more loosely coupled, which has a
* couple of benefits. First, it gives clients full control over the life cycle of the
* objects doing the task execution. Secondly, it makes writing tests easier.
*/
private PublishSubject<Task2> taskSubject;
private PublishSubject<Long> tickSubject;
private Subscription leasesSubscription;
public TaskSchedulerImpl(RxSession session, Queries queries) {
this.session = session;
this.queries = queries;
dateTimeService = new DateTimeService();
tickExecutor = Executors.newScheduledThreadPool(1,
new ThreadFactoryBuilder().setNameFormat("ticker-pool-%d").build());
tickScheduler = Schedulers.from(tickExecutor);
leaseExecutor = Executors.newSingleThreadExecutor(
new ThreadFactoryBuilder().setNameFormat("lease-pool-%d").build());
tasksExecutor = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors(),
new ThreadFactoryBuilder().setNameFormat("tasks-pool-%d").build());
tasksScheduler = Schedulers.from(tasksExecutor);
leaseScheduler = Schedulers.from(leaseExecutor);
taskSubject = PublishSubject.create();
tickSubject = PublishSubject.create();
}
public void setTickScheduler(Scheduler scheduler) {
this.tickScheduler = scheduler;
}
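/**
* Wraps a client-supplied subscriber so that an exception thrown while executing a
* task is logged and swallowed rather than terminating the task stream.
*/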
private class SubscriberWrapper extends Subscriber<Task2> {
private Subscriber<Task2> delegate;
public SubscriberWrapper(Subscriber<Task2> delegate) {
this.delegate = delegate;
}
@Override
public void onCompleted() {
delegate.onCompleted();
}
@Override
public void onError(Throwable e) {
delegate.onError(e);
}
@Override
public void onNext(Task2 task2) {
try {
delegate.onNext(task2);
} catch (Exception e) {
log.warnTaskExecutionFailed(task2, e);
}
}
}
/**
* Subscribe a callback that will be responsible for executing tasks.
*
* @param onNext The task execution callback
* @return A subscription which can be used to stop receiving task notifications.
*/
@Override
public Subscription subscribe(Action1<Task2> onNext) {
return taskSubject.subscribe(onNext);
}
/**
* Subscribe a callback that will be responsible for executing tasks.
*
* @param subscriber The callback
* @return A subscription which can be used to stop receiving task notifications.
*/
@Override
public Subscription subscribe(Subscriber<Task2> subscriber) {
return taskSubject.subscribe(new SubscriberWrapper(subscriber));
}
@Override
public Observable<Long> getFinishedTimeSlices() {
return tickSubject;
}
@Override
public boolean isRunning() {
return running;
}
@Override
public Observable<Task2> getTasks() {
return taskSubject;
}
/**
* Starts the scheduler so that it starts emitting tasks for execution.
*
* @return An observable that emits leases that are processed by this TaskScheduler
* object. The observable emits a lease only after it has been fully processed. This
* means all tasks belonging to the lease have been executed, and the lease has been
* marked finished. Note that the observable is hot; in other words, it emits leases
* regardless of whether or not there are any subscriptions.
*/
@Override
public Observable<Lease> start() {
Observable<Date> seconds = createTicks();
Observable<Lease> leases = seconds.flatMap(this::getAvailableLeases);
Observable<Lease> processedLeases = Observable.create(subscriber -> {
leasesSubscription = leases.subscribe(
lease -> {
log.debugf("Loading tasks for %s", lease);
CountDownLatch latch = new CountDownLatch(1);
getQueue(lease)
.observeOn(tasksScheduler)
.groupBy(Task2Impl::getGroupKey)
.flatMap(group -> group.flatMap(this::execute).map(this::rescheduleTask))
.subscribe(
task -> log.debugf("Finished executing %s", task),
t -> log.warnTasksObservationProblem(t),
() -> {
Date timeSlice = new Date(lease.getTimeSlice());
// TODO We need error handling here
// We do not want to mark the lease finished if deleting the task partition
// fails. If either delete fails, we probably want to employ some retry
// policy. If the failures continue, then we probably need to shut down the
// scheduler because Cassandra is unstable.
Observable.merge(
session.execute(queries.deleteTasks.bind(timeSlice,
lease.getShard()), tasksScheduler),
session.execute(queries.finishLease.bind(timeSlice,
lease.getShard()), tasksScheduler)
).subscribe(
resultSet -> {},
t -> {
log.warnTaskPostProcessProblem(t);
subscriber.onError(t);
},
() -> {
log.debugf("Finished executing tasks for %s", lease);
latch.countDown();
subscriber.onNext(lease);
}
);
}
);
log.debugf("Started processing tasks for %s", lease);
try {
// While using a CountDownLatch might seem contrary to RxJava, we
// want to block here until all tasks have finished executing. We
// only want to acquire another lease and execute its tasks after
// we are finished with the current lease. If we did not block here,
// we would immediately start polling for the next lease before the
// tasks for the current lease had completed.
latch.await();
log.debug("Done waiting!");
} catch (InterruptedException e) {
log.warnInterruptionOnTaskCompleteWaiting(e);
}
},
t -> log.warnLeasesObservationProblem(t),
() -> {
log.debug("Finished observing leases");
subscriber.onCompleted();
}
);
});
// We emit leases using a subject in order to make our observable hot. We want to
// process/emit leases regardless of whether or not there are any subscribers. Note
// that having an observable emit leases helps facilitate testing, and that was the
// primary motivation for having this method return a hot observable.
PublishSubject<Lease> leasesSubject = PublishSubject.create();
processedLeases.subscribe(leasesSubject);
running = true;
return leasesSubject;
}
/**
* Returns an observable that emits "ticks" in the form of {@link Date} objects every
* minute. Each tick represents a time slice to be processed.
* <p>
* Note: Ticks must be emitted on the tick scheduler and observed on
* the lease scheduler. No other work should run on the tick scheduler. This is to help
* ensure nothing blocks ticks from being emitted on schedule.
*/
// TODO We probably still need a check in place to make sure we don't skip a time slice
private Observable<Date> createTicks() {
// TODO handle back pressure
// The timer previously emitted ticks every second, but it has been changed to
// emit every minute to avoid back pressure. Emitting ticks less frequently
// should help a lot but it does not completely avoid the problem. It only
// takes one really long running task to cause back pressure. We will need to
// figure something out.
return Observable.interval(0, 1, TimeUnit.MINUTES, tickScheduler)
.map(tick -> currentTimeSlice())
.takeUntil(d -> !running)
.doOnNext(tick -> log.debugf("Tick %s", tick))
.observeOn(leaseScheduler);
}
/**
* Creates an observable that emits acquired leases for the specified time slice. The
* observable queries for available leases and emits the first one it acquires. After
* a lease has been emitted, the observable "refreshes" the set of available leases
* (i.e., reloads them from the database) since they can and will change when there
* are multiple TaskScheduler instances running. The subscriber's
* {@link Subscriber#onCompleted() onCompleted} method is called when all leases for
* the time slice have been processed.
* <p>
* Note: The observable returned from this method must run on the
* leases scheduler to ensure that tasks are processed in order with respect to time.
*/
private Observable<Lease> getAvailableLeases(Date timeSlice) {
Observable<Lease> observable = Observable.create(subscriber -> {
// This observable is intentionally blocking. The queries that it executes are
// NOT async by design. We want to process leases serially, one at a time. Once
// we acquire a lease, we execute all of the tasks for the lease. We check for
// available leases again only after those tasks have completed.
try {
if (log.isDebugEnabled()) {
log.debug("Loading leases for " + timeSlice);
log.debug("Timestamp is " + timeSlice.getTime());
}
List<Lease> leases = findAvailableLeases(timeSlice);
while (!leases.isEmpty()) {
for (Lease lease : leases) {
if (acquire(lease)) {
log.debugf("Acquired %s", lease);
subscriber.onNext(lease);
log.debugf("Finished with %s", lease);
}
break;
}
log.debug("Looking for available leases");
leases = findAvailableLeases(timeSlice);
}
log.debugf("No more leases to process for %s", timeSlice);
// TODO we do not want to perform a delete if there are no leases for the time slice
session.execute(queries.deleteLeases.bind(timeSlice)).toBlocking().first();
subscriber.onCompleted();
tickSubject.onNext(timeSlice.getTime());
} catch (Exception e) {
subscriber.onError(e);
}
});
return observable;//.observeOn(leaseScheduler);
}
/**
* Returns leases for the specified time slice that are not yet finished.
*/
private List<Lease> findAvailableLeases(Date timeSlice) {
// Normally our queries are async, but we want this one to be synchronous/blocking. This method
// is called from the available leases observable which serializes its execution.
return session.execute(queries.findLeases.bind(timeSlice))
.flatMap(Observable::from)
.map(row -> new Lease(timeSlice.getTime(), row.getInt(0), row.getString(1), row.getBool(2)))
.filter(lease -> !lease.isFinished() && lease.getOwner() == null)
.toList()
.toBlocking()
.firstOrDefault(Collections.emptyList());
}
/**
* Attempts to acquire a lease.
*/
private boolean acquire(Lease lease) {
return session.execute(queries.acquireLease.bind(DEFAULT_LEASE_TTL, "localhost", new Date(lease.getTimeSlice()),
lease.getShard())).map(ResultSet::wasApplied).toBlocking().firstOrDefault(false);
}
/**
* Loads the task queue for the specified lease. The returned observable emits tasks in
* the queue. The observable should execute on the lease scheduler.
*/
Observable<Task2Impl> getQueue(Lease lease) {
log.debugf("Loading task queue for %s", lease);
return session.execute(queries.getTasksFromQueue.bind(new Date(lease.getTimeSlice()), lease.getShard()),
Schedulers.immediate())
.flatMap(Observable::from)
.map(row -> new Task2Impl(row.getUUID(2), row.getString(0), row.getInt(1), row.getString(3),
row.getMap(4, String.class, String.class), getTrigger(row.getUDTValue(5))));
}
/**
* Creates an observable that emits a task that has been executed. Task execution is
* accomplished by publishing the task. This method is somewhat of a hack because it
* is really just for side effects. We want tasks from different groups to execute in
* parallel. Execution of tasks within the same group should be serialized based on
* their specified order. If the tasks have the same order, they can be executed in
* parallel. The observable should run on the tasks scheduler.
*
* @param task The task to emit for execution
* @return An observable that emits a task once it has been executed.
*/
private Observable<Task2Impl> execute(Task2Impl task) {
Observable<Task2Impl> observable = Observable.create(subscriber -> {
log.debugf("Emitting %s for execution", task);
// This onNext call is to perform the actual task execution
taskSubject.onNext(task);
// This onNext call is for data flow. After the task executes, we call
// this onNext so that the task gets rescheduled.
subscriber.onNext(task);
subscriber.onCompleted();
log.debugf("Finished executing %s", task);
});
// Subscribe on the same scheduler thread to make sure tasks within the same group
// execute in order.
return observable.subscribeOn(Schedulers.immediate());
}
@Override
public void shutdown() {
try {
log.debug("shutting down");
running = false;
if (leasesSubscription != null) {
leasesSubscription.unsubscribe();
}
tasksExecutor.shutdown();
tasksExecutor.awaitTermination(5, TimeUnit.SECONDS);
leaseExecutor.shutdown();
leaseExecutor.awaitTermination(5, TimeUnit.SECONDS);
tickExecutor.shutdown();
tickExecutor.awaitTermination(5, TimeUnit.SECONDS);
} catch (InterruptedException e) {
throw new RuntimeException("Interrupted during shutdown", e);
}
}
private Date currentTimeSlice() {
// return dateTimeService.getTimeSlice(now(), Duration.standardSeconds(1)).toDate();
return dateTimeService.getTimeSlice(new DateTime(tickScheduler.now()), Duration.standardMinutes(1)).toDate();
}
// @Override
// public Observable createTask(String name, Map parameters, Trigger trigger) {
// UUID id = UUID.randomUUID();
// int shard = computeShard(id);
// UDTValue triggerUDT = getTriggerValue(session, trigger);
//
// return Observable.create(subscriber ->
// session.execute(queries.createTask2.bind(id, shard, name, parameters, triggerUDT)).subscribe(
// resultSet -> subscriber.onNext(new Task2Impl(id, shard, name, parameters, trigger)),
// t -> subscriber.onError(new RuntimeException("Failed to create task", t)),
// subscriber::onCompleted
// )
// );
// }
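/**
* Looks up a previously scheduled task by id. The returned observable emits the task
* if it exists and then completes; a query failure is propagated through onError.
*/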
public Observable<Task2> findTask(UUID id) {
return Observable.create(subscriber ->
session.execute(queries.findTask.bind(id)).flatMap(Observable::from).subscribe(
row -> subscriber.onNext(new Task2Impl(id, row.getString(0), row.getInt(1), row.getString(2),
row.getMap(3, String.class, String.class), getTrigger(row.getUDTValue(4)))),
t -> subscriber.onError(new RuntimeException("Failed to find task with id " + id, t)),
subscriber::onCompleted
)
);
}
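/**
* Creates and schedules a new task. Three writes are performed: the task definition is
* stored, the task is inserted into the queue for the time slice derived from the
* trigger, and a lease is created for the corresponding shard. The returned observable
* emits the scheduled task once all three writes have completed.
*/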
@Override
public Observable<Task2> scheduleTask(String name, String groupKey, int executionOrder,
Map<String, String> parameters, Trigger trigger) {
UUID id = UUID.randomUUID();
int shard = computeShard(groupKey);
UDTValue triggerUDT = getTriggerValue(session, trigger);
Date timeSlice = new Date(trigger.getTriggerTime());
Task2Impl task = new Task2Impl(id, groupKey, executionOrder, name, parameters, trigger);
log.debugf("Scheduling %s", task);
Observable<ResultSet> createTask = session.execute(queries.createTask2.bind(id, groupKey, executionOrder, name,
parameters, triggerUDT));
Observable<ResultSet> updateQueue = session.execute(queries.insertIntoQueue.bind(timeSlice, shard, id, groupKey,
executionOrder, name, parameters, triggerUDT));
Observable<ResultSet> createLease = session.execute(queries.createLease.bind(timeSlice, shard));
return Observable.create(subscriber ->
Observable.merge(createTask, updateQueue, createLease).subscribe(
resultSet -> {},
t -> subscriber.onError(new RuntimeException("Failed to schedule task [" + task + "]", t)),
() -> {
try {
subscriber.onNext(task);
subscriber.onCompleted();
} catch (Throwable t) {
subscriber.onError(t);
}
}
)
);
}
/**
* Reschedules the task for subsequent execution. The current implementation assumes
* repeating triggers. We need to update this method to handle single-execution
* triggers. The task is inserted into the new queue and a new lease is created. This
* observable should execute on the tasks scheduler.
*
* @param task The task to be rescheduled.
* @return An observable that emits the rescheduled task which contains the new/next
* trigger. {@link rx.Observer#onNext(Object) onNext} is invoked after the
* database queries for updating the queue and creating the lease have
* completed.
*/
private Observable<Task2Impl> rescheduleTask(Task2Impl task) {
Trigger nextTrigger = task.getTrigger().nextTrigger();
if (nextTrigger == null) {
log.debugf("There are no more executions for %s", task);
return Observable.just(task);
}
int shard = computeShard(task.getGroupKey());
Task2Impl newTask = new Task2Impl(task.getId(), task.getGroupKey(), task.getOrder(), task.getName(),
task.getParameters(), nextTrigger);
UDTValue triggerUDT = getTriggerValue(session, newTask.getTrigger());
Date timeSlice = new Date(newTask.getTrigger().getTriggerTime());
if (log.isDebugEnabled()) {
log.debug("Next execution time for Task2Impl{id=" + newTask.getId() + ", name=" + newTask.getName() +
"} is " + new Date(newTask.getTrigger().getTriggerTime()));
}
Observable<ResultSet> updateQueue = session.execute(queries.insertIntoQueue.bind(timeSlice, shard,
newTask.getId(), task.getGroupKey(), task.getOrder(), newTask.getName(), newTask.getParameters(),
triggerUDT), tasksScheduler);
Observable<ResultSet> createLease = session.execute(queries.createLease.bind(timeSlice, shard), tasksScheduler);
Observable<Task2Impl> observable = Observable.create(subscriber -> {
Observable.merge(updateQueue, createLease).subscribe(
resultSet -> {
},
// TODO handle rescheduling failure
// If either write fails, we treat it as a scheduling failure. We cannot
// delete the current task/lease until these writes succeed. I think we
// should try again after some delay, and if we continue to fail, then
// we shut down the scheduler.
t -> subscriber.onError(new RuntimeException("Failed to reschedule " + newTask, t)),
() -> {
try {
log.debugf("Received result set for reschedule task, %s", newTask);
subscriber.onNext(newTask);
subscriber.onCompleted();
} catch (Exception e) {
subscriber.onError(e);
}
}
);
});
return observable;//.observeOn(Schedulers.immediate());
}
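/**
* Maps a task group key onto one of {@code numShards} queue shards using consistent
* hashing so that all tasks sharing a group key land on the same shard, and therefore
* the same lease, for a given time slice.
*/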
int computeShard(String key) {
HashCode hashCode = hashFunction.hashBytes(key.getBytes());
return Hashing.consistentHash(hashCode, numShards);
}
private static KeyspaceMetadata getKeyspace(RxSession session) {
return session.getCluster().getMetadata().getKeyspace(session.getLoggedKeyspace());
}
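/**
* Rehydrates a {@link Trigger} from the {@code trigger_def} UDT stored in Cassandra.
* A type discriminator of 0 denotes a single-execution trigger and 1 a repeating one.
*/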
static Trigger getTrigger(UDTValue value) {
int type = value.getInt("type");
switch (type) {
case 0:
return new SingleExecutionTrigger(value.getLong("trigger_time"));
case 1:
return new RepeatingTrigger(
value.getLong("interval"),
value.getLong("delay"),
value.getLong("trigger_time"),
value.getInt("repeat_count"),
value.getInt("execution_count")
);
default:
throw new IllegalArgumentException("Trigger type [" + type + "] is not supported");
}
}
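/**
* Converts a {@link Trigger} into its {@code trigger_def} UDT representation for use
* in the scheduler's Cassandra queries.
*/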
static UDTValue getTriggerValue(RxSession session, Trigger trigger) {
if (trigger instanceof RepeatingTrigger) {
return getRepeatingTriggerValue(session, (RepeatingTrigger) trigger);
}
if (trigger instanceof SingleExecutionTrigger) {
return getSingleExecutionTriggerValue(session, (SingleExecutionTrigger) trigger);
}
throw new IllegalArgumentException(trigger.getClass() + " is not a supported trigger type");
}
static UDTValue getSingleExecutionTriggerValue(RxSession session, SingleExecutionTrigger trigger) {
UserType triggerType = getKeyspace(session).getUserType("trigger_def");
UDTValue triggerUDT = triggerType.newValue();
triggerUDT.setInt("type", 0);
triggerUDT.setLong("trigger_time", trigger.getTriggerTime());
return triggerUDT;
}
static UDTValue getRepeatingTriggerValue(RxSession session, RepeatingTrigger trigger) {
UserType triggerType = getKeyspace(session).getUserType("trigger_def");
UDTValue triggerUDT = triggerType.newValue();
triggerUDT.setInt("type", 1);
triggerUDT.setLong("interval", trigger.getInterval());
triggerUDT.setLong("trigger_time", trigger.getTriggerTime());
if (trigger.getDelay() > 0) {
triggerUDT.setLong("delay", trigger.getDelay());
}
if (trigger.getRepeatCount() != null) {
triggerUDT.setInt("repeat_count", trigger.getRepeatCount());
triggerUDT.setInt("execution_count", trigger.getExecutionCount());
}
return triggerUDT;
}
}