nextflow.scheduler.SchedulerAgent.groovy (nf-ignite)
A DSL modelled around the UNIX pipe concept that simplifies writing parallel and scalable pipelines in a portable manner (forked from nextflow.io)
/*
* Copyright 2013-2019, Centre for Genomic Regulation (CRG)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package nextflow.scheduler
import java.nio.channels.ClosedByInterruptException
import java.util.concurrent.BlockingQueue
import java.util.concurrent.ConcurrentHashMap
import java.util.concurrent.ExecutorService
import java.util.concurrent.Executors
import java.util.concurrent.Future
import java.util.concurrent.LinkedBlockingQueue
import java.util.concurrent.RejectedExecutionException
import java.util.concurrent.TimeUnit
import java.util.concurrent.locks.Condition
import java.util.concurrent.locks.Lock
import java.util.concurrent.locks.ReentrantLock
import groovy.transform.CompileStatic
import groovy.transform.PackageScope
import groovy.transform.TupleConstructor
import groovy.util.logging.Slf4j
import nextflow.cloud.CloudDriver
import nextflow.cloud.CloudDriverFactory
import nextflow.daemon.IgGridFactory
import nextflow.executor.IgBaseTask
import nextflow.processor.TaskId
import nextflow.scheduler.Protocol.NodeData
import nextflow.scheduler.Protocol.NodeIdle
import nextflow.scheduler.Protocol.NodeShutdown
import nextflow.scheduler.Protocol.Resources
import nextflow.scheduler.Protocol.TaskAvail
import nextflow.scheduler.Protocol.TaskCancel
import nextflow.scheduler.Protocol.TaskComplete
import nextflow.scheduler.Protocol.TaskStart
import nextflow.util.ClusterConfig
import nextflow.util.Duration
import nextflow.util.MemoryUnit
import nextflow.util.SysHelper
import org.apache.ignite.Ignite
import org.apache.ignite.IgniteCache
import org.apache.ignite.cache.query.ScanQuery
import org.apache.ignite.cluster.ClusterGroupEmptyException
import org.apache.ignite.events.DiscoveryEvent
import org.apache.ignite.events.Event
import org.apache.ignite.events.EventType
import org.apache.ignite.lang.IgniteBiPredicate
import org.apache.ignite.lang.IgnitePredicate
import static nextflow.Const.ROLE_MASTER
import static nextflow.scheduler.Protocol.PENDING_TASKS_CACHE
import static nextflow.scheduler.Protocol.TOPIC_AGENT_EVENTS
import static nextflow.scheduler.Protocol.TOPIC_SCHEDULER_EVENTS
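/*
 * Usage sketch (hypothetical wiring -- in a real deployment the agent is
 * created by the Ignite daemon bootstrap, not by user code):
 *
 *   def agent = new SchedulerAgent(ignite, clusterConfig)  // registers listeners, announces the node
 *   agent.run()                                            // starts the agent event-loop thread
 *   // ... tasks are pulled from PENDING_TASKS_CACHE and executed ...
 *   agent.close()                                          // stops the executor and the processor
 */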
/**
* Implements the scheduler execution logic. Each worker deploys an instance of this class
* to process the tasks submitted by the driver Nextflow application.
*
* @author Paolo Di Tommaso
*/
@Slf4j
@CompileStatic
class SchedulerAgent implements Closeable {
/**
* Predicate that matches all tasks whose resource requests fit within the resources available to the scheduler agent
*/
static class MatchingResources implements IgniteBiPredicate<TaskId, IgBaseTask> {
int cpus
MemoryUnit memory
MemoryUnit disk
MatchingResources( Resources avail ) {
cpus = avail.cpus
memory = avail.memory
disk = avail.disk
}
@Override
boolean apply(TaskId taskId, IgBaseTask task) {
if(task.resources.cpus > cpus) return false
if(task.resources.memory && task.resources.memory > memory) return false
if(task.resources.disk && task.resources.disk > disk) return false
return true
}
String toString() {
"cpus=$cpus; mem=$memory; disk=$disk"
}
}
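// Example (hypothetical figures): an agent advertising 8 CPUs, 30 GB of memory
// and 100 GB of disk matches a task requesting [cpus: 2, memory: 4 GB], but
// rejects one requesting [cpus: 16]. The predicate is evaluated by the ScanQuery
// issued in processPendingTasks0 to pre-filter the pending tasks cache.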
@CompileStatic
private class AgentProcessor extends Thread {
private Lock checkpoint = new ReentrantLock()
private Condition notEmpty = checkpoint.newCondition()
private Resources current
private volatile boolean stopped
private BlockingQueue<Closure> eventsQueue = new LinkedBlockingQueue<>()
private long idleTimestamp
private long _1_min = Duration.of('1 min').toMillis()
private volatile int execErrCount
private int fetchErrCount
AgentProcessor() {
this.name = 'scheduler-agent'
}
@Override
void run() {
this.current = new Resources(config)
log.debug "=== Scheduler agent resources: $current"
while( !stopped ) {
try {
if( masterId ) {
// process any pending events
processEvents()
// check for spot/preemptive termination
checkSpotTermination()
// process any pending task
if( processPendingTasks(current) )
continue
// if got a stop event just exit
else if( stopped )
break
// check if this node is doing nothing
checkIfIdle()
// wait for new messages
checkpoint.withLock {
notEmpty.await(5, TimeUnit.SECONDS)
}
}
else if( !stopped ) {
resetState()
waitForMasterNodeToJoin()
}
}
catch( InterruptedException e ) {
log.trace "=== Message processor interrupted"
stopped = true
}
catch( RejectedExecutionException e ) {
log.trace "=== Task execution rejected -- ${e.message ?: e}"
}
catch( Exception e ) {
stopped = e?.message?.contains('grid is stopping') || e?.message?.contains('cache is stopped')
log.error "=== Unexpected scheduler agent error", e
}
}
}
private void abortPendingTasks() {
if( !runningTasks ) {
return
}
log.debug "=== aborting pending tasks: taskId=${runningTasks.keySet().join(",") ?: '-'}"
def itr = runningTasks.values().iterator()
while( itr.hasNext() ) {
RunHolder holder = itr.next()
holder.future.cancel(true)
}
}
private void checkSpotTermination() {
def termination = driver?.getLocalTerminationNotice()
if( termination || (simulateSpotTermination && runningTasks)) {
log.debug "=== Detected spot termination notice: $termination -- Starting shutdown"
abortPendingTasks()
notifyNodeRetired(termination ?: 'fake-spot-termination')
stopped = true
close(true)
}
}
private void resetState() {
if( runningTasks ) {
log.trace "=== Cancelling running tasks: taskId=${runningTasks.keySet().join(', ') ?: '-'}"
def itr = runningTasks.values().iterator()
while( itr.hasNext() ) {
RunHolder holder = itr.next()
holder.future.cancel(true)
}
runningTasks.clear()
}
// reset the distributed cache
pendingTasks.clear()
current = new Resources(config)
log.trace "=== Agent resources after reset: $current"
eventsQueue.clear()
idleTimestamp = 0
}
private void waitForMasterNodeToJoin() {
int c=0
while( !masterId && !stopped ) {
if ( c++ % 60 == 0 ) {
log.debug "=== Waiting for master node to join.."
}
try {
sleep 5_000
}
catch (InterruptedException e) {
stopped = true
}
}
}
void shutdown() {
stopped = true
newMessage()
}
void newMessage() {
checkpoint.withLock {
notEmpty.signal()
}
}
void async( Closure closure ) {
eventsQueue << closure
newMessage()
}
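// e.g. runTask0 invokes async{ owner.rollbackResources(task, error) } so that
// resource bookkeeping always happens on the agent thread, with no extra locking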
/**
* Process the messages added to the {@link #eventsQueue} queue
*/
void processEvents() {
Closure msg
while( (msg=eventsQueue.poll()) && !stopped ) {
msg.call()
}
}
int processPendingTasks( Resources avail ) {
// -- introduce a penalty on error burst
int count = Math.max(fetchErrCount, execErrCount)
if( count ) {
final penalty = Math.round(Math.pow(count, 1.8d) * 1_000i)
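// e.g. count=3 consecutive errors -> round(3 ** 1.8 * 1000) = 7,225 ms, i.e. a ~7 second pause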
log.debug "=== Error burst prevention: sleep penalty=${Duration.of(penalty)}; execErrCount=$execErrCount; fetchErrCount=$fetchErrCount"
sleep penalty
}
// -- process pending tasks
try {
final processed = processPendingTasks0(avail)
fetchErrCount = 0
// propagate the number of candidate tasks seen so the main loop can spin again
return processed
}
catch (Throwable e) {
fetchErrCount++
throw e
}
}
int processPendingTasks0( Resources avail ) {
// -- find candidate tasks to be executed
def tasks = pendingTasks
.query(new ScanQuery(new MatchingResources(avail)))
.getAll()
.collect { it -> it.value }
// -- try to acquire tasks to run
def count=0
def itr = tasks.iterator()
while( itr.hasNext() && avail.cpus && avail.memory && !stopped ) {
count++
final it = itr.next()
final res = it.resources
if( !canRun(it, avail) )
continue
if( pendingTasks.getAndRemove(it.taskId) ) {
log.trace "=== Picked task up: taskId=${it.taskId}"
// -- decrement resources
avail.cpus -= res.cpus
avail.memory -= res.memory
avail.disk -= res.disk
// -- send to the executor
try {
runningTasks[it.taskId] = new RunHolder(taskExecutor.submit( runTask(it) ))
}
catch (RejectedExecutionException e) {
rollbackResources(it, true)
throw e
}
// -- reset the idle status
idleTimestamp = 0
}
}
return count
}
boolean canRun( IgBaseTask it, Resources avail ) {
final req = it.resources
log.trace "Check avail resources: taskId=${it.taskId}; req=[$req]; avail=[$avail]"
if( req.cpus && req.cpus > avail.cpus ) {
log.trace "=== Cannot execute task: taskId=${it.taskId} -- CPUs request exceeds available (req=${req.cpus}; avail=${avail.cpus})"
return false
}
if( req.memory && req.memory > avail.memory ) {
log.trace "=== Cannot execute task: taskId=${it.taskId} -- Memory request exceeds available (req=${req.memory}; avail=${avail.memory})"
return false
}
if( req.disk && req.disk > avail.disk ) {
log.trace "=== Cannot execute task: taskId=${it.taskId} -- Disk request exceeds available (req=${req.disk}; avail=${avail.disk})"
return false
}
return true
}
Runnable runTask(IgBaseTask task) {
new Runnable() {
@Override void run() { runTask0(task) }
}
}
void runTask0( IgBaseTask task ) {
// -- signal that task has started
notifyTaskStart(task)
boolean error
try {
def result = task.call()
notifyComplete(task, result)
error = result instanceof Integer && ((int)result) > 0
}
catch( InterruptedException | ClosedByInterruptException e ) {
log.trace "=== Task execution was interrupted: taskId=${task.taskId} -- Message: ${e.message ?: e}"
error = true
}
catch( Throwable e ) {
notifyError(task, e)
error = true
}
finally {
// Note: since this method runs concurrently on the executor service,
// invoke the `rollbackResources` method by posting a message back to the
// owning agent thread. This way no expensive synchronisation logic is
// needed when increasing/decreasing the resource amounts.
async{ owner.rollbackResources(task, error) }
}
}
/**
* Restore the resources "consumed" by the task
*
* @param task The {@link IgBaseTask} whose resources need to be restored
* @param errorFlag Whether the task terminated with an error
*/
void rollbackResources(IgBaseTask task, boolean errorFlag) {
final used = task.resources
final taskId = task.taskId
// -- update pending resources
if( runningTasks.containsKey(taskId)) {
runningTasks.remove(taskId)
// restore resources
current.cpus += used.cpus
current.memory += used.memory
current.disk = SysHelper.getAvailDisk()
log.trace "=== Resources after task execution: taskId=$taskId; $current"
// track the time when enter in idle state
if( current.cpus == total.cpus ) {
idleTimestamp = System.currentTimeMillis()
}
}
// -- update error counter
if( errorFlag ) {
execErrCount++
}
else {
execErrCount = 0
}
log.trace "=== Errors: flag=$errorFlag; execErrCount=$execErrCount"
}
void checkIfIdle() {
final now = System.currentTimeMillis()
if( idleTimestamp && now - idleTimestamp > _1_min ) {
// send a message to notify the `idle` status of this node
notifyNodeIdle(idleTimestamp)
// reset the timestamp to avoid sending multiple notifications for the same idle period
idleTimestamp = 0
}
}
}
/**
* Pair object holding a {@link Future} to a running task
*/
@TupleConstructor
private static class RunHolder {
Future future
}
/**
* The underlying executor service
*/
private ExecutorService taskExecutor
/**
* Distributed cache of all tasks waiting to be processed
*/
private IgniteCache<TaskId, IgBaseTask> pendingTasks
/**
* Local map of the currently running tasks
*/
private Map<TaskId, RunHolder> runningTasks = new ConcurrentHashMap<>()
/**
* Reference to the {@link Ignite} instance
*/
private Ignite ignite
/**
* Reference to the {@link ClusterConfig} object
*/
private ClusterConfig config
private AgentProcessor eventProcessor
private Resources total
private volatile boolean closed
private volatile UUID masterId
private CloudDriver driver
private boolean simulateSpotTermination
/**
* Initialise the scheduler agent
*
* @param ignite The {@link Ignite} instance
* @param config The {@link ClusterConfig} instance
* @param masterId The optional {@link UUID} of the master node, resolved from the cluster topology when not provided
*/
SchedulerAgent(Ignite ignite, ClusterConfig config, UUID masterId = null) {
this.config = config
this.ignite = ignite
this.pendingTasks = ignite.cache(PENDING_TASKS_CACHE)
this.taskExecutor = Executors.newFixedThreadPool(SysHelper.getAvailCpus())
this.total = new Resources(config)
this.driver = getCloudDriver(config)
this.eventProcessor = new AgentProcessor()
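// test hook (see checkSpotTermination): when the `simulateSpotTermination`
// attribute is set in the cluster config the agent fakes a spot instance recall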
this.simulateSpotTermination = config.getAttribute('simulateSpotTermination') as boolean
// -- register events to listen to
registerEvents()
// -- notify the node has started
this.masterId = masterId ?: getMasterNodeId()
if( this.masterId ) {
notifyNodeStart()
}
}
private CloudDriver getCloudDriver( ClusterConfig config ) {
final driverName = config.getCloudDriverName()
try {
return config.isCloudCluster() ? CloudDriverFactory.getDriver(driverName) : null
}
catch( Exception e ) {
log.error "=== Can't load cloud driver: `$driverName`", e
return null
}
}
private IgniteBiPredicate<UUID, Object> createMessageDispatcher() {
//
// agent messages
//
return { UUID uuid, Object message ->
if( message instanceof TaskAvail ) {
eventProcessor.newMessage()
}
else if( message instanceof TaskCancel ) {
onCancelTask(message)
}
else if( message instanceof NodeShutdown ) {
onNodeShutdown(uuid)
}
else {
throw new IllegalStateException("Unknown agent event: ${message?.getClass()?.getName()}")
}
return true
} as IgniteBiPredicate
}
private IgnitePredicate createEventDispatcher() {
return { Event event ->
if( event instanceof DiscoveryEvent ) {
if( event.type() == EventType.EVT_NODE_LEFT ) {
onNodeLeft(event.eventNode().id())
}
else if( event.type() == EventType.EVT_NODE_FAILED ) {
onNodeFailed(event.eventNode().id())
}
else if( event.type() == EventType.EVT_NODE_JOINED ) {
onNodeJoined(event.eventNode().id())
}
else
throw new IllegalArgumentException("Unknown event: $event")
}
return true
} as IgnitePredicate
}
private void onNodeJoined(UUID nodeId) {
if( !masterId && isMasterNode(nodeId) ) {
log.debug "=== Master node joined: nodeId=$nodeId"
masterId = nodeId
}
else {
log.debug "=== Cluster node joined: nodeId=$nodeId"
}
}
private void onNodeFailed(UUID nodeId) {
if( nodeId == masterId ) {
log.debug "=== Master node failed: nodeId=$nodeId"
masterId = null
eventProcessor.newMessage()
}
}
private void onNodeLeft(UUID nodeId) {
if( nodeId == masterId ) {
log.debug "=== Master node left: nodeId=$nodeId"
masterId = null
eventProcessor.newMessage()
}
}
private void onNodeShutdown(UUID nodeId) {
close(true)
}
/**
* Register the events the scheduler agent listens for
*/
private void registerEvents() {
ignite.message().localListen(TOPIC_AGENT_EVENTS, createMessageDispatcher())
def dispatcher = createEventDispatcher()
ignite
.events()
.localListen( dispatcher, EventType.EVT_NODE_FAILED )
ignite
.events()
.localListen( dispatcher, EventType.EVT_NODE_LEFT )
ignite
.events()
.localListen( dispatcher, EventType.EVT_NODE_JOINED )
}
/**
* Launch the scheduler agent execution logic. It checks for tasks in
* the {@link #pendingTasks} distributed cache, picks the ones whose resource
* requests match the available ones, and executes them.
*
* @return The {@link SchedulerAgent} instance itself
*/
SchedulerAgent run() {
eventProcessor.start()
return this
}
private boolean isMasterNode(UUID nodeId) {
ignite.cluster().node(nodeId).attribute(IgGridFactory.NODE_ROLE) == ROLE_MASTER
}
private UUID getMasterNodeId() {
ignite.cluster().forAttribute(IgGridFactory.NODE_ROLE, ROLE_MASTER)?.node()?.id()
}
private void sendMessageToMaster( String topic, message ) {
if( !masterId ) {
log.debug "=== Master node is unknown -- Cannot send message: [${message.getClass().getSimpleName()}] $message"
return
}
if( closed ) {
log.debug "=== Shutdown in progress -- Wont send message: [${message.getClass().getSimpleName()}] $message"
return
}
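// note: sendOrdered delivers messages on the given topic in submission order;
// the last argument is the delivery timeout (0 falls back to the grid default)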
try {
final master = ignite.cluster().forNodeId(masterId)
ignite.message(master).sendOrdered(topic, message, 0)
}
catch ( ClusterGroupEmptyException e ) {
log.debug "=== Master node is not available -- Cannot send message: [${message.getClass().getSimpleName()}] $message"
}
}
/**
* Notify the scheduler that the task execution has started by sending a
* {@link TaskStart} message
*/
@PackageScope void notifyTaskStart(IgBaseTask task) {
sendMessageToMaster(TOPIC_SCHEDULER_EVENTS, new TaskStart(task))
}
@PackageScope void notifyNodeStart() {
def data = NodeData.create(config, ignite)
sendMessageToMaster(TOPIC_SCHEDULER_EVENTS, data)
}
@PackageScope void notifyNodeRetired(String termination) {
sendMessageToMaster(TOPIC_SCHEDULER_EVENTS, new Protocol.NodeRetired(termination))
}
/**
* Notify the scheduler that a task execution has completed by sending a {@link TaskComplete} message
*
* @param task
* @param result
*/
@PackageScope void notifyComplete(IgBaseTask task, result) {
try {
log.trace "=== Notify task complete: taskId=${task.taskId}; result=$result"
final payload = TaskComplete.create(task, result)
sendMessageToMaster(TOPIC_SCHEDULER_EVENTS, payload)
}
catch( Exception e ) {
log.error "=== Failed to notify task completion: taskId=${task.taskId}; result=$result", e
}
}
/**
* Notify the scheduler that a task execution has failed by sending a {@link TaskComplete} message
*
* @param task
* @param error
*/
@PackageScope void notifyError(IgBaseTask task, Throwable error) {
try {
final taskId = task.taskId
log.trace "=== Notify task complete [error]: taskId=${taskId}; error=$error"
final payload = TaskComplete.error(task, error)
sendMessageToMaster(TOPIC_SCHEDULER_EVENTS, payload)
}
catch( Exception e ) {
log.error "=== Failed to notify task completion: taskId=${task.taskId}; error=$error", e
}
}
@PackageScope void notifyNodeIdle(long last) {
log.trace "=== Notify node idle"
sendMessageToMaster(TOPIC_SCHEDULER_EVENTS, new NodeIdle(last))
}
/**
* Method handler invoked when receiving a {@link TaskCancel} message. It
* cancels the specified running task.
*
* @param message
*/
@PackageScope void onCancelTask( TaskCancel message ) {
def holder = runningTasks.get(message.taskId)
if( holder ) {
log.trace "=== Cancelling task: taskId=${message.taskId}"
holder.future.cancel(true)
}
else {
log.trace "=== Unable to find task to cancel: taskId=${message.taskId}"
}
}
/**
* Shutdown the scheduler agent
*
* @param shutdownIgnite When {@code true} also stops the underlying Ignite instance
*/
synchronized void close(boolean shutdownIgnite = false) {
if( closed ) return
log.debug "=== Scheduler agent shutting down"
closed = true
// -- shutdown executor & agent processor
taskExecutor.shutdownNow()
eventProcessor.shutdown()
if( !shutdownIgnite )
return
// -- shutdown the ignite instance
Thread.start {
print "Cleaning up.. "
sleep 3_000; // give a few seconds to send pending messages
ignite.close()
println "Done."
}
}
}