org.apache.uima.ducc.rm.scheduler.Scheduler Maven / Gradle / Ivy
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
package org.apache.uima.ducc.rm.scheduler;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.uima.ducc.common.Node;
import org.apache.uima.ducc.common.NodeConfiguration;
import org.apache.uima.ducc.common.NodeIdentity;
import org.apache.uima.ducc.common.Pair;
import org.apache.uima.ducc.common.admin.event.RmAdminQLoadReply;
import org.apache.uima.ducc.common.admin.event.RmAdminQOccupancyReply;
import org.apache.uima.ducc.common.admin.event.RmAdminReply;
import org.apache.uima.ducc.common.admin.event.RmAdminVaryReply;
import org.apache.uima.ducc.common.admin.event.RmQueriedClass;
import org.apache.uima.ducc.common.admin.event.RmQueriedMachine;
import org.apache.uima.ducc.common.admin.event.RmQueriedNodepool;
import org.apache.uima.ducc.common.component.AbstractDuccComponent;
import org.apache.uima.ducc.common.persistence.rm.IRmPersistence;
import org.apache.uima.ducc.common.persistence.rm.RmPersistenceFactory;
import org.apache.uima.ducc.common.utils.DuccLogger;
import org.apache.uima.ducc.common.utils.DuccProperties;
import org.apache.uima.ducc.common.utils.DuccPropertiesResolver;
import org.apache.uima.ducc.common.utils.SystemPropertyResolver;
import org.apache.uima.ducc.common.utils.Version;
import org.apache.uima.ducc.common.utils.id.DuccId;
import org.apache.uima.ducc.common.utils.id.DuccIdFactory;
* This process orchestrates scheduling.
* - Receives requests from clients ( job manager, service manager, etc ) for resources
* - Forwards requests and current state to pluggable scheduling implementation
* - Receives a schedule, updates state, sends responses to requestors
* - Maintains state as needed (work item life cycle etc)
public class Scheduler
// extends Thread
implements ISchedulerMain,
// Instance state of the scheduler. Most collections below are used as hand-off queues
// or indexes by the methods of this class.
// Assigned through setJobManager().
IJobManager jobManager;
static DuccLogger logger = DuccLogger.getLogger(Scheduler.class, COMPONENT_NAME);
boolean done = false; // set to true by shutdown()
// Boolean force_epoch = false;
String ducc_home;
// Integer epoch = 5; // scheduling epoch, seconds
NodeConfiguration configuration = null; // UIMA-4142 make it global
String defaultDomain = null; // UIMA-4142
boolean needRecovery = false; // UIMA-4142 tell outer layer that recovery is required
AbstractDuccComponent baseComponent; // UIMA-4142, pass in the base for reconfig - reread ducc.properties
NodePool[] nodepools; // top-level nodepools
int max_order = 0;
// Fair-share and fixed-share use shares only, not machines
Map busyShares = new HashMap(); // Running "fair" share jobs
// incoming reports of machines that are now free
Map> vacatedShares= new HashMap>();
// boolean growthOccurred = false; // don't care which grew, just that something grew
List incomingJobs = new ArrayList(); // coming in from external world but not added our queues yet
List recoveredJobs = new ArrayList(); // coming in from external world but we don't know about them, (hopefully
// because we crashed and not for more nefarious reasons)
List completedJobs = new ArrayList(); // signaled complete from outside but not yet dealt with
List initializedJobs = new ArrayList(); // Init is complete so we can begin full (un)fair share allocation
//HashMap incomingNodes = new HashMap(); // node updates
Map deadNodes = new HashMap(); // missed too many heartbeats
Map illNodes = new HashMap(); // starting to miss, keep track of how many for the db
// HashMap allNodes = new HashMap(); // the guys we know
Map nodepoolsByNode = new HashMap(); // all nodes, and their associated pool
Map shortToLongNode = new HashMap(); //
Map users = new HashMap(); // Active users - has a job in the system
//HashMap runningJobs = new HashMap();
Map allJobs = new HashMap();
Map resourceClasses = new HashMap();
Map resourceClassesByName = new HashMap();
String defaultFairShareName = null;
String defaultReserveName = null;
// Fallbacks for jobs that do not specify these values; init() re-reads them from properties.
int defaultNThreads = 1;
int defaultNTasks = 10;
int defaultMemory = 15;
// these two are initialized in constructor
// NOTE(review): in this view both are assigned in initClasses(), not in the constructor - confirm.
String schedImplName;
IScheduler[] schedulers;
long share_free_dram = 0; // 0 GB in KB - minimum memory after shares are allocated
long dramOverride = 0; // if > 0, use this instead of amount reported by agents (modeling and testing)
int pending_evictions = 0; // for queries
int pending_expansions = 0; // for queries
EvictionPolicy evictionPolicy = EvictionPolicy.SHRINK_BY_MACHINE;
// int nodeMetricsUpdateRate = 30000;
// int startupCountdown = 0; // update each epoch. only schedule when it's > nodeStability
int nodeStability = 3;
boolean stability = false; // returned by ready(); set to true by start()
private static DuccIdFactory idFactory;
IRmPersistence persistence = null;
// static boolean expandByDoubling = true;
// static int initializationCap = 2; // Max allocation until we know initialization works in
// units of *processes*, not shares (i.e.N-shares).
// Version
// 0 - major version
// 6 - minor version
// 3 - ptf - forced eviction under fragmentation.
// 4 - defrag code complete
// beta - not yet "real"!
// Bring up to speed with rest of ducc version. 2013-03-06 jrc
// 1.0.1 - RM can purge non-preemptables except for Unmanaged Reservations. UIMA-3614
// 1.0.2 - vary-on, vary-off
// 1.0.3 - fix bad check in recursion in NodepoolScheduler.doEvictions
// 1.1.0 - Synchronize with release
final static int rmversion_major = 2;
final static int rmversion_minor = 0;
final static int rmversion_ptf = 0;
final static String rmversion_string = null;
boolean initialized = false; // we refuse nodeupdates until this is true
// Constructor: stores the supplied base component in the baseComponent field.
public Scheduler(AbstractDuccComponent baseComponent)
this.baseComponent = baseComponent; // UIMA-4142, pass in the base for reconfig
// Reads the RM tunables from system properties into the fields of this object, logs the
// effective settings, obtains the persistence instance and finally sets initialized = true.
// Declared to throw Exception; the catch below logs and rethrows.
public synchronized void init()
throws Exception
String methodName = "init";
// NOTE(review): assuming EvictionPolicy is an enum, valueOf throws IllegalArgumentException
// when the configured string names no constant - confirm that is the intended failure mode.
String ep = SystemPropertyResolver.getStringProperty("ducc.rm.eviction.policy", "SHRINK_BY_MACHINE");
evictionPolicy = EvictionPolicy.valueOf(ep);
// nodepool = new NodePool(null, evictionPolicy, 0); // global nodepool
// The property is read in GB (defaulting to the field's current value) and stored in KB.
share_free_dram = SystemPropertyResolver.getLongProperty("ducc.rm.reserved.dram", share_free_dram) * 1024 * 1024; // GB -> KB
ducc_home = SystemPropertyResolver.getStringProperty("DUCC_HOME");
// some defaults, for jobs that don't specify them
defaultNTasks = SystemPropertyResolver.getIntProperty("ducc.rm.default.tasks", 10);
defaultNThreads = SystemPropertyResolver.getIntProperty("ducc.rm.default.threads", 1);
defaultMemory = SystemPropertyResolver.getIntProperty("ducc.rm.default.memory", 15); // in GB
// expandByDoubling = RmUtil.getBooleanProperty("ducc.rm.expand.by.doubling", true);
nodeStability = SystemPropertyResolver.getIntProperty("ducc.rm.node.stability", 3); // number of node metrics updates to wait for before scheduling
// 0 means, just jump right in and don't wait
dramOverride = SystemPropertyResolver.getLongProperty("ducc.rm.override.dram", 0);
if ( dramOverride > 0 ) {
dramOverride = dramOverride * (1024 * 1024); // convert to KB
// idFactory is static, so it survives repeated calls to init().
if ( idFactory == null ) { // UIMA-4142 only remake it on first boot
idFactory = new DuccIdFactory(1);
// try {
// schedImplName = SystemPropertyResolver.getStringProperty("ducc.rm.scheduler", "org.apache.uima.ducc.rm.ClassBasedScheduler");
// @SuppressWarnings("unchecked")
// Class cl = (Class) Class.forName(schedImplName);
// scheduler = (IScheduler) cl.newInstance();
// } catch (ClassNotFoundException e) {
// throw new SchedulingException(null, "Cannot find class " + schedImplName);
// } catch (InstantiationException e) {
// throw new SchedulingException(null, "Cannot instantiate class " + schedImplName);
// } catch (IllegalAccessException e) {
// throw new SchedulingException(null, "Cannot instantiate class " + schedImplName + ": can't access constructor.");
// }
// class_definitions is only used for the log line further down in this method.
String class_definitions = SystemPropertyResolver
.ducc_rm_class_definitions, "scheduler.classes");
class_definitions = System.getProperty("DUCC_HOME") + "/resources/" + class_definitions;
// NOTE(review): no statement is visible inside this try in this view; confirm what it guards.
try {
} catch ( Exception e ) {
logger.error(methodName, null, e);
throw e;
// we share most of the state with the actual scheduling code - no need to keep passing this around
// TODO: Make sure these are all Sialized correctly
// scheduler.setEvictionPolicy(evictionPolicy);
// scheduler.setClasses(resourceClasses);
// scheduler.setNodePool(nodepools[0]);
// Everything from here to the persistence lookup is a dump of the effective settings to the log.
logger.info(methodName, null, " reserved DRAM : ", (share_free_dram / (1024*1024)), " GB");
logger.info(methodName, null, " DRAM override : ", (dramOverride / (1024*1024)), " GB");
logger.info(methodName, null, " scheduler : ", schedImplName);
logger.info(methodName, null, " default threads : ", defaultNThreads);
logger.info(methodName, null, " default tasks : ", defaultNTasks);
logger.info(methodName, null, " default memory : ", defaultMemory);
logger.info(methodName, null, " default fairshare class : ", defaultFairShareName);
logger.info(methodName, null, " default reserve : ", defaultReserveName);
logger.info(methodName, null, " class definition file : ", class_definitions);
logger.info(methodName, null, " default domain : ", defaultDomain); // UIMA-4142
logger.info(methodName, null, " eviction policy : ", evictionPolicy);
// NOTE(review): System.getProperty returns null when the property is not set, in which case
// the .equals() call on the next line throws NullPointerException. Confirm the property is
// always defined, or compare as "--disabled--".equals(...).
logger.info(methodName, null, " database enabled : ", !System.getProperty("ducc.database.host").equals("--disabled--"));
logger.info(methodName, null, " database implementation : ", System.getProperty("ducc.rm.persistence.impl"));
logger.info(methodName, null, " use prediction : ", SystemPropertyResolver.getBooleanProperty("ducc.rm.prediction", true));
logger.info(methodName, null, " prediction fudge factor : ", SystemPropertyResolver.getIntProperty("ducc.rm.prediction.fudge", 10000));
logger.info(methodName, null, " node stability : ", nodeStability);
logger.info(methodName, null, " init stability : ", SystemPropertyResolver.getIntProperty("ducc.rm.init.stability"));
logger.info(methodName, null, " fast recovery : ", SystemPropertyResolver.getBooleanProperty("ducc.rm.fast.recovery", true));
logger.info(methodName, null, " metrics update rate : ", SystemPropertyResolver.getIntProperty("ducc.agent.node.metrics.publish.rate",
logger.info(methodName, null, " initialization cap : ", SystemPropertyResolver.getIntProperty("ducc.rm.initialization.cap"));
logger.info(methodName, null, " expand by doubling : ", SystemPropertyResolver.getBooleanProperty("ducc.rm.expand.by.doubling", true));
logger.info(methodName, null, " fragmentation threshold : ", SystemPropertyResolver.getIntProperty("ducc.rm.fragmentation.threshold", 2));
logger.info(methodName, null, " do defragmentation : ", SystemPropertyResolver.getBooleanProperty("ducc.rm.defragmentation", true));
logger.info(methodName, null, " DUCC home : ", System.getProperty("DUCC_HOME"));
logger.info(methodName, null, " ActiveMQ URL : ", SystemPropertyResolver.getStringProperty("ducc.broker.url"));
logger.info(methodName, null, " JVM : ", System.getProperty("java.vendor") +
" "+ System.getProperty("java.version"));
logger.info(methodName, null, " JAVA_HOME : ", System.getProperty("java.home"));
logger.info(methodName, null, " JVM Path : ", System.getProperty("ducc.jvm"));
logger.info(methodName, null, " JMX URL : ", System.getProperty("ducc.jmx.url"));
logger.info(methodName, null, " OS Architecture : ", System.getProperty("os.arch"));
logger.info(methodName, null, " OS Name : ", System.getProperty("os.name"));
logger.info(methodName, null, " DUCC Version : ", Version.version());
logger.info(methodName, null, " RM Version : ", ""+ rmversion_major + "."
+ rmversion_minor + "."
+ rmversion_ptf);
persistence = RmPersistenceFactory.getInstance(this.getClass().getName(), "RM");
// Last step: from here on isInitialized() reports true.
initialized = true;
// Admin-triggered reconfiguration (UIMA-4142).
// Visible steps: clear the initialized flag, trial-read the configuration, log the offline
// machines, drop the current configuration / nodepool references, request recovery through
// setRecovery(true), set the initialized flag again.
// Returns an RmAdminReply whose message reports completion or the failure reason.
public RmAdminReply reconfigure() // UIMA-4142
String methodName = "reconfigure";
RmAdminReply ret = new RmAdminReply();
logger.info(methodName, null, "Reconfiguration starts.");
setInitialized(false); // stop receipt of OR and Agent publications
// First we run the logic that reads the configuration, and if it fails, we abort the reconfig without crashing
// We'll throw it away because we call init() in a minute, which will do the actual configuration, as if booting
try {
readConfiguration(); // UIMA-4142
} catch (Throwable e) {
// NOTE(review): on this path no setInitialized(true) is visible before the return, which
// would leave the scheduler refusing publications after a failed reconfigure - confirm.
logger.warn(methodName, null, "Reconfiguration aborted:", e.toString());
ret.setMessage("Reconfiguration failed: " + e.toString());
return ret;
HashMap offlineMachines = new HashMap();
for (NodePool np : nodepools) {
// (be careful, don't use the value, that must be discarded as it points to the OLD np)
List offlineHostnames = new ArrayList();
for ( Machine m : offlineMachines.values()) {
logger.info(methodName, null, "Saving offline status of", m.getId());
offlineMachines = null;
// Discard the references to the old configuration.
this.configuration = null;
this.defaultDomain = null;
this.nodepools = null;
this.max_order = 0;
try {
if ( offlineHostnames.size() > 0 ) {
String[] offline = offlineHostnames.toArray(new String[offlineHostnames.size()]);
} catch ( Throwable t ) {
// NOTE(review): the Throwable is neither logged nor rethrown in the visible code.
// TODO do something? What? If this fails its pretty awful.
setRecovery(true); // signal to outer layer that full recovery is needed
setInitialized(true); // resume receipt of publications
logger.info(methodName, null, "Reconfiguration complete.");
ret.setMessage("Reconfiguration complete.");
return ret;
// Sets the needRecovery flag that mustRecover() reports.
public synchronized void setRecovery(boolean v)
this.needRecovery = v;
// Returns the needRecovery flag last stored by setRecovery().
public synchronized boolean mustRecover()
return this.needRecovery;
// Returns the initialized flag. schedule(), handleIllNodes() and handleDeadNodes() test it
// and log "Waiting for (re)initialization." while it is false.
public synchronized boolean isInitialized()
return initialized;
// Sets the initialized flag returned by isInitialized(); reconfigure() toggles it around its work.
public synchronized void setInitialized(boolean v)
this.initialized = v;
// Convenience overload: delegates to getMachine(NodeIdentity) using the node's identity.
public Machine getMachine(Node n)
return getMachine(n.getNodeIdentity());
// Finds the nodepool for the given identity and asks that pool for the machine.
// Callers in this file test the result for null before using it.
// NOTE(review): getNodepoolByName is not visible here; if it can return null for an unknown
// node, the dereference below throws NullPointerException - confirm.
public Machine getMachine(NodeIdentity ni)
NodePool nodepool = getNodepoolByName(ni);
return nodepool.getMachine(ni);
// Stores the job manager reference in the jobManager field.
public void setJobManager(IJobManager jobmanager)
this.jobManager = jobmanager;
// Returns defaultFairShareName; initClasses() sets it when the configuration has a default
// fair-share class, otherwise it remains null.
public String getDefaultFairShareName()
return defaultFairShareName;
// Returns defaultReserveName; initClasses() sets it when the configuration has a default
// reserve class, otherwise it remains null.
public String getDefaultReserveName()
return defaultReserveName;
// Returns the default thread count (property ducc.rm.default.threads, read in init()).
public int getDefaultNThreads()
return defaultNThreads;
// Returns the default task count (property ducc.rm.default.tasks, read in init()).
public int getDefaultNTasks()
return defaultNTasks;
// Returns the default memory in GB (property ducc.rm.default.memory, read in init()).
public int getDefaultMemory()
return defaultMemory;
// Looks a resource class up by name in resourceClassesByName (filled by initClasses()).
// Map.get yields null when the name is not present.
public ResourceClass getResourceClass(String name)
return resourceClassesByName.get(name);
// Looks a job up by id in allJobs; null when the id is not present.
public IRmJob getJob(DuccId id)
return allJobs.get(id);
// Looks a share up by id in busyShares (dispatch() adds promoted shares there); null when absent.
public Share getShare(DuccId id)
return busyShares.get(id);
// public static int getInitializationCap()
// {
// return initializationCap;
// }
// public static boolean isExpandByDoubling()
// {
// return expandByDoubling;
// }
* Calculate share order, given some memory size in GB (as in from a job spec)
public int calcShareOrder(IRmJob j)
// Calculate its share order
long mem = j.getMemory() << 20 ; // to KB from GB
int share_quantum = j.getShareQuantum();
int share_order = (int) (mem / share_quantum); // liberal calc, round UP
if ( (mem % share_quantum) > 0 ) {
return share_order;
* Collect all the classes served by the indicated nodepool (property set). This fills
* in the 'ret' map from the parameter 'dp' and recursive calls to the children in dp.
* @param dp This is the properties object from the configurator for a top-level
* nodepool.
* @param ret This is the map to be filled in by this routine.
void getClassesForNodepool(DuccProperties dp, Map ret)
List class_set = (List) dp.get("classes");
if ( class_set != null ) {
for ( DuccProperties cl : class_set ) {
ResourceClass rc = resourceClassesByName.get(cl.getStringProperty("name"));
ret.put(rc, rc);
List children = (List) dp.get("children");
if ( children != null ) {
for (DuccProperties child : children ) {
getClassesForNodepool(child, ret);
* Map each node by name into the nodepool it belongs to
// Registers every key of 'nodes' against 'pool' through updateNodepoolsByNode.
// A null 'nodes' map is accepted and treated as nothing to do.
void mapNodesToNodepool(Map nodes, NodePool pool)
if ( nodes == null ) return;
for ( String s : nodes.keySet() ) {
updateNodepoolsByNode(s, pool); // maps from both the fully-qualified name and the short name
* (Recursively) build up the hierarchy under the parent nodepool.
// For each child definition: creates a subpool under 'parent', registers the child's nodes
// against it, then recurses with the child's own "children" list.
// A null 'children' list ends the recursion.
void createSubpools(NodePool parent, List children)
if ( children == null ) return;
for ( DuccProperties dp : children ) {
String id = dp.getStringProperty("name");
Map nodes = (Map) dp.get("nodes");
int search_order = dp.getIntProperty("search-order", 100); // 100 is the default when the key is absent
NodePool child = parent.createSubpool(id, nodes, search_order);
mapNodesToNodepool(nodes, child);
List grandkids = (List) dp.get("children");
createSubpools(child, grandkids);
// UIMA-4142 better modularize this code
// Builds a NodeConfiguration from the class-definition file under $DUCC_HOME/resources and
// the user registry name, using a dedicated logger, and returns it.
// Used by reconfigure() as a trial read and by initClasses() for the real configuration.
// NOTE(review): only construction of the NodeConfiguration is visible here; confirm where the
// file is actually parsed.
NodeConfiguration readConfiguration()
throws Exception
String class_definitions = SystemPropertyResolver
.ducc_rm_class_definitions, "scheduler.classes");
String user_registry = SystemPropertyResolver
.ducc_rm_user_registry, "ducc.users");
// NOTE(review): System.getProperty("DUCC_HOME") is null when unset, which would produce a
// path starting with the literal text "null".
class_definitions = System.getProperty("DUCC_HOME") + "/resources/" + class_definitions;
String me = Scheduler.class.getName() + ".Config";
DuccLogger initLogger = new DuccLogger(me, COMPONENT_NAME);
NodeConfiguration nc = new NodeConfiguration(class_definitions, null, user_registry, initLogger); // UIMA-4142 make the config global
return nc; // UIMA-4142
// UIMA-4142, don't pass in class def file, instead use common readConfiguration
// Builds the in-memory model from the configuration:
//   1. reads the configuration (readConfiguration),
//   2. creates a ResourceClass per configured class and indexes it by object and by name,
//   3. instantiates one IScheduler per top-level nodepool,
//   4. creates the top-level nodepools and their subpools and links classes to pools,
//   5. creates or updates User records with "max-allotment" limits from the registry.
void initClasses()
String methodName = "initClasses";
try {
configuration = readConfiguration();
} catch (Throwable e) {
// RM boot. We must abort being has we have no prior working configuration to fall back to.
logger.error(methodName, null, e);
logger.error(methodName, null, "Scheduler exits: unable to read configuration.");
System.out.println("Scheduler exits: unable to read configuration.");
// NOTE(review): no exit or rethrow is visible after the messages above; if control continues,
// 'configuration' is dereferenced on the next line - confirm.
defaultDomain = configuration.getDefaultDomain(); // UIMA-4142
DuccProperties[] nps = configuration.getToplevelNodepools();
Map cls = configuration.getClasses();
nodepools = new NodePool[nps.length]; // top-level nodepools
schedulers = new IScheduler[nps.length]; // a scheduler for each top-level nodepool
// Here build up the ResourceClass definitions
logger.info(methodName, null, "Classes:");
logger.info(methodName, null, ResourceClass.getHeader());
logger.info(methodName, null, ResourceClass.getDashes());
for ( DuccProperties props : cls.values() ) {
ResourceClass rc = new ResourceClass(props);
resourceClasses.put(rc, rc);
resourceClassesByName.put(rc.getName(), rc);
logger.info(methodName, null, rc.toString());
DuccProperties dc = configuration.getDefaultFairShareClass();
if ( dc != null ) {
defaultFairShareName = dc.getProperty("name");
dc = configuration.getDefaultReserveClass();
if ( dc != null ) {
defaultReserveName = dc.getProperty("name");
// Instantiate one scheduler per top-level nodepool
// NOTE(review): Class.newInstance() is deprecated since Java 9 in favour of
// getDeclaredConstructor().newInstance(); the catch clauses below also drop the original
// exception as cause.
try {
schedImplName = SystemPropertyResolver.getStringProperty("ducc.rm.scheduler", "org.apache.uima.ducc.rm.ClassBasedScheduler");
Class cl = (Class) Class.forName(schedImplName);
for ( int i = 0; i < nps.length; i++ ) {
logger.info(methodName, null, "Rebuilding", schedImplName, "for top level nodepool", nps[i].get("name"));
schedulers[i] = (IScheduler) cl.newInstance();
} catch (ClassNotFoundException e) {
throw new SchedulingException(null, "Cannot find class " + schedImplName);
} catch (InstantiationException e) {
throw new SchedulingException(null, "Cannot instantiate class " + schedImplName);
} catch (IllegalAccessException e) {
throw new SchedulingException(null, "Cannot instantiate class " + schedImplName + ": can't access constructor.");
// Here create the nodepool configuration
for ( int i = 0; i < nps.length; i++ ) {
DuccProperties np = nps[i];
String id = np.getStringProperty("name");
Map nodes = (Map) np.get("nodes");
int search_order = np.getIntProperty("search-order", 100);
int q = np.getIntProperty("share-quantum", 15) << 20 ; // to kB which is how the nodes report in
nodepools[i] = new NodePool(null, id, nodes, evictionPolicy, 0, search_order, q);
schedulers[i].setNodePool(nodepools[i]); // set its top-level nodepool
mapNodesToNodepool(nodes, nodepools[i]);
logger.info(methodName, null, "Created top-level nodepool", id);
List children = (List) np.get("children");
createSubpools(nodepools[i], children);
Map classesForNp = new HashMap();
getClassesForNodepool(np, classesForNp); // all classes served by this hierarchy - fills in classesForNp
for ( ResourceClass rc: classesForNp.values() ) { // UIMA-4065 tell each cl which np serves it
String rcid = rc.getNodepoolName();
if ( rcid != null ) {
// set the two-way pointers between rc and np
// NOTE(review): getSubpool's behaviour for an unknown name is not visible here; a null
// result would fail at subpool.getId() below.
NodePool subpool = nodepools[i].getSubpool(rcid);
rc.setNodepool(subpool); // rc -> nodepool
logger.info(methodName, null, "Assign rc", rc.getName(), "to np", subpool.getId());
subpool.addResourceClass(rc); // nodepool -> rc
// Here create or update Users with constraints from the registry
Map usrs = configuration.getUsers(); // UIMA-4275
for ( Object o : usrs.keySet() ) { // iterate over users
String n = (String) o;
DuccProperties dp = usrs.get(n);
for ( Object l : dp.keySet() ) { // iterate over limits for the user
if ( !((String)l).startsWith("max-allotment")) continue; // only this supported at this time
String val = ((String) dp.get(l)).trim();
int lim = Integer.parseInt( val ); // verified parsable int during parsing
User user = users.get(n);
if (user == null) {
user = new User(n);
users.put(n, user);
// NOTE(review): 'val' was just accepted by Integer.parseInt, so it cannot contain ".".
// The split on the following line works on the key 'l' ("max-allotment.classname"), which
// suggests this test was meant to inspect the key; as written the per-class branch looks
// unreachable - confirm.
if ( val.contains(".") ) {
String[] tmp = ((String)l).split("\\."); // max_allotment.classname
ResourceClass rc = resourceClassesByName.get(tmp[1]);
user.overrideLimit(rc, lim); // constrain allotment for this class to value in l
} else {
* Called only from schedule, under the 'this' monitor.
* We then take the SchedulingUpdate from the IScheduler and dispatch orders to
* the world to make it happen.
* For jobs that lose resources, job manager is asked to stop execution in specific shares.
* For jobs that gain resources, job manager is asked to start execution in specific shares.
* Jobs that don't change are leftovers. If they're not running at all, they're in the pending
* list; they might also be in the running list but had no allocation changes in the current epoch.
// Translates a SchedulingUpdate into a JobManagerUpdate.
//   upd - result of the scheduling pass; its shrunken / expanded / stable / dormant /
//         reserved / refused job maps are walked in that order.
//   jmu - update being built for the job manager; receives removeShares / addShares calls.
// Also recomputes pending_evictions and pending_expansions (share count * share order).
// Returns the same jmu instance that was passed in.
// Throws SchedulingException when an expanded job has no promoted shares or a stable job
// reports a negative share count.
private JobManagerUpdate dispatch(SchedulingUpdate upd, JobManagerUpdate jmu)
String methodName = "dispatch";
HashMap jobs;
pending_evictions = 0; // for queries
pending_expansions = 0; // for queries
// Go through shrunken jobs - if they are shrunken to 0, move to dormant
jobs = upd.getShrunkenJobs();
for (IRmJob j : jobs.values()) {
logger.trace(methodName, j.getId(), ">>>>>>>>>> SHRINK");
HashMap sharesE = j.getAssignedShares(); // shares the job currently holds
HashMap sharesR = j.getPendingRemoves(); // shares scheduled to be taken away
logger.trace(methodName, j.getId(), "removing", sharesR.size(), "of existing", sharesE.size(), "shares.");
pending_evictions += (sharesR.size() * j.getShareOrder());
for ( Share s : sharesE.values() ) {
logger.trace(methodName, j.getId(), " current", s.toString());
for ( Share s : sharesR.values() ) {
logger.trace(methodName, j.getId(), " remove ", s.toString());
logger.trace(methodName, j.getId(), ">>>>>>>>>>");
jmu.removeShares(j, sharesR);
// jobManager.stopJob(j, shares); // stops job on everything on the pendingRemoves list
// j.clearPendingRemoves();
// Go through expanded jobs - if they are dormant, remove from dormant
// then add to running.
// Tell the server it needs to start some machines for the job
jobs = upd.getExpandedJobs();
for (IRmJob j : jobs.values() ) {
HashMap sharesE = j.getAssignedShares();
HashMap sharesN = j.getPendingShares(); // shares newly granted in this pass
logger.trace(methodName, j.getId(), "<<<<<<<<<< EXPAND");
logger.trace(methodName, j.getId(), "adding", sharesN.size(), "new shares to existing", sharesE.size(), "shares.");
pending_expansions += (sharesN.size() * j.getShareOrder());
for ( Share s : sharesE.values()) {
logger.trace(methodName, j.getId(), " existing ", s.toString());
for ( Share s : sharesN.values()) {
logger.trace(methodName, j.getId(), " expanding", s.toString());
logger.trace(methodName, j.getId(), "<<<<<<<<<<");
// sharesN is re-read from promoteShares(); the result is what gets recorded as busy below.
sharesN = j.promoteShares();
if ( sharesN.size() == 0 ) {
// internal error - should not be marked expanded if no machines
throw new SchedulingException(j.getId(), "Trying to execute expanded job but no pending machines.");
for ( Share s : sharesN.values()) { // update machine books
// Sanity checks on the bookkeeping
busyShares.put(s.getId(), s);
// DuccId id = j.getId(); // pull from dormant, maybe
// if ( dormantJobs .containsKey(id) ) {
// dormantJobs .remove(id);
// }
//runningJobs.put(id, j);
jmu.addShares(j, sharesN);
// jobManager.executeJob(j, shares); // will update job's pending lists
jobs = upd.getStableJobs(); // squirrel these away to try next time
for (IRmJob j: jobs.values()) {
if ( j.countNShares() < 0 ) {
throw new SchedulingException(j.getId(), "Share count went negative " + j.countNShares());
logger.trace(methodName, j.getId(), ".......... STABLE with ", j.countNShares(), " shares.");
jobs = upd.getDormantJobs(); // squirrel these away to try next time
for (IRmJob j: jobs.values()) {
logger.trace(methodName, j.getId(), ".......... DORMANT");
// dormantJobs .put(j.getId(), j);
// Reservations: only forwarded to the job manager when the pending set alone equals the
// job's maximum share count.
jobs = upd.getReservedJobs();
for (IRmJob j: jobs.values()) {
logger.trace(methodName, j.getId(), "<<<<<<<<<< RESERVE");
HashMap sharesE = j.getAssignedShares();
HashMap sharesN = j.getPendingShares();
if ( sharesE.size() == j.getMaxShares() ) {
logger.trace(methodName, j.getId(), "reserve_stable", sharesE.size(), "machines");
} else if ( sharesN.size() == j.getMaxShares() ) { // reservation is complete but not yet confirmed?
logger.trace(methodName, j.getId(), "reserve_adding", sharesN.size(), "machines");
for ( Share s : sharesN.values()) {
logger.trace(methodName, j.getId(), " reserve_expanding ", s.toString());
jmu.addShares(j, sharesN);
} else {
logger.trace(methodName, j.getId(), "reserve_pending", j.getMaxShares(), "machines");
logger.trace(methodName, j.getId(), "<<<<<<<<<<");
// Refused jobs are only traced in the code visible here.
jobs = upd.getRefusedJobs();
Iterator iter = jobs.values().iterator();
while ( iter.hasNext() ) {
IRmJob j = iter.next();
logger.trace(methodName, j.getId(), ".......... REFUSED");
return jmu;
* We don't accept new work or even Orchestrator state updates until "ready". We do
* want machines, but be sure the internal structures are protected.
// Returns the stability flag; schedule() returns null while this is false.
public synchronized boolean ready()
return stability;
// Sets the stability flag, after which ready() reports true.
public synchronized void start()
stability = true;
// NOTE(review): no body statements are visible for this method in this view.
public void stop()
// Processes nodes recorded in illNodes (per the field comment: nodes starting to miss
// heartbeats, with a count kept per node). Logs and waits while the scheduler is not initialized.
// NOTE(review): the statements that move entries from illNodes into nodeUpdates, and the
// bodies of the two count branches, are not visible in this view.
protected void handleIllNodes()
String methodName = "handleIllNodes";
if ( ! isInitialized() ) {
logger.info(methodName, null, "Waiting for (re)initialization.");
HashMap nodeUpdates = new HashMap(); // local working copy, keyed by Node
synchronized(illNodes) {
synchronized(this) {
for ( Node n : nodeUpdates.keySet() ) {
Machine m = getMachine(n);
if ( m == null ) {
logger.warn(methodName, null, "Cannot find any record of machine", n.getNodeIdentity().getName());
int count = nodeUpdates.get(n);
if ( count == 0 ) {
} else {
// Processes nodes recorded in deadNodes (per the field comment: nodes that missed too many
// heartbeats) and logs a purge warning for each machine found. Logs and waits while the
// scheduler is not initialized.
// NOTE(review): the statements that move entries from deadNodes into nodeUpdates, and what is
// done with the machine's nodepool, are not visible in this view.
protected void handleDeadNodes()
String methodName = "handleDeadNodes";
if ( ! isInitialized() ) {
logger.info(methodName, null, "Waiting for (re)initialization.");
HashMap nodeUpdates = new HashMap(); // local working copy; its values are Nodes
synchronized(deadNodes) {
synchronized(this) {
for ( Node n : nodeUpdates.values() ) {
Machine m = getMachine(n);
// NOTE(review): as shown, 'm' is dereferenced by the warn call right after this null
// test; confirm the null branch leaves the iteration before that line.
if ( m == null ) {
// must have been removed because of earlier missed hb
logger.warn(methodName, null, "***Purging machine***", m.getId(), "due to missed heartbeats. THreshold:", nodeStability);
NodePool np = m.getNodepool();
* We first accept any changes and requests from the outside world and place them where they
* can be acted on in this epoch.
* We then pass all relevant requests and resources to the IScheduler. This returns a
* SchedulingUpdate which is passed to the dispatcher to be acted upon.
// One scheduling epoch.
// Returns null while the scheduler is not ready() or not isInitialized(); otherwise returns
// the JobManagerUpdate that dispatch() populated.
// Visible phases: take working lists for recovered / incoming / completed jobs and vacated
// shares (each under that collection's own monitor), process completions, check in new jobs
// (refusing those whose class is unknown or whose policy does not match), run each
// per-nodepool scheduler, persist demand, then dispatch the result.
public JobManagerUpdate schedule()
String methodName = "schedule";
// if ( startupCountdown++ < nodeStability ) {
// logger.info(methodName, null, "Startup countdown:", startupCountdown, "of", nodeStability);
// return null;
// }
if ( ! ready() ) {
return null;
if ( ! isInitialized() ) {
logger.info(methodName, null, "Waiting for (re)initialization.");
return null;
// tracking the OR hang problem - are topics being delivered?
logger.info("nodeArrives", null, "Total arrivals:", total_arrivals);
synchronized(this) {
// TODO: Can we combine these two into one?
SchedulingUpdate upd = new SchedulingUpdate(); // state from internal scheduler
JobManagerUpdate jmu = new JobManagerUpdate(); // state we forward to job manager
// int nchanges = 0;
ArrayList jobsToRecover = new ArrayList();
synchronized(recoveredJobs) {
// nchanges += jobsToRecover.size();
ArrayList newJobs = new ArrayList();
// If there are new jobs we need to init some things and start a scheduling cycle.
synchronized(incomingJobs) {
// nchanges += newJobs.size();
// If some jobs passed initialization we need to signal a scheduling cycle to get
// them their fair share
// synchronized(initializedJobs) {
// if ( initializedJobs.size() > 0 ) {
// nchanges++;
// }
// initializedJobs.clear();
// }
// If some jobs completed we need to process cleaning them out and signal a
// scheduling cycle to try to reuse their resources.
ArrayList doneJobs = new ArrayList();
synchronized(completedJobs) {
//nchanges += doneJobs.size();
// If some shares were vacated we need to clear them out and run a scheduling cycle.
ArrayList> doneShares= new ArrayList>();
synchronized(vacatedShares) {
//nchanges += doneShares.size();
// we use the vacatedShares object to control share growth as well
//if ( growthOccurred ) nchanges++;
//growthOccurred = false;
// boolean must_run = false;
// synchronized(force_epoch) {
// must_run = force_epoch;
// force_epoch = false;
// }
// if ( (nchanges == 0) && !must_run ) {
// jmu.setAllJobs(allJobs);
// return jmu;
// }
// TODO if we remove this code above be sure to clear out all the force_epoch nonsense
// TODO does this even use growthOccurred?
synchronized(this) {
// before looking at jobs, insure we're updated after a crash
for ( IRmJob j : jobsToRecover ) {
// process these next to free up resources for the scheduling cycle
for (Pair p : doneShares) {
processCompletion(p.first(), p.second());
for (IRmJob j : doneJobs) {
// update user records, "check in" new jobs
if ( newJobs.size() > 0 ) {
logger.info(methodName, null, "Jobs arrive:");
logger.info(methodName, null, "submit", RmJob.getHeader());
Iterator iter = newJobs.iterator();
while ( iter.hasNext() ) {
IRmJob j = iter.next();
if ( j.isRefused() ) { // the JobManagerConverter has already refused it
logger.info(methodName, j.getId(), "Bypassing previously refused job.");
upd.refuse(j, j.getRefusalReason());
// Create the User record on first sight of this user name.
String user = j.getUserName();
User u = users.get(user);
if ( u == null ) {
u = new User(user);
users.put(user, u);
// Calculate its share order
int share_order = calcShareOrder(j);
// Assign it to its priority class
String clid = j.getClassName();
ResourceClass prclass = resourceClassesByName.get(clid);
// The job is recorded in allJobs before the class check, so a job refused below is
// still present in allJobs.
allJobs.put(j.getId(), j);
// NOTE(review): prclass is dereferenced further down; confirm the null branch here skips
// the rest of the iteration.
if ( prclass == null ) {
upd.refuse(j, "Cannot find priority class " + clid + " for job");
// UIMA-4275 never refuse impossible work, just let it hang out
// if ( share_order > max_order ) {
// upd.refuse(j, "Memory requested " + j.getMemory() + "GB exceeds the capacity of any machine in the cluster.");
// continue;
// }
// NOTE(review): the next line reads like the text of a block comment; confirm whether the
// RESERVE check that follows it is meant to be active code.
* We want to allow this - a normal job, submitted to a reservation class.
if ( (prclass.getPolicy() == Policy.RESERVE ) && ( ! j.isReservation() ) ) {
upd.refuse(j, "Reservaction class " +
prclass.getId() + " specified but work is not a reservation.");
// A reservation may only be submitted to a RESERVE or FIXED_SHARE class.
if ( ((prclass.getPolicy() != Policy.RESERVE ) && (prclass.getPolicy() != Policy.FIXED_SHARE)) && ( j.isReservation() ) ) {
upd.refuse(j, "Class " + prclass.getName() + " is policy " +
prclass.getPolicy() + " but the work is submitted as a reservation.");
// Persistence failures are logged as warnings and do not stop the scheduling pass.
try {
} catch (Exception e) {
logger.warn(methodName, j.getId(), "Cannot persist new job in database:", e);
logger.info(methodName, j.getId(), "submit", j.toString());
logger.info(methodName, null, "Scheduling " + newJobs.size(), " new jobs. Existing jobs: " + allJobs.size());
for ( int i = 0; i < schedulers.length; i++ ) {
logger.info(methodName, null, "Run scheduler", i, "with top-level nodepool", nodepools[i].getId());
for ( IRmJob j : allJobs.values() ) { // UIMA-4577 persist 'demand'
try {
} catch (Exception e) {
logger.warn(methodName, j.getId(), "Cannot update demand in database:", e);
logger.info(methodName, null, "--------------- Scheduler returns ---------------");
logger.info(methodName, null, "\n", upd.toString());
logger.info(methodName, null, "------------------------------------------------");
dispatch(upd, jmu); // my own job lists get updated by this
return jmu;
synchronized public void shutdown()
done = true;
// public void run()
// {
// String methodName = "run";
// while ( ! done ) {
// try { sleep(epoch); } catch (InterruptedException e) { }
// logger.info(methodName, null, "========================== Epoch starts ===========================");
// try {
// schedule();
// } catch ( SchedulingException e ) {
// logger.info(methodName, e.jobid, e);
// }
// logger.info(methodName, null, "========================== Epoch ends ===========================");
// }
// }
* maps from both the fully-qualified name and the short name
void updateNodepoolsByNode(String longname, NodePool np)
String methodName = "updateNodepoolsByNode";
String shortname = longname;
int ndx = longname.indexOf(".");
logger.info(methodName, null, "Map", longname, "to", np.getId());
nodepoolsByNode.put(longname, np);
if ( ndx >= 0 ) {
shortname = longname.substring(0, ndx);
nodepoolsByNode.put(shortname, np);
shortToLongNode.put(shortname, longname);
logger.info(methodName, null, "Map", shortname, "to", np.getId());
// Return a nodepool by Node. If the node can't be associated with a nodepool, return the
// default nodepool, which is always the first one defined in the config file.
NodePool getNodepoolByName(NodeIdentity ni)
NodePool np = nodepoolsByNode.get( ni.getName() );
if ( np == null ) {
np = nodepoolsByNode.get( ni.getIp() );
if ( np == null ) {
np = nodepools[0];
updateNodepoolsByNode(ni.getName(), np); // assign this guy to the default np
// nodepoolsByNode.put( ni.getName(), np); // assign this guy to the default np
return np;
private int total_arrivals = 0;
/**
 * Handles a node checking in: finds its nodepool, derives a share order from
 * the node's free memory, and announces the node to the nodepool.
 *
 * NOTE(review): some statements of this method (e.g. the bodies of the
 * initialization guard and of the illNodes block) are not visible in this
 * view; the comments below describe only what is shown.
 *
 * @param node the node that is checking in
 */
public synchronized void nodeArrives(Node node)
String methodName = "nodeArrives";
// Guard: the scheduler must be initialized before nodes are processed; a message is logged otherwise.
if ( ! isInitialized() ) {
logger.info(methodName, null, "Waiting for (re)initialization; node = " + node.getNodeIdentity().getName());
synchronized(illNodes) { // stop flagging it as a problem
// String methodName = "nodeArrives";
// The first block ensures the node is in the scheduler's records as soon as possible
total_arrivals++; // report these in the main schedule loop
NodePool np = getNodepoolByName(node.getNodeIdentity()); // finds np assigned in ducc.nodes; if none, returns the default np
Machine m = np.getMachine(node);
int share_order = 0;
// let's always recalculate this in case it changes for whatever bizarre reason (reboot, or pinned process gone, or whatever)
// Allocatable memory is the node's reported free memory less the share_free_dram reserve.
long allocatable_mem = node.getNodeMetrics().getNodeMemory().getMemFree() - share_free_dram;
// A positive dramOverride replaces the computed value outright.
if ( dramOverride > 0 ) {
allocatable_mem = dramOverride;
share_order = (int) (allocatable_mem / np.getShareQuantum()); // conservative - rounds down (this will always cast ok)
// NOTE: we cannot set the order into the machine yet, in case it has changed, because NodePool needs to adjust based
// on current and new
// Track the largest share order seen so far.
max_order = Math.max(share_order, max_order);
m = np.nodeArrives(node, share_order); // announce to the nodepools
public void nodeHb(Node n, int count)
synchronized(illNodes) {
illNodes.put(n, count);
/**
 * Accepts a map of nodes, presumably those detected as dead (inferred from
 * the method and lock names -- confirm).
 *
 * NOTE(review): only the acquisition of the deadNodes lock is visible in this
 * view; the statements it guards are not shown.
 *
 * @param nodes the nodes being reported
 */
public void nodeDeath(Map nodes)
// All work is done while holding the deadNodes lock.
synchronized(deadNodes) {
* User passed us a node by name. Maybe did and maybe didn't qualify it.
* Maybe the node checked in qualified maybe it didn't. Here we try to find
* something that kind of matches.
* UIMA-4142. Technically a bug on vary-on and vary-off but found and fixed as part of
* the indicated Jira.
synchronized String resolve(String node)
NodePool np = nodepoolsByNode.get(node);
if ( np == null ) return null; // indexed by long and short so if not found we're stuck
if ( np.hasNode(node) ) return node; // he knows it by this name we're done
int ndx = node.indexOf(".");
if ( ndx > 0 ) {
// np MUST know it by either long or short or it wouldn't be in nodepoolsByNode
// so it must be short
return node.substring(0, ndx);
} else {
// and vice-versa, it must be the long
return shortToLongNode.get(node);
/**
 * Varies on each of the named nodes via its nodepool.
 *
 * NOTE(review): in this view the text collected in sb is never attached to the
 * reply; the statements that do so are presumably not shown -- confirm.
 *
 * @param nodes node names as supplied by the administrator
 * @return the reply object created at the top of the method
 */
public synchronized RmAdminReply varyon(String[] nodes)
String methodName = "varyon";
RmAdminVaryReply ret = new RmAdminVaryReply();
StringBuffer sb = new StringBuffer();
for (String n : nodes ) {
// Translate the supplied name into the form the nodepool knows it by.
String rn = resolve(n);
if ( rn == null ) {
sb.append("VaryOn: " + n + " cannot be found in the RM.\n");
} else {
NodePool np = nodepoolsByNode.get(rn); // if null, resolve will fail
if ( np == null ) {
sb.append("VaryOn: " + n + " cannot find associated nodepool.\n");
} else {
// The nodepool performs the vary-on and hands back a message, which is logged.
String repl = np.varyon(rn);
logger.info(methodName, null, repl);
return ret;
/**
 * Varies off each of the named nodes via its nodepool.
 *
 * NOTE(review): in this view the branch for a missing nodepool is empty and
 * the text collected in sb is never attached to the reply; the corresponding
 * statements are presumably not shown -- confirm.
 *
 * @param nodes node names as supplied by the administrator
 * @return the reply object created at the top of the method
 */
public synchronized RmAdminReply varyoff(String[] nodes)
String methodName = "varyoff";
RmAdminVaryReply ret = new RmAdminVaryReply();
StringBuffer sb = new StringBuffer();
for (String n : nodes ) {
// Translate the supplied name into the form the nodepool knows it by.
String rn = resolve(n);
if ( rn == null ) {
sb.append("VaryOff: " + n + " cannot be found in the RM.\n");
} else {
NodePool np = nodepoolsByNode.get(rn); // if null, resolve will fail
if ( np == null ) {
} else {
// The nodepool performs the vary-off and hands back a message, which is logged.
String repl = np.varyoff(rn);
logger.info(methodName, null, repl);
return ret;
/**
 * Builds a query object describing one nodepool.
 *
 * NOTE(review): the statements that populate the returned object are not
 * visible in this view; only the free-machine tally is shown.
 *
 * @param np the nodepool to describe
 * @return the query object created at the top of the method
 */
RmQueriedNodepool getNpStats(NodePool np)
RmQueriedNodepool ret = new RmQueriedNodepool();
// Both arrays come from the nodepool's makeArray(); the loop below indexes them from 1.
int[] onlineMachines = np.makeArray();
int[] freeMachines = np.makeArray();
// Slot 0 is skipped; each remaining slot i accumulates the nodepool's free-machine count for i.
for ( int i = 1; i < freeMachines.length; i++ ) {
freeMachines[i] += np.countFreeMachines(i); // (these are local, as we want)
// logger.info(methodName, null, np.getId() + ": online", online, "dead", dead, "offline", offline, "shares_available", shares_available, "shares_free", shares_free);
// logger.info(methodName, null, np.getId() + ": allMachines ", Arrays.toString(allMachines));
// logger.info(methodName, null, np.getId() + ": onlineByOrder ", Arrays.toString(onlineMachines));
// logger.info(methodName, null, np.getId() + "------- freeMachines should match free -------");
// logger.info(methodName, null, np.getId() + ": freeMachines ", Arrays.toString(freeMachines));
// logger.info(methodName, null, np.getId() + ": free ", Arrays.toString(free));
// logger.info(methodName, null, np.getId() + "----------------------------------------------");
// logger.info(methodName, null, np.getId() + ": virtualMachines", Arrays.toString(virtualMachines));
return ret;
/**
 * Walks every resource class and creates a query object for each one.
 *
 * NOTE(review): the cases of the policy switch are not visible in this view,
 * so how each query object is filled in and added to the reply cannot be
 * described from here.
 *
 * @param reply the load reply being assembled
 */
void calculateLoad(RmAdminQLoadReply reply)
for ( ResourceClass cl : resourceClasses.values() ) {
RmQueriedClass qcl = new RmQueriedClass();
// Handling depends on the class's scheduling policy.
switch ( cl.getPolicy() ) {
// int[] demanded = NodePool.makeArray();
// int[] awarded = NodePool.makeArray();
// HashMap jobs = cl.getAllJobs();
// for ( IRmJob j : jobs.values() ) {
// int o = j.getShareOrder();
// demanded[o] += j.queryDemand();
// awarded[o] += j.countNShares();
// }
// qcl.setName(cl.getName());
// qcl.setDemanded(demanded);
// qcl.setAwarded(awarded);
// reply.addClass(qcl);
/**
 * Recursively visits the children of a nodepool.
 *
 * NOTE(review): no statement adding to {@code list} is visible in this view;
 * presumably each nodepool is appended as it is visited -- confirm.
 *
 * @param parent nodepool whose children are visited
 * @param list   list handed down unchanged to each recursive call
 */
void listAllNodepools(NodePool parent, ArrayList list)
for (NodePool np : parent.getChildren().values() ) {
// Descend into this child's own children.
listAllNodepools(np, list);
/**
 * Assembles a load reply for the administrative query interface.
 *
 * NOTE(review): the body of the final per-nodepool loop is not visible in
 * this view.
 *
 * @return the reply object; returned as created when {@code ready()} is false
 */
public synchronized RmAdminQLoadReply queryLoad()
RmAdminQLoadReply ret = new RmAdminQLoadReply();
// Nothing to report until ready() says so.
if ( ! ready() ) {
return ret;
// Gather nodepools starting from each top-level nodepool, then walk the gathered list.
ArrayList allpools = new ArrayList();
for ( NodePool np : nodepools ) {
listAllNodepools(np, allpools);
for ( NodePool np : allpools ) {
return ret;
/**
 * Assembles an occupancy reply by visiting the machines of each top-level
 * nodepool: offline machines, unresponsive machines, then all machines.
 *
 * NOTE(review): the statements that adjust the per-machine query objects and
 * add them to the reply are not visible in this view.
 *
 * @return the reply object; returned as created when {@code ready()} is false
 */
public synchronized RmAdminQOccupancyReply queryOccupancy()
RmAdminQOccupancyReply ret = new RmAdminQOccupancyReply();
// Nothing to report until ready() says so.
if ( ! ready() ) {
return ret;
// iterate top-level nodepools to get all their subpools
// iterate the subpools to get all their machines
// iterate the machines and request a query object
// add query object to ret
// return ret
// We want to be dependent on common project, not the other way around, so
// we keep the query objects in common and put knowledge of how to construct
// them into rm's Machine class.
// The alternative, passing RM's Machine to the query object creates a circular
// dependency with RM depending on common and common depending on RM.
// Not a cheap query, by the way.
// NOTE: No longer used by the rm_qoccupancy script which now goes directly to the database
for ( NodePool np : nodepools ) {
// NOTE: The offline & dead nodes are also in the AllMachines list so must be removed
Map allMachs = np.getAllMachines();
Map offline = np.getOfflineMachines(); // UIMA-4234
Map unresponsive = np.getUnresponsiveMachines(); // UIMA-4234
// Pass 1: offline machines; a machine that is also unresponsive gets extra handling.
for ( Node n : offline.keySet() ) {
Machine m = offline.get(n);
RmQueriedMachine qm = m.queryMachine();
if ( unresponsive.containsKey(n) ) {
// Pass 2: unresponsive machines.
for ( Node n : unresponsive.keySet() ) {
Machine m = unresponsive.get(n);
RmQueriedMachine qm = m.queryMachine();
// Pass 3: everything in the all-machines map.
for ( Node n : allMachs.keySet() ) {
Machine m = allMachs.get(n);
return ret;
/**
 * Receives a state notification for a job, looked up by id in allJobs.
 *
 * NOTE(review): what is done with {@code state} once the job is found is not
 * visible in this view.
 *
 * @param jobid id of the job the notification is about
 * @param state the state being reported
 */
public synchronized void signalState(DuccId jobid, String state)
IRmJob j = allJobs.get(jobid);
if ( j != null ) { // might not be here yet, we'll get it later
* Callback from job manager, need shares for a new fair-share job.
/**
 * Entry point for newly arrived work.
 *
 * NOTE(review): only the acquisition of the incomingJobs lock is visible in
 * this view; presumably the job is queued there -- confirm.
 *
 * @param job the newly arrived job
 */
public void signalNewWork(IRmJob job)
// We'll synchronize only on the incoming job list
synchronized(incomingJobs) {
// public void signalForceEpoch()
// {
// synchronized( force_epoch ) {
// force_epoch = true;
// }
// }
/**
 * Entry point for a job-initialized notification.
 *
 * NOTE(review): only the acquisition of the initializedJobs lock is visible
 * in this view; presumably the job is queued there -- confirm.
 *
 * @param job the job the notification is about
 */
public void signalInitialized(IRmJob job)
// We'll synchronize only on the initialized job list
synchronized(initializedJobs) {
/**
 * Entry point for a job-recovery notification.
 *
 * NOTE(review): only the acquisition of the recoveredJobs lock is visible in
 * this view; presumably the job is queued there -- confirm.
 *
 * @param job the job being recovered
 */
public void signalRecovery(IRmJob job)
// Work is done while holding the recoveredJobs lock.
synchronized(recoveredJobs) {
/**
 * Placeholder for job cancellation; no implementation is visible here.
 *
 * @param id id of the cancelled job
 */
public void jobCancelled(DuccId id)
// TODO Fill this in.
* Callback from job manager when a job completes. We just believe him, no sanity checks or other such stuff.
/**
 * Receives a job-completion signal by job id.  Any Throwable raised while
 * handling it is logged as a warning rather than propagated.
 *
 * NOTE(review): the statement that records the completed job is not visible
 * in this view.
 *
 * @param id id of the job that completed
 */
public void signalCompletion(DuccId id)
String methodName = "signalCompletion";
// Work is done while holding the completedJobs lock.
synchronized(completedJobs) {
try {
IRmJob job = allJobs.get(id);
// An id that is not in allJobs has nothing to complete.
if ( job == null ) {
logger.warn(methodName, id, "Job completion signal: early termination; nothing to complete.");
return; // canceled or terminated very soon.
logger.info(methodName, id, "Job completion signal.");
} catch (Throwable t) {
logger.warn(methodName, id, t);
* Callback from job manager when a specific share exits but the job is still alive.
/**
 * Records that a share has been vacated by a job.  The (job, share) pair is
 * stored in vacatedShares, keyed by the share's id, while holding the
 * vacatedShares lock.
 *
 * @param job   the job that gave up the share
 * @param share the share that was vacated
 */
public void signalCompletion(IRmJob job, Share share)
String methodName = "signalCompletion";
synchronized(vacatedShares) {
logger.info(methodName, job.getId(), "Job vacate signal share: ", share.toString());
// Keyed by share id, so a repeated signal for the same share replaces the earlier entry.
vacatedShares.put(share.getId(), new Pair(job, share));
* Callback from job manager when a specific share gets a process associated.
// public void signalGrowth(DuccId jobid, Share share)
// {
// String methodName = "signalGrowth";
// synchronized(vacatedShares) {
// logger.info(methodName, jobid, "Job growth signal share: ", share.toString());
// growthOccurred = true;
// }
// }
* Called in scheduling cycle, to actually complete the job - avoids deadlock
/**
 * Completes a job: deletes it from persistence, removes it from allJobs,
 * from its user record and from its resource class, and purges each of its
 * assigned shares.
 *
 * NOTE(review): as shown in this view there is no exit after the "not in run
 * list" check, yet {@code j} is dereferenced right below it; a return is
 * presumably present in the full source -- confirm.
 *
 * @param job the job that has completed
 * @throws SchedInternalError if the job has no resource class and was not refused
 */
private synchronized void processCompletion(IRmJob job)
String methodName = "processCompletion";
logger.info(methodName, job.getId(), "Job completes.");
// A persistence failure is logged and does not stop the in-memory cleanup.
try {
persistence.deleteJob(job); // UIMA-4577
} catch (Exception e) {
logger.warn(methodName, job.getId(), "Cannot delete job from database:", e);
// -- clean up the running jobs list
IRmJob j = allJobs.remove(job.getId());
if ( j == null ) {
logger.info(methodName, job.getId(), "Job is not in run list!"); // can happen if job is refused very early
// -- clean up user list
// NOTE(review): users.get() is used without a null check -- confirm the user always exists here.
User user = users.get(j.getUserName());
user.remove(job); // UIMA4275 don't clean up users list because it may have registry things in it
// -- clean up the resource class; a missing class is an internal error unless the job was refused
ResourceClass rc = job.getResourceClass();
if ( rc != null ) {
rc.removeJob(j); // also clears it if it's a reservation
} else if ( !j.isRefused() ) {
throw new SchedInternalError(j.getId(), "Job exits from class " + job.getClassName() + " but we cannot find the priority class definition.");
// -- clean up machine lists
HashMap shares= job.getAssignedShares();
for (Share s: shares.values()) {
purgeShare(s, job);
* Called from scheduling cycle - a specific share has run out of work for the given job (but the
* job is not done yet).
private synchronized void processCompletion(IRmJob job, Share share)
String methodName = "processCompletion";
logger.debug(methodName, job.getId(), "Job vacates share ", share.toString());
purgeShare(share, job);
* Log following / reconstruction, needed to init before recovery.
/**
 * Visits every top-level nodepool.
 *
 * NOTE(review): the loop body is not visible in this view; presumably each
 * nodepool is reset -- confirm.
 */
public void resetNodepools()
for ( NodePool np : nodepools ) {
* Determine if the given share is in a nodepool that this job is allowed to be scheduled over.
* You can get a mismatch if the classes or nodepools are reconfigured and RM is restarted
* with jobs still in the system.
* UIMA-4142
* @param s The share to validate.
* @param j The job to validate against.
* @return true if s and j are compatible, false otherwise.
boolean compatibleNodepool(Share s, IRmJob j)
// cut to the chase and ask the NP directly if this dude is allowed
NodePool np = s.getNodepool();
ResourceClass rc = j.getResourceClass();
Policy p = rc.getPolicy();
return np.compatibleNodepool(p, rc);
* Make this public for log following.
/**
 * Re-establishes a recovered job: reconnects its recovered shares to their
 * machines and nodepools, makes sure a User record exists, promotes the
 * shares, and registers the job in allJobs.  Shares found in a nodepool that
 * is incompatible with the job are collected and handled at the end.
 *
 * NOTE(review): the case labels of the policy switch, the body of the
 * persistence try block, and the body of the final shrink loop are not
 * visible in this view.
 *
 * @param j the job being recovered
 */
public synchronized void processRecovery(IRmJob j)
String methodName = "processRecovery";
// NOTE(review): rc is used below without a null check -- confirm the class name always resolves.
ResourceClass rc = resourceClassesByName.get(j.getClassName());
// NOTE(review): share_order is only referenced by commented-out code in the visible part of this method.
int share_order = calcShareOrder(j);
HashMap shares = j.getRecoveredShares();
List sharesToShrink = new ArrayList(); // UIMA-4142
StringBuffer sharenames = new StringBuffer();
for ( Share s : shares.values() ) {
sharenames.append(" ");
// Per-policy handling of a share that may now sit in an incompatible nodepool.
switch ( rc.getPolicy() ) {
if ( !compatibleNodepool(s, j) ) { // UIMA-4142
logger.info(methodName, j.getId(), "Set fixed bit for FIXED job");
if ( !compatibleNodepool(s, j) ) { // UIMA-4142
if ( j.isService() ) {
sharesToShrink.add(s); // nodepool reconfig snafu, SM will reallocate the process
} else {
logger.warn(methodName, j.getId(), "Share is in incompatible nodepool but cannot be evicted:", s);
logger.info(methodName, j.getId(), "Set fixed bit for RESERVE job");
if ( j.isService() && !compatibleNodepool(s, j) ) { // UIMA-4142
sharesToShrink.add(s); // nodepool reconfig snafu, SM will reallocate the process
// if ( rc.getPolicy() != Policy.RESERVE ) { // if it's RESERVE, the share order is already set from
// // the machine when the job arrives.
// s.setShareOrder(share_order);
// }
// Hook the share back up to its machine's nodepool and mark it busy.
Machine m = s.getMachine();
NodePool np = m.getNodepool();
np.connectShare(s, m, j, s.getShareOrder());
busyShares.put(s.getId(), s);
// Create the User record on first sight of this user name.
String username = j.getUserName();
User user = users.get(username);
if ( user == null ) {
user = new User(username);
users.put(username, user);
logger.info(methodName, j.getId(), "&&&&&&&&&&&&&&&& new user", user.toString(), "-------------------");
j.promoteShares(); // NOT expanded, just recovered, promote them right away
String clid = j.getClassName();
ResourceClass prclass = resourceClassesByName.get(clid);
allJobs.put(j.getId(), j);
logger.info(methodName, j.getId(), "Recovered job:", j.toString());
logger.info(methodName, j.getId(), "Recovered shares:", sharenames.toString());
try {
} catch (Exception e) {
// NOTE(review): the job j is logged here rather than the caught exception e, so the failure cause is lost -- confirm intent.
logger.warn(methodName, j.getId(), "Cannot persist recovered job in database:", j);
// After a reconfig/restart the share may be in the wrong place, in which case it
// needs to be removed. We have to wait until it is fully hooked into the structures
// before scheduling for removal because it could take a while to go away and
// we have to be careful not to overcommit.
// UIMA-4142
for ( Share s : sharesToShrink ) {
logger.info(methodName, j.getId(), "Recovery - Removing share from wrong nodepool after reconfiguration:", s);
* The share is gone, purge from our structures.
/**
 * Drops a share from busyShares and looks up the machine it lives on.
 *
 * NOTE(review): what is done with the machine afterwards is not visible in
 * this view, and {@code j} is not referenced in the visible statements.
 *
 * @param s the share that is gone
 * @param j the job the share belonged to
 */
private void purgeShare(Share s, IRmJob j)
busyShares.remove(s.getId()); // so long, and thanks for all the fish
Machine m = s.getMachine();
public synchronized static DuccId newId()
return idFactory.next();
public synchronized static DuccId newId(long id)
return idFactory.next(id);
/**
 * Visits every top-level nodepool.
 *
 * NOTE(review): the loop body is not visible in this view; presumably each
 * nodepool is asked to report on its machines -- confirm.
 */
public void queryMachines()
for ( NodePool np : nodepools ) {
class MachineByOrderSorter
implements Comparator
public int compare(Machine m1, Machine m2)
if ( m1.equals(m2) ) return 0;
if (m1.getShareOrder() == m2.getShareOrder()) {
return (m1.getId().compareTo(m2.getId()));
return (int) (m1.getShareOrder() - m2.getShareOrder());
© 2015 - 2025 Weber Informatics LLC | Privacy Policy