/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
*/
package org.apache.uima.ducc.rm.scheduler;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.uima.ducc.common.Node;
import org.apache.uima.ducc.common.NodeConfiguration;
import org.apache.uima.ducc.common.NodeIdentity;
import org.apache.uima.ducc.common.Pair;
import org.apache.uima.ducc.common.admin.event.RmAdminQLoadReply;
import org.apache.uima.ducc.common.admin.event.RmAdminQOccupancyReply;
import org.apache.uima.ducc.common.admin.event.RmAdminReply;
import org.apache.uima.ducc.common.admin.event.RmAdminVaryReply;
import org.apache.uima.ducc.common.admin.event.RmQueriedClass;
import org.apache.uima.ducc.common.admin.event.RmQueriedMachine;
import org.apache.uima.ducc.common.admin.event.RmQueriedNodepool;
import org.apache.uima.ducc.common.component.AbstractDuccComponent;
import org.apache.uima.ducc.common.persistence.rm.IRmPersistence;
import org.apache.uima.ducc.common.persistence.rm.RmPersistenceFactory;
import org.apache.uima.ducc.common.utils.DuccLogger;
import org.apache.uima.ducc.common.utils.DuccProperties;
import org.apache.uima.ducc.common.utils.DuccPropertiesResolver;
import org.apache.uima.ducc.common.utils.SystemPropertyResolver;
import org.apache.uima.ducc.common.utils.Version;
import org.apache.uima.ducc.common.utils.id.DuccId;
import org.apache.uima.ducc.common.utils.id.DuccIdFactory;


/**
 * This process orchestrates scheduling.
 * - Receives requests from clients ( job manager, service manager, etc ) for resources
 * - Forwards requests and current state to pluggable scheduling implementation
 * - Receives a schedule, updates state, sends responses to requestors
 * - Maintains state as needed (work item life cycle etc)
 */
public class Scheduler
//    extends Thread
    implements ISchedulerMain,
    	SchedConstants
{
    IJobManager jobManager;
    static DuccLogger     logger = DuccLogger.getLogger(Scheduler.class, COMPONENT_NAME);

    boolean done = false;
    // Boolean force_epoch = false;
    String ducc_home;
    // Integer epoch = 5;                                                 // scheduling epoch, seconds

    NodeConfiguration configuration = null;                               // UIMA-4142 make it global
    
    String defaultDomain = null;                                          // UIMA-4142
    boolean needRecovery = false;                                         // UIMA-4142 tell outer layer that recovery is required
    AbstractDuccComponent baseComponent;                                  // UIMA-4142, pass in the base for reconfig - reread ducc.properties
    NodePool[] nodepools;                                                 // top-level nodepools
    int max_order = 0;

    //
    // Fair-share and fixed-share use shares only, not machines
    //
    Map<DuccId, Share> busyShares        = new HashMap<DuccId, Share>(); // Running "fair" share jobs

    // incoming reports of machines that are now free
    Map<Share, Pair<IRmJob, Share>> vacatedShares = new HashMap<Share, Pair<IRmJob, Share>>();
    // boolean growthOccurred = false;                                           // don't care which grew, just that something grew

    List<IRmJob>        incomingJobs    = new ArrayList<IRmJob>();      // coming in from external world but not added to our queues yet
    List<IRmJob>        recoveredJobs   = new ArrayList<IRmJob>();      // coming in from external world but we don't know about them (hopefully
                                                                        //    because we crashed and not for more nefarious reasons)
    List<IRmJob>        completedJobs   = new ArrayList<IRmJob>();      // signaled complete from outside but not yet dealt with
    List<IRmJob>        initializedJobs = new ArrayList<IRmJob>();      // Init is complete so we can begin full (un)fair share allocation

    //HashMap incomingNodes  = new HashMap();         // node updates
    Map<Node, Node>    deadNodes       = new HashMap<Node, Node>();     // missed too many heartbeats
    Map<Node, Integer> illNodes        = new HashMap<Node, Integer>();  // starting to miss, keep track of how many for the db
    // HashMap allNodes       = new HashMap();           // the guys we know
    Map<String, NodePool> nodepoolsByNode = new HashMap<String, NodePool>(); // all nodes, and their associated pool
    Map<String, String>   shortToLongNode = new HashMap<String, String>();   // short hostname -> fully-qualified hostname

    Map<String, User>    users     = new HashMap<String, User>();       // Active users - has a job in the system
    //HashMap    runningJobs = new HashMap();

    Map<DuccId, IRmJob>  allJobs = new HashMap<DuccId, IRmJob>();

    Map<ResourceClass, ResourceClass> resourceClasses = new HashMap<ResourceClass, ResourceClass>();
    Map<String, ResourceClass>        resourceClassesByName = new HashMap<String, ResourceClass>();

    String defaultFairShareName = null;
    String defaultReserveName = null;

    int defaultNThreads = 1;
    int defaultNTasks = 10;
    int defaultMemory = 15;

    // these two are initialized in constructor
    String schedImplName;
    IScheduler[] schedulers;

    long share_free_dram  = 0;               // 0 GB in KB - minimum memory to keep free after shares are allocated
    long dramOverride     = 0;               // if > 0, use this instead of amount reported by agents (modeling and testing)

    int pending_evictions = 0;                    // for queries
    int pending_expansions = 0;                  // for queries

    EvictionPolicy evictionPolicy = EvictionPolicy.SHRINK_BY_MACHINE;

//     int nodeMetricsUpdateRate = 30000;
//     int startupCountdown = 0;       // update each epoch.  only schedule when it's > nodeStability
    int nodeStability = 3;
    boolean stability = false;

    private static DuccIdFactory idFactory;
    IRmPersistence persistence = null;

    // static boolean expandByDoubling = true;
    // static int initializationCap = 2;      // Max allocation until we know initialization works in
                                           // units of *processes*, not shares (i.e. N-shares).

    //
    // Version
    //    0 - major version
    //    6 - minor version
    //    3 - ptf - forced eviction under fragmentation.
    //    4 - defrag code complete
    //  beta - not yet "real"!
    //
    // Bring up to speed with rest of ducc version. 2013-03-06 jrc
    //
    // 1.0.1 - RM can purge non-preemptables except for Unmanaged Reservations.  UIMA-3614
    // 1.0.2 - vary-on, vary-off
    // 1.0.3 - fix bad check in recursion in NodepoolScheduler.doEvictions
    // 1.1.0 - Synchronize with release
    final static int rmversion_major = 2;
    final static int rmversion_minor = 0;
    final static int rmversion_ptf   = 0;  
    final static String rmversion_string = null;

    boolean initialized = false;           // we refuse nodeupdates until this is true
    public Scheduler(AbstractDuccComponent baseComponent)
    {
        this.baseComponent = baseComponent;                // UIMA-4142, pass in the base for reconfig
    }

    public synchronized void init()
        throws Exception
    {
        String methodName = "init";
        //setName("Scheduler");

        DuccLogger.setUnthreaded();

        String ep         = SystemPropertyResolver.getStringProperty("ducc.rm.eviction.policy", "SHRINK_BY_MACHINE");
        evictionPolicy    = EvictionPolicy.valueOf(ep);        

        // nodepool          = new NodePool(null, evictionPolicy, 0);   // global nodepool
        share_free_dram   = SystemPropertyResolver.getLongProperty("ducc.rm.reserved.dram", share_free_dram) * 1024 * 1024;   // GB -> KB
        ducc_home         = SystemPropertyResolver.getStringProperty("DUCC_HOME");

        // some defaults, for jobs that don't specify them
        defaultNTasks     = SystemPropertyResolver.getIntProperty("ducc.rm.default.tasks", 10); 
        defaultNThreads   = SystemPropertyResolver.getIntProperty("ducc.rm.default.threads", 1);
        defaultMemory     = SystemPropertyResolver.getIntProperty("ducc.rm.default.memory", 15);      // in GB
        // expandByDoubling  = RmUtil.getBooleanProperty("ducc.rm.expand.by.doubling", true);

        nodeStability     = SystemPropertyResolver.getIntProperty("ducc.rm.node.stability", 3);        // number of node metrics updates to wait for before scheduling
                                                                                  // 0 means, just jump right in and don't wait

        dramOverride = SystemPropertyResolver.getLongProperty("ducc.rm.override.dram", 0);
        if ( dramOverride > 0 ) {
            dramOverride = dramOverride * (1024 * 1024);         // convert to KB
        }

        if ( idFactory == null ) {                               // UIMA-4142 only remake it on first boot
            idFactory = new DuccIdFactory(1);
        }

//        try {
//            schedImplName = SystemPropertyResolver.getStringProperty("ducc.rm.scheduler", "org.apache.uima.ducc.rm.ClassBasedScheduler");
//            @SuppressWarnings("unchecked")
//			Class cl = (Class) Class.forName(schedImplName);
//            scheduler = (IScheduler) cl.newInstance();
//        } catch (ClassNotFoundException e) {
//            throw new SchedulingException(null, "Cannot find class " + schedImplName);
//        } catch (InstantiationException e) {
//            throw new SchedulingException(null, "Cannot instantiate class " + schedImplName);            
//        } catch (IllegalAccessException e) {
//            throw new SchedulingException(null, "Cannot instantiate class " + schedImplName + ": can't access constructor.");            
//        }

        String class_definitions = SystemPropertyResolver
            .getStringProperty(DuccPropertiesResolver
                               .ducc_rm_class_definitions, "scheduler.classes");
        class_definitions = System.getProperty("DUCC_HOME") + "/resources/" + class_definitions;

        try {
            initClasses();
        } catch ( Exception e ) {
            logger.error(methodName, null, e);
            throw e;
        }

        // we share most of the state with the actual scheduling code - no need to keep passing this around
        // TODO: Make sure these are all serialized correctly
//         scheduler.setEvictionPolicy(evictionPolicy);
//         scheduler.setClasses(resourceClasses);
//         scheduler.setNodePool(nodepools[0]);

        logger.info(methodName, null, "                       reserved DRAM           : ", (share_free_dram / (1024*1024)), " GB");
        logger.info(methodName, null, "                       DRAM override           : ", (dramOverride / (1024*1024)), " GB");
        logger.info(methodName, null, "                       scheduler               : ", schedImplName);
        logger.info(methodName, null, "                       default threads         : ", defaultNThreads);
        logger.info(methodName, null, "                       default tasks           : ", defaultNTasks);
        logger.info(methodName, null, "                       default memory          : ", defaultMemory);
        logger.info(methodName, null, "                       default fairshare class : ", defaultFairShareName);
        logger.info(methodName, null, "                       default reserve         : ", defaultReserveName);
        logger.info(methodName, null, "                       class definition file   : ", class_definitions);
        logger.info(methodName, null, "                       default domain          : ", defaultDomain);      // UIMA-4142
        logger.info(methodName, null, "                       eviction policy         : ", evictionPolicy);
        logger.info(methodName, null, "                       database enabled        : ", !System.getProperty("ducc.database.host").equals("--disabled--"));
        logger.info(methodName, null, "                       database implementation : ", System.getProperty("ducc.rm.persistence.impl"));
        logger.info(methodName, null, "                       use prediction          : ", SystemPropertyResolver.getBooleanProperty("ducc.rm.prediction", true));
        logger.info(methodName, null, "                       prediction fudge factor : ", SystemPropertyResolver.getIntProperty("ducc.rm.prediction.fudge", 10000));
        logger.info(methodName, null, "                       node stability          : ", nodeStability);
        logger.info(methodName, null, "                       init stability          : ", SystemPropertyResolver.getIntProperty("ducc.rm.init.stability"));
        logger.info(methodName, null, "                       fast recovery           : ", SystemPropertyResolver.getBooleanProperty("ducc.rm.fast.recovery", true));
        logger.info(methodName, null, "                       metrics update rate     : ", SystemPropertyResolver.getIntProperty("ducc.agent.node.metrics.publish.rate", 
                                                                                                                                 DEFAULT_NODE_METRICS_RATE));
        logger.info(methodName, null, "                       initialization cap      : ", SystemPropertyResolver.getIntProperty("ducc.rm.initialization.cap"));
        logger.info(methodName, null, "                       expand by doubling      : ", SystemPropertyResolver.getBooleanProperty("ducc.rm.expand.by.doubling", true));
        logger.info(methodName, null, "                       fragmentation threshold : ", SystemPropertyResolver.getIntProperty("ducc.rm.fragmentation.threshold", 2));
        logger.info(methodName, null, "                       do defragmentation      : ", SystemPropertyResolver.getBooleanProperty("ducc.rm.defragmentation", true));
        logger.info(methodName, null, "                       DUCC home               : ", System.getProperty("DUCC_HOME"));
        logger.info(methodName, null, "                       ActiveMQ URL            : ", SystemPropertyResolver.getStringProperty("ducc.broker.url"));
        logger.info(methodName, null, "                       JVM                     : ", System.getProperty("java.vendor") +
                                                                                      " "+ System.getProperty("java.version"));
        logger.info(methodName, null, "                       JAVA_HOME               : ", System.getProperty("java.home"));
        logger.info(methodName, null, "                       JVM Path                : ", System.getProperty("ducc.jvm"));
        logger.info(methodName, null, "                       JMX URL                 : ", System.getProperty("ducc.jmx.url"));
        logger.info(methodName, null, "                       OS Architecture         : ", System.getProperty("os.arch"));
        logger.info(methodName, null, "                       OS Name                 : ", System.getProperty("os.name"));
        logger.info(methodName, null, "                       DUCC Version            : ", Version.version());
        logger.info(methodName, null, "                       RM Version              : ", ""+ rmversion_major   + "." 
                                                                                             + rmversion_minor   + "." 
                                                                                             + rmversion_ptf);
        persistence = RmPersistenceFactory.getInstance(this.getClass().getName(), "RM");
        persistence.clear();
        initialized = true;
    }

    public RmAdminReply reconfigure()          // UIMA-4142
    {
        String methodName = "reconfigure";

        RmAdminReply ret = new RmAdminReply();
        logger.info(methodName, null, "Reconfiguration starts.");

        setInitialized(false);           // stop receipt of OR and Agent publications

        // First we run the logic that reads the configuration; if it fails, we abort the reconfig without crashing.
        // We throw the result away because we call init() in a minute, which does the actual configuration, as if booting.
		try {
			readConfiguration();                                    // UIMA-4142
		} catch (Throwable e) {
            setInitialized(true);
            logger.warn(methodName, null, "Reconfiguration aborted:", e.toString());
            ret.setRc(false);
            ret.setMessage("Reconfiguration failed: " + e.toString());
            return ret;
		}

        HashMap<Node, Machine> offlineMachines = new HashMap<Node, Machine>();
        for (NodePool np : nodepools) {
            offlineMachines.putAll(np.getOfflineMachines());            
        }

        // (be careful, don't use the value, that must be discarded as it points to the OLD np)
        List<String> offlineHostnames = new ArrayList<String>();
        for ( Machine m : offlineMachines.values()) {
        	logger.info(methodName, null, "Saving offline status of", m.getId());
            offlineHostnames.add(m.getId());
        }
        offlineMachines = null;

        this.configuration = null;
        this.defaultDomain = null;
        this.nodepools = null;
        this.max_order = 0;
        this.busyShares.clear();
        this.vacatedShares.clear();
        this.incomingJobs.clear();
        this.recoveredJobs.clear();
        this.initializedJobs.clear();
        this.deadNodes.clear();
        this.nodepoolsByNode.clear();
        this.shortToLongNode.clear();
        this.users.clear();
        this.allJobs.clear();
        this.resourceClasses.clear();
        this.resourceClassesByName.clear();

        try {
            baseComponent.reloadProperties("ducc.deploy.configuration");
            init();

            if ( offlineHostnames.size() > 0 ) {
                String[] offline = offlineHostnames.toArray(new String[offlineHostnames.size()]);
                varyoff(offline);
            }
        } catch ( Throwable t ) {
            // TODO do something?  What?  If this fails it's pretty awful.
            logger.error(methodName, null, "Reconfiguration failed during re-init:", t);
        }
        
        setRecovery(true);               // signal to outer layer that full recovery is needed
        setInitialized(true);            // resume receipt of publications
        logger.info(methodName, null, "Reconfiguration complete.");

        ret.setMessage("Reconfiguration complete.");
        return ret;
    }

    public synchronized void setRecovery(boolean v)
    {
        this.needRecovery = v;
    }

    public synchronized boolean mustRecover()
    {
        return this.needRecovery;
    }

    public synchronized boolean isInitialized()
    {
        return initialized;
    }

    public synchronized void setInitialized(boolean v)
    {
        this.initialized = v;
    }

    public Machine getMachine(Node n)
    {
        return getMachine(n.getNodeIdentity());
    }

    public Machine getMachine(NodeIdentity ni)
    {
        NodePool nodepool = getNodepoolByName(ni);
    	return nodepool.getMachine(ni);        
    }

    public void setJobManager(IJobManager jobmanager)
    {
        this.jobManager = jobmanager;
    }

    public String getDefaultFairShareName()
    {
    	return defaultFairShareName;
    }

    public String getDefaultReserveName()
    {
    	return defaultReserveName;
    }

    public int getDefaultNThreads()
    {
    	return defaultNThreads;
    }

    public int getDefaultNTasks()
    {
    	return defaultNTasks;
    }

    public int getDefaultMemory()
    {
    	return defaultMemory;
    }

    public ResourceClass getResourceClass(String name)
    {
        return resourceClassesByName.get(name);
    }

    public IRmJob getJob(DuccId id)
    {
        return allJobs.get(id);
    }

    public Share getShare(DuccId id)
    {
        return busyShares.get(id);
    }

//    public static int getInitializationCap()
//    {
//        return initializationCap;
//    }
//
//    public static boolean isExpandByDoubling()
//    {
//        return expandByDoubling;
//    }

    /**
     * Calculate share order, given some memory size in GB (as in from a job spec)
     */
    public int calcShareOrder(IRmJob j)
    {
        // Calculate its share order
        long mem = j.getMemory() << 20 ;                             // to KB from GB
        int share_quantum = j.getShareQuantum();
        
        int share_order = (int) (mem / share_quantum);               // liberal calc, round UP
        if ( (mem % share_quantum) > 0 ) {
            share_order++;
        }
        return share_order;
    }
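
    // Worked example of the arithmetic above (illustrative comment, not in the original
    // source).  Assume a hypothetical 31GB job and the default 15GB share quantum:
    //
    //    mem           = 31 << 20 = 32,505,856 KB
    //    share_quantum = 15 << 20 = 15,728,640 KB
    //    32,505,856 / 15,728,640  = 2   (integer division)
    //    remainder is nonzero, so share_order rounds up to 3
    //
    // i.e. the job occupies three 15GB quantum shares.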


    /**
     * Collect all the classes served by the indicated nodepool (property set).  This fills
     * in the 'ret' map from the parameter 'dp' and from recursive calls to the children of dp.
     *
     * @param dp  The properties object from the configurator for a top-level nodepool.
     * @param ret The map to be filled in by this routine.
     */
    void getClassesForNodepool(DuccProperties dp, Map<ResourceClass, ResourceClass> ret)
    {
        @SuppressWarnings("unchecked")
        List<DuccProperties> class_set = (List<DuccProperties>) dp.get("classes");
        if ( class_set != null ) {
            for ( DuccProperties cl : class_set ) {
                ResourceClass rc = resourceClassesByName.get(cl.getStringProperty("name"));
                ret.put(rc, rc);
            }
        }

        @SuppressWarnings("unchecked")
        List<DuccProperties> children = (List<DuccProperties>) dp.get("children");
        if ( children != null ) {
            for (DuccProperties child : children ) {
                getClassesForNodepool(child, ret);
            }
        }        
    }

    /**
     * Map each node by name into the nodepool it belongs to
     */
    void mapNodesToNodepool(Map<String, String> nodes, NodePool pool)
    {
        if ( nodes == null ) return;

        for ( String s : nodes.keySet() ) {
            updateNodepoolsByNode(s, pool);        // maps from both the fully-qualified name and the short name
        }
    }

    /**
     * (Recursively) build up the hierarchy under the parent nodepool.
     */
    void createSubpools(NodePool parent, List<DuccProperties> children)
    {
        if ( children == null ) return;

        for ( DuccProperties dp : children ) {
            String id = dp.getStringProperty("name");
            @SuppressWarnings("unchecked")
            Map<String, String> nodes = (Map<String, String>) dp.get("nodes");
            int search_order = dp.getIntProperty("search-order", 100);
            NodePool child = parent.createSubpool(id, nodes, search_order);
            mapNodesToNodepool(nodes, child);

            @SuppressWarnings("unchecked")
            List<DuccProperties> grandkids = (List<DuccProperties>) dp.get("children");
            createSubpools(child, grandkids);            
        }
    }
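
    // Illustrative only (hypothetical pool names): if the top-level pool "top" has
    // children "gpu" and "highmem", and "gpu" itself has a child "gpu-large", the
    // recursion above builds
    //
    //    top
    //       gpu
    //          gpu-large
    //       highmem
    //
    // and registers every node of every subpool in nodepoolsByNode via mapNodesToNodepool().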

    // UIMA-4142 better modularize this code
    NodeConfiguration readConfiguration()
        throws Exception
    {
        String class_definitions = SystemPropertyResolver
            .getStringProperty(DuccPropertiesResolver
                               .ducc_rm_class_definitions, "scheduler.classes");
        String user_registry = SystemPropertyResolver
            .getStringProperty(DuccPropertiesResolver
                               .ducc_rm_user_registry, "ducc.users");
        class_definitions = System.getProperty("DUCC_HOME") + "/resources/" + class_definitions;
        String me = Scheduler.class.getName() + ".Config";
        DuccLogger initLogger = new DuccLogger(me, COMPONENT_NAME);
        NodeConfiguration nc = new NodeConfiguration(class_definitions, null, user_registry, initLogger);        // UIMA-4142 make the config global
        nc.readConfiguration();
        return nc;                                             // UIMA-4142
    }
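
    // For orientation, a hypothetical sketch of the kinds of stanzas the class-definition
    // file contains (the exact grammar is owned by NodeConfiguration, not this class):
    //
    //    Nodepool batch  { }
    //    Nodepool gpu    { parent = batch }
    //
    //    Class normal {
    //        policy   = FAIR_SHARE
    //        nodepool = batch
    //    }
    //
    // readConfiguration() parses such stanzas into DuccProperties trees, which
    // initClasses() below turns into NodePool and ResourceClass objects.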

    // UIMA-4142, don't pass in class def file, instead use common readConfiguration
    void initClasses()
    {
    	String methodName = "initClasses";
        
		try {
            configuration = readConfiguration();
		} catch (Throwable e) {
            // RM boot.  We must abort because we have no prior working configuration to fall back to.
            logger.error(methodName, null, e);
            logger.error(methodName, null, "Scheduler exits: unable to read configuration.");
            System.out.println("Scheduler exits: unable to read configuration.");
            e.printStackTrace();
            System.exit(1);
		}

        defaultDomain = configuration.getDefaultDomain();                   // UIMA-4142
        configuration.printConfiguration();

        DuccProperties[] nps = configuration.getToplevelNodepools();
        Map<String, DuccProperties> cls = configuration.getClasses();

        nodepools = new NodePool[nps.length];                   // top-level nodepools
        schedulers = new IScheduler[nps.length];                // a scheduler for each top-level nodepool

        // Here build up the ResourceClass definitions
        logger.info(methodName, null, "Classes:");
        logger.info(methodName, null, ResourceClass.getHeader());
        logger.info(methodName, null, ResourceClass.getDashes());
        for ( DuccProperties props : cls.values() ) {
            ResourceClass rc = new ResourceClass(props);
            resourceClasses.put(rc, rc);
            resourceClassesByName.put(rc.getName(), rc);
            logger.info(methodName, null, rc.toString());
        }

        DuccProperties dc = configuration.getDefaultFairShareClass();
        if ( dc != null ) {
            defaultFairShareName = dc.getProperty("name");
        }

        dc = configuration.getDefaultReserveClass();
        if ( dc != null ) {
            defaultReserveName = dc.getProperty("name");
        }

        // Instantiate one scheduler per top-level nodepool
        try {
            schedImplName = SystemPropertyResolver.getStringProperty("ducc.rm.scheduler", "org.apache.uima.ducc.rm.ClassBasedScheduler");
            @SuppressWarnings("unchecked")
            Class<IScheduler> cl = (Class<IScheduler>) Class.forName(schedImplName);
            for ( int i = 0; i < nps.length; i++ ) {
                logger.info(methodName, null, "Rebuilding", schedImplName, "for top level nodepool", nps[i].get("name"));
                schedulers[i] = (IScheduler) cl.newInstance();
                schedulers[i].setEvictionPolicy(evictionPolicy);
            }

        } catch (ClassNotFoundException e) {
            throw new SchedulingException(null, "Cannot find class " + schedImplName);
        } catch (InstantiationException e) {
            throw new SchedulingException(null, "Cannot instantiate class " + schedImplName);            
        } catch (IllegalAccessException e) {
            throw new SchedulingException(null, "Cannot instantiate class " + schedImplName + ": can't access constructor.");            
        }

        // Here create the nodepool configuration
        for ( int i = 0; i < nps.length; i++ ) {
            DuccProperties np = nps[i];
            String id = np.getStringProperty("name");
            @SuppressWarnings("unchecked")
            Map<String, String> nodes = (Map<String, String>) np.get("nodes");
            int search_order = np.getIntProperty("search-order", 100);
            int q = np.getIntProperty("share-quantum", 15) << 20 ;      // to kB which is how the nodes report in
            nodepools[i] = new NodePool(null, id, nodes, evictionPolicy, 0, search_order, q);
            schedulers[i].setNodePool(nodepools[i]);                    // set its top-level nodepool

            mapNodesToNodepool(nodes, nodepools[i]);
            logger.info(methodName, null, "Created top-level nodepool", id);

            @SuppressWarnings("unchecked")
            List<DuccProperties> children = (List<DuccProperties>) np.get("children");
            createSubpools(nodepools[i], children);

            Map<ResourceClass, ResourceClass> classesForNp = new HashMap<ResourceClass, ResourceClass>();
            getClassesForNodepool(np, classesForNp);           // all classes served by this hierarchy - fills in classesForNp
            for ( ResourceClass rc: classesForNp.values() ) {               // UIMA-4065 tell each cl which np serves it
                String rcid = rc.getNodepoolName();
                if ( rcid != null ) {
                    // set the two-way pointers between rc and np
                    NodePool subpool = nodepools[i].getSubpool(rcid);
                    rc.setNodepool(subpool);                  // rc -> nodepool
                    logger.info(methodName, null, "Assign rc", rc.getName(), "to np", subpool.getId());
                    subpool.addResourceClass(rc);             // nodepool -> rc
                }
            }

            schedulers[i].setClasses(classesForNp);

        }

        // Here create or update Users with constraints from the registry
        Map<String, DuccProperties> usrs = configuration.getUsers();       // UIMA-4275
        for ( String n : usrs.keySet() ) {                    // iterate over users
            DuccProperties dp = usrs.get(n);
            for ( Object l : dp.keySet() ) {                  // iterate over limits for the user
                if ( !((String)l).startsWith("max-allotment")) continue; // only this supported at this time
                String val = ((String) dp.get(l)).trim();

                int lim = Integer.parseInt( val ); // verified parsable int during parsing


                User user = users.get(n);
                if (user == null) {
                    user = new User(n);
                    users.put(n, user);
                }

                if ( ((String)l).contains(".") ) {             // the key, not the value, carries the class qualifier
                    String[] tmp = ((String)l).split("\\.");   // max-allotment.classname
                    ResourceClass rc = resourceClassesByName.get(tmp[1]);
                    user.overrideLimit(rc, lim);   // constrain allotment for this class to the registry value
                } else {
                    user.overrideGlobalLimit(lim);
                }
            }
        }
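
        // Illustrative only: the registry entries this loop understands look like the
        // following hypothetical example - a bare key caps the user's total allotment,
        // a dotted key constrains a single class:
        //
        //    max-allotment        = 40     // global limit, all classes
        //    max-allotment.normal = 20     // limit for class "normal" only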

    }

    /**
     * Called only from schedule, under the 'this' monitor.
     *
     * This takes the SchedulingUpdate from the IScheduler and dispatches orders to
     * the world to make it happen.
     *
     * For jobs that lose resources, the job manager is asked to stop execution in specific shares.
     * For jobs that gain resources, the job manager is asked to start execution in specific shares.
     * Jobs that don't change are leftovers.  If they're not running at all, they're in the pending
     * list; they might also be in the running list but had no allocation changes in the current epoch.
     */
    private JobManagerUpdate dispatch(SchedulingUpdate upd, JobManagerUpdate jmu)
    {
        String methodName = "dispatch";
        HashMap<DuccId, IRmJob> jobs;

        pending_evictions = 0;                    // for queries
        pending_expansions = 0;                  // for queries

        // Go through shrunken jobs - if they are shrunken to 0, move to dormant
        jobs = upd.getShrunkenJobs();
        for (IRmJob j : jobs.values()) {
            
            logger.trace(methodName, j.getId(), ">>>>>>>>>> SHRINK");

            HashMap<Share, Share> sharesE = j.getAssignedShares();
            HashMap<Share, Share> sharesR = j.getPendingRemoves();
            logger.trace(methodName, j.getId(), "removing", sharesR.size(), "of existing", sharesE.size(), "shares.");
            pending_evictions += (sharesR.size() * j.getShareOrder());

            for ( Share s : sharesE.values() ) {
                logger.trace(methodName, j.getId(), "    current", s.toString());
            }

            for ( Share s : sharesR.values() ) {
                logger.trace(methodName, j.getId(), "    remove ", s.toString());
            }
            logger.trace(methodName, j.getId(), ">>>>>>>>>>");

            jmu.removeShares(j, sharesR);
            // jobManager.stopJob(j, shares);                 // stops job on everything on the pendingRemoves list
            // j.clearPendingRemoves();
        }

        // Go through expanded jobs - if they are dormant, remove from dormant
        //                            then add to running.
        // Tell the server it needs to start some machines for the job
        jobs = upd.getExpandedJobs();
        for (IRmJob j : jobs.values() ) {
            HashMap<Share, Share> sharesE = j.getAssignedShares();
            HashMap<Share, Share> sharesN = j.getPendingShares();

            logger.trace(methodName, j.getId(), "<<<<<<<<<<  EXPAND");
            logger.trace(methodName, j.getId(), "adding", sharesN.size(), "new shares to existing", sharesE.size(), "shares.");
            pending_expansions += (sharesN.size() * j.getShareOrder());

            for ( Share s : sharesE.values()) {
                logger.trace(methodName, j.getId(), "    existing ", s.toString());
            }

            for ( Share s : sharesN.values()) {
                logger.trace(methodName, j.getId(), "    expanding", s.toString());
            }
            logger.trace(methodName, j.getId(), "<<<<<<<<<<");

            sharesN = j.promoteShares();
            if ( sharesN.size() == 0 ) {
                // internal error - should not be marked expanded if no machines
                throw new SchedulingException(j.getId(), "Trying to execute expanded job but no pending machines.");
            }

            for ( Share s : sharesN.values()) {                           // update machine books                
                // Sanity checks on the bookkeeping
                busyShares.put(s.getId(), s);                
            }

//            DuccId id = j.getId();                                  // pull from dormant, maybe
//            if ( dormantJobs .containsKey(id) ) {
//                dormantJobs .remove(id);
//            }

            //runningJobs.put(id, j);
            jmu.addShares(j, sharesN);
            // jobManager.executeJob(j, shares);                      // will update job's pending lists

        }

        jobs = upd.getStableJobs();                             // squirrel these away to try next time
        for (IRmJob j: jobs.values()) {
            if ( j.countNShares() < 0 ) {
                throw new SchedulingException(j.getId(), "Share count went negative " + j.countNShares());
            }
            logger.trace(methodName, j.getId(), ".......... STABLE with ", j.countNShares(), " shares.");
        }

        jobs = upd.getDormantJobs();                             // squirrel these away to try next time
        for (IRmJob j: jobs.values()) {
            logger.trace(methodName, j.getId(), ".......... DORMANT");
//            dormantJobs .put(j.getId(), j);
        }

        jobs = upd.getReservedJobs();
        for (IRmJob j: jobs.values()) {
            logger.trace(methodName, j.getId(), "<<<<<<<<<<  RESERVE");

            HashMap<Share, Share> sharesE = j.getAssignedShares();
            HashMap<Share, Share> sharesN = j.getPendingShares();

            if ( sharesE.size() == j.getMaxShares() ) {
                logger.trace(methodName, j.getId(), "reserve_stable", sharesE.size(), "machines");
            } else  if ( sharesN.size() == j.getMaxShares() ) {           // reservation is complete but not yet confirmed?
                logger.trace(methodName, j.getId(), "reserve_adding", sharesN.size(), "machines");
                for ( Share s : sharesN.values()) {
                    logger.trace(methodName, j.getId(), "    reserve_expanding ", s.toString());
                }
                jmu.addShares(j, sharesN);                
                j.promoteShares();
            } else {
                logger.trace(methodName, j.getId(), "reserve_pending", j.getMaxShares(), "machines");
            }
            logger.trace(methodName, j.getId(), "<<<<<<<<<<");
        }

        jmu.setAllJobs((HashMap<DuccId, IRmJob>) allJobs);

        jobs = upd.getRefusedJobs();
        Iterator<IRmJob> iter = jobs.values().iterator();
        while ( iter.hasNext() ) {
            IRmJob j = iter.next();
            logger.trace(methodName, j.getId(), ".......... REFUSED");
        }

        return jmu;
    }
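
    // Note (illustrative): pending_evictions and pending_expansions above are counted in
    // quantum shares, not processes.  For example, removing 2 processes from a job with
    // share order 3 adds 2 * 3 = 6 to pending_evictions.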

    /**
     * We don't accept new work or even Orchestrator state updates until "ready".  We do
     * accept machine updates, but must be sure the internal structures are protected.
     */
    public synchronized boolean ready()
    {
    	return stability;
    }

    public synchronized void start()
    {
        stability = true;
    }

    public void stop()
    {
        persistence.close();
    }

    protected void handleIllNodes()
    {
    	String methodName = "handleIllNodes";
    	
        if ( ! isInitialized() ) {
            logger.info(methodName, null, "Waiting for (re)initialization.");
            return;
        }

        HashMap<Node, Integer> nodeUpdates = new HashMap<Node, Integer>();
        synchronized(illNodes) {
            nodeUpdates.putAll(illNodes);
            illNodes.clear();
        }
        
        synchronized(this) {
            for ( Node n : nodeUpdates.keySet() ) {
                Machine m = getMachine(n);

                if ( m == null ) {
                    logger.warn(methodName, null, "Cannot find any record of machine", n.getNodeIdentity().getName());
                    continue;
                }

                int count = nodeUpdates.get(n);
                if ( count == 0 ) {
                    m.heartbeatArrives();
                } else {
                    m.heartbeatMissed(count);
                }
            }
        }
    }

    protected void handleDeadNodes()
    {
    	String methodName = "handleDeadNodes";
    	
        if ( ! isInitialized() ) {
            logger.info(methodName, null, "Waiting for (re)initialization.");
            return;
        }

        HashMap<Node, Node> nodeUpdates = new HashMap<Node, Node>();
        synchronized(deadNodes) {
            nodeUpdates.putAll(deadNodes);
            deadNodes.clear();
        }

        synchronized(this) {

            for ( Node n : nodeUpdates.values() ) {
                Machine m = getMachine(n);

                if ( m == null ) {
                    // must have been removed because of earlier missed hb
                    continue;
                }

                logger.warn(methodName, null, "***Purging machine***", m.getId(), "due to missed heartbeats. Threshold:", nodeStability);
                NodePool np = m.getNodepool();
                np.nodeLeaves(m);
            }
        }        
    }

    /**
     * We first accept any changes and requests from the outside world and place them where they
     * can be acted on in this epoch.
     *
     * We then pass all relevant requests and resources to the IScheduler.  This returns a
     * SchedulingUpdate which is passed to the dispatcher to be acted upon.
     */
    public JobManagerUpdate schedule()
    {
    	String methodName = "schedule";


//         if ( startupCountdown++ < nodeStability ) {
//             logger.info(methodName, null, "Startup countdown:", startupCountdown, "of", nodeStability);
//             return null;
//         }

        if ( ! ready() ) {
            return null;
        }

        if ( ! isInitialized() ) {
            logger.info(methodName, null, "Waiting for (re)initialization.");
            return null;
        }

        // tracking the OR hang problem - are topics being delivered?
        logger.info("nodeArrives", null, "Total arrivals:", total_arrivals);

        synchronized(this) {
            handleIllNodes();
            handleDeadNodes();
            resetNodepools();
        }

        // TODO: Can we combine these two into one?
        SchedulingUpdate upd = new SchedulingUpdate();              // state from internal scheduler
        JobManagerUpdate jmu = new JobManagerUpdate();              // state we forward to job manager

        // int nchanges = 0;
    	

        ArrayList<IRmJob> jobsToRecover = new ArrayList<IRmJob>();
        synchronized(recoveredJobs) {
            jobsToRecover.addAll(recoveredJobs);
            recoveredJobs.clear();
            // nchanges += jobsToRecover.size();
        }

        ArrayList<IRmJob> newJobs = new ArrayList<IRmJob>();
        // 
        // If there are new jobs we need to init some things and start a scheduling cycle.
        //
        synchronized(incomingJobs) {            
            newJobs.addAll(incomingJobs);
            incomingJobs.clear();
            // nchanges += newJobs.size();
        }

        //
        // If some jobs passed initialization we need to signal a scheduling cycle to get
        // them their fair share
        //
//        synchronized(initializedJobs) {            
//            if ( initializedJobs.size() > 0 ) {
//                nchanges++;
//            }
//            initializedJobs.clear();
//        }

        //
        // If some jobs completed we need to clear them out and signal a
        // scheduling cycle to try to reuse their resources.
        //
        ArrayList<IRmJob> doneJobs = new ArrayList<IRmJob>();
        synchronized(completedJobs) {
            doneJobs.addAll(completedJobs);
            completedJobs.clear();
            //nchanges += doneJobs.size();
        }

        //
        // If some shares were vacated we need to clear them out and run a scheduling cycle.
        //
        ArrayList<Pair<IRmJob, Share>> doneShares = new ArrayList<Pair<IRmJob, Share>>();
        synchronized(vacatedShares) {
            doneShares.addAll(vacatedShares.values());
            vacatedShares.clear();
            //nchanges += doneShares.size();

            // we use the vacatedShares object to control share growth as well
            //if ( growthOccurred ) nchanges++;
            //growthOccurred = false;
        }

//         boolean must_run = false;
//         synchronized(force_epoch) {
//             must_run = force_epoch;
//             force_epoch = false;
//         }

//         if ( (nchanges == 0) && !must_run ) { 
//             jmu.setAllJobs(allJobs);
//             return jmu;
//         }
// TODO if we remove this code above be sure to clear out all the force_epoch nonsense
// TODO does this even use growthOccurred?

        synchronized(this) {

            // before looking at jobs, insure we're updated after a crash
            for ( IRmJob j : jobsToRecover ) {
                processRecovery(j);
            }

            // process these next to free up resources for the scheduling cycle
            for (Pair<IRmJob, Share> p : doneShares) {
                processCompletion(p.first(), p.second());
            }

            for (IRmJob j : doneJobs) {
                processCompletion(j);
            }

            // update user records, "check in" new jobs
            if ( newJobs.size() > 0 ) {
                logger.info(methodName, null, "Jobs arrive:");
                logger.info(methodName, null, "submit", RmJob.getHeader());
            }

            Iterator<IRmJob> iter = newJobs.iterator();
            while ( iter.hasNext() ) {
                IRmJob j = iter.next();


                if ( j.isRefused() ) {          // the JobManagerConverter has already refused it
                    logger.info(methodName, j.getId(), "Bypassing previously refused job.");
                    upd.refuse(j, j.getRefusalReason());
                    continue;                   // skip the bookkeeping below for refused work
                }

                String user = j.getUserName();
                User u = users.get(user);
                if ( u == null ) {
                    u = new User(user);
                    users.put(user, u);
                }
                j.setUser(u);

                // Calculate its share order
                int share_order = calcShareOrder(j);
                j.setShareOrder(share_order);

                // Assign it to its priority class
                String clid = j.getClassName();
                ResourceClass prclass = resourceClassesByName.get(clid);

                u.addJob(j);
                allJobs.put(j.getId(), j);
                if ( prclass == null ) {                    
                    upd.refuse(j, "Cannot find priority class " + clid + " for job");
                    continue;
                }

                // UIMA-4275 never refuse impossible work, just let it hang out
//                 if ( share_order > max_order ) {
//                     upd.refuse(j, "Memory requested " + j.getMemory() + "GB exceeds the capacity of any machine in the cluster.");
//                     continue;
//                 }

                /**
                 * We want to allow this - a normal job, submitted to a reservation class.
                   if ( (prclass.getPolicy() == Policy.RESERVE ) && ( ! j.isReservation() ) ) {
                   upd.refuse(j, "Reservaction class " + 
                   prclass.getId() + " specified but work is not a reservation.");
                   continue;
                   }
                */

                if ( ((prclass.getPolicy() != Policy.RESERVE ) && (prclass.getPolicy() != Policy.FIXED_SHARE)) && ( j.isReservation() ) ) {
                    upd.refuse(j, "Class " + prclass.getName() + " is policy " + 
                               prclass.getPolicy() + " but the work is submitted as a reservation.");
                    continue;
                }

                prclass.addJob(j);
                j.setResourceClass(prclass);
                try {
					persistence.addJob(j);
				} catch (Exception e) {
					logger.warn(methodName, j.getId(), "Cannot persist new job in database:", e);					
				}
                logger.info(methodName, j.getId(), "submit", j.toString());
            }

            logger.info(methodName, null, "Scheduling " + newJobs.size(), " new jobs.  Existing jobs: " + allJobs.size());
            for ( int i = 0; i < schedulers.length; i++ ) {
                logger.info(methodName, null, "Run scheduler", i, "with top-level nodepool", nodepools[i].getId());
                schedulers[i].schedule(upd);
            }

            for ( IRmJob j : allJobs.values() ) {       // UIMA-4577 persist 'demand'
                try {
					persistence.updateDemand(j);
				} catch (Exception e) {
					logger.warn(methodName, j.getId(), "Cannot update demand in database:", e);
				}
            }

            logger.info(methodName, null, "--------------- Scheduler returns ---------------");
            logger.info(methodName, null, "\n", upd.toString());
            logger.info(methodName, null, "------------------------------------------------");                
            dispatch(upd, jmu);                 // my own job lists get updated by this

            return jmu;
        }
    }

    synchronized public void shutdown()
    {
        done = true;
    }

//     public void run()
//     {
//     	String methodName = "run";
//         while ( ! done ) {
//             try { sleep(epoch); } catch (InterruptedException e) { }

//             logger.info(methodName, null, "========================== Epoch starts ===========================");
//             try {
//                 schedule();
//             } catch ( SchedulingException e ) {
//                 logger.info(methodName, e.jobid, e);
//             }

//             logger.info(methodName, null, "========================== Epoch ends   ===========================");
//         }
//     }


    /**
     * Maps from both the fully-qualified name and the short name.
     */
    void updateNodepoolsByNode(String longname, NodePool np)
    {
    	String methodName = "updateNodepoolsByNode";
        String shortname = longname;
        int ndx = longname.indexOf(".");

        logger.info(methodName, null, "Map", longname, "to", np.getId());
        nodepoolsByNode.put(longname, np);

        if ( ndx >= 0 ) {
            shortname = longname.substring(0, ndx);
            nodepoolsByNode.put(shortname, np);
            shortToLongNode.put(shortname, longname);
            logger.info(methodName, null, "Map", shortname, "to", np.getId());
        }
    }
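
    // Example (illustrative): updateNodepoolsByNode("node42.example.com", np) records
    //    nodepoolsByNode: "node42.example.com" -> np   and   "node42" -> np
    //    shortToLongNode: "node42" -> "node42.example.com"
    // so later lookups succeed whichever form a caller uses.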

    //
    // Return a nodepool by NodeIdentity.  If the node can't be associated with a nodepool, return the
    // default nodepool, which is always the first one defined in the config file.
    //
    NodePool getNodepoolByName(NodeIdentity ni)
    {
        NodePool np = nodepoolsByNode.get( ni.getName() );
        if ( np == null ) {
            np = nodepoolsByNode.get( ni.getIp() );
        }
        if ( np == null ) {
            np = nodepools[0];
            updateNodepoolsByNode(ni.getName(), np);     // assign this guy to the default np
            // nodepoolsByNode.put( ni.getName(), np);          // assign this guy to the default np
        }
        return np;
    }

    private int total_arrivals = 0;
    public synchronized void nodeArrives(Node node)
    {        
        String methodName = "nodeArrives";
        if ( ! isInitialized() ) {
            logger.info(methodName, null, "Waiting for (re)initialization; node = " + node.getNodeIdentity().getName());
            return;
        }

        synchronized(illNodes) {        // stop flagging it as a problem
            illNodes.remove(node);
        }

    	// String methodName = "nodeArrives";
        // The first block insures the node is in the scheduler's records as soon as possible

        total_arrivals++;       // report these in the main schedule loop
        
        NodePool np = getNodepoolByName(node.getNodeIdentity());  // finds np assigned in ducc.nodes; if none, returns the default np
        Machine m = np.getMachine(node);
        int share_order = 0;

        // let's always recalculate this in case it changes for whatever bizarre reason (reboot, or pinned process gone, or whatever)
        long allocatable_mem =  node.getNodeMetrics().getNodeMemory().getMemFree() - share_free_dram;
        if ( dramOverride > 0 ) {
            allocatable_mem = dramOverride;
        }
        share_order = (int) (allocatable_mem / np.getShareQuantum());           // conservative - rounds down (this will always cast ok)                
        // NOTE: we cannot set the order into the machine yet, in case it has changed, because NodePool needs to adjust based
        //       on current and new

        max_order = Math.max(share_order, max_order);
        m = np.nodeArrives(node, share_order);                         // announce to the nodepools
        m.heartbeatArrives();
    }
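
    // Illustrative arithmetic for the share-order calculation above (hypothetical
    // numbers): if the agent reports 64GB free (67,108,864 KB), share_free_dram is 0,
    // and the pool's quantum is 15GB (15,728,640 KB), then
    //
    //    67,108,864 / 15,728,640 = 4   (rounds down)
    //
    // so the machine is booked as 4 quantum shares.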

    public void nodeHb(Node n, int count)
    {
        synchronized(illNodes) {
            illNodes.put(n, count);
        }
    }

    public void nodeDeath(Map<Node, Node> nodes)
    {
        synchronized(deadNodes) {
            deadNodes.putAll(nodes);
        }
    }

    /**
     * The user passed us a node by name, which may or may not be fully qualified.
     * The node itself may or may not have checked in with a qualified name.  Here we try to
     * find something that matches.
     * UIMA-4142.  Technically a bug on vary-on and vary-off, but found and fixed as part of
     *             the indicated Jira.
     */
    synchronized String resolve(String node)
    {
        NodePool np = nodepoolsByNode.get(node);
        if ( np == null ) return null;                  // indexed by long and short so if not found we're stuck

        if ( np.hasNode(node) ) return node;            // known by this name, we're done

        int ndx = node.indexOf(".");
        if ( ndx > 0 ) {
            // np MUST know it by either long or short or it wouldn't be in nodepoolsByNode
            // so it must be short
            return node.substring(0, ndx);
        } else {
            // and vice-versa, it must be the long
            return shortToLongNode.get(node);
        }
    }
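
    // Example (illustrative): suppose "node42.example.com" checked in fully qualified.
    //    resolve("node42.example.com") -> hasNode() is true, returned as-is
    //    resolve("node42")             -> hasNode() is false and there is no '.', so the
    //                                     long form is returned from shortToLongNode
    // Had the node checked in unqualified, resolve("node42.example.com") would strip the
    // domain and return "node42".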

    public synchronized RmAdminReply varyon(String[] nodes)
    {
        String methodName = "varyon";
        RmAdminVaryReply ret = new RmAdminVaryReply();
        StringBuffer sb = new StringBuffer();
        for (String n : nodes ) {

            String rn = resolve(n);
            if ( rn == null ) {
                ret.setRc(false);
                ret.addFailedHost(n);
                sb.append("VaryOn: " + n + " cannot be found in the RM.\n");
            } else {                
                NodePool np = nodepoolsByNode.get(rn);  // if null, resolve will fail
                if ( np == null ) {
                    ret.setRc(false);
                    ret.addFailedHost(rn);
                    sb.append("VaryOn: " + n + " cannot find associated nodepool.\n");
                } else {
                    String repl = np.varyon(rn);
                    logger.info(methodName, null, repl);
                    sb.append(repl);
                    sb.append("\n");
                }
            }
        }
        ret.setMessage(sb.toString());
        return ret;
    }

    public synchronized RmAdminReply varyoff(String[] nodes)
    {
        String methodName = "varyoff";
        RmAdminVaryReply ret = new RmAdminVaryReply();
        StringBuffer sb = new StringBuffer();
        for (String n : nodes ) {

            String rn = resolve(n);
            if ( rn == null ) {
                ret.setRc(false);
                ret.addFailedHost(n);
                sb.append("VaryOff: " + n + " cannot be found in the RM.\n");
            } else {
                NodePool np = nodepoolsByNode.get(rn);  // if null, resolve will fail
                if ( np == null ) {
                    ret.setRc(false);
                    ret.addFailedHost(rn);
                    sb.append("VaryOff: " + n + " cannot find associated nodepool.\n");
                } else {
                    String repl = np.varyoff(rn);
                    logger.info(methodName, null, repl);
                    sb.append(repl);
                    sb.append("\n");
                }
            }
        }
        ret.setMessage(sb.toString());
    	return ret;
    }

    RmQueriedNodepool  getNpStats(NodePool np)
    {

        RmQueriedNodepool ret = new RmQueriedNodepool();
        
        ret.setName(np.getId());
        ret.setOnline(np.countLocalMachines());
        ret.setDead(np.countLocalUnresponsiveMachines());
        ret.setOffline(np.countLocalOfflineMachines());
        
        ret.setSharesAvailable(np.countLocalShares());
        ret.setSharesFree(np.countLocalQShares());

        ret.setAllMachines(np.countAllLocalMachines());

        int[] onlineMachines = np.makeArray();
        int[] freeMachines = np.makeArray();
        for ( int i = 1; i < freeMachines.length; i++ ) {
            freeMachines[i] += np.countFreeMachines(i);         // (these are local, as we want)
        }

        //np.getLocalOnlineByOrder(onlineMachines);
        ret.setOnlineMachines(onlineMachines);
        ret.setFreeMachines(freeMachines);

        ret.setVirtualMachines(np.countLocalVMachinesByOrder());

        // logger.info(methodName, null, np.getId() + ": online", online, "dead", dead, "offline", offline, "shares_available", shares_available, "shares_free", shares_free);
        // logger.info(methodName, null, np.getId() + ": allMachines    ", Arrays.toString(allMachines));
        // logger.info(methodName, null, np.getId() + ": onlineByOrder  ", Arrays.toString(onlineMachines));

        // logger.info(methodName, null, np.getId() + "------- freeMachines should match free -------");
        // logger.info(methodName, null, np.getId() + ": freeMachines   ", Arrays.toString(freeMachines));
        // logger.info(methodName, null, np.getId() + ": free           ", Arrays.toString(free));
        // logger.info(methodName, null, np.getId() + "----------------------------------------------");
        // logger.info(methodName, null, np.getId() + ": virtualMachines", Arrays.toString(virtualMachines));

        return ret;
    }

    void calculateLoad(RmAdminQLoadReply reply)
    {

        for ( ResourceClass cl : resourceClasses.values() ) {
            RmQueriedClass qcl = new RmQueriedClass();

            switch ( cl.getPolicy() ) {
                case FAIR_SHARE:
                    qcl.setPolicy("FAIR_SHARE");
                    break;
                case FIXED_SHARE:
                    qcl.setPolicy("FIXED_SHARE");
                    break;
                case RESERVE:
                    qcl.setPolicy("RESERVE");
                    break;
            }

            // TODO MUST FIX THIS

            // int[] demanded = NodePool.makeArray();
            // int[] awarded  = NodePool.makeArray();

            // HashMap jobs = cl.getAllJobs();
            // for ( IRmJob j : jobs.values() ) {
            //     int o = j.getShareOrder();
            //     demanded[o] += j.queryDemand();
            //     awarded[o]  += j.countNShares();
            // }
            
            // qcl.setName(cl.getName());
            // qcl.setDemanded(demanded);
            // qcl.setAwarded(awarded);
            // reply.addClass(qcl);
        }
    }

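    /**
     * Recursively collect <code>parent</code> and all of its descendant nodepools,
     * depth-first, into <code>list</code>.
     */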
    void listAllNodepools(NodePool parent, ArrayList<NodePool> list)
    {
        list.add(parent);
        for (NodePool np : parent.getChildren().values() ) {
            listAllNodepools(np, list);
        }
    }

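    /**
     * Admin query: return a load snapshot covering every nodepool in the system.
     *
     * An admin-side caller might consume the reply roughly as follows.  This is a
     * sketch only; it assumes the obvious getters on the reply and on
     * RmQueriedNodepool, matching the stats added via addNodepool() below:
     * <pre>
     *   RmAdminQLoadReply load = scheduler.queryLoad();
     *   for ( RmQueriedNodepool np : load.getNodepools() ) {
     *       System.out.println(np.getName() + ": " + np.getOnline() + " machines online");
     *   }
     * </pre>
     */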
    public synchronized RmAdminQLoadReply queryLoad()
    {        

        RmAdminQLoadReply ret = new RmAdminQLoadReply();
        if ( ! ready() ) {
            ret.notReady();
            return ret;
        }

        calculateLoad(ret);

        ArrayList<NodePool> allpools = new ArrayList<NodePool>();
        for ( NodePool np : nodepools ) {
            listAllNodepools(np, allpools);
        }

        for ( NodePool np : allpools ) {
            ret.addNodepool(getNpStats(np));
        }
        
        return ret;
    }


    public synchronized RmAdminQOccupancyReply queryOccupancy()
    {
        RmAdminQOccupancyReply ret = new RmAdminQOccupancyReply();
        if ( ! ready() ) {
            ret.notReady();
            return ret;
        }

        //
        // Iterate the top-level nodepools to get all their subpools,
        //   iterate the subpools to get all their machines,
        //      iterate the machines and request a query object from each,
        //         adding each query object to ret,
        // then return ret.
        //
        // We want RM to depend on the common project, not the other way around, so
        // we keep the query objects in common and put knowledge of how to construct
        // them into RM's Machine class.
        //
        // The alternative, passing RM's Machine to the query object, creates a circular
        // dependency, with RM depending on common and common depending on RM.
        //
        // Not a cheap query, by the way.
        //
        // NOTE: No longer used by the rm_qoccupancy script, which now goes directly to the database.
        //
        
        
        for ( NodePool np : nodepools ) {

            // NOTE:  The offline & dead nodes are also in the AllMachines list so must be removed
            Map<Node, Machine> allMachs     = np.getAllMachines();
            Map<Node, Machine> offline      = np.getOfflineMachines();          // UIMA-4234
            Map<Node, Machine> unresponsive = np.getUnresponsiveMachines();     // UIMA-4234

            for ( Node n : offline.keySet() ) {
                Machine m = offline.get(n);
                RmQueriedMachine qm = m.queryMachine();
                qm.setOffline();
                if ( unresponsive.containsKey(n) ) {
                    unresponsive.remove(n);
                    qm.setUnresponsive();
                }
                ret.addMachine(qm);
                allMachs.remove(n);
            }

            for ( Node n : unresponsive.keySet() ) {
                Machine m = unresponsive.get(n);
                RmQueriedMachine qm = m.queryMachine();
                qm.setUnresponsive();
                ret.addMachine(qm);
                allMachs.remove(n);
            }
            
            for ( Node n : allMachs.keySet() ) {
                Machine m = allMachs.get(n);
                ret.addMachine(m.queryMachine());
            }
        }

        return ret;
    }

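    /**
     * Callback from the job manager: record the latest published state for the job.
     * The job may not have arrived here yet; if not, the state is picked up later.
     */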
    public synchronized void signalState(DuccId jobid, String state)
    {
        IRmJob j = allJobs.get(jobid);
        if ( j != null ) {                // might not be here yet, we'll get it later
            j.setState(state);
        }
    }

    /**
     * Callback from job manager, need shares for a new fair-share job.
     */
    public void signalNewWork(IRmJob job)
    {
        // We'll synchronize only on the incoming job list 
        synchronized(incomingJobs) {            
            incomingJobs.add(job);
        }
    }

//     public void signalForceEpoch()
//     {
//         synchronized( force_epoch ) {
//             force_epoch = true;
//         }
//     }

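    /**
     * Callback from the job manager: the job has completed initialization.  Queued
     * here and folded into the next scheduling cycle.
     */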
    public void signalInitialized(IRmJob job)
    {
        // We'll synchronize only on the incoming job list 
        synchronized(initializedJobs) {            
            initializedJobs.add(job);
        }
    }

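    /**
     * Callback during restart: queue a job recovered from the previous RM instance;
     * it is reconnected to its shares in processRecovery().
     */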
    public void signalRecovery(IRmJob job)
    {
        synchronized(recoveredJobs) {
            recoveredJobs.add(job);
        }
    }

    public void jobCancelled(DuccId id)
    {
        // TODO Fill this in.
    }

    /**
     * Callback from job manager when a job completes.  We just believe him; no sanity checks or other such stuff.
     */
    public void signalCompletion(DuccId id)
    {
        String methodName = "signalCompletion";
        synchronized(completedJobs) {
            try {
                IRmJob job = allJobs.get(id);
                if ( job == null ) {
                    logger.warn(methodName, id, "Job completion signal: early termination; nothing to complete.");
                    return;  // canceled or terminated very soon.
                }

                logger.info(methodName, id, "Job completion signal.");
                completedJobs.add(job);
            } catch (Throwable t) {
                logger.warn(methodName, id, t);
            }
        }
    }

    /**
     * Callback from job manager when a specific share exits but the job is still alive.
     */
    public void signalCompletion(IRmJob job, Share share)
    {
        String methodName = "signalCompletion";
        synchronized(vacatedShares) {
            logger.info(methodName, job.getId(), "Job vacate signal share: ", share.toString());
            vacatedShares.put(share.getId(), new Pair<IRmJob, Share>(job, share));
        }            
    }

    /**
     * Callback from job manager when a specific share gets a process associated.
     */
//    public void signalGrowth(DuccId jobid, Share share)
//    {
//        String methodName = "signalGrowth";
//        synchronized(vacatedShares) {
//            logger.info(methodName, jobid, "Job growth signal share: ", share.toString());
//            growthOccurred = true;
//        }            
//    }

    /**
     * Called in the scheduling cycle to actually complete the job; doing it there avoids deadlock.
     */
    private synchronized void processCompletion(IRmJob job)
    {
        String methodName = "processCompletion";
        logger.info(methodName, job.getId(), "Job completes.");

        try {
            persistence.deleteJob(job);     // UIMA-4577
        } catch (Exception e) {
            logger.warn(methodName, job.getId(), "Cannot delete job from database:", e);
        }

        // -- clean up the running jobs list
        IRmJob j = allJobs.remove(job.getId());
        if ( j == null ) {
            logger.info(methodName, job.getId(), "Job is not in run list!");  // can happen if job is refused very early
            return;
        }

        j.markComplete();

        // -- clean up user list
        User user = users.get(j.getUserName());
        user.remove(job);         // UIMA4275 don't clean up users list because it may have registry things in it

        ResourceClass rc = job.getResourceClass();
        if ( rc != null ) {
            rc.removeJob(j);            // also clears it if it's a reservation
        } else if ( !j.isRefused() ) {
            throw new SchedInternalError(j.getId(), "Job exits from class " + job.getClassName() + " but we cannot find the priority class definition.");
        }


        // -- clean up machine lists
        HashMap<Share, Share> shares = job.getAssignedShares();
        for (Share s: shares.values()) {
            purgeShare(s, job);
        }
        job.removeAllShares();
    }

    /**
     * Called from the scheduling cycle - a specific share has run out of work for the given job (but the
     * job is not done yet).
     */
    private synchronized void processCompletion(IRmJob job, Share share)
    {
        String methodName = "processCompletion";
        
        logger.debug(methodName, job.getId(), "Job vacates share ", share.toString());
        //share.removeJob();
        job.removeShare(share);
        purgeShare(share, job);
    }

    /**
     * Log following / reconstruction, needed to init before recovery.
     */
    public void resetNodepools()
    {
        for ( NodePool np : nodepools ) {
            np.reset(np.getMaxOrder());
        }
    }

    /**
     * Determine if the given share is in a nodepool that this job is allowed to be scheduled over.
     * You can get a mismatch if the classes or nodepools are reconfigured and RM is restarted
     * with jobs still in the system.
     *
     * UIMA-4142
     *
     * @param     s  The share to validate.
     * @param     j  The job to validate against.
     * @return true  if s and j are compatible, false otherwise.
     */
    boolean compatibleNodepool(Share s, IRmJob j)
    {
        // cut to the chase and ask the NP directly if this dude is allowed

        NodePool np = s.getNodepool();
        ResourceClass rc = j.getResourceClass();
        Policy p = rc.getPolicy();

        return np.compatibleNodepool(p, rc);
    }

    /**
     * Make this public for log following.
     */
    public synchronized void processRecovery(IRmJob j)
    {
        String methodName = "processRecovery";

        ResourceClass rc = resourceClassesByName.get(j.getClassName());
        j.setResourceClass(rc);
        
        int share_order = calcShareOrder(j);
        j.setShareOrder(share_order);

        HashMap<Share, Share> shares = j.getRecoveredShares();
        List<Share> sharesToShrink = new ArrayList<Share>();       // UIMA-4142
        StringBuffer sharenames = new StringBuffer();
        for ( Share s : shares.values() ) {
            sharenames.append(s.toString());
            sharenames.append(" ");

            switch ( rc.getPolicy() ) {
                case FAIR_SHARE:
                    s.setShareOrder(share_order);
                    if ( !compatibleNodepool(s, j) ) {            // UIMA-4142
                        sharesToShrink.add(s);
                        break;
                    } 
                    break;
                case FIXED_SHARE:
                    logger.info(methodName, j.getId(), "Set fixed bit for FIXED job");
                    s.setShareOrder(share_order);
                    s.setFixed();
                    if ( !compatibleNodepool(s, j) ) {            // UIMA-4142
                        if ( j.isService() ) {
                            sharesToShrink.add(s);   // nodepool reconfig snafu, SM will reallocate the process
                        } else {
                            logger.warn(methodName, j.getId(), "Share is in incompatible nodepool but cannot be evicted:", s);
                        }
                    }
                    break;
                case RESERVE:
                    logger.info(methodName, j.getId(), "Set fixed bit for RESERVE job");
                    s.setFixed();
                    if ( j.isService() && !compatibleNodepool(s, j) ) {       // UIMA-4142
                        sharesToShrink.add(s);   // nodepool reconfig snafu, SM will reallocate the process
                    }
                    break;
            }

            // if ( rc.getPolicy() != Policy.RESERVE ) {          // if it's RESERVE, the share order is already set from
            //                                                    // the machine when the job arrives. 
            //     s.setShareOrder(share_order);
            // }

            Machine m = s.getMachine();
            NodePool np = m.getNodepool();
            np.connectShare(s, m, j, s.getShareOrder());

            busyShares.put(s.getId(), s);
        }
        String username = j.getUserName();
        User user = users.get(username);
        if ( user == null ) {
            user = new User(username);
            users.put(username, user);
            logger.info(methodName, j.getId(), "&&&&&&&&&&&&&&&& new user", user.toString(), "-------------------");
        }
        j.setUser(user);
        user.addJob(j);

        j.promoteShares();                   // NOT expanded, just recovered, promote them right away
        j.clearRecoveredShares();

        String clid = j.getClassName();
        ResourceClass prclass = resourceClassesByName.get(clid);
        
        allJobs.put(j.getId(), j);
        prclass.addJob(j);
        j.setResourceClass(prclass);
        logger.info(methodName, j.getId(), "Recovered job:", j.toString());
        logger.info(methodName, j.getId(), "Recovered shares:", sharenames.toString());

        try {
            persistence.addJob(j);
        } catch (Exception e) {
            logger.warn(methodName, j.getId(), "Cannot persist recovered job in database:", e);
        }
        // After a reconfig/restart the share may be in the wrong place, in which case it
        // needs to be removed.  We have to wait until it is fully hooked into the structures
        // before scheduling for removal because it could take a while to go away and
        // we have to be careful not to overcommit.
        // UIMA-4142
        for ( Share s : sharesToShrink ) {
            logger.info(methodName, j.getId(), "Recovery - Removing share from wrong nodepool after reconfiguration:", s);
            j.shrinkByOne(s);
        }

    }

    /**
     * The share is gone, purge from our structures.
     */
    private void purgeShare(Share s, IRmJob j)
    {
        busyShares.remove(s.getId());         // so long, and thanks for all the fish
        Machine m = s.getMachine();
        m.removeShare(s);
    }

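    /**
     * Hand out unique DuccIds from the shared id factory; synchronized because ids
     * may be requested from multiple threads.
     */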
    public synchronized static DuccId newId()
    {
        return idFactory.next();
    }

    public synchronized static DuccId newId(long id)
    {
        return idFactory.next(id);
    }

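    /**
     * Delegate a machine query to each top-level nodepool.
     */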
    public void queryMachines()
    {
        for ( NodePool np : nodepools ) {
            np.queryMachines();
        }
    }

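    /**
     * Orders machines by share order, smallest first, breaking ties on machine id
     * so the ordering is total.  Usage sketch, for any List&lt;Machine&gt; named
     * machineList:
     * <pre>
     *   Collections.sort(machineList, new MachineByOrderSorter());
     * </pre>
     */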
    class MachineByOrderSorter
        implements Comparator<Machine>
    {
    	public int compare(Machine m1, Machine m2)
        {
            if ( m1.equals(m2) ) return 0;

            if (m1.getShareOrder() == m2.getShareOrder()) {
                return (m1.getId().compareTo(m2.getId()));
            }
            return Integer.compare(m1.getShareOrder(), m2.getShareOrder());
        }
    }


}



