com.bigdata.service.LoadBalancerService Maven / Gradle / Ivy
package com.bigdata.service;
import java.io.BufferedOutputStream;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileFilter;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.text.NumberFormat;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Properties;
import java.util.UUID;
import java.util.Vector;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import org.apache.log4j.Logger;
import com.bigdata.counters.AbstractStatisticsCollector;
import com.bigdata.counters.CounterSet;
import com.bigdata.counters.DefaultInstrumentFactory;
import com.bigdata.counters.History;
import com.bigdata.counters.HistoryInstrument;
import com.bigdata.counters.ICounter;
import com.bigdata.counters.ICounterSet;
import com.bigdata.counters.IHostCounters;
import com.bigdata.counters.IRequiredHostCounters;
import com.bigdata.counters.PeriodEnum;
import com.bigdata.counters.ICounterSet.IInstrumentFactory;
import com.bigdata.counters.query.QueryUtil;
import com.bigdata.journal.BufferMode;
import com.bigdata.journal.Journal;
import com.bigdata.journal.ConcurrencyManager.IConcurrencyManagerCounters;
import com.bigdata.resources.ResourceManager.IResourceManagerCounters;
import com.bigdata.resources.StoreManager.IStoreManagerCounters;
import com.bigdata.service.DataService.IDataServiceCounters;
import com.bigdata.service.EventReceiver.EventBTree;
import com.bigdata.util.Bytes;
import com.bigdata.util.DaemonThreadFactory;
import com.bigdata.util.concurrent.ThreadPoolExecutorStatisticsTask;
import com.bigdata.util.concurrent.IQueueCounters.IThreadPoolExecutorTaskCounters;
/**
* The {@link LoadBalancerService} collects a variety of performance counters
* from hosts and services, identifies over- and under- utilized hosts and
* services based on the collected data and reports those to {@link DataService}
* s so that they can auto-balance, and acts as a clearing house for WARN and
* URGENT alerts for hosts and services.
*
* While the {@link LoadBalancerService} MAY observe service start/stop events,
* it does NOT get directly informed of actions that change the load
* distribution, such as index partition moves or reading from a failover
* service. Instead, {@link DataService}s determine whether or not they are
* overloaded and, if so, query the {@link LoadBalancerService} for the identity
* of under-utilized services. If under-utilized {@link DataService}s are
* reported by the {@link LoadBalancerService} then the {@link DataService} will
* self-identify index partitions to be shed and move them onto the identified
* under-utilized {@link DataService}s. The {@link LoadBalancerService} learns
* of these actions solely through their effect on host and service load as
* self- reported by various services.
*
* Note: utilization should be defined in terms of transient system resources :
* CPU, IO (DISK and NET), RAM. DISK exhaustion on the other hand is the basis
* for WARN or URGENT alerts since it can lead to immediate failure of all
* services on the same host.
*
* Note: When new services are made available, either on new hosts or on the
* existing hardware, and service utilization discrepancies should become
* rapidly apparent (within a few minutes). Once we have collected performance
* counters for the new hosts / services, a subsequent overflow event(s) on
* existing {@link DataService}(s) will cause index partition moves to be
* nominated targeting the new hosts and services. The amount of time that it
* takes to re-balance the load on the services will depend in part on the write
* rate, since writes drive overflow events and index partition splits, both of
* which lead to pre-conditions for index partition moves.
*
* Note: If a host is suffering high IOWAIT then it is probably "hot for read"
* (writes are heavily buffered and purely sequential and therefore unlikely to
* cause high IOWAIT where as reads are typically random on journals even
* through a key range scan is sequential on index segments). Therefore a "hot
* for read" condition should be addressed by increasing the replication count
* for those service(s) which are being swamped by read requests on a host
* suffering from high IOWAIT.
*
* @todo we could significantly accelerate overflow events when new hardware is
* made available by setting the forceOverflow flag on the highly utilized
* data services. we probably don't want to do this for all highly
* utilized services at once since that could cause a lot of perturbation.
*
* @todo Work out high-level alerting for resource exhaustion and failure to
* maintain QOS on individual machines, indices, and across the
* federation.
*
* @todo All clients ({@link IBigdataClient}, {@link DataService}, etc) should
* issue WARN and URGENT notices. The client-side rules for those alerts
* should be configurable / pluggable / declarative. It would be great if
* the WARN and URGENT notices were able to carry some information about
* the nature of the emergency.
*
* @todo logging on this {@link LoadBalancerService#log} can provide a single
* point for administrators to configure email or other alerts.
*
* @todo refactor so that we can use the same basic infrastructure for load
* balancing of other service classes as well, e.g., map/reduce services.
*
* @see IRequiredHostCounters, A core set of variables to support
* decision-making.
*
* @todo ping hosts?
*
* @see http://www.google.com/search?hl=en&q=load+balancing+jini
*
* @todo SNMP
*
* http://en.wikipedia.org/wiki/Simple_Network_Management_Protocol
*
* http://www.snmp4j.org/
*
* http://sourceforge.net/projects/joesnmp/ (used by jboss)
*
* http://net-snmp.sourceforge.net/ (Un*x and Windows SNMP support).
*
* @todo Consider an SNMP adaptor for the clients so that they can report to
* SNMP aware applications. In this context we could report both the
* original counters (as averaged) and the massaged metrics on which we
* plan to make decisions.
*
* @todo Would it make sense to replace the counters XML mechanism with MXBeans
* specific to bigdata and additional MXBeans for performance counters for
* the operating system?
*/
abstract public class LoadBalancerService extends AbstractService
implements ILoadBalancerService, IServiceShutdown, IEventReportingService {
final static protected Logger log = Logger.getLogger(LoadBalancerService.class);
final protected String ps = ICounterSet.pathSeparator;
/**
* Used to read {@link CounterSet} XML. This must support overwrite since
* new data arrives every minute and eventually the ring buffer must
* overwrite old values. In addition, this may support multiple levels of
* aggregation so that minutes may be rolled into hours and hours into days.
*/
final private IInstrumentFactory instrumentFactory = DefaultInstrumentFactory.OVERWRITE_60M;
/**
* Service join timeout in milliseconds - used when we need to wait for a
* service to join before we can recommend an under-utilized service.
*
* @see Options#SERVICE_JOIN_TIMEOUT
*/
final protected long serviceJoinTimeout;
/**
* Lock is used to control access to data structures that are not
* thread-safe.
*/
final protected ReentrantLock lock = new ReentrantLock();
/**
* Used to await a service join when there are no services.
*/
final protected Condition joined = lock.newCondition();
/**
* The active hosts (one or more services).
*
* @todo get rid of hosts that are no longer active. e.g., we no longer
* receive {@link #notify(String, byte[])} events from the host and
* the host can not be pinged. this will require tracking the #of
* services on the host which we do not do directly right now.
*/
protected ConcurrentHashMap activeHosts = new ConcurrentHashMap();
/**
* The set of active services.
*/
protected ConcurrentHashMap activeDataServices = new ConcurrentHashMap();
/**
* Scores for the hosts in ascending order (least utilized to most
* utilized).
*
* This array is initially null
and gets updated periodically
* by the {@link UpdateTask}. The main consumer of this information is the
* logic in {@link UpdateTask} that computes the service utilization.
*/
protected AtomicReference hostScores = new AtomicReference(null);
/**
* Scores for the services in ascending order (least utilized to most
* utilized).
*
* This array is initially null
and gets updated periodically
* by the {@link UpdateTask}. The methods that report service utilization
* and under-utilized services are all based on the data in this array.
* Since services can leave at any time, that logic MUST also test for
* existence of the service in {@link #activeDataServices} before assuming that the
* service is still live.
*/
protected AtomicReference serviceScores = new AtomicReference(null);
/**
* The #of {@link UpdateTask}s which have run so far.
*
* @see Options#INITIAL_ROUND_ROBIN_UPDATE_COUNT
*
* @see #getUnderUtilizedDataServices(int, int, UUID)
*/
protected long nupdates = 0;
/**
* The #of updates during which
* {@link #getUnderUtilizedDataServices(int, int, UUID)} will apply a round
* robin policy.
*
* @see Options#INITIAL_ROUND_ROBIN_UPDATE_COUNT
*/
protected final long initialRoundRobinUpdateCount;
/**
* Used to make round-robin assignments.
*/
private final RoundRobinServiceLoadHelper roundRobinServiceLoadHelper;
/**
* The directory in which the service will log the {@link CounterSet}s
* and {@link Event}s.
*
* @see Options#LOG_DIR
*/
protected final File logDir;
/**
* true
iff the LBS will refrain from writing state on the
* disk. This option causes the LBS to use an in memory {@link #eventStore}.
* In addition, it will refuse to write counter snapshots when this option
* is specified.
*
* @see Options#TRANSIENT
*/
protected final boolean isTransient;
/**
* A copy of the properties used to start the service.
*/
private final Properties properties;
/**
* An object wrapping the properties provided to the constructor.
*/
public Properties getProperties() {
return new Properties(properties);
}
/**
* Return the canonical hostname of the client in the context of a RMI
* request. If the request is not remote then return the canonical hostname
* for this host.
*/
abstract protected String getClientHostname();
/**
* Runs a periodic {@link UpdateTask}.
*/
final protected ScheduledExecutorService updateService;
/**
* The delay between writes of the {@link CounterSet} on a log file.
*/
private final long logDelayMillis;
/**
* The #of distinct log files to retain.
*/
private final long logMaxFiles;
/**
* Time that the {@link CounterSet} was last written onto a log file.
*/
private long logLastMillis = System.currentTimeMillis();
/**
* A one-up counter of the #of times the {@link CounterSet} was written onto
* a log file.
*/
private int logFileCount = 0;
/**
* The #of minutes of history that will be smoothed into an average when
* {@link UpdateTask} updates the {@link HostScore}s and the
* {@link ServiceScore}s.
*
* @see Options#HISTORY_MINUTES
*/
protected final int historyMinutes;
/**
* Used to persist the logged events.
*/
final protected Journal eventStore;
protected final EventReceiver eventReceiver;
/**
* Options understood by the {@link LoadBalancerService}.
*
* @author Bryan
* Thompson
* @version $Id$
*
* @todo The LBS needs to support a 'transient' option in which it (a) does
* not log counters; and (b) keeps the events in a transient B+Tree
* (not backed by a file on the disk). Without this we can not have a
* transient {@link EmbeddedFederation} or
* {@link LocalDataServiceFederation} instances.
*/
public interface Options {
/**
* The load balancer service will use a round robin approach to
* recommending under-utilized services until this the load balancer has
* re-computed the service scores N times (default
* {@value #DEFAULT_INITIAL_ROUND_ROBIN_UPDATE_COUNT}). This makes it
* more likely that the initial index partitions will be allocated on
* services on different hosts for a new federation, but it is really a
* hack since it depends entirely on the time elapsed since the load
* balancer service (re-)started. This "feature" may be disabled by
* setting this property to ZERO (0).
*/
String INITIAL_ROUND_ROBIN_UPDATE_COUNT = LoadBalancerService.class
.getName()
+ ".initialRoundRobinUpdateCount";
/**
* The default gives you a few minutes after you setup the federation in
* which newly registered indices will be allocated based on a
* round-robin.
*/
String DEFAULT_INITIAL_ROUND_ROBIN_UPDATE_COUNT = "5";
/**
* The delay between scheduled invocations of the {@link UpdateTask}.
*
* Note: the {@link AbstractStatisticsCollector} implementations SHOULD
* sample at one minute intervals by default and clients SHOULD report
* the collected performance counters at approximately one minute
* intervals. The update rate can be no more frequent than the reporting
* rate, but could be 2-5x slower, especially if we use WARN and URGENT
* events to immediately re-score services.
*
* @see #DEFAULT_UPDATE_DELAY
*
* @see AbstractStatisticsCollector.Options#PERFORMANCE_COUNTERS_SAMPLE_INTERVAL
*/
String UPDATE_DELAY = LoadBalancerService.class.getName()+".updateDelay";
/**
* The default {@link #UPDATE_DELAY}.
*/
String DEFAULT_UPDATE_DELAY = ""+(60*1000);
/**
* The #of minutes of history that will be smoothed into an average when
* {@link UpdateTask} updates the {@link HostScore}s and the
* {@link ServiceScore}s (default {@value #DEFAULT_HISTORY_MINUTES}).
*
* @see ThreadPoolExecutorStatisticsTask
*/
String HISTORY_MINUTES = LoadBalancerService.class.getName()
+ ".historyMinutes";
String DEFAULT_HISTORY_MINUTES = "5";
/**
* When true
the load balancer will not record any state on
* the disk (neither events nor counters). The default is
* false
. This option is used by some unit tests to
* simplify cleanup.
*/
String TRANSIENT = LoadBalancerService.class.getName() + ".transient";
String DEFAULT_TRANSIENT = "false";
/**
* The path of the data directory for the load balancer. The load
* balancer will log a copy of the counters every time it runs its
* {@link UpdateTask}. It will also log {@link Event}s received from
* other services here. By default, the load balancer will use the
* directory in which it was started. You may specify an alternative
* directory using this property.
*/
String LOG_DIR = LoadBalancerService.class.getName()+".log.dir";
String DEFAULT_LOG_DIR = ".";
/**
* The delay in milliseconds between writes of the {@link CounterSet} on
* a log file (default is {@value #DEFAULT_LOG_DELAY}, which is
* equivalent to one hour).
*/
String LOG_DELAY = LoadBalancerService.class.getName()+".log.delay";
String DEFAULT_LOG_DELAY = "" + 1000 * 60 * 60;
/**
* The maximum #of distinct log files to retain (default is one week
* based on a {@link #LOG_DELAY} equivalent to one hour).
*/
String LOG_MAX_FILES = LoadBalancerService.class.getName()+".log.maxFiles";
String DEFAULT_LOG_MAX_FILES = "" + 24 * 7;
/**
* Service join timeout in milliseconds - used when we need to wait for
* a service to join before we can recommend an under-utilized service.
*/
String SERVICE_JOIN_TIMEOUT = LoadBalancerService.class.getName()
+ ".serviceJoinTimeout";
String DEFAULT_SERVICE_JOIN_TIMEOUT = "" + (3 * 1000);
/**
* The maximum age of an {@link Event} that will be keep "on the books".
* Events older than this are purged. An error is logged if an event is
* purged before its end() event arrives. This generally indicates a
* code path where {@link Event#end()} is not getting called but could
* also indicate a disconnected client or service.
*
* @see EventReceiver
*/
String EVENT_HISTORY_MILLIS = LoadBalancerService.class.getName()
+ ".eventHistoryMillis";
/**
* Default is one hour of completed events.
*/
String DEFAULT_EVENT_HISTORY_MILLIS = "" + (60 * 60 * 1000);
}
/**
*
* Note: The load balancer MUST NOT collect host statistics unless it is the
* only service running on that host. Normally it relies on another service
* running on the same host to collect statistics for that host and those
* statistics are then reported to the load balancer and aggregated along
* with the rest of the performance counters reported by the other services
* in the federation. However, if the load balancer itself collects host
* statistics then it will only know about and report the current (last 60
* seconds) statistics for the host rather than having the historical data
* for the host.
*
* @param properties
* See {@link Options}
*/
public LoadBalancerService(final Properties properties) {
if (properties == null)
throw new IllegalArgumentException();
this.properties = (Properties) properties.clone();
this.isTransient = Boolean.valueOf(properties.getProperty(
Options.TRANSIENT, Options.DEFAULT_TRANSIENT));
if (log.isInfoEnabled())
log.info(Options.TRANSIENT + "=" + isTransient);
if(isTransient) {
logDir = null;
} else {
// setup the log directory.
final String val = properties.getProperty(
Options.LOG_DIR,
Options.DEFAULT_LOG_DIR);
logDir = new File(val);
if (log.isInfoEnabled())
log.info(Options.LOG_DIR + "=" + logDir);
// ensure exists.
logDir.mkdirs();
}
// logDelayMillis
{
logDelayMillis = Long.parseLong(properties.getProperty(
Options.LOG_DELAY, Options.DEFAULT_LOG_DELAY));
if (log.isInfoEnabled())
log.info(Options.LOG_DELAY + "=" + logDelayMillis);
}
// logMaxFiles
{
logMaxFiles = Integer.parseInt(properties.getProperty(
Options.LOG_MAX_FILES, Options.DEFAULT_LOG_MAX_FILES));
if (log.isInfoEnabled())
log.info(Options.LOG_MAX_FILES + "=" + logMaxFiles);
}
{
historyMinutes = Integer.parseInt(properties.getProperty(
Options.HISTORY_MINUTES,
Options.DEFAULT_HISTORY_MINUTES));
if (log.isInfoEnabled())
log.info(Options.HISTORY_MINUTES+ "="
+ historyMinutes);
// a reasonable range check.
if (historyMinutes <= 0 || historyMinutes > 60)
throw new RuntimeException(Options.HISTORY_MINUTES
+ " must be in [1:60].");
}
{
serviceJoinTimeout = Long.parseLong(properties.getProperty(
Options.SERVICE_JOIN_TIMEOUT,
Options.DEFAULT_SERVICE_JOIN_TIMEOUT));
if (log.isInfoEnabled())
log.info(Options.SERVICE_JOIN_TIMEOUT + "="
+ serviceJoinTimeout);
if (serviceJoinTimeout <= 0L)
throw new RuntimeException(Options.SERVICE_JOIN_TIMEOUT
+ " must be positive.");
}
// setup scheduled runnable for periodic updates of the service scores.
{
initialRoundRobinUpdateCount = Long.parseLong(properties
.getProperty(Options.INITIAL_ROUND_ROBIN_UPDATE_COUNT,
Options.DEFAULT_INITIAL_ROUND_ROBIN_UPDATE_COUNT));
if (log.isInfoEnabled())
log.info(Options.INITIAL_ROUND_ROBIN_UPDATE_COUNT + "="
+ initialRoundRobinUpdateCount);
this.roundRobinServiceLoadHelper = new RoundRobinServiceLoadHelper();
final long delay = Long.parseLong(properties.getProperty(
Options.UPDATE_DELAY,
Options.DEFAULT_UPDATE_DELAY));
if (log.isInfoEnabled())
log.info(Options.UPDATE_DELAY + "=" + delay);
/*
* Wait a bit longer for the first update task since service may be
* starting up as well and we need to have the performance counter
* data on hand before we can do anything.
*/
final long initialDelay = delay * 2;
final TimeUnit unit = TimeUnit.MILLISECONDS;
updateService = Executors
.newSingleThreadScheduledExecutor(new DaemonThreadFactory
(getClass().getName()+".updateService"));
updateService.scheduleWithFixedDelay(new UpdateTask(), initialDelay,
delay, unit);
}
// eventHistoryMillis
{
final long eventHistoryMillis = Long.parseLong(properties
.getProperty(Options.EVENT_HISTORY_MILLIS,
Options.DEFAULT_EVENT_HISTORY_MILLIS));
if (log.isInfoEnabled())
log.info(Options.EVENT_HISTORY_MILLIS + "="
+ eventHistoryMillis);
/*
* Setup a BTree backed that will be used to persist the completed
* events. This is passed to the EventReceiver. The BTree is used to
* get the events out of RAM and to decouple the reporting from the
* receiving. We delegate everything dealing with the events to that
* class.
*/
if(isTransient) {
/*
* Use an in-memory store.
*/
final Properties p = new Properties();
p.setProperty(com.bigdata.journal.Options.BUFFER_MODE,
BufferMode.Transient.toString());
eventStore = new Journal(p);
} else {
/*
* Use a restart-safe store.
*/
final Properties p = new Properties();
p.setProperty(com.bigdata.journal.Options.FILE, new File(
logDir, "events" + com.bigdata.journal.Options.JNL)
.toString());
eventStore = new Journal(p);
}
EventBTree eventBTree = (EventBTree) eventStore.getIndex("events");
if (eventBTree == null) {
eventStore.registerIndex("events", eventBTree = EventBTree
.create(eventStore));
}
eventReceiver = new EventReceiver(eventHistoryMillis, eventBTree);
}
}
@Override
synchronized public LoadBalancerService start() {
return this;
}
public boolean isOpen() {
return ! updateService.isShutdown();
}
protected void finalized() throws Throwable {
super.finalize();
shutdownNow();
}
@Override
synchronized public void shutdown() {
if(!isOpen()) return;
if (log.isInfoEnabled())
log.info("begin");
updateService.shutdown();
// log the final state of the counters.
logCounters("final");
/*
* Obtain the exclusive write lock for the event BTree before flushing
* writes.
*/
final Lock tmpLock = eventReceiver.getWriteLock();
tmpLock.lock();
try {
// Flush any buffered writes to the event store.
eventStore.getIndex("events").writeCheckpoint();
// Normal shutdown of the event store.
eventStore.shutdown();
} catch (Throwable t) {
log.error(t, t);
} finally {
tmpLock.unlock();
}
super.shutdown();
if (log.isInfoEnabled())
log.info("done");
}
@Override
synchronized public void shutdownNow() {
if(!isOpen()) return;
if (log.isInfoEnabled())
log.info("begin");
updateService.shutdownNow();
// log the final state of the counters.
logCounters("final");
// immediate shutdown.
eventStore.shutdownNow();
super.shutdownNow();
if (log.isInfoEnabled())
log.info("done");
}
@Override
synchronized public void destroy() {
super.destroy();
if (!isTransient) {
eventStore.destroy();
final File[] logFiles = logDir.listFiles(new FileFilter() {
public boolean accept(File pathname) {
return pathname.getName().startsWith("counters")
&& pathname.getName().endsWith(".xml");
}
});
if (logFiles != null) {
for (File file : logFiles) {
if (!file.delete())
log.warn("Could not delete: " + file);
}
}
// delete the log directory (works iff it is empty).
logDir.delete();
}
}
/**
* Returns {@link ILoadBalancerService}.
*/
@Override
final public Class getServiceIface() {
return ILoadBalancerService.class;
}
/**
* Normalizes the {@link ServiceScore}s and set them in place.
*
* @param a
* The new service scores.
*/
protected void setHostScores(final HostScore[] a) {
/*
* sort scores into ascending order (least utilized to most utilized).
*/
Arrays.sort(a);
/*
* Compute the totalRawScore.
*/
double totalRawScore = 0d;
for (HostScore s : a) {
totalRawScore += s.rawScore;
}
/*
* Compute normalized score, rank, and drank.
*/
for (int i = 0; i < a.length; i++) {
final HostScore score = a[i];
score.rank = i;
score.drank = ((double)i)/a.length;
score.score = HostScore.normalize(score.rawScore, totalRawScore);
// update score in global map.
activeHosts.put(score.hostname, score);
if (log.isInfoEnabled())
log.info(score.toString());
}
if(log.isInfoEnabled()) {
log.info("The most active host was: " + a[a.length - 1]);
log.info("The least active host was: " + a[0]);
}
// Atomic replace of the old scores.
LoadBalancerService.this.hostScores.set( a );
}
/**
* Normalizes the {@link ServiceScore}s and set them in place.
*
* @param a
* The new service scores.
*/
protected void setServiceScores(final ServiceScore[] a) {
/*
* Sort scores into ascending order (least utilized to most utilized).
*/
Arrays.sort(a);
/*
* Compute the totalRawScore.
*/
double totalRawScore = 0d;
for (ServiceScore s : a) {
totalRawScore += s.rawScore;
}
/*
* compute normalized score, rank, and drank.
*/
for (int i = 0; i < a.length; i++) {
final ServiceScore score = a[i];
score.rank = i;
score.drank = ((double) i) / a.length;
score.score = HostScore.normalize(score.rawScore, totalRawScore);
// update score in global map.
activeDataServices.put(score.serviceUUID, score);
if (log.isInfoEnabled())
log.info(score.toString());
}
if (log.isInfoEnabled()) {
log.info("The most active service was: " + a[a.length - 1]);
log.info("The least active service was: " + a[0]);
}
// Atomic replace of the old scores.
LoadBalancerService.this.serviceScores.set(a);
}
/**
* Computes and updates the {@link ServiceScore}s based on an examination
* of aggregated performance counters.
*
* @todo There could be a score for the last minute, hour, and day or the
* last minute, five minutes, and ten minutes.
*
* @todo For starters, we can just run some hand-coded rules. Consider
* special transition states for new hosts and services.
*
* @todo The scoring logic should be pluggable so that people can reply on
* the data that they have for their platform(s) that seems to best
* support decision-making and can apply rules for their platforms,
* environment, and applications which provide the best overall QOS.
*
* @todo The logic to choose the under- and over-utilized services based on
* the services scores should be configurable (this is different from
* the logic to compute those scores).
*
* @todo if a client does not
* {@link ILoadBalancerService#notify(String, byte[])} for 120 seconds
* then presume dead? this requires that we compute the age of the
* last reported counter value. e.g., do a counter scan for the
* service and report the largest value for lastModified() on any
* counter for that service.
*
* @see QueryUtil#getRequiredPerformanceCountersFilter()
*
* @author Bryan Thompson
* @version $Id$
*/
protected class UpdateTask implements Runnable {
/**
* Note: The logger is named for this class, but since it is an inner
* class the name uses a "$" delimiter (vs a ".") between the outer and
* the inner class names.
*/
final protected transient Logger log = Logger.getLogger(UpdateTask.class);
public UpdateTask() {
}
/**
* Note: Don't throw anything here since we don't want to have the task
* suppressed!
*/
public void run() {
try {
updateHostScores();
updateServiceScores();
setupCounters();
logCounters();
} catch (Throwable t) {
log.error("Problem in update task?", t);
} finally {
nupdates++;
}
}
/**
* (Re-)compute the utilization score for each active host.
*/
protected void updateHostScores() {
if(activeHosts.isEmpty()) {
if (log.isInfoEnabled())
log.info("No active hosts");
return;
}
/*
* Update scores for the active hosts.
*/
final Vector scores = new Vector();
// For each host
final Iterator itrh = getFederation().getCounters()
.counterSetIterator();
while(itrh.hasNext()) {
final CounterSet hostCounterSet = (CounterSet) itrh.next();
// Note: name on hostCounterSet is the fully qualified hostname.
final String hostname = hostCounterSet.getName();
if(!activeHosts.containsKey(hostname)) {
// Host is not active.
if (log.isDebugEnabled())
log.debug("Host is not active: " + hostname);
continue;
}
/*
* Compute the score for that host.
*/
HostScore score;
try {
score = computeScore(hostname, hostCounterSet);
} catch (Exception ex) {
log.error("Problem computing host score: " + hostname, ex);
/*
* Keep the old score if we were not able to compute a new
* score.
*
* Note: if the returned value is null then the host was
* asynchronously removed from the set of active hosts.
*/
score = activeHosts.get(hostname);
if (score == null) {
log.warn("Host gone during update task: " + hostname);
continue;
}
}
/*
* Add to collection of scores.
*/
scores.add(score);
}
if (scores.isEmpty()) {
log.warn("No performance counters for hosts, but "
+ activeHosts.size() + " active hosts");
LoadBalancerService.this.hostScores.set( null );
return;
}
// scores as an array.
final HostScore[] a = scores.toArray(new HostScore[] {});
setHostScores(a);
}
/**
* (Re-)compute the utilization score for each active service.
*
* Note: There is a dependency on
* {@link AbstractFederation#getServiceCounterPathPrefix(UUID, Class, String)}.
* This method assumes that the service {@link UUID} is found in a
* specific place in the constructed path.
*/
protected void updateServiceScores() {
if(activeDataServices.isEmpty()) {
if (log.isInfoEnabled())
log.info("No active services");
LoadBalancerService.this.serviceScores.set( null );
return;
}
/*
* Update scores for the active services.
*/
final Vector scores = new Vector();
// For each host
final Iterator itrh = getFederation().getCounters()
.counterSetIterator();
while(itrh.hasNext()) {
final CounterSet hostCounterSet = (CounterSet) itrh.next();
// Note: name on hostCounterSet is the fully qualified hostname.
final String hostname = hostCounterSet.getName();
// Pre-computed score for the host on which the service is running.
final HostScore hostScore = activeHosts.get(hostname);
if (hostScore == null) {
// Host is not active.
if (log.isInfoEnabled())
log.info("Host is not active: " + hostname);
continue;
}
// lookup path: /hostname/service
final CounterSet serviceIfacesCounterSet = (CounterSet) hostCounterSet
.getPath("service");
if (serviceIfacesCounterSet == null) {
log.warn("No services interfaces? hostname=" + hostname);
continue;
}
// for each service interface type: /hostname/service/iface
final Iterator itrx = serviceIfacesCounterSet
.counterSetIterator();
// for each service under that interface type
while (itrx.hasNext()) {
// path: /hostname/service/iface/UUID
final CounterSet servicesCounterSet = (CounterSet) itrx
.next();
// For each service.
final Iterator itrs = servicesCounterSet
.counterSetIterator();
while (itrs.hasNext()) {
final CounterSet serviceCounterSet = (CounterSet) itrs
.next();
/*
* Note: [name] on serviceCounterSet is the serviceUUID.
*
* Note: This creates a dependency on
* AbstractFederation#getServiceCounterPathPrefix(...)
*/
final String serviceName = serviceCounterSet.getName();
final UUID serviceUUID;
try {
serviceUUID = UUID.fromString(serviceName);
} catch (Exception ex) {
log.error("Could not parse service name as UUID?\n"
+ "hostname=" + hostname
+ ", serviceCounterSet.path="
+ serviceCounterSet.getPath()
+ ", serviceCounterSet.name="
+ serviceCounterSet.getName(), ex);
continue;
}
if (!activeDataServices.containsKey(serviceUUID)) {
/*
* Note: Only data services are entered in this map,
* so this filters out the non-dataServices from the
* load balancer's computations.
*/
continue;
}
/*
* Compute the score for that service.
*/
ServiceScore score;
try {
score = computeScore(hostScore, serviceUUID,
hostCounterSet, serviceCounterSet);
} catch (Exception ex) {
log.error("Problem computing service score: "
+ serviceCounterSet.getPath(), ex);
/*
* Keep the old score if we were not able to compute
* a new score.
*
* Note: if the returned value is null then the
* service asynchronously was removed from the set
* of active services.
*/
score = activeDataServices.get(serviceUUID);
if (score == null) {
if (log.isInfoEnabled())
log.info("Service leave during update task: "
+ serviceCounterSet.getPath());
continue;
}
}
/*
* Add to collection of scores.
*/
scores.add(score);
}
}
}
if (scores.isEmpty()) {
log.warn("No performance counters for services, but "
+ activeDataServices.size() + " active services");
LoadBalancerService.this.serviceScores.set( null );
return;
}
// scores as an array.
final ServiceScore[] a = scores.toArray(new ServiceScore[] {});
// normalize and set in place.
setServiceScores(a);
}
/**
* Compute the score for a host.
*
* The host scores MUST reflect critical resource exhaustion, especially
* DISK free space, which can take down all services on the host, and
* SWAPPING, which can bring the effective throughput of the host to a
* halt. All other resources fail soft, by causing the response time to
* increase.
*
* Note: DISK exhaustion can lead to immediate failure of all services
* on the same host. A host that is nearing DISK exhaustion SHOULD get
* heavily dinged and an admin SHOULD be alerted.
*
* The correct response for heavy swapping is to alert an admin to
* shutdown one or more processes on that host. If you do not
* have failover provisioned for your data services then DO NOT shutdown
* data services or you WILL loose data!
*
* Note: If we are not getting critical counters for some host then we
* are assuming a reasonable values for the missing data and computing
* the utilization based on those assumptions. Note that a value of zero
* (0) may be interpreted as either critically high utilization or no
* utilization depending on the performance counter involved and that
* the impact of the different counters can vary depending on the
* formula used to compute the utilization score.
*
* @param hostname
* The fully qualified hostname.
* @param hostCounterSet
* The performance counters for that host.
* @param serviceCounterSet
* The performance counters for that service.
*
* @return The computed host score.
*/
protected HostScore computeScore(final String hostname,
final ICounterSet hostCounterSet) {
/*
* Is the host swapping heavily?
*
* @todo if heavy swapping persists then lower the score even
* further.
*
* @todo The % of the physical memory and the % of the swap space
* that have been used are also strong indicators.
*/
final int majorFaultsPerSec = (int) getCurrentValue(hostCounterSet,
IRequiredHostCounters.Memory_majorFaultsPerSecond, 0d/* default */);
/*
* Is the host out of disk?
*
* FIXME Need the swap space remaining. Low swap presages heavy
* swapping.
*
* @todo this will issue a warning for a windows host on which a
* service is just starting up. For some reason, it takes a few
* cycles to begin reporting performance counters on a windows host
* and the initial counters will therefore all be reported as zeros.
* This problem should be fixed, but we also need to discount an
* average whose result is zero if the #of samples is also zero. In
* that case we just don't have any information. Likewise, when the
* #of samples to date (cumulative or just the #of minutes in 0:60
* of data in the minutes history) is less than 5 minutes worth of
* data then we may still need to discount the data. Also consider
* adding a "moving average" computation to the History so that we
* can smooth short term spikes.
*/
final double percentDiskFreeSpace = getCurrentValue(hostCounterSet,
IRequiredHostCounters.LogicalDisk_PercentFreeSpace, .5d/* default */);
/*
* The percent of the time that the CPUs are idle.
*/
final double percentProcessorIdle = 1d - getAverageValueForMinutes(
hostCounterSet, IRequiredHostCounters.CPU_PercentProcessorTime,
.5d, historyMinutes);
/*
* The percent of the time that the CPUs are idle when there is an
* outstanding IO request.
*/
final double percentIOWait = getAverageValueForMinutes(
hostCounterSet, IHostCounters.CPU_PercentIOWait,
.01d/* default */, historyMinutes);
/*
* Note: This reflects the disk IO utilization primarily through
* IOWAIT.
*
* @todo Play around with other formulas too.
*/
double adjustedRawScore;
final double baseRawScore = adjustedRawScore = (1d + percentIOWait * 100d)
/ (1d + percentProcessorIdle);
if (majorFaultsPerSec > 50) {
// much higher utilization if the host is swapping heavily.
adjustedRawScore *= 10;
log.warn("hostname=" + hostname
+ " : swapping heavily: pages/sec="
+ majorFaultsPerSec);
} else if (majorFaultsPerSec > 10) {
// higher utilization if the host is swapping.
adjustedRawScore *= 2d;
log.warn("hostname=" + hostname + " : swapping: pages/sec="
+ majorFaultsPerSec);
}
if (percentDiskFreeSpace < .05) {
// much higher utilization if the host is very short on disk.
adjustedRawScore *= 10d;
log.warn("hostname=" + hostname
+ " : very short on disk: freeSpace="
+ percentDiskFreeSpace * 100 + "%");
} else if (percentDiskFreeSpace < .10) {
// higher utilization if the host is short on disk.
adjustedRawScore *= 2d;
log.warn("hostname=" + hostname
+ " : is short on disk: freeSpace="
+ percentDiskFreeSpace * 100 + "%");
}
if (log.isInfoEnabled()) {
log.info("hostname=" + hostname + " : adjustedRawScore("
+ scoreFormat.format(adjustedRawScore)
+ "), baseRawScore(" + scoreFormat.format(baseRawScore)
+ ") = (1d + percentIOWait("
+ percentFormat.format(percentIOWait)
+ ") * 100d) / (1d + percentProcessorIdle("
+ percentFormat.format(percentProcessorIdle)
+ "), majorFaultsPerSec=" + majorFaultsPerSec
+ ", percentDiskSpaceFree="
+ percentFormat.format(percentDiskFreeSpace));
}
final HostScore hostScore = new HostScore(hostname, adjustedRawScore);
return hostScore;
}
/**
* Format for the computed scores.
*/
final NumberFormat scoreFormat;
{
scoreFormat = NumberFormat.getInstance();
scoreFormat.setMaximumFractionDigits(2);
scoreFormat.setMinimumIntegerDigits(1);
}
/**
* Format for percentages such as IO Wait
where the
* values are in [0.00:1.00].
*/
final NumberFormat percentFormat;
{
percentFormat = NumberFormat.getInstance();
percentFormat.setMaximumFractionDigits(2);
percentFormat.setMinimumIntegerDigits(1);
}
/**
* Format for elapsed times measured in milliseconds.
*/
final NumberFormat millisFormat;
{
millisFormat = NumberFormat.getIntegerInstance();
}
/**
* Format for bytes.
*/
final NumberFormat bytesFormat;
{
bytesFormat = NumberFormat.getIntegerInstance();
bytesFormat.setGroupingUsed(true);
}
/**
* Compute the score for a service.
*
* Note: utilization is defined in terms of transient system resources :
* CPU, IO (DISK and NET), RAM. A host with enough CPU/RAM/IO/DISK can
* support more than one data service. Therefore it is important to look
* at not just host utilization but also at process utilization.
*
* @param hostScore
* The pre-computed score for the host on which the service
* is running.
* @param serviceUUID
* The service {@link UUID}.
* @param hostCounterSet
* The performance counters for that host (in case you need
* anything that is not already in the {@link HostScore}).
* @param serviceCounterSet
* The performance counters for that service.
*
* @return The computed score for that service.
*/
protected ServiceScore computeScore(final HostScore hostScore,
final UUID serviceUUID, final ICounterSet hostCounterSet,
final ICounterSet serviceCounterSet) {
assert hostScore != null;
assert serviceUUID != null;
assert hostCounterSet != null;
assert serviceCounterSet != null;
// verify that the host score has been normalized.
assert hostScore.rank != -1 : hostScore.toString();
// resolve the service name : @todo refactor RMI out of this method.
String serviceName = "N/A";
try {
serviceName = getFederation().getDataService(serviceUUID)
.getServiceName();
} catch (Throwable t) {
log.warn(t.getMessage(), t);
}
/*
* The average queuing time for the unisolated write service is used
* as the primary indicator of the write load of the service. The
* average queueing time is preferred to the average queue length as
* the queueing time is directly correlated to the throughput of the
* service.
*
* Note: We use the measure of write load to drive load balancing
* decisions. This is in contrast to high availability for readers,
* where readers can be directed to failover instances.
*
* @todo verify that the queueing time measurement in millis is
* sufficient rather than nanos as queuing times can become quite
* short.
*
* @todo There is a lot more that can be considered and under linux
* we have access to per-process counters for CPU, DISK, and MEMORY.
*/
// final double averageQueueLength = getAverageValueForMinutes(
// serviceCounterSet, IDataServiceCounters.concurrencyManager
// + ps + IConcurrencyManagerCounters.writeService
// + ps + IThreadPoolExecutorCounters.AverageQueueLength,
// 0d/* default (queueLength) */, historyMinutes);
final double averageQueueingTime = getAverageValueForMinutes(
serviceCounterSet, IDataServiceCounters.concurrencyManager
+ ps + IConcurrencyManagerCounters.writeService
+ ps + IThreadPoolExecutorTaskCounters.AverageQueuingTime,
10d/* default (ms) */, historyMinutes);
final double dataDirBytesAvailable = getAverageValueForMinutes(
serviceCounterSet, IDataServiceCounters.resourceManager
+ ps + IResourceManagerCounters.StoreManager
+ ps + IStoreManagerCounters.DataDirBytesAvailable,
Bytes.gigabyte * 20/* default */, historyMinutes);
final double tmpDirBytesAvailable = getAverageValueForMinutes(
serviceCounterSet, IDataServiceCounters.resourceManager
+ ps + IResourceManagerCounters.StoreManager
+ ps + IStoreManagerCounters.TmpDirBytesAvailable,
Bytes.gigabyte * 10/* default */, historyMinutes);
final double rawScore = (averageQueueingTime + 1) * (hostScore.score + 1);
double adjustedRawScore = rawScore;
/*
* dataDir
*
* Note: If you set these thresholds to GT the default value
* reported when the counters are not yet available then you will
* see false 'short on disk' claims. They will go away once the
* performance counters arrive with real disk space measurements.
*/
if (dataDirBytesAvailable < Bytes.gigabyte * 1) {
// much higher utilization if the host is very short on disk.
adjustedRawScore *= 10d;
log.warn("service=" + serviceName + " : very short on disk: "
+ IStoreManagerCounters.DataDirBytesAvailable + "="
+ bytesFormat.format(dataDirBytesAvailable));
} else if (dataDirBytesAvailable < Bytes.gigabyte * 10) {
// higher utilization if the host is short on disk.
adjustedRawScore *= 2d;
log.warn("service=" + serviceName + " : is short on disk: "
+ IStoreManagerCounters.DataDirBytesAvailable + "="
+ bytesFormat.format(dataDirBytesAvailable));
}
/*
* tmpDir
*
* Note: If you set these thresholds to GT the default value
* reported when the counters are not yet available then you will
* see false 'short on disk' claims. They will go away once the
* performance counters arrive with real disk space measurements.
*
* These thresholds are currently set to trigger at any value LT the
* default, which masks the issue.
*/
if (tmpDirBytesAvailable < Bytes.gigabyte * 1) {
// much higher utilization if the host is very short on disk.
adjustedRawScore *= 10d;
log.warn("service=" + serviceName + " : very short on disk: "
+ IStoreManagerCounters.TmpDirBytesAvailable + "="
+ bytesFormat.format(tmpDirBytesAvailable));
} else if (tmpDirBytesAvailable < Bytes.gigabyte * 10) {
// higher utilization if the host is short on disk.
adjustedRawScore *= 2d;
log.warn("service=" + serviceName + " : is short on disk: "
+ IStoreManagerCounters.TmpDirBytesAvailable + "="
+ bytesFormat.format(tmpDirBytesAvailable));
}
if (log.isInfoEnabled()) {
log.info("serviceName=" + serviceName//
+ ", serviceUUID=" + serviceUUID //
// + ", averageQueueLength=" + averageQueueLength//
+ ", averageQueueingTime=" + millisFormat.format(averageQueueingTime)//
+ ", dataDirBytesAvail="+bytesFormat.format(dataDirBytesAvailable)//
+ ", tmpDirBytesAvail="+bytesFormat.format(tmpDirBytesAvailable)//
+ ", adjustedRawStore="+adjustedRawScore//
+ ", rawScore(" + scoreFormat.format(rawScore) + ") "//
+ "= (averageQueueingTime("+ averageQueueingTime+ ") + 1) "//
+ "* (hostScore("+ scoreFormat.format(hostScore.score) + ") + 1)"//
);
}
final ServiceScore score = new ServiceScore(hostScore.hostname,
serviceUUID, serviceName, adjustedRawScore);
return score;
}
protected double getCurrentValue(ICounterSet counterSet, String path,
double defaultValue) {
assert counterSet != null;
assert path != null;
final ICounter c = (ICounter) counterSet.getPath(path);
if (c == null)
return defaultValue;
try {
double val = (Double) c.getValue();
return val;
} catch (Exception ex) {
log.warn("Could not read double value: counterSet="
+ counterSet.getPath() + ", counter=" + path);
return defaultValue;
}
}
/**
* Return the average of the counter having the given path over the last
* minutes minutes.
*
* @param counterSet
* @param path
* @param defaultValue
* @param minutes
* @return
*
* FIXME should be a weighted average, right?
*/
protected double getAverageValueForMinutes(
final ICounterSet counterSet, final String path,
final double defaultValue, final int minutes) {
assert counterSet != null;
assert path != null;
final ICounter c = (ICounter) counterSet.getPath(path);
if (c == null)
return defaultValue;
try {
if (c.getInstrument() instanceof HistoryInstrument) {
final HistoryInstrument inst = (HistoryInstrument) c
.getInstrument();
final double val = ((Number) inst.getHistory().getAverage(
minutes)).doubleValue();
return val;
} else {
/*
* When the LBS is run as an embedded process it can wind up
* having the performance counters collected within its
* process in which case it will not have histories for the
* data and we just return the current value.
*/
log.warn("Not a history: " + c);
return ((Number)c.getValue()).doubleValue();
}
} catch (Exception ex) {
log.warn("Could not read: counterSet=" + counterSet.getPath()
+ ", counter=" + path, ex);
return defaultValue;
}
}
/**
* Sets up reporting for the computed per-host and per-service scores.
* These counters are reported under the service {@link UUID} for the
* {@link LoadBalancerService} itself. This makes it easy to consult the
* scores for the various hosts and services.
*
* Note: The host and service scores will not appear until the
* {@link UpdateTask} has executed and those scores have been computed.
*
* @see LoadBalancerService.Options#UPDATE_DELAY
*
* @todo counters for service scores should be eventually removed after
* the service leaves. Likewise for host scores. However, these
* counters SHOULD remain available for a while for post-mortem of
* the service/host, e.g., at least 2-3 days. This would be fixed
* with a persistence model for the scores.
*/
protected void setupCounters() {
final CounterSet serviceRoot = getFederation()
.getServiceCounterSet();
final long now = System.currentTimeMillis();
// per-host scores.
{
final CounterSet hosts = serviceRoot.makePath("hosts");
final CounterSet tmpScores = hosts.makePath("scores");
final CounterSet tmpFormula = hosts.makePath("formula");
synchronized (tmpScores) {
for (HostScore hs : activeHosts.values()) {
final String hn = hs.hostname;
if (tmpScores.getChild(hn) == null) {
tmpScores
.addCounter(hn,
new HistoryInstrument(
new History(new Double[60], PeriodEnum.Minutes
.getPeriodMillis(), true/*overwrite*/)));
}
{
final ICounter counter = (ICounter) tmpScores
.getChild(hn);
final HistoryInstrument inst = (HistoryInstrument) counter
.getInstrument();
final HostScore score = activeHosts.get(hn);
if (score != null) {
inst.add(now, score.drank);
}
}
}
} // synchronized(scores)
synchronized (tmpFormula) {
for (HostScore hs : activeHosts.values()) {
final String hn = hs.hostname;
if (tmpFormula.getChild(hn) == null) {
tmpFormula.addCounter(hn,
new HistoryInstrument(
new History(
new String[60],
PeriodEnum.Minutes
.getPeriodMillis(),
true/*overwrite*/)));
}
{
final ICounter counter = (ICounter) tmpFormula
.getChild(hn);
final HistoryInstrument inst = (HistoryInstrument) counter
.getInstrument();
final HostScore score = activeHosts.get(hn);
if (score != null) {
inst.add(now, score.toString());
}
}
}
} // synchronized(formula)
} // host scores
// per-service scores.
{
final CounterSet services = serviceRoot.makePath("services");
final CounterSet tmpScores = services.makePath("scores");
final CounterSet tmpFormula = services.makePath("formula");
synchronized (tmpScores) {
for (ServiceScore ss : activeDataServices.values()) {
/*
* @todo use serviceName, but it has embedded slashes
* (in bigdata-jini) just like a path which makes life
* difficult.
*/
final String idStr = ss.serviceUUID.toString();
if (tmpScores.getChild(idStr) == null) {
tmpScores
.addCounter(
idStr,
new HistoryInstrument(
new History(
new Double[60],
PeriodEnum.Minutes
.getPeriodMillis(),
true/*overwrite*/)));
}
{
final ICounter counter = (ICounter) tmpScores.getChild(idStr);
final HistoryInstrument inst = (HistoryInstrument) counter
.getInstrument();
final ServiceScore score = activeDataServices
.get(ss.serviceUUID);
if (score != null) {
inst.add(now, score.drank);
}
}
}
} // synchronized(scores)
synchronized (tmpFormula) {
for (ServiceScore ss : activeDataServices.values()) {
/*
* @todo use serviceName, but it has embedded slashes
* (in bigdata-jini) just like a path which makes life
* difficult.
*/
final String idStr = ss.serviceUUID.toString();
if (tmpFormula.getChild(idStr) == null) {
tmpFormula
.addCounter(
idStr,
new HistoryInstrument(
new History(
new String[60],
PeriodEnum.Minutes
.getPeriodMillis(),
true/*overwrite*/)));
}
{
final ICounter counter = (ICounter) tmpFormula.getChild(idStr);
final HistoryInstrument inst = (HistoryInstrument) counter
.getInstrument();
final ServiceScore score = activeDataServices
.get(ss.serviceUUID);
if (score != null) {
inst.add(now, score.toString());
}
}
}
}// synchronized(formula)
} // service scores.
} // end method
/**
* Writes the counters on a file.
*
* @see Options
*/
protected void logCounters() {
final long now = System.currentTimeMillis();
final long elapsed = now - logLastMillis;
if (elapsed > logDelayMillis) {
final String basename = "" + (logFileCount % logMaxFiles);
LoadBalancerService.this.logCounters(basename);
logFileCount++;
logLastMillis = now;
}
}
}
/**
* Writes the counters on a file.
*
* @param basename
* The basename of the file. The file will be written in the
* {@link #logDir}.
*/
protected void logCounters(final String basename) {
if(isTransient) {
log.warn("LBS is transient - request ignored.");
return;
}
final File file = new File(logDir, "counters" + basename + ".xml");
logCounters(file);
}
/**
* Writes the counters on a file.
*
* @param file
* The file. If the file exists it will be overwritten.
*/
protected void logCounters(final File file) {
if (file == null)
throw new IllegalArgumentException();
if (log.isInfoEnabled())
log.info("Writing counters on " + file);
OutputStream os = null;
try {
os = new BufferedOutputStream( new FileOutputStream(file) );
getFederation().getCounters().asXML(os, "UTF-8", null/* filter */);
} catch(Exception ex) {
log.error(ex.getMessage(), ex);
} finally {
if (os != null) {
try {
os.close();
} catch (Exception ex) {
// Ignore.
}
}
}
}
/**
* Logs the counters on a file created using
* {@link File#createTempFile(String, String, File)} in the log
* directory.
*
* @throws IOException
*
* @todo this method is not exposed to RMI (it is not on any
* {@link Remote} interface) but it could be.
*/
public void logCounters() throws IOException {
if (isTransient) {
log.warn("LBS is transient - request ignored.");
return;
}
final File file = File.createTempFile("counters-hup", ".xml", logDir);
logCounters(file);
}
public void sighup() throws IOException {
logCounters();
}
/**
* Notify the {@link LoadBalancerService} that a new service is available.
*
* Note: Embedded services must invoke this method directly when
* they start up.
*
* Note: Distributed services implementations MUST discover services using a
* framework, such as jini, and invoke this method the first time a given
* service is discovered.
*
* @param serviceUUID
* @param serviceIface
* @param hostname
*
* @see IFederationDelegate#serviceJoin(IService, UUID)
* @see #leave(String, UUID)
*/
public void join(final UUID serviceUUID, final Class serviceIface,
final String hostname) {
if (serviceUUID == null)
throw new IllegalArgumentException();
if (serviceIface == null)
throw new IllegalArgumentException();
if (hostname == null)
throw new IllegalArgumentException();
if (log.isInfoEnabled())
log.info("serviceUUID=" + serviceUUID + ", serviceIface="
+ serviceIface + ", hostname=" + hostname);
/*
* @todo should really be passed in to avoid boundback RMI. Also, this
* is available for jini as an attribute on the ServiceItem. And in any
* case the serviceName can be cached here.
*/
String serviceName;
if (IDataService.class == serviceIface) {
final IBigdataFederation> fed;
try {
fed = getFederation();
} catch (IllegalStateException t) {
/*
* Note: Indicates that the federation is closed.
*/
return;
}
try {
serviceName = fed.getDataService(serviceUUID).getServiceName();
} catch (Throwable t) {
log.warn(t.getMessage(), t);
serviceName = serviceUUID.toString();
}
} else {
serviceName = serviceUUID.toString();
}
lock.lock();
try {
if (activeHosts.putIfAbsent(hostname, new HostScore(hostname)) == null) {
if (log.isInfoEnabled())
log.info("New host joined: hostname=" + hostname);
}
if (IDataService.class == serviceIface) {
/*
* Add to set of known services.
*
* Only data services are registered as [activeServices] since
* we only make load balancing decisions for the data services.
*/
if (activeDataServices.putIfAbsent(serviceUUID,
new ServiceScore(hostname, serviceUUID, serviceName)) == null) {
if (log.isInfoEnabled())
log.info("Data service join: hostname=" + hostname
+ ", serviceUUID=" + serviceUUID);
}
}
if (getServiceUUID() != null) {
/*
* Create node for the joined service's history in the load
* balancer's counter set. This just gives eager feedback in the
* LBS's counter set if you are using the httpd service to watch
* for service joins.
*
* Note: We can't do this until the load balancer has its own
* serviceUUID. If that is not available now, then the node for
* the joined service will be created when that service
* notify()s the LBS (60 seconds later).
*/
getFederation().getCounters().makePath(
AbstractFederation.getServiceCounterPathPrefix(
serviceUUID, serviceIface, hostname));
}
joined.signal();
} finally {
lock.unlock();
}
}
/**
* Notify the {@link LoadBalancerService} that a service is no longer
* available.
*
* Note: Embedded services must invoke this method directly when
* they shut down.
*
*
* Note: Distributed services implementations MUST discover services using a
* framework, such as jini, and invoke this method when a service is no
* longer registered.
*
* @param serviceUUID
* The service {@link UUID}.
*
* @see IFederationDelegate#serviceLeave(UUID)
* @see #join(UUID, Class, String)
*/
public void leave(final UUID serviceUUID) {
if (log.isInfoEnabled())
log.info("serviceUUID=" + serviceUUID);
try {
lock.lock();
/*
* Note: [activeServices] only contains the DataServices so a null
* return means either that this is not a data service -or- that we
* do not have a score for that data service yet.
*/
final ServiceScore info = activeDataServices.remove(serviceUUID);
if (info != null) {
/*
* @todo remove history from counters - path is
* /host/serviceUUID? Consider scheduling removal after a few
* hours or just sweeping periodically for services with no
* updates in the last N hours so that people have access to
* post-mortem data. For the same reason, we should probably
* snapshot the data prior to the leave (especially if there are
* WARN or URGENT events for the service) and perhaps
* periodically snapshot all of the counter data onto rolling
* log files.
*/
// root.deletePath(path);
}
} finally {
lock.unlock();
}
}
/**
* Accepts the event, either updates the existing event with the same
* {@link UUID} or adds the event to the set of recent events, and then
* prunes the set of recent events so that all completed events older than
* {@link #eventHistoryMillis} are discarded.
*
* @see EventReceiver
*/
public void notifyEvent(Event e) throws IOException {
if(!isOpen())
throw new IllegalStateException();
eventReceiver.notifyEvent(e);
}
/**
* {@inheritDoc}
*/
public Iterator rangeIterator(long fromTime, long toTime) {
if(!isOpen())
throw new IllegalStateException();
return eventReceiver.rangeIterator(fromTime, toTime);
}
/**
* {@inheritDoc}
*/
public long rangeCount(long fromTime, long toTime) {
if(!isOpen())
throw new IllegalStateException();
return eventReceiver.rangeCount(fromTime, toTime);
}
public void notify(final UUID serviceUUID, final byte[] data) {
setupLoggingContext();
try {
if (log.isInfoEnabled())
log.info("serviceUUID=" + serviceUUID);
if (!serviceUUID.equals(getServiceUUID())) {
/*
* Don't do this for the load balancer itself!
*
* @todo the LBS probably should not bother to notify() itself.
*/
try {
// read the counters into our local history.
getFederation().getCounters().readXML(
new ByteArrayInputStream(data), instrumentFactory,
null/* filter */);
} catch (Exception e) {
log.warn(e.getMessage(), e);
throw new RuntimeException(e);
}
}
} finally {
clearLoggingContext();
}
}
public void warn(String msg, UUID serviceUUID) {
setupLoggingContext();
try {
log.warn(msg+" : serviceUUID="+serviceUUID);
} finally {
clearLoggingContext();
}
}
public void urgent(String msg, UUID serviceUUID) {
setupLoggingContext();
try {
log.error(msg+" : serviceUUID="+serviceUUID);
} finally {
clearLoggingContext();
}
}
public boolean isHighlyUtilizedDataService(final UUID serviceUUID)
throws IOException {
setupLoggingContext();
try {
final ServiceScore[] scores = this.serviceScores.get();
// No scores yet?
if (scores == null) {
if(log.isInfoEnabled()) log.info("No scores yet");
return false;
}
final ServiceScore score = activeDataServices.get(serviceUUID);
if (score == null) {
if (log.isInfoEnabled())
log.info("Service is not scored: " + serviceUUID);
return false;
}
return isHighlyUtilizedDataService(score, scores);
} finally {
clearLoggingContext();
}
}
public boolean isUnderUtilizedDataService(final UUID serviceUUID)
throws IOException {
setupLoggingContext();
try {
final ServiceScore[] scores = this.serviceScores.get();
// No scores yet?
if (scores == null) {
if(log.isInfoEnabled()) log.info("No scores yet");
return false;
}
final ServiceScore score = activeDataServices.get(serviceUUID);
if (score == null) {
if (log.isInfoEnabled())
log.info("Service is not scored: " + serviceUUID);
return false;
}
return isUnderUtilizedDataService(score, scores);
} finally {
clearLoggingContext();
}
}
protected boolean isHighlyUtilizedDataService(final ServiceScore score,
final ServiceScore[] scores) {
if (score == null)
throw new IllegalArgumentException();
if (scores == null)
throw new IllegalArgumentException();
boolean highlyUtilized = false;
if (score.drank > .8) {
// top 20% is considered to be highly utilized.
highlyUtilized = true;
} else if (score.rank == scores.length - 1) {
// top rank is considered to be highly utilized.
highlyUtilized = true;
}
if (log.isInfoEnabled())
log.info("highlyUtilized=" + highlyUtilized + " : " + score);
return highlyUtilized;
}
protected boolean isUnderUtilizedDataService(final ServiceScore score,
final ServiceScore[] scores) {
if (score == null)
throw new IllegalArgumentException();
if (scores == null)
throw new IllegalArgumentException();
boolean underUtilized = false;
if (score.drank < .2) {
// bottom 20% is considered to be under-utilized.
underUtilized = true;
} else if (score.rank == 0) {
// bottom rank is considered to be under-utilized.
underUtilized = true;
}
if (log.isInfoEnabled())
log.info("underUtilized=" + underUtilized + " : " + score);
return underUtilized;
}
public UUID getUnderUtilizedDataService() throws IOException,
TimeoutException, InterruptedException {
return getUnderUtilizedDataServices(1, 1, null/* exclude */)[0];
}
public UUID[] getUnderUtilizedDataServices(final int minCount,
final int maxCount, final UUID exclude) throws IOException,
TimeoutException, InterruptedException {
setupLoggingContext();
try {
if (minCount < 0)
throw new IllegalArgumentException();
if (maxCount < 0)
throw new IllegalArgumentException();
final UUID[] uuids;
lock.lock();
try {
uuids = getUnderUtilizedDataServicesWithLock(minCount,
maxCount, exclude);
} finally {
lock.unlock();
}
if (log.isInfoEnabled())
log.info("minCount=" + minCount + ", maxCount=" + maxCount
+ ", exclude=" + exclude + " : reporting "
+ uuids.length
+ " under-utilized and non-excluded services: "
+ Arrays.toString(uuids));
return uuids;
} finally {
clearLoggingContext();
}
}
/**
* Impl. runs with {@link #lock}.
*
* @param minCount
* @param maxCount
* @param exclude
* @return
* @throws TimeoutException
* @throws InterruptedException
*/
private UUID[] getUnderUtilizedDataServicesWithLock(final int minCount,
final int maxCount, final UUID exclude) throws TimeoutException,
InterruptedException {
if (log.isDebugEnabled())
log.debug("minCount=" + minCount + ", maxCount=" + maxCount
+ ", exclude=" + exclude);
if (nupdates < initialRoundRobinUpdateCount) {
/*
* Use a round-robin assignment for the first N updates while the
* LBS develops some history on the hosts and services.
*/
return roundRobinServiceLoadHelper.getUnderUtilizedDataServices(
minCount, maxCount, exclude);
}
/*
* Scores for the services in ascending order (least utilized to most
* utilized).
*/
final ServiceScore[] scores = this.serviceScores.get();
if (scores == null || scores.length == 0) {
if (minCount == 0) {
if (log.isDebugEnabled())
log
.debug("No scores, minCount is zero - will return null.");
return null;
}
/*
* Scores are not available immediately. This will await a
* non-excluded service join and then return the "under-utilized"
* services without reference to computed service scores. This path
* is used when the load balancer first starts up (unless the round
* robin is enabled) since it will not have scores for at least one
* pass of the UpdateTask.
*/
return new ServiceLoadHelperWithoutScores()
.getUnderUtilizedDataServices(minCount, maxCount, exclude);
}
/*
* Count the #of non-excluded active services - this is [nok].
*
* Note: [knownGood] is set to a service that (a) is not excluded; and
* (b) is active. This is the fallback service that we will recommend if
* minCount is non-zero and we are using the scores and all of a sudden
* it looks like there are no active services to recommend. This
* basically codifies a decision point where we accept that this service
* is active. We choose this as the first active and non-excluded
* service so that it will be as under-utilized as possible.
*/
int nok = 0;
UUID knownGood = null;
for (int i = 0; i < scores.length && nok < 1; i++) {
final UUID serviceUUID = scores[i].serviceUUID;
if (exclude != null && exclude.equals(serviceUUID))
continue;
if (!activeDataServices.containsKey(serviceUUID))
continue;
if (knownGood == null)
knownGood = serviceUUID;
nok++;
}
if (nok == 0) {
/*
* There are no non-excluded active services.
*/
if (log.isDebugEnabled())
log.debug("No non-excluded services.");
if (minCount == 0) {
/*
* Since there was no minimum #of services demanded by the
* caller, we return [null].
*/
if (log.isDebugEnabled())
log
.debug("No non-excluded services, minCount is zero - will return null.");
return null;
} else {
/*
* We do not have ANY active and scored non-excluded services
* and [minCount GT ZERO]. In this case we use a he variant that
* does not use scores and that awaits a service join.
*/
if (log.isDebugEnabled())
log.debug("Will await a service join.");
return new ServiceLoadHelperWithoutScores()
.getUnderUtilizedDataServices(minCount, maxCount,
exclude);
}
}
/*
* Use the scores to compute the under-utilized services.
*/
if (log.isDebugEnabled())
log.debug("Will recommend services based on scores: #scored="
+ scores.length + ", nok=" + nok + ", knownGood="
+ knownGood + ", exclude=" + exclude);
assert nok > 0;
assert knownGood != null;
assert scores != null;
assert scores.length != 0;
return new ServiceLoadHelperWithScores(knownGood, scores)
.getUnderUtilizedDataServices(minCount, maxCount, exclude);
}
/**
* Integration with the {@link LoadBalancerService}.
*
* @author Bryan Thompson
* @version $Id$
*/
protected class RoundRobinServiceLoadHelper extends
AbstractRoundRobinServiceLoadHelper {
protected UUID[] awaitServices(int minCount, long timeout)
throws InterruptedException, TimeoutException {
return ((AbstractScaleOutFederation) LoadBalancerService.this
.getFederation()).awaitServices(minCount, timeout);
}
}
/**
* Integration with the {@link LoadBalancerService}.
*
* @author Bryan Thompson
* @version $Id$
*/
protected class ServiceLoadHelperWithoutScores extends
AbstractServiceLoadHelperWithoutScores {
public ServiceLoadHelperWithoutScores() {
super(serviceJoinTimeout);
}
@Override
protected void awaitJoin(long timeout, TimeUnit unit) throws InterruptedException {
// await a join.
joined.await(timeout, unit);
}
@Override
protected UUID[] getActiveServices() {
return activeDataServices.keySet().toArray(new UUID[] {});
}
@Override
protected boolean isActiveDataService(UUID serviceUUID) {
return activeDataServices.containsKey(serviceUUID);
}
@Override
protected boolean isUnderUtilizedDataService(ServiceScore score,
ServiceScore[] scores) {
return LoadBalancerService.this.isUnderUtilizedDataService(score,
scores);
}
}
/**
* Integration with the {@link LoadBalancerService}.
*
* @author Bryan Thompson
* @version $Id$
*/
protected class ServiceLoadHelperWithScores extends
AbstractServiceLoadHelperWithScores {
public ServiceLoadHelperWithScores(final UUID knownGood,
final ServiceScore[] scores) {
super(serviceJoinTimeout, knownGood, scores);
}
@Override
protected void awaitJoin(long timeout, TimeUnit unit) throws InterruptedException {
// await a join.
joined.await(timeout, unit);
}
@Override
protected UUID[] getActiveServices() {
return activeDataServices.keySet().toArray(new UUID[] {});
}
@Override
protected boolean isActiveDataService(UUID serviceUUID) {
return activeDataServices.containsKey(serviceUUID);
}
@Override
protected boolean isUnderUtilizedDataService(ServiceScore score,
ServiceScore[] scores) {
return LoadBalancerService.this.isUnderUtilizedDataService(score,
scores);
}
}
}