com.bazaarvoice.ostrich.pool.ServicePool (ostrich-core)
Core classes that form Ostrich
package com.bazaarvoice.ostrich.pool;
import com.bazaarvoice.ostrich.HealthCheckResult;
import com.bazaarvoice.ostrich.HealthCheckResults;
import com.bazaarvoice.ostrich.HostDiscovery;
import com.bazaarvoice.ostrich.LoadBalanceAlgorithm;
import com.bazaarvoice.ostrich.PartitionContext;
import com.bazaarvoice.ostrich.PartitionContextBuilder;
import com.bazaarvoice.ostrich.RetryPolicy;
import com.bazaarvoice.ostrich.ServiceCallback;
import com.bazaarvoice.ostrich.ServiceEndPoint;
import com.bazaarvoice.ostrich.ServiceFactory;
import com.bazaarvoice.ostrich.ServicePoolStatistics;
import com.bazaarvoice.ostrich.exceptions.MaxRetriesException;
import com.bazaarvoice.ostrich.exceptions.NoAvailableHostsException;
import com.bazaarvoice.ostrich.exceptions.NoCachedInstancesAvailableException;
import com.bazaarvoice.ostrich.exceptions.NoSuitableHostsException;
import com.bazaarvoice.ostrich.exceptions.OnlyBadHostsException;
import com.bazaarvoice.ostrich.healthcheck.DefaultHealthCheckResults;
import com.bazaarvoice.ostrich.healthcheck.HealthCheckRetryDelay;
import com.bazaarvoice.ostrich.metrics.Metrics;
import com.bazaarvoice.ostrich.partition.PartitionFilter;
import com.codahale.metrics.Gauge;
import com.codahale.metrics.Meter;
import com.codahale.metrics.MetricRegistry;
import com.codahale.metrics.Timer;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Objects;
import com.google.common.base.Predicate;
import com.google.common.base.Predicates;
import com.google.common.base.Throwables;
import com.google.common.base.Ticker;
import com.google.common.cache.CacheBuilder;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.Set;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.Future;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import static com.google.common.base.Preconditions.checkNotNull;
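/**
* Package-private implementation of {@link com.bazaarvoice.ostrich.ServicePool}. It combines end points from
* {@link HostDiscovery} with a {@link PartitionFilter} and {@link LoadBalanceAlgorithm} to pick an end point for
* each callback, retries retriable failures according to the caller's {@link RetryPolicy}, and moves misbehaving
* end points into a "bad" set where they are periodically health checked until they recover.
* <p>
* A minimal usage sketch; the pre-built {@code pool}, {@code retryPolicy}, and the {@code CalculatorService}
* interface below are hypothetical placeholders, not part of this class:
* <pre>{@code
* com.bazaarvoice.ostrich.ServicePool<CalculatorService> pool = ...;   // built elsewhere
* RetryPolicy retryPolicy = ...;
* int sum = pool.execute(retryPolicy, new ServiceCallback<CalculatorService, Integer>() {
*     public Integer call(CalculatorService service) {
*         return service.add(1, 2);
*     }
* });
* }</pre>
*/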
class ServicePool<S> implements com.bazaarvoice.ostrich.ServicePool<S> {
private static final Logger LOG = LoggerFactory.getLogger(ServicePool.class);
/**
* Number of seconds between bad endpoint health check verification runs.
*/
private static final int HEALTH_CHECK_VERIFY_SECS = 30;
private final Ticker _ticker;
private final HostDiscovery _hostDiscovery;
private final boolean _cleanupHostDiscoveryOnClose;
private final HostDiscovery.EndPointListener _hostDiscoveryListener;
private final ServiceFactory<S> _serviceFactory;
private final ScheduledExecutorService _healthCheckExecutor;
private final boolean _shutdownHealthCheckExecutorOnClose;
private final PartitionFilter _partitionFilter;
private final LoadBalanceAlgorithm _loadBalanceAlgorithm;
private final ServicePoolStatistics _servicePoolStatistics;
private final ConcurrentMap<ServiceEndPoint, HealthCheck> _badEndPoints;
private final Predicate<ServiceEndPoint> _badEndPointFilter;
private final Set<ServiceEndPoint> _recentlyRemovedEndPoints;
private final ServiceCache<S> _serviceCache;
private final Metrics.InstanceMetrics _metrics;
private final Timer _callbackExecutionTime;
private final Timer _healthCheckTime;
private final Meter _numExecuteSuccesses;
private final Meter _numExecuteAttemptFailures;
private final HealthCheckRetryDelay _healthCheckRetryDelay;
ServicePool(Ticker ticker, HostDiscovery hostDiscovery, boolean cleanupHostDiscoveryOnClose,
ServiceFactory<S> serviceFactory, ServiceCachingPolicy cachingPolicy,
PartitionFilter partitionFilter, LoadBalanceAlgorithm loadBalanceAlgorithm,
ScheduledExecutorService healthCheckExecutor, boolean shutdownHealthCheckExecutorOnClose,
HealthCheckRetryDelay healthCheckRetryDelay, MetricRegistry metrics) {
_healthCheckRetryDelay = checkNotNull(healthCheckRetryDelay);
_ticker = checkNotNull(ticker);
_hostDiscovery = checkNotNull(hostDiscovery);
_cleanupHostDiscoveryOnClose = cleanupHostDiscoveryOnClose;
_serviceFactory = checkNotNull(serviceFactory);
_healthCheckExecutor = checkNotNull(healthCheckExecutor);
_shutdownHealthCheckExecutorOnClose = shutdownHealthCheckExecutorOnClose;
_badEndPoints = Maps.newConcurrentMap();
_badEndPointFilter = Predicates.not(Predicates.in(_badEndPoints.keySet()));
_recentlyRemovedEndPoints = Sets.newSetFromMap(CacheBuilder.newBuilder()
.ticker(_ticker)
.expireAfterWrite(10, TimeUnit.MINUTES) // TODO: Make this a constant
.<ServiceEndPoint, Boolean>build()
.asMap());
checkNotNull(cachingPolicy);
_serviceCache = new ServiceCacheBuilder<S>()
.withServiceFactory(serviceFactory)
.withCachingPolicy(cachingPolicy)
.withMetricRegistry(metrics)
.build();
_partitionFilter = checkNotNull(partitionFilter);
_loadBalanceAlgorithm = checkNotNull(loadBalanceAlgorithm);
_servicePoolStatistics = new ServicePoolStatistics() {
@Override
public int getNumIdleCachedInstances(ServiceEndPoint endPoint) {
return _serviceCache.getNumIdleInstances(endPoint);
}
@Override
public int getNumActiveInstances(ServiceEndPoint endPoint) {
return _serviceCache.getNumActiveInstances(endPoint);
}
};
// Watch end points as they are removed from host discovery so that we can remove them from our set of bad
// end points as well. This will prevent the bad end points set from growing in an unbounded fashion.
// There is a minor race condition that could happen here, but it's not anything to be concerned about. The
// HostDiscovery component could lose its connection to its backing data store and then immediately regain it
// right afterwards. If that happens it could remove all of its end points only to re-add them right back again
// and we will "forget" that an end point was bad and try to use it again. This isn't fatal though because
// we'll just rediscover that it's a bad end point again in the future. Also in the future it might be useful
// to measure how long an end point has been considered bad and potentially take action for end points that are
// bad for long periods of time.
_hostDiscoveryListener = new HostDiscovery.EndPointListener() {
@Override
public void onEndPointAdded(ServiceEndPoint endPoint) {
addEndPoint(endPoint);
}
@Override
public void onEndPointRemoved(ServiceEndPoint endPoint) {
removeEndPoint(endPoint);
}
};
_hostDiscovery.addListener(_hostDiscoveryListener);
_metrics = Metrics.forInstance(metrics, this, _serviceFactory.getServiceName());
_callbackExecutionTime = _metrics.timer("callback-execution-time");
_healthCheckTime = _metrics.timer("health-check-time");
_numExecuteSuccesses = _metrics.meter("num-execute-successes");
_numExecuteAttemptFailures = _metrics.meter("num-execute-attempt-failures");
_metrics.gauge("num-valid-end-points", new Gauge() {
@Override
public Integer getValue() {
return getNumValidEndPoints();
}
});
_metrics.gauge("num-bad-end-points", new Gauge() {
@Override
public Integer getValue() {
return getNumBadEndPoints();
}
});
// Periodically ensure that health checks for unhealthy endpoints are still running
_healthCheckExecutor.scheduleAtFixedRate(
new HealthCheckVerifier(),
HEALTH_CHECK_VERIFY_SECS,
HEALTH_CHECK_VERIFY_SECS,
TimeUnit.SECONDS);
}
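/**
* Shuts the pool down: cancels any in-flight health checks, unregisters the host discovery listener, optionally
* closes host discovery and the health check executor (per the flags supplied at construction), and closes the
* service cache and metrics.
*/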
@Override
public void close() {
for (HealthCheck healthCheck : _badEndPoints.values()) {
healthCheck.cancel(true);
}
_hostDiscovery.removeListener(_hostDiscoveryListener);
if (_cleanupHostDiscoveryOnClose) {
try {
_hostDiscovery.close();
} catch (IOException e) {
// NOP
}
}
_serviceCache.close();
_metrics.close();
if (_shutdownHealthCheckExecutorOnClose) {
_healthCheckExecutor.shutdownNow();
}
}
@Override
public <R> R execute(RetryPolicy retry, ServiceCallback<S, R> callback) {
return execute(PartitionContextBuilder.empty(), retry, callback);
}
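/**
* Retry loop shared by both {@code execute} overloads. On each attempt the discovered end points are filtered
* down to those not currently marked bad, narrowed by the {@link PartitionFilter} for the given partition
* context, and one is chosen by the {@link LoadBalanceAlgorithm}. Retriable failures count against the
* {@link RetryPolicy}; non-retriable exceptions propagate immediately, and a {@link MaxRetriesException} is
* thrown once the policy gives up.
*/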
@Override
public <R> R execute(PartitionContext partitionContext, RetryPolicy retry, ServiceCallback<S, R> callback) {
final long start = _ticker.read();
int numAttempts = 0;
Exception lastException = null;
do {
Iterable<ServiceEndPoint> allEndPoints = getAllEndPoints();
if (Iterables.isEmpty(allEndPoints)) {
throw (lastException == null)
? new NoAvailableHostsException(String.format("No endpoints discovered for service %s", getServiceName()))
: new NoAvailableHostsException(lastException.getMessage(), lastException);
}
Iterable<ServiceEndPoint> validEndPoints = getValidEndPoints(allEndPoints);
if (Iterables.isEmpty(validEndPoints)) {
throw (lastException == null)
? new OnlyBadHostsException(String.format("No valid endpoints discovered for service %s, all endpoints: %s", getServiceName(), allEndPoints))
: new OnlyBadHostsException(lastException.getMessage(), lastException);
}
ServiceEndPoint endPoint = chooseEndPoint(validEndPoints, partitionContext);
if (endPoint == null) {
throw (lastException == null)
? new NoSuitableHostsException(String.format("No suitable endpoint discovered for service %s from valid endpoints %s", getServiceName(), validEndPoints))
: new NoSuitableHostsException(lastException);
}
try {
R result = executeOnEndPoint(endPoint, callback);
_numExecuteSuccesses.mark();
return result;
} catch (Exception e) {
_numExecuteAttemptFailures.mark();
// Don't retry if exception is too severe.
if (!isRetriableException(e)) {
throw Throwables.propagate(e);
}
LOG.info("Retriable exception from end point: {}, {}", endPoint, e.toString());
LOG.debug("Exception", e);
lastException = e;
}
}
while (retry.allowRetry(++numAttempts, TimeUnit.NANOSECONDS.toMillis(_ticker.read() - start)));
throw new MaxRetriesException(lastException);
}
@Override
public int getNumValidEndPoints() {
return Iterables.size(_hostDiscovery.getHosts()) - _badEndPoints.size();
}
@Override
public int getNumBadEndPoints() {
return _badEndPoints.size();
}
/**
* Determine the set of all {@link ServiceEndPoint}s.
*
* NOTE: This method is package private specifically so that {@link AsyncServicePool} can call it.
*/
Iterable<ServiceEndPoint> getAllEndPoints() {
return _hostDiscovery.getHosts();
}
/**
* Determine the set of usable {@link ServiceEndPoint}s.
*/
private Iterable<ServiceEndPoint> getValidEndPoints(Iterable<ServiceEndPoint> endPoints) {
return Iterables.filter(endPoints, _badEndPointFilter);
}
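/**
* Applies the partition filter and then the load balance algorithm, returning {@code null} if either leaves
* nothing to choose from.
*/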
private ServiceEndPoint chooseEndPoint(Iterable<ServiceEndPoint> endPoints, PartitionContext partitionContext) {
endPoints = _partitionFilter.filter(endPoints, partitionContext);
if (endPoints == null || Iterables.isEmpty(endPoints)) {
return null;
}
ServiceEndPoint endPoint = _loadBalanceAlgorithm.choose(endPoints, _servicePoolStatistics);
if (endPoint == null) {
return null;
}
return endPoint;
}
/**
* Execute a callback on a specific end point.
*
* NOTE: This method is package private specifically so that {@link AsyncServicePool} can call it.
*/
<R> R executeOnEndPoint(ServiceEndPoint endPoint, ServiceCallback<S, R> callback)
throws Exception {
ServiceHandle<S> handle = null;
try {
handle = _serviceCache.checkOut(endPoint);
Timer.Context timer = _callbackExecutionTime.time();
try {
return callback.call(handle.getService());
} finally {
timer.stop();
}
} catch (NoCachedInstancesAvailableException e) {
LOG.info("Service cache exhausted. End point: {}", endPoint, e);
// Don't mark the end point as bad just because there are no cached service instances available for it.
throw e;
} catch (Exception e) {
if (_serviceFactory.isRetriableException(e)) {
// This is a known and supported exception indicating that something went wrong somewhere in the service
// layer while trying to communicate with the end point. These errors are often transient, so we
// enqueue a health check for the end point and mark it as unavailable for the time being.
markEndPointAsBad(endPoint);
LOG.info("Bad end point discovered. End point: {}", endPoint, e);
}
throw e;
} finally {
if (handle != null) {
try {
_serviceCache.checkIn(handle);
} catch (Exception e) {
// This should never happen, but log just in case.
LOG.warn("Error returning end point to cache. End point: {}, {}",
endPoint, e.toString());
LOG.debug("Exception", e);
}
}
}
}
/**
* Check if an exception is retriable.
*
* NOTE: This method is package private specifically so that {@link AsyncServicePool} can call it.
*/
boolean isRetriableException(Exception exception) {
return _serviceFactory.isRetriableException(exception);
}
/**
* NOTE: This method is package private specifically so that {@link AsyncServicePool} can call it.
*
* @return The name of the service for this pool.
*/
String getServiceName() {
return _serviceFactory.getServiceName();
}
@VisibleForTesting
HostDiscovery getHostDiscovery() {
return _hostDiscovery;
}
@VisibleForTesting
PartitionFilter getPartitionFilter() {
return _partitionFilter;
}
@VisibleForTesting
LoadBalanceAlgorithm getLoadBalanceAlgorithm() {
return _loadBalanceAlgorithm;
}
@VisibleForTesting
ServicePoolStatistics getServicePoolStatistics() {
return _servicePoolStatistics;
}
@VisibleForTesting
Set<ServiceEndPoint> getBadEndPoints() {
return ImmutableSet.copyOf(_badEndPoints.keySet());
}
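/**
* Health checks the valid end points, preferring the order suggested by the load balance algorithm, until one
* reports healthy or one fails with a non-retriable exception. End points failing a retriable health check are
* marked bad along the way, and every individual result is included in the returned aggregate.
*/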
@Override
public HealthCheckResults checkForHealthyEndPoint() {
DefaultHealthCheckResults aggregate = new DefaultHealthCheckResults();
Iterable<ServiceEndPoint> allEndPoints = getAllEndPoints();
if (Iterables.isEmpty(allEndPoints)) {
// There were no end points
return aggregate;
}
Iterable<ServiceEndPoint> validEndPoints = getValidEndPoints(allEndPoints);
if (Iterables.isEmpty(validEndPoints)) {
// There were no valid end points
return aggregate;
}
Set<ServiceEndPoint> endPoints = Sets.newHashSet(validEndPoints);
while (!endPoints.isEmpty()) {
// Prefer end points in the order the load balancer recommends.
ServiceEndPoint endPoint = chooseEndPoint(endPoints, PartitionContextBuilder.empty());
if (endPoint == null) {
// Load balancer didn't like our end points, so just go sequentially.
endPoint = endPoints.iterator().next();
}
HealthCheckResult result = checkHealth(endPoint);
aggregate.addHealthCheckResult(result);
if (!result.isHealthy()) {
Exception exception = ((FailedHealthCheckResult) result).getException();
if (exception == null || isRetriableException(exception)) {
LOG.info("Unhealthy end point discovered. End point {}", endPoint);
endPoints.remove(endPoint);
markEndPointAsBad(endPoint);
continue;
}
}
break;
}
return aggregate;
}
/**
* Run the health checks on all current unhealthy end points. This method blocks until the
* health checks have completed.
*/
@VisibleForTesting
void forceHealthChecks() {
for (HealthCheck healthCheck : _badEndPoints.values()) {
healthCheck.run();
}
}
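// End points added by host discovery are routed through markEndPointAsBad() first, so they are only handed out
// once an initial health check succeeds.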
private synchronized void addEndPoint(ServiceEndPoint endPoint) {
_recentlyRemovedEndPoints.remove(endPoint);
markEndPointAsBad(endPoint);
LOG.debug("End point added to service pool. End point: {}", endPoint);
}
private synchronized void removeEndPoint(ServiceEndPoint endPoint) {
// Mark this end point as recently removed. We do this in order to keep a positive set of removed
// end points so that we avoid a potential race condition where someone was using this end point while
// we noticed it disappeared from host discovery. In that case there is the potential that they
// would add it to the bad end points set after we've already processed the removal, thus leading to a
// memory leak in the bad end points set. Having this time-limited view of the recently removed
// end points ensures that this memory leak doesn't happen.
_recentlyRemovedEndPoints.add(endPoint);
_badEndPoints.remove(endPoint);
_serviceCache.evict(endPoint);
LOG.debug("End point removed from service pool. End point: {}", endPoint);
}
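// Evicts any cached instances for the end point and, the first time it is reported bad, kicks off a background
// health check. End points that host discovery has already removed are ignored.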
private synchronized void markEndPointAsBad(ServiceEndPoint endPoint) {
if (_recentlyRemovedEndPoints.contains(endPoint)) {
// Nothing to do, we've already removed this end point
return;
}
_serviceCache.evict(endPoint);
// Only schedule a health check if this is the first time we've seen this end point as bad...
HealthCheck healthCheck = new HealthCheck(endPoint);
if (_badEndPoints.putIfAbsent(endPoint, healthCheck) == null) {
healthCheck.start();
}
}
@VisibleForTesting
HealthCheckResult checkHealth(ServiceEndPoint endPoint) {
// We have to be very careful not to allow any exceptions to escape this method. If they do, then
// subsequent scheduled invocations of the Runnable may not happen, and we could stop running health checks
// completely. So we intentionally handle all possible exceptions here.
final long start = _ticker.read();
boolean isHealthy;
Exception exception = null;
try {
isHealthy = _serviceFactory.isHealthy(endPoint);
} catch (Exception e) {
isHealthy = false;
exception = e;
}
final long duration = _ticker.read() - start;
_healthCheckTime.update(duration, TimeUnit.NANOSECONDS);
return isHealthy
? new SuccessfulHealthCheckResult(endPoint.getId(), duration)
: new FailedHealthCheckResult(endPoint.getId(), duration, exception);
}
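/**
* Periodic safety net: re-submits the health check for any bad end point that is neither running nor currently
* scheduled, so a lost task can't leave an end point marked bad indefinitely.
*/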
@VisibleForTesting
final class HealthCheckVerifier implements Runnable {
@Override
public void run() {
try {
for (HealthCheck healthCheck : _badEndPoints.values()) {
healthCheck.verifyScheduling();
}
} catch (Throwable ex) {
LOG.warn("Error rescheduling health checks", ex);
}
}
}
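/**
* Background task that repeatedly health checks a single bad end point. On success the end point is re-registered
* with the service cache and removed from the bad set; on failure the task reschedules itself after the delay
* computed by {@link HealthCheckRetryDelay}. {@code verifyScheduling()} lets {@link HealthCheckVerifier} re-submit
* the task if its scheduled run was lost.
*/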
@VisibleForTesting
final class HealthCheck implements Runnable {
private final ServiceEndPoint _endPoint;
private final Lock _lock = new ReentrantLock();
private int _count = 0;
private Future<?> _future;
private boolean _cancelled;
private boolean _scheduled;
private boolean _running;
public HealthCheck(ServiceEndPoint endPoint) {
_endPoint = endPoint;
}
public void start() {
_lock.lock();
try {
assert _future == null && !_cancelled;
_future = _healthCheckExecutor.submit(this);
_scheduled = true;
} finally {
_lock.unlock();
}
}
public void cancel(boolean mayInterruptIfRunning) {
_lock.lock();
try {
if (_future != null) {
_future.cancel(mayInterruptIfRunning);
_future = null;
}
_cancelled = true;
} finally {
_lock.unlock();
}
}
public void verifyScheduling() {
_lock.lock();
try {
if (!_cancelled && !_running && !_scheduled) {
_healthCheckExecutor.submit(this);
_scheduled = true;
}
} finally {
_lock.unlock();
}
}
@Override
public void run() {
_lock.lock();
try {
if (_cancelled || _badEndPoints.get(_endPoint) != this) {
return;
}
_scheduled = false;
_running = true;
// Don't perform the health check operation while holding the lock, as it could cause a deadlock on cancel
// if the health check stalls.
HealthCheckResult result;
_lock.unlock();
try {
result = checkHealth(_endPoint);
} finally {
_lock.lock();
}
_count += 1;
if (result.isHealthy()) {
_serviceCache.register(_endPoint);
_badEndPoints.remove(_endPoint, this);
this.cancel(false);
} else {
long delayMillis = _healthCheckRetryDelay.getDelay(_count, result);
if (_future != null) {
_future.cancel(false); // In case this Runnable was invoked directly and not by scheduler
}
_future = _healthCheckExecutor.schedule(this, Math.max(0, delayMillis), TimeUnit.MILLISECONDS);
_scheduled = true;
}
} finally {
_running = false;
_lock.unlock();
}
}
}
private static final class SuccessfulHealthCheckResult implements HealthCheckResult {
private final String _endPointId;
private final long _responseTimeInNanos;
public SuccessfulHealthCheckResult(String endPointId, long responseTimeInNanos) {
_endPointId = endPointId;
_responseTimeInNanos = responseTimeInNanos;
}
@Override
public boolean isHealthy() {
return true;
}
@Override
public String getEndPointId() {
return _endPointId;
}
@Override
public long getResponseTime(TimeUnit unit) {
return unit.convert(_responseTimeInNanos, TimeUnit.NANOSECONDS);
}
@Override
public String toString() {
return Objects.toStringHelper(this)
.add("endPointId", _endPointId)
.toString();
}
}
private static final class FailedHealthCheckResult implements HealthCheckResult {
private final String _endPointId;
private final long _responseTimeInNanos;
private final Exception _exception;
public FailedHealthCheckResult(String endPointId, long responseTimeInNanos, Exception exception) {
_endPointId = endPointId;
_responseTimeInNanos = responseTimeInNanos;
_exception = exception;
}
public FailedHealthCheckResult(String endPointId, long responseTimeInNanos) {
this(endPointId, responseTimeInNanos, null);
}
@Override
public boolean isHealthy() {
return false;
}
@Override
public String getEndPointId() {
return _endPointId;
}
@Override
public long getResponseTime(TimeUnit unit) {
return unit.convert(_responseTimeInNanos, TimeUnit.NANOSECONDS);
}
public Exception getException() {
return _exception;
}
@Override
public String toString() {
return Objects.toStringHelper(this)
.add("endPointId", _endPointId)
.add("exception", _exception)
.toString();
}
}
}