com.bazaarvoice.ostrich.pool.ServicePool

package com.bazaarvoice.ostrich.pool;

import com.bazaarvoice.ostrich.HealthCheckResult;
import com.bazaarvoice.ostrich.HealthCheckResults;
import com.bazaarvoice.ostrich.HostDiscovery;
import com.bazaarvoice.ostrich.LoadBalanceAlgorithm;
import com.bazaarvoice.ostrich.PartitionContext;
import com.bazaarvoice.ostrich.PartitionContextBuilder;
import com.bazaarvoice.ostrich.RetryPolicy;
import com.bazaarvoice.ostrich.ServiceCallback;
import com.bazaarvoice.ostrich.ServiceEndPoint;
import com.bazaarvoice.ostrich.ServiceFactory;
import com.bazaarvoice.ostrich.ServicePoolStatistics;
import com.bazaarvoice.ostrich.exceptions.MaxRetriesException;
import com.bazaarvoice.ostrich.exceptions.NoAvailableHostsException;
import com.bazaarvoice.ostrich.exceptions.NoCachedInstancesAvailableException;
import com.bazaarvoice.ostrich.exceptions.NoSuitableHostsException;
import com.bazaarvoice.ostrich.exceptions.OnlyBadHostsException;
import com.bazaarvoice.ostrich.healthcheck.DefaultHealthCheckResults;
import com.bazaarvoice.ostrich.healthcheck.HealthCheckRetryDelay;
import com.bazaarvoice.ostrich.metrics.Metrics;
import com.bazaarvoice.ostrich.partition.PartitionFilter;
import com.codahale.metrics.Gauge;
import com.codahale.metrics.Meter;
import com.codahale.metrics.MetricRegistry;
import com.codahale.metrics.Timer;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Objects;
import com.google.common.base.Predicate;
import com.google.common.base.Predicates;
import com.google.common.base.Throwables;
import com.google.common.base.Ticker;
import com.google.common.cache.CacheBuilder;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.Set;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.Future;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;

import static com.google.common.base.Preconditions.checkNotNull;

class ServicePool<S> implements com.bazaarvoice.ostrich.ServicePool<S> {
    private static final Logger LOG = LoggerFactory.getLogger(ServicePool.class);

    /**
     * Number of seconds between bad endpoint health check verification runs.
     */
    private static final int HEALTH_CHECK_VERIFY_SECS = 30;

    private final Ticker _ticker;
    private final HostDiscovery _hostDiscovery;
    private final boolean _cleanupHostDiscoveryOnClose;
    private final HostDiscovery.EndPointListener _hostDiscoveryListener;
    private final ServiceFactory<S> _serviceFactory;
    private final ScheduledExecutorService _healthCheckExecutor;
    private final boolean _shutdownHealthCheckExecutorOnClose;
    private final PartitionFilter _partitionFilter;
    private final LoadBalanceAlgorithm _loadBalanceAlgorithm;
    private final ServicePoolStatistics _servicePoolStatistics;
    private final ConcurrentMap<ServiceEndPoint, HealthCheck> _badEndPoints;
    private final Predicate<ServiceEndPoint> _badEndPointFilter;
    private final Set<ServiceEndPoint> _recentlyRemovedEndPoints;
    private final ServiceCache<S> _serviceCache;
    private final Metrics.InstanceMetrics _metrics;
    private final Timer _callbackExecutionTime;
    private final Timer _healthCheckTime;
    private final Meter _numExecuteSuccesses;
    private final Meter _numExecuteAttemptFailures;
    private final HealthCheckRetryDelay _healthCheckRetryDelay;

    ServicePool(Ticker ticker, HostDiscovery hostDiscovery, boolean cleanupHostDiscoveryOnClose,
                ServiceFactory<S> serviceFactory, ServiceCachingPolicy cachingPolicy,
                PartitionFilter partitionFilter, LoadBalanceAlgorithm loadBalanceAlgorithm,
                ScheduledExecutorService healthCheckExecutor, boolean shutdownHealthCheckExecutorOnClose,
                HealthCheckRetryDelay healthCheckRetryDelay, MetricRegistry metrics) {
        _healthCheckRetryDelay = checkNotNull(healthCheckRetryDelay);
        _ticker = checkNotNull(ticker);
        _hostDiscovery = checkNotNull(hostDiscovery);
        _cleanupHostDiscoveryOnClose = cleanupHostDiscoveryOnClose;
        _serviceFactory = checkNotNull(serviceFactory);
        _healthCheckExecutor = checkNotNull(healthCheckExecutor);
        _shutdownHealthCheckExecutorOnClose = shutdownHealthCheckExecutorOnClose;
        _badEndPoints = Maps.newConcurrentMap();
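        // Predicates.in() evaluates against the live key set, so the filter built here will
        // automatically reflect end points that are later added to or removed from _badEndPoints.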
        _badEndPointFilter = Predicates.not(Predicates.in(_badEndPoints.keySet()));
        _recentlyRemovedEndPoints = Sets.newSetFromMap(CacheBuilder.newBuilder()
                .ticker(_ticker)
                .expireAfterWrite(10, TimeUnit.MINUTES)  // TODO: Make this a constant
                .<ServiceEndPoint, Boolean>build()
                .asMap());
        checkNotNull(cachingPolicy);
        _serviceCache = new ServiceCacheBuilder<S>()
                .withServiceFactory(serviceFactory)
                .withCachingPolicy(cachingPolicy)
                .withMetricRegistry(metrics)
                .build();
        _partitionFilter = checkNotNull(partitionFilter);
        _loadBalanceAlgorithm = checkNotNull(loadBalanceAlgorithm);

        _servicePoolStatistics = new ServicePoolStatistics() {
            @Override
            public int getNumIdleCachedInstances(ServiceEndPoint endPoint) {
                return _serviceCache.getNumIdleInstances(endPoint);
            }

            @Override
            public int getNumActiveInstances(ServiceEndPoint endPoint) {
                return _serviceCache.getNumActiveInstances(endPoint);
            }
        };

        // Watch end points as they are removed from host discovery so that we can remove them from our set of bad
        // end points as well.  This will prevent the bad end points set from growing in an unbounded fashion.
        // There is a minor race condition that could happen here, but it's not anything to be concerned about.  The
        // HostDiscovery component could lose its connection to its backing data store and then immediately regain it
        // right afterwards.  If that happens it could remove all of its end points only to re-add them right back again
        // and we will "forget" that an end point was bad and try to use it again.  This isn't fatal though because
        // we'll just rediscover that it's a bad end point again in the future.  Also in the future it might be useful
        // to measure how long an end point has been considered bad and potentially take action for end points that are
        // bad for long periods of time.
        _hostDiscoveryListener = new HostDiscovery.EndPointListener() {
            @Override
            public void onEndPointAdded(ServiceEndPoint endPoint) {
                addEndPoint(endPoint);
            }

            @Override
            public void onEndPointRemoved(ServiceEndPoint endPoint) {
                removeEndPoint(endPoint);
            }
        };
        _hostDiscovery.addListener(_hostDiscoveryListener);

        _metrics = Metrics.forInstance(metrics, this, _serviceFactory.getServiceName());
        _callbackExecutionTime = _metrics.timer("callback-execution-time");
        _healthCheckTime = _metrics.timer("health-check-time");
        _numExecuteSuccesses = _metrics.meter("num-execute-successes");
        _numExecuteAttemptFailures = _metrics.meter("num-execute-attempt-failures");
        _metrics.gauge("num-valid-end-points", new Gauge() {
            @Override
            public Integer getValue() {
                return getNumValidEndPoints();
            }
        });
        _metrics.gauge("num-bad-end-points", new Gauge() {
            @Override
            public Integer getValue() {
                return getNumBadEndPoints();
            }
        });

        // Periodically ensure that health checks for unhealthy endpoints are still running
        _healthCheckExecutor.scheduleAtFixedRate(
                new HealthCheckVerifier(),
                HEALTH_CHECK_VERIFY_SECS,
                HEALTH_CHECK_VERIFY_SECS,
                TimeUnit.SECONDS);
    }

    @Override
    public void close() {
        for (HealthCheck healthCheck : _badEndPoints.values()) {
            healthCheck.cancel(true);
        }

        _hostDiscovery.removeListener(_hostDiscoveryListener);
        if (_cleanupHostDiscoveryOnClose) {
            try {
                _hostDiscovery.close();
            } catch (IOException e) {
                // NOP
            }
        }

        _serviceCache.close();
        _metrics.close();

        if (_shutdownHealthCheckExecutorOnClose) {
            _healthCheckExecutor.shutdownNow();
        }
    }

    @Override
    public <R> R execute(RetryPolicy retry, ServiceCallback<S, R> callback) {
        return execute(PartitionContextBuilder.empty(), retry, callback);
    }

    @Override
    public <R> R execute(PartitionContext partitionContext, RetryPolicy retry, ServiceCallback<S, R> callback) {
        final long start = _ticker.read();
        int numAttempts = 0;
        Exception lastException = null;

        do {
            Iterable<ServiceEndPoint> allEndPoints = getAllEndPoints();
            if (Iterables.isEmpty(allEndPoints)) {
                throw (lastException == null)
                        ? new NoAvailableHostsException(String.format("No endpoints discovered for service %s", getServiceName()))
                        : new NoAvailableHostsException(lastException.getMessage(), lastException);
            }

            Iterable<ServiceEndPoint> validEndPoints = getValidEndPoints(allEndPoints);
            if (Iterables.isEmpty(validEndPoints)) {
                throw (lastException == null)
                        ? new OnlyBadHostsException(String.format("No valid endpoints discovered for service %s, all endpoints: %s", getServiceName(), allEndPoints))
                        : new OnlyBadHostsException(lastException.getMessage(), lastException);
            }

            ServiceEndPoint endPoint = chooseEndPoint(validEndPoints, partitionContext);
            if (endPoint == null) {
                throw (lastException == null)
                        ? new NoSuitableHostsException(String.format("No suitable endpoint discovered for service %s from valid endpoints %s", getServiceName(), validEndPoints))
                        : new NoSuitableHostsException(lastException);
            }

            try {
                R result = executeOnEndPoint(endPoint, callback);
                _numExecuteSuccesses.mark();
                return result;
            } catch (Exception e) {
                _numExecuteAttemptFailures.mark();

                // Don't retry if exception is too severe.
                if (!isRetriableException(e)) {
                    throw Throwables.propagate(e);
                }

                LOG.info("Retriable exception from end point: {}, {}", endPoint, e.toString());
                LOG.debug("Exception", e);
                lastException = e;
            }
        }
        while (retry.allowRetry(++numAttempts, TimeUnit.NANOSECONDS.toMillis(_ticker.read() - start)));

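        // Every attempt failed with a retriable exception and the retry policy declined another
        // try.  Note that allowRetry() is passed the number of attempts already made, so a policy
        // that allows retries while numAttempts < N permits at most N attempts in total.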
        throw new MaxRetriesException(lastException);
    }

    @Override
    public int getNumValidEndPoints() {
        return Iterables.size(_hostDiscovery.getHosts()) - _badEndPoints.size();
    }

    @Override
    public int getNumBadEndPoints() {
        return _badEndPoints.size();
    }

    /**
     * Determine the set of all {@link ServiceEndPoint}s.
     * <p/>
     * NOTE: This method is package private specifically so that {@link AsyncServicePool} can call it.
     */
    Iterable<ServiceEndPoint> getAllEndPoints() {
        return _hostDiscovery.getHosts();
    }

    /**
     * Determine the set of usable {@link ServiceEndPoint}s.
     */
    private Iterable<ServiceEndPoint> getValidEndPoints(Iterable<ServiceEndPoint> endPoints) {
        return Iterables.filter(endPoints, _badEndPointFilter);
    }

    private ServiceEndPoint chooseEndPoint(Iterable<ServiceEndPoint> endPoints, PartitionContext partitionContext) {
        endPoints = _partitionFilter.filter(endPoints, partitionContext);
        if (endPoints == null || Iterables.isEmpty(endPoints)) {
            return null;
        }

        ServiceEndPoint endPoint = _loadBalanceAlgorithm.choose(endPoints, _servicePoolStatistics);
        if (endPoint == null) {
            return null;
        }
        return endPoint;
    }

    /**
     * Execute a callback on a specific end point.
     * <p/>
     * NOTE: This method is package private specifically so that {@link AsyncServicePool} can call it.
     */
    <R> R executeOnEndPoint(ServiceEndPoint endPoint, ServiceCallback<S, R> callback) throws Exception {
        ServiceHandle<S> handle = null;
        try {
            handle = _serviceCache.checkOut(endPoint);

            Timer.Context timer = _callbackExecutionTime.time();
            try {
                return callback.call(handle.getService());
            } finally {
                timer.stop();
            }
        } catch (NoCachedInstancesAvailableException e) {
            LOG.info("Service cache exhausted. End point: {}", endPoint, e);

            // Don't mark an end point as bad just because there are no cached end points for it.
            throw e;
        } catch (Exception e) {
            if (_serviceFactory.isRetriableException(e)) {
                // This is a known and supported exception indicating that something went wrong somewhere in the service
                // layer while trying to communicate with the end point.  These errors are often transient, so we
                // enqueue a health check for the end point and mark it as unavailable for the time being.
                markEndPointAsBad(endPoint);
                LOG.info("Bad end point discovered. End point: {}", endPoint, e);
            }
            throw e;
        } finally {
            if (handle != null) {
                try {
                    _serviceCache.checkIn(handle);
                } catch (Exception e) {
                    // This should never happen, but log just in case.
                    LOG.warn("Error returning end point to cache. End point: {}, {}", endPoint, e.toString());
                    LOG.debug("Exception", e);
                }
            }
        }
    }

    /**
     * Check if an exception is retriable.
     * <p/>
     * NOTE: This method is package private specifically so that {@link AsyncServicePool} can call it.
     */
    boolean isRetriableException(Exception exception) {
        return _serviceFactory.isRetriableException(exception);
    }

    /**
     * NOTE: This method is package private specifically so that {@link AsyncServicePool} can call it.
     *
     * @return The name of the service for this pool.
     */
    String getServiceName() {
        return _serviceFactory.getServiceName();
    }

    @VisibleForTesting
    HostDiscovery getHostDiscovery() {
        return _hostDiscovery;
    }

    @VisibleForTesting
    PartitionFilter getPartitionFilter() {
        return _partitionFilter;
    }

    @VisibleForTesting
    LoadBalanceAlgorithm getLoadBalanceAlgorithm() {
        return _loadBalanceAlgorithm;
    }

    @VisibleForTesting
    ServicePoolStatistics getServicePoolStatistics() {
        return _servicePoolStatistics;
    }

    @VisibleForTesting
    Set<ServiceEndPoint> getBadEndPoints() {
        return ImmutableSet.copyOf(_badEndPoints.keySet());
    }

    @Override
    public HealthCheckResults checkForHealthyEndPoint() {
        DefaultHealthCheckResults aggregate = new DefaultHealthCheckResults();

        Iterable<ServiceEndPoint> allEndPoints = getAllEndPoints();
        if (Iterables.isEmpty(allEndPoints)) {
            // There were no end points
            return aggregate;
        }

        Iterable<ServiceEndPoint> validEndPoints = getValidEndPoints(allEndPoints);
        if (Iterables.isEmpty(validEndPoints)) {
            // There were no valid end points
            return aggregate;
        }

        Set<ServiceEndPoint> endPoints = Sets.newHashSet(validEndPoints);
        while (!endPoints.isEmpty()) {
            // Prefer end points in the order the load balancer recommends.
            ServiceEndPoint endPoint = chooseEndPoint(endPoints, PartitionContextBuilder.empty());
            if (endPoint == null) {
                // Load balancer didn't like our end points, so just go sequentially.
                endPoint = endPoints.iterator().next();
            }

            HealthCheckResult result = checkHealth(endPoint);
            aggregate.addHealthCheckResult(result);

            if (!result.isHealthy()) {
                Exception exception = ((FailedHealthCheckResult) result).getException();
                if (exception == null || isRetriableException(exception)) {
                    LOG.info("Unhealthy end point discovered. End point {}", endPoint);
                    endPoints.remove(endPoint);
                    markEndPointAsBad(endPoint);
                    continue;
                }
            }
            break;
        }

        return aggregate;
    }

    /**
     * Run the health checks on all current unhealthy end points.  This method blocks until the
     * health checks have completed.
     */
    @VisibleForTesting
    void forceHealthChecks() {
        for (HealthCheck healthCheck : _badEndPoints.values()) {
            healthCheck.run();
        }
    }

    private synchronized void addEndPoint(ServiceEndPoint endPoint) {
        _recentlyRemovedEndPoints.remove(endPoint);
        markEndPointAsBad(endPoint);
        LOG.debug("End point added to service pool. End point: {}", endPoint);
    }

    private synchronized void removeEndPoint(ServiceEndPoint endPoint) {
        // Mark this end point as recently removed.  We do this in order to keep a positive set of removed
        // end points so that we avoid a potential race condition where someone was using this end point while
        // we noticed it disappeared from host discovery.  In that case there is the potential that they
        // would add it to the bad end points set after we've already processed the removal, thus leading to a
        // memory leak in the bad end points set.  Having this time-limited view of the recently removed
        // end points ensures that this memory leak doesn't happen.
        _recentlyRemovedEndPoints.add(endPoint);
        _badEndPoints.remove(endPoint);
        _serviceCache.evict(endPoint);
        LOG.debug("End point removed from service pool. End point: {}", endPoint);
    }

    private synchronized void markEndPointAsBad(ServiceEndPoint endPoint) {
        if (_recentlyRemovedEndPoints.contains(endPoint)) {
            // Nothing to do, we've already removed this end point
            return;
        }

        _serviceCache.evict(endPoint);

        // Only schedule a health check if this is the first time we've seen this end point as bad...
        HealthCheck healthCheck = new HealthCheck(endPoint);
        if (_badEndPoints.putIfAbsent(endPoint, healthCheck) == null) {
            healthCheck.start();
        }
    }

    @VisibleForTesting
    HealthCheckResult checkHealth(ServiceEndPoint endPoint) {
        // We have to be very careful to not allow any exceptions to make it out of this method, if they do then
        // subsequent scheduled invocations of the Runnable may not happen, and we could stop checking health checks
        // completely.  So we intentionally handle all possible exceptions here.
        final long start = _ticker.read();

        boolean isHealthy;
        Exception exception = null;
        try {
            isHealthy = _serviceFactory.isHealthy(endPoint);
        } catch (Exception e) {
            isHealthy = false;
            exception = e;
        }

        final long duration = _ticker.read() - start;
        _healthCheckTime.update(duration, TimeUnit.NANOSECONDS);

        return isHealthy
                ? new SuccessfulHealthCheckResult(endPoint.getId(), duration)
                : new FailedHealthCheckResult(endPoint.getId(), duration, exception);
    }

    @VisibleForTesting
    final class HealthCheckVerifier implements Runnable {
        @Override
        public void run() {
            try {
                for (HealthCheck healthCheck : _badEndPoints.values()) {
                    healthCheck.verifyScheduling();
                }
            } catch (Throwable ex) {
                LOG.warn("Error rescheduling health checks", ex);
            }
        }
    }

    @VisibleForTesting
    final class HealthCheck implements Runnable {
        private final ServiceEndPoint _endPoint;
        private final Lock _lock = new ReentrantLock();
        private int _count = 0;
        private Future<?> _future;
        private boolean _cancelled;
        private boolean _scheduled;
        private boolean _running;

        public HealthCheck(ServiceEndPoint endPoint) {
            _endPoint = endPoint;
        }

        public void start() {
            _lock.lock();
            try {
                assert _future == null && !_cancelled;
                _future = _healthCheckExecutor.submit(this);
                _scheduled = true;
            } finally {
                _lock.unlock();
            }
        }

        public void cancel(boolean mayInterruptIfRunning) {
            _lock.lock();
            try {
                if (_future != null) {
                    _future.cancel(mayInterruptIfRunning);
                    _future = null;
                }
                _cancelled = true;
            } finally {
                _lock.unlock();
            }
        }

        public void verifyScheduling() {
            _lock.lock();
            try {
                if (!_cancelled && !_running && !_scheduled) {
                    _healthCheckExecutor.submit(this);
                    _scheduled = true;
                }
            } finally {
                _lock.unlock();
            }
        }

        @Override
        public void run() {
            _lock.lock();
            try {
                if (_cancelled || _badEndPoints.get(_endPoint) != this) {
                    return;
                }

                _scheduled = false;
                _running = true;

                // Don't perform health check operation in lock as it could cause deadlock on cancel if
                // health check stalls.
                HealthCheckResult result;
                _lock.unlock();
                try {
                    result = checkHealth(_endPoint);
                } finally {
                    _lock.lock();
                }

                _count += 1;

                if (result.isHealthy()) {
                    _serviceCache.register(_endPoint);
                    _badEndPoints.remove(_endPoint, this);
                    this.cancel(false);
                } else {
                    long delayMillis = _healthCheckRetryDelay.getDelay(_count, result);
                    if (_future != null) {
                        _future.cancel(false);  // In case this Runnable was invoked directly and not by scheduler
                    }
                    _future = _healthCheckExecutor.schedule(this, Math.max(0, delayMillis), TimeUnit.MILLISECONDS);
                    _scheduled = true;
                }
            } finally {
                _running = false;
                _lock.unlock();
            }
        }
    }

    private static final class SuccessfulHealthCheckResult implements HealthCheckResult {
        private final String _endPointId;
        private final long _responseTimeInNanos;

        public SuccessfulHealthCheckResult(String endPointId, long responseTimeInNanos) {
            _endPointId = endPointId;
            _responseTimeInNanos = responseTimeInNanos;
        }

        @Override
        public boolean isHealthy() {
            return true;
        }

        @Override
        public String getEndPointId() {
            return _endPointId;
        }

        @Override
        public long getResponseTime(TimeUnit unit) {
            return unit.convert(_responseTimeInNanos, TimeUnit.NANOSECONDS);
        }

        @Override
        public String toString() {
            return Objects.toStringHelper(this)
                    .add("endPointId", _endPointId)
                    .toString();
        }
    }

    private static final class FailedHealthCheckResult implements HealthCheckResult {
        private final String _endPointId;
        private final long _responseTimeInNanos;
        private final Exception _exception;

        public FailedHealthCheckResult(String endPointId, long responseTimeInNanos, Exception exception) {
            _endPointId = endPointId;
            _responseTimeInNanos = responseTimeInNanos;
            _exception = exception;
        }

        public FailedHealthCheckResult(String endPointId, long responseTimeInNanos) {
            this(endPointId, responseTimeInNanos, null);
        }

        @Override
        public boolean isHealthy() {
            return false;
        }

        @Override
        public String getEndPointId() {
            return _endPointId;
        }

        @Override
        public long getResponseTime(TimeUnit unit) {
            return unit.convert(_responseTimeInNanos, TimeUnit.NANOSECONDS);
        }

        public Exception getException() {
            return _exception;
        }

        @Override
        public String toString() {
            return Objects.toStringHelper(this)
                    .add("endPointId", _endPointId)
                    .add("exception", _exception)
                    .toString();
        }
    }
}
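
For orientation, here is a minimal usage sketch of the execute contract implemented above, written only against interfaces this file already references (com.bazaarvoice.ostrich.ServicePool, ServiceCallback, RetryPolicy). It is not part of the library source: the ServicePoolUsageSketch class, the MyService interface and its lookup method are hypothetical placeholders, and the sketch assumes RetryPolicy exposes only the allowRetry(int, long) method that execute calls above. In practice a pool instance would normally be obtained from a builder in this package rather than by constructing this package-private class directly.

package com.bazaarvoice.ostrich.examples;  // hypothetical package for this sketch only

import com.bazaarvoice.ostrich.RetryPolicy;
import com.bazaarvoice.ostrich.ServiceCallback;
import com.bazaarvoice.ostrich.ServicePool;

public class ServicePoolUsageSketch {
    /** Hypothetical client interface; stands in for whatever service type S the pool was built for. */
    public interface MyService {
        String lookup(String key);
    }

    /** Inline retry policy: allow up to three attempts within five seconds. */
    private static final RetryPolicy THREE_ATTEMPTS = new RetryPolicy() {
        @Override
        public boolean allowRetry(int numAttempts, long elapsedTimeMillis) {
            return numAttempts < 3 && elapsedTimeMillis < 5000;
        }
    };

    /**
     * Runs one lookup through the pool.  ServicePool.execute chooses a valid end point, checks a
     * client out of the service cache, invokes the callback, and retries retriable failures until
     * the policy declines.
     */
    public static String lookupWithRetry(ServicePool<MyService> pool, final String key) {
        return pool.execute(THREE_ATTEMPTS, new ServiceCallback<MyService, String>() {
            @Override
            public String call(MyService service) {
                return service.lookup(key);
            }
        });
    }
}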