All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.brooklyn.policy.ha.AbstractFailureDetector Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.brooklyn.policy.ha;

import static org.apache.brooklyn.util.time.Time.makeTimeStringRounded;

import java.util.concurrent.Callable;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicReference;

import org.apache.brooklyn.api.entity.EntityLocal;
import org.apache.brooklyn.api.mgmt.Task;
import org.apache.brooklyn.api.sensor.Sensor;
import org.apache.brooklyn.config.ConfigKey;
import org.apache.brooklyn.core.config.ConfigKeys;
import org.apache.brooklyn.core.entity.EntityInternal;
import org.apache.brooklyn.core.mgmt.BrooklynTaskTags;
import org.apache.brooklyn.core.policy.AbstractPolicy;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.brooklyn.policy.ha.HASensors.FailureDescriptor;
import org.apache.brooklyn.util.collections.MutableMap;
import org.apache.brooklyn.util.core.flags.SetFromFlag;
import org.apache.brooklyn.util.core.task.BasicTask;
import org.apache.brooklyn.util.core.task.ScheduledTask;
import org.apache.brooklyn.util.exceptions.Exceptions;
import org.apache.brooklyn.util.time.Duration;
import org.apache.brooklyn.util.time.Time;

import com.google.common.reflect.TypeToken;

public abstract class AbstractFailureDetector extends AbstractPolicy {

    // TODO Remove duplication from ServiceFailureDetector, particularly for the stabilisation delays.

    private static final Logger LOG = LoggerFactory.getLogger(AbstractFailureDetector.class);

    private static final long MIN_PERIOD_BETWEEN_EXECS_MILLIS = 100;

    public static final ConfigKey POLL_PERIOD = ConfigKeys.newDurationConfigKey(
            "failureDetector.pollPeriod", "", Duration.ONE_SECOND);

    @SetFromFlag("failedStabilizationDelay")
    public static final ConfigKey FAILED_STABILIZATION_DELAY = ConfigKeys.newDurationConfigKey(
            "failureDetector.serviceFailedStabilizationDelay",
            "Time period for which the health check consistently fails "
                    + "(e.g. doesn't report failed-ok-faled) before concluding failure.",
            Duration.ZERO);

    @SetFromFlag("recoveredStabilizationDelay")
    public static final ConfigKey RECOVERED_STABILIZATION_DELAY = ConfigKeys.newDurationConfigKey(
            "failureDetector.serviceRecoveredStabilizationDelay",
            "Time period for which the health check succeeds continiually " +
                    "(e.g. doesn't report ok-failed-ok) before concluding recovered",
            Duration.ZERO);

    @SuppressWarnings("serial")
    public static final ConfigKey> SENSOR_FAILED = ConfigKeys.newConfigKey(new TypeToken>() {},
            "failureDetector.sensor.fail", "A sensor which will indicate failure when set", HASensors.ENTITY_FAILED);

    @SuppressWarnings("serial")
    public static final ConfigKey> SENSOR_RECOVERED = ConfigKeys.newConfigKey(new TypeToken>() {},
            "failureDetector.sensor.recover", "A sensor which will indicate recovery from failure when set", HASensors.ENTITY_RECOVERED);

    /**
     * Result of a single health calculation: a healthy/unhealthy verdict plus a
     * human-readable explanation that is included in log messages and published descriptors.
     */
    public interface CalculatedStatus {

        /** @return {@code true} if the check considers the target healthy */
        public abstract boolean isHealthy();

        /** @return free-text description of the status that was calculated */
        public abstract String getDescription();
    }

    private final class PublishJob implements Runnable {
        @Override public void run() {
            try {
                executorTime = System.currentTimeMillis();
                executorQueued.set(false);

                publishNow();

            } catch (Exception e) {
                if (isRunning()) {
                    LOG.error("Problem resizing: "+e, e);
                } else {
                    if (LOG.isDebugEnabled()) LOG.debug("Problem resizing, but no longer running: "+e, e);
                }
            } catch (Throwable t) {
                LOG.error("Problem in service-failure-detector: "+t, t);
                throw Exceptions.propagate(t);
            }
        }
    }

    /** Runnable body of each polling task; it simply runs the enclosing detector's health check. */
    private final class HealthPoller implements Runnable {
        @Override public void run() {
            AbstractFailureDetector.this.checkHealth();
        }
    }

    private final class HealthPollingTaskFactory implements Callable> {
        @Override
        public Task call() {
            BasicTask task = new BasicTask(new HealthPoller());
            BrooklynTaskTags.setTransient(task);
            return task;
        }
    }

    /**
     * Simple immutable {@link CalculatedStatus} holding the two values passed to its constructor.
     */
    protected static class BasicCalculatedStatus implements CalculatedStatus {
        // Never reassigned after construction, so declared final.
        private final boolean healthy;
        private final String description;

        /**
         * @param healthy     value to be returned by {@link #isHealthy()}
         * @param description value to be returned by {@link #getDescription()}
         */
        public BasicCalculatedStatus(boolean healthy, String description) {
            this.healthy = healthy;
            this.description = description;
        }

        @Override
        public boolean isHealthy() {
            return healthy;
        }

        @Override
        public String getDescription() {
            return description;
        }
    }

    /** Which notification, if any, this detector emitted most recently. */
    public enum LastPublished {
        /** Nothing emitted yet; also the value restored by {@code resume()}. */
        NONE,

        /** The failure sensor was the last one emitted. */
        FAILED,

        /** The recovery sensor was the last one emitted. */
        RECOVERED
    }

    protected final AtomicReference stateLastGood = new AtomicReference();
    protected final AtomicReference stateLastFail = new AtomicReference();

    protected Long currentFailureStartTime = null;
    protected Long currentRecoveryStartTime = null;

    protected LastPublished lastPublished = LastPublished.NONE;

    private final AtomicBoolean executorQueued = new AtomicBoolean(false);
    private volatile long executorTime = 0;

    private Callable> pollingTaskFactory = new HealthPollingTaskFactory();

    private Task scheduledTask;

    /**
     * Computes the current health of whatever this detector monitors.
     * <p>
     * Called on every poll and again immediately before publishing. The result is
     * dereferenced without a null check, so implementations must not return {@code null}.
     *
     * @return the calculated status
     */
    protected abstract CalculatedStatus calculateStatus();

    /**
     * Attaches this policy to the entity and, provided the policy is running,
     * starts the periodic health polling.
     */
    @Override
    public void setEntity(EntityLocal entity) {
        super.setEntity(entity);
        if (!isRunning()) {
            return;
        }
        doStartPolling();
    }

    /**
     * Cancels the polling task (if one was ever started) and suspends the policy.
     */
    @Override
    public void suspend() {
        // scheduledTask is only assigned by doStartPolling(), so it is still null when
        // suspend() is invoked before polling began; guard against the NPE.
        if (scheduledTask != null) {
            scheduledTask.cancel(true);
        }
        super.suspend();
    }

    /**
     * Clears all failure/recovery tracking and publish-scheduling state, then
     * resumes the policy and restarts polling.
     */
    @Override
    public void resume() {
        // Start from a clean slate: nothing published, no failure or recovery in progress.
        lastPublished = LastPublished.NONE;
        currentRecoveryStartTime = null;
        currentFailureStartTime = null;

        // Forget any previously queued/executed publish job.
        executorQueued.set(false);
        executorTime = 0;

        super.resume();
        doStartPolling();
    }

    /**
     * Submits the periodic polling task to the entity's execution context, unless a
     * previously submitted polling task exists and is not yet done.
     */
    @SuppressWarnings("unchecked")
    protected void doStartPolling() {
        final boolean pollingActive = (scheduledTask != null) && !scheduledTask.isDone();
        if (pollingActive) {
            return;
        }

        ScheduledTask pollingTask = new ScheduledTask(
                MutableMap.of("period", getPollPeriod(), "displayName", getTaskName()),
                pollingTaskFactory);
        scheduledTask = ((EntityInternal)entity).getExecutionContext().submit(pollingTask);
    }

    /** @return the display name given to the polling task, which is this policy's display name */
    private String getTaskName() {
        return this.getDisplayName();
    }

    /** @return the configured value of {@link #POLL_PERIOD} */
    protected Duration getPollPeriod() {
        return this.getConfig(POLL_PERIOD);
    }

    /** @return the configured value of {@link #FAILED_STABILIZATION_DELAY} */
    protected Duration getFailedStabilizationDelay() {
        return this.getConfig(FAILED_STABILIZATION_DELAY);
    }

    /** @return the configured value of {@link #RECOVERED_STABILIZATION_DELAY} */
    protected Duration getRecoveredStabilizationDelay() {
        return this.getConfig(RECOVERED_STABILIZATION_DELAY);
    }

    /** @return the sensor on which failure is published, as configured by {@link #SENSOR_FAILED} */
    protected Sensor<FailureDescriptor> getSensorFailed() {
        return getConfig(SENSOR_FAILED);
    }

    /** @return the sensor on which recovery is published, as configured by {@link #SENSOR_RECOVERED} */
    protected Sensor<FailureDescriptor> getSensorRecovered() {
        return getConfig(SENSOR_RECOVERED);
    }

    /**
     * Runs one health poll: calculates the status, records the time of the observation,
     * and updates the in-progress failure/recovery tracking. A publish is scheduled when
     * a new run of failures (or, after a published failure, a new run of recovery) begins.
     */
    private synchronized void checkHealth() {
        final CalculatedStatus status = calculateStatus();
        final boolean healthy = status.isHealthy();
        final long now = System.currentTimeMillis();

        if (!healthy) {
            noteFailingPoll(status, now);
        } else {
            noteHealthyPoll(status, now);
        }
    }

    /** Handles a healthy poll result; the caller holds this object's monitor. */
    private void noteHealthyPoll(CalculatedStatus status, long now) {
        stateLastGood.set(now);
        final boolean failurePublished = (lastPublished == LastPublished.FAILED);

        if (failurePublished && currentRecoveryStartTime == null) {
            // First healthy poll since failure was published: recovery run starts here.
            LOG.info("{} check for {}, now recovering: {}", new Object[] {this, entity, getDescription(status)});
            currentRecoveryStartTime = now;
            schedulePublish();
        } else if (failurePublished) {
            if (LOG.isTraceEnabled()) LOG.trace("{} check for {}, continuing recovering: {}", new Object[] {this, entity, getDescription(status)});
        } else if (currentFailureStartTime != null) {
            // A failure run was in progress but never got published; abandon it.
            LOG.info("{} check for {}, now healthy: {}", new Object[] {this, entity, getDescription(status)});
            currentFailureStartTime = null;
        } else {
            if (LOG.isTraceEnabled()) LOG.trace("{} check for {}, still healthy: {}", new Object[] {this, entity, getDescription(status)});
        }
    }

    /** Handles a failing poll result; the caller holds this object's monitor. */
    private void noteFailingPoll(CalculatedStatus status, long now) {
        stateLastFail.set(now);
        final boolean failurePublished = (lastPublished == LastPublished.FAILED);

        if (!failurePublished && currentFailureStartTime == null) {
            // First failing poll while failure is not the published state: failure run starts here.
            LOG.info("{} check for {}, now failing: {}", new Object[] {this, entity, getDescription(status)});
            currentFailureStartTime = now;
            schedulePublish();
        } else if (!failurePublished) {
            if (LOG.isTraceEnabled()) LOG.trace("{} check for {}, continuing failing: {}", new Object[] {this, entity, getDescription(status)});
        } else if (currentRecoveryStartTime != null) {
            // A recovery run was in progress but never got published; abandon it.
            LOG.info("{} check for {}, now failing: {}", new Object[] {this, entity, getDescription(status)});
            currentRecoveryStartTime = null;
        } else {
            if (LOG.isTraceEnabled()) LOG.trace("{} check for {}, still failed: {}", new Object[] {this, entity, getDescription(status)});
        }
    }

    /** Schedules a publish job with no requested delay. */
    protected void schedulePublish() {
        this.schedulePublish(0L);
    }

    /**
     * Queues a {@link PublishJob} unless the policy is not running or a job is already queued.
     * <p>
     * The effective delay is never negative and is extended where necessary so that the job
     * does not run sooner than {@code MIN_PERIOD_BETWEEN_EXECS_MILLIS} after the previous run.
     *
     * @param delay requested delay in milliseconds before the job should run
     */
    @SuppressWarnings("unchecked")
    protected void schedulePublish(long delay) {
        if (isRunning() && executorQueued.compareAndSet(false, true)) {
            boolean submitted = false;
            try {
                long now = System.currentTimeMillis();
                delay = Math.max(0, Math.max(delay, (executorTime + MIN_PERIOD_BETWEEN_EXECS_MILLIS) - now));
                if (LOG.isTraceEnabled()) LOG.trace("{} scheduling publish in {}ms", this, delay);

                Runnable job = new PublishJob();

                ScheduledTask task = new ScheduledTask(MutableMap.of("delay", Duration.of(delay, TimeUnit.MILLISECONDS)), new BasicTask<Void>(job));
                ((EntityInternal)entity).getExecutionContext().submit(task);
                submitted = true;
            } finally {
                // If the job never got submitted it will never clear the flag itself;
                // release it here so later publishes are not silently dropped.
                if (!submitted) executorQueued.set(false);
            }
        }
    }

    /**
     * Re-calculates the status and, if it differs from what was last published and has
     * held for longer than the relevant stabilization delay, emits the failure or recovery
     * sensor. If the status has not yet been stable for long enough, another publish
     * attempt is scheduled for when the remaining time has elapsed. Does nothing when the
     * policy is not running.
     */
    private synchronized void publishNow() {
        if (!isRunning()) {
            return;
        }

        final CalculatedStatus status = calculateStatus();
        final boolean healthy = status.isHealthy();

        final Long lastGoodAt = stateLastGood.get();
        final Long lastFailAt = stateLastFail.get();
        final long failedDelayMillis = getFailedStabilizationDelay().toMilliseconds();
        final long recoveredDelayMillis = getRecoveredStabilizationDelay().toMilliseconds();
        final long now = System.currentTimeMillis();

        final boolean failurePublished = (lastPublished == LastPublished.FAILED);

        if (healthy && failurePublished) {
            // only publish if consistently up for the recovered-stabilization delay
            final long recoveringFor = getTimeDiff(now, currentRecoveryStartTime);
            final long sinceLastFail = getTimeDiff(now, lastFailAt);
            final boolean stable = recoveringFor > recoveredDelayMillis && sinceLastFail > recoveredDelayMillis;

            if (!stable) {
                final long retryIn = Math.max(recoveredDelayMillis - recoveringFor, recoveredDelayMillis - sinceLastFail);
                schedulePublish(retryIn);
            } else {
                final String description = getDescription(status);
                LOG.warn("{} check for {}, publishing recovered: {}", new Object[] {this, entity, description});
                entity.sensors().emit(getSensorRecovered(), new HASensors.FailureDescriptor(entity, description));
                lastPublished = LastPublished.RECOVERED;
                currentFailureStartTime = null;
            }
        } else if (!healthy && !failurePublished) {
            // only publish if consistently down for the failed-stabilization delay
            final long failingFor = getTimeDiff(now, currentFailureStartTime);
            final long sinceLastGood = getTimeDiff(now, lastGoodAt);
            final boolean stable = failingFor > failedDelayMillis && sinceLastGood > failedDelayMillis;

            if (!stable) {
                final long retryIn = Math.max(failedDelayMillis - failingFor, failedDelayMillis - sinceLastGood);
                schedulePublish(retryIn);
            } else {
                final String description = getDescription(status);
                LOG.warn("{} connectivity-check for {}, publishing failed: {}", new Object[] {this, entity, description});
                entity.sensors().emit(getSensorFailed(), new HASensors.FailureDescriptor(entity, description));
                lastPublished = LastPublished.FAILED;
                currentRecoveryStartTime = null;
            }
        }
        // Otherwise the calculated status already matches what was last published: nothing to do.
    }

    /**
     * Builds the diagnostic text used in log messages and published failure descriptors.
     * It combines the status's own description with the detector's timing state: current
     * time, last healthy/failing observation times, what was last published, and how long
     * the current failure/recovery run has lasted relative to its stabilization delay.
     *
     * @param status the status to describe
     * @return the formatted description
     */
    protected String getDescription(CalculatedStatus status) {
        Long lastUpTime = stateLastGood.get();
        // Read from stateLastFail: the "lastDown" value previously mirrored lastUp by mistake.
        Long lastDownTime = stateLastFail.get();
        Duration serviceFailedStabilizationDelay = getFailedStabilizationDelay();
        Duration serviceRecoveredStabilizationDelay = getRecoveredStabilizationDelay();

        return String.format("%s; healthy=%s; timeNow=%s; lastUp=%s; lastDown=%s; lastPublished=%s; "+
                    "currentFailurePeriod=%s; currentRecoveryPeriod=%s",
                status.getDescription(),
                status.isHealthy(),
                Time.makeDateString(System.currentTimeMillis()),
                (lastUpTime != null ? Time.makeDateString(lastUpTime) : ""),
                (lastDownTime != null ? Time.makeDateString(lastDownTime) : ""),
                lastPublished,
                (currentFailureStartTime != null ? getTimeStringSince(currentFailureStartTime) : "") + " (stabilization "+makeTimeStringRounded(serviceFailedStabilizationDelay) + ")",
                (currentRecoveryStartTime != null ? getTimeStringSince(currentRecoveryStartTime) : "") + " (stabilization "+makeTimeStringRounded(serviceRecoveredStabilizationDelay) + ")");
    }

    /**
     * Returns {@code recent - previous}. When {@code previous} is {@code null} the value of
     * {@code recent} itself is returned, so an absent earlier timestamp yields a very long period.
     */
    private long getTimeDiff(Long recent, Long previous) {
        if (previous == null) {
            return recent;
        }
        return recent - previous;
    }

    /**
     * Formats the time elapsed between {@code time} and now as a rounded time string.
     *
     * @return the formatted elapsed time, or {@code null} if {@code time} is {@code null}
     */
    private String getTimeStringSince(Long time) {
        if (time == null) {
            return null;
        }
        long elapsedMillis = System.currentTimeMillis() - time;
        return Time.makeTimeStringRounded(elapsedMillis);
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy