All Downloads are FREE. Search and download functionalities are using the official Maven repository.

akka.remote.PhiAccrualFailureDetector.scala Maven / Gradle / Ivy

The newest version!
/**
 * Copyright (C) 2009-2014 Typesafe Inc. 
 */
package akka.remote

import akka.remote.FailureDetector.Clock
import java.util.concurrent.atomic.AtomicReference
import java.util.concurrent.TimeUnit.MILLISECONDS
import scala.annotation.tailrec
import scala.concurrent.duration.Duration
import scala.concurrent.duration.FiniteDuration
import scala.collection.immutable
import com.typesafe.config.Config
import akka.event.EventStream
import akka.util.Helpers.ConfigOps

/**
 * Implementation of 'The Phi Accrual Failure Detector' by Hayashibara et al. as defined in their paper:
 * [http://ddg.jaist.ac.jp/pub/HDY+04.pdf]
 *
 * The suspicion level of failure is given by a value called φ (phi).
 * The basic idea of the φ failure detector is to express the value of φ on a scale that
 * is dynamically adjusted to reflect current network conditions. A configurable
 * threshold is used to decide if φ is considered to be a failure.
 *
 * The value of φ is calculated as:
 *
 * {{{
 * φ = -log10(1 - F(timeSinceLastHeartbeat)
 * }}}
 * where F is the cumulative distribution function of a normal distribution with mean
 * and standard deviation estimated from historical heartbeat inter-arrival times.
 *
 * @param threshold A low threshold is prone to generate many wrong suspicions but ensures a quick detection in the event
 *   of a real crash. Conversely, a high threshold generates fewer mistakes but needs more time to detect
 *   actual crashes
 *
 * @param maxSampleSize Number of samples to use for calculation of mean and standard deviation of
 *   inter-arrival times.
 *
 * @param minStdDeviation Minimum standard deviation to use for the normal distribution used when calculating phi.
 *   Too low standard deviation might result in too much sensitivity for sudden, but normal, deviations
 *   in heartbeat inter arrival times.
 *
 * @param acceptableHeartbeatPause Duration corresponding to number of potentially lost/delayed
 *   heartbeats that will be accepted before considering it to be an anomaly.
 *   This margin is important to be able to survive sudden, occasional, pauses in heartbeat
 *   arrivals, due to for example garbage collect or network drop.
 *
 * @param firstHeartbeatEstimate Bootstrap the stats with heartbeats that corresponds to
 *   to this duration, with a with rather high standard deviation (since environment is unknown
 *   in the beginning)
 *
 * @param clock The clock, returning current time in milliseconds, but can be faked for testing
 *   purposes. It is only used for measuring intervals (duration).
 */
class PhiAccrualFailureDetector(
  val threshold: Double,
  val maxSampleSize: Int,
  val minStdDeviation: FiniteDuration,
  val acceptableHeartbeatPause: FiniteDuration,
  val firstHeartbeatEstimate: FiniteDuration)(
    implicit clock: Clock) extends FailureDetector {

  /**
   * Constructor that reads parameters from config.
   * Expecting config properties named `threshold`, `max-sample-size`,
   * `min-std-deviation`, `acceptable-heartbeat-pause` and
   * `heartbeat-interval`.
   */
  def this(config: Config, ev: EventStream) =
    this(
      threshold = config.getDouble("threshold"),
      maxSampleSize = config.getInt("max-sample-size"),
      minStdDeviation = config.getMillisDuration("min-std-deviation"),
      acceptableHeartbeatPause = config.getMillisDuration("acceptable-heartbeat-pause"),
      firstHeartbeatEstimate = config.getMillisDuration("heartbeat-interval"))

  require(threshold > 0.0, "failure-detector.threshold must be > 0")
  require(maxSampleSize > 0, "failure-detector.max-sample-size must be > 0")
  require(minStdDeviation > Duration.Zero, "failure-detector.min-std-deviation must be > 0")
  require(acceptableHeartbeatPause >= Duration.Zero, "failure-detector.acceptable-heartbeat-pause must be >= 0")
  require(firstHeartbeatEstimate > Duration.Zero, "failure-detector.heartbeat-interval must be > 0")

  // guess statistics for first heartbeat,
  // important so that connections with only one heartbeat becomes unavailable
  private val firstHeartbeat: HeartbeatHistory = {
    // bootstrap with 2 entries with rather high standard deviation
    val mean = firstHeartbeatEstimate.toMillis
    val stdDeviation = mean / 4
    HeartbeatHistory(maxSampleSize) :+ (mean - stdDeviation) :+ (mean + stdDeviation)
  }

  private val acceptableHeartbeatPauseMillis = acceptableHeartbeatPause.toMillis

  /**
   * Implement using optimistic lockless concurrency, all state is represented
   * by this immutable case class and managed by an AtomicReference.
   */
  private case class State(history: HeartbeatHistory, timestamp: Option[Long])

  private val state = new AtomicReference[State](State(history = firstHeartbeat, timestamp = None))

  override def isAvailable: Boolean = isAvailable(clock())

  private def isAvailable(timestamp: Long): Boolean = phi(timestamp) < threshold

  override def isMonitoring: Boolean = state.get.timestamp.nonEmpty

  @tailrec
  final override def heartbeat(): Unit = {

    val timestamp = clock()
    val oldState = state.get

    val newHistory = oldState.timestamp match {
      case None ⇒
        // this is heartbeat from a new resource
        // add starter records for this new resource
        firstHeartbeat
      case Some(latestTimestamp) ⇒
        // this is a known connection
        val interval = timestamp - latestTimestamp
        // don't use the first heartbeat after failure for the history, since a long pause will skew the stats
        if (isAvailable(timestamp)) oldState.history :+ interval
        else oldState.history
    }

    val newState = oldState.copy(history = newHistory, timestamp = Some(timestamp)) // record new timestamp

    // if we won the race then update else try again
    if (!state.compareAndSet(oldState, newState)) heartbeat() // recur
  }

  /**
   * The suspicion level of the accrual failure detector.
   *
   * If a connection does not have any records in failure detector then it is
   * considered healthy.
   */
  def phi: Double = phi(clock())

  private def phi(timestamp: Long): Double = {
    val oldState = state.get
    val oldTimestamp = oldState.timestamp

    if (oldTimestamp.isEmpty) 0.0 // treat unmanaged connections, e.g. with zero heartbeats, as healthy connections
    else {
      val timeDiff = timestamp - oldTimestamp.get

      val history = oldState.history
      val mean = history.mean
      val stdDeviation = ensureValidStdDeviation(history.stdDeviation)

      phi(timeDiff, mean + acceptableHeartbeatPauseMillis, stdDeviation)
    }
  }

  /**
   * Calculation of phi, derived from the Cumulative distribution function for
   * N(mean, stdDeviation) normal distribution, given by
   * 1.0 / (1.0 + math.exp(-y * (1.5976 + 0.070566 * y * y)))
   * where y = (x - mean) / standard_deviation
   * This is an approximation defined in β Mathematics Handbook (Logistic approximation).
   * Error is 0.00014 at +- 3.16
   * The calculated value is equivalent to -log10(1 - CDF(y))
   */
  private[akka] def phi(timeDiff: Long, mean: Double, stdDeviation: Double): Double = {
    val y = (timeDiff - mean) / stdDeviation
    val e = math.exp(-y * (1.5976 + 0.070566 * y * y))
    if (timeDiff > mean)
      -math.log10(e / (1.0 + e))
    else
      -math.log10(1.0 - 1.0 / (1.0 + e))
  }

  private val minStdDeviationMillis = minStdDeviation.toMillis

  private def ensureValidStdDeviation(stdDeviation: Double): Double = math.max(stdDeviation, minStdDeviationMillis)

}

private[akka] object HeartbeatHistory {

  /**
   * Create an empty HeartbeatHistory, without any history.
   * Can only be used as starting point for appending intervals.
   * The stats (mean, variance, stdDeviation) are not defined for
   * for empty HeartbeatHistory, i.e. throws AritmeticException.
   */
  def apply(maxSampleSize: Int): HeartbeatHistory = HeartbeatHistory(
    maxSampleSize = maxSampleSize,
    intervals = immutable.IndexedSeq.empty,
    intervalSum = 0L,
    squaredIntervalSum = 0L)

}

/**
 * Holds the heartbeat statistics for a specific node Address.
 * It is capped by the number of samples specified in `maxSampleSize`.
 *
 * The stats (mean, variance, stdDeviation) are not defined for
 * for empty HeartbeatHistory, i.e. throws AritmeticException.
 */
private[akka] case class HeartbeatHistory private (
  maxSampleSize: Int,
  intervals: immutable.IndexedSeq[Long],
  intervalSum: Long,
  squaredIntervalSum: Long) {

  // Heartbeat histories are created trough the firstHeartbeat variable of the PhiAccrualFailureDetector
  // which always have intervals.size > 0.
  if (maxSampleSize < 1)
    throw new IllegalArgumentException(s"maxSampleSize must be >= 1, got [$maxSampleSize]")
  if (intervalSum < 0L)
    throw new IllegalArgumentException(s"intervalSum must be >= 0, got [$intervalSum]")
  if (squaredIntervalSum < 0L)
    throw new IllegalArgumentException(s"squaredIntervalSum must be >= 0, got [$squaredIntervalSum]")

  def mean: Double = intervalSum.toDouble / intervals.size

  def variance: Double = (squaredIntervalSum.toDouble / intervals.size) - (mean * mean)

  def stdDeviation: Double = math.sqrt(variance)

  @tailrec
  final def :+(interval: Long): HeartbeatHistory = {
    if (intervals.size < maxSampleSize)
      HeartbeatHistory(
        maxSampleSize,
        intervals = intervals :+ interval,
        intervalSum = intervalSum + interval,
        squaredIntervalSum = squaredIntervalSum + pow2(interval))
    else
      dropOldest :+ interval // recur
  }

  private def dropOldest: HeartbeatHistory = HeartbeatHistory(
    maxSampleSize,
    intervals = intervals drop 1,
    intervalSum = intervalSum - intervals.head,
    squaredIntervalSum = squaredIntervalSum - pow2(intervals.head))

  private def pow2(x: Long) = x * x
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy