All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.pekko.remote.artery.Association.scala Maven / Gradle / Ivy

Go to download

Apache Pekko is a toolkit for building highly concurrent, distributed, and resilient message-driven applications for Java and Scala.

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * license agreements; and to You under the Apache License, version 2.0:
 *
 *   https://www.apache.org/licenses/LICENSE-2.0
 *
 * This file is part of the Apache Pekko project, which was derived from Akka.
 */

/*
 * Copyright (C) 2016-2022 Lightbend Inc. 
 */

package org.apache.pekko.remote.artery

import java.net.ConnectException
import java.util.Queue
import java.util.concurrent.CountDownLatch
import java.util.concurrent.LinkedBlockingQueue
import java.util.concurrent.TimeUnit
import java.util.concurrent.atomic.AtomicBoolean
import java.util.concurrent.atomic.AtomicReference

import scala.annotation.tailrec
import scala.concurrent.Future
import scala.concurrent.Promise
import scala.concurrent.duration._
import scala.util.control.NoStackTrace

import scala.annotation.nowarn
import org.agrona.concurrent.ManyToOneConcurrentArrayQueue

import org.apache.pekko
import pekko.Done
import pekko.NotUsed
import pekko.actor.ActorRef
import pekko.actor.ActorSelectionMessage
import pekko.actor.Address
import pekko.actor.Cancellable
import pekko.actor.Dropped
import pekko.dispatch.Dispatchers
import pekko.dispatch.sysmsg.DeathWatchNotification
import pekko.dispatch.sysmsg.SystemMessage
import pekko.dispatch.sysmsg.Unwatch
import pekko.event.Logging
import pekko.remote.DaemonMsgCreate
import pekko.remote.PriorityMessage
import pekko.remote.RemoteActorRef
import pekko.remote.RemoteLogMarker
import pekko.remote.UniqueAddress
import pekko.remote.artery.ArteryTransport.AeronTerminated
import pekko.remote.artery.ArteryTransport.ShuttingDown
import pekko.remote.artery.Encoder.OutboundCompressionAccess
import pekko.remote.artery.InboundControlJunction.ControlMessageSubject
import pekko.remote.artery.OutboundControlJunction.OutboundControlIngress
import pekko.remote.artery.OutboundHandshake.HandshakeTimeoutException
import pekko.remote.artery.SystemMessageDelivery.ClearSystemMessageDelivery
import pekko.remote.artery.aeron.AeronSink.GaveUpMessageException
import pekko.remote.artery.compress.CompressionTable
import pekko.stream.AbruptTerminationException
import pekko.stream.KillSwitches
import pekko.stream.Materializer
import pekko.stream.SharedKillSwitch
import pekko.stream.StreamTcpException
import pekko.stream.scaladsl.Keep
import pekko.stream.scaladsl.MergeHub
import pekko.stream.scaladsl.Source
import pekko.util.OptionVal
import pekko.util.PrettyDuration._
import pekko.util.Unsafe
import pekko.util.WildcardIndex
import pekko.util.ccompat._

/**
 * INTERNAL API
 */
private[remote] object Association {
  sealed trait QueueWrapper extends SendQueue.ProducerApi[OutboundEnvelope] {
    def queue: Queue[OutboundEnvelope]
  }

  final case class QueueWrapperImpl(queue: Queue[OutboundEnvelope]) extends QueueWrapper {
    override def offer(message: OutboundEnvelope): Boolean = queue.offer(message)

    override def isEnabled: Boolean = true
  }

  object DisabledQueueWrapper extends QueueWrapper {
    override def queue: java.util.Queue[OutboundEnvelope] =
      throw new UnsupportedOperationException("The Queue is disabled")

    override def offer(message: OutboundEnvelope): Boolean =
      throw new UnsupportedOperationException("The method offer() is illegal on a disabled queue")

    override def isEnabled: Boolean = false
  }

  object RemovedQueueWrapper extends QueueWrapper {
    override def queue: java.util.Queue[OutboundEnvelope] =
      throw new UnsupportedOperationException("The Queue is removed")

    override def offer(message: OutboundEnvelope): Boolean = false

    override def isEnabled: Boolean = false
  }

  final case class LazyQueueWrapper(queue: Queue[OutboundEnvelope], materialize: () => Unit) extends QueueWrapper {
    private val onlyOnce = new AtomicBoolean

    def runMaterialize(): Unit = {
      if (onlyOnce.compareAndSet(false, true))
        materialize()
    }

    override def offer(message: OutboundEnvelope): Boolean = {
      runMaterialize()
      queue.offer(message)
    }

    override def isEnabled: Boolean = true
  }

  final val ControlQueueIndex = 0
  final val LargeQueueIndex = 1
  final val OrdinaryQueueIndex = 2

  sealed trait StopSignal
  case object OutboundStreamStopIdleSignal extends RuntimeException("") with StopSignal with NoStackTrace
  case object OutboundStreamStopQuarantinedSignal extends RuntimeException("") with StopSignal with NoStackTrace

  final case class OutboundStreamMatValues(
      streamKillSwitch: OptionVal[SharedKillSwitch],
      completed: Future[Done],
      stopping: OptionVal[StopSignal])
}

/**
 * INTERNAL API
 *
 * Thread-safe, mutable holder for association state. Main entry point for remote destined message to a specific
 * remote address.
 */
@ccompatUsedUntil213
private[remote] class Association(
    val transport: ArteryTransport,
    val materializer: Materializer,
    val controlMaterializer: Materializer,
    override val remoteAddress: Address,
    override val controlSubject: ControlMessageSubject,
    largeMessageDestinations: WildcardIndex[NotUsed],
    priorityMessageDestinations: WildcardIndex[NotUsed],
    outboundEnvelopePool: ObjectPool[ReusableOutboundEnvelope])
    extends AbstractAssociation
    with OutboundContext {
  import Association._

  require(remoteAddress.port.nonEmpty)

  private val log = Logging.withMarker(transport.system, classOf[Association])
  private def flightRecorder = transport.flightRecorder

  override def settings = transport.settings
  private def advancedSettings = transport.settings.Advanced
  private val deathWatchNotificationFlushEnabled =
    advancedSettings.DeathWatchNotificationFlushTimeout > Duration.Zero && transport.provider.settings.HasCluster

  private val restartCounter =
    new RestartCounter(advancedSettings.OutboundMaxRestarts, advancedSettings.OutboundRestartTimeout)

  // We start with the raw wrapped queue and then it is replaced with the materialized value of
  // the `SendQueue` after materialization. Using same underlying queue. This makes it possible to
  // start sending (enqueuing) to the Association immediate after construction.

  def createQueue(capacity: Int, queueIndex: Int): Queue[OutboundEnvelope] = {
    val linked = queueIndex == ControlQueueIndex || queueIndex == LargeQueueIndex
    if (linked)
      new LinkedBlockingQueue[OutboundEnvelope](capacity) // less memory than ManyToOneConcurrentArrayQueue
    else
      new ManyToOneConcurrentArrayQueue[OutboundEnvelope](capacity)
  }

  private val outboundLanes = advancedSettings.OutboundLanes
  private val controlQueueSize = advancedSettings.OutboundControlQueueSize
  private val queueSize = advancedSettings.OutboundMessageQueueSize
  private val largeQueueSize = advancedSettings.OutboundLargeMessageQueueSize

  private[this] val queues: Array[SendQueue.ProducerApi[OutboundEnvelope]] = new Array(2 + outboundLanes)
  queues(ControlQueueIndex) = QueueWrapperImpl(createQueue(controlQueueSize, ControlQueueIndex)) // control stream
  queues(LargeQueueIndex) =
    if (transport.largeMessageChannelEnabled) // large messages stream
      QueueWrapperImpl(createQueue(largeQueueSize, LargeQueueIndex))
    else
      DisabledQueueWrapper

  (0 until outboundLanes).foreach { i =>
    queues(OrdinaryQueueIndex + i) = QueueWrapperImpl(createQueue(queueSize, OrdinaryQueueIndex + i)) // ordinary messages stream
  }
  @volatile private[this] var queuesVisibility = false

  private def controlQueue: SendQueue.ProducerApi[OutboundEnvelope] = queues(ControlQueueIndex)

  @volatile private[this] var _outboundControlIngress: OptionVal[OutboundControlIngress] = OptionVal.None
  @volatile private[this] var materializing = new CountDownLatch(1)
  @volatile private[this] var outboundCompressionAccess: Vector[OutboundCompressionAccess] = Vector.empty

  // keyed by stream queue index
  private[this] val streamMatValues = new AtomicReference(Map.empty[Int, OutboundStreamMatValues])

  private[this] val idleTimer = new AtomicReference[Option[Cancellable]](None)
  private[this] val stopQuarantinedTimer = new AtomicReference[Option[Cancellable]](None)

  private[remote] def changeActorRefCompression(table: CompressionTable[ActorRef]): Future[Done] =
    updateOutboundCompression(c => c.changeActorRefCompression(table))

  private[remote] def changeClassManifestCompression(table: CompressionTable[String]): Future[Done] =
    updateOutboundCompression(c => c.changeClassManifestCompression(table))

  private def clearOutboundCompression(): Future[Done] =
    updateOutboundCompression(c => c.clearCompression())

  private def updateOutboundCompression(action: OutboundCompressionAccess => Future[Done]): Future[Done] = {
    implicit val ec = transport.system.dispatchers.internalDispatcher
    val c = outboundCompressionAccess
    if (c.isEmpty) Future.successful(Done)
    else if (c.size == 1) action(c.head)
    else Future.sequence(c.map(action(_))).map(_ => Done)
  }

  private def clearInboundCompression(originUid: Long): Unit =
    transport.inboundCompressionAccess match {
      case OptionVal.Some(access) => access.closeCompressionFor(originUid)
      case _                      => // do nothing
    }

  private def deadletters = transport.system.deadLetters

  def outboundControlIngress: OutboundControlIngress = {
    _outboundControlIngress match {
      case OptionVal.Some(o) => o
      case _ =>
        controlQueue match {
          case w: LazyQueueWrapper => w.runMaterialize()
          case _                   =>
        }
        // the outboundControlIngress may be accessed before the stream is materialized
        // using CountDownLatch to make sure that materialization is completed
        materializing.await(10, TimeUnit.SECONDS)
        _outboundControlIngress match {
          case OptionVal.Some(o) => o
          case _ =>
            if (transport.isShutdown || isRemovedAfterQuarantined()) throw ShuttingDown
            else throw new IllegalStateException(s"outboundControlIngress for [$remoteAddress] not initialized yet")
        }
    }
  }

  override def localAddress: UniqueAddress = transport.localAddress

  /**
   * Holds reference to shared state of Association - *access only via helper methods*
   */
  @volatile
  @nowarn("msg=never used")
  private[artery] var _sharedStateDoNotCallMeDirectly: AssociationState = AssociationState()

  /**
   * Helper method for access to underlying state via Unsafe
   *
   * @param oldState Previous state
   * @param newState Next state on transition
   * @return Whether the previous state matched correctly
   */
  private[artery] def swapState(oldState: AssociationState, newState: AssociationState): Boolean =
    Unsafe.instance.compareAndSwapObject(this, AbstractAssociation.sharedStateOffset, oldState, newState)

  /**
   * @return Reference to current shared state
   */
  def associationState: AssociationState =
    Unsafe.instance.getObjectVolatile(this, AbstractAssociation.sharedStateOffset).asInstanceOf[AssociationState]

  def setControlIdleKillSwitch(killSwitch: OptionVal[SharedKillSwitch]): Unit = {
    val current = associationState
    swapState(current, current.withControlIdleKillSwitch(killSwitch))
  }

  def completeHandshake(peer: UniqueAddress): Future[Done] = {
    require(
      remoteAddress == peer.address,
      s"wrong remote address in completeHandshake, got ${peer.address}, expected $remoteAddress")
    val current = associationState

    current.uniqueRemoteAddress() match {
      case Some(`peer`) =>
        // handshake already completed
        Future.successful(Done)
      case _ =>
        // clear outbound compression, it's safe to do that several times if someone else
        // completes handshake at same time, but it's important to clear it before
        implicit val ec = transport.system.dispatchers.internalDispatcher
        clearOutboundCompression().map { _ =>
          current.completeUniqueRemoteAddress(peer)
          current.uniqueRemoteAddress() match {
            case Some(`peer`) =>
            // our value
            case _ =>
              val newState = current.newIncarnation(peer)
              if (swapState(current, newState)) {
                current.uniqueRemoteAddress() match {
                  case Some(old) =>
                    cancelStopQuarantinedTimer()
                    log.debug(
                      "Incarnation {} of association to [{}] with new UID [{}] (old UID [{}])",
                      newState.incarnation,
                      peer.address,
                      peer.uid,
                      old.uid)
                    clearInboundCompression(old.uid)
                  case None =>
                  // Failed, nothing to do
                }
                // if swap failed someone else completed before us, and that is fine
              }
          }
          Done
        }
    }
  }

  // OutboundContext
  override def sendControl(message: ControlMessage): Unit = {
    try {
      if (!transport.isShutdown && !isRemovedAfterQuarantined()) {
        if (associationState.isQuarantined()) {
          log.debug("Send control message [{}] to quarantined [{}]", Logging.messageClassName(message), remoteAddress)
          setupStopQuarantinedTimer()
        }
        outboundControlIngress.sendControlMessage(message)
      }
    } catch {
      case ShuttingDown => // silence it
    }
  }

  def send(message: Any, sender: OptionVal[ActorRef], recipient: OptionVal[RemoteActorRef]): Unit = {

    def createOutboundEnvelope(): OutboundEnvelope =
      outboundEnvelopePool.acquire().init(recipient, message.asInstanceOf[AnyRef], sender)

    // volatile read to see latest queue array
    @nowarn("msg=never used")
    val unused = queuesVisibility

    def dropped(queueIndex: Int, qSize: Int, env: OutboundEnvelope): Unit = {
      val removed = isRemovedAfterQuarantined()
      if (removed) recipient match {
        case OptionVal.Some(ref) => ref.cachedAssociation = null // don't use this Association instance any more
        case _                   =>
      }
      val reason =
        if (removed) "Due to removed unused quarantined association"
        else s"Due to overflow of send queue, size [$qSize]"
      transport.system.eventStream
        .publish(Dropped(message, reason, env.sender.getOrElse(ActorRef.noSender), recipient.getOrElse(deadletters)))

      flightRecorder.transportSendQueueOverflow(queueIndex)
      deadletters ! env
    }

    def shouldSendUnwatch(): Boolean =
      !transport.provider.settings.HasCluster || !transport.system.isTerminating()

    def shouldSendDeathWatchNotification(d: DeathWatchNotification): Boolean =
      d.addressTerminated || !transport.provider.settings.HasCluster || !transport.system.isTerminating()

    def sendSystemMessage(outboundEnvelope: OutboundEnvelope): Unit = {
      outboundEnvelope.message match {
        case u: Unwatch if shouldSendUnwatch() =>
          log.debug(
            "Not sending Unwatch of {} to {} because it will be notified when this member " +
            "has been removed from Cluster.",
            u.watcher,
            u.watchee)
        case d: DeathWatchNotification if !shouldSendDeathWatchNotification(d) =>
          log.debug(
            "Not sending DeathWatchNotification of {} to {} because it will be notified when this member " +
            "has been removed from Cluster.",
            d.actor,
            outboundEnvelope.recipient.getOrElse("unknown"))
        case _ =>
          if (!controlQueue.offer(outboundEnvelope)) {
            quarantine(reason = s"Due to overflow of control queue, size [$controlQueueSize]")
            dropped(ControlQueueIndex, controlQueueSize, outboundEnvelope)
          }
      }
    }

    val state = associationState
    val quarantined = state.isQuarantined()
    val messageIsClearSystemMessageDelivery = message.isInstanceOf[ClearSystemMessageDelivery]

    // allow ActorSelectionMessage to pass through quarantine, to be able to establish interaction with new system
    if (message.isInstanceOf[ActorSelectionMessage] || !quarantined || messageIsClearSystemMessageDelivery) {
      if (quarantined && !messageIsClearSystemMessageDelivery) {
        log.debug(
          "Quarantine piercing attempt with message [{}] to [{}]",
          Logging.messageClassName(message),
          recipient.getOrElse(""))
        setupStopQuarantinedTimer()
      }
      try {
        val outboundEnvelope = createOutboundEnvelope()
        message match {
          case d: DeathWatchNotification if deathWatchNotificationFlushEnabled && shouldSendDeathWatchNotification(d) =>
            val flushingPromise = Promise[Done]()
            log.debug("Delaying death watch notification until flush has been sent. {}", d)
            transport.system.systemActorOf(
              FlushBeforeDeathWatchNotification
                .props(flushingPromise, settings.Advanced.DeathWatchNotificationFlushTimeout, this)
                .withDispatcher(Dispatchers.InternalDispatcherId),
              FlushBeforeDeathWatchNotification.nextName())
            flushingPromise.future.onComplete { _ =>
              log.debug("Sending death watch notification as flush is complete. {}", d)
              sendSystemMessage(outboundEnvelope)
            }(materializer.executionContext)
          case _: SystemMessage =>
            sendSystemMessage(outboundEnvelope)
          case ActorSelectionMessage(_: PriorityMessage, _, _) | _: ControlMessage | _: ClearSystemMessageDelivery =>
            // ActorSelectionMessage with PriorityMessage is used by cluster and remote failure detector heartbeating
            if (!controlQueue.offer(outboundEnvelope)) {
              dropped(ControlQueueIndex, controlQueueSize, outboundEnvelope)
            }
          case _: DaemonMsgCreate =>
            // DaemonMsgCreate is not a SystemMessage, but must be sent over the control stream because
            // remote deployment process depends on message ordering for DaemonMsgCreate and Watch messages.
            // First ordinary message may arrive earlier but then the resolve in the Decoder is retried
            // so that the first message can be delivered after the remote actor has been created.
            if (!controlQueue.offer(outboundEnvelope))
              dropped(ControlQueueIndex, controlQueueSize, outboundEnvelope)
          case _ =>
            val queueIndex = selectQueue(recipient)
            val queue = queues(queueIndex)
            val offerOk = queue.offer(outboundEnvelope)
            if (!offerOk)
              dropped(queueIndex, queueSize, outboundEnvelope)
        }
      } catch {
        case ShuttingDown => // silence it
      }
    } else if (log.isDebugEnabled)
      log.debug(
        "Dropping message [{}] from [{}] to [{}] due to quarantined system [{}]",
        Logging.messageClassName(message),
        sender.getOrElse(deadletters),
        recipient.getOrElse(recipient),
        remoteAddress)
  }

  private def selectQueue(recipient: OptionVal[RemoteActorRef]): Int = {
    recipient match {
      case OptionVal.Some(r) =>
        r.cachedSendQueueIndex match {
          case -1 =>
            // only happens when messages are sent to new remote destination
            // and is then cached on the RemoteActorRef
            val elements = r.path.elements
            val idx =
              if (priorityMessageDestinations.find(elements).isDefined) {
                log.debug("Using priority message stream for {}", r.path)
                ControlQueueIndex
              } else if (transport.largeMessageChannelEnabled && largeMessageDestinations.find(elements).isDefined) {
                log.debug("Using large message stream for {}", r.path)
                LargeQueueIndex
              } else if (outboundLanes == 1) {
                OrdinaryQueueIndex
              } else {
                // select lane based on destination, to preserve message order
                OrdinaryQueueIndex + (math.abs(r.path.uid % outboundLanes))
              }
            r.cachedSendQueueIndex = idx
            idx
          case idx => idx
        }

      case _ =>
        OrdinaryQueueIndex
    }
  }

  override def isOrdinaryMessageStreamActive(): Boolean =
    isStreamActive(OrdinaryQueueIndex)

  def isStreamActive(queueIndex: Int): Boolean = {
    queues(queueIndex) match {
      case _: LazyQueueWrapper  => false
      case DisabledQueueWrapper => false
      case RemovedQueueWrapper  => false
      case _                    => true
    }
  }

  def sendTerminationHint(replyTo: ActorRef): Int = {
    log.debug("Sending ActorSystemTerminating to all queues")
    sendToAllQueues(ActorSystemTerminating(localAddress), replyTo, excludeControlQueue = false)
  }

  def sendFlush(replyTo: ActorRef, excludeControlQueue: Boolean): Int =
    sendToAllQueues(Flush, replyTo, excludeControlQueue)

  def sendToAllQueues(msg: ControlMessage, replyTo: ActorRef, excludeControlQueue: Boolean): Int = {
    if (!associationState.isQuarantined()) {
      var sent = 0
      val queuesIter = if (excludeControlQueue) queues.iterator.drop(1) else queues.iterator
      queuesIter.filter(q => q.isEnabled && !q.isInstanceOf[LazyQueueWrapper]).foreach { queue =>
        try {
          val envelope = outboundEnvelopePool.acquire().init(OptionVal.None, msg, OptionVal.Some(replyTo))

          queue.offer(envelope)
          sent += 1
        } catch {
          case ShuttingDown => // can be thrown if `offer` triggers new materialization
        }
      }
      sent
    } else 0
  }

  // OutboundContext
  override def quarantine(reason: String): Unit = {
    val uid = associationState.uniqueRemoteAddress().map(_.uid)
    quarantine(reason, uid, harmless = false)
  }

  @tailrec final def quarantine(reason: String, uid: Option[Long], harmless: Boolean): Unit = {
    uid match {
      case Some(u) =>
        val current = associationState
        current.uniqueRemoteAddress() match {
          case Some(peer) if peer.uid == u =>
            if (!current.isQuarantined(u)) {
              val newState = current.newQuarantined()
              if (swapState(current, newState)) {
                // quarantine state change was performed
                if (harmless) {
                  log.info(
                    "Association to [{}] having UID [{}] has been stopped. All " +
                    "messages to this UID will be delivered to dead letters. Reason: {}",
                    remoteAddress,
                    u,
                    reason)
                  transport.system.eventStream
                    .publish(GracefulShutdownQuarantinedEvent(UniqueAddress(remoteAddress, u), reason))
                } else {
                  log.warning(
                    RemoteLogMarker.quarantine(remoteAddress, Some(u)),
                    "Association to [{}] with UID [{}] is irrecoverably failed. UID is now quarantined and all " +
                    "messages to this UID will be delivered to dead letters. " +
                    "Remote ActorSystem must be restarted to recover from this situation. Reason: {}",
                    remoteAddress,
                    u,
                    reason)
                  transport.system.eventStream.publish(QuarantinedEvent(UniqueAddress(remoteAddress, u)))
                }
                flightRecorder.transportQuarantined(remoteAddress, u)
                clearOutboundCompression()
                clearInboundCompression(u)
                // end delivery of system messages to that incarnation after this point
                send(ClearSystemMessageDelivery(current.incarnation), OptionVal.None, OptionVal.None)
                if (!harmless) {
                  // try to tell the other system that we have quarantined it
                  sendControl(Quarantined(localAddress, peer))
                }
                setupStopQuarantinedTimer()
              } else
                quarantine(reason, uid, harmless) // recursive
            }
          case Some(peer) =>
            log.info(
              RemoteLogMarker.quarantine(remoteAddress, Some(u)),
              "Quarantine of [{}] ignored due to non-matching UID, quarantine requested for [{}] but current is [{}]. Reason: {}",
              remoteAddress,
              u,
              peer.uid,
              reason)
            send(ClearSystemMessageDelivery(current.incarnation - 1), OptionVal.None, OptionVal.None)
          case None =>
            log.info(
              RemoteLogMarker.quarantine(remoteAddress, Some(u)),
              "Quarantine of [{}] ignored because handshake not completed, quarantine request was for old incarnation. Reason: {}",
              remoteAddress,
              reason)
        }
      case None =>
        log.warning(
          RemoteLogMarker.quarantine(remoteAddress, None),
          "Quarantine of [{}] ignored because unknown UID. Reason: {}",
          remoteAddress,
          reason)
    }

  }

  /**
   * After calling this no messages can be sent with this Association instance
   */
  def removedAfterQuarantined(): Unit = {
    if (!isRemovedAfterQuarantined()) {
      flightRecorder.transportRemoveQuarantined(remoteAddress)
      queues(ControlQueueIndex) = RemovedQueueWrapper

      if (transport.largeMessageChannelEnabled)
        queues(LargeQueueIndex) = RemovedQueueWrapper

      (0 until outboundLanes).foreach { i =>
        queues(OrdinaryQueueIndex + i) = RemovedQueueWrapper
      }
      queuesVisibility = true // volatile write for visibility of the queues array

      // cleanup
      _outboundControlIngress = OptionVal.None
      outboundCompressionAccess = Vector.empty
      cancelAllTimers()
      abortQuarantined()

      log.info("Unused association to [{}] removed after quarantine", remoteAddress)
    }
  }

  def isRemovedAfterQuarantined(): Boolean =
    queues(ControlQueueIndex) == RemovedQueueWrapper

  private def cancelStopQuarantinedTimer(): Unit = {
    val current = stopQuarantinedTimer.get
    current.foreach(_.cancel())
    stopQuarantinedTimer.compareAndSet(current, None)
  }

  private def setupStopQuarantinedTimer(): Unit = {
    cancelStopQuarantinedTimer()
    stopQuarantinedTimer.set(Some(transport.system.scheduler.scheduleOnce(advancedSettings.StopQuarantinedAfterIdle) {
      if (associationState.isQuarantined())
        abortQuarantined()
    }(transport.system.dispatchers.internalDispatcher)))
  }

  private def abortQuarantined(): Unit = {
    cancelIdleTimer()
    streamMatValues.get.foreach {
      case (queueIndex, OutboundStreamMatValues(killSwitch, _, _)) =>
        killSwitch match {
          case OptionVal.Some(k) =>
            setStopReason(queueIndex, OutboundStreamStopQuarantinedSignal)
            clearStreamKillSwitch(queueIndex, k)
            k.abort(OutboundStreamStopQuarantinedSignal)
          case _ => // already aborted
        }
    }
  }

  private def cancelIdleTimer(): Unit = {
    val current = idleTimer.get
    current.foreach(_.cancel())
    idleTimer.compareAndSet(current, None)
  }

  private def setupIdleTimer(): Unit = {
    if (idleTimer.get.isEmpty) {
      val StopIdleOutboundAfter = settings.Advanced.StopIdleOutboundAfter
      val QuarantineIdleOutboundAfter = settings.Advanced.QuarantineIdleOutboundAfter
      val interval = StopIdleOutboundAfter / 2
      val initialDelay = settings.Advanced.Tcp.ConnectionTimeout.max(StopIdleOutboundAfter) + 1.second
      val task =
        transport.system.scheduler.scheduleWithFixedDelay(initialDelay, interval) { () =>
          val lastUsedDurationNanos = System.nanoTime() - associationState.lastUsedTimestamp.get
          if (lastUsedDurationNanos >= QuarantineIdleOutboundAfter.toNanos && !associationState.isQuarantined()) {
            // If idle longer than quarantine-idle-outbound-after and the low frequency HandshakeReq
            // doesn't get through it will be quarantined to cleanup lingering associations to crashed systems.
            quarantine(s"Idle longer than quarantine-idle-outbound-after [${QuarantineIdleOutboundAfter.pretty}]")
            associationState.uniqueRemoteAddressState() match {
              case AssociationState.UidQuarantined => // quarantined as expected
              case AssociationState.UidKnown       => // must be new uid, keep as is
              case AssociationState.UidUnknown =>
                val newLastUsedDurationNanos = System.nanoTime() - associationState.lastUsedTimestamp.get
                // quarantine ignored due to unknown UID, have to stop this task anyway
                if (newLastUsedDurationNanos >= QuarantineIdleOutboundAfter.toNanos)
                  abortQuarantined() // quarantine ignored due to unknown UID, have to stop this task anyway
            }
          } else if (lastUsedDurationNanos >= StopIdleOutboundAfter.toNanos) {
            streamMatValues.get.foreach {
              case (queueIndex, OutboundStreamMatValues(streamKillSwitch, _, stopping)) =>
                if (isStreamActive(queueIndex) && stopping.isEmpty) {
                  if (queueIndex != ControlQueueIndex) {
                    streamKillSwitch match {
                      case OptionVal.Some(k) =>
                        // for non-control streams we can stop the entire stream
                        log.info("Stopping idle outbound stream [{}] to [{}]", queueIndex, remoteAddress)
                        flightRecorder.transportStopIdleOutbound(remoteAddress, queueIndex)
                        setStopReason(queueIndex, OutboundStreamStopIdleSignal)
                        clearStreamKillSwitch(queueIndex, k)
                        k.abort(OutboundStreamStopIdleSignal)
                      case _ => // already aborted
                    }

                  } else {
                    // only stop the transport parts of the stream because SystemMessageDelivery stage has
                    // state (seqno) and system messages might be sent at the same time
                    associationState.controlIdleKillSwitch match {
                      case OptionVal.Some(killSwitch) =>
                        log.info("Stopping idle outbound control stream to [{}]", remoteAddress)
                        flightRecorder.transportStopIdleOutbound(remoteAddress, queueIndex)
                        setControlIdleKillSwitch(OptionVal.None)
                        killSwitch.abort(OutboundStreamStopIdleSignal)
                      case _ => // already stopped
                    }
                  }
                }
            }
          }
        }(transport.system.dispatcher)

      if (!idleTimer.compareAndSet(None, Some(task))) {
        // another thread did same thing and won
        task.cancel()
      }
    }
  }

  private def cancelAllTimers(): Unit = {
    cancelIdleTimer()
    cancelStopQuarantinedTimer()
  }

  private def sendToDeadLetters[T](pending: Vector[OutboundEnvelope]): Unit = {
    pending.foreach(transport.system.deadLetters ! _)
  }

  /**
   * Called once after construction when the `Association` instance
   * wins the CAS in the `AssociationRegistry`. It will materialize
   * the streams. It is possible to sending (enqueuing) to the association
   * before this method is called.
   *
   * @throws ShuttingDown if called while the transport is shutting down
   */
  def associate(): Unit = {
    if (!controlQueue.isInstanceOf[QueueWrapper])
      throw new IllegalStateException("associate() must only be called once")
    runOutboundStreams()
  }

  private def runOutboundStreams(): Unit = {

    // it's important to materialize the outboundControl stream first,
    // so that outboundControlIngress is ready when stages for all streams start
    runOutboundControlStream()
    runOutboundOrdinaryMessagesStream()

    if (transport.largeMessageChannelEnabled)
      runOutboundLargeMessagesStream()
  }

  private def runOutboundControlStream(): Unit = {
    if (transport.isShutdown) throw ShuttingDown
    log.debug("Starting outbound control stream to [{}]", remoteAddress)

    val wrapper = getOrCreateQueueWrapper(ControlQueueIndex, queueSize)
    queues(ControlQueueIndex) = wrapper // use new underlying queue immediately for restarts
    queuesVisibility = true // volatile write for visibility of the queues array

    val streamKillSwitch = KillSwitches.shared("outboundControlStreamKillSwitch")

    def sendQueuePostStop[T](pending: Vector[OutboundEnvelope]): Unit = {
      sendToDeadLetters(pending)
      val systemMessagesCount = pending.count(env => env.message.isInstanceOf[SystemMessage])
      if (systemMessagesCount > 0)
        quarantine(s"SendQueue stopped with [$systemMessagesCount] pending system messages.")
    }

    val (queueValue, (control, completed)) =
      Source
        .fromGraph(new SendQueue[OutboundEnvelope](sendQueuePostStop))
        .via(streamKillSwitch.flow)
        .toMat(transport.outboundControl(this))(Keep.both)
        .run()(materializer)

    queueValue.inject(wrapper.queue)
    // replace with the materialized value, still same underlying queue
    queues(ControlQueueIndex) = queueValue
    queuesVisibility = true // volatile write for visibility of the queues array
    _outboundControlIngress = OptionVal.Some(control)
    materializing.countDown()

    updateStreamMatValues(ControlQueueIndex, streamKillSwitch, completed)
    setupIdleTimer()
    attachOutboundStreamRestart(
      "Outbound control stream",
      ControlQueueIndex,
      controlQueueSize,
      completed,
      () => runOutboundControlStream())
  }

  private def getOrCreateQueueWrapper(queueIndex: Int, capacity: Int): QueueWrapper = {
    @nowarn("msg=never used")
    val unused = queuesVisibility // volatile read to see latest queues array
    queues(queueIndex) match {
      case existing: QueueWrapper => existing
      case _                      =>
        // use new queue for restarts
        QueueWrapperImpl(createQueue(capacity, queueIndex))
    }
  }

  private def runOutboundOrdinaryMessagesStream(): Unit = {
    if (transport.isShutdown) throw ShuttingDown

    val streamKillSwitch = KillSwitches.shared("outboundMessagesKillSwitch")

    if (outboundLanes == 1) {
      log.debug("Starting outbound message stream to [{}]", remoteAddress)
      val queueIndex = OrdinaryQueueIndex
      val wrapper = getOrCreateQueueWrapper(queueIndex, queueSize)
      queues(queueIndex) = wrapper // use new underlying queue immediately for restarts
      queuesVisibility = true // volatile write for visibility of the queues array

      val (queueValue, _, changeCompression, completed) =
        Source
          .fromGraph(new SendQueue[OutboundEnvelope](sendToDeadLetters))
          .via(streamKillSwitch.flow)
          .viaMat(transport.outboundTestFlow(this))(Keep.both)
          .toMat(transport.outbound(this)) { case ((a, b), (c, d)) => (a, b, c, d) } // "keep all, exploded"
          .run()(materializer)

      queueValue.inject(wrapper.queue)
      // replace with the materialized value, still same underlying queue
      queues(queueIndex) = queueValue
      queuesVisibility = true // volatile write for visibility of the queues array
      outboundCompressionAccess = Vector(changeCompression)

      updateStreamMatValues(OrdinaryQueueIndex, streamKillSwitch, completed)
      attachOutboundStreamRestart(
        "Outbound message stream",
        OrdinaryQueueIndex,
        queueSize,
        completed,
        () => runOutboundOrdinaryMessagesStream())

    } else {
      log.debug("Starting outbound message stream to [{}] with [{}] lanes", remoteAddress, outboundLanes)
      val wrappers = (0 until outboundLanes).map { i =>
        val wrapper = getOrCreateQueueWrapper(OrdinaryQueueIndex + i, queueSize)
        queues(OrdinaryQueueIndex + i) = wrapper // use new underlying queue immediately for restarts
        queuesVisibility = true // volatile write for visibility of the queues array
        wrapper
      }.toVector

      val lane = Source
        .fromGraph(new SendQueue[OutboundEnvelope](sendToDeadLetters))
        .via(streamKillSwitch.flow)
        .via(transport.outboundTestFlow(this))
        .viaMat(transport.outboundLane(this))(Keep.both)
        .watchTermination()(Keep.both)
        // recover to avoid error logging by MergeHub
        .recoverWithRetries(-1, { case _: Throwable => Source.empty })
        .mapMaterializedValue {
          case ((q, c), w) => (q, c, w)
        }

      val (mergeHub, transportSinkCompleted) = MergeHub
        .source[EnvelopeBuffer]
        .via(streamKillSwitch.flow)
        .toMat(transport.outboundTransportSink(this))(Keep.both)
        .run()(materializer)

      val values: Vector[(SendQueue.QueueValue[OutboundEnvelope], Encoder.OutboundCompressionAccess, Future[Done])] =
        (0 until outboundLanes).iterator
          .map { _ =>
            lane.to(mergeHub).run()(materializer)
          }
          .to(Vector)

      val (queueValues, compressionAccessValues, laneCompletedValues) = values.unzip3

      implicit val ec = transport.system.dispatchers.internalDispatcher

      // tear down all parts if one part fails or completes
      Future.firstCompletedOf(laneCompletedValues).failed.foreach { reason =>
        streamKillSwitch.abort(reason)
      }
      (laneCompletedValues :+ transportSinkCompleted).foreach(_.foreach { _ =>
        streamKillSwitch.shutdown()
      })

      val allCompleted = Future.sequence(laneCompletedValues).flatMap(_ => transportSinkCompleted)

      queueValues.zip(wrappers).zipWithIndex.foreach {
        case ((q, w), i) =>
          q.inject(w.queue)
          queues(OrdinaryQueueIndex + i) = q // replace with the materialized value, still same underlying queue
      }
      queuesVisibility = true // volatile write for visibility of the queues array

      outboundCompressionAccess = compressionAccessValues

      attachOutboundStreamRestart(
        "Outbound message stream",
        OrdinaryQueueIndex,
        queueSize,
        allCompleted,
        () => runOutboundOrdinaryMessagesStream())
    }
  }

  private def runOutboundLargeMessagesStream(): Unit = {
    if (transport.isShutdown) throw ShuttingDown
    log.debug("Starting outbound large message stream to [{}]", remoteAddress)
    val wrapper = getOrCreateQueueWrapper(LargeQueueIndex, largeQueueSize)
    queues(LargeQueueIndex) = wrapper // use new underlying queue immediately for restarts
    queuesVisibility = true // volatile write for visibility of the queues array

    val streamKillSwitch = KillSwitches.shared("outboundLargeMessagesKillSwitch")

    val (queueValue, completed) = Source
      .fromGraph(new SendQueue[OutboundEnvelope](sendToDeadLetters))
      .via(streamKillSwitch.flow)
      .via(transport.outboundTestFlow(this))
      .toMat(transport.outboundLarge(this))(Keep.both)
      .run()(materializer)

    queueValue.inject(wrapper.queue)
    // replace with the materialized value, still same underlying queue
    queues(LargeQueueIndex) = queueValue
    queuesVisibility = true // volatile write for visibility of the queues array

    updateStreamMatValues(LargeQueueIndex, streamKillSwitch, completed)
    attachOutboundStreamRestart(
      "Outbound large message stream",
      LargeQueueIndex,
      largeQueueSize,
      completed,
      () => runOutboundLargeMessagesStream())
  }

  private def attachOutboundStreamRestart(
      streamName: String,
      queueIndex: Int,
      queueCapacity: Int,
      streamCompleted: Future[Done],
      restart: () => Unit): Unit = {

    def lazyRestart(): Unit = {
      flightRecorder.transportRestartOutbound(remoteAddress, streamName)
      outboundCompressionAccess = Vector.empty
      if (queueIndex == ControlQueueIndex) {
        materializing = new CountDownLatch(1)
        _outboundControlIngress = OptionVal.None
      }
      // LazyQueueWrapper will invoke the `restart` function when first message is offered
      val wrappedRestartFun: () => Unit = () => {
        restart()
      }

      if (!isRemovedAfterQuarantined())
        queues(queueIndex) = LazyQueueWrapper(createQueue(queueCapacity, queueIndex), wrappedRestartFun)

      queuesVisibility = true // volatile write for visibility of the queues array
    }

    implicit val ec = materializer.executionContext
    streamCompleted.foreach { _ =>
      if (transport.isShutdown || isRemovedAfterQuarantined()) {
        // shutdown as expected
        // countDown the latch in case threads are waiting on the latch in outboundControlIngress method
        materializing.countDown()
      } else {
        log.debug("{} to [{}] was completed. It will be restarted if used again.", streamName, remoteAddress)
        lazyRestart()
      }
    }
    streamCompleted.failed.foreach {
      case ArteryTransport.ShutdownSignal =>
        // shutdown as expected
        cancelAllTimers()
        // countDown the latch in case threads are waiting on the latch in outboundControlIngress method
        materializing.countDown()
      case cause if transport.isShutdown || isRemovedAfterQuarantined() =>
        // don't restart after shutdown, but log some details so we notice
        // for the TCP transport the ShutdownSignal is "converted" to StreamTcpException
        if (!cause.isInstanceOf[StreamTcpException])
          log.warning(
            s"{} to [{}] failed after shutdown. {}: {}",
            streamName,
            remoteAddress,
            cause.getClass.getName,
            cause.getMessage)
        cancelAllTimers()
        // countDown the latch in case threads are waiting on the latch in outboundControlIngress method
        materializing.countDown()
      case _: AeronTerminated =>
        // shutdown already in progress
        cancelAllTimers()
      case _: AbruptTerminationException =>
        // ActorSystem shutdown
        cancelAllTimers()
      case cause =>
        // it might have been stopped as expected due to idle or quarantine
        // for the TCP transport the exception is "converted" to StreamTcpException
        val stoppedIdle = cause == OutboundStreamStopIdleSignal ||
          getStopReason(queueIndex).contains(OutboundStreamStopIdleSignal)
        val stoppedQuarantined = cause == OutboundStreamStopQuarantinedSignal ||
          getStopReason(queueIndex).contains(OutboundStreamStopQuarantinedSignal)

        // for some cases restart unconditionally, without counting restarts
        val bypassRestartCounter = cause match {
          case _: GaveUpMessageException => true
          case _                         => stoppedIdle || stoppedQuarantined
        }

        if (queueIndex == ControlQueueIndex && !stoppedQuarantined) {
          cause match {
            case _: HandshakeTimeoutException => // ok, quarantine not possible without UID
            case _                            =>
              // Must quarantine in case all system messages haven't been delivered.
              // See also comment in the stoppedIdle case below
              quarantine(s"Outbound control stream restarted. $cause")
          }
        }

        def isConnectException: Boolean =
          cause.isInstanceOf[StreamTcpException] && cause.getCause != null && cause.getCause
            .isInstanceOf[ConnectException]

        if (stoppedIdle) {
          log.debug("{} to [{}] was idle and stopped. It will be restarted if used again.", streamName, remoteAddress)
          lazyRestart()
        } else if (stoppedQuarantined) {
          log.debug(
            "{} to [{}] was quarantined and stopped. It will be restarted if used again.",
            streamName,
            remoteAddress)
          lazyRestart()
        } else if (bypassRestartCounter || restartCounter.restart()) {
          // ConnectException may happen repeatedly and are already logged in connectionFlowWithRestart
          if (isConnectException)
            log.debug("{} to [{}] failed. Restarting it. {}", streamName, remoteAddress, cause)
          else
            log.warning("{} to [{}] failed. Restarting it. {}", streamName, remoteAddress, cause)
          lazyRestart()
        } else {
          log.error(
            cause,
            s"{} to [{}] failed and restarted {} times within {} seconds. Terminating system. ${cause.getMessage}",
            streamName,
            remoteAddress,
            advancedSettings.OutboundMaxRestarts,
            advancedSettings.OutboundRestartTimeout.toSeconds)
          cancelAllTimers()
          transport.system.terminate()
        }
    }
  }

  private def updateStreamMatValues(
      streamId: Int,
      streamKillSwitch: SharedKillSwitch,
      completed: Future[Done]): Unit = {
    implicit val ec = materializer.executionContext
    updateStreamMatValues(streamId,
      OutboundStreamMatValues(OptionVal.Some(streamKillSwitch),
        completed.recover {
          case _ => Done
        }, stopping = OptionVal.None))
  }

  @tailrec private def updateStreamMatValues(streamId: Int, values: OutboundStreamMatValues): Unit = {
    val prev = streamMatValues.get()
    if (!streamMatValues.compareAndSet(prev, prev + (streamId -> values))) {
      updateStreamMatValues(streamId, values)
    }
  }

  @tailrec private def setStopReason(streamId: Int, stopSignal: StopSignal): Unit = {
    val prev = streamMatValues.get()
    prev.get(streamId) match {
      case Some(v) =>
        if (!streamMatValues.compareAndSet(prev, prev.updated(streamId, v.copy(stopping = OptionVal.Some(stopSignal)))))
          setStopReason(streamId, stopSignal)
      case None => throw new IllegalStateException(s"Expected streamMatValues for [$streamId]")
    }
  }

  private def getStopReason(streamId: Int): OptionVal[StopSignal] = {
    streamMatValues.get().get(streamId) match {
      case Some(OutboundStreamMatValues(_, _, stopping)) => stopping
      case None                                          => OptionVal.None
    }
  }

  // after it has been used we remove the kill switch to cleanup some memory,
  // not a "leak" but a KillSwitch is rather heavy
  @tailrec private def clearStreamKillSwitch(streamId: Int, old: SharedKillSwitch): Unit = {
    val prev = streamMatValues.get()
    prev.get(streamId) match {
      case Some(v) =>
        if (v.streamKillSwitch.isDefined && (v.streamKillSwitch.get eq old)) {
          if (!streamMatValues.compareAndSet(prev, prev.updated(streamId, v.copy(streamKillSwitch = OptionVal.None))))
            clearStreamKillSwitch(streamId, old)
        }
      case None => throw new IllegalStateException(s"Expected streamMatValues for [$streamId]")
    }
  }

  /**
   * Exposed for orderly shutdown purposes, can not be trusted except for during shutdown as streams may restart.
   * Will complete successfully even if one of the stream completion futures failed
   */
  def streamsCompleted: Future[Done] = {
    implicit val ec = materializer.executionContext
    Future
      .sequence(streamMatValues.get().values.map {
        case OutboundStreamMatValues(_, done, _) => done
      })
      .map(_ => Done)
  }

  override def toString: String =
    s"Association($localAddress -> $remoteAddress with $associationState)"

}

/**
 * INTERNAL API
 */
private[remote] class AssociationRegistry(createAssociation: Address => Association) {
  private[this] val associationsByAddress = new AtomicReference[Map[Address, Association]](Map.empty)
  private[this] val associationsByUid = new AtomicReference[ImmutableLongMap[Association]](ImmutableLongMap.empty)

  /**
   * @throws ShuttingDown if called while the transport is shutting down
   */
  @tailrec final def association(remoteAddress: Address): Association = {
    val currentMap = associationsByAddress.get
    currentMap.get(remoteAddress) match {
      case Some(existing) => existing
      case None =>
        val newAssociation = createAssociation(remoteAddress)
        val newMap = currentMap.updated(remoteAddress, newAssociation)
        if (associationsByAddress.compareAndSet(currentMap, newMap)) {
          newAssociation.associate() // start it, only once
          newAssociation
        } else
          association(remoteAddress) // lost CAS, retry
    }
  }

  def association(uid: Long): OptionVal[Association] =
    associationsByUid.get.get(uid)

  /**
   * @throws ShuttingDown if called while the transport is shutting down
   */
  @tailrec final def setUID(peer: UniqueAddress): Association = {
    // Don't create a new association via this method. It's supposed to exist unless it was removed after quarantined.
    val a = association(peer.address)

    val currentMap = associationsByUid.get
    currentMap.get(peer.uid) match {
      case OptionVal.Some(previous) =>
        if (previous eq a)
          // associationsByUid Map already contains the right association
          a
        else
          // make sure we don't overwrite same UID with different association
          throw new IllegalArgumentException(s"UID collision old [$previous] new [$a]")
      case _ =>
        // update associationsByUid Map with the uid -> association
        val newMap = currentMap.updated(peer.uid, a)
        if (associationsByUid.compareAndSet(currentMap, newMap))
          a
        else
          setUID(peer) // lost CAS, retry
    }
  }

  def allAssociations: Set[Association] =
    associationsByAddress.get.values.toSet

  def removeUnusedQuarantined(after: FiniteDuration): Unit = {
    removeUnusedQuarantinedByAddress(after)
    removeUnusedQuarantinedByUid(after)
  }

  @tailrec private def removeUnusedQuarantinedByAddress(after: FiniteDuration): Unit = {
    val now = System.nanoTime()
    val afterNanos = after.toNanos
    val currentMap = associationsByAddress.get
    val remove = currentMap.foldLeft(Map.empty[Address, Association]) {
      case (acc, (address, association)) =>
        val state = association.associationState
        if ((now - state.lastUsedTimestamp.get) >= afterNanos) {
          state.uniqueRemoteAddressState() match {
            case AssociationState.UidQuarantined | AssociationState.UidUnknown => acc.updated(address, association)
            case AssociationState.UidKnown                                     => acc
          }
        } else {
          acc
        }
    }
    if (remove.nonEmpty) {
      val newMap = currentMap -- remove.keysIterator
      if (associationsByAddress.compareAndSet(currentMap, newMap))
        remove.valuesIterator.foreach(_.removedAfterQuarantined())
      else
        removeUnusedQuarantinedByAddress(after) // CAS fail, recursive
    }
  }

  @tailrec private def removeUnusedQuarantinedByUid(after: FiniteDuration): Unit = {
    val now = System.nanoTime()
    val afterNanos = after.toNanos
    val currentMap = associationsByUid.get
    val remove = currentMap.keysIterator.foldLeft(Map.empty[Long, Association]) {
      case (acc, uid) =>
        val association = currentMap.get(uid).get
        val state = association.associationState
        if ((now - state.lastUsedTimestamp.get) >= afterNanos) {
          state.uniqueRemoteAddressState() match {
            case AssociationState.UidQuarantined | AssociationState.UidUnknown => acc.updated(uid, association)
            case AssociationState.UidKnown                                     => acc
          }
        } else {
          acc
        }
    }
    if (remove.nonEmpty) {
      val newMap = remove.keysIterator.foldLeft(currentMap)((acc, uid) => acc.remove(uid))
      if (associationsByUid.compareAndSet(currentMap, newMap))
        remove.valuesIterator.foreach(_.removedAfterQuarantined())
      else
        removeUnusedQuarantinedByUid(after) // CAS fail, recursive
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy