monitoring.DeploymentMonitor.scala Maven / Gradle / Ivy
The newest version!
//: ----------------------------------------------------------------------------
//: Copyright (C) 2017 Verizon. All Rights Reserved.
//:
//: Licensed under the Apache License, Version 2.0 (the "License");
//: you may not use this file except in compliance with the License.
//: You may obtain a copy of the License at
//:
//: http://www.apache.org/licenses/LICENSE-2.0
//:
//: Unless required by applicable law or agreed to in writing, software
//: distributed under the License is distributed on an "AS IS" BASIS,
//: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//: See the License for the specific language governing permissions and
//: limitations under the License.
//:
//: ----------------------------------------------------------------------------
package nelson
package monitoring
import journal.Logger
import Datacenter.{Deployment, TrafficShift,StackName}
import DeploymentStatus.{Ready, Warming}
import storage.{StoreOp, StoreOpF}
import health.{HealthCheckOp, HealthCheck, Passing}
import HealthCheckOp.HealthCheckF
import java.time.Instant
import scalaz.{NonEmptyList, OptionT}
import scalaz.concurrent.Task
import scalaz.syntax.bind._
import scalaz.syntax.std.option._
import scalaz.syntax.traverse._
import scalaz.std.list._
import scalaz.stream.{Process, Sink, sink, time}
import helm.HealthStatus._
import helm.HealthStatus
import nelson.Json.DeploymentEncoder
import nelson.audit.AuditableInstances.deploymentAuditable
import nelson.Nelson._
import scala.concurrent.duration.Duration
import scala.concurrent.duration._
import scala.language.postfixOps
/**
 * Intended to be launched as a background daemon for monitoring and acting on
 * deployment status/activity.
 *
 * On every heartbeat the monitor lists the deployments that are in the
 * `Warming` state, decides for each one whether it can be promoted to `Ready`
 * (see [[DeploymentMonitor.monitorActionItem]]), records metrics for the
 * decision and finally applies any promotion.
 */
object DeploymentMonitor {

  protected val log = Logger[this.type]

  /** A decision the monitor has taken for a single deployment. */
  sealed abstract class MonitorActionItem {
    def deployment: Deployment
  }

  /** The deployment should be moved from `Warming` to `Ready`. */
  final case class PromoteToReady(dc: Datacenter, deployment: Deployment) extends MonitorActionItem

  /** The deployment stays in `Warming`; `reason` explains why, for logging. */
  final case class RetainAsWarming(dc: Datacenter, deployment: Deployment, reason: String) extends MonitorActionItem

  /**
   * The cadence of the monitor. The first delay is fixed at one second; every
   * subsequent delay is re-read from `cfg.deploymentMonitor.delay`, and a
   * single tick is taken from `time.awakeEvery` for each delay.
   */
  def heartbeat(cfg: NelsonConfig): Process[Task, Duration] =
    (Process.eval(Task.now(1.seconds)) ++ Process.repeatEval(Task.delay(cfg.deploymentMonitor.delay))).flatMap(d =>
      time.awakeEvery(d)(cfg.pools.schedulingExecutor, cfg.pools.schedulingPool).once)

  /**
   * Creates a daemon that will decide all deployment monitor actions that need
   * to occur, and drain them: items are counted by `counterSink` and then
   * handed to `promotionSink`.
   */
  def loop(cfg: NelsonConfig): Process[Task, Unit] = drain(cfg)(heartbeat(cfg),
    lift[Process, Seq[MonitorActionItem]](monitorActionItems _ andThen Process.eval), counterSink,
    lift[Sink, MonitorActionItem](promotionSink)
  )

  /**
   * Drains all actions from the writer: for every element emitted by the
   * heartbeat `h`, the batches produced by `w` are flattened, each item is
   * passed through the sink `s`, failures are captured and observed by the
   * auditor's error sink, and the remaining items are routed to the final
   * sink `k`.
   *
   * @param h heartbeat that triggers each run of the pipeline
   * @param w producer of the batches of items to process
   * @param s sink applied to every item before the final sink
   * @param k final sink for the items
   */
  def drain[A](cfg: NelsonConfig)(h: Process[Task, Duration], w: NelsonFK[Process, Seq[A]], s: Sink[Task, A], k: NelsonFK[Sink, A]): Process[Task, Unit] =
    h >> w.run(cfg).flatMap(Process.emitAll)
      .through(s.toChannel)
      .attempt()
      .observeW(cfg.auditor.errorSink)
      .stripW
      .to(k.run(cfg))

  /**
   * Builds a list of MonitorActionItems based on the health of deployments
   * that are presently in the Warming state, across every configured
   * datacenter.
   */
  def monitorActionItems(cfg: NelsonConfig): Task[List[MonitorActionItem]] =
    cfg.datacenters.traverseM(dc => monitorActionItemsByDatacenter(dc))

  /** Collects the action items of every namespace stored for the datacenter. */
  def monitorActionItemsByDatacenter(dc: Datacenter): Task[List[MonitorActionItem]] =
    for {
      namespaces <- storage.run(dc.storage, StoreOp.listNamespacesForDatacenter(dc.name)).map(_.toList)
      items      <- namespaces.traverseM(n => monitorActionItemsByNamespace(dc, n))
    } yield items

  /** Decides an action item for every `Warming` deployment of the namespace. */
  def monitorActionItemsByNamespace(dc: Datacenter, ns: Datacenter.Namespace): Task[List[MonitorActionItem]] =
    for {
      warming <- storage.run(dc.storage, StoreOp.listDeploymentsForNamespaceByStatus(ns.id, NonEmptyList(Warming)))
                   .map(_.toList.map(_._1))
      items   <- warming.traverse(deployment => monitorActionItem(dc, deployment))
    } yield items

  /**
   * Validates that the deployment is reporting healthy and that no preceding
   * traffic shift is in progress; this guards us from having multiple traffic
   * shifts overlapping.
   *
   * The datacenter's health interpreter is asked for all health statuses of
   * the stack, and the deployment is only considered when a majority of them
   * are passing. It is promoted only when it is also the deployment returned
   * by [[next]], so that deployments are promoted in order.
   */
  def monitorActionItem(dc: Datacenter, d: Deployment): Task[MonitorActionItem] =
    for {
      hcs    <- health.run(dc.health, getHealth(dc, d.namespace.name, d.stackName))
      shift  <- storage.run(dc.storage, trafficShift(d))
      upNext <- storage.run(dc.storage, next(d))
    } yield {
      // Note: if the traffic shift is None the deployment will be promoted to ready.
      // This is intended for deployments that are not part of a traffic shift,
      // i.e. periodic jobs or bootstrapping a service
      if (!majorityPassing(hcs))
        RetainAsWarming(dc, d, "The majority of all health status checks must be passing.")
      else if (shift.exists(_.inProgress(Instant.now)))
        RetainAsWarming(dc, d, "Traffic shift in progress, can not promote at this time.")
      else if (upNext.contains(d))
        PromoteToReady(dc, d)
      else
        RetainAsWarming(dc, d, "A previous deployment exists and will be promoted first.")
    }

  /**
   * True when strictly more statuses are `Passing` than are not. An empty
   * list, or an exact tie, is not a majority.
   */
  def majorityPassing(statuses: List[health.HealthStatus]): Boolean = {
    val passing = statuses.count(_.status == Passing)
    passing > statuses.size - passing
  }

  /** The health-check operation fetching the statuses of the given stack. */
  def getHealth(dc: Datacenter, ns: NamespaceName, sn: StackName): HealthCheckOp.HealthCheckF[List[health.HealthStatus]] =
    HealthCheckOp.health(dc, ns, sn)

  /** The traffic shift, if any, recorded for the deployment's service name. */
  def trafficShift(d: Deployment): StoreOpF[Option[TrafficShift]] =
    StoreOp.getTrafficShiftForServiceName(d.nsid, d.unit.serviceName)

  /**
   * Returns the next deployment in warming state that should be promoted to
   * ready; this allows us to sequence traffic shifts in the order they were
   * deployed. getDeploymentsForServiceNameByStatus returns an ordered list of
   * deployments, of which the last element is selected.
   */
  def next(d: Deployment): StoreOpF[Option[Deployment]] =
    StoreOp.getDeploymentsForServiceNameByStatus(d.unit.serviceName, d.nsid, NonEmptyList(Warming)).map(_.lastOption)

  /** Sink that records a metric for every action item (see [[count]]). */
  val counterSink: Sink[Task, MonitorActionItem] =
    sink.lift { item => count(item) }

  /**
   * Increments the metric matching the action item, and logs the reason when
   * the deployment is retained. Failures are logged as warnings rather than
   * propagated.
   */
  def count(item: MonitorActionItem): Task[Unit] = {
    val task: Task[Unit] = item match {
      case PromoteToReady(dc, _) =>
        Task.delay(Metrics.default.deploymentMonitorReadyToPromote.labels(dc.name).inc())
      case RetainAsWarming(dc, d, r) =>
        Task.delay {
          Metrics.default.deploymentMonitorAwaitingHealth.labels(dc.name).inc()
          log.info(s"""Deployment "${d.stackName.toString}" not ready to promote to Ready. Reason: $r""")
        }
    }
    task.handleWith { case t =>
      warn(s"unexpected error occurred whilst attempting to update deployment monitor counters. ${t.getMessage}, cause: ${t.getCause}")
    }
  }

  /** Sink that applies every action item using the configured auditor. */
  def promotionSink(cfg: NelsonConfig): Sink[Task, MonitorActionItem] =
    sink.lift { item => promote(item)(cfg.auditor) }

  /**
   * Applies the action item: a `PromoteToReady` is written to storage and then
   * audited, a `RetainAsWarming` is a no-op. Failures are logged as warnings
   * rather than propagated.
   */
  def promote(item: MonitorActionItem)(auditor: audit.Auditor): Task[Unit] = {
    val task = item match {
      case PromoteToReady(dc, d) =>
        val t = storage.run(dc.storage, promoteToReady(d))
        t >> auditor.write(d, audit.ReadyAction)
      // Matched explicitly so that a new MonitorActionItem variant is flagged
      // by the exhaustiveness check instead of being silently ignored.
      case _: RetainAsWarming =>
        Task.now(())
    }
    task.handleWith { case t =>
      warn(s"unexpected error occurred whilst attempting to promote deployment. ${t.getMessage}, cause: ${t.getCause}")
    }
  }

  /**
   * Storage program that starts a traffic shift from the current target of the
   * service name (when there is one) to `d`, and then records the `Ready`
   * status for `d`.
   */
  def promoteToReady(d: Deployment): StoreOpF[Unit] = {
    // When no current target exists the OptionT short-circuits and no traffic
    // shift is started; the result is discarded either way.
    def startTrafficShift: StoreOpF[Unit] =
      (for {
        t  <- OptionT(StoreOp.getCurrentTargetForServiceName(d.nsid, d.unit.serviceName))
        id <- OptionT(StoreOp.startTrafficShift(from = t.deploymentTarget.id, to = d.id, start = Instant.now))
      } yield id).run.map(_ => ())

    def ready: StoreOpF[Unit] =
      StoreOp.createDeploymentStatus(d.id, Ready,
        Some(s"Promoting ${d.stackName} to ready."))

    startTrafficShift >> ready
  }

  /** Logs the message at warn level, suspended in a Task. */
  private def warn(msg: String): Task[Unit] =
    Task.delay(log.warn(msg))
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy