All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.spotify.helios.master.ZooKeeperMasterModel Maven / Gradle / Ivy

The newest version!
/*
 * Copyright (c) 2014 Spotify AB.
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.spotify.helios.master;

import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Ordering;

import com.fasterxml.jackson.core.type.TypeReference;
import com.spotify.helios.common.HeliosRuntimeException;
import com.spotify.helios.common.Json;
import com.spotify.helios.common.descriptors.AgentInfo;
import com.spotify.helios.common.descriptors.Deployment;
import com.spotify.helios.common.descriptors.DeploymentGroup;
import com.spotify.helios.common.descriptors.DeploymentGroupEvent;
import com.spotify.helios.common.descriptors.DeploymentGroupStatus;
import com.spotify.helios.common.descriptors.Goal;
import com.spotify.helios.common.descriptors.HostInfo;
import com.spotify.helios.common.descriptors.HostStatus;
import com.spotify.helios.common.descriptors.Job;
import com.spotify.helios.common.descriptors.JobId;
import com.spotify.helios.common.descriptors.JobStatus;
import com.spotify.helios.common.descriptors.PortMapping;
import com.spotify.helios.common.descriptors.RolloutOptions;
import com.spotify.helios.common.descriptors.RolloutTask;
import com.spotify.helios.common.descriptors.Task;
import com.spotify.helios.common.descriptors.TaskStatus;
import com.spotify.helios.common.descriptors.TaskStatusEvent;
import com.spotify.helios.rollingupdate.RolloutPlanner;
import com.spotify.helios.servicescommon.KafkaRecord;
import com.spotify.helios.servicescommon.KafkaSender;
import com.spotify.helios.servicescommon.coordination.Node;
import com.spotify.helios.servicescommon.coordination.Paths;
import com.spotify.helios.servicescommon.coordination.ZooKeeperClient;
import com.spotify.helios.servicescommon.coordination.ZooKeeperClientProvider;
import com.spotify.helios.servicescommon.coordination.ZooKeeperOperation;

import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.KeeperException.NoNodeException;
import org.apache.zookeeper.KeeperException.NodeExistsException;
import org.apache.zookeeper.KeeperException.NotEmptyException;
import org.apache.zookeeper.data.Stat;
import org.jetbrains.annotations.Nullable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.UUID;

import static com.google.common.base.Charsets.UTF_8;
import static com.google.common.base.Optional.fromNullable;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Strings.isNullOrEmpty;
import static com.google.common.collect.Lists.newArrayList;
import static com.google.common.collect.Lists.reverse;
import static com.spotify.helios.common.descriptors.DeploymentGroupStatus.State.DONE;
import static com.spotify.helios.common.descriptors.DeploymentGroupStatus.State.FAILED;
import static com.spotify.helios.common.descriptors.DeploymentGroupStatus.State.PLANNING_ROLLOUT;
import static com.spotify.helios.common.descriptors.DeploymentGroupStatus.State.ROLLING_OUT;
import static com.spotify.helios.common.descriptors.DeploymentGroupStatus.State.START_ROLLING_UPDATE;
import static com.spotify.helios.common.descriptors.Descriptor.parse;
import static com.spotify.helios.common.descriptors.HostStatus.Status.DOWN;
import static com.spotify.helios.common.descriptors.HostStatus.Status.UP;
import static com.spotify.helios.servicescommon.coordination.ZooKeeperOperations.check;
import static com.spotify.helios.servicescommon.coordination.ZooKeeperOperations.create;
import static com.spotify.helios.servicescommon.coordination.ZooKeeperOperations.delete;
import static com.spotify.helios.servicescommon.coordination.ZooKeeperOperations.set;
import static java.util.Collections.emptyList;
import static java.util.Collections.emptyMap;
import static java.util.concurrent.TimeUnit.MILLISECONDS;

/**
 * The Helios Master's view into ZooKeeper.
 */
public class ZooKeeperMasterModel implements MasterModel {
  private static final Comparator EVENT_COMPARATOR =
      new Comparator() {
        @Override
        public int compare(TaskStatusEvent arg0, TaskStatusEvent arg1) {
          if (arg1.getTimestamp() > arg0.getTimestamp()) {
            return -1;
          } else if (arg1.getTimestamp() == arg0.getTimestamp()) {
            return 0;
          } else {
            return 1;
          }
        }
      };

  private static final Logger log = LoggerFactory.getLogger(ZooKeeperMasterModel.class);

  public static final Map EMPTY_STATUSES = emptyMap();
  public static final TypeReference
      HOST_INFO_TYPE =
      new TypeReference() {};
  public static final TypeReference
      AGENT_INFO_TYPE =
      new TypeReference() {};
  public static final TypeReference>
      STRING_MAP_TYPE =
      new TypeReference>() {};
  public static final TypeReference>
      STRING_LIST_TYPE =
      new TypeReference>() {};

  private final ZooKeeperClientProvider provider;
  private final String name;
  private final KafkaSender kafkaSender;

  public  ZooKeeperMasterModel(final ZooKeeperClientProvider provider)
      throws IOException, InterruptedException {
    this(provider, null);
  }

  public ZooKeeperMasterModel(final ZooKeeperClientProvider provider, @Nullable final String name) {
    this(provider, name, null);
  }

  /**
   * Constructor
   * @param provider         {@link ZooKeeperClientProvider}
   * @param name             The hostname of the machine running the {@link MasterModel}
   * @param kafkaSender      {@link KafkaSender}
   */
  public ZooKeeperMasterModel(
      final ZooKeeperClientProvider provider,
      @Nullable final String name,
      @Nullable final KafkaSender kafkaSender) {
    this.provider = provider;
    this.name = name;
    this.kafkaSender = kafkaSender;
  }

  /**
   * Registers a host into ZooKeeper.  The {@code id} is initially generated randomly by the Agent
   * and persisted on disk.  This way, in the event that you have two agents attempting to register
   * with the same value of @{code host}, the first one will win.
   */
  @Override
  public void registerHost(final String host, final String id) {
    log.info("registering host: {}", host);
    final ZooKeeperClient client = provider.get("registerHost");
    try {
      // TODO (dano): this code is replicated in AgentZooKeeperRegistrar

      // This would've been nice to do in a transaction but PathChildrenCache ensures paths
      // so we can't know what paths already exist so assembling a suitable transaction is too
      // painful.
      client.ensurePath(Paths.configHost(host));
      client.ensurePath(Paths.configHostJobs(host));
      client.ensurePath(Paths.configHostPorts(host));
      client.ensurePath(Paths.statusHost(host));
      client.ensurePath(Paths.statusHostJobs(host));

      // Finish registration by creating the id node last
      client.createAndSetData(Paths.configHostId(host), id.getBytes(UTF_8));
    } catch (Exception e) {
      throw new HeliosRuntimeException("registering host " + host + " failed", e);
    }
  }

  /**
   * Returns a list of the hosts/agents that have been registered.
   */
  @Override
  public List listHosts() {
    try {
      // TODO (dano): only return hosts whose agents completed registration (i.e. has id nodes)
      return provider.get("listHosts").getChildren(Paths.configHosts());
    } catch (KeeperException.NoNodeException e) {
      return emptyList();
    } catch (KeeperException e) {
      throw new HeliosRuntimeException("listing hosts failed", e);
    }
  }

  /**
   * Returns a list of the host names of the currently running masters.
   */
  @Override
  public List getRunningMasters() {
    final ZooKeeperClient client = provider.get("getRunningMasters");
    try {
      final List masters = client.getChildren(Paths.statusMaster());
      final ImmutableList.Builder upMasters = ImmutableList.builder();
      for (final String master : masters) {
        if (client.exists(Paths.statusMasterUp(master)) != null) {
          upMasters.add(master);
        }
      }
      return upMasters.build();
    } catch (KeeperException e) {
      throw new HeliosRuntimeException("listing masters failed", e);
    }
  }

  /**
   * Undoes the effect of {@link ZooKeeperMasterModel#registerHost(String, String)}.  Cleans up
   * any leftover host-related things.
   */
  @Override
  public void deregisterHost(final String host)
      throws HostNotFoundException, HostStillInUseException {
    log.info("deregistering host: {}", host);
    final ZooKeeperClient client = provider.get("deregisterHost");
    // TODO (dano): handle retry failures
    try {
      final List operations = Lists.newArrayList();

      // Remove all jobs deployed to this host
      final List jobs = listHostJobs(client, host);

      if (jobs == null) {
        if (client.exists(Paths.configHost(host)) == null) {
          throw new HostNotFoundException("host [" + host + "] does not exist");
        }
      }

      if (jobs != null) {
        for (final JobId job : jobs) {
          final String hostJobPath = Paths.configHostJob(host, job);

          final List nodes = safeListRecursive(client, hostJobPath);
          for (final String node : reverse(nodes)) {
            operations.add(delete(node));
          }
          if (client.exists(Paths.configJobHost(job, host)) != null) {
            operations.add(delete(Paths.configJobHost(job, host)));
          }
          // Clean out the history for each job
          final List history = safeListRecursive(client, Paths.historyJobHost(job, host));
          for (final String s : reverse(history)) {
            operations.add(delete(s));
          }
        }
      }
      operations.add(delete(Paths.configHostJobs(host)));

      // Remove the host status
      final List nodes = safeListRecursive(client, Paths.statusHost(host));
      for (final String node : reverse(nodes)) {
        operations.add(delete(node));
      }

      // Remove port allocations
      final List ports = safeGetChildren(client, Paths.configHostPorts(host));
      for (final String port : ports) {
        operations.add(delete(Paths.configHostPort(host, Integer.valueOf(port))));
      }
      operations.add(delete(Paths.configHostPorts(host)));

      // Remove host id
      final String idPath = Paths.configHostId(host);
      if (client.exists(idPath) != null) {
        operations.add(delete(idPath));
      }

      // Remove host config root
      operations.add(delete(Paths.configHost(host)));

      client.transaction(operations);
    } catch (NotEmptyException e) {
      final HostStatus hostStatus = getHostStatus(host);
      final List jobs = hostStatus != null
                               ? ImmutableList.copyOf(hostStatus.getJobs().keySet())
                               : Collections.emptyList();
      throw new HostStillInUseException(host, jobs);
    } catch (NoNodeException e) {
      throw new HostNotFoundException(host);
    } catch (KeeperException e) {
      throw new HeliosRuntimeException(e);
    }
  }

  private List safeGetChildren(final ZooKeeperClient client, final String path) {
    try {
      return client.getChildren(path);
    } catch (KeeperException ignore) {
      return ImmutableList.of();
    }
  }

  private List safeListRecursive(final ZooKeeperClient client, final String path)
      throws KeeperException {
    try {
      return client.listRecursive(path);
    } catch (NoNodeException e) {
      return ImmutableList.of();
    }
  }

  /**
   * Adds a job into the configuration.
   */
  @Override
  public void addJob(final Job job) throws JobExistsException {
    log.info("adding job: {}", job);
    final JobId id = job.getId();
    final UUID operationId = UUID.randomUUID();
    final String creationPath = Paths.configJobCreation(id, operationId);
    final ZooKeeperClient client = provider.get("addJob");
    try {
      try {
        client.ensurePath(Paths.historyJob(id));
        client.transaction(create(Paths.configJob(id), job),
                           create(Paths.configJobRefShort(id), id),
                           create(Paths.configJobHosts(id)),
                           create(creationPath),
                           // Touch the jobs root node so that its version is bumped on every job
                           // change down the tree. Effectively, make it that version == cVersion.
                           set(Paths.configJobs(), UUID.randomUUID().toString().getBytes()));
      } catch (final NodeExistsException e) {
        if (client.exists(creationPath) != null) {
          // The job was created, we're done here
          return;
        }
        throw new JobExistsException(id.toString());
      }
    } catch (NoNodeException e) {
      throw new HeliosRuntimeException("adding job " + job + " failed due to missing ZK path: " +
                                       e.getPath(), e);
    } catch (final KeeperException e) {
      throw new HeliosRuntimeException("adding job " + job + " failed", e);
    }
  }

  /**
   * Given a jobId, returns the N most recent events in it's history in the cluster.
   */
  @Override
  public List getJobHistory(final JobId jobId) throws JobDoesNotExistException {
    final Job descriptor = getJob(jobId);
    if (descriptor == null) {
      throw new JobDoesNotExistException(jobId);
    }
    final ZooKeeperClient client = provider.get("getJobHistory");
    final List hosts;
    try {
      hosts = client.getChildren(Paths.historyJobHosts(jobId));
    } catch (NoNodeException e) {
      return emptyList();
    } catch (KeeperException e) {
      throw Throwables.propagate(e);
    }

    final List jsEvents = Lists.newArrayList();

    for (String host : hosts) {
      final List events;
      try {
        events = client.getChildren(Paths.historyJobHostEvents(jobId, host));
      } catch (NoNodeException e) {
        continue;
      } catch (KeeperException e) {
        throw Throwables.propagate(e);
      }

      for (String event : events) {
        try {
          byte[] data = client.getData(Paths.historyJobHostEventsTimestamp(
              jobId, host, Long.valueOf(event)));
          final TaskStatus status = Json.read(data, TaskStatus.class);
          jsEvents.add(new TaskStatusEvent(status, Long.valueOf(event), host));
        } catch (NoNodeException e) { // ignore, it went away before we read it
        } catch (KeeperException | IOException e) {
          throw Throwables.propagate(e);
        }
      }
    }

    return Ordering.from(EVENT_COMPARATOR).sortedCopy(jsEvents);
  }

  @Override
  public void addDeploymentGroup(final DeploymentGroup deploymentGroup)
      throws DeploymentGroupExistsException {
    log.info("adding deployment-group: {}", deploymentGroup);
    final ZooKeeperClient client = provider.get("addDeploymentGroup");

    try {
      try {
        client.ensurePath(Paths.configDeploymentGroups());
        client.ensurePath(Paths.statusDeploymentGroups());
        client.transaction(
            create(Paths.configDeploymentGroup(deploymentGroup.getName()), deploymentGroup),
            create(Paths.statusDeploymentGroup(deploymentGroup.getName())),
            create(Paths.statusDeploymentGroupHosts(deploymentGroup.getName()))
          );
      } catch (final NodeExistsException e) {
        throw new DeploymentGroupExistsException(deploymentGroup.getName());
      }
    } catch (final KeeperException e) {
      throw new HeliosRuntimeException("adding deployment-group " + deploymentGroup + " failed", e);
    }
  }

  @Override
  public DeploymentGroup getDeploymentGroup(final String name)
      throws DeploymentGroupDoesNotExistException {
    log.debug("getting deployment-group: {}", name);
    final ZooKeeperClient client = provider.get("getDeploymentGroup");
    return getDeploymentGroup(client, name);
  }

  private DeploymentGroup getDeploymentGroup(final ZooKeeperClient client, final String name)
      throws DeploymentGroupDoesNotExistException {
    try {
      final byte[] data = client.getData(Paths.configDeploymentGroup(name));
      return Json.read(data, DeploymentGroup.class);
    } catch (NoNodeException e) {
      throw new DeploymentGroupDoesNotExistException(name);
    } catch (KeeperException | IOException e) {
      throw new HeliosRuntimeException("getting deployment-group " + name + " failed", e);
    }
  }

  @Override
  public void removeDeploymentGroup(final String name) throws DeploymentGroupDoesNotExistException {
    log.info("removing deployment-group: name={}", name);
    final ZooKeeperClient client = provider.get("removeDeploymentGroup");
    try {
      client.ensurePath(Paths.configDeploymentGroups());
      client.delete(Paths.configDeploymentGroup(name));
      if (client.exists(Paths.statusDeploymentGroupHosts(name)) != null) {
        client.delete(Paths.statusDeploymentGroupHosts(name));
      }
      if (client.exists(Paths.statusDeploymentGroup(name)) != null) {
        client.delete(Paths.statusDeploymentGroup(name));
      }
    } catch (final NoNodeException e) {
      throw new DeploymentGroupDoesNotExistException(name);
    } catch (final KeeperException e) {
      throw new HeliosRuntimeException("removing deployment-group " + name + " failed", e);
    }
  }

  @Override
  public void updateDeploymentGroupHosts(String name, List hosts)
      throws DeploymentGroupDoesNotExistException {
    log.debug("updating deployment-group hosts: name={}", name);
    final ZooKeeperClient client = provider.get("updateDeploymentGroupHosts");
    try {
      client.setData(Paths.statusDeploymentGroupHosts(name), Json.asBytes(hosts));
    } catch (NoNodeException e) {
      throw new DeploymentGroupDoesNotExistException(name, e);
    } catch (KeeperException | IOException e) {
      throw new HeliosRuntimeException("updating deployment group hosts failed", e);
    }
  }

  @Override
  public void rollingUpdate(final DeploymentGroup deploymentGroup,
                            final JobId jobId,
                            final RolloutOptions options)
      throws DeploymentGroupDoesNotExistException, JobDoesNotExistException {
    checkNotNull(deploymentGroup, "deploymentGroup");

    log.info("rolling-update on deployment-group: name={}", deploymentGroup.getName());

    final DeploymentGroup updated = deploymentGroup.toBuilder()
        .setJobId(jobId)
        .setRolloutOptions(options)
        .build();

    if (getJob(jobId) == null) {
      throw new JobDoesNotExistException(jobId);
    }

    final List operations = Lists.newArrayList();
    final ZooKeeperClient client = provider.get("rollingUpdate");

    operations.add(set(Paths.configDeploymentGroup(deploymentGroup.getName()), updated));

    final String statusPath = Paths.statusDeploymentGroup(deploymentGroup.getName());
    final DeploymentGroupStatus initialStatus = DeploymentGroupStatus.newBuilder()
        .setDeploymentGroup(deploymentGroup)
        .setState(START_ROLLING_UPDATE)
        .build();
    operations.add(set(statusPath, initialStatus));

    try {
      client.ensurePath(statusPath);
      client.transaction(operations);

      if (kafkaSender != null) {
        final DeploymentGroupEvent event = DeploymentGroupEvent.newBuilder()
            .setDeploymentGroupStatus(initialStatus)
            .build();
        kafkaSender.send(KafkaRecord.of(DeploymentGroupEvent.KAFKA_TOPIC, event.toJsonBytes()));
      }
    } catch (final NoNodeException e) {
      throw new DeploymentGroupDoesNotExistException(deploymentGroup.getName());
    } catch (final KeeperException e) {
      throw new HeliosRuntimeException(
          "rolling-update on deployment-group " + deploymentGroup.getName() + " failed", e);
    }
  }

  @Override
  public void rollingUpdateStep(final DeploymentGroup deploymentGroup,
                                final RolloutPlanner rolloutPlanner)
      throws DeploymentGroupDoesNotExistException {
    checkNotNull(deploymentGroup, "deploymentGroup");

    log.debug("rolling-update step on deployment-group: name={}", deploymentGroup.getName());

    final ZooKeeperClient client = provider.get("rollingUpdateStep");
    final String statusPath = Paths.statusDeploymentGroup(deploymentGroup.getName());
    final DeploymentGroupStatus status = getDeploymentGroupStatus(deploymentGroup.getName());

    if (status == null) {
      // The rolling-update command hasn't been called yet for this deployment group.
      // The deployment group status doesn't exist yet and there's nothing to do.
      return;
    }

    final RolloutOpsEvents opsEvents = new RolloutOpsEvents();

    final DeploymentGroupStatus.State state = status.getState();
    if (state.equals(START_ROLLING_UPDATE) || state.equals(PLANNING_ROLLOUT)) {
      // generate the rollout plan and proceed to ROLLING_OUT
      final Map hostsAndStatuses = Maps.newLinkedHashMap();
      for (final String host: getDeploymentGroupHosts(deploymentGroup.getName())) {
        hostsAndStatuses.put(host, getHostStatus(host));
      }

      final List oldPlan = status.getRolloutTasks();
      final List newPlan = rolloutPlanner.plan(hostsAndStatuses);

      final DeploymentGroupStatus.Builder newStatus = status.toBuilder()
          .setState(ROLLING_OUT)
          .setRolloutTasks(newPlan)
          .setTaskIndex(0);

      if (!Objects.equals(oldPlan, newPlan)) {
        // if our plan changes (because hosts have been added or removed), reset
        // the successful iteration counter (since our new plan has never been successful)
        newStatus.setSuccessfulIterations(0);
      }

      opsEvents.addOperation(set(statusPath, newStatus.build()));
    } else if (status.getState().equals(ROLLING_OUT)) {
      // grab the current task off the rollout task list and execute it
      opsEvents.addAll(getRolloutOperations(deploymentGroup, status));
    } else if (status.getState().equals(DONE)) {
      if (status.getSuccessfulIterations() == 1) {
        // this is the first successful iteration
        opsEvents.addEvent(DeploymentGroupEvent.newBuilder()
                               .setDeploymentGroupStatus(status)
                               .build());
      }

      // after DONE, go to PLANNING_ROLLOUT
      opsEvents.addOperation(set(statusPath, status.toBuilder()
          .setState(PLANNING_ROLLOUT)
          .build()));
    }

    if (opsEvents.getOperations().isEmpty()) {
      return;
    }

    try {
      final List zkOperations = opsEvents.getOperations();
      client.transaction(Lists.asList(
          check(statusPath, status.getVersion()),
          zkOperations.toArray(new ZooKeeperOperation[zkOperations.size()])
      ));

      if (kafkaSender != null) {
        for (final DeploymentGroupEvent event : opsEvents.getEvents()) {
          kafkaSender.send(KafkaRecord.of(DeploymentGroupEvent.KAFKA_TOPIC, event.toJsonBytes()));
        }
      }
    } catch (final KeeperException e) {
      if (e instanceof KeeperException.BadVersionException) {
        // some other master beat us in processing this rolling update step. not exceptional.
        // ideally we would check the path in the exception, but curator doesn't provide a path
        // for exceptions thrown as part of a transaction.
        log.debug("error saving rolling-update operations: {}", e);
      } else {
        throw new HeliosRuntimeException(
            "rolling-update on deployment-group " + deploymentGroup.getName() + " failed", e);
      }
    }
  }

  private RolloutOpsEvents getRolloutOperations(final DeploymentGroup deploymentGroup,
                                                final DeploymentGroupStatus status) {
    final int taskIndex = status.getTaskIndex();
    final RolloutTask currentTask = Iterables.get(status.getRolloutTasks(), taskIndex, null);
    final RollingUpdateTaskResult result = getRollingUpdateTaskResult(currentTask, deploymentGroup);

    final String statusPath = Paths.statusDeploymentGroup(deploymentGroup.getName());
    final RolloutOpsEvents opsEvents = new RolloutOpsEvents();

    if (result.equals(RollingUpdateTaskResult.TASK_IN_PROGRESS)) {
      // not an error, but nothing to do
      return opsEvents;
    }

    if (result.error != null) {
      // if an error occurred, record it in the status and fail
      opsEvents.addEvent(DeploymentGroupEvent.newBuilder()
                       .setDeploymentGroupStatus(status.toBuilder().setState(FAILED).build())
                       .build());
      final String errMsg = isNullOrEmpty(result.host) ? result.error.getMessage() :
                            result.host + ": " + result.error.getMessage();
      opsEvents.addOperation(set(statusPath, status.toBuilder()
          .setState(FAILED)
          .setError(errMsg)
          .build()));
    } else {
      for (ZooKeeperOperation op : result.operations) {
        opsEvents.addOperation(op);
      }

      if (!result.operations.isEmpty()) {
        // if we're actually doing any operations, then record an event

        final DeploymentGroupEvent.Builder eventBuilder = DeploymentGroupEvent.newBuilder()
            .setRolloutTaskStatus(RolloutTask.Status.OK)
            .setDeploymentGroupStatus(status.toBuilder().setState(ROLLING_OUT).build());

        if (currentTask != null) {
          eventBuilder
              .setAction(currentTask.getAction())
              .setTarget(currentTask.getTarget());
        }

        opsEvents.addEvent(eventBuilder.build());
      }

      if (taskIndex + 1 >= status.getRolloutTasks().size()) {
        // successfully completed the last task
        opsEvents.addOperation(set(statusPath, status.toBuilder()
            .setSuccessfulIterations(status.getSuccessfulIterations() + 1)
            .setState(DONE)
            .build()));
      } else {
        opsEvents.addOperation(set(statusPath, status.toBuilder()
            .setTaskIndex(taskIndex + 1)
            .build()));
      }
    }

    return opsEvents;
  }

  private RollingUpdateTaskResult getRollingUpdateTaskResult(final RolloutTask task,
                                                             final DeploymentGroup group) {
    final RollingUpdateTaskResult result;
    if (task == null) {
      // if there is no rollout task, then we're done by definition. this can happen
      // when (for example) there are no hosts in the deployment group
      result = RollingUpdateTaskResult.TASK_COMPLETE;
    } else {
      final String host = task.getTarget();
      final RolloutTask.Action action = task.getAction();

      switch (action) {
        case UNDEPLOY_OLD_JOBS:
          // add undeploy ops for jobs previously deployed by this deployment group
          result = rollingUpdateUndeploy(group, host);
          break;
        case DEPLOY_NEW_JOB:
          // add deploy ops for the new job
          result = rollingUpdateDeploy(group, host);
          break;
        case AWAIT_RUNNING:
          result = rollingUpdateAwaitRunning(group, host);
          break;
        default:
          throw new HeliosRuntimeException(String.format(
              "unknown rollout task type %s for deployment group %s.", action, group.getName()));
      }
    }
    return result;
  }

  private RollingUpdateTaskResult rollingUpdateAwaitRunning(final DeploymentGroup deploymentGroup,
                                                            final String host) {
    final ZooKeeperClient client = provider.get("rollingUpdateAwaitRunning");
    final Map taskStatuses = getTaskStatuses(client, host);

    if (!taskStatuses.containsKey(deploymentGroup.getJobId())) {
      // Handle cases where agent has not written job status to zookeeper.

      // If job is not listed under /config/hosts node, it may have been deployed successfully and
      // then manually undeployed. The job will not get redeployed, so treat this as a failure.
      final Deployment deployment = getDeployment(host, deploymentGroup.getJobId());
      if (deployment == null) {
        return RollingUpdateTaskResult.error(
            "Job unexpectedly undeployed. Perhaps it was manually undeployed?", host);
      }

      // Check if we've exceeded the timeout for the rollout operation.
      if (isRolloutTimedOut(deploymentGroup, client)) {
        return RollingUpdateTaskResult.error("timed out while retrieving job status", host);
      }

      // We haven't detected any errors, so assume the agent will write the status soon.
      return RollingUpdateTaskResult.TASK_IN_PROGRESS;
    } else if (!taskStatuses.get(deploymentGroup.getJobId()).getState()
        .equals(TaskStatus.State.RUNNING)) {
      // job isn't running yet

      if (isRolloutTimedOut(deploymentGroup, client)) {
        // time exceeding the configured deploy timeout has passed, and this job is still not
        // running
        return RollingUpdateTaskResult.error("timed out waiting for job to reach RUNNING", host);
      }

      return RollingUpdateTaskResult.TASK_IN_PROGRESS;
    } else {
      // the job is running on the host. last thing we have to ensure is that it was
      // deployed by this deployment group. otherwise some weird conflict has occurred and we
      // won't be able to undeploy the job on the next update.
      final Deployment deployment = getDeployment(host, deploymentGroup.getJobId());
      if (deployment == null) {
        return RollingUpdateTaskResult.error(
            "deployment for this job not found in zookeeper. " +
            "Perhaps it was manually undeployed?", host);
      } else if (!Objects.equals(deployment.getDeploymentGroupName(), deploymentGroup.getName())) {
        return RollingUpdateTaskResult.error(
            "job was already deployed, either manually or by a different deployment group", host);
      }

      return RollingUpdateTaskResult.TASK_COMPLETE;
    }
  }

  private boolean isRolloutTimedOut(final DeploymentGroup deploymentGroup,
                                    final ZooKeeperClient client) {
    try {
      final String statusPath = Paths.statusDeploymentGroup(deploymentGroup.getName());
      final long secondsSinceDeploy = MILLISECONDS.toSeconds(
          System.currentTimeMillis() - client.getNode(statusPath).getStat().getMtime());
      return secondsSinceDeploy > deploymentGroup.getRolloutOptions().getTimeout();
    } catch (KeeperException e) {
      // statusPath doesn't exist or some other ZK issue. probably this deployment group
      // was removed.
      log.warn("error determining deployment group modification time: {} - {}",
               deploymentGroup.getName(), e);
      return false;
    }
  }

  private RollingUpdateTaskResult rollingUpdateDeploy(final DeploymentGroup deploymentGroup,
                                                      final String host) {
    final Deployment deployment = Deployment.of(deploymentGroup.getJobId(), Goal.START,
                                                Deployment.EMTPY_DEPLOYER_USER, this.name,
                                                deploymentGroup.getName());
    final ZooKeeperClient client = provider.get("rollingUpdateDeploy");

    try {
      return RollingUpdateTaskResult.of(getDeployOperations(client, host, deployment,
                                                            Job.EMPTY_TOKEN));
    } catch (JobDoesNotExistException | TokenVerificationException | HostNotFoundException e) {
      return RollingUpdateTaskResult.error(e);
    } catch (JobAlreadyDeployedException e) {
     return RollingUpdateTaskResult.TASK_COMPLETE;
    }
  }

  private RollingUpdateTaskResult rollingUpdateUndeploy(final DeploymentGroup deploymentGroup,
                                                        final String host) {
    final ZooKeeperClient client = provider.get("rollingUpdateUndeploy");
    final List operations = Lists.newArrayList();

    for (final Deployment deployment : getTasks(client, host).values()) {
      final boolean isOwnedByDeploymentGroup = Objects.equals(
          deployment.getDeploymentGroupName(), deploymentGroup.getName());
      final boolean isSameJob = deployment.getJobId().equals(deploymentGroup.getJobId());

      if (isOwnedByDeploymentGroup || (
          isSameJob && deploymentGroup.getRolloutOptions().getMigrate())) {
        if (isSameJob && isOwnedByDeploymentGroup && deployment.getGoal().equals(Goal.START)) {
          // The job we want deployed is already deployed and set to run, so just leave it.
          continue;
        }

        try {
          operations.addAll(getUndeployOperations(client, host, deployment.getJobId(),
                                                  Job.EMPTY_TOKEN));
        } catch (TokenVerificationException | HostNotFoundException e) {
          return RollingUpdateTaskResult.error(e, host);
        } catch (JobNotDeployedException e) {
          // probably somebody beat us to the punch of undeploying. that's fine.
        }
      }
    }

    return RollingUpdateTaskResult.of(operations);
  }

  @Override
  public void stopDeploymentGroup(final String deploymentGroupName)
      throws DeploymentGroupDoesNotExistException {
    checkNotNull(deploymentGroupName, "name");

    log.info("stop deployment-group: name={}", deploymentGroupName);

    final ZooKeeperClient client = provider.get("stopDeploymentGroup");

    final DeploymentGroup deploymentGroup = getDeploymentGroup(deploymentGroupName);

    final String statusPath = Paths.statusDeploymentGroup(deploymentGroupName);
    final DeploymentGroupStatus status = DeploymentGroupStatus.newBuilder()
        .setDeploymentGroup(deploymentGroup)
        .setState(FAILED)
        .setError("Stopped by user")
        .build();

    try {
      client.ensurePath(statusPath);
      client.transaction(set(statusPath, status));
    } catch (final NoNodeException e) {
      throw new DeploymentGroupDoesNotExistException(deploymentGroupName);
    } catch (final KeeperException e) {
      throw new HeliosRuntimeException(
          "stop deployment-group " + deploymentGroupName + " failed", e);
    }
  }

  /**
   * Returns a {@link Map} of deployment group name to {@link DeploymentGroup} objects for all of
   * the deployment groups known.
   */
  @Override
  public Map getDeploymentGroups() {
    log.debug("getting deployment groups");
    final String folder = Paths.configDeploymentGroups();
    final ZooKeeperClient client = provider.get("getDeploymentGroups");
    try {
      final List names;
      try {
        names = client.getChildren(folder);
      } catch (NoNodeException e) {
        return Maps.newHashMap();
      }
      final Map descriptors = Maps.newHashMap();
      for (final String name : names) {
        final String path = Paths.configDeploymentGroup(name);
        try {
          final byte[] data = client.getData(path);
          final DeploymentGroup descriptor = parse(data, DeploymentGroup.class);
          descriptors.put(descriptor.getName(), descriptor);
        } catch (NoNodeException e) {
          // Ignore, the deployment group was deleted before we had a chance to read it.
          log.debug("Ignoring deleted deployment group {}", name);
        }
      }
      return descriptors;
    } catch (KeeperException | IOException e) {
      throw new HeliosRuntimeException("getting deployment groups failed", e);
    }
  }

  @Override
  public DeploymentGroupStatus getDeploymentGroupStatus(final String name)
      throws DeploymentGroupDoesNotExistException {
    log.debug("getting deployment group status: {}", name);
    final ZooKeeperClient client = provider.get("getDeploymentGroupStatus");

    final DeploymentGroup deploymentGroup = getDeploymentGroup(client, name);
    if (deploymentGroup == null) {
      return null;
    }

    try {
      final Node node = client.getNode(Paths.statusDeploymentGroup(name));

      final byte[] bytes = node.getBytes();
      if (bytes.length == 0) {
        return null;
      }

      final DeploymentGroupStatus status = Json.read(bytes, DeploymentGroupStatus.class);
      return status.toBuilder()
          .setVersion(node.getStat().getVersion())
          .build();
    } catch (NoNodeException e) {
      return null;
    } catch (KeeperException | IOException e) {
      throw new HeliosRuntimeException("getting deployment group status " + name + " failed", e);
    }
  }

  @Override
  public List getDeploymentGroupHosts(final String name)
      throws DeploymentGroupDoesNotExistException {
    log.debug("getting deployment group hosts: {}", name);
    final ZooKeeperClient client = provider.get("getDeploymentGroupHosts");

    final DeploymentGroup deploymentGroup = getDeploymentGroup(client, name);
    if (deploymentGroup == null) {
      throw new DeploymentGroupDoesNotExistException(name);
    }

    try {
      final byte[] data = client.getData(Paths.statusDeploymentGroupHosts(name));

      if (data.length > 0) {
        return Json.read(data, STRING_LIST_TYPE);
      }
    } catch (NoNodeException e) {
      // not fatal
    } catch (KeeperException | IOException e) {
      throw new HeliosRuntimeException("reading deployment group hosts failed: " + name, e);
    }

    return emptyList();
  }

  /**
   * Returns the job configuration for the job specified by {@code id} as a
   * {@link Job} object.
   */
  @Override
  public Job getJob(final JobId id) {
    log.debug("getting job: {}", id);
    final ZooKeeperClient client = provider.get("getJobId");
    return getJob(client, id);
  }


  private Job getJob(final ZooKeeperClient client, final JobId id) {
    final String path = Paths.configJob(id);
    try {
      final byte[] data = client.getData(path);
      return Json.read(data, Job.class);
    } catch (NoNodeException e) {
      // Return null to indicate that the job does not exist
      return null;
    } catch (KeeperException | IOException e) {
      throw new HeliosRuntimeException("getting job " + id + " failed", e);
    }
  }

  /**
   * Returns a {@link Map} of {@link JobId} to {@link Job} objects for all of the jobs known.
   */
  @Override
  public Map getJobs() {
    log.debug("getting jobs");
    final String folder = Paths.configJobs();
    final ZooKeeperClient client = provider.get("getJobs");
    try {
      final List ids;
      try {
        ids = client.getChildren(folder);
      } catch (NoNodeException e) {
        return Maps.newHashMap();
      }
      final Map descriptors = Maps.newHashMap();
      for (final String id : ids) {
        final JobId jobId = JobId.fromString(id);
        final String path = Paths.configJob(jobId);
        try {
          final byte[] data = client.getData(path);
          final Job descriptor = parse(data, Job.class);
          descriptors.put(descriptor.getId(), descriptor);
        } catch (NoNodeException e) {
          // Ignore, the job was deleted before we had a chance to read it.
          log.debug("Ignoring deleted job {}", jobId);
        }
      }
      return descriptors;
    } catch (KeeperException | IOException e) {
      throw new HeliosRuntimeException("getting jobs failed", e);
    }
  }

  /**
   * Returns the current job status as a {@link JobStatus} object.
   */
  @Override
  public JobStatus getJobStatus(final JobId jobId) {
    final ZooKeeperClient client = provider.get("getJobStatus");

    final Job job = getJob(client, jobId);
    if (job == null) {
      return null;
    }

    final List hosts;
    try {
      hosts = listJobHosts(client, jobId);
    } catch (JobDoesNotExistException e) {
      return null;
    }

    final ImmutableMap.Builder deployments = ImmutableMap.builder();
    final ImmutableMap.Builder taskStatuses = ImmutableMap.builder();
    for (final String host : hosts) {
      final TaskStatus taskStatus = getTaskStatus(client, host, jobId);
      if (taskStatus != null) {
        taskStatuses.put(host, taskStatus);
      }
      final Deployment deployment = getDeployment(host, jobId);
      if (deployment != null) {
        deployments.put(host, deployment);
      }
    }

    final Map deploymentsMap = deployments.build();
    return JobStatus.newBuilder()
        .setJob(job)
        .setDeployments(deploymentsMap)
        .setTaskStatuses(taskStatuses.build())
        .build();
  }

  private List listJobHosts(final ZooKeeperClient client, final JobId jobId)
      throws JobDoesNotExistException {
    final List hosts;
    try {
      hosts = client.getChildren(Paths.configJobHosts(jobId));
    } catch (NoNodeException e) {
      throw new JobDoesNotExistException(jobId);
    } catch (KeeperException e) {
      throw new HeliosRuntimeException("failed to list hosts for job: " + jobId, e);
    }
    return hosts;
  }

  @Override
  public Job removeJob(JobId jobId) throws JobDoesNotExistException, JobStillDeployedException {
    try {
      return removeJob(jobId, Job.EMPTY_TOKEN);
    } catch (TokenVerificationException e) {
      throw Throwables.propagate(e);
    }
  }

  /**
   * Deletes a job from ZooKeeper.  Ensures that job is not currently running anywhere.
   */
  @Override
  public Job removeJob(final JobId id, final String token)
      throws JobDoesNotExistException, JobStillDeployedException, TokenVerificationException {
    log.info("removing job: id={}", id);
    final ZooKeeperClient client = provider.get("removeJob");
    final Job job = getJob(client, id);
    if (job == null) {
      throw new JobDoesNotExistException(id);
    }
    verifyToken(token, job);

    // TODO (dano): handle retry failures
    try {
      final ImmutableList.Builder operations = ImmutableList.builder();
      final UUID jobCreationOperationId = getJobCreation(client, id);
      if (jobCreationOperationId != null) {
        operations.add(delete(Paths.configJobCreation(id, jobCreationOperationId)));
      }
      operations.add(delete(Paths.configJobHosts(id)),
                     delete(Paths.configJobRefShort(id)),
                     delete(Paths.configJob(id)),
                     // Touch the jobs root node so that its version is bumped on every job
                     // change down the tree. Effectively, make it that version == cVersion.
                     set(Paths.configJobs(), UUID.randomUUID().toString().getBytes()));
      client.transaction(operations.build());
    } catch (final NoNodeException e) {
      throw new JobDoesNotExistException(id);
    } catch (final NotEmptyException e) {
      throw new JobStillDeployedException(id, listJobHosts(client, id));
    } catch (final KeeperException e) {
      throw new HeliosRuntimeException("removing job " + id + " failed", e);
    }

    return job;
  }

  private UUID getJobCreation(final ZooKeeperClient client, final JobId id)
      throws KeeperException {
    final String parent = Paths.configHostJobCreationParent(id);
    final List children = client.getChildren(parent);
    for (final String child : children) {
      if (Paths.isConfigJobCreation(id, parent, child)) {
        return Paths.configJobCreationId(id, parent, child);
      }
    }
    return null;
  }

  @Override
  public void deployJob(String host, Deployment job)
      throws HostNotFoundException, JobAlreadyDeployedException, JobDoesNotExistException,
             JobPortAllocationConflictException {
    try {
      deployJob(host, job, Job.EMPTY_TOKEN);
    } catch (TokenVerificationException e) {
      throw Throwables.propagate(e);
    }
  }

  /**
   * Creates a config entry within the specified agent to un/deploy a job, or more generally, change
   * the deployment status according to the {@code Goal} value in {@link Deployment}.
   */
  @Override
  public void deployJob(final String host, final Deployment deployment, final String token)
      throws JobDoesNotExistException, JobAlreadyDeployedException, HostNotFoundException,
             JobPortAllocationConflictException, TokenVerificationException {
    final ZooKeeperClient client = provider.get("deployJob");
    deployJobRetry(client, host, deployment, 0, token);
  }

  private void deployJobRetry(final ZooKeeperClient client, final String host,
                              final Deployment deployment, int count, final String token)
      throws JobDoesNotExistException, JobAlreadyDeployedException, HostNotFoundException,
             JobPortAllocationConflictException, TokenVerificationException {
    if (count == 3) {
      throw new HeliosRuntimeException("3 failures (possibly concurrent modifications) while " +
                                       "deploying. Giving up.");
    }
    log.info("deploying {}: {} (retry={})", deployment, host, count);

    final JobId id = deployment.getJobId();
    final Job job = getJob(id);

    if (job == null) {
      throw new JobDoesNotExistException(id);
    }
    verifyToken(token, job);

    final UUID operationId = UUID.randomUUID();
    final String jobPath = Paths.configJob(id);

    try {
      Paths.configHostJob(host, id);
    } catch (IllegalArgumentException e) {
      throw new HostNotFoundException("Could not find Helios host '" + host + "'");
    }

    final String taskPath = Paths.configHostJob(host, id);
    final String taskCreationPath = Paths.configHostJobCreation(host, id, operationId);

    final List staticPorts = staticPorts(job);
    final Map portNodes = Maps.newHashMap();
    final byte[] idJson = id.toJsonBytes();
    for (final int port : staticPorts) {
      final String path = Paths.configHostPort(host, port);
      portNodes.put(path, idJson);
    }

    final Task task = new Task(job, deployment.getGoal(), deployment.getDeployerUser(),
                               deployment.getDeployerMaster(), deployment.getDeploymentGroupName());
    final List operations = Lists.newArrayList(
        check(jobPath),
        create(portNodes),
        create(Paths.configJobHost(id, host)));

    // Attempt to read a task here.
    try {
      client.getNode(taskPath);
      // if we get here the node exists already
      throw new JobAlreadyDeployedException(host, id);
    } catch (NoNodeException e) {
      operations.add(create(taskPath, task));
      operations.add(create(taskCreationPath));
    } catch (KeeperException e) {
      throw new HeliosRuntimeException("reading existing task description failed", e);
    }

    // TODO (dano): Failure handling is racy wrt agent and job modifications.
    try {
      client.transaction(operations);
      log.info("deployed {}: {} (retry={})", deployment, host, count);
    } catch (NoNodeException e) {
      // Either the job, the host or the task went away
      assertJobExists(client, id);
      assertHostExists(client, host);
      // If the job and host still exists, we likely tried to redeploy a job that had an UNDEPLOY
      // goal and lost the race with the agent removing the task before we could set it. Retry.
      deployJobRetry(client, host, deployment, count + 1, token);
    } catch (NodeExistsException e) {
      // Check for conflict due to transaction retry
      try {
        if (client.exists(taskCreationPath) != null) {
          // Our creation operation node existed, we're done here
          return;
        }
      } catch (KeeperException ex) {
        throw new HeliosRuntimeException("checking job deployment failed", ex);
      }
      try {
        // Check if the job was already deployed
        if (client.stat(taskPath) != null) {
          throw new JobAlreadyDeployedException(host, id);
        }
      } catch (KeeperException ex) {
        throw new HeliosRuntimeException("checking job deployment failed", e);
      }

      // Check for static port collisions
      for (final int port : staticPorts) {
        final String path = Paths.configHostPort(host, port);
        try {
          if (client.stat(path) == null) {
            continue;
          }
          final byte[] b = client.getData(path);
          final JobId existingJobId = parse(b, JobId.class);
          throw new JobPortAllocationConflictException(id, existingJobId, host, port);
        } catch (KeeperException | IOException ex) {
          throw new HeliosRuntimeException("checking port allocations failed", e);
        }
      }

      // Catch all for logic and ephemeral issues
      throw new HeliosRuntimeException("deploying job failed", e);
    } catch (KeeperException e) {
      throw new HeliosRuntimeException("deploying job failed", e);
    }
  }

  private void assertJobExists(final ZooKeeperClient client, final JobId id)
      throws JobDoesNotExistException {
    try {
      final String path = Paths.configJob(id);
      if (client.stat(path) == null) {
        throw new JobDoesNotExistException(id);
      }
    } catch (KeeperException e) {
      throw new HeliosRuntimeException("checking job existence failed", e);
    }
  }

  private List staticPorts(final Job job) {
    final List staticPorts = Lists.newArrayList();
    for (final PortMapping portMapping : job.getPorts().values()) {
      if (portMapping.getExternalPort() != null) {
        staticPorts.add(portMapping.getExternalPort());
      }
    }
    return staticPorts;
  }

  @Override
  public void updateDeployment(String host, Deployment deployment)
      throws HostNotFoundException, JobNotDeployedException {
    try {
      updateDeployment(host, deployment, Job.EMPTY_TOKEN);
    } catch (TokenVerificationException e) {
      Throwables.propagate(e);
    }
  }

  /**
   * Used to update the existing deployment of a job.
   */
  @Override
  public void updateDeployment(final String host, final Deployment deployment, final String token)
      throws HostNotFoundException, JobNotDeployedException, TokenVerificationException {
    log.info("updating deployment {}: {}", deployment, host);

    final ZooKeeperClient client = provider.get("updateDeployment");

    final JobId jobId = deployment.getJobId();
    final Job job = getJob(client, jobId);
    final Deployment existingDeployment = getDeployment(host, jobId);

    if (job == null) {
      throw new JobNotDeployedException(host, jobId);
    }
    verifyToken(token, job);

    assertHostExists(client, host);
    assertTaskExists(client, host, deployment.getJobId());

    final String path = Paths.configHostJob(host, jobId);
    final Task task = new Task(job, deployment.getGoal(),
                               existingDeployment.getDeployerUser(),
                               existingDeployment.getDeployerMaster(),
                               existingDeployment.getDeploymentGroupName());
    try {
      client.setData(path, task.toJsonBytes());
    } catch (Exception e) {
      throw new HeliosRuntimeException("updating deployment " + deployment +
                                       " on host " + host + " failed", e);
    }
  }

  private void assertHostExists(final ZooKeeperClient client, final String host)
  throws HostNotFoundException {
    try {
      client.getData(Paths.configHost(host));
    } catch (NoNodeException e) {
      throw new HostNotFoundException(host, e);
    } catch (KeeperException e) {
      throw new HeliosRuntimeException(e);
    }
  }

  private void assertTaskExists(final ZooKeeperClient client, final String host, final JobId jobId)
  throws JobNotDeployedException {
    try {
      client.getData(Paths.configHostJob(host, jobId));
    } catch (NoNodeException e) {
      throw new JobNotDeployedException(host, jobId);
    } catch (KeeperException e) {
      throw new HeliosRuntimeException(e);
    }
  }

  /**
   * Returns the current deployment state of {@code jobId} on {@code host}.
   */
  @Override
  public Deployment getDeployment(final String host, final JobId jobId) {
    final String path = Paths.configHostJob(host, jobId);
    final ZooKeeperClient client = provider.get("getDeployment");
    try {
      final byte[] data = client.getData(path);
      final Task task = parse(data, Task.class);
      return Deployment.of(jobId, task.getGoal(), task.getDeployerUser(), task.getDeployerMaster(),
                           task.getDeploymentGroupName());
    } catch (KeeperException.NoNodeException e) {
      return null;
    } catch (KeeperException | IOException e) {
      throw new HeliosRuntimeException("getting deployment failed", e);
    }
  }

  /**
   * Returns the current status of the host named by {@code host}.
   */
  @Override
  public HostStatus getHostStatus(final String host) {
    final Stat stat;
    final ZooKeeperClient client = provider.get("getHostStatus");

    try {
      stat = client.exists(Paths.configHostId(host));
    } catch (KeeperException e) {
      throw new HeliosRuntimeException("Failed to check host status", e);
    }

    if (stat == null) {
      return null;
    }

    final boolean up = checkHostUp(client, host);
    final HostInfo hostInfo = getHostInfo(client, host);
    final AgentInfo agentInfo = getAgentInfo(client, host);
    final Map tasks = getTasks(client, host);
    final Map statuses = getTaskStatuses(client, host);
    final Map environment = getEnvironment(client, host);
    final Map labels = getLabels(client, host);

    return HostStatus.newBuilder()
        .setJobs(tasks)
        .setStatuses(fromNullable(statuses).or(EMPTY_STATUSES))
        .setHostInfo(hostInfo)
        .setAgentInfo(agentInfo)
        .setStatus(up ? UP : DOWN)
        .setEnvironment(environment)
        .setLabels(labels)
        .build();
  }

  private  T tryGetEntity(final ZooKeeperClient client, String path, TypeReference type,
                             String name) {
    try {
      final byte[] data = client.getData(path);
      return Json.read(data, type);
    } catch (NoNodeException e) {
      return null;
    } catch (KeeperException | IOException e) {
      throw new HeliosRuntimeException("reading " + name + " info failed", e);
    }
  }

  private Map getEnvironment(final ZooKeeperClient client, final String host) {
    return tryGetEntity(client, Paths.statusHostEnvVars(host), STRING_MAP_TYPE, "environment");
  }

  private Map getLabels(final ZooKeeperClient client, final String host) {
    return tryGetEntity(client, Paths.statusHostLabels(host), STRING_MAP_TYPE, "labels");
  }

  private AgentInfo getAgentInfo(final ZooKeeperClient client, final String host) {
    return tryGetEntity(client, Paths.statusHostAgentInfo(host), AGENT_INFO_TYPE, "agent info");
  }

  private HostInfo getHostInfo(final ZooKeeperClient client, final String host) {
    return tryGetEntity(client, Paths.statusHostInfo(host), HOST_INFO_TYPE, "host info");
  }

  private boolean checkHostUp(final ZooKeeperClient client, final String host) {
    try {
      final Stat stat = client.exists(Paths.statusHostUp(host));
      return stat != null;
    } catch (KeeperException e) {
      throw new HeliosRuntimeException("getting host " + host + " up status failed", e);
    }
  }

  private Map getTaskStatuses(final ZooKeeperClient client, final String host) {
    final Map statuses = Maps.newHashMap();
    final List jobIds = listHostJobs(client, host);
    for (final JobId jobId : jobIds) {
      TaskStatus status;
      try {
        status = getTaskStatus(client, host, jobId);
      } catch (HeliosRuntimeException e) {
        // Skip this task status so we can return other available information instead of failing the
        // entire thing.
        status = null;
      }

      if (status != null) {
        statuses.put(jobId, status);
      } else {
        log.debug("Task {} status missing for host {}", jobId, host);
      }
    }

    return statuses;
  }

  private List listHostJobs(final ZooKeeperClient client, final String host) {
    final List jobIdStrings;
    final String folder = Paths.statusHostJobs(host);
    try {
      jobIdStrings = client.getChildren(folder);
    } catch (KeeperException.NoNodeException e) {
      return null;
    } catch (KeeperException e) {
      throw new HeliosRuntimeException("List tasks for host failed: " + host, e);
    }
    final ImmutableList.Builder jobIds = ImmutableList.builder();
    for (String jobIdString : jobIdStrings) {
      jobIds.add(JobId.fromString(jobIdString));
    }
    return jobIds.build();
  }

  @Nullable
  private TaskStatus getTaskStatus(final ZooKeeperClient client, final String host,
                                   final JobId jobId) {
    final String containerPath = Paths.statusHostJob(host, jobId);
    try {
      final byte[] data = client.getData(containerPath);
      return parse(data, TaskStatus.class);
    } catch (NoNodeException ignored) {
      return null;
    } catch (KeeperException | IOException e) {
      throw new HeliosRuntimeException("Getting task " + jobId + " status " +
                                       "for host " + host + " failed", e);
    }
  }

  private Map getTasks(final ZooKeeperClient client, final String host) {
    final Map jobs = Maps.newHashMap();
    try {
      final String folder = Paths.configHostJobs(host);
      final List jobIds;
      try {
        jobIds = client.getChildren(folder);
      } catch (KeeperException.NoNodeException e) {
        return null;
      }

      for (final String jobIdString : jobIds) {
        final JobId jobId = JobId.fromString(jobIdString);
        final String containerPath = Paths.configHostJob(host, jobId);
        try {
          final byte[] data = client.getData(containerPath);
          final Task task = parse(data, Task.class);
          jobs.put(jobId, Deployment.of(jobId, task.getGoal(), task.getDeployerUser(),
                                        task.getDeployerMaster(), task.getDeploymentGroupName()));
        } catch (KeeperException.NoNodeException ignored) {
          log.debug("deployment config node disappeared: {}", jobIdString);
        }
      }
    } catch (KeeperException | IOException e) {
      throw new HeliosRuntimeException("getting deployment config failed", e);
    }

    return jobs;
  }

  @Override
  public Deployment undeployJob(String host, JobId jobId)
      throws HostNotFoundException, JobNotDeployedException {
    try {
      return undeployJob(host, jobId, Job.EMPTY_TOKEN);
    } catch (TokenVerificationException e) {
      throw Throwables.propagate(e);
    }
  }

  /**
   * Undeploys the job specified by {@code jobId} on {@code host}.
   */
  @Override
  public Deployment undeployJob(final String host, final JobId jobId, final String token)
      throws HostNotFoundException, JobNotDeployedException, TokenVerificationException {
    log.info("undeploying {}: {}", jobId, host);
    final ZooKeeperClient client = provider.get("undeployJob");

    assertHostExists(client, host);

    final Deployment deployment = getDeployment(host, jobId);
    if (deployment == null) {
      throw new JobNotDeployedException(host, jobId);
    }

    final Job job = getJob(client, jobId);
    verifyToken(token, job);
    final String configHostJobPath = Paths.configHostJob(host, jobId);

    try {
      // use listRecursive to remove both job node and its child creation node
      final List nodes = newArrayList(reverse(client.listRecursive(configHostJobPath)));
      nodes.add(Paths.configJobHost(jobId, host));

      final List staticPorts = staticPorts(job);
      for (int port : staticPorts) {
          nodes.add(Paths.configHostPort(host, port));
      }

      client.transaction(delete(nodes));

    } catch (NoNodeException e) {
      // This method is racy since it's possible someone undeployed the job after we called
      // getDeployment and checked the job exists. If we now discover the job is undeployed,
      // throw an exception and handle it the same as if we discovered this earlier.
      throw new JobNotDeployedException(host, jobId);
    } catch (KeeperException e) {
      throw new HeliosRuntimeException("Removing deployment failed", e);
    }
    return deployment;
  }

  private List getUndeployOperations(final ZooKeeperClient client,
                                                        final String host, final JobId jobId,
                                                        final String token)
      throws HostNotFoundException, JobNotDeployedException, TokenVerificationException {
    assertHostExists(client, host);

    final Deployment deployment = getDeployment(host, jobId);
    if (deployment == null) {
      throw new JobNotDeployedException(host, jobId);
    }

    final Job job = getJob(client, jobId);
    verifyToken(token, job);
    final String configHostJobPath = Paths.configHostJob(host, jobId);

    try {
      // use listRecursive to remove both job node and its child creation node
      final List nodes = newArrayList(reverse(client.listRecursive(configHostJobPath)));
      nodes.add(Paths.configJobHost(jobId, host));

      final List staticPorts = staticPorts(job);
      for (int port : staticPorts) {
        nodes.add(Paths.configHostPort(host, port));
      }

      return ImmutableList.of(delete(nodes));

    } catch (NoNodeException e) {
      // This method is racy since it's possible someone undeployed the job after we called
      // getDeployment and checked the job exists. If we now discover the job is undeployed,
      // throw an exception and handle it the same as if we discovered this earlier.
      throw new JobNotDeployedException(host, jobId);
    } catch (KeeperException e) {
      throw new HeliosRuntimeException("calculating undeploy operations failed", e);
    }
  }

  private List getDeployOperations(final ZooKeeperClient client,
                                                       final String host,
                                                       final Deployment deployment,
                                                       final String token)
      throws JobDoesNotExistException, JobAlreadyDeployedException, TokenVerificationException,
             HostNotFoundException {
    assertHostExists(client, host);
    final JobId id = deployment.getJobId();
    final Job job = getJob(id);

    if (job == null) {
      throw new JobDoesNotExistException(id);
    }
    verifyToken(token, job);

    final UUID operationId = UUID.randomUUID();
    final String jobPath = Paths.configJob(id);
    final String taskPath = Paths.configHostJob(host, id);
    final String taskCreationPath = Paths.configHostJobCreation(host, id, operationId);

    final List staticPorts = staticPorts(job);
    final Map portNodes = Maps.newHashMap();
    final byte[] idJson = id.toJsonBytes();
    for (final int port : staticPorts) {
      final String path = Paths.configHostPort(host, port);
      portNodes.put(path, idJson);
    }

    final Task task = new Task(job, deployment.getGoal(), deployment.getDeployerUser(),
                               deployment.getDeployerMaster(), deployment.getDeploymentGroupName());
    final List operations = Lists.newArrayList(
        check(jobPath),
        create(portNodes),
        create(Paths.configJobHost(id, host)));

    // Attempt to read a task here.
    try {
      client.getNode(taskPath);
      // if we get here the node exists already
      throw new JobAlreadyDeployedException(host, id);
    } catch (NoNodeException e) {
      operations.add(create(taskPath, task));
      operations.add(create(taskCreationPath));
    } catch (KeeperException e) {
      throw new HeliosRuntimeException("reading existing task description failed", e);
    }

    return ImmutableList.copyOf(operations);
  }

  private static void verifyToken(final String token, final Job job)
      throws TokenVerificationException {
    checkNotNull(token, "token");
    if (!token.equals(job.getToken())) {
      throw new TokenVerificationException(job.getId());
    }
  }

  private static class RollingUpdateTaskResult {
    private final List operations;
    private final Exception error;
    private final String host;

    public static final RollingUpdateTaskResult TASK_IN_PROGRESS = of(null);

    public static final RollingUpdateTaskResult TASK_COMPLETE = of(
        Collections.emptyList());

    @Override
    public boolean equals(Object o) {
      if (this == o) {
        return true;
      }
      if (o == null || getClass() != o.getClass()) {
        return false;
      }

      RollingUpdateTaskResult that = (RollingUpdateTaskResult) o;

      if (operations != null ? !operations.equals(that.operations) : that.operations != null) {
        return false;
      }
      if (error != null ? !error.equals(that.error) : that.error != null) {
        return false;
      }
      return !(host != null ? !host.equals(that.host) : that.host != null);

    }

    @Override
    public int hashCode() {
      int result = operations != null ? operations.hashCode() : 0;
      result = 31 * result + (error != null ? error.hashCode() : 0);
      result = 31 * result + (host != null ? host.hashCode() : 0);
      return result;
    }

    private RollingUpdateTaskResult(final List operations,
                                    final Exception error,
                                    final String host) {
      this.operations = operations;
      this.error = error;
      this.host = host;
    }

    public static RollingUpdateTaskResult of(final List operations) {
      return new RollingUpdateTaskResult(operations, null, null);
    }

    public static RollingUpdateTaskResult error(final Exception error) {
      return RollingUpdateTaskResult.error(error, null);
    }

    public static RollingUpdateTaskResult error(final Exception error, final String host) {
      return new RollingUpdateTaskResult(null, error, host);
    }

    public static RollingUpdateTaskResult error(final String error) {
      return RollingUpdateTaskResult.error(error, null);
    }

    public static RollingUpdateTaskResult error(final String error, final String host) {
      return new RollingUpdateTaskResult(null, new HeliosRuntimeException(error), host);
    }
  }

  private static class RolloutOpsEvents {
    private final List operations = Lists.newArrayList();
    private final List events = Lists.newArrayList();

    private void addOperation(final ZooKeeperOperation op) {
      operations.add(op);
    }

    private void addEvent(final DeploymentGroupEvent event) {
      events.add(event);
    }

    private void addAll(final RolloutOpsEvents other) {
      operations.addAll(other.getOperations());
      events.addAll(other.getEvents());
    }

    public List getOperations() {
      return ImmutableList.copyOf(operations);
    }

    public List getEvents() {
      return ImmutableList.copyOf(events);
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy