All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdaterImpl Maven / Gradle / Ivy

The newest version!
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hadoop.yarn.server.nodemanager;

import java.io.IOException;
import java.net.ConnectException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.Random;
import java.util.Set;
import java.util.concurrent.ConcurrentLinkedQueue;

import org.apache.hadoop.classification.InterfaceAudience.Private;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.DataInputByteBuffer;
import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.service.AbstractService;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.VersionUtil;
import org.apache.hadoop.yarn.api.protocolrecords.SignalContainerRequest;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerState;
import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.api.records.NodeId;
import org.apache.hadoop.yarn.api.records.NodeLabel;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.api.records.ResourceUtilization;
import org.apache.hadoop.yarn.api.records.NodeAttribute;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.event.Dispatcher;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
import org.apache.hadoop.yarn.factories.RecordFactory;
import org.apache.hadoop.yarn.factories.impl.pb.RecordFactoryPBImpl;
import org.apache.hadoop.yarn.nodelabels.CommonNodeLabelsManager;
import org.apache.hadoop.yarn.nodelabels.NodeLabelUtil;
import org.apache.hadoop.yarn.server.api.ResourceManagerConstants;
import org.apache.hadoop.yarn.server.api.ResourceTracker;
import org.apache.hadoop.yarn.server.api.ServerRMProxy;
import org.apache.hadoop.yarn.server.api.protocolrecords.LogAggregationReport;
import org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus;
import org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatRequest;
import org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse;
import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerRequest;
import org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerResponse;
import org.apache.hadoop.yarn.server.api.protocolrecords.UnRegisterNodeManagerRequest;
import org.apache.hadoop.yarn.server.api.records.AppCollectorData;
import org.apache.hadoop.yarn.server.api.records.ContainerQueuingLimit;
import org.apache.hadoop.yarn.server.api.records.MasterKey;
import org.apache.hadoop.yarn.server.api.records.NodeAction;
import org.apache.hadoop.yarn.server.api.records.NodeHealthStatus;
import org.apache.hadoop.yarn.server.api.records.NodeStatus;
import org.apache.hadoop.yarn.server.api.records.OpportunisticContainersStatus;
import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.ApplicationState;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitor;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePlugin;
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager;
import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
import org.apache.hadoop.yarn.server.nodemanager.nodelabels.NodeAttributesProvider;
import org.apache.hadoop.yarn.server.nodemanager.nodelabels.NodeLabelsProvider;
import org.apache.hadoop.yarn.server.nodemanager.timelineservice.NMTimelinePublisher;
import org.apache.hadoop.yarn.server.nodemanager.util.NodeManagerHardwareUtils;
import org.apache.hadoop.yarn.util.ResourceCalculatorPlugin;
import org.apache.hadoop.yarn.util.YarnVersionInfo;
import org.apache.hadoop.yarn.util.resource.Resources;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.annotations.VisibleForTesting;

public class NodeStatusUpdaterImpl extends AbstractService implements
    NodeStatusUpdater {

  public static final String YARN_NODEMANAGER_DURATION_TO_TRACK_STOPPED_CONTAINERS =
      YarnConfiguration.NM_PREFIX + "duration-to-track-stopped-containers";

  private static final Logger LOG =
       LoggerFactory.getLogger(NodeStatusUpdaterImpl.class);

  private final Object heartbeatMonitor = new Object();
  private final Object shutdownMonitor = new Object();

  private final Context context;
  private final Dispatcher dispatcher;

  private NodeId nodeId;
  private long nextHeartBeatInterval;
  private ResourceTracker resourceTracker;
  private Resource totalResource;
  private Resource physicalResource;
  private int httpPort;
  private String nodeManagerVersionId;
  private String minimumResourceManagerVersion;
  private volatile boolean isStopped;
  private boolean tokenKeepAliveEnabled;
  private long tokenRemovalDelayMs;
  /** Keeps track of when the next keep alive request should be sent for an app*/
  private Map appTokenKeepAliveMap =
      new HashMap();
  private Random keepAliveDelayRandom = new Random();
  // It will be used to track recently stopped containers on node manager, this
  // is to avoid the misleading no-such-container exception messages on NM, when
  // the AM finishes it informs the RM to stop the may-be-already-completed
  // containers.
  private final Map recentlyStoppedContainers;
  // Save the reported completed containers in case of lost heartbeat responses.
  // These completed containers will be sent again till a successful response.
  private final Map pendingCompletedContainers;
  // Duration for which to track recently stopped container.
  private long durationToTrackStoppedContainers;

  private boolean logAggregationEnabled;

  private final List logAggregationReportForAppsTempList;

  private final NodeHealthCheckerService healthChecker;
  private final NodeManagerMetrics metrics;

  private Runnable statusUpdaterRunnable;
  private Thread  statusUpdater;
  private boolean failedToConnect = false;
  private long rmIdentifier = ResourceManagerConstants.RM_INVALID_IDENTIFIER;
  private boolean registeredWithRM = false;
  Set pendingContainersToRemove = new HashSet();

  private NMNodeLabelsHandler nodeLabelsHandler;
  private NMNodeAttributesHandler nodeAttributesHandler;
  private NodeLabelsProvider nodeLabelsProvider;
  private NodeAttributesProvider nodeAttributesProvider;
  private boolean timelineServiceV2Enabled;

  public NodeStatusUpdaterImpl(Context context, Dispatcher dispatcher,
      NodeHealthCheckerService healthChecker, NodeManagerMetrics metrics) {
    super(NodeStatusUpdaterImpl.class.getName());
    this.healthChecker = healthChecker;
    this.context = context;
    this.dispatcher = dispatcher;
    this.metrics = metrics;
    this.recentlyStoppedContainers = new LinkedHashMap();
    this.pendingCompletedContainers =
        new HashMap();
    this.logAggregationReportForAppsTempList =
        new ArrayList();
  }

  @Override
  public void setNodeAttributesProvider(NodeAttributesProvider provider) {
    this.nodeAttributesProvider = provider;
  }

  @Override
  public void setNodeLabelsProvider(NodeLabelsProvider provider) {
    this.nodeLabelsProvider = provider;
  }

  @Override
  protected void serviceInit(Configuration conf) throws Exception {
    this.totalResource = NodeManagerHardwareUtils.getNodeResources(conf);
    long memoryMb = totalResource.getMemorySize();
    float vMemToPMem =
        conf.getFloat(
            YarnConfiguration.NM_VMEM_PMEM_RATIO,
            YarnConfiguration.DEFAULT_NM_VMEM_PMEM_RATIO);
    long virtualMemoryMb = (long)Math.ceil(memoryMb * vMemToPMem);
    int virtualCores = totalResource.getVirtualCores();

    // Update configured resources via plugins.
    updateConfiguredResourcesViaPlugins(totalResource);

    LOG.info("Nodemanager resources is set to: " + totalResource);

    metrics.addResource(totalResource);

    // Get actual node physical resources
    long physicalMemoryMb = memoryMb;
    int physicalCores = virtualCores;
    ResourceCalculatorPlugin rcp =
        ResourceCalculatorPlugin.getNodeResourceMonitorPlugin(conf);
    if (rcp != null) {
      physicalMemoryMb = rcp.getPhysicalMemorySize() / (1024 * 1024);
      physicalCores = rcp.getNumProcessors();
    }
    this.physicalResource =
        Resource.newInstance(physicalMemoryMb, physicalCores);

    this.tokenKeepAliveEnabled = isTokenKeepAliveEnabled(conf);
    this.tokenRemovalDelayMs =
        conf.getInt(YarnConfiguration.RM_NM_EXPIRY_INTERVAL_MS,
            YarnConfiguration.DEFAULT_RM_NM_EXPIRY_INTERVAL_MS);

    this.minimumResourceManagerVersion = conf.get(
        YarnConfiguration.NM_RESOURCEMANAGER_MINIMUM_VERSION,
        YarnConfiguration.DEFAULT_NM_RESOURCEMANAGER_MINIMUM_VERSION);

    nodeLabelsHandler =
        createNMNodeLabelsHandler(nodeLabelsProvider);
    nodeAttributesHandler =
        createNMNodeAttributesHandler(nodeAttributesProvider);

    // Default duration to track stopped containers on nodemanager is 10Min.
    // This should not be assigned very large value as it will remember all the
    // containers stopped during that time.
    durationToTrackStoppedContainers =
        conf.getLong(YARN_NODEMANAGER_DURATION_TO_TRACK_STOPPED_CONTAINERS,
          600000);
    if (durationToTrackStoppedContainers < 0) {
      String message = "Invalid configuration for "
        + YARN_NODEMANAGER_DURATION_TO_TRACK_STOPPED_CONTAINERS + " default "
          + "value is 10Min(600000).";
      LOG.error(message);
      throw new YarnException(message);
    }
    if (LOG.isDebugEnabled()) {
      LOG.debug(YARN_NODEMANAGER_DURATION_TO_TRACK_STOPPED_CONTAINERS + " :"
        + durationToTrackStoppedContainers);
    }
    super.serviceInit(conf);
    LOG.info("Initialized nodemanager with :" +
        " physical-memory=" + memoryMb + " virtual-memory=" + virtualMemoryMb +
        " virtual-cores=" + virtualCores);

    this.logAggregationEnabled =
        conf.getBoolean(YarnConfiguration.LOG_AGGREGATION_ENABLED,
          YarnConfiguration.DEFAULT_LOG_AGGREGATION_ENABLED);
    this.timelineServiceV2Enabled = YarnConfiguration.
        timelineServiceV2Enabled(conf);
  }

  @Override
  protected void serviceStart() throws Exception {

    // NodeManager is the last service to start, so NodeId is available.
    this.nodeId = this.context.getNodeId();
    LOG.info("Node ID assigned is : " + this.nodeId);
    this.httpPort = this.context.getHttpPort();
    this.nodeManagerVersionId = YarnVersionInfo.getVersion();
    try {
      // Registration has to be in start so that ContainerManager can get the
      // perNM tokens needed to authenticate ContainerTokens.
      this.resourceTracker = getRMClient();
      registerWithRM();
      super.serviceStart();
      startStatusUpdater();
    } catch (Exception e) {
      String errorMessage = "Unexpected error starting NodeStatusUpdater";
      LOG.error(errorMessage, e);
      throw new YarnRuntimeException(e);
    }
  }

  @Override
  protected void serviceStop() throws Exception {
    // the isStopped check is for avoiding multiple unregistrations.
    synchronized(shutdownMonitor) {
      if (this.registeredWithRM && !this.isStopped
          && !isNMUnderSupervisionWithRecoveryEnabled()
          && !context.getDecommissioned() && !failedToConnect) {
        unRegisterNM();
      }
      // Interrupt the updater.
      this.isStopped = true;
      stopRMProxy();
      super.serviceStop();
    }
  }

  private boolean isNMUnderSupervisionWithRecoveryEnabled() {
    Configuration config = getConfig();
    return config.getBoolean(YarnConfiguration.NM_RECOVERY_ENABLED,
        YarnConfiguration.DEFAULT_NM_RECOVERY_ENABLED)
        && config.getBoolean(YarnConfiguration.NM_RECOVERY_SUPERVISED,
            YarnConfiguration.DEFAULT_NM_RECOVERY_SUPERVISED);
  }

  private void unRegisterNM() {
    RecordFactory recordFactory = RecordFactoryPBImpl.get();
    UnRegisterNodeManagerRequest request = recordFactory
        .newRecordInstance(UnRegisterNodeManagerRequest.class);
    request.setNodeId(this.nodeId);
    try {
      resourceTracker.unRegisterNodeManager(request);
      LOG.info("Successfully Unregistered the Node " + this.nodeId
          + " with ResourceManager.");
    } catch (Exception e) {
      LOG.warn("Unregistration of the Node " + this.nodeId + " failed.", e);
    }
  }

  protected void rebootNodeStatusUpdaterAndRegisterWithRM() {
    // Interrupt the updater.
    synchronized(shutdownMonitor) {
      if(this.isStopped) {
        LOG.info("Currently being shutdown. Aborting reboot");
        return;
      }
      this.isStopped = true;
      sendOutofBandHeartBeat();
      try {
        statusUpdater.join();
        registerWithRM();
        statusUpdater = new Thread(statusUpdaterRunnable, "Node Status Updater");
        this.isStopped = false;
        statusUpdater.start();
        LOG.info("NodeStatusUpdater thread is reRegistered and restarted");
      } catch (Exception e) {
        String errorMessage = "Unexpected error rebooting NodeStatusUpdater";
        LOG.error(errorMessage, e);
        throw new YarnRuntimeException(e);
      }
    }
  }

  @VisibleForTesting
  protected void stopRMProxy() {
    if(this.resourceTracker != null) {
      RPC.stopProxy(this.resourceTracker);
    }
  }

  @Private
  protected boolean isTokenKeepAliveEnabled(Configuration conf) {
    return conf.getBoolean(YarnConfiguration.LOG_AGGREGATION_ENABLED,
        YarnConfiguration.DEFAULT_LOG_AGGREGATION_ENABLED)
        && UserGroupInformation.isSecurityEnabled();
  }

  @VisibleForTesting
  protected ResourceTracker getRMClient() throws IOException {
    Configuration conf = getConfig();
    return ServerRMProxy.createRMProxy(conf, ResourceTracker.class);
  }

  private void updateConfiguredResourcesViaPlugins(
      Resource configuredResource) throws YarnException {
    ResourcePluginManager pluginManager = context.getResourcePluginManager();
    if (pluginManager != null && pluginManager.getNameToPlugins() != null) {
      // Update configured resource
      for (ResourcePlugin resourcePlugin : pluginManager.getNameToPlugins()
          .values()) {
        if (resourcePlugin.getNodeResourceHandlerInstance() != null) {
          resourcePlugin.getNodeResourceHandlerInstance()
              .updateConfiguredResource(configuredResource);
        }
      }
    }
  }

  @VisibleForTesting
  protected void registerWithRM()
      throws YarnException, IOException {
    RegisterNodeManagerResponse regNMResponse;
    Set nodeLabels = nodeLabelsHandler.getNodeLabelsForRegistration();
    Set nodeAttributes =
        nodeAttributesHandler.getNodeAttributesForRegistration();

    // Synchronize NM-RM registration with
    // ContainerManagerImpl#increaseContainersResource and
    // ContainerManagerImpl#startContainers to avoid race condition
    // during RM recovery
    synchronized (this.context) {
      List containerReports = getNMContainerStatuses();
      RegisterNodeManagerRequest request =
          RegisterNodeManagerRequest.newInstance(nodeId, httpPort, totalResource,
              nodeManagerVersionId, containerReports, getRunningApplications(),
              nodeLabels, physicalResource, nodeAttributes);

      if (containerReports != null) {
        LOG.info("Registering with RM using containers :" + containerReports);
      }
      if (logAggregationEnabled) {
        // pull log aggregation status for application running in this NM
        List logAggregationReports =
            context.getNMLogAggregationStatusTracker()
                .pullCachedLogAggregationReports();
        if (LOG.isDebugEnabled()) {
          LOG.debug("The cache log aggregation status size:"
              + logAggregationReports.size());
        }
        if (logAggregationReports != null
            && !logAggregationReports.isEmpty()) {
          request.setLogAggregationReportsForApps(logAggregationReports);
        }
      }
      regNMResponse =
          resourceTracker.registerNodeManager(request);
      // Make sure rmIdentifier is set before we release the lock
      this.rmIdentifier = regNMResponse.getRMIdentifier();
    }

    // if the Resource Manager instructs NM to shutdown.
    if (NodeAction.SHUTDOWN.equals(regNMResponse.getNodeAction())) {
      String message =
          "Message from ResourceManager: "
              + regNMResponse.getDiagnosticsMessage();
      throw new YarnRuntimeException(
        "Received SHUTDOWN signal from Resourcemanager, Registration of NodeManager failed, "
            + message);
    }

    // if ResourceManager version is too old then shutdown
    if (!minimumResourceManagerVersion.equals("NONE")){
      if (minimumResourceManagerVersion.equals("EqualToNM")){
        minimumResourceManagerVersion = nodeManagerVersionId;
      }
      String rmVersion = regNMResponse.getRMVersion();
      if (rmVersion == null) {
        String message = "The Resource Manager's did not return a version. "
            + "Valid version cannot be checked.";
        throw new YarnRuntimeException("Shutting down the Node Manager. "
            + message);
      }
      if (VersionUtil.compareVersions(rmVersion,minimumResourceManagerVersion) < 0) {
        String message = "The Resource Manager's version ("
            + rmVersion +") is less than the minimum "
            + "allowed version " + minimumResourceManagerVersion;
        throw new YarnRuntimeException("Shutting down the Node Manager on RM "
            + "version error, " + message);
      }
    }
    this.registeredWithRM = true;
    MasterKey masterKey = regNMResponse.getContainerTokenMasterKey();
    // do this now so that its set before we start heartbeating to RM
    // It is expected that status updater is started by this point and
    // RM gives the shared secret in registration during
    // StatusUpdater#start().
    if (masterKey != null) {
      this.context.getContainerTokenSecretManager().setMasterKey(masterKey);
    }

    masterKey = regNMResponse.getNMTokenMasterKey();
    if (masterKey != null) {
      this.context.getNMTokenSecretManager().setMasterKey(masterKey);
    }

    StringBuilder successfullRegistrationMsg = new StringBuilder();
    successfullRegistrationMsg.append("Registered with ResourceManager as ")
        .append(this.nodeId);

    Resource newResource = regNMResponse.getResource();
    if (newResource != null) {
      updateNMResource(newResource);
      successfullRegistrationMsg.append(" with updated total resource of ")
          .append(this.totalResource);
    } else {
      successfullRegistrationMsg.append(" with total resource of ")
          .append(this.totalResource);
    }

    successfullRegistrationMsg.append(nodeLabelsHandler
        .verifyRMRegistrationResponseForNodeLabels(regNMResponse));
    successfullRegistrationMsg.append(nodeAttributesHandler
        .verifyRMRegistrationResponseForNodeAttributes(regNMResponse));

    LOG.info(successfullRegistrationMsg.toString());
  }

  private List createKeepAliveApplicationList() {
    if (!tokenKeepAliveEnabled) {
      return Collections.emptyList();
    }

    List appList = new ArrayList();
    for (Iterator> i =
        this.appTokenKeepAliveMap.entrySet().iterator(); i.hasNext();) {
      Entry e = i.next();
      ApplicationId appId = e.getKey();
      Long nextKeepAlive = e.getValue();
      if (!this.context.getApplications().containsKey(appId)) {
        // Remove if the application has finished.
        i.remove();
      } else if (System.currentTimeMillis() > nextKeepAlive) {
        // KeepAlive list for the next hearbeat.
        appList.add(appId);
        trackAppForKeepAlive(appId);
      }
    }
    return appList;
  }

  @VisibleForTesting
  protected NodeStatus getNodeStatus(int responseId) throws IOException {

    NodeHealthStatus nodeHealthStatus = this.context.getNodeHealthStatus();
    nodeHealthStatus.setHealthReport(healthChecker.getHealthReport());
    nodeHealthStatus.setIsNodeHealthy(healthChecker.isHealthy());
    nodeHealthStatus.setLastHealthReportTime(healthChecker
      .getLastHealthReportTime());
    if (LOG.isDebugEnabled()) {
      LOG.debug("Node's health-status : " + nodeHealthStatus.getIsNodeHealthy()
          + ", " + nodeHealthStatus.getHealthReport());
    }
    List containersStatuses = getContainerStatuses();
    ResourceUtilization containersUtilization = getContainersUtilization();
    ResourceUtilization nodeUtilization = getNodeUtilization();
    List increasedContainers
        = getIncreasedContainers();
    NodeStatus nodeStatus =
        NodeStatus.newInstance(nodeId, responseId, containersStatuses,
          createKeepAliveApplicationList(), nodeHealthStatus,
          containersUtilization, nodeUtilization, increasedContainers);

    nodeStatus.setOpportunisticContainersStatus(
        getOpportunisticContainersStatus());
    return nodeStatus;
  }

  /**
   * Get the status of the OPPORTUNISTIC containers.
   * @return the status of the OPPORTUNISTIC containers.
   */
  private OpportunisticContainersStatus getOpportunisticContainersStatus() {
    OpportunisticContainersStatus status =
        this.context.getContainerManager().getOpportunisticContainersStatus();
    return status;
  }

  /**
   * Get the aggregated utilization of the containers in this node.
   * @return Resource utilization of all the containers.
   */
  private ResourceUtilization getContainersUtilization() {
    ContainersMonitor containersMonitor =
        this.context.getContainerManager().getContainersMonitor();
    return containersMonitor.getContainersUtilization();
  }

  /**
   * Get the utilization of the node. This includes the containers.
   * @return Resource utilization of the node.
   */
  private ResourceUtilization getNodeUtilization() {
    NodeResourceMonitorImpl nodeResourceMonitor =
        (NodeResourceMonitorImpl) this.context.getNodeResourceMonitor();
    return nodeResourceMonitor.getUtilization();
  }

  /* Get the containers whose resource has been increased since last
   * NM-RM heartbeat.
   */
  private List
      getIncreasedContainers() {
    List
        increasedContainers = new ArrayList<>(
            this.context.getIncreasedContainers().values());
    for (org.apache.hadoop.yarn.api.records.Container
        container : increasedContainers) {
      this.context.getIncreasedContainers().remove(container.getId());
    }
    return increasedContainers;
  }

  // Update NM's Resource.
  private void updateNMResource(Resource resource) {
    metrics.addResource(Resources.subtract(resource, totalResource));
    this.totalResource = resource;
  }

  // Iterate through the NMContext and clone and get all the containers'
  // statuses. If it's a completed container, add into the
  // recentlyStoppedContainers collections.
  @VisibleForTesting
  protected List getContainerStatuses() throws IOException {
    List containerStatuses = new ArrayList();
    for (Container container : this.context.getContainers().values()) {
      ContainerId containerId = container.getContainerId();
      ApplicationId applicationId = containerId.getApplicationAttemptId()
          .getApplicationId();
      org.apache.hadoop.yarn.api.records.ContainerStatus containerStatus =
          container.cloneAndGetContainerStatus();
      if (containerStatus.getState() == ContainerState.COMPLETE) {
        if (isApplicationStopped(applicationId)) {
          if (LOG.isDebugEnabled()) {
            LOG.debug(applicationId + " is completing, " + " remove "
                + containerId + " from NM context.");
          }
          context.getContainers().remove(containerId);
          pendingCompletedContainers.put(containerId, containerStatus);
        } else {
          if (!isContainerRecentlyStopped(containerId)) {
            pendingCompletedContainers.put(containerId, containerStatus);
          }
        }
        // Adding to finished containers cache. Cache will keep it around at
        // least for #durationToTrackStoppedContainers duration. In the
        // subsequent call to stop container it will get removed from cache.
        addCompletedContainer(containerId);
      } else {
        containerStatuses.add(containerStatus);
      }
    }

    containerStatuses.addAll(pendingCompletedContainers.values());

    if (LOG.isDebugEnabled()) {
      LOG.debug("Sending out " + containerStatuses.size()
          + " container statuses: " + containerStatuses);
    }
    return containerStatuses;
  }

  private List getRunningApplications() {
    List runningApplications = new ArrayList();
    for (Entry appEntry : this.context
        .getApplications().entrySet()) {
      if (ApplicationState.FINISHED != appEntry.getValue()
          .getApplicationState()) {
        runningApplications.add(appEntry.getKey());
      }
    }
    return runningApplications;
  }

  // These NMContainerStatus are sent on NM registration and used by YARN only.
  private List getNMContainerStatuses() throws IOException {
    List containerStatuses =
        new ArrayList();
    for (Container container : this.context.getContainers().values()) {
      ContainerId containerId = container.getContainerId();
      ApplicationId applicationId = containerId.getApplicationAttemptId()
          .getApplicationId();
      if (!this.context.getApplications().containsKey(applicationId)) {
        context.getContainers().remove(containerId);
        continue;
      }
      NMContainerStatus status =
          container.getNMContainerStatus();
      containerStatuses.add(status);
      if (status.getContainerState() == ContainerState.COMPLETE) {
        // Adding to finished containers cache. Cache will keep it around at
        // least for #durationToTrackStoppedContainers duration. In the
        // subsequent call to stop container it will get removed from cache.
        addCompletedContainer(containerId);
      }
    }
    LOG.info("Sending out " + containerStatuses.size()
      + " NM container statuses: " + containerStatuses);
    return containerStatuses;
  }

  private boolean isApplicationStopped(ApplicationId applicationId) {
    if (!this.context.getApplications().containsKey(applicationId)) {
      return true;
    }

    ApplicationState applicationState = this.context.getApplications().get(
        applicationId).getApplicationState();
    if (applicationState == ApplicationState.FINISHING_CONTAINERS_WAIT
        || applicationState == ApplicationState.APPLICATION_RESOURCES_CLEANINGUP
        || applicationState == ApplicationState.FINISHED) {
      return true;
    } else {
      return false;
    }
  }

  @Override
  public void addCompletedContainer(ContainerId containerId) {
    synchronized (recentlyStoppedContainers) {
      removeVeryOldStoppedContainersFromCache();
      if (!recentlyStoppedContainers.containsKey(containerId)) {
        recentlyStoppedContainers.put(containerId,
            System.currentTimeMillis() + durationToTrackStoppedContainers);
      }
    }
  }

  @VisibleForTesting
  @Private
  public void removeOrTrackCompletedContainersFromContext(
      List containerIds) throws IOException {
    Set removedContainers = new HashSet();

    pendingContainersToRemove.addAll(containerIds);
    Iterator iter = pendingContainersToRemove.iterator();
    while (iter.hasNext()) {
      ContainerId containerId = iter.next();
      // remove the container only if the container is at DONE state
      Container nmContainer = context.getContainers().get(containerId);
      if (nmContainer == null) {
        iter.remove();
      } else if (nmContainer.getContainerState().equals(
        org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerState.DONE)) {
        context.getContainers().remove(containerId);
        removedContainers.add(containerId);
        iter.remove();
      }
    }

    if (!removedContainers.isEmpty()) {
      LOG.info("Removed completed containers from NM context: "
          + removedContainers);
    }
    pendingCompletedContainers.clear();
  }

  private void trackAppsForKeepAlive(List appIds) {
    if (tokenKeepAliveEnabled && appIds != null && appIds.size() > 0) {
      for (ApplicationId appId : appIds) {
        trackAppForKeepAlive(appId);
      }
    }
  }

  private void trackAppForKeepAlive(ApplicationId appId) {
    // Next keepAlive request for app between 0.7 & 0.9 of when the token will
    // likely expire.
    long nextTime = System.currentTimeMillis()
    + (long) (0.7 * tokenRemovalDelayMs + (0.2 * tokenRemovalDelayMs
        * keepAliveDelayRandom.nextInt(100))/100);
    appTokenKeepAliveMap.put(appId, nextTime);
  }

  @Override
  public void sendOutofBandHeartBeat() {
    synchronized (this.heartbeatMonitor) {
      this.heartbeatMonitor.notify();
    }
  }

  @VisibleForTesting
  Thread.State getStatusUpdaterThreadState() {
    return statusUpdater.getState();
  }

  public boolean isContainerRecentlyStopped(ContainerId containerId) {
    synchronized (recentlyStoppedContainers) {
      return recentlyStoppedContainers.containsKey(containerId);
    }
  }

  @Override
  public void clearFinishedContainersFromCache() {
    synchronized (recentlyStoppedContainers) {
      recentlyStoppedContainers.clear();
    }
  }

  @Private
  @VisibleForTesting
  public void removeVeryOldStoppedContainersFromCache() {
    synchronized (recentlyStoppedContainers) {
      long currentTime = System.currentTimeMillis();
      Iterator> i =
          recentlyStoppedContainers.entrySet().iterator();
      while (i.hasNext()) {
        Entry mapEntry = i.next();
        ContainerId cid = mapEntry.getKey();
        if (mapEntry.getValue() < currentTime) {
          if (!context.getContainers().containsKey(cid)) {
            i.remove();
            try {
              context.getNMStateStore().removeContainer(cid);
            } catch (IOException e) {
              LOG.error("Unable to remove container " + cid + " in store", e);
            }
          }
        } else {
          break;
        }
      }
    }
  }

  @Override
  public long getRMIdentifier() {
    return this.rmIdentifier;
  }

  private static Map parseCredentials(
      Map systemCredentials) throws IOException {
    Map map =
        new HashMap();
    for (Map.Entry entry : systemCredentials.entrySet()) {
      Credentials credentials = new Credentials();
      DataInputByteBuffer buf = new DataInputByteBuffer();
      ByteBuffer buffer = entry.getValue();
      buffer.rewind();
      buf.reset(buffer);
      credentials.readTokenStorageStream(buf);
      map.put(entry.getKey(), credentials);
    }
    if (LOG.isDebugEnabled()) {
      for (Map.Entry entry : map.entrySet()) {
        LOG.debug("Retrieved credentials form RM for " + entry.getKey() + ": "
            + entry.getValue().getAllTokens());
      }
    }
    return map;
  }

  protected void startStatusUpdater() {
    statusUpdaterRunnable = new StatusUpdaterRunnable();
    statusUpdater =
        new Thread(statusUpdaterRunnable, "Node Status Updater");
    statusUpdater.start();
  }

  private boolean handleShutdownOrResyncCommand(
      NodeHeartbeatResponse response) {
    if (response.getNodeAction() == NodeAction.SHUTDOWN) {
      LOG.warn("Received SHUTDOWN signal from Resourcemanager as part of"
          + " heartbeat, hence shutting down.");
      LOG.warn("Message from ResourceManager: "
          + response.getDiagnosticsMessage());
      context.setDecommissioned(true);
      dispatcher.getEventHandler().handle(
          new NodeManagerEvent(NodeManagerEventType.SHUTDOWN));
      return true;
    }
    if (response.getNodeAction() == NodeAction.RESYNC) {
      LOG.warn("Node is out of sync with ResourceManager,"
          + " hence resyncing.");
      LOG.warn("Message from ResourceManager: "
          + response.getDiagnosticsMessage());
      // Invalidate the RMIdentifier while resync
      NodeStatusUpdaterImpl.this.rmIdentifier =
          ResourceManagerConstants.RM_INVALID_IDENTIFIER;
      dispatcher.getEventHandler().handle(
          new NodeManagerEvent(NodeManagerEventType.RESYNC));
      pendingCompletedContainers.clear();
      return true;
    }
    return false;
  }

  @Override
  public void reportException(Exception ex) {
    healthChecker.reportException(ex);
    sendOutofBandHeartBeat();
  }

  private List getLogAggregationReportsForApps(
      ConcurrentLinkedQueue lastestLogAggregationStatus) {
    LogAggregationReport status;
    while ((status = lastestLogAggregationStatus.poll()) != null) {
      this.logAggregationReportForAppsTempList.add(status);
    }
    List reports = new ArrayList();
    reports.addAll(logAggregationReportForAppsTempList);
    return reports;
  }

  private NMNodeLabelsHandler createNMNodeLabelsHandler(
      NodeLabelsProvider nodeLabelsProvider) {
    if (nodeLabelsProvider == null) {
      return new NMCentralizedNodeLabelsHandler();
    } else {
      return new NMDistributedNodeLabelsHandler(nodeLabelsProvider,
          this.getConfig());
    }
  }

  /**
   * Returns a handler based on the configured node attributes provider.
   * returns null if no provider is configured.
   * @param provider
   * @return attributes handler
   */
  private NMNodeAttributesHandler createNMNodeAttributesHandler(
      NodeAttributesProvider provider) {
    if (provider == null) {
      return new NMCentralizedNodeAttributesHandler();
    } else {
      return new NMDistributedNodeAttributesHandler(provider, this.getConfig());
    }
  }

  private static abstract class CachedNodeDescriptorHandler {
    private final long resyncInterval;
    private final T defaultValue;
    private T previousValue;
    private long lastSendMills = 0L;
    private boolean isValueSented;

    CachedNodeDescriptorHandler(T defaultValue,
        long resyncInterval) {
      this.defaultValue = defaultValue;
      this.resyncInterval = resyncInterval;
    }

    public abstract T getValueFromProvider();

    public T getValueForRegistration() {
      T value = getValueFromProvider();
      if (defaultValue != null) {
        value = (null == value) ? defaultValue : value;
      }
      previousValue = value;
      try {
        validate(value);
      } catch (IOException e) {
        value = null;
      }
      return value;
    }

    public T getValueForHeartbeat() {
      T value = getValueFromProvider();
      // if the provider returns null then consider default value are set
      if (defaultValue != null) {
        value = (null == value) ? defaultValue : value;
      }
      // take some action only on modification of value
      boolean isValueUpdated = isValueUpdated(value);

      isValueSented = false;
      // When value updated or resync time is elapsed will send again in
      // heartbeat.
      if (isValueUpdated || isResyncIntervalElapsed()) {
        previousValue = value;
        try {
          validate(value);
          isValueSented = true;
        } catch (IOException e) {
          // take previous value to replace invalid value, so that invalid
          // value are not verified for every HB, and send empty set
          // to RM to have same value which was earlier set.
          value = null;
        } finally {
          // Set last send time in heartbeat
          lastSendMills = System.currentTimeMillis();
        }
      } else {
        // if value have not changed then no need to send
        value = null;
      }
      return value;
    }

    /**
     * This method checks resync interval is elapsed or not.
     */
    public boolean isResyncIntervalElapsed() {
      long elapsedTimeSinceLastSync =
          System.currentTimeMillis() - lastSendMills;
      if (elapsedTimeSinceLastSync > resyncInterval) {
        return true;
      }
      return false;
    }

    protected abstract void validate(T value) throws IOException;

    protected abstract boolean isValueUpdated(T value);

    public long getResyncInterval() {
      return resyncInterval;
    }

    public T getDefaultValue() {
      return defaultValue;
    }

    public T getPreviousValue() {
      return previousValue;
    }

    public long getLastSendMills() {
      return lastSendMills;
    }

    public boolean isValueSented() {
      return isValueSented;
    }
  }

  private interface NMNodeAttributesHandler {

    /**
     * validates nodeAttributes From Provider and returns it to the caller. Also
     * ensures that if provider returns null then empty set is considered
     */
    Set getNodeAttributesForRegistration();

    /**
     * @return the node attributes of this node manager.
     */
    Set getNodeAttributesForHeartbeat();

    /**
     * @return RMRegistration Success message and on failure will log
     *         independently and returns empty string
     */
    String verifyRMRegistrationResponseForNodeAttributes(
        RegisterNodeManagerResponse regNMResponse);

    /**
     * check whether if updated attributes sent to RM was accepted or not.
     * @param response
     */
    void verifyRMHeartbeatResponseForNodeAttributes(
        NodeHeartbeatResponse response);
  }


  /**
   * In centralized configuration, NM need not send Node attributes or process
   * the response.
   */
  private static class NMCentralizedNodeAttributesHandler
      implements NMNodeAttributesHandler {
    @Override
    public Set getNodeAttributesForHeartbeat() {
      return null;
    }

    @Override
    public Set getNodeAttributesForRegistration() {
      return null;
    }

    @Override
    public void verifyRMHeartbeatResponseForNodeAttributes(
        NodeHeartbeatResponse response) {
    }

    @Override
    public String verifyRMRegistrationResponseForNodeAttributes(
        RegisterNodeManagerResponse regNMResponse) {
      return "";
    }
  }

  private static class NMDistributedNodeAttributesHandler
      extends CachedNodeDescriptorHandler>
      implements NMNodeAttributesHandler {

    private final NodeAttributesProvider attributesProvider;

    protected NMDistributedNodeAttributesHandler(
        NodeAttributesProvider provider, Configuration conf) {
      super(Collections.unmodifiableSet(new HashSet<>(0)),
          conf.getLong(YarnConfiguration.NM_NODE_ATTRIBUTES_RESYNC_INTERVAL,
              YarnConfiguration.DEFAULT_NM_NODE_ATTRIBUTES_RESYNC_INTERVAL));
      this.attributesProvider = provider;
    }

    @Override
    public Set getNodeAttributesForRegistration() {
      return getValueForRegistration();
    }

    @Override
    public Set getNodeAttributesForHeartbeat() {
      return getValueForHeartbeat();
    }

    @Override
    public Set getValueFromProvider() {
      return attributesProvider.getDescriptors();
    }

    @Override
    protected void validate(Set nodeAttributes)
        throws IOException {
      try {
        NodeLabelUtil.validateNodeAttributes(nodeAttributes);
      } catch (IOException e) {
        LOG.error(
            "Invalid node attribute(s) from Provider : " + e.getMessage());
        throw e;
      }
    }

    @Override
    protected boolean isValueUpdated(Set value) {
      return !NodeLabelUtil.isNodeAttributesEquals(getPreviousValue(), value);
    }

    @Override
    public String verifyRMRegistrationResponseForNodeAttributes(
        RegisterNodeManagerResponse regNMResponse) {
      StringBuilder successfulNodeAttributesRegistrationMsg =
          new StringBuilder();
      if (regNMResponse.getAreNodeAttributesAcceptedByRM()) {
        successfulNodeAttributesRegistrationMsg
            .append(" and with following Node attribute(s) : {")
            .append(getPreviousValue()).append("}");
      } else {
        // case where provider is set but RM did not accept the node attributes
        String errorMsgFromRM = regNMResponse.getDiagnosticsMessage();
        LOG.error("Node attributes sent from NM while registration were"
            + " rejected by RM. " + ((errorMsgFromRM == null) ?
            "Seems like RM is configured with Centralized Attributes." :
            "And with message " + regNMResponse.getDiagnosticsMessage()));
      }
      return successfulNodeAttributesRegistrationMsg.toString();
    }

    @Override
    public void verifyRMHeartbeatResponseForNodeAttributes(
        NodeHeartbeatResponse response) {
      if (isValueSented()) {
        if (response.getAreNodeAttributesAcceptedByRM()) {
          if(LOG.isDebugEnabled()){
            LOG.debug("Node attributes {" + getPreviousValue()
                + "} were Accepted by RM ");
          }
        } else {
          // case where updated node attributes from NodeAttributesProvider
          // is sent to RM and RM rejected the attributes
          LOG.error("NM node attributes {" + getPreviousValue()
              + "} were not accepted by RM and message from RM : " + response
              .getDiagnosticsMessage());
        }
      }
    }
  }

  private static interface NMNodeLabelsHandler {
    /**
     * validates nodeLabels From Provider and returns it to the caller. Also
     * ensures that if provider returns null then empty label set is considered
     */
    Set getNodeLabelsForRegistration();

    /**
     * @return RMRegistration Success message and on failure will log
     *         independently and returns empty string
     */
    String verifyRMRegistrationResponseForNodeLabels(
        RegisterNodeManagerResponse regNMResponse);

    /**
     * If nodeLabels From Provider is different previous node labels then it
     * will check the syntax correctness and throws exception if invalid. If
     * valid, returns nodeLabels From Provider. Also ensures that if provider
     * returns null then empty label set is considered and If labels are not
     * modified it returns null.
     */
    Set getNodeLabelsForHeartbeat();

    /**
     * check whether if updated labels sent to RM was accepted or not
     * @param response
     */
    void verifyRMHeartbeatResponseForNodeLabels(NodeHeartbeatResponse response);
  }

  /**
   * In centralized configuration, NM need not send Node labels or process the
   * response
   */
  private static class NMCentralizedNodeLabelsHandler
      implements NMNodeLabelsHandler {
    @Override
    public Set getNodeLabelsForHeartbeat() {
      return null;
    }

    @Override
    public Set getNodeLabelsForRegistration() {
      return null;
    }

    @Override
    public void verifyRMHeartbeatResponseForNodeLabels(
        NodeHeartbeatResponse response) {
    }

    @Override
    public String verifyRMRegistrationResponseForNodeLabels(
        RegisterNodeManagerResponse regNMResponse) {
      return "";
    }
  }

  private static class NMDistributedNodeLabelsHandler
      extends CachedNodeDescriptorHandler>
      implements NMNodeLabelsHandler {

    private NMDistributedNodeLabelsHandler(
        NodeLabelsProvider nodeLabelsProvider, Configuration conf) {
      super(CommonNodeLabelsManager.EMPTY_NODELABEL_SET,
          conf.getLong(YarnConfiguration.NM_NODE_LABELS_RESYNC_INTERVAL,
              YarnConfiguration.DEFAULT_NM_NODE_LABELS_RESYNC_INTERVAL));
      this.nodeLabelsProvider = nodeLabelsProvider;
    }

    private final NodeLabelsProvider nodeLabelsProvider;

    @Override
    public Set getNodeLabelsForRegistration() {
      return getValueForRegistration();
    }

    @Override
    public String verifyRMRegistrationResponseForNodeLabels(
        RegisterNodeManagerResponse regNMResponse) {
      StringBuilder successfulNodeLabelsRegistrationMsg = new StringBuilder("");
      if (regNMResponse.getAreNodeLabelsAcceptedByRM()) {
        successfulNodeLabelsRegistrationMsg
            .append(" and with following Node label(s) : {")
            .append(StringUtils.join(",", getPreviousValue())).append("}");
      } else {
        // case where provider is set but RM did not accept the Node Labels
        String errorMsgFromRM = regNMResponse.getDiagnosticsMessage();
        LOG.error(
            "NodeLabels sent from NM while registration were rejected by RM. "
            + ((errorMsgFromRM == null)
                ? "Seems like RM is configured with Centralized Labels."
                : "And with message " + regNMResponse.getDiagnosticsMessage()));
      }
      return successfulNodeLabelsRegistrationMsg.toString();
    }

    @Override
    public Set getNodeLabelsForHeartbeat() {
      return getValueForHeartbeat();
    }

    protected void validate(Set nodeLabels)
        throws IOException {
      Iterator iterator = nodeLabels.iterator();
      boolean hasInvalidLabel = false;
      StringBuilder errorMsg = new StringBuilder();
      while (iterator.hasNext()) {
        try {
          NodeLabelUtil
              .checkAndThrowLabelName(iterator.next().getName());
        } catch (IOException e) {
          errorMsg.append(e.getMessage());
          errorMsg.append(" , ");
          hasInvalidLabel = true;
        }
      }
      if (hasInvalidLabel) {
        LOG.error("Invalid Node Label(s) from Provider : " + errorMsg);
        throw new IOException(errorMsg.toString());
      }
    }

    @Override
    public Set getValueFromProvider() {
      return this.nodeLabelsProvider.getDescriptors();
    }

    @Override
    protected boolean isValueUpdated(Set value) {
      return !Objects.equals(value, getPreviousValue());
    }

    @Override
    public void verifyRMHeartbeatResponseForNodeLabels(
        NodeHeartbeatResponse response) {
      if (isValueSented()) {
        if (response.getAreNodeLabelsAcceptedByRM()) {
          if(LOG.isDebugEnabled()){
            LOG.debug(
                "Node Labels {" + StringUtils.join(",", getPreviousValue())
                    + "} were Accepted by RM ");
          }
        } else {
          // case where updated labels from NodeLabelsProvider is sent to RM and
          // RM rejected the labels
          LOG.error(
              "NM node labels {" + StringUtils.join(",", getPreviousValue())
                  + "} were not accepted by RM and message from RM : "
                  + response.getDiagnosticsMessage());
        }
      }
    }
  }

  private class StatusUpdaterRunnable implements Runnable {
    @Override
    @SuppressWarnings("unchecked")
    public void run() {
      int lastHeartbeatID = 0;
      while (!isStopped) {
        // Send heartbeat
        try {
          NodeHeartbeatResponse response = null;
          Set nodeLabelsForHeartbeat =
              nodeLabelsHandler.getNodeLabelsForHeartbeat();
          Set nodeAttributesForHeartbeat =
                  nodeAttributesHandler.getNodeAttributesForHeartbeat();
          NodeStatus nodeStatus = getNodeStatus(lastHeartbeatID);
          NodeHeartbeatRequest request =
              NodeHeartbeatRequest.newInstance(nodeStatus,
                  NodeStatusUpdaterImpl.this.context
                      .getContainerTokenSecretManager().getCurrentKey(),
                  NodeStatusUpdaterImpl.this.context
                      .getNMTokenSecretManager().getCurrentKey(),
                  nodeLabelsForHeartbeat,
                  nodeAttributesForHeartbeat,
                  NodeStatusUpdaterImpl.this.context
                      .getRegisteringCollectors());

          if (logAggregationEnabled) {
            // pull log aggregation status for application running in this NM
            List logAggregationReports =
                getLogAggregationReportsForApps(context
                    .getLogAggregationStatusForApps());
            if (logAggregationReports != null
                && !logAggregationReports.isEmpty()) {
              request.setLogAggregationReportsForApps(logAggregationReports);
            }
          }

          response = resourceTracker.nodeHeartbeat(request);
          //get next heartbeat interval from response
          nextHeartBeatInterval = response.getNextHeartBeatInterval();
          updateMasterKeys(response);

          if (!handleShutdownOrResyncCommand(response)) {
            nodeLabelsHandler.verifyRMHeartbeatResponseForNodeLabels(
                response);
            nodeAttributesHandler
                .verifyRMHeartbeatResponseForNodeAttributes(response);

            // Explicitly put this method after checking the resync
            // response. We
            // don't want to remove the completed containers before resync
            // because these completed containers will be reported back to RM
            // when NM re-registers with RM.
            // Only remove the cleanedup containers that are acked
            removeOrTrackCompletedContainersFromContext(response
                .getContainersToBeRemovedFromNM());

            logAggregationReportForAppsTempList.clear();
            lastHeartbeatID = response.getResponseId();
            List containersToCleanup = response
                .getContainersToCleanup();
            if (!containersToCleanup.isEmpty()) {
              dispatcher.getEventHandler().handle(
                  new CMgrCompletedContainersEvent(containersToCleanup,
                      CMgrCompletedContainersEvent.Reason
                          .BY_RESOURCEMANAGER));
            }
            List appsToCleanup =
                response.getApplicationsToCleanup();
            //Only start tracking for keepAlive on FINISH_APP
            trackAppsForKeepAlive(appsToCleanup);
            if (!appsToCleanup.isEmpty()) {
              dispatcher.getEventHandler().handle(
                  new CMgrCompletedAppsEvent(appsToCleanup,
                      CMgrCompletedAppsEvent.Reason.BY_RESOURCEMANAGER));
            }
            Map systemCredentials =
                response.getSystemCredentialsForApps();
            if (systemCredentials != null && !systemCredentials.isEmpty()) {
              ((NMContext) context).setSystemCrendentialsForApps(
                  parseCredentials(systemCredentials));
              context.getContainerManager().handleCredentialUpdate();
            }
            List
                containersToUpdate = response.getContainersToUpdate();
            if (!containersToUpdate.isEmpty()) {
              dispatcher.getEventHandler().handle(
                  new CMgrUpdateContainersEvent(containersToUpdate));
            }

            // SignalContainer request originally comes from end users via
            // ClientRMProtocol's SignalContainer. Forward the request to
            // ContainerManager which will dispatch the event to
            // ContainerLauncher.
            List containersToSignal = response
                .getContainersToSignalList();
            if (!containersToSignal.isEmpty()) {
              dispatcher.getEventHandler().handle(
                  new CMgrSignalContainersEvent(containersToSignal));
            }

            // Update QueuingLimits if ContainerManager supports queuing
            ContainerQueuingLimit queuingLimit =
                response.getContainerQueuingLimit();
            if (queuingLimit != null) {
              context.getContainerManager().updateQueuingLimit(queuingLimit);
            }
          }
          // Handling node resource update case.
          Resource newResource = response.getResource();
          if (newResource != null) {
            updateNMResource(newResource);
            if (LOG.isDebugEnabled()) {
              LOG.debug("Node's resource is updated to " +
                  newResource.toString());
            }
          }
          if (timelineServiceV2Enabled) {
            updateTimelineCollectorData(response);
          }

        } catch (ConnectException e) {
          //catch and throw the exception if tried MAX wait time to connect RM
          dispatcher.getEventHandler().handle(
              new NodeManagerEvent(NodeManagerEventType.SHUTDOWN));
          // failed to connect to RM.
          failedToConnect = true;
          throw new YarnRuntimeException(e);
        } catch (Exception e) {

          // TODO Better error handling. Thread can die with the rest of the
          // NM still running.
          LOG.error("Caught exception in status-updater", e);
        } finally {
          synchronized (heartbeatMonitor) {
            nextHeartBeatInterval = nextHeartBeatInterval <= 0 ?
                YarnConfiguration.DEFAULT_RM_NM_HEARTBEAT_INTERVAL_MS :
                nextHeartBeatInterval;
            try {
              heartbeatMonitor.wait(nextHeartBeatInterval);
            } catch (InterruptedException e) {
              // Do Nothing
            }
          }
        }
      }
    }

    private void updateTimelineCollectorData(
        NodeHeartbeatResponse response) {
      Map incomingCollectorsMap =
          response.getAppCollectors();
      if (incomingCollectorsMap == null) {
        if (LOG.isDebugEnabled()) {
          LOG.debug("No collectors to update RM");
        }
        return;
      }
      Map knownCollectors =
          context.getKnownCollectors();
      for (Map.Entry entry
          : incomingCollectorsMap.entrySet()) {
        ApplicationId appId = entry.getKey();
        AppCollectorData collectorData = entry.getValue();

        // Only handle applications running on local node.
        Application application = context.getApplications().get(appId);
        if (application != null) {
          // Update collector data if the newly received data happens after
          // the known data (updates the known data).
          AppCollectorData existingData = knownCollectors.get(appId);
          if (AppCollectorData.happensBefore(existingData, collectorData)) {
            if (LOG.isDebugEnabled()) {
              LOG.debug("Sync a new collector address: "
                  + collectorData.getCollectorAddr()
                  + " for application: " + appId + " from RM.");
            }
            // Update information for clients.
            NMTimelinePublisher nmTimelinePublisher =
                context.getNMTimelinePublisher();
            if (nmTimelinePublisher != null) {
              nmTimelinePublisher.setTimelineServiceAddress(
                  application.getAppId(), collectorData.getCollectorAddr());
            }
            // Update information for the node manager itself.
            knownCollectors.put(appId, collectorData);
          }
        }
        // Remove the registering collector data
        context.getRegisteringCollectors().remove(entry.getKey());
      }
    }

    private void updateMasterKeys(NodeHeartbeatResponse response) {
      // See if the master-key has rolled over
      MasterKey updatedMasterKey = response.getContainerTokenMasterKey();
      if (updatedMasterKey != null) {
        // Will be non-null only on roll-over on RM side
        context.getContainerTokenSecretManager().setMasterKey(updatedMasterKey);
      }

      updatedMasterKey = response.getNMTokenMasterKey();
      if (updatedMasterKey != null) {
        context.getNMTokenSecretManager().setMasterKey(updatedMasterKey);
      }
    }
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy