All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hadoop.mapreduce.v2.app.TaskHeartbeatHandler Maven / Gradle / Ivy

/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements.  See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership.  The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.  You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.hadoop.mapreduce.v2.app;

import java.util.Iterator;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;

import com.google.common.annotations.VisibleForTesting;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.util.MRJobConfUtil;
import org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptId;
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptDiagnosticsUpdateEvent;
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptEvent;
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptEventType;
import org.apache.hadoop.service.AbstractService;
import org.apache.hadoop.yarn.event.EventHandler;
import org.apache.hadoop.yarn.util.Clock;


/**
 * This class keeps track of tasks that have already been launched. It
 * determines if a task is alive and running or marks a task as dead if it does
 * not hear from it for a long time.
 *
 * <p>Two maps are maintained, both keyed by {@link TaskAttemptId}:
 * attempts that are currently registered, and attempts that were recently
 * unregistered. A background thread started in {@link #serviceStart()} scans
 * both maps every {@code taskTimeOutCheckInterval} milliseconds. Registered
 * attempts whose last progress report is older than the task timeout are
 * removed and reported to the event handler as timed out; recently
 * unregistered attempts are dropped once the unregister timeout has elapsed.
 *
 * <p>The {@link EventHandler} is used as a raw type in the constructor
 * signature, which is why raw-type and unchecked warnings are suppressed for
 * the class.
 */
@SuppressWarnings({"unchecked", "rawtypes"})
public class TaskHeartbeatHandler extends AbstractService {

  /** Mutable holder for the time (from {@link Clock#getTime()}) of the last report. */
  private static class ReportTime {
    private long lastProgress;

    public ReportTime(long time) {
      setLastProgress(time);
    }

    public synchronized void setLastProgress(long time) {
      lastProgress = time;
    }

    public synchronized long getLastProgress() {
      return lastProgress;
    }
  }

  private static final Log LOG = LogFactory.getLog(TaskHeartbeatHandler.class);

  //thread which runs periodically to see the last time since a heartbeat is
  //received from a task.
  private Thread lostTaskCheckerThread;
  private volatile boolean stopped;
  private long taskTimeOut;
  private long unregisterTimeOut;
  private int taskTimeOutCheckInterval = 30 * 1000; // 30 seconds.

  private final EventHandler eventHandler;
  private final Clock clock;

  // Registered attempts -> time of their last progress report.
  private final ConcurrentMap<TaskAttemptId, ReportTime> runningAttempts;
  // Unregistered attempts -> time at which they were unregistered.
  private final ConcurrentMap<TaskAttemptId, ReportTime>
      recentlyUnregisteredAttempts;

  /**
   * Creates the handler. No thread is started until {@link #serviceStart()}.
   *
   * @param eventHandler receives the diagnostics and timed-out events raised
   *                     for attempts that stop reporting progress
   * @param clock        source of the current time for all bookkeeping
   * @param numThreads   passed as the concurrency level of the internal
   *                     concurrent maps
   */
  public TaskHeartbeatHandler(EventHandler eventHandler, Clock clock,
      int numThreads) {
    super("TaskHeartbeatHandler");
    this.eventHandler = eventHandler;
    this.clock = clock;
    runningAttempts =
        new ConcurrentHashMap<TaskAttemptId, ReportTime>(16, 0.75f, numThreads);
    recentlyUnregisteredAttempts =
        new ConcurrentHashMap<TaskAttemptId, ReportTime>(16, 0.75f, numThreads);
  }

  /**
   * Reads the task timeout, the task-exit (unregister) timeout and the check
   * interval from the configuration. The task timeout is raised, if needed,
   * to twice the task progress report interval.
   */
  @Override
  protected void serviceInit(Configuration conf) throws Exception {
    super.serviceInit(conf);
    taskTimeOut = conf.getLong(
        MRJobConfig.TASK_TIMEOUT, MRJobConfig.DEFAULT_TASK_TIMEOUT_MILLIS);
    unregisterTimeOut = conf.getLong(MRJobConfig.TASK_EXIT_TIMEOUT,
        MRJobConfig.TASK_EXIT_TIMEOUT_DEFAULT);

    // enforce task timeout is at least twice as long as task report interval
    long taskProgressReportIntervalMillis = MRJobConfUtil.
        getTaskProgressReportInterval(conf);
    long minimumTaskTimeoutAllowed = taskProgressReportIntervalMillis * 2;
    if(taskTimeOut < minimumTaskTimeoutAllowed) {
      taskTimeOut = minimumTaskTimeoutAllowed;
      LOG.info("Task timeout must be as least twice as long as the task " +
          "status report interval. Setting task timeout to " + taskTimeOut);
    }

    taskTimeOutCheckInterval =
        conf.getInt(MRJobConfig.TASK_TIMEOUT_CHECK_INTERVAL_MS, 30 * 1000);
  }

  /** Starts the background thread that periodically checks for lost tasks. */
  @Override
  protected void serviceStart() throws Exception {
    lostTaskCheckerThread = new Thread(new PingChecker());
    lostTaskCheckerThread.setName("TaskHeartbeatHandler PingChecker");
    lostTaskCheckerThread.start();
    super.serviceStart();
  }

  /** Signals the checker thread to stop and interrupts it if it was started. */
  @Override
  protected void serviceStop() throws Exception {
    stopped = true;
    if (lostTaskCheckerThread != null) {
      lostTaskCheckerThread.interrupt();
    }
    super.serviceStop();
  }

  /**
   * Records that the given attempt has just made progress. Attempts that are
   * not currently registered are silently ignored.
   *
   * @param attemptID the attempt reporting progress
   */
  public void progressing(TaskAttemptId attemptID) {
    //only put for the registered attempts
    //TODO throw an exception if the task isn't registered.
    ReportTime time = runningAttempts.get(attemptID);
    if(time != null) {
      time.setLastProgress(clock.getTime());
    }
  }

  /**
   * Starts tracking the given attempt, using the current clock time as its
   * last progress time.
   *
   * @param attemptID the attempt to track
   */
  public void register(TaskAttemptId attemptID) {
    runningAttempts.put(attemptID, new ReportTime(clock.getTime()));
  }

  /**
   * Stops tracking the given attempt and remembers it as recently
   * unregistered, stamped with the current clock time.
   *
   * @param attemptID the attempt to stop tracking
   */
  public void unregister(TaskAttemptId attemptID) {
    runningAttempts.remove(attemptID);
    recentlyUnregisteredAttempts.put(attemptID,
        new ReportTime(clock.getTime()));
  }

  /**
   * @param attemptID the attempt to look up
   * @return {@code true} if the attempt was unregistered and has not yet been
   *         purged by the periodic check
   */
  public boolean hasRecentlyUnregistered(TaskAttemptId attemptID) {
    return recentlyUnregisteredAttempts.containsKey(attemptID);
  }

  /** Periodic task that expires silent attempts and purges old unregistrations. */
  private class PingChecker implements Runnable {

    @Override
    public void run() {
      while (!stopped && !Thread.currentThread().isInterrupted()) {
        long currentTime = clock.getTime();
        checkRunning(currentTime);
        checkRecentlyUnregistered(currentTime);
        try {
          Thread.sleep(taskTimeOutCheckInterval);
        } catch (InterruptedException e) {
          LOG.info("TaskHeartbeatHandler thread interrupted");
          break;
        }
      }
    }

    /**
     * Removes every registered attempt whose last progress is older than the
     * task timeout and raises a diagnostics update followed by a timed-out
     * event for it. A task timeout of zero or less disables the check.
     */
    private void checkRunning(long currentTime) {
      Iterator<Map.Entry<TaskAttemptId, ReportTime>> iterator =
          runningAttempts.entrySet().iterator();

      while (iterator.hasNext()) {
        Map.Entry<TaskAttemptId, ReportTime> entry = iterator.next();
        // Compare elapsed time against the timeout rather than computing
        // "lastProgress + timeout", which can overflow for very large
        // configured timeouts and would expire the attempt immediately.
        boolean taskTimedOut = (taskTimeOut > 0) &&
            (currentTime - entry.getValue().getLastProgress() > taskTimeOut);

        if(taskTimedOut) {
          // task is lost, remove from the list and raise lost event
          iterator.remove();
          eventHandler.handle(new TaskAttemptDiagnosticsUpdateEvent(entry
              .getKey(), "AttemptID:" + entry.getKey().toString()
              + " Timed out after " + taskTimeOut / 1000 + " secs"));
          eventHandler.handle(new TaskAttemptEvent(entry.getKey(),
              TaskAttemptEventType.TA_TIMED_OUT));
        }
      }
    }

    /**
     * Drops recently-unregistered attempts once more than the unregister
     * timeout has elapsed since they were unregistered.
     */
    private void checkRecentlyUnregistered(long currentTime) {
      Iterator<ReportTime> iterator =
          recentlyUnregisteredAttempts.values().iterator();
      while (iterator.hasNext()) {
        ReportTime unregisteredTime = iterator.next();
        // Elapsed-time form avoids overflow of "time + timeout".
        if (currentTime - unregisteredTime.getLastProgress()
            > unregisterTimeOut) {
          iterator.remove();
        }
      }
    }
  }

  /** @return the effective task timeout in milliseconds. */
  @VisibleForTesting
  public long getTaskTimeOut() {
    return taskTimeOut;
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy