All downloads are free. Search and download functionality uses the official Maven repository.

com.tencent.angel.master.app.App Maven / Gradle / Ivy

There is a newer version: 3.2.0
Show newest version
/*
 * Tencent is pleased to support the open source community by making Angel available.
 *
 * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 *
 * https://opensource.org/licenses/Apache-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 *
 */


package com.tencent.angel.master.app;

import com.tencent.angel.RunningMode;
import com.tencent.angel.conf.AngelConf;
import com.tencent.angel.protobuf.generated.ClientMasterServiceProtos.GetJobReportResponse;
import com.tencent.angel.protobuf.generated.ClientMasterServiceProtos.JobReportProto;
import com.tencent.angel.protobuf.generated.ClientMasterServiceProtos.JobStateProto;
import com.tencent.angel.protobuf.generated.MLProtos;
import com.tencent.angel.utils.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.service.AbstractService;
import org.apache.hadoop.yarn.event.EventHandler;
import org.apache.hadoop.yarn.state.InvalidStateTransitonException;
import org.apache.hadoop.yarn.state.SingleArcTransition;
import org.apache.hadoop.yarn.state.StateMachine;
import org.apache.hadoop.yarn.state.StateMachineFactory;

import java.io.IOException;
import java.util.*;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;

/**
 * Angel application state machine.
 *
 * <p>Drives the application through its {@link AppState}s with a YARN {@link StateMachine},
 * keeps the diagnostics, the launch/finish times and the retry flag, and runs a background
 * monitor thread that posts an {@link InternalErrorEvent} when the application stays in a
 * monitored state for at least the configured timeout.
 *
 * <p>The forced state, diagnostics, finish time, retry flag and state timestamps are guarded by
 * a single read/write lock.
 */
public class App extends AbstractService implements EventHandler<AppEvent> {

  private static final Log LOG = LogFactory.getLog(App.class);
  private static final String JOB_STATE_PREFIX = "J_";

  /**
   * pause between two checks of the state timeout monitor, in milliseconds
   */
  private static final long STATE_MONITOR_INTERVAL_MS = 1000L;

  // NOTE: the shared transition instances must be declared before the factory below, because
  // static initializers run in textual order and the factory references them.
  private static final AppSuccessTransition APP_SUCCESS_TRANSITION = new AppSuccessTransition();
  private static final AppKilledTransition APP_KILLED_TRANSITION = new AppKilledTransition();
  private static final AppFailedTransition APP_FAILED_TRANSITION = new AppFailedTransition();

  /**
   * Transition table of the application. SUCCEEDED, KILLED and FAILED only have self-loops, so
   * once one of them is reached every further event is accepted and ignored.
   */
  protected static final StateMachineFactory<App, AppState, AppEventType, AppEvent>
      stateMachineFactory =
      new StateMachineFactory<App, AppState, AppEventType, AppEvent>(AppState.NEW)
          .addTransition(AppState.NEW, AppState.INITED, AppEventType.INIT)
          .addTransition(AppState.NEW, AppState.KILLED, AppEventType.KILL, APP_KILLED_TRANSITION)
          .addTransition(AppState.NEW, AppState.FAILED, AppEventType.INTERNAL_ERROR,
              APP_FAILED_TRANSITION)

          .addTransition(AppState.INITED, AppState.PREPARE_WORKERS, AppEventType.LAUNCH_WORKERS,
              new AppLaunchWorkersTransition())
          .addTransition(AppState.INITED, AppState.KILLED, AppEventType.KILL, APP_KILLED_TRANSITION)
          .addTransition(AppState.INITED, AppState.FAILED, AppEventType.INTERNAL_ERROR,
              APP_FAILED_TRANSITION)

          .addTransition(AppState.PREPARE_WORKERS, AppState.RUNNING,
              AppEventType.ALL_WORKERS_LAUNCHED)
          .addTransition(AppState.PREPARE_WORKERS, AppState.KILLED, AppEventType.KILL,
              APP_KILLED_TRANSITION)
          .addTransition(AppState.PREPARE_WORKERS, AppState.FAILED, AppEventType.INTERNAL_ERROR,
              APP_FAILED_TRANSITION)

          .addTransition(AppState.RUNNING, AppState.EXECUTE_SUCCESSED, AppEventType.EXECUTE_SUCESS)
          .addTransition(AppState.RUNNING, AppState.KILLED, AppEventType.KILL,
              APP_KILLED_TRANSITION)
          .addTransition(AppState.RUNNING, AppState.FAILED, AppEventType.INTERNAL_ERROR,
              APP_FAILED_TRANSITION)
          .addTransition(AppState.RUNNING, AppState.SUCCEEDED, AppEventType.SUCCESS,
              APP_SUCCESS_TRANSITION)

          .addTransition(AppState.EXECUTE_SUCCESSED, AppState.KILLED, AppEventType.KILL,
              APP_KILLED_TRANSITION)
          .addTransition(AppState.EXECUTE_SUCCESSED, AppState.FAILED, AppEventType.INTERNAL_ERROR,
              APP_FAILED_TRANSITION)
          .addTransition(AppState.EXECUTE_SUCCESSED, AppState.SUCCEEDED, AppEventType.SUCCESS,
              APP_SUCCESS_TRANSITION)

          .addTransition(AppState.SUCCEEDED, AppState.SUCCEEDED, EnumSet
              .of(AppEventType.INIT, AppEventType.LAUNCH_WORKERS, AppEventType.ALL_WORKERS_LAUNCHED,
                  AppEventType.EXECUTE_SUCESS,
                  AppEventType.SUCCESS, AppEventType.KILL, AppEventType.INTERNAL_ERROR))

          .addTransition(AppState.KILLED, AppState.KILLED, EnumSet
              .of(AppEventType.INIT, AppEventType.LAUNCH_WORKERS, AppEventType.ALL_WORKERS_LAUNCHED,
                  AppEventType.EXECUTE_SUCESS,
                  AppEventType.SUCCESS, AppEventType.KILL, AppEventType.INTERNAL_ERROR))

          .addTransition(AppState.FAILED, AppState.FAILED, EnumSet
              .of(AppEventType.INIT, AppEventType.LAUNCH_WORKERS, AppEventType.ALL_WORKERS_LAUNCHED,
                  AppEventType.EXECUTE_SUCESS,
                  AppEventType.SUCCESS, AppEventType.KILL, AppEventType.INTERNAL_ERROR));

  private final AMContext context;

  /**
   * application diagnostics
   */
  private final List<String> diagnostics;

  /**
   * application state machine
   */
  private final StateMachine<AppState, AppEventType, AppEvent> stateMachine;

  /**
   * the state is externally enforced, its priority is higher than the state of the state machine
   */
  private AppState forcedState = null;

  /**
   * application launch time
   */
  private final long launchTime;

  /**
   * application finish time
   */
  private long finishTime;

  /**
   * read/write lock
   */
  private final Lock readLock;
  private final Lock writeLock;

  /**
   * identify whether the application needs to be retried
   */
  private boolean shouldRetry;

  /**
   * state timeout monitor
   */
  private volatile Thread stateMonitor;

  /**
   * state to state start timestamp map; only states that have an entry here are checked by the
   * timeout monitor
   */
  private final Map<AppState, Long> stateToTsMap;

  /**
   * the longest time in milliseconds the application may stay in a monitored state. NEW is
   * monitored from construction; every state entered later is monitored except RUNNING and
   * EXECUTE_SUCCESSED.
   */
  private final long stateTimeOutMs;

  private final AtomicBoolean stopped;

  /**
   * Create the application in state NEW.
   *
   * @param context application master context; supplies the start time, the configuration,
   *                the clock and the event handler used by this class
   */
  public App(AMContext context) {
    super(App.class.getName());
    this.context = context;
    stateMachine = stateMachineFactory.make(this);
    ReadWriteLock readWriteLock = new ReentrantReadWriteLock();
    this.readLock = readWriteLock.readLock();
    this.writeLock = readWriteLock.writeLock();

    this.launchTime = context.getStartTime();
    shouldRetry = false;
    diagnostics = new ArrayList<>();

    stateTimeOutMs = context.getConf().getLong(AngelConf.ANGEL_AM_APPSTATE_TIMEOUT_MS,
        AngelConf.DEFAULT_ANGEL_AM_APPSTATE_TIMEOUT_MS);
    stateToTsMap = new EnumMap<>(AppState.class);
    // the application starts in NEW, so NEW is timed from now
    stateToTsMap.put(AppState.NEW, context.getClock().getTime());
    stopped = new AtomicBoolean(false);
  }

  /**
   * Post the INIT and LAUNCH_WORKERS events to the context's event handler.
   */
  @SuppressWarnings("unchecked")
  public void startExecute() {
    context.getEventHandler().handle(new AppEvent(AppEventType.INIT));
    context.getEventHandler().handle(new AppEvent(AppEventType.LAUNCH_WORKERS));
  }

  /**
   * get the state of the application as reported to the outside: NEW, INITED and RUNNING are
   * all reported as RUNNING; every other state (PREPARE_WORKERS included) is returned unchanged.
   *
   * @return AppState the externally visible state of application
   */
  public AppState getExternAppState() {
    AppState state = getInternalState();
    switch (state) {
      case NEW:
      case INITED:
      case RUNNING:
        return AppState.RUNNING;
      default:
        return state;
    }
  }

  /**
   * Start the state timeout monitor thread, then start the service.
   */
  @Override
  protected void serviceStart() throws Exception {
    Thread monitor = new Thread(new Runnable() {
      @Override
      public void run() {
        monitorStateTimeout();
      }
    });
    monitor.setName("app-state-monitor");
    stateMonitor = monitor;
    monitor.start();
    super.serviceStart();
  }

  /**
   * Body of the state timeout monitor: until the service is stopped or the thread is
   * interrupted, periodically post an internal error event if the current state has timed out.
   */
  @SuppressWarnings("unchecked")
  private void monitorStateTimeout() {
    while (!stopped.get() && !Thread.currentThread().isInterrupted()) {
      AppState timedOutState = findTimedOutState();
      if (timedOutState != null) {
        // Dispatched after the read lock is released: a handler that delivers synchronously to
        // handle() would otherwise block on the write lock while this thread holds the read lock.
        context.getEventHandler().handle(new InternalErrorEvent(context.getApplicationId(),
            "app in state " + timedOutState + " over " + stateTimeOutMs + " milliseconds!"));
      }

      try {
        // pause between checks instead of spinning and re-posting the event on every iteration
        Thread.sleep(STATE_MONITOR_INTERVAL_MS);
      } catch (InterruptedException e) {
        // serviceStop() interrupts this thread; keep the flag set and leave
        Thread.currentThread().interrupt();
        return;
      }
    }
  }

  /**
   * Check the current state against the timeout.
   *
   * @return the current state if it is monitored and has lasted at least the timeout,
   * otherwise null
   */
  private AppState findTimedOutState() {
    readLock.lock();
    try {
      AppState state = getInternalState();
      Long enteredAt = stateToTsMap.get(state);
      if (enteredAt != null && context.getClock().getTime() - enteredAt >= stateTimeOutMs) {
        return state;
      }
      return null;
    } finally {
      readLock.unlock();
    }
  }

  /**
   * Stop the state timeout monitor and the service. Only the first call has an effect.
   */
  @Override
  protected void serviceStop() throws Exception {
    if (stopped.getAndSet(true)) {
      return;
    }

    Thread monitor = stateMonitor;
    if (monitor != null) {
      monitor.interrupt();
      stateMonitor = null;
    }
    super.serviceStop();
    LOG.info("App manager stopped");
  }

  /**
   * get application running state: current state, current iteration and diagnostics TODO:we can add
   * more details of the app.
   *
   * <p>The total iteration number is read from the configuration. The current iteration and the
   * metrics of that iteration are only filled in when the context has an algorithm metrics
   * service; otherwise the current iteration is reported as 0.
   *
   * @return GetJobReportResponse application state
   */
  public GetJobReportResponse getJobReportResponse() {
    GetJobReportResponse.Builder getJobReportResBuilder = GetJobReportResponse.newBuilder();
    JobReportProto.Builder report = JobReportProto.newBuilder();
    report.setJobState(convertToProtoFormat(getExternAppState()));
    report.setDiagnostics(StringUtils.join("\n", getDiagnostics()));

    int totalIteration = context.getConf()
        .getInt(AngelConf.ANGEL_TASK_ITERATION_NUMBER,
            AngelConf.DEFAULT_ANGEL_TASK_ITERATION_NUMBER);
    report.setTotalIteration(totalIteration);
    int curIteration = 0;
    if (context.getAlgoMetricsService() != null) {
      curIteration = context.getAlgoMetricsService().getCurrentIter();
      Map<String, ?> metrics = context.getAlgoMetricsService().getAlgoMetrics(curIteration);
      MLProtos.Pair.Builder pairBuilder = MLProtos.Pair.newBuilder();

      if (metrics != null) {
        for (Map.Entry<String, ?> entry : metrics.entrySet()) {
          report.addMetrics(
              pairBuilder.setKey(entry.getKey()).setValue(String.valueOf(entry.getValue()))
                  .build());
        }
      }
    }
    report.setCurIteration(curIteration);

    getJobReportResBuilder.setJobReport(report);
    return getJobReportResBuilder.build();
  }

  /**
   * Render a metrics map as text, one "index name=KEY,value=VALUE" fragment per entry with no
   * separator between entries.
   *
   * @param metrics metric name to metric value
   * @return the concatenated fragments
   */
  private String toString(Map<String, ?> metrics) {
    StringBuilder sb = new StringBuilder();

    for (Map.Entry<String, ?> entry : metrics.entrySet()) {
      sb.append("index name=").append(entry.getKey()).append(",").append("value=")
          .append(entry.getValue());
    }

    return sb.toString();
  }


  /**
   * write application state to output stream
   *
   * @param out output stream
   * @throws IOException if writing the report to the stream fails
   */
  public void serilize(FSDataOutputStream out) throws IOException {
    GetJobReportResponse jobState = getJobReportResponse();
    jobState.writeTo(out);
    LOG.info("write app report to file successfully " + jobState);
  }

  /**
   * Map an application state to the protobuf job state named "J_" + state name.
   * NOTE(review): this relies on JobStateProto having a constant for every state that
   * getExternAppState() can return - confirm that PREPARE_WORKERS is covered.
   */
  private JobStateProto convertToProtoFormat(AppState appState) {
    return JobStateProto.valueOf(JOB_STATE_PREFIX + appState.name());
  }

  /**
   * append a message to the application diagnostics
   *
   * @param message diagnostic message
   */
  public void addDiagnostics(String message) {
    writeLock.lock();
    try {
      diagnostics.add(message);
    } finally {
      writeLock.unlock();
    }
  }

  /**
   * Feed an event to the state machine under the write lock. If the event is not valid in the
   * current state, an internal error event is posted to the context's event handler. When the
   * state changes, the entry time of the new state is recorded for the timeout monitor unless
   * the new state is RUNNING or EXECUTE_SUCCESSED.
   *
   * @param event application event
   */
  @SuppressWarnings("unchecked")
  @Override
  public void handle(AppEvent event) {
    LOG.info("Processing AppEvent type " + event);
    writeLock.lock();
    try {
      AppState oldState = getInternalState();
      try {
        stateMachine.doTransition(event.getType(), event);
      } catch (InvalidStateTransitonException e) {
        LOG.error("Can't handle this event at current state", e);
        context.getEventHandler().handle(new InternalErrorEvent(context.getApplicationId(),
            "Can't handle this event at current state" + e.getMessage()));
      }
      // notify the event handler of state change
      AppState newState = getInternalState();
      if (oldState != newState) {
        // If new state is not RUNNING and EXECUTE_SUCCESSED, add it to state timeout monitor
        if (newState != AppState.RUNNING && newState != AppState.EXECUTE_SUCCESSED) {
          stateToTsMap.put(newState, context.getClock().getTime());
        }
        LOG.info(
            context.getApplicationId() + "Job Transitioned from " + oldState + " to " + newState);
      }
    } finally {
      writeLock.unlock();
    }
  }

  /**
   * get application state
   *
   * @return AppState the forced state if one is set, otherwise the state of the state machine
   */
  public AppState getInternalState() {
    readLock.lock();
    try {
      // if forcedState is set, just return
      if (forcedState != null) {
        return forcedState;
      }

      // else get state from state machine
      return stateMachine.getCurrentState();
    } finally {
      readLock.unlock();
    }
  }

  /**
   * get application diagnostics
   *
   * @return List a copy of the application diagnostics
   */
  public List<String> getDiagnostics() {
    readLock.lock();
    try {
      return new ArrayList<>(diagnostics);
    } finally {
      readLock.unlock();
    }
  }

  /**
   * get application launch time
   *
   * @return long application launch time, in milliseconds
   */
  public long getLaunchTime() {
    return launchTime;
  }

  /**
   * get application finish time
   *
   * @return long application finish time, in milliseconds
   */
  public long getFinishTime() {
    readLock.lock();
    try {
      return finishTime;
    } finally {
      readLock.unlock();
    }
  }

  /**
   * Record the current clock time as the finish time. Callers in this class invoke it while
   * holding the write lock (from forceState, or from a transition run inside handle).
   */
  private void setFinishTime() {
    this.finishTime = context.getClock().getTime();
  }

  /**
   * force the application to specific state. Does nothing if the application is already
   * finished. Forcing SUCCEEDED, FAILED or KILLED also records the finish time.
   *
   * @param state forced state need to set
   */
  public void forceState(AppState state) {
    writeLock.lock();
    try {
      // Checked under the write lock so that the check and the update are atomic. Taking the
      // read lock inside isFinish() while holding the write lock is allowed by
      // ReentrantReadWriteLock.
      if (isFinish()) {
        return;
      }

      forcedState = state;
      if ((state == AppState.SUCCEEDED) || (state == AppState.FAILED) || (state
          == AppState.KILLED)) {
        setFinishTime();
      }
    } finally {
      writeLock.unlock();
    }
  }

  /**
   * check application is finish or not. when the application is in the SUCCEEDED, FAILED or KILLED
   * state, indicating that the application is finish
   *
   * @return boolean true if finish, otherwise false
   */
  public boolean isFinish() {
    AppState state = getInternalState();
    return (state == AppState.SUCCEEDED) || (state == AppState.FAILED) || (state
        == AppState.KILLED);
  }

  /**
   * check application is finish with SUCCEEDED
   *
   * @return boolean true if finish with SUCCEEDED, otherwise false
   */
  public boolean isSuccess() {
    AppState state = getInternalState();
    return state == AppState.SUCCEEDED;
  }

  /**
   * check if the application master needs to retry
   *
   * @return boolean true if need retry, otherwise false
   */
  public boolean isShouldRetry() {
    readLock.lock();
    try {
      return shouldRetry;
    } finally {
      readLock.unlock();
    }
  }

  /**
   * set application master retry flag
   *
   * @param retryEnable retry flag
   */
  public void shouldRetry(boolean retryEnable) {
    writeLock.lock();
    try {
      shouldRetry = retryEnable;
    } finally {
      writeLock.unlock();
    }
  }

  /**
   * Transition into SUCCEEDED: posts a SUCCESS_FINISH event and records the finish time.
   */
  public static class AppSuccessTransition implements SingleArcTransition<App, AppEvent> {

    @SuppressWarnings("unchecked")
    @Override
    public void transition(App app, AppEvent event) {
      app.context.getEventHandler().handle(
          new AppFinishEvent(AppFinishEventType.SUCCESS_FINISH, app.context.getApplicationId()));
      app.setFinishTime();
    }
  }


  /**
   * Transition from INITED to PREPARE_WORKERS: in ANGEL_PS_WORKER mode asks the worker manager
   * to start all workers; in any other mode immediately posts ALL_WORKERS_LAUNCHED.
   */
  public static class AppLaunchWorkersTransition implements SingleArcTransition<App, AppEvent> {

    @SuppressWarnings("unchecked")
    @Override
    public void transition(App app, AppEvent event) {
      if (app.context.getRunningMode() == RunningMode.ANGEL_PS_WORKER) {
        app.context.getWorkerManager().startAllWorker();
      } else {
        app.context.getEventHandler().handle(new AppEvent(AppEventType.ALL_WORKERS_LAUNCHED));
      }
    }
  }

  /**
   * Transition into KILLED: posts a KILL_FINISH event and records the finish time.
   */
  public static class AppKilledTransition implements SingleArcTransition<App, AppEvent> {

    @SuppressWarnings("unchecked")
    @Override
    public void transition(App app, AppEvent event) {
      app.context.getEventHandler()
          .handle(
              new AppFinishEvent(AppFinishEventType.KILL_FINISH, app.context.getApplicationId()));
      app.setFinishTime();
    }
  }


  /**
   * Transition into FAILED: posts an INTERNAL_ERROR_FINISH event, copies the retry flag and the
   * error message from the triggering event, and records the finish time.
   */
  public static class AppFailedTransition implements SingleArcTransition<App, AppEvent> {

    @SuppressWarnings("unchecked")
    @Override
    public void transition(App app, AppEvent event) {
      app.context.getEventHandler().handle(
          new AppFinishEvent(AppFinishEventType.INTERNAL_ERROR_FINISH,
              app.context.getApplicationId()));

      // this transition is only registered for INTERNAL_ERROR; the event is expected to be an
      // InternalErrorEvent (the cast fails otherwise)
      InternalErrorEvent errorEvent = (InternalErrorEvent) event;
      app.shouldRetry = errorEvent.isShouldRetry();

      LOG.info("some error happened, " + errorEvent);
      app.diagnostics.add(errorEvent.getErrorMsg());
      app.setFinishTime();
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy