com.tencent.angel.master.AngelApplicationMaster Maven / Gradle / Ivy
/*
* Tencent is pleased to support the open source community by making Angel available.
*
* Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
*
* https://opensource.org/licenses/Apache-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing permissions and limitations under
* the License.
*
*/
package com.tencent.angel.master;
import com.tencent.angel.AngelDeployMode;
import com.tencent.angel.RunningMode;
import com.tencent.angel.common.location.LocationManager;
import com.tencent.angel.conf.AngelConf;
import com.tencent.angel.master.app.*;
import com.tencent.angel.master.client.ClientManager;
import com.tencent.angel.master.data.DataSpliter;
import com.tencent.angel.master.data.DummyDataSpliter;
import com.tencent.angel.master.deploy.ContainerAllocator;
import com.tencent.angel.master.deploy.ContainerAllocatorEventType;
import com.tencent.angel.master.deploy.ContainerLauncher;
import com.tencent.angel.master.deploy.ContainerLauncherEventType;
import com.tencent.angel.master.deploy.local.LocalContainerAllocator;
import com.tencent.angel.master.deploy.local.LocalContainerLauncher;
import com.tencent.angel.master.deploy.yarn.YarnContainerAllocator;
import com.tencent.angel.master.deploy.yarn.YarnContainerLauncher;
import com.tencent.angel.master.matrix.committer.AMModelLoader;
import com.tencent.angel.master.matrix.committer.AMModelSaver;
import com.tencent.angel.master.matrixmeta.AMMatrixMetaManager;
import com.tencent.angel.master.metrics.MetricsEventType;
import com.tencent.angel.master.metrics.MetricsService;
import com.tencent.angel.master.oplog.AppStateStorage;
import com.tencent.angel.master.ps.ParameterServerManager;
import com.tencent.angel.master.ps.ParameterServerManagerEventType;
import com.tencent.angel.master.ps.attempt.PSAttemptEvent;
import com.tencent.angel.master.ps.attempt.PSAttemptEventType;
import com.tencent.angel.master.ps.ps.AMParameterServer;
import com.tencent.angel.master.ps.ps.AMParameterServerEvent;
import com.tencent.angel.master.ps.ps.AMParameterServerEventType;
import com.tencent.angel.master.psagent.*;
import com.tencent.angel.master.slowcheck.SlowChecker;
import com.tencent.angel.master.task.AMTaskManager;
import com.tencent.angel.master.worker.WorkerManager;
import com.tencent.angel.master.worker.WorkerManagerEventType;
import com.tencent.angel.master.worker.attempt.WorkerAttemptEvent;
import com.tencent.angel.master.worker.attempt.WorkerAttemptEventType;
import com.tencent.angel.master.worker.worker.AMWorkerEvent;
import com.tencent.angel.master.worker.worker.AMWorkerEventType;
import com.tencent.angel.master.worker.workergroup.AMWorkerGroupEvent;
import com.tencent.angel.master.worker.workergroup.AMWorkerGroupEventType;
import com.tencent.angel.plugin.AngelServiceLoader;
import com.tencent.angel.ps.PSAttemptId;
import com.tencent.angel.ps.ParameterServerId;
import com.tencent.angel.webapp.AngelWebApp;
import com.tencent.angel.worker.WorkerAttemptId;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.service.CompositeService;
import org.apache.hadoop.util.ShutdownHookManager;
import org.apache.hadoop.yarn.YarnUncaughtExceptionHandler;
import org.apache.hadoop.yarn.api.ApplicationConstants;
import org.apache.hadoop.yarn.api.ApplicationConstants.Environment;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.event.AsyncDispatcher;
import org.apache.hadoop.yarn.event.Dispatcher;
import org.apache.hadoop.yarn.event.EventHandler;
import org.apache.hadoop.yarn.security.AMRMTokenIdentifier;
import org.apache.hadoop.yarn.security.client.ClientToAMTokenSecretManager;
import org.apache.hadoop.yarn.util.Clock;
import org.apache.hadoop.yarn.util.ConverterUtils;
import org.apache.hadoop.yarn.util.SystemClock;
import org.apache.hadoop.yarn.webapp.WebApp;
import org.apache.hadoop.yarn.webapp.WebApps;
import java.io.IOException;
import java.security.PrivilegedExceptionAction;
import java.util.*;
import java.util.Map.Entry;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
/**
* Angel application master. It contains service modules: worker manager, parameter server manager,
* container allocator, container launcher, task manager and event handler.
*/
public class AngelApplicationMaster extends CompositeService {
/**
* class logger
*/
private static final Log LOG = LogFactory.getLog(AngelApplicationMaster.class);
/**
* priority passed to the ShutdownHookManager when the application master shutdown hook is
* registered
*/
public static final int SHUTDOWN_HOOK_PRIORITY = 30;
/**
* application name
*/
private final String appName;
/**
* application configuration
*/
private final Configuration conf;
/**
* application attempt id
*/
private final ApplicationAttemptId appAttemptId;
/**
* app start time, read from the clock when this master object is constructed
*/
private final long startTime;
/**
* app submit time
*/
private final long appSubmitTime;
/**
* container id for angel application master
*/
private final ContainerId containerId;
/**
* The host of the node manager where the angel application master is located
*/
private final String nmHost;
/**
* The port of the node manager where the angel application master is located
*/
private final int nmPort;
/**
* The web port of the node manager where the angel application master is located
*/
private final int nmHttpPort;
/**
* angel application master credentials
*/
private final Credentials credentials;
/**
* application running context, it is used to share information between all service modules
*/
private final AMContext appContext;
/**
* system clock
*/
private final SystemClock clock;
/**
* event dispatcher
*/
private Dispatcher dispatcher;
/**
* container allocator, it is used to apply for running resources for workers and parameter
* servers
*/
private ContainerAllocator containerAllocator;
/**
* container launcher, it is used to launch workers and parameter servers
*/
private ContainerLauncher containerLauncher;
/**
* parameter server manager
*/
private ParameterServerManager psManager;
/**
* angel application master service, it is used to respond to RPC requests from client, workers
* and parameter servers
*/
private volatile MasterService masterService;
/**
* matrix meta manager
*/
private AMMatrixMetaManager matrixMetaManager;
/**
* parameter server location manager
*/
private LocationManager locationManager;
/**
* worker manager
*/
private WorkerManager workerManager;
/**
* it is used to split training data
*/
private DataSpliter dataSpliter;
/**
* angel application master state storage
*/
private AppStateStorage appStateStorage;
/**
* angel application state
*/
private final App angelApp;
/**
* a web service for http access
*/
private WebApp webApp;
/**
* psagent manager
*/
private PSAgentManager psAgentManager;
/**
* identifies whether the shutdown sequence has already run; shutDownJob() returns early when it
* is true and sets it to true in its finally clause
*/
private boolean isCleared;
/**
* task manager
*/
private AMTaskManager taskManager;
/**
* Algorithm metrics collector
*/
private MetricsService algoMetricsService;
/**
* lock held by shutDownJob() while it checks and updates isCleared and stops the services
*/
private final Lock lock;
/**
* Angel Client manager
*/
private ClientManager clientManager;
/**
* Heartbeat monitor
*/
private HeartbeatMonitor hbMonitor;
/**
* Model saver
*/
private AMModelSaver modelSaver;
/**
* Model loader
*/
private AMModelLoader modelLoader;
/**
 * Builds the application master, records the launch parameters and creates the shared running
 * context together with the application state object.
 *
 * @param conf application configuration
 * @param appName application name
 * @param applicationAttemptId id of this application attempt
 * @param containerId id of the container the application master runs in
 * @param nmHost host of the node manager hosting the application master
 * @param nmPort port of that node manager
 * @param nmHttpPort web port of that node manager
 * @param appSubmitTime time at which the application was submitted
 * @param credentials credentials the application master runs with
 */
public AngelApplicationMaster(Configuration conf, String appName,
    ApplicationAttemptId applicationAttemptId, ContainerId containerId, String nmHost, int nmPort,
    int nmHttpPort, long appSubmitTime, Credentials credentials) {
  super(AngelApplicationMaster.class.getName());

  // Application identity and configuration.
  this.appName = appName;
  this.conf = conf;
  this.credentials = credentials;
  this.appAttemptId = applicationAttemptId;
  this.containerId = containerId;

  // Node manager endpoint.
  this.nmHost = nmHost;
  this.nmPort = nmPort;
  this.nmHttpPort = nmHttpPort;

  // Timing: the start time is taken from the freshly created clock.
  this.appSubmitTime = appSubmitTime;
  this.clock = new SystemClock();
  this.startTime = this.clock.getTime();

  // Shutdown bookkeeping.
  this.lock = new ReentrantLock();
  this.isCleared = false;

  // The context reads appAttemptId in its constructor and is handed to App, so both are
  // created last, once every other field is in place.
  this.appContext = new RunningAppContext(conf);
  this.angelApp = new App(this.appContext);
}
/**
 * Running application master context. Every getter exposes a field of the enclosing
 * {@link AngelApplicationMaster} (or a value read from its configuration), so the service
 * modules can share state through a single {@link AMContext} object.
 */
public class RunningAppContext implements AMContext {
  private final ClientToAMTokenSecretManager clientToAMTokenSecretManager;

  /**
   * Creates the context and its client-to-AM token secret manager for the current attempt.
   *
   * @param config not read here; the getters use the enclosing master's configuration
   */
  public RunningAppContext(Configuration config) {
    this.clientToAMTokenSecretManager = new ClientToAMTokenSecretManager(appAttemptId, null);
  }

  /** @return the id of this application attempt */
  @Override public ApplicationAttemptId getApplicationAttemptId() {
    return appAttemptId;
  }

  /** @return the master RPC service, as currently set on the enclosing master */
  @Override public MasterService getMasterService() {
    return masterService;
  }

  /** @return the application id taken from the attempt id */
  @Override public ApplicationId getApplicationId() {
    return appAttemptId.getApplicationId();
  }

  /** @return the application name */
  @Override public String getApplicationName() {
    return appName;
  }

  /** @return the time recorded when the master object was constructed */
  @Override public long getStartTime() {
    return startTime;
  }

  /** @return the event handler of the master's dispatcher */
  @SuppressWarnings("rawtypes") @Override public EventHandler getEventHandler() {
    return dispatcher.getEventHandler();
  }

  /** @return the user name stored in the configuration under {@code AngelConf.USER_NAME} */
  @Override public String getUser() {
    return conf.get(AngelConf.USER_NAME);
  }

  /** @return the master's clock */
  @Override public Clock getClock() {
    return clock;
  }

  /** @return the secret manager created for this attempt */
  @Override public ClientToAMTokenSecretManager getClientToAMTokenSecretManager() {
    return clientToAMTokenSecretManager;
  }

  /** @return the credentials the master was constructed with */
  @Override public Credentials getCredentials() {
    return credentials;
  }

  /** @return the container allocator */
  @Override public ContainerAllocator getContainerAllocator() {
    return containerAllocator;
  }

  /** @return the parameter server manager */
  @Override public ParameterServerManager getParameterServerManager() {
    return psManager;
  }

  /** @return the event dispatcher */
  @Override public Dispatcher getDispatcher() {
    return dispatcher;
  }

  /** @return the application state object */
  @Override public App getApp() {
    return angelApp;
  }

  /** @return the application configuration */
  @Override public Configuration getConf() {
    return conf;
  }

  /** @return the web application */
  @Override public WebApp getWebApp() {
    return webApp;
  }

  /** @return the matrix meta manager */
  @Override public AMMatrixMetaManager getMatrixMetaManager() {
    return matrixMetaManager;
  }

  /** @return the location manager */
  @Override public LocationManager getLocationManager() {
    return locationManager;
  }

  /**
   * @return {@code ANGEL_PS} when the configured running mode equals that name, otherwise
   *         {@code ANGEL_PS_WORKER}
   */
  @Override public RunningMode getRunningMode() {
    String mode = conf.get(AngelConf.ANGEL_RUNNING_MODE, AngelConf.DEFAULT_ANGEL_RUNNING_MODE);
    if (mode.equals(RunningMode.ANGEL_PS.toString())) {
      return RunningMode.ANGEL_PS;
    }
    return RunningMode.ANGEL_PS_WORKER;
  }

  /** @return the PS agent manager */
  @Override public PSAgentManager getPSAgentManager() {
    return psAgentManager;
  }

  /** @return the worker manager */
  @Override public WorkerManager getWorkerManager() {
    return workerManager;
  }

  /** @return the training data splitter */
  @Override public DataSpliter getDataSpliter() {
    return dataSpliter;
  }

  /** @return the value of {@code ml.epoch.num}, or the default task iteration number */
  @Override public int getTotalIterationNum() {
    return conf.getInt("ml.epoch.num", AngelConf.DEFAULT_ANGEL_TASK_ITERATION_NUMBER);
  }

  /** @return the task manager */
  @Override public AMTaskManager getTaskManager() {
    return taskManager;
  }

  /** @return the algorithm metrics service */
  @Override public MetricsService getAlgoMetricsService() {
    return algoMetricsService;
  }

  /** @return the configured PS replication number, or its default */
  @Override public int getPSReplicationNum() {
    return conf.getInt(AngelConf.ANGEL_PS_HA_REPLICATION_NUMBER,
        AngelConf.DEFAULT_ANGEL_PS_HA_REPLICATION_NUMBER);
  }

  /** @return the client manager */
  @Override public ClientManager getClientManager() {
    return clientManager;
  }

  /**
   * Extracts the port from the configured node manager web address ({@code host:port}).
   *
   * @return the parsed port, or 8080 when the address does not have exactly two
   *         colon-separated parts or the port part is not a valid integer
   */
  @Override public int getYarnNMWebPort() {
    final int fallbackPort = 8080;
    String nmWebAddr =
        conf.get(YarnConfiguration.NM_WEBAPP_ADDRESS, YarnConfiguration.DEFAULT_NM_WEBAPP_ADDRESS);
    String[] addrItems = nmWebAddr.split(":");
    if (addrItems.length != 2) {
      return fallbackPort;
    }
    try {
      return Integer.parseInt(addrItems[1]);
    } catch (NumberFormatException e) {
      // Only a malformed port is expected here; Errors must not be swallowed, and the cause
      // is kept in the log.
      LOG.error("can not get nm web port from " + nmWebAddr + ", just return default 8080", e);
      return fallbackPort;
    }
  }

  /** @return the model saver */
  @Override public AMModelSaver getModelSaver() {
    return modelSaver;
  }

  /** @return the model loader */
  @Override public AMModelLoader getModelLoader() {
    return modelLoader;
  }

  /** @return the configured {@code RM_AM_MAX_ATTEMPTS} value, or its YARN default */
  @Override public int getAMAttemptTime() {
    return conf
        .getInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS);
  }

  /** @return the application state storage */
  @Override public AppStateStorage getAppStateStorage() {
    return appStateStorage;
  }

  /**
   * @return {@code true} when the application should not be retried, when this attempt's id has
   *         reached the value of {@link #getAMAttemptTime()}, or when the application succeeded
   */
  @Override public boolean needClear() {
    return !angelApp.isShouldRetry() || appAttemptId.getAttemptId() >= getAMAttemptTime()
        || angelApp.isSuccess();
  }

  /**
   * @return {@code LOCAL} when the configured deploy mode equals that name, otherwise
   *         {@code YARN}
   */
  @Override public AngelDeployMode getDeployMode() {
    String mode = conf.get(AngelConf.ANGEL_DEPLOY_MODE, AngelConf.DEFAULT_ANGEL_DEPLOY_MODE);
    if (mode.equals(AngelDeployMode.LOCAL.toString())) {
      return AngelDeployMode.LOCAL;
    }
    return AngelDeployMode.YARN;
  }
}
/**
 * Removes the job's on-disk leftovers: the staging directory, when its removal is enabled in the
 * configuration, followed by the temporary output directory.
 *
 * @throws IOException declared by the staging directory cleanup
 */
public void clear() throws IOException {
  Configuration jobConf = appContext.getConf();
  if (jobConf.getBoolean(AngelConf.ANGEL_JOB_REMOVE_STAGING_DIR_ENABLE,
      AngelConf.DEFAULT_ANGEL_JOB_REMOVE_STAGING_DIR_ENABLE)) {
    cleanupStagingDir();
  }
  cleanTmpOutputDir();
}
/**
 * Recursively deletes the temporary output directory configured under
 * {@code AngelConf.ANGEL_JOB_TMP_OUTPUT_PATH}. Does nothing when the path is not configured.
 * I/O failures are logged and not propagated.
 */
private void cleanTmpOutputDir() {
  Configuration conf = appContext.getConf();
  String tmpOutDir = conf.get(AngelConf.ANGEL_JOB_TMP_OUTPUT_PATH);
  if (tmpOutDir == null) {
    return;
  }
  try {
    LOG.info("Deleting tmp output directory " + tmpOutDir);
    Path tmpOutPath = new Path(tmpOutDir);
    FileSystem fs = tmpOutPath.getFileSystem(conf);
    fs.delete(tmpOutPath, true);
  } catch (IOException io) {
    // The message names the tmp output dir; it used to say "staging dir", which pointed
    // operators at the wrong directory.
    LOG.error("Failed to cleanup tmp output dir " + tmpOutDir, io);
  }
}
/**
 * Recursively deletes the job staging directory configured under
 * {@code AngelConf.ANGEL_JOB_DIR}. A missing setting is logged as a warning; I/O failures during
 * the deletion are logged as errors and not rethrown.
 *
 * @throws IOException kept in the signature for callers; failures are handled internally
 */
private void cleanupStagingDir() throws IOException {
  final Configuration jobConf = appContext.getConf();
  final String stagingLocation = jobConf.get(AngelConf.ANGEL_JOB_DIR);

  if (stagingLocation != null) {
    try {
      Path target = new Path(stagingLocation);
      // Resolve the file system first so that nothing is logged when resolution fails.
      FileSystem targetFs = target.getFileSystem(jobConf);
      String announcement =
          "Deleting staging directory " + FileSystem.getDefaultUri(jobConf) + " " + stagingLocation;
      LOG.info(announcement);
      targetFs.delete(target, true);
    } catch (IOException failure) {
      LOG.error("Failed to cleanup staging dir " + stagingLocation, failure);
    }
  } else {
    LOG.warn("App Staging directory is null");
  }
}
/**
 * Stops the services managed by the parent composite service and then the services started
 * through {@link AngelServiceLoader}.
 *
 * @throws Exception if stopping the parent's services or the loaded services fails
 */
@Override public void serviceStop() throws Exception {
  try {
    super.serviceStop();
  } finally {
    // Stop the loaded services even when stopping the parent's services throws, so that they
    // are not left running.
    AngelServiceLoader.stopService();
  }
}
/**
 * Stops all services of the angel application master, writes the application state and clears
 * the tmp and staging directories when the context reports that clearing is needed, waits for
 * the client to read the state, stops the RPC server and finally exits.
 *
 * <p>The sequence runs under {@code lock}. If {@code isCleared} is already set, the method
 * returns immediately without exiting. Any failure during the sequence is logged and does not
 * prevent the exit.
 */
public void shutDownJob() {
  // Acquire the lock before entering the try block: if acquisition fails, the finally clause
  // must not call unlock() on a lock this thread does not hold.
  lock.lock();
  boolean interrupted = false;
  try {
    if (isCleared) {
      return;
    }

    // stop all services
    LOG.info("Calling stop for all the services");
    stop();

    // 1.write application state to file so that the client can get the state of the application
    // if master exit
    // 2.clear tmp and staging directory
    if (appContext.needClear()) {
      LOG.info("start to write app state to file and clear tmp directory");
      writeAppState();
      clear();
    }

    // waiting for client to get application state
    try {
      Thread.sleep(10000);
    } catch (InterruptedException e) {
      LOG.warn("ShutDownjob error ", e);
      // Remember the interrupt; it is restored once the shutdown steps are finished.
      interrupted = true;
    }

    // stop the RPC server; it may not have been created if the master is stopped early
    MasterService rpcService = masterService;
    if (rpcService != null) {
      rpcService.stop();
    }
  } catch (Throwable t) {
    LOG.warn("Graceful stop failed ", t);
  } finally {
    isCleared = true;
    lock.unlock();
    if (interrupted) {
      Thread.currentThread().interrupt();
    }
  }

  LOG.info("Exiting Angel AppMaster..GoodBye!");
  exit(0);
}
/**
 * Terminates the JVM with the given status code when the deploy mode is YARN; does nothing for
 * any other deploy mode.
 *
 * @param code process exit status
 */
private void exit(int code) {
  if (AngelDeployMode.YARN != appContext.getDeployMode()) {
    return;
  }
  System.exit(code);
}
/**
 * Serializes the application state to the file configured under
 * {@code AngelConf.ANGEL_APP_SERILIZE_STATE_FILE}, replacing any existing file. When the path is
 * not configured, an error is logged and nothing is written.
 *
 * @throws IllegalArgumentException declared for callers; may come from building the path
 * @throws IOException if the file system cannot be accessed or the state cannot be written
 */
private void writeAppState() throws IllegalArgumentException, IOException {
  Configuration jobConf = appContext.getConf();
  String interalStatePath = jobConf.get(AngelConf.ANGEL_APP_SERILIZE_STATE_FILE);
  // Check before logging the target, so that a missing setting is not reported as "file null".
  if (interalStatePath == null) {
    LOG.error("can not find app state serilize file, exit");
    return;
  }
  LOG.info("start to write app state to file " + interalStatePath);

  Path stateFilePath = new Path(interalStatePath);
  FileSystem fs = stateFilePath.getFileSystem(jobConf);
  if (fs.exists(stateFilePath)) {
    fs.delete(stateFilePath, false);
  }

  // try-with-resources closes the stream even when serialization or flushing fails.
  try (FSDataOutputStream out = fs.create(stateFilePath)) {
    appContext.getApp().serilize(out);
    out.flush();
  }
  LOG.info("write app state over");
}
@SuppressWarnings("resource") public static void main(String[] args) {
AngelAppMasterShutdownHook hook = null;
try {
Thread.setDefaultUncaughtExceptionHandler(new YarnUncaughtExceptionHandler());
String containerIdStr = System.getenv(Environment.CONTAINER_ID.name());
String nodeHostString = System.getenv(Environment.NM_HOST.name());
String nodePortString = System.getenv(Environment.NM_PORT.name());
String nodeHttpPortString = System.getenv(Environment.NM_HTTP_PORT.name());
String appSubmitTimeStr = System.getenv(ApplicationConstants.APP_SUBMIT_TIME_ENV);
String maxAppAttempts = System.getenv(ApplicationConstants.MAX_APP_ATTEMPTS_ENV);
validateInputParam(containerIdStr, Environment.CONTAINER_ID.name());
validateInputParam(nodeHostString, Environment.NM_HOST.name());
validateInputParam(nodePortString, Environment.NM_PORT.name());
validateInputParam(nodeHttpPortString, Environment.NM_HTTP_PORT.name());
validateInputParam(appSubmitTimeStr, ApplicationConstants.APP_SUBMIT_TIME_ENV);
validateInputParam(maxAppAttempts, ApplicationConstants.MAX_APP_ATTEMPTS_ENV);
ContainerId containerId = ConverterUtils.toContainerId(containerIdStr);
ApplicationAttemptId applicationAttemptId = containerId.getApplicationAttemptId();
long appSubmitTime = Long.parseLong(appSubmitTimeStr);
Configuration conf = new Configuration();
conf.addResource(AngelConf.ANGEL_JOB_CONF_FILE);
String jobUserName = System.getenv(ApplicationConstants.Environment.USER.name());
conf.set(AngelConf.USER_NAME, jobUserName);
conf.setBoolean("fs.automatic.close", false);
UserGroupInformation.setConfiguration(conf);
// Security framework already loaded the tokens into current UGI, just use
// them
Credentials credentials = UserGroupInformation.getCurrentUser().getCredentials();
LOG.info("Executing with tokens:");
for (Token> token : credentials.getAllTokens()) {
LOG.info(token);
}
UserGroupInformation appMasterUgi = UserGroupInformation.createRemoteUser(jobUserName);
appMasterUgi.addCredentials(credentials);
// Now remove the AM->RM token so tasks don't have it
Iterator> iter = credentials.getAllTokens().iterator();
while (iter.hasNext()) {
Token> token = iter.next();
if (token.getKind().equals(AMRMTokenIdentifier.KIND_NAME)) {
iter.remove();
}
}
String appName = conf.get(AngelConf.ANGEL_JOB_NAME);
LOG.info("app name=" + appName);
LOG.info("app attempt id=" + applicationAttemptId);
final AngelApplicationMaster appMaster =
new AngelApplicationMaster(conf, appName, applicationAttemptId, containerId, nodeHostString,
Integer.parseInt(nodePortString), Integer.parseInt(nodeHttpPortString), appSubmitTime,
credentials);
// add a shutdown hook
hook = new AngelAppMasterShutdownHook(appMaster);
ShutdownHookManager.get().addShutdownHook(hook, SHUTDOWN_HOOK_PRIORITY);
appMasterUgi.doAs(new PrivilegedExceptionAction
© 2015 - 2025 Weber Informatics LLC | Privacy Policy