// org.apache.zeppelin.submarine.job.SubmarineJob Maven / Gradle / Ivy
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.zeppelin.submarine.job;
import com.google.common.annotations.VisibleForTesting;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.fs.Path;
import org.apache.zeppelin.interpreter.InterpreterContext;
import org.apache.zeppelin.submarine.hadoop.HdfsClient;
import org.apache.zeppelin.submarine.job.thread.JobRunThread;
import org.apache.zeppelin.submarine.commons.SubmarineCommand;
import org.apache.zeppelin.submarine.commons.SubmarineConstants;
import org.apache.zeppelin.submarine.commons.SubmarineUI;
import org.apache.zeppelin.submarine.commons.SubmarineUtils;
import org.apache.zeppelin.submarine.job.thread.TensorboardRunThread;
import org.apache.zeppelin.submarine.hadoop.FinalApplicationStatus;
import org.apache.zeppelin.submarine.hadoop.YarnApplicationState;
import org.apache.zeppelin.submarine.hadoop.YarnClient;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.concurrent.atomic.AtomicBoolean;
import static org.apache.zeppelin.submarine.commons.SubmarineConstants.JOB_STATUS;
import static org.apache.zeppelin.submarine.commons.SubmarineConstants.TENSORBOARD_URL;
import static org.apache.zeppelin.submarine.commons.SubmarineConstants.TF_TENSORBOARD_ENABLE;
import static org.apache.zeppelin.submarine.commons.SubmarineConstants.YARN_APPLICATION_FINAL_STATUS;
import static org.apache.zeppelin.submarine.commons.SubmarineConstants.YARN_APPLICATION_ID;
import static org.apache.zeppelin.submarine.commons.SubmarineConstants.YARN_APPLICATION_NAME;
import static org.apache.zeppelin.submarine.commons.SubmarineConstants.YARN_APPLICATION_STATUS;
import static org.apache.zeppelin.submarine.commons.SubmarineConstants.YARN_APPLICATION_URL;
import static org.apache.zeppelin.submarine.commons.SubmarineConstants.YARN_APP_ELAPSED_TIME;
import static org.apache.zeppelin.submarine.commons.SubmarineConstants.YARN_APP_FINAL_STATUS_NAME;
import static org.apache.zeppelin.submarine.commons.SubmarineConstants.YARN_APP_FINISHED_TIME;
import static org.apache.zeppelin.submarine.commons.SubmarineConstants.YARN_APP_LAUNCHTIME_NAME;
import static org.apache.zeppelin.submarine.commons.SubmarineConstants.YARN_APP_LAUNCH_TIME;
import static org.apache.zeppelin.submarine.commons.SubmarineConstants.YARN_APP_STARTEDTIME_NAME;
import static org.apache.zeppelin.submarine.commons.SubmarineConstants.YARN_APP_STARTED_TIME;
import static org.apache.zeppelin.submarine.commons.SubmarineConstants.YARN_APP_STATE_NAME;
import static org.apache.zeppelin.submarine.commons.SubmarineConstants.YARN_TENSORBOARD_URL;
import static org.apache.zeppelin.submarine.commons.SubmarineConstants.YARN_WEB_HTTP_ADDRESS;
import static org.apache.zeppelin.submarine.job.SubmarineJobStatus.EXECUTE_SUBMARINE;
public class SubmarineJob extends Thread {
// SLF4J logger for this class.
private Logger LOGGER = LoggerFactory.getLogger(SubmarineJob.class);
// Loop flag for run(); starts as true and is set to false by stopRunning().
private AtomicBoolean running = new AtomicBoolean(true);
// Value passed to Thread.sleep() between two synchronization cycles in run() (milliseconds).
private static final long SYNC_SUBMARINE_RUNTIME_CYCLE = 3000;
// Created in the constructor from the interpreter properties; used by deleteJob().
private YarnClient yarnClient = null;
// Created in the constructor from the interpreter context; all UI/log output goes through it.
private SubmarineUI submarineUI = null;
// Interpreter properties handed to the constructor.
private Properties properties = null;
// Created in the constructor from the interpreter properties; used to delete the checkpoint directory.
private HdfsClient hdfsClient = null;
// Only assigned through setPythonWorkDir(); null until then.
private File pythonWorkDir = null;
// Note id, note name and user name are copied from the InterpreterContext in the constructor.
private String noteId = null;
private String noteName = null;
private String userName = null;
// NOTE(review): the following YARN state fields are not written in the visible part of the
// file; presumably they are maintained by updateJobStateByYarn() - confirm.
private String applicationId = null;
private YarnApplicationState yarnApplicationState = null;
private FinalApplicationStatus finalApplicationStatus = null;
private long startTime = 0;
private long launchTime = 0;
private long finishTime = 0;
private float progress = 0; // [0 ~ 100]
// Initial job status is EXECUTE_SUBMARINE.
private SubmarineJobStatus currentJobStatus = EXECUTE_SUBMARINE;
// Interpreter context handed to the constructor.
private InterpreterContext intpContext = null;
// Worker threads (package-private). Created on demand by runJob() / runTensorBoard()
// and asked to stop by stopRunning().
JobRunThread jobRunThread = null;
TensorboardRunThread tensorboardRunThread = null;
// NOTE(review): looks like a property key for the user's shell working directory; it is not
// referenced in the visible part of the file - confirm against callers.
public static final String DIRECTORY_USER_HOME = "shell.working.directory.userName.home";
// True when the "os.name" system property starts with "Windows".
private static final boolean isWindows = System.getProperty("os.name").startsWith("Windows");
// Shell prefix: "cmd /c" on Windows, "bash -c" everywhere else.
public static final String shell = isWindows ? "cmd /c" : "bash -c";
// NOTE(review): presumably the property key for the command timeout and its default value
// in milliseconds (kept as a String); neither is referenced in the visible part - confirm.
public static final String TIMEOUT_PROPERTY = "submarine.command.timeout.millisecond";
public static final String defaultTimeout = "100000";
// Paths of the Jinja templates for the job-run, generic command and tensorboard commands.
// NOTE(review): presumably resolved as classpath resources by the run threads - confirm.
public static final String SUBMARINE_JOBRUN_TF_JINJA
= "jinja_templates/submarine-job-run-tf.jinja";
public static final String SUBMARINE_COMMAND_JINJA
= "jinja_templates/submarine-command.jinja";
public static final String SUBMARINE_TENSORBOARD_JINJA
= "jinja_templates/submarine-tensorboard.jinja";
/**
 * Creates the job bound to the note and user of the given interpreter context,
 * builds the YARN client, HDFS client and UI helper, and then starts this thread.
 *
 * <p>Note that the thread is started from inside the constructor, so {@link #run()}
 * begins polling as soon as construction reaches its last statement.
 *
 * @param context    interpreter context supplying note id, note name and user
 * @param properties interpreter properties used to configure the YARN and HDFS clients
 */
public SubmarineJob(InterpreterContext context, Properties properties) {
  this.properties = properties;
  intpContext = context;

  // Identity of the note / user this job belongs to.
  noteId = context.getNoteId();
  noteName = context.getNoteName();
  userName = context.getAuthenticationInfo().getUser();

  // Collaborators.
  yarnClient = new YarnClient(properties);
  hdfsClient = new HdfsClient(properties);
  submarineUI = new SubmarineUI(context);

  // Kick off the state synchronization loop.
  start();
}
/**
 * Synchronizes the submarine runtime state in a loop.
 *
 * <p>Each cycle computes the job name from the user name and note id, calls
 * {@code updateJobStateByYarn(jobName)} and {@code getTensorboardStatus()}, and then
 * sleeps for {@code SYNC_SUBMARINE_RUNTIME_CYCLE} milliseconds. The loop ends when
 * {@link #stopRunning()} clears the running flag, or when this thread is interrupted
 * while sleeping.
 */
@Override
public void run() {
  while (running.get()) {
    String jobName = SubmarineUtils.getJobName(userName, noteId);
    updateJobStateByYarn(jobName);
    getTensorboardStatus();
    try {
      Thread.sleep(SYNC_SUBMARINE_RUNTIME_CYCLE);
    } catch (InterruptedException e) {
      LOGGER.error(e.getMessage(), e);
      // Thread.sleep() clears the interrupt status when it throws. Restore it so
      // the request stays visible to whoever inspects this thread, and leave the
      // loop instead of ignoring the interruption and polling forever.
      Thread.currentThread().interrupt();
      return;
    }
  }
}
/**
 * Returns the current value of the flag that keeps {@link #run()} looping.
 *
 * @return {@code true} until {@link #stopRunning()} has been called
 */
@VisibleForTesting
public boolean getRunning() {
  return this.running.get();
}
/**
 * Stops this SubmarineJob: clears the running flag first, then asks the job-run
 * thread and the tensorboard thread to stop, each only if it exists and is alive.
 */
public void stopRunning() {
  running.set(false);

  // Job-run worker.
  final boolean jobThreadAlive = jobRunThread != null && jobRunThread.isAlive();
  if (jobThreadAlive) {
    jobRunThread.stopRunning();
  }

  // Tensorboard worker.
  final boolean tensorboardThreadAlive =
      tensorboardRunThread != null && tensorboardRunThread.isAlive();
  if (tensorboardThreadAlive) {
    tensorboardRunThread.stopRunning();
  }
}
/**
 * Reads the {@code SubmarineConstants.TF_CHECKPOINT_PATH} property.
 *
 * @return the configured value, or an empty string when the property is not set
 */
public String getUserTensorboardPath() {
  return properties.getProperty(SubmarineConstants.TF_CHECKPOINT_PATH, "");
}
/**
 * Builds the checkpoint path of this job: the value of
 * {@link #getUserTensorboardPath()} followed by {@code "/"} and the note id.
 *
 * @return the per-note checkpoint path
 */
public String getJobDefaultCheckpointPath() {
  return getUserTensorboardPath() + "/" + noteId;
}
/**
 * Deletes this job's checkpoint directory through the HDFS client.
 *
 * <p>Paths with a depth of 3 or less are refused and an ERROR line is written to
 * the UI log instead. An {@link IOException} raised while deleting is logged and
 * not propagated.
 */
public void cleanJobDefaultCheckpointPath() {
  final String checkpointDir = getJobDefaultCheckpointPath();
  final Path checkpointPath = new Path(checkpointDir);

  if (checkpointPath.depth() > 3) {
    try {
      submarineUI.outputLog("", "Clean up the checkpoint directory: " + checkpointDir);
      hdfsClient.delete(checkpointPath);
    } catch (IOException e) {
      LOGGER.error(e.getMessage(), e);
    }
  } else {
    // Safety net: never delete a directory this close to the filesystem root.
    submarineUI.outputLog("ERROR", "Checkpoint path depth must be greater than 3");
  }
}
/**
 * Returns the interpreter properties this job was created with.
 *
 * @return the same {@link Properties} instance that was passed to the constructor
 */
public Properties getProperties() {
  return this.properties;
}
/**
 * Returns the HDFS client created in the constructor.
 *
 * @return the HDFS client of this job
 */
public HdfsClient getHdfsClient() {
  return this.hdfsClient;
}
/**
 * Returns the UI helper created in the constructor.
 *
 * @return the UI helper of this job
 */
public SubmarineUI getSubmarineUI() {
  return this.submarineUI;
}
/**
 * Stores the python working directory of this job.
 *
 * @param pythonWorkDir directory later returned by {@code getPythonWorkDir()}
 */
public void setPythonWorkDir(File pythonWorkDir) {
this.pythonWorkDir = pythonWorkDir;
}
/**
 * Returns the python working directory of this job.
 *
 * @return the directory stored by {@code setPythonWorkDir(File)}, or {@code null}
 *         if none has been set
 */
public File getPythonWorkDir() {
  return pythonWorkDir;
}
/**
 * Asks the UI helper to create the UI for the {@code DASHBOARD} command.
 */
public void onDashboard() {
  final SubmarineCommand command = SubmarineCommand.DASHBOARD;
  submarineUI.createSubmarineUI(command);
}
/**
 * Creates the JOB_RUN UI and log header, then starts a new {@code JobRunThread}
 * unless the job already exists.
 *
 * <p>The job is considered existing when {@code getJobStateByYarn(jobName)} returns
 * a non-empty map ("already running"), or when a previously created job-run thread
 * is still alive ("being start up"). In both cases only an INFO line is written to
 * the UI log and no new thread is started.
 */
public void runJob() {
  // Need to display the UI when the page is reloaded, don't create it in the thread
  submarineUI.createSubmarineUI(SubmarineCommand.JOB_RUN);
  submarineUI.createLogHeadUI();

  // Check if job already exists
  String jobName = SubmarineUtils.getJobName(userName, noteId);
  // Only emptiness is inspected, so an unbounded wildcard replaces the raw type.
  Map<?, ?> mapAppStatus = getJobStateByYarn(jobName);
  if (!mapAppStatus.isEmpty()) {
    submarineUI.outputLog("INFO", "JOB " + jobName + " already running.");
    return;
  }

  if (null != jobRunThread && jobRunThread.isAlive()) {
    // A launch is already in progress; do not start a second one.
    submarineUI.outputLog("INFO", "JOB " + jobName + " being start up.");
    return;
  }

  jobRunThread = new JobRunThread(this);
  jobRunThread.start();
}
/**
 * Creates the UI for the {@code JOB_STOP} command and then asks the YARN client
 * to delete the given service.
 *
 * @param serviceName name of the YARN service to delete
 */
public void deleteJob(String serviceName) {
  final SubmarineCommand command = SubmarineCommand.JOB_STOP;
  submarineUI.createSubmarineUI(command);
  yarnClient.deleteService(serviceName);
}
/**
 * Creates the TENSORBOARD_RUN UI and log header, then starts a new
 * {@code TensorboardRunThread} unless tensorboard already exists.
 *
 * <p>Tensorboard is considered existing when {@code getJobStateByYarn(tensorboardName)}
 * returns a non-empty map ("already running"), or when a previously created
 * tensorboard thread is still alive ("being start up"). In both cases only an INFO
 * line is written to the UI log and no new thread is started.
 */
public void runTensorBoard() {
  submarineUI.createSubmarineUI(SubmarineCommand.TENSORBOARD_RUN);
  submarineUI.createLogHeadUI();

  String tensorboardName = SubmarineUtils.getTensorboardName(userName);
  // Only emptiness is inspected, so an unbounded wildcard replaces the raw type.
  Map<?, ?> mapAppStatus = getJobStateByYarn(tensorboardName);
  if (!mapAppStatus.isEmpty()) {
    submarineUI.outputLog("INFO", "Tensorboard already running.");
    return;
  }

  if (null != tensorboardRunThread && tensorboardRunThread.isAlive()) {
    // A launch is already in progress; do not start a second one.
    submarineUI.outputLog("INFO", "Tensorboard being start up.");
    return;
  }

  tensorboardRunThread = new TensorboardRunThread(this);
  tensorboardRunThread.start();
}
// Check if tensorboard already exists
public boolean getTensorboardStatus() {
String enableTensorboard = properties.getProperty(TF_TENSORBOARD_ENABLE, "false");
boolean tensorboardExist = false;
if (StringUtils.equals(enableTensorboard, "true")) {
String tensorboardName = SubmarineUtils.getTensorboardName(userName);
// create tensorboard link of YARN
Map mapAppStatus = getJobStateByYarn(tensorboardName);
String appId = "";
if (mapAppStatus.containsKey(YARN_APPLICATION_ID)) {
appId = mapAppStatus.get(YARN_APPLICATION_ID).toString();
StringBuffer sbUrl = new StringBuffer();
String yarnBaseUrl = properties.getProperty(YARN_WEB_HTTP_ADDRESS, "");
sbUrl.append(yarnBaseUrl).append("/ui2/#/yarn-app/").append(appId);
sbUrl.append("/components?service=").append(tensorboardName);
SubmarineUtils.setAgulObjValue(intpContext, YARN_TENSORBOARD_URL, sbUrl.toString());
// Detection tensorboard Container export port
List
// © 2015 - 2024 Weber Informatics LLC | Privacy Policy