// ![JAR search and dependency download from the Maven repository](/logo.png)
// azkaban.jobExecutor.ProcessJob Maven / Gradle / Ivy
/*
* Copyright 2017 LinkedIn Corp.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package azkaban.jobExecutor;
import static azkaban.Constants.ConfigurationKeys.AZKABAN_SERVER_GROUP_NAME;
import static azkaban.Constants.ConfigurationKeys.AZKABAN_SERVER_NATIVE_LIB_FOLDER;
import static azkaban.ServiceProvider.SERVICE_PROVIDER;
import azkaban.Constants;
import azkaban.Constants.JobProperties;
import azkaban.flow.CommonJobProperties;
import azkaban.jobExecutor.utils.process.AzkabanProcess;
import azkaban.jobExecutor.utils.process.AzkabanProcessBuilder;
import azkaban.metrics.CommonMetrics;
import azkaban.utils.ExecuteAsUser;
import azkaban.utils.Pair;
import azkaban.utils.Props;
import azkaban.utils.SystemMemoryInfo;
import com.google.common.annotations.VisibleForTesting;
import java.io.File;
import java.io.IOException;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import org.apache.log4j.Logger;
/**
* A job that runs a simple unix command
*/
public class ProcessJob extends AbstractProcessJob {
// Job property holding the first command to run; further commands are read from
// "command.1", "command.2", ... until a key is missing (see getCommandList).
public static final String COMMAND = "command";
// Job property that switches the pre-launch memory check on or off for this job
// (run() reads it with a default of true).
public static final String AZKABAN_MEMORY_CHECK = "azkaban.memory.check";
// Use azkaban.Constants.ConfigurationKeys.AZKABAN_SERVER_NATIVE_LIB_FOLDER instead
@Deprecated
public static final String NATIVE_LIB_FOLDER = "azkaban.native.lib";
// Server property deciding whether commands are launched through the execute-as-user
// binary (run() reads it with a default of true).
public static final String EXECUTE_AS_USER = "execute.as.user";
// Name of the environment variable that run() sets to a per-execution ticket cache path.
public static final String KRB5CCNAME = "KRB5CCNAME";
// Timeout handed to softKill in cancel() before it falls back to hardKill.
private static final Duration KILL_TIME = Duration.ofSeconds(30);
// Server property that switches the memory check on or off (run() reads it with a
// default of true); both this and AZKABAN_MEMORY_CHECK must be true for the check to run.
private static final String MEMCHECK_ENABLED = "memCheck.enabled";
// Command run as "root" through ExecuteAsUser to change file ownership.
private static final String CHOWN = "chown";
// Command run as the effective user to probe write access to the working directory.
private static final String CREATE_FILE = "touch";
// Exit code that the write-access probe treats as success.
private static final int SUCCESSFUL_EXECUTION = 0;
// Name of the probe file created in the working directory by the write-access check.
private static final String TEMP_FILE_NAME = "user_can_write";
// Metrics object; this class uses it to count jobs waiting for memory.
private final CommonMetrics commonMetrics;
// Process of the command most recently built by run(); null until the first one is built.
private volatile AzkabanProcess process;
// Set by cancel(); run() checks it after each memory-check wait and before building a process.
private volatile boolean killed = false;
// For testing only. True if the job process exits successfully.
private volatile boolean success;
/**
 * Creates a process job and looks up the {@link CommonMetrics} instance it reports to.
 *
 * @param jobId id of the job, passed through to the superclass
 * @param sysProps server-level properties, passed through to the superclass
 * @param jobProps job-level properties, passed through to the superclass
 * @param log logger, passed through to the superclass
 */
public ProcessJob(
    final String jobId,
    final Props sysProps,
    final Props jobProps,
    final Logger log) {
  super(jobId, sysProps, jobProps, log);

  // TODO: reallocf fully guicify CommonMetrics through ProcessJob dependents
  this.commonMetrics = SERVICE_PROVIDER.getInstance(CommonMetrics.class);
}
/**
 * Splits the command into a unix like command line structure. Quotes and single quotes are
 * treated as nested strings.
 *
 * Rules applied while scanning the command left to right:
 * - A space outside of any quoting ends the current argument; runs of spaces produce no
 *   empty arguments.
 * - A double quote toggles double-quote mode unless it appears inside single quotes, and a
 *   single quote toggles single-quote mode unless it appears inside double quotes. The
 *   toggling quote characters themselves are dropped; a quote character nested inside the
 *   other kind of quoting is kept literally.
 * - An argument that ends up empty (for example {@code ""}) is not emitted.
 * - An unterminated quote simply extends the last argument to the end of the command.
 * - There is no backslash escaping.
 *
 * @param command the command line to split; must not be null
 * @return the arguments in order of appearance, possibly an empty array
 * @throws NullPointerException if {@code command} is null
 */
public static String[] partitionCommandLine(final String command) {
  final List<String> arguments = new ArrayList<>();
  // One builder is reused for every argument; it is cleared after each argument is emitted.
  final StringBuilder current = new StringBuilder(command.length());
  boolean inSingleQuotes = false;
  boolean inDoubleQuotes = false;

  for (int i = 0; i < command.length(); i++) {
    final char c = command.charAt(i);
    switch (c) {
      case ' ':
        if (inDoubleQuotes || inSingleQuotes) {
          // Spaces inside quotes belong to the argument.
          current.append(c);
        } else if (current.length() > 0) {
          arguments.add(current.toString());
          current.setLength(0);
        }
        break;
      case '\'':
        if (inDoubleQuotes) {
          current.append(c);
        } else {
          inSingleQuotes = !inSingleQuotes;
        }
        break;
      case '"':
        if (inSingleQuotes) {
          current.append(c);
        } else {
          inDoubleQuotes = !inDoubleQuotes;
        }
        break;
      default:
        current.append(c);
        break;
    }
  }

  // Flush the trailing argument, which has no terminating space.
  if (current.length() > 0) {
    arguments.add(current.toString());
  }
  return arguments.toArray(new String[0]);
}
/**
 * Runs every command of this job, one after another, each as its own process.
 *
 * Steps, in order:
 * 1. Resolve the job properties.
 * 2. If the memory check is enabled in both the server and the job properties, ask
 *    {@link SystemMemoryInfo} whether the requested maximum memory can be granted, waiting
 *    and retrying up to {@code Constants.MEMORY_CHECK_RETRY_LIMIT} attempts. The wait is
 *    woken early by {@link #cancel()}; a cancelled job returns without running anything.
 * 3. Collect the commands, create the property files and set the KRB5CCNAME environment
 *    variable.
 * 4. When execute-as-user is enabled, refuse black-listed users and hand ownership of the
 *    working directory (if the user cannot write to it) and of the property files to the
 *    effective user.
 * 5. Build and run one process per command. If a process fails, the property files are
 *    deleted and the failure is rethrown wrapped in a RuntimeException.
 * 6. Load the output properties from the second property file.
 *
 * @throws Exception if the properties cannot be resolved, the memory cannot be granted
 *     within the retry limit, the command list cannot be built, ownership cannot be changed,
 *     or a command fails
 */
@Override
public void run() throws Exception {
  try {
    resolveProps();
  } catch (final Exception e) {
    handleError("Bad property definition! " + e.getMessage(), e);
  }

  if (this.sysProps.getBoolean(MEMCHECK_ENABLED, true)
      && this.jobProps.getBoolean(AZKABAN_MEMORY_CHECK, true)) {
    final Pair<Long, Long> memPair = getProcMemoryRequirement();
    final long xms = memPair.getFirst();
    final long xmx = memPair.getSecond();
    final String oomMsg = String
        .format("Cannot request memory (Xms %d kb, Xmx %d kb) from system for job %s",
            xms, xmx, getId());
    boolean isMemGranted = true;
    // True once this job has been added to the OOM wait count, so that the count is
    // decremented exactly once and only if it was incremented.
    boolean countedAsWaiting = false;
    //todo HappyRay: move to proper Guice after this class is refactored.
    final SystemMemoryInfo memInfo = SERVICE_PROVIDER.getInstance(SystemMemoryInfo.class);
    for (int attempt = 1; attempt <= Constants.MEMORY_CHECK_RETRY_LIMIT; attempt++) {
      isMemGranted = memInfo.canSystemGrantMemory(xmx);
      if (isMemGranted) {
        info(String.format("Memory granted for job %s", getId()));
        if (countedAsWaiting) {
          this.commonMetrics.decrementOOMJobWaitCount();
        }
        break;
      }
      if (attempt < Constants.MEMORY_CHECK_RETRY_LIMIT) {
        // oomMsg is already formatted (it contains the job id), so it is concatenated
        // rather than being reused as part of a format string.
        info(oomMsg + String.format(", sleep for %s secs and retry, attempt %s of %s",
            TimeUnit.MILLISECONDS.toSeconds(Constants.MEMORY_CHECK_INTERVAL_MS), attempt,
            Constants.MEMORY_CHECK_RETRY_LIMIT));
        if (!countedAsWaiting) {
          this.commonMetrics.incrementOOMJobWaitCount();
          countedAsWaiting = true;
        }
        synchronized (this) {
          try {
            // cancel() calls notify() on this object to end the wait early.
            this.wait(Constants.MEMORY_CHECK_INTERVAL_MS);
          } catch (final InterruptedException e) {
            info(String
                .format("Job %s interrupted while waiting for memory check retry", getId()));
          }
        }
        if (this.killed) {
          this.commonMetrics.decrementOOMJobWaitCount();
          info(String.format("Job %s was killed while waiting for memory check retry", getId()));
          return;
        }
      }
    }
    if (!isMemGranted) {
      if (countedAsWaiting) {
        this.commonMetrics.decrementOOMJobWaitCount();
      }
      handleError(oomMsg, null);
    }
  }

  List<String> commands = null;
  try {
    commands = getCommandList();
  } catch (final Exception e) {
    handleError("Job set up failed " + e.getCause(), e);
  }

  final long startMs = System.currentTimeMillis();

  if (commands == null) {
    handleError("There are no commands to execute", null);
  }

  info(commands.size() + " commands to execute.");
  final File[] propFiles = initPropsFiles();

  // change krb5ccname env var so that each job execution gets its own cache
  final Map<String, String> envVars = getEnvironmentVariables();
  envVars.put(KRB5CCNAME, getKrb5ccname(this.jobProps));

  // determine whether to run as Azkaban or run as effectiveUser,
  // by default, run as effectiveUser
  String executeAsUserBinaryPath = null;
  String effectiveUser = null;
  final boolean isExecuteAsUser = this.sysProps.getBoolean(EXECUTE_AS_USER, true);

  //Get list of users we never execute flows as. (ie: root, azkaban)
  // Entries are trimmed so that a value such as "root, azkaban" still blocks "azkaban".
  final Set<String> blackListedUsers = new HashSet<>();
  for (final String user : this.sysProps
      .getString(Constants.ConfigurationKeys.BLACK_LISTED_USERS, "root,azkaban")
      .split(",")) {
    blackListedUsers.add(user.trim());
  }

  // nativeLibFolder specifies the path for execute-as-user file,
  // which will change user from Azkaban to effectiveUser
  if (isExecuteAsUser) {
    final String nativeLibFolder = this.sysProps.getString(AZKABAN_SERVER_NATIVE_LIB_FOLDER);
    executeAsUserBinaryPath = String.format("%s/%s", nativeLibFolder, "execute-as-user");
    effectiveUser = getEffectiveUser(this.jobProps);
    // Throw exception if Azkaban tries to run flow as a prohibited user
    if (blackListedUsers.contains(effectiveUser)) {
      throw new RuntimeException(
          String.format("Not permitted to proxy as '%s' through Azkaban", effectiveUser)
      );
    }
    // Set parent directory permissions to <uid>:azkaban so user can write in their execution
    // directory if the directory is not permissioned correctly already (should happen once
    // per execution)
    if (!canWriteInCurrentWorkingDirectory(effectiveUser)) {
      info("Changing current working directory ownership");
      assignUserFileOwnership(effectiveUser, getWorkingDirectory());
    }
    // Set property file permissions to <uid>:azkaban so user can write to their prop files
    // in order to pass properties from one job to another
    for (final File propFile : propFiles) {
      info("Changing properties files ownership");
      assignUserFileOwnership(effectiveUser, propFile.getAbsolutePath());
    }
  }

  for (final String command : commands) {
    final AzkabanProcessBuilder builder;
    if (isExecuteAsUser) {
      // Prefix the command with the execute-as-user binary and the user to switch to.
      final String wrappedCommand =
          String.format("%s %s %s", executeAsUserBinaryPath, effectiveUser, command);
      info("Command: " + wrappedCommand);
      builder =
          new AzkabanProcessBuilder(partitionCommandLine(wrappedCommand))
              .setEnv(envVars).setWorkingDir(getCwd()).setLogger(getLog())
              .enableExecuteAsUser().setExecuteAsUserBinaryPath(executeAsUserBinaryPath)
              .setEffectiveUser(effectiveUser);
    } else {
      info("Command: " + command);
      builder =
          new AzkabanProcessBuilder(partitionCommandLine(command))
              .setEnv(envVars).setWorkingDir(getCwd()).setLogger(getLog());
    }

    if (builder.getEnv().size() > 0) {
      info("Environment variables: " + builder.getEnv());
    }
    info("Working directory: " + builder.getWorkingDir());

    // print out the Job properties to the job log.
    this.logJobProperties();

    synchronized (this) {
      // Make sure that checking if the process job is killed and creating an AzkabanProcess
      // object are atomic. The cancel method relies on this to make sure that if this.process is
      // not null, this block of code which includes checking if the job is killed has not been
      // executed yet.
      if (this.killed) {
        info("The job is killed. Abort. No job process created.");
        return;
      }
      this.process = builder.build();
    }

    // Reset per command: otherwise a failure of a later command would still be reported as
    // a success because an earlier command already set the flag.
    this.success = false;
    try {
      this.process.run();
      this.success = true;
    } catch (final Throwable e) {
      for (final File file : propFiles) {
        if (file != null && file.exists()) {
          file.delete();
        }
      }
      throw new RuntimeException(e);
    } finally {
      info("Process completed "
          + (this.success ? "successfully" : "unsuccessfully") + " in "
          + ((System.currentTimeMillis() - startMs) / 1000) + " seconds.");
    }
  }

  // Get the output properties from this job.
  generateProperties(propFiles[1]);
}
/**
 * Builds the kerberos ticket cache file name for this job execution from the job properties.
 *
 * The name combines project name, flow id, job id, execution id and effective user, so each
 * job execution gets its own ticket cache file. Because the caller only exports the name as
 * an environment variable, a file is expected to appear only for processes that actually
 * run kinit, which should not be an inordinately high number.
 *
 * @param jobProps job properties supplying the project name, flow id, job id, execution id
 *     and the user settings
 * @return file name: the kerberos ticket cache file to use
 */
private String getKrb5ccname(final Props jobProps) {
  final String user = getEffectiveUser(jobProps);

  // Spaces are replaced so that the resulting path contains none.
  final String project = jobProps.getString(CommonJobProperties.PROJECT_NAME).replace(" ", "_");
  final String flow = jobProps.getString(CommonJobProperties.FLOW_ID).replace(" ", "_");
  final String job = jobProps.getString(CommonJobProperties.JOB_ID).replace(" ", "_");

  // execId should be an int and should not have space in it, ever
  final String execution = jobProps.getString(CommonJobProperties.EXEC_ID);

  return String.format("/tmp/krb5cc__%s__%s__%s__%s__%s", project, flow,
      job, execution, user);
}
/**
 * Determines which user the process job should run as. The first property that is present
 * wins, in this order of precedence:
 * 1. USER_TO_PROXY
 * 2. SUBMIT_USER
 *
 * The chosen user is also written to the job log.
 *
 * @param jobProps job properties to look the user up in
 * @return the user that Azkaban is going to execute as
 * @throws RuntimeException if neither property is present
 */
private String getEffectiveUser(final Props jobProps) {
  final String userKey;
  if (jobProps.containsKey(JobProperties.USER_TO_PROXY)) {
    userKey = JobProperties.USER_TO_PROXY;
  } else if (jobProps.containsKey(CommonJobProperties.SUBMIT_USER)) {
    userKey = CommonJobProperties.SUBMIT_USER;
  } else {
    throw new RuntimeException(
        "Internal Error: No user.to.proxy or submit.user in the jobProps");
  }

  final String user = jobProps.getString(userKey);
  info("effective user is: " + user);
  return user;
}
/**
 * Tells whether the given user can write to the current working directory, which many jobs
 * need in order to store temporary data/jars on the executor.
 *
 * The check runs "touch" on a probe file in the working directory through execute-as-user
 * and looks at the exit code.
 *
 * @param effectiveUser user/proxy user running the job
 * @return true if the probe command exits with the success code, otherwise false
 * @throws IOException declared for failures of the execute-as-user invocation
 */
private boolean canWriteInCurrentWorkingDirectory(final String effectiveUser)
    throws IOException {
  final ExecuteAsUser runner = new ExecuteAsUser(
      this.sysProps.getString(AZKABAN_SERVER_NATIVE_LIB_FOLDER));
  final String probeFile = getWorkingDirectory() + "/" + TEMP_FILE_NAME;
  final int exitCode = runner.execute(effectiveUser, Arrays.asList(CREATE_FILE, probeFile));
  return exitCode == SUCCESSFUL_EXECUTION;
}
/**
 * Changes ownership of a file/directory so that it is owned by the user while the group is
 * the azkaban service account group (server property AZKABAN_SERVER_GROUP_NAME, default
 * "azkaban").
 *
 * Leverages execute-as-user with "root" as the user to run the chown command.
 *
 * @param effectiveUser user/proxy user running the job
 * @param fileName the name of the file or directory whose ownership will be changed
 * @throws Exception if the chown command exits with a non-success code
 */
private void assignUserFileOwnership(final String effectiveUser, final String fileName) throws
    Exception {
  final ExecuteAsUser executeAsUser = new ExecuteAsUser(
      this.sysProps.getString(AZKABAN_SERVER_NATIVE_LIB_FOLDER));
  final String groupName = this.sysProps.getString(AZKABAN_SERVER_GROUP_NAME, "azkaban");
  final List<String> changeOwnershipCommand = Arrays
      .asList(CHOWN, effectiveUser + ":" + groupName, fileName);
  info("Change ownership of " + fileName + " to " + effectiveUser + ":" + groupName + ".");
  final int result = executeAsUser.execute("root", changeOwnershipCommand);
  if (result != SUCCESSFUL_EXECUTION) {
    // This method is used for property files as well as the working directory, so the
    // message names the actual target instead of always blaming the working directory.
    handleError("Failed to change ownership of " + fileName + ". Error code: " + result, null);
  }
}
/**
 * This is used to get the min/max memory size requirement by processes. SystemMemoryInfo can use
 * the info to determine if the memory request can be fulfilled. For Java process, this should be
 * Xms/Xmx setting.
 *
 * This base implementation requests nothing (0, 0). run() passes the second value to the
 * memory check and labels both values as kb in its log message.
 *
 * @return pair of min/max memory size
 * @throws Exception never thrown here; declared so that overriding implementations may fail
 */
protected Pair<Long, Long> getProcMemoryRequirement() throws Exception {
  return new Pair<>(0L, 0L);
}
/**
 * Writes the message to the job's error log and then always throws.
 *
 * @param errorMsg message to log and to use as the exception message
 * @param e underlying cause, or null when there is none
 * @throws Exception always; carries {@code e} as its cause when {@code e} is not null
 */
protected void handleError(final String errorMsg, final Exception e) throws Exception {
  error(errorMsg);
  throw (e == null) ? new Exception(errorMsg) : new Exception(errorMsg, e);
}
/**
 * Collects the commands of this job from the job properties.
 *
 * The first command is read from the "command" property; further commands are read from
 * "command.1", "command.2", ... and collection stops at the first missing index.
 *
 * @return the commands in execution order; never empty when this method returns normally,
 *     since the "command" property is read unconditionally
 */
protected List<String> getCommandList() {
  final List<String> commands = new ArrayList<>();
  commands.add(this.jobProps.getString(COMMAND));
  for (int i = 1; this.jobProps.containsKey(COMMAND + "." + i); i++) {
    commands.add(this.jobProps.getString(COMMAND + "." + i));
  }
  return commands;
}
/**
* Cancels the job: marks it as killed, wakes run() if it is waiting for a memory check
* retry, and kills the job process if one has already been built.
*
* @throws InterruptedException declared for the blocking calls made on the process
*/
@Override
public void cancel() throws InterruptedException {
// in case the job is waiting
synchronized (this) {
// Same monitor as the one run() holds while it checks the killed flag and builds the
// process, so the flag and this.process are observed consistently.
this.killed = true;
this.notify();
if (this.process == null) {
// The job thread has not checked if the job is killed yet.
// setting the killed flag should be enough to abort the job.
// There is no job process to kill.
return;
}
}
// A process object exists: wait for it to start, then try softKill with the KILL_TIME
// timeout and fall back to hardKill when softKill reports failure.
this.process.awaitStartup();
final boolean processkilled = this.process
.softKill(KILL_TIME.toMillis(), TimeUnit.MILLISECONDS);
if (!processkilled) {
warn("Kill with signal TERM failed. Killing with KILL signal.");
this.process.hardKill();
}
}
/**
 * Reports coarse progress of the job.
 *
 * @return 1.0 when a process exists and reports itself complete, otherwise 0.0
 */
@Override
public double getProgress() {
  if (this.process != null && this.process.isComplete()) {
    return 1.0;
  }
  return 0.0;
}
/**
 * Returns the process id reported by the current job process.
 *
 * @return the id reported by the process object
 * @throws NullPointerException if no process has been built yet
 */
public int getProcessId() {
  final AzkabanProcess current = this.process;
  return current.getProcessId();
}
/**
* For testing only.
*
* @return the success flag, which run() sets to true after a job process has run without
* throwing
*/
@VisibleForTesting
boolean isSuccess() {
return this.success;
}
/**
* For testing only.
*
* @return the process most recently built by run(), or null if none has been built yet
*/
@VisibleForTesting
AzkabanProcess getProcess() {
return this.process;
}
/**
 * Returns the job path held in {@code _jobPath}.
 *
 * @return the job path, or an empty string when it is null
 */
public String getPath() {
  if (this._jobPath == null) {
    return "";
  }
  return this._jobPath;
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy