// org.apache.asterix.aoya.AsterixApplicationMaster Maven / Gradle / Ivy
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.asterix.aoya;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.net.InetAddress;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.UnknownHostException;
import java.nio.ByteBuffer;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.yarn.api.ApplicationConstants;
import org.apache.hadoop.yarn.api.ApplicationConstants.Environment;
import org.apache.hadoop.yarn.api.ContainerManagementProtocol;
import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterResponse;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.Container;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
import org.apache.hadoop.yarn.api.records.ContainerState;
import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
import org.apache.hadoop.yarn.api.records.LocalResource;
import org.apache.hadoop.yarn.api.records.LocalResourceType;
import org.apache.hadoop.yarn.api.records.LocalResourceVisibility;
import org.apache.hadoop.yarn.api.records.NodeReport;
import org.apache.hadoop.yarn.api.records.Priority;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest;
import org.apache.hadoop.yarn.client.api.async.AMRMClientAsync;
import org.apache.hadoop.yarn.client.api.async.NMClientAsync;
import org.apache.hadoop.yarn.client.api.async.impl.NMClientAsyncImpl;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.util.ConverterUtils;
import org.apache.hadoop.yarn.util.Records;
import org.apache.asterix.common.config.GlobalConfig;
import org.apache.asterix.common.exceptions.AsterixException;
import org.apache.asterix.event.schema.yarnCluster.Cluster;
import org.apache.asterix.event.schema.yarnCluster.MasterNode;
import org.apache.asterix.event.schema.yarnCluster.Node;
import org.apache.log4j.ConsoleAppender;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.log4j.PatternLayout;
public class AsterixApplicationMaster {
// NOTE(review): the two logger statements below sit directly in the class body in this copy.
// Statements are only legal inside an initializer or a method, so an enclosing initializer
// block appears to have been lost -- confirm against the upstream file.
Logger rootLogger = Logger.getRootLogger();
// Attach a console appender to the log4j root logger using the pattern below
rootLogger.addAppender(new ConsoleAppender(
new PatternLayout("%-6r [%p] %c - %m%n")));
// commons-logging logger used by every method in this class
private static final Log LOG = LogFactory.getLog(AsterixApplicationMaster.class);
// File names of the cluster description, the Asterix configuration and the server archive
private static final String CLUSTER_DESC_PATH = "cluster-config.xml";
private static final String ASTERIX_CONF_NAME = "asterix-configuration.xml";
private static final String ASTERIX_ZIP_NAME = "asterix-server.zip";
// Default CC / NC memory sizes in MB; they also feed the default -Xmx options below
private static final int CC_MEMORY_MBS_DEFAULT = 1024;
private static final int NC_MEMORY_MBS_DEFAULT = 1536;
private static final String EXTERNAL_CC_JAVA_OPTS_DEFAULT = "-Xmx" + CC_MEMORY_MBS_DEFAULT + "m";
private static final String EXTERNAL_NC_JAVA_OPTS_DEFAULT = "-Xmx" + NC_MEMORY_MBS_DEFAULT + "m";
// Fully qualified class names; presumably the main classes placed on the container command
// lines built further down in this file -- their uses are not visible in this part
private static final String OBLITERATOR_CLASSNAME = "org.apache.asterix.aoya.Deleter";
private static final String HDFS_BACKUP_CLASSNAME = "org.apache.asterix.aoya.HDFSBackup";
private static final String NC_CLASSNAME = "org.apache.hyracks.control.nc.NCDriver";
private static final String CC_CLASSNAME = "org.apache.hyracks.control.cc.CCDriver";
// Value of the java.home system property of the JVM running this AM; produceStartCmd uses it
// to build the path of the java executable
private static final String JAVA_HOME = System.getProperty("java.home");
// Set to true by requestResources() once every container request has been issued
private boolean doneAllocating = false;
// Configuration
private Configuration conf;
// Handle to communicate with the Resource Manager
private AMRMClientAsync resourceManager;
// Handle to communicate with the Node Manager
private NMClientAsync nmClientAsync;
// Listen to process the response from the Node Manager
private NMCallbackHandler containerListener;
// Application Attempt Id ( combination of attemptId and fail count )
private ApplicationAttemptId appAttemptID;
// For status update for clients - yet to be implemented
// Hostname of the container
private String appMasterHostname = "";
// Port on which the app master listens for status updates from clients.
// Drawn from the dynamic/private port range 49152..65535; the base offset has to be added
// because Random.nextInt(bound) on its own yields 0..bound-1.
private int appMasterRpcPort = 49152 + new Random().nextInt(65536 - 49152);
// Tracking url to which app master publishes info for clients to monitor
private String appMasterTrackingUrl = "";
// Counter for completed containers ( complete denotes successful or failed )
private AtomicInteger numCompletedContainers = new AtomicInteger();
// Allocated container count so that we know how many containers has the RM
// allocated to us
private AtomicInteger numAllocatedContainers = new AtomicInteger();
// Count of failed containers
private AtomicInteger numFailedContainers = new AtomicInteger();
// Count of containers already requested from the RM
// Needed as once requested, we should not request for containers again.
// Only request for more if the original requirement changes.
private AtomicInteger numRequestedContainers = new AtomicInteger();
//Tells us whether the Cluster Controller is up so we can safely start some Node Controllers
private AtomicBoolean ccUp = new AtomicBoolean();
private AtomicBoolean ccStarted = new AtomicBoolean();
// Nodes whose NC container has been asked for but not yet handed to us
private Queue<Node> pendingNCs = new ArrayDeque<Node>();
//HDFS path to AsterixDB distributable zip
private String asterixZipPath = "";
// Timestamp needed for creating a local resource
private long asterixZipTimestamp = 0;
// File length needed for local resource
private long asterixZipLen = 0;
//HDFS path to AsterixDB cluster description
private String asterixConfPath = "";
// Timestamp needed for creating a local resource
private long asterixConfTimestamp = 0;
// File length needed for local resource
private long asterixConfLen = 0;
// Instance-specific path suffix, read from the environment in setEnvs()
private String instanceConfPath = "";
//base dir under which all configs and binaries lie
private String dfsBasePath;
private int numTotalContainers = 0;
// Local resources keyed by the name they are registered under
private Map<String, LocalResource> localResources = new HashMap<String, LocalResource>();
private Cluster clusterDesc = null;
private MasterNode cC = null;
private String ccJavaOpts = null;
private int ccMem = 0;
private String ncJavaOpts = null;
private int ncMem = 0;
// Written by callback threads and polled by run(), hence volatile
private volatile boolean done;
private volatile boolean success;
// Mode flags set from the command line in setArgs()
private boolean obliterate = false;
private Path appMasterJar = null;
private boolean backup = false;
long backupTimestamp;
String snapName;
private boolean restore = false;
private boolean initial = false;
// Launch threads
private List<Thread> launchThreads = new CopyOnWriteArrayList<Thread>();
public static void main(String[] args) {
boolean result = false;
try {
AsterixApplicationMaster appMaster = new AsterixApplicationMaster();
LOG.info("Initializing ApplicationMaster");
boolean doRun = appMaster.init();
if (!doRun) {
result = appMaster.run();
} catch (Exception e) {
LOG.fatal("Error running ApplicationMaster", e);
if (result) {
LOG.info("Application Master completed successfully. exiting");
} else {
LOG.info("Application Master failed. exiting");
private void dumpOutDebugInfo() {
LOG.info("Dump debug output");
Map envs = System.getenv();
for (Map.Entry env : envs.entrySet()) {
LOG.info("System env: key=" + env.getKey() + ", val=" + env.getValue());
System.out.println("System env: key=" + env.getKey() + ", val=" + env.getValue());
String cmd = "ls -alhLR";
Runtime run = Runtime.getRuntime();
Process pr = null;
try {
pr = run.exec(cmd);
BufferedReader buf = new BufferedReader(new InputStreamReader(pr.getInputStream()));
String line = "";
while ((line = buf.readLine()) != null) {
LOG.info("System CWD content: " + line);
System.out.println("System CWD content: " + line);
} catch (IOException e) {
} catch (InterruptedException e) {
/**
 * Creates an ApplicationMaster backed by a fresh {@link YarnConfiguration}.
 */
public AsterixApplicationMaster() {
    // Set up the configuration and RPC
    conf = new YarnConfiguration();
}
public CommandLine setArgs(String[] args) throws ParseException {
Options opts = new Options();
opts.addOption("app_attempt_id", true, "App Attempt ID. Not to be used unless for testing purposes");
opts.addOption("priority", true, "Application Priority. Default 0");
opts.addOption("debug", false, "Dump out debug information");
opts.addOption("help", false, "Print usage");
opts.addOption("initial", false, "Initialize existing Asterix instance.");
opts.addOption("obliterate", false, "Delete asterix instance completely.");
opts.addOption("backup", false, "Back up AsterixDB instance");
opts.addOption("restore", true, "Restore an AsterixDB instance");
CommandLine cliParser = new GnuParser().parse(opts, args);
if (cliParser.hasOption("help")) {
if (cliParser.hasOption("debug")) {
if (cliParser.hasOption("obliterate")) {
obliterate = true;
initial = true;
if (cliParser.hasOption("backup")) {
backup = true;
backupTimestamp = System.currentTimeMillis();
if (cliParser.hasOption("restore")) {
restore = true;
snapName = cliParser.getOptionValue("restore");
return cliParser;
public void setEnvs(CommandLine cliParser) {
Map envs = System.getenv();
if (envs.containsKey("HADOOP_CONF_DIR")) {
File hadoopConfDir = new File(envs.get("HADOOP_CONF_DIR"));
if (hadoopConfDir.isDirectory()) {
for (File config : hadoopConfDir.listFiles()) {
if (config.getName().matches("^.*(xml)$")) {
conf.addResource(new Path(config.getAbsolutePath()));
//the containerID might be in the arguments or the environment
if (!envs.containsKey(Environment.CONTAINER_ID.name())) {
if (cliParser.hasOption("app_attempt_id")) {
String appIdStr = cliParser.getOptionValue("app_attempt_id", "");
appAttemptID = ConverterUtils.toApplicationAttemptId(appIdStr);
} else {
throw new IllegalArgumentException(
"Environment is not set correctly- please check client submission settings");
} else {
ContainerId containerId = ConverterUtils.toContainerId(envs.get(Environment.CONTAINER_ID.name()));
appAttemptID = containerId.getApplicationAttemptId();
if (!envs.containsKey(ApplicationConstants.APP_SUBMIT_TIME_ENV)
|| !envs.containsKey(Environment.NM_HOST.name()) || !envs.containsKey(Environment.NM_HTTP_PORT.name())
|| !envs.containsKey(Environment.NM_PORT.name())) {
throw new IllegalArgumentException(
"Environment is not set correctly- please check client submission settings");
System.setProperty(GlobalConfig.CONFIG_FILE_PROPERTY, envs.get("PWD") + File.separator + "bin" + File.separator
LOG.info("Application master for app" + ", appId=" + appAttemptID.getApplicationId().getId()
+ ", clustertimestamp=" + appAttemptID.getApplicationId().getClusterTimestamp() + ", attemptId="
+ appAttemptID.getAttemptId());
asterixZipPath = envs.get(AConstants.TARLOCATION);
asterixZipTimestamp = Long.parseLong(envs.get(AConstants.TARTIMESTAMP));
asterixZipLen = Long.parseLong(envs.get(AConstants.TARLEN));
asterixConfPath = envs.get(AConstants.CONFLOCATION);
asterixConfTimestamp = Long.parseLong(envs.get(AConstants.CONFTIMESTAMP));
asterixConfLen = Long.parseLong(envs.get(AConstants.CONFLEN));
instanceConfPath = envs.get(AConstants.INSTANCESTORE);
//the only time this is null is during testing, when asterix-yarn isn't packaged in a JAR yet.
if(envs.get(AConstants.APPLICATIONMASTERJARLOCATION) != null
&& !envs.get(AConstants.APPLICATIONMASTERJARLOCATION).endsWith(File.separator)){
appMasterJar = new Path(envs.get(AConstants.APPLICATIONMASTERJARLOCATION));
appMasterJar = null;
dfsBasePath = envs.get(AConstants.DFS_BASE);
//If the NM has an odd environment where the proper hadoop XML configs dont get imported, we can end up not being able to talk to the RM
// this solves that!
//in a testing environment these can be null however.
if (envs.get(AConstants.RMADDRESS) != null) {
conf.set("yarn.resourcemanager.address", envs.get(AConstants.RMADDRESS));
LOG.info("RM Address: " + envs.get(AConstants.RMADDRESS));
if (envs.get(AConstants.RMADDRESS) != null) {
conf.set("yarn.resourcemanager.scheduler.address", envs.get(AConstants.RMSCHEDULERADDRESS));
ccJavaOpts = envs.get(AConstants.CC_JAVA_OPTS);
//set defaults if no special given options
if (ccJavaOpts == null) {
ncJavaOpts = envs.get(AConstants.NC_JAVA_OPTS);
if (ncJavaOpts == null) {
LOG.info("Path suffix: " + instanceConfPath);
// Loads the cluster description from CLUSTER_DESC_PATH, remembers its master node and derives the
// tracking URL from the master node's client IP and port. Returns true on the only visible
// non-exceptional path.
// NOTE(review): localizeDFSResources() and distributeAsterixConfig() are private and have no
// caller in the visible part of this file; calls to them may have been dropped from this
// method -- confirm against the upstream file.
public boolean init() throws ParseException, IOException, AsterixException, YarnException {
try {
// Parse the cluster description file by its relative name
clusterDesc = Utils.parseYarnClusterConfig(CLUSTER_DESC_PATH);
cC = clusterDesc.getMasterNode();
// This URL is later handed to the RM in run() when registering the AM
appMasterTrackingUrl = "http://" + cC.getClientIp() + ":" + cC.getClientPort() + Path.SEPARATOR;
//now let's read what's in there so we can set the JVM opts right
LOG.debug("config file loc: " + System.getProperty(GlobalConfig.CONFIG_FILE_PROPERTY));
} catch (FileNotFoundException | IllegalStateException e) {
// Log a short message for operators, then rethrow the exception to the caller
LOG.error("Could not deserialize Cluster Config from disk- aborting!");
throw e;
return true;
/**
 * Sets up the parameters for the Asterix config.
 *
 * @throws IOException
 */
// Looks up the Asterix configuration file under dfsBasePath/instanceConfPath on the configured
// file system and registers a LocalResource for it under ASTERIX_CONF_NAME.
private void distributeAsterixConfig() throws IOException {
FileSystem fs = FileSystem.get(conf);
String pathSuffix = instanceConfPath + File.separator + ASTERIX_CONF_NAME;
Path dst = new Path(dfsBasePath, pathSuffix);
URI paramLocation = dst.toUri();
// getFileStatus throws if the file does not exist, so a missing config fails here
FileStatus paramFileStatus = fs.getFileStatus(dst);
Long paramLen = paramFileStatus.getLen();
Long paramTimestamp = paramFileStatus.getModificationTime();
// NOTE(review): paramLocation, paramLen and paramTimestamp are computed but never applied to
// the record in this copy; the LocalResource setter calls appear to be missing -- confirm.
LocalResource asterixParamLoc = Records.newRecord(LocalResource.class);
localResources.put(ASTERIX_CONF_NAME, asterixParamLoc);
/**
 * @param c
 *            The cluster description to attempt to allocate with the RM
 * @throws YarnException
 */
// Reads the CC and NC container memory from the cluster description, builds the request for the
// CC host, waits until ccUp is raised (or the countdown runs out), then adds one container
// request per node of the description and marks allocation as done.
// NOTE(review): several statements look to be missing from this copy; see the inline notes.
private void requestResources(Cluster c) throws YarnException, UnknownHostException {
//set memory
if (c.getCcContainerMem() != null) {
ccMem = Integer.parseInt(c.getCcContainerMem());
} else {
// NOTE(review): no fallback is visible; presumably CC_MEMORY_MBS_DEFAULT is assigned -- confirm
if (c.getNcContainerMem() != null) {
ncMem = Integer.parseInt(c.getNcContainerMem());
} else {
// NOTE(review): no fallback is visible; presumably NC_MEMORY_MBS_DEFAULT is assigned -- confirm
//request CC
int numNodes = 0;
ContainerRequest ccAsk = hostToRequest(cC.getClusterIp(), true);
// NOTE(review): ccAsk is only logged here; no addContainerRequest call for it is visible
LOG.info("Asked for CC: " + Arrays.toString(ccAsk.getNodes().toArray()));
//now we wait to be given the CC before starting the NCs...
//we will wait a minute.
int deathClock = 60;
while (ccUp.get() == false && deathClock > 0) {
try {
// NOTE(review): the try body is empty in this copy; a sleep and a decrement of deathClock
// are presumably missing, otherwise this loop spins without ever timing out -- confirm
} catch (InterruptedException ex) {
if (deathClock == 0 && ccUp.get() == false) {
throw new YarnException("Couldn't allocate container for CC. Abort!");
LOG.info("Waiting for CC process to start");
//TODO: inspect for actual liveness instead of waiting.
// is there a good way to do this? maybe try opening a socket to it...
try {
// NOTE(review): empty try body; presumably a fixed sleep belongs here -- confirm
} catch (InterruptedException ex) {
//request NCs
for (Node n : c.getNode()) {
resourceManager.addContainerRequest(hostToRequest(n.getClusterIp(), false));
LOG.info("Asked for NC: " + n.getClusterIp());
LOG.info("Requested all NCs and CCs. Wait for things to settle!");
// NOTE(review): numNodes is never incremented in the visible text, so this stores 0 -- confirm
numTotalContainers = numNodes;
doneAllocating = true;
/**
 * Asks the RM for a particular host, nicely.
 *
 * @param host
 *            The host to request
 * @param cc
 *            Whether or not the host is the CC
 * @return A container request that is (hopefully) for the host we asked for.
 */
// Resolves the given host and builds a ContainerRequest that names the resolved host name as
// its only node, passes null for racks and false as the last constructor argument.
// Throws UnknownHostException when the host cannot be resolved.
private ContainerRequest hostToRequest(String host, boolean cc) throws UnknownHostException {
InetAddress hostIp = InetAddress.getByName(host);
// NOTE(review): pri and capability are created but never populated in this copy; the priority
// setter and the per-role memory setters (both branches below are empty) appear to be missing.
Priority pri = Records.newRecord(Priority.class);
Resource capability = Records.newRecord(Resource.class);
if (cc) {
} else {
//we dont set anything else because we don't care about that and yarn doesn't honor it yet
String[] hosts = new String[1];
//TODO this is silly
hosts[0] = hostIp.getHostName();
LOG.info("IP addr: " + host + " resolved to " + hostIp.getHostName());
ContainerRequest request = new ContainerRequest(capability, hosts, null, pri, false);
LOG.info("Requested host ask: " + request.getNodes());
return request;
/**
 * Determines whether or not a container is the one on which the CC should reside
 *
 * @param c
 *            The container in question
 * @return True if the container should have the CC process on it, false otherwise.
 */
boolean containerIsCC(Container c) {
String containerHost = c.getNodeId().getHost();
try {
InetAddress containerIp = InetAddress.getByName(containerHost);
InetAddress ccIp = InetAddress.getByName(cC.getClusterIp());
return containerIp.getCanonicalHostName().equals(ccIp.getCanonicalHostName());
} catch (UnknownHostException e) {
return false;
/**
 * Attempts to find the Node in the Cluster Description that matches this container
 *
 * @param c
 *            The container to resolve
 * @param cl
 *            The cluster description to search
 * @return The node this container corresponds to
 * @throws java.net.UnknownHostException
 *             if the container isn't present in the description
 */
Node containerToNode(Container c, Cluster cl) throws UnknownHostException {
String containerHost = c.getNodeId().getHost();
InetAddress containerIp = InetAddress.getByName(containerHost);
LOG.info("Resolved Container IP: " + containerIp);
for (Node node : cl.getNode()) {
InetAddress nodeIp = InetAddress.getByName(node.getClusterIp());
LOG.info(nodeIp + "?=" + containerIp);
if (nodeIp.equals(containerIp))
return node;
//if we find nothing, this is bad...
throw new java.net.UnknownHostException("Could not resolve container" + containerHost + " to node");
/**
 * Here I am just pointing the Containers to the existing HDFS resources given by the Client,
 * so that they can be localized to the filesystem of the nodes.
 *
 * @throws IOException
 */
// Registers the LocalResource entries containers will need. For the offline modes (obliterate,
// backup, restore) only the AM jar is registered; otherwise the Asterix zip, the cluster
// description and any external libraries found under the instance's "library" directory are.
// NOTE(review): the LocalResource records below are registered without type, visibility,
// timestamp or size being set in this copy, and the offline branch has no visible return;
// those statements appear to be missing -- confirm against the upstream file.
private void localizeDFSResources() throws IOException {
//if performing an 'offline' task, skip a lot of resource distribution
if (obliterate || backup || restore) {
// NOTE(review): ("").equals(appMasterJar) compares a String with a Path and is therefore
// always false; only the null check is effective.
if (appMasterJar == null || ("").equals(appMasterJar)) {
//this can happen in a jUnit testing environment. we don't need to set it there.
if (!conf.getBoolean(YarnConfiguration.IS_MINI_YARN_CLUSTER, false)) {
throw new IllegalStateException("AM jar not provided in environment.");
} else {
FileSystem fs = FileSystem.get(conf);
// appMasterJarStatus is fetched but not used in the visible text
FileStatus appMasterJarStatus = fs.getFileStatus(appMasterJar);
LocalResource obliteratorJar = Records.newRecord(LocalResource.class);
localResources.put("asterix-yarn.jar", obliteratorJar);
//otherwise, distribute evertything to start up asterix
LocalResource asterixZip = Records.newRecord(LocalResource.class);
//this un-tar's the asterix distribution
try {
asterixZip.setResource(ConverterUtils.getYarnUrlFromURI(new URI(asterixZipPath)));
} catch (URISyntaxException e) {
// A malformed path from the environment is surfaced to the caller as an IOException
LOG.error("Error locating Asterix zip" + " in env, path=" + asterixZipPath);
throw new IOException(e);
localResources.put(ASTERIX_ZIP_NAME, asterixZip);
//now let's do the same for the cluster description XML
LocalResource asterixConf = Records.newRecord(LocalResource.class);
try {
asterixConf.setResource(ConverterUtils.getYarnUrlFromURI(new URI(asterixConfPath)));
} catch (URISyntaxException e) {
LOG.error("Error locating Asterix config" + " in env, path=" + asterixConfPath);
throw new IOException(e);
//TODO: I could avoid localizing this everywhere by only calling this block on the metadata node.
localResources.put("cluster-config.xml", asterixConf);
//now add the libraries if there are any
try {
FileSystem fs = FileSystem.get(conf);
Path p = new Path(dfsBasePath, instanceConfPath + File.separator + "library" + Path.SEPARATOR);
if (fs.exists(p)) {
// Expected layout: library/<dataverse directory>/<library file>
FileStatus[] dataverses = fs.listStatus(p);
for (FileStatus d : dataverses) {
if (!d.isDirectory())
throw new IOException("Library configuration directory structure is incorrect");
FileStatus[] libraries = fs.listStatus(d.getPath());
for (FileStatus l : libraries) {
if (l.isDirectory())
throw new IOException("Library configuration directory structure is incorrect");
LocalResource lr = Records.newRecord(LocalResource.class);
// Key is library/<dataverse>/<file name up to its first dot>
localResources.put("library" + Path.SEPARATOR + d.getPath().getName() + Path.SEPARATOR
+ l.getPath().getName().split("\\.")[0], lr);
LOG.info("Found library: " + l.getPath().toString());
} catch (FileNotFoundException e) {
LOG.info("No external libraries present");
//do nothing, it just means there aren't libraries. that is possible and ok
// it should be handled by the fs.exists(p) check though.
private void printUsage(Options opts) {
new HelpFormatter().printHelp("ApplicationMaster", opts);
/**
 * Start the AM and request all necessary resources.
 *
 * @return True if the run fully succeeded, false otherwise.
 * @throws YarnException
 * @throws IOException
 */
// Creates the asynchronous RM and NM clients, registers this AM with the RM, logs the cluster's
// maximum memory capability, then loops until the done flag is raised and returns success.
// NOTE(review): no init/start calls on the two clients, no call to requestResources() and no
// call to finish() are visible in this copy; they appear to be missing -- confirm upstream.
public boolean run() throws YarnException, IOException {
LOG.info("Starting ApplicationMaster");
AMRMClientAsync.CallbackHandler allocListener = new RMCallbackHandler();
// 1000 is presumably the heartbeat interval in milliseconds -- TODO confirm against the YARN API
resourceManager = AMRMClientAsync.createAMRMClientAsync(1000, allocListener);
containerListener = new NMCallbackHandler();
nmClientAsync = new NMClientAsyncImpl(containerListener);
// Register self with ResourceManager
// This will start heartbeating to the RM
try {
appMasterHostname = InetAddress.getLocalHost().toString();
} catch (java.net.UnknownHostException uhe) {
// Falls back to the exception text as the registered host name
appMasterHostname = uhe.toString();
RegisterApplicationMasterResponse response = resourceManager.registerApplicationMaster(appMasterHostname,
appMasterRpcPort, appMasterTrackingUrl);
// Dump out information about cluster capability as seen by the
// resource manager
int maxMem = response.getMaximumResourceCapability().getMemory();
LOG.info("Max mem capabililty of resources in this cluster " + maxMem);
try {
// NOTE(review): empty try body; the catch below implies a call that throws YarnException,
// presumably requestResources(clusterDesc) -- confirm
} catch (YarnException e) {
LOG.error("Could not allocate resources properly:" + e.getMessage());
done = true;
throw e;
//now we just sit and listen for messages from the RM
while (!done) {
try {
// NOTE(review): empty try body; presumably a sleep between polls of the done flag -- confirm
} catch (InterruptedException ex) {
return success;
/**
 * Clean up, whether or not we were successful.
 */
// Decides the final application status from the container counters, unregisters this AM from
// the RM with that status and raises the done flag.
// NOTE(review): the thread join, the stopping of the NM client and the stopping of the RM
// client that the comments below describe are not visible in this copy -- confirm upstream.
private void finish() {
// Join all launched threads
// needed for when we time out
// and we need to release containers
for (Thread launchThread : launchThreads) {
try {
// NOTE(review): empty try body; presumably a bounded join on launchThread -- confirm
} catch (InterruptedException e) {
LOG.info("Exception thrown in thread join: " + e.getMessage());
//from https://stackoverflow.com/questions/4812570/how-to-store-printstacktrace-into-a-string
StringWriter errors = new StringWriter();
e.printStackTrace(new PrintWriter(errors));
// NOTE(review): the captured stack trace in errors is not used in the visible text
// When the application completes, it should stop all running containers
LOG.info("Application completed. Stopping running containers");
// When the application completes, it should send a finish application
// signal to the RM
LOG.info("Application completed. Signalling finish to RM");
FinalApplicationStatus appStatus;
String appMessage = null;
success = true;
// Success requires zero failed containers and every expected container completed
if (numFailedContainers.get() == 0 && numCompletedContainers.get() == numTotalContainers) {
appStatus = FinalApplicationStatus.SUCCEEDED;
} else {
appStatus = FinalApplicationStatus.FAILED;
appMessage = "Diagnostics." + ", total=" + numTotalContainers + ", completed="
+ numCompletedContainers.get() + ", allocated=" + numAllocatedContainers.get() + ", failed="
+ numFailedContainers.get();
success = false;
try {
// The third argument (null here) is the tracking URL reported on unregistration
resourceManager.unregisterApplicationMaster(appStatus, appMessage, null);
} catch (YarnException ex) {
LOG.error("Failed to unregister application", ex);
} catch (IOException e) {
LOG.error("Failed to unregister application", e);
done = true;
/**
 * This handles the information that comes in from the RM while the AM
 * is running.
 */
private class RMCallbackHandler implements AMRMClientAsync.CallbackHandler {
// Callback for containers the RM reports as completed: logs each status, rejects any status
// that is not COMPLETE, and raises the done flag once completed + failed equals allocated and
// allocation has finished.
// NOTE(review): the element type of the list was stripped in this copy (the loop below needs
// ContainerStatus elements), and no counter increments are visible in either exit-status
// branch although the comment announces them -- confirm against the upstream file.
public void onContainersCompleted(List completedContainers) {
LOG.info("Got response from RM for container ask, completedCnt=" + completedContainers.size());
for (ContainerStatus containerStatus : completedContainers) {
LOG.info("Got container status for containerID=" + containerStatus.getContainerId() + ", state="
+ containerStatus.getState() + ", exitStatus=" + containerStatus.getExitStatus()
+ ", diagnostics=" + containerStatus.getDiagnostics());
// non complete containers should not be here
if(containerStatus.getState() != ContainerState.COMPLETE){
throw new IllegalStateException("Non-completed container given as completed by RM.");
// increment counters for completed/failed containers
int exitStatus = containerStatus.getExitStatus();
if (0 != exitStatus) {
// container failed
} else {
// nothing to do
// container completed successfully
LOG.info("Container completed successfully." + ", containerId=" + containerStatus.getContainerId());
//stop infinite looping of run()
if (numCompletedContainers.get() + numFailedContainers.get() == numAllocatedContainers.get()
&& doneAllocating)
done = true;
// Callback for containers the RM has allocated: stops containers that do not correspond to a
// pending NC once the CC is up, and otherwise prepares a launch thread for the container.
// NOTE(review): the list element type was stripped, the LaunchAsterixContainer constructor call
// is cut off, and neither the allocated-counter update, the ccUp update nor the start of the
// launch thread is visible in this copy -- confirm against the upstream file.
public void onContainersAllocated(List allocatedContainers) {
LOG.info("Got response from RM for container ask, allocatedCnt=" + allocatedContainers.size());
for (Container allocatedContainer : allocatedContainers) {
try {
// With the CC already up, a container that maps to no pending NC is not wanted; stop it
if (!pendingNCs.contains(containerToNode(allocatedContainer, clusterDesc)) && ccUp.get()) {
nmClientAsync.stopContainerAsync(allocatedContainer.getId(), allocatedContainer.getNodeId());
// NOTE(review): presumably followed by a continue so the container is not launched -- confirm
} catch(UnknownHostException ex){
LOG.error("Unknown host allocated for us by RM- this shouldn't happen.", ex);
LOG.info("Launching shell command on a new container." + ", containerId=" + allocatedContainer.getId()
+ ", containerNode=" + allocatedContainer.getNodeId().getHost() + ":"
+ allocatedContainer.getNodeId().getPort() + ", containerNodeURI="
+ allocatedContainer.getNodeHttpAddress() + ", containerResourceMemory"
+ allocatedContainer.getResource().getMemory());
LaunchAsterixContainer runnableLaunchContainer = new LaunchAsterixContainer(allocatedContainer,
Thread launchThread = new Thread(runnableLaunchContainer, "Asterix CC/NC");
// I want to know if this node is the CC, because it must start before the NCs.
LOG.info("Allocated: " + allocatedContainer.getNodeId().getHost());
LOG.info("CC : " + cC.getId());
try {
// Once the CC is up, this container's node no longer counts as pending
if (ccUp.get()) {
pendingNCs.remove(containerToNode(allocatedContainer, clusterDesc));
} catch(UnknownHostException ex){
LOG.error("Unknown host allocated for us by RM- this shouldn't happen.", ex);
if (containerIsCC(allocatedContainer)) {
// NOTE(review): the body of this branch is not visible; presumably it raises ccUp -- confirm
// launch and start the container on a separate thread to keep
// the main thread unblocked
// as all containers may not be allocated at one go.
/**
 * Ask the processes on the container to gracefully exit.
 */
public void onShutdownRequest() {
LOG.info("AM shutting down per request");
done = true;
public void onNodesUpdated(List updatedNodes) {
//TODO: This will become important when we deal with what happens if an NC dies
public float getProgress() {
//return half way because progress is basically meaningless for us
if (!doneAllocating) {
return 0.0f;
return (float) 0.5;
public void onError(Throwable arg0) {
LOG.error("Fatal Error recieved by AM: " + arg0);
done = true;
private class NMCallbackHandler implements NMClientAsync.CallbackHandler {
private ConcurrentMap containers = new ConcurrentHashMap();
public void addContainer(ContainerId containerId, Container container) {
containers.putIfAbsent(containerId, container);
public void onContainerStopped(ContainerId containerId) {
if (LOG.isDebugEnabled()) {
LOG.debug("Succeeded to stop Container " + containerId);
public void onContainerStatusReceived(ContainerId containerId, ContainerStatus containerStatus) {
if (LOG.isDebugEnabled()) {
LOG.debug("Container Status: id=" + containerId + ", status=" + containerStatus);
public void onContainerStarted(ContainerId containerId, Map allServiceResponse) {
if (LOG.isDebugEnabled()) {
LOG.debug("Succeeded to start Container " + containerId);
Container container = containers.get(containerId);
if (container != null) {
nmClientAsync.getContainerStatusAsync(containerId, container.getNodeId());
public void onStartContainerError(ContainerId containerId, Throwable t) {
LOG.error("Failed to start Container " + containerId);
public void onGetContainerStatusError(ContainerId containerId, Throwable t) {
LOG.error("Failed to query the status of Container " + containerId);
public void onStopContainerError(ContainerId containerId, Throwable t) {
LOG.error("Failed to stop Container " + containerId);
/**
 * Thread to connect to the {@link ContainerManagementProtocol} and launch the container
 * that will execute the shell command.
 */
// Runnable that builds the launch context for one allocated container and asks the NM client
// to start it; one instance is created per allocated container in onContainersAllocated.
private class LaunchAsterixContainer implements Runnable {
// Allocated container
final Container container;
// Handler that is told about the container (via addContainer) just before it is started
final NMCallbackHandler containerListener;
/**
 * @param lcontainer
 *            Allocated container
 * @param containerListener
 *            Callback handler of the container
 */
public LaunchAsterixContainer(Container lcontainer, NMCallbackHandler containerListener) {
    // Both references are kept as-is; run() uses them when the thread starts
    this.container = lcontainer;
    this.containerListener = containerListener;
}
/**
 * Connects to CM, sets up container launch context
 * for shell command and eventually dispatches the container
 * start request to the CM.
 */
// Builds the launch context for this container: environment with the classpath, then the
// command list chosen by the AM's mode (obliterate, backup, restore or normal start), and
// finally registers the container with the listener and asks the NM client to start it.
// NOTE(review): in this copy ctx never receives the local resources, environment or commands,
// the classpath loop header is cut off and its body is missing -- confirm against upstream.
public void run() {
LOG.info("Setting up container launch container for containerid=" + container.getId());
ContainerLaunchContext ctx = Records.newRecord(ContainerLaunchContext.class);
// Set the local resources
//Set the env variables to be setup in the env where the application master will be run
LOG.info("Set the environment for the node");
Map env = new HashMap();
// Add AppMaster.jar location to classpath
// At some point we should not be required to add
// the hadoop specific classpaths to the env.
// It should be provided out of the box.
// For now setting all required classpaths including
// the classpath to "." for the application jar
StringBuilder classPathEnv = new StringBuilder(Environment.CLASSPATH.$()).append(File.pathSeparatorChar)
.append("." + File.pathSeparatorChar + "*");
for (String c : conf.getStrings(YarnConfiguration.YARN_APPLICATION_CLASSPATH,
// add the runtime classpath needed for tests to work
if (conf.getBoolean(YarnConfiguration.IS_MINI_YARN_CLUSTER, false)) {
env.put("HADOOP_CONF_DIR", System.getProperty("user.dir") + File.separator + "target" + File.separator);
env.put("CLASSPATH", classPathEnv.toString());
// Pick the command builder that matches the mode flags set in setArgs()
List startCmd = null;
if (obliterate) {
LOG.debug("AM in obliterate mode");
startCmd = produceObliterateCommand(container);
} else if (backup) {
startCmd = produceBackupCommand(container);
LOG.debug("AM in backup mode");
} else if (restore) {
startCmd = produceRestoreCommand(container);
LOG.debug("AM in restore mode");
} else {
startCmd = produceStartCmd(container);
// A null or empty command list means the container could not be mapped to a node
if (startCmd == null || startCmd.size() == 0) {
LOG.fatal("Could not map one or more NCs to NM container hosts- aborting!");
// NOTE(review): nothing after the log call is visible; presumably the method returns here
for (String s : startCmd) {
LOG.info("Command to execute: " + s);
containerListener.addContainer(container.getId(), container);
//finally start the container!?
nmClientAsync.startContainerAsync(container, ctx);
/**
 * Determines for a given container what the necessary command line
 * arguments are to start the Asterix processes on that instance
 *
 * @param container
 *            The container to produce the commands for
 * @return A list of the commands that should be executed
 */
// Assembles the java command line for this container: CC arguments when the container is the
// CC host and the CC has not been started yet, otherwise NC arguments taken from the matching
// node of the cluster description, followed by stdout/stderr redirection.
// NOTE(review): in this copy the joined command is never added to the returned list, no main
// class or java options are appended and ccStarted is never set -- confirm against upstream.
private List produceStartCmd(Container container) {
List commands = new ArrayList();
// Set the necessary command to execute on the allocated container
List vargs = new ArrayList(5);
vargs.add(JAVA_HOME + File.separator + "bin" + File.separator + "java");
vargs.add("-classpath " + '\'' + ASTERIX_ZIP_NAME + File.separator + "repo" + File.separator + "*\'");
vargs.add("-Dapp.repo=" + ASTERIX_ZIP_NAME + File.separator + "repo" + File.separator);
//first see if this node is the CC
if (containerIsCC(container) && (ccStarted.get() == false)) {
LOG.info("CC found on container" + container.getNodeId().getHost());
//get our java opts
vargs.add("-app-cc-main-class org.apache.asterix.hyracks.bootstrap.CCApplicationEntryPoint");
vargs.add("-cluster-net-ip-address " + cC.getClusterIp());
vargs.add("-client-net-ip-address " + cC.getClientIp());
} else {
//now we need to know what node we are on, so we can apply the correct properties
Node local;
try {
local = containerToNode(container, clusterDesc);
LOG.info("Attempting to start NC on host " + local.getId());
// Node-level iodevices and store settings win; the cluster-level values are the fallback
String iodevice = local.getIodevices();
if (iodevice == null) {
iodevice = clusterDesc.getIodevices();
String storageSuffix = local.getStore() == null ? clusterDesc.getStore() : local.getStore();
String storagePath = iodevice + File.separator + storageSuffix;
vargs.add("-app-nc-main-class org.apache.asterix.hyracks.bootstrap.NCApplicationEntryPoint");
vargs.add("-node-id " + local.getId());
vargs.add("-cc-host " + cC.getClusterIp());
vargs.add("-iodevices " + storagePath);
vargs.add("-cluster-net-ip-address " + local.getClusterIp());
vargs.add("-data-ip-address " + local.getClusterIp());
vargs.add("-result-ip-address " + local.getClusterIp());
// NOTE(review): added unconditionally in the visible text; presumably guarded by the
// initial flag -- confirm
vargs.add("-initial-run ");
} catch (UnknownHostException e) {
LOG.error("Unable to find NC or CC configured for host: " + container.getId() + " " + e);
// Add log redirect params
vargs.add("1>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + File.separator + "stdout");
vargs.add("2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + File.separator + "stderr");
// Get final commmand
StringBuilder command = new StringBuilder();
for (CharSequence str : vargs) {
command.append(str).append(" ");
return commands;
/**
* Builds the command line used when the AM runs in obliterate mode.
*
* The iodevice list is taken from the node that containerToNode resolves for the
* container, falling back to clusterDesc.getIodevices() when the node's value is null.
* Each iodevice contributes one argument of the form iodevice + separator + store name,
* and the first iodevice additionally contributes the transaction log location.
*
* NOTE(review): this listing appears to have lost lines (closing braces are absent and
* nothing visible adds the joined command to `commands`) - confirm against the
* canonical source before editing.
*
* @param container
* The container to produce the command for
* @return null when the node lookup throws UnknownHostException for a container that is
* not the CC, a list holding one empty string when it throws for the CC, and
* otherwise the `commands` list
*/
private List produceObliterateCommand(Container container) {
//if this container has no NCs on it, nothing will be there to delete.
Node local = null;
List iodevices = null;
try {
local = containerToNode(container, clusterDesc);
// A negative split limit keeps trailing empty strings in the result.
if (local.getIodevices() == null) {
iodevices = Arrays.asList(clusterDesc.getIodevices().split(",", -1));
} else {
iodevices = Arrays.asList(local.getIodevices().split(",", -1));
} catch (UnknownHostException e) {
//we expect this may happen for the CC if it isn't colocated with an NC. otherwise it is not supposed to happen.
if (!containerIsCC(container)) {
// NOTE(review): the container id and the exception text are concatenated with no separator.
LOG.error("Unable to find NC configured for host: " + container.getId() + e);
return null;
else {
return Arrays.asList("");
// The classpath is just "*".
StringBuilder classPathEnv = new StringBuilder("").append("*");
List commands = new ArrayList();
Vector vargs = new Vector(5);
vargs.add(JAVA_HOME + File.separator + "bin" + File.separator + "java");
vargs.add("-cp " + classPathEnv.toString());
// NOTE(review): no main-class argument is visible between "-cp" and the paths below;
// presumably a line was lost from this listing - confirm.
for (String s : iodevices) {
vargs.add(s + File.separator + clusterDesc.getStore());
LOG.debug("Deleting from: " + s);
//logs only exist on 1st iodevice
// indexOf(s) == 0 holds for the first entry, and also for any later entry equal to it.
if (iodevices.indexOf(s) == 0) {
// No separator is inserted before "txnLogs"; presumably getTxnLogDir() already ends
// with one - TODO confirm.
vargs.add(clusterDesc.getTxnLogDir() + "txnLogs" + File.separator);
LOG.debug("Deleting logs from: " + clusterDesc.getTxnLogDir());
// Redirect stdout and stderr to files under the log directory expansion variable.
vargs.add("1>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + File.separator + "stdout");
vargs.add("2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + File.separator + "stderr");
StringBuilder command = new StringBuilder();
// Join the fragments with single spaces.
for (CharSequence str : vargs) {
command.append(str).append(" ");
return commands;
/**
* Builds the command line used when the AM runs in backup mode.
*
* The iodevice list is resolved the same way as in the obliterate and restore commands:
* from the node for this container, or from clusterDesc when the node's value is null.
* Each iodevice contributes one "source,destination" argument, where the destination is
* derived from instanceConfPath + "backups" + backupTimestamp + node id. The first
* iodevice additionally contributes the transaction log location.
*
* NOTE(review): this listing appears to have lost lines (closing braces, the body of the
* try block, the body of the inner loop that extends `dst`, and the statement that adds
* the joined command to `commands`) - confirm against the canonical source before editing.
*
* @param container
* The container to produce the command for
* @return null when the node lookup fails for a container that is not the CC or when the
* IOException handler is reached, a list holding one empty string when the lookup
* fails for the CC, and otherwise the `commands` list
*/
private List produceBackupCommand(Container container) {
Node local = null;
List iodevices = null;
try {
local = containerToNode(container, clusterDesc);
// A negative split limit keeps trailing empty strings in the result.
if (local.getIodevices() == null) {
iodevices = Arrays.asList(clusterDesc.getIodevices().split(",", -1));
} else {
iodevices = Arrays.asList(local.getIodevices().split(",", -1));
} catch (UnknownHostException e) {
//we expect this may happen for the CC if it isn't colocated with an NC. otherwise it is not supposed to happen.
if (!containerIsCC(container)) {
// NOTE(review): the container id and the exception text are concatenated with no separator.
LOG.error("Unable to find NC configured for host: " + container.getId() + e);
return null;
}else {
return Arrays.asList("");
// Classpath starts with "./*"; the loop below iterates the configured YARN application
// classpath (its continuation line and body are not visible here), and
// "./log4j.properties" is appended last.
StringBuilder classPathEnv = new StringBuilder("").append("." + File.separator + "*");
for (String c : conf.getStrings(YarnConfiguration.YARN_APPLICATION_CLASSPATH,
classPathEnv.append(File.pathSeparatorChar).append("." + File.separator + "log4j.properties");
List commands = new ArrayList();
Vector vargs = new Vector(5);
vargs.add(JAVA_HOME + File.separator + "bin" + File.separator + "java");
vargs.add("-cp " + classPathEnv.toString());
String dstBase = instanceConfPath + "backups" + Path.SEPARATOR + backupTimestamp + Path.SEPARATOR
+ local.getId();
// NOTE(review): the try body is empty in this listing; presumably it creates dstBase
// (createBackupFolder throws IOException) - confirm.
try {
} catch (IOException e) {
//something very bad happened- return null to cause attempt to abort
return null;
for (String s : iodevices) {
// Split the iodevice path on "/" so its components can be used to build the destination.
List ioComponents = Arrays.asList(s.split("\\/"));
StringBuilder dst = new StringBuilder().append(dstBase);
for (String io : ioComponents) {
// True for every component except the last; indexOf returns the first match, so a
// repeated component equal to the last one is also treated as non-final.
if (ioComponents.indexOf(io) != ioComponents.size() - 1) {
// Argument format is "source,destination".
vargs.add(s + File.separator + clusterDesc.getStore() + "," + dst);
LOG.debug("Backing up from: " + s);
//logs only exist on 1st iodevice
if (iodevices.indexOf(s) == 0) {
LOG.debug("Backing up logs from: " + clusterDesc.getTxnLogDir());
// No separator is inserted before "txnLogs"; presumably getTxnLogDir() already ends
// with one - TODO confirm.
vargs.add(clusterDesc.getTxnLogDir() + "txnLogs" + File.separator + "," + dst);
// NOTE(review): this message omits backupTimestamp, which dstBase includes.
LOG.debug("Backing up to: " + instanceConfPath + "backups" + Path.SEPARATOR + local.getId());
// Redirect stdout and stderr to files under the log directory expansion variable.
vargs.add("1>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + File.separator + "stdout");
vargs.add("2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + File.separator + "stderr");
StringBuilder command = new StringBuilder();
// Join the fragments with single spaces.
for (CharSequence str : vargs) {
command.append(str).append(" ");
return commands;
/**
* Obtains the FileSystem for `conf` and wraps the given path string in a Path.
*
* NOTE(review): only these two statements are visible in this listing; presumably the
* folder is then created on that file system - confirm against the canonical source.
*
* @param path
* Location of the backup folder, as a string
* @throws IOException
* declared by this method; FileSystem.get(conf) is the visible call that can raise it
*/
private void createBackupFolder(String path) throws IOException {
FileSystem fs = FileSystem.get(conf);
Path backupFolder = new Path(path);
/**
* Builds the command line used when the AM runs in restore mode.
*
* A CC container gets the `blank` list straight away. For other containers the iodevice
* list is resolved from the node for this container, or from clusterDesc when the node's
* value is null. The backup source is derived from instanceConfPath + "backups" + the
* numeric value of snapName + node id. Each entry listed under the source contributes a
* "source,destination" argument, except entries whose path contains "txnLogs" or the
* "asterix_root_metadata" directory; the first iodevice also restores the transaction logs.
*
* NOTE(review): this listing appears to have lost lines (closing braces, the body of the
* inner loop that extends `src`, and the statement that adds the joined command to
* `commands`) - confirm against the canonical source before editing.
*
* @param container
* The container to produce the command for
* @return the `blank` list for the CC, null when the node lookup fails for a container
* that is not the CC, and otherwise the `commands` list
*/
private List produceRestoreCommand(Container container) {
if (containerIsCC(container)) {
List blank = new ArrayList();
return blank;
Node local = null;
List iodevices = null;
try {
local = containerToNode(container, clusterDesc);
// A negative split limit keeps trailing empty strings in the result.
if (local.getIodevices() == null) {
iodevices = Arrays.asList(clusterDesc.getIodevices().split(",", -1));
} else {
iodevices = Arrays.asList(local.getIodevices().split(",", -1));
} catch (UnknownHostException e) {
//we expect this may happen for the CC if it isn't colocated with an NC. otherwise it is not supposed to happen.
// NOTE(review): the CC case already returned at the top of the method, so the else
// branch below looks unreachable as listed - confirm.
if (!containerIsCC(container)) {
LOG.error("Unable to find NC configured for host: " + container.getId() + e);
return null;
} else {
return Arrays.asList("");
// Classpath starts with "./*"; the loop below iterates the configured YARN application
// classpath (its continuation line and body are not visible here), and
// "./log4j.properties" is appended last.
StringBuilder classPathEnv = new StringBuilder("").append("." + File.separator + "*");
for (String c : conf.getStrings(YarnConfiguration.YARN_APPLICATION_CLASSPATH,
classPathEnv.append(File.pathSeparatorChar).append("." + File.separator + "log4j.properties");
List commands = new ArrayList();
Vector vargs = new Vector(5);
vargs.add(JAVA_HOME + File.separator + "bin" + File.separator + "java");
vargs.add("-cp " + classPathEnv.toString());
// Long.parseLong throws NumberFormatException when snapName is not a parsable long;
// no handler for it is visible in this method.
String srcBase = instanceConfPath + "backups" + Path.SEPARATOR + Long.parseLong(snapName) + Path.SEPARATOR
+ local.getId();
for (String s : iodevices) {
// Split the iodevice path on "/" so its components can be used to build the source.
List ioComponents = Arrays.asList(s.split("\\/"));
StringBuilder src = new StringBuilder().append(srcBase);
for (String io : ioComponents) {
// True for every component except the last (indexOf returns the first match).
if (ioComponents.indexOf(io) != ioComponents.size() - 1) {
try {
FileSystem fs = FileSystem.get(conf);
FileStatus[] backups = fs.listStatus(new Path(src.toString()));
for (FileStatus b : backups) {
// Skip transaction logs and the root metadata directory in this pass.
if (!b.getPath().toString().contains("txnLogs")
&& !b.getPath().toString().contains(File.separator + "asterix_root_metadata")) {
vargs.add(b.getPath() + "," + s + File.separator + clusterDesc.getStore());
} catch (IOException e) {
// The exception itself is not included in the log message.
LOG.error("Could not stat backup directory in DFS");
// NOTE(review): unlike the argument built inside the loop above, no File.separator is
// placed between s and the store name here - confirm whether that is intended.
vargs.add(src + "," + s + clusterDesc.getStore());
LOG.debug("Restoring from: " + s);
//logs only exist on 1st iodevice
if (iodevices.indexOf(s) == 0) {
vargs.add(src + "txnLogs" + File.separator + "," + clusterDesc.getTxnLogDir() + File.separator);
LOG.debug("Restoring logs from: " + clusterDesc.getTxnLogDir());
// NOTE(review): this message omits the snapshot number, which srcBase includes.
LOG.debug("Restoring to: " + instanceConfPath + "backups" + Path.SEPARATOR + local.getId());
// Redirect stdout and stderr to files under the log directory expansion variable.
vargs.add("1>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + File.separator + "stdout");
vargs.add("2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + File.separator + "stderr");
StringBuilder command = new StringBuilder();
// Join the fragments with single spaces.
for (CharSequence str : vargs) {
command.append(str).append(" ");
return commands;
© 2015 - 2025 Weber Informatics LLC | Privacy Policy