Please wait. This can take some minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price only 1 $
You can buy this project and download/modify it how often you want.
io.cdap.cdap.internal.app.services.ProgramNotificationSubscriberService Maven / Gradle / Ivy
/*
* Copyright © 2017-2019 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package io.cdap.cdap.internal.app.services;
import com.google.common.collect.ImmutableMap;
import com.google.common.util.concurrent.AbstractIdleService;
import com.google.common.util.concurrent.Service;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.JsonSyntaxException;
import com.google.gson.reflect.TypeToken;
import com.google.inject.Inject;
import io.cdap.cdap.api.metrics.MetricsCollectionService;
import io.cdap.cdap.api.metrics.MetricsContext;
import io.cdap.cdap.api.schedule.SchedulableProgramType;
import io.cdap.cdap.api.workflow.ScheduleProgramInfo;
import io.cdap.cdap.api.workflow.WorkflowActionNode;
import io.cdap.cdap.api.workflow.WorkflowNode;
import io.cdap.cdap.api.workflow.WorkflowSpecification;
import io.cdap.cdap.app.program.ProgramDescriptor;
import io.cdap.cdap.app.runtime.ProgramOptions;
import io.cdap.cdap.app.runtime.ProgramStateWriter;
import io.cdap.cdap.app.store.Store;
import io.cdap.cdap.common.app.RunIds;
import io.cdap.cdap.common.conf.CConfiguration;
import io.cdap.cdap.common.conf.Constants;
import io.cdap.cdap.common.service.Retries;
import io.cdap.cdap.common.service.RetryStrategies;
import io.cdap.cdap.common.service.RetryStrategy;
import io.cdap.cdap.common.utils.ImmutablePair;
import io.cdap.cdap.common.utils.ProjectInfo;
import io.cdap.cdap.internal.app.ApplicationSpecificationAdapter;
import io.cdap.cdap.internal.app.program.ProgramStatePublisher;
import io.cdap.cdap.internal.app.runtime.BasicArguments;
import io.cdap.cdap.internal.app.runtime.ProgramOptionConstants;
import io.cdap.cdap.internal.app.runtime.ProgramRunners;
import io.cdap.cdap.internal.app.runtime.SimpleProgramOptions;
import io.cdap.cdap.internal.app.runtime.SystemArguments;
import io.cdap.cdap.internal.app.store.AppMetadataStore;
import io.cdap.cdap.internal.app.store.RunRecordDetail;
import io.cdap.cdap.internal.app.store.RunRecordDetailWithExistingStatus;
import io.cdap.cdap.internal.provision.ProvisionRequest;
import io.cdap.cdap.internal.provision.ProvisionerNotifier;
import io.cdap.cdap.internal.provision.ProvisioningService;
import io.cdap.cdap.messaging.spi.MessagingService;
import io.cdap.cdap.proto.BasicThrowable;
import io.cdap.cdap.proto.Notification;
import io.cdap.cdap.proto.ProgramRunClusterStatus;
import io.cdap.cdap.proto.ProgramRunStatus;
import io.cdap.cdap.proto.ProgramType;
import io.cdap.cdap.proto.id.ApplicationId;
import io.cdap.cdap.proto.id.ArtifactId;
import io.cdap.cdap.proto.id.NamespaceId;
import io.cdap.cdap.proto.id.ProfileId;
import io.cdap.cdap.proto.id.ProgramId;
import io.cdap.cdap.proto.id.ProgramRunId;
import io.cdap.cdap.reporting.ProgramHeartbeatTable;
import io.cdap.cdap.runtime.spi.provisioner.Cluster;
import io.cdap.cdap.security.spi.authentication.SecurityRequestContext;
import io.cdap.cdap.spi.data.StructuredTableContext;
import io.cdap.cdap.spi.data.TableNotFoundException;
import io.cdap.cdap.spi.data.transaction.TransactionRunner;
import java.io.IOException;
import java.lang.reflect.Type;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Queue;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.stream.IntStream;
import javax.annotation.Nullable;
import org.apache.twill.internal.CompositeService;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Service that creates children services, each to handle a single partition of program status
* events topic
*/
public class ProgramNotificationSubscriberService extends AbstractIdleService {
private final MessagingService messagingService;
private final CConfiguration cConf;
private final MetricsCollectionService metricsCollectionService;
private final ProvisionerNotifier provisionerNotifier;
private final ProgramLifecycleService programLifecycleService;
private final ProvisioningService provisioningService;
private final ProgramStateWriter programStateWriter;
private final TransactionRunner transactionRunner;
private final Store store;
private final RunRecordMonitorService runRecordMonitorService;
private Service delegate;
private Set programCompletionNotifiers;
@Inject
ProgramNotificationSubscriberService(
MessagingService messagingService,
CConfiguration cConf,
MetricsCollectionService metricsCollectionService,
ProvisionerNotifier provisionerNotifier,
ProgramLifecycleService programLifecycleService,
ProvisioningService provisioningService,
ProgramStateWriter programStateWriter,
TransactionRunner transactionRunner,
Store store,
RunRecordMonitorService runRecordMonitorService) {
this.messagingService = messagingService;
this.cConf = cConf;
this.metricsCollectionService = metricsCollectionService;
this.provisionerNotifier = provisionerNotifier;
this.programLifecycleService = programLifecycleService;
this.provisioningService = provisioningService;
this.programStateWriter = programStateWriter;
this.transactionRunner = transactionRunner;
this.store = store;
this.runRecordMonitorService = runRecordMonitorService;
this.programCompletionNotifiers = Collections.emptySet();
}
@Override
protected void startUp() throws Exception {
List children = new ArrayList<>();
String topicPrefix = cConf.get(Constants.AppFabric.PROGRAM_STATUS_EVENT_TOPIC);
int numPartitions = cConf.getInt(Constants.AppFabric.PROGRAM_STATUS_EVENT_NUM_PARTITIONS);
// Add bare one - we always listen to it
children.add(createChildService("program.status", topicPrefix));
// If number of partitions is more than 1 - create partitioned services
if (numPartitions > 1) {
IntStream.range(0, numPartitions)
.forEach(i -> children.add(createChildService("program.status." + i, topicPrefix + i)));
}
delegate = new CompositeService(children);
delegate.startAndWait();
}
@Override
protected void shutDown() throws Exception {
delegate.stopAndWait();
}
@Inject(optional = true)
void setProgramCompletionNotifiers(Set notifiers) {
this.programCompletionNotifiers = notifiers;
}
private ProgramNotificationSingleTopicSubscriberService createChildService(
String name, String topicName) {
return new ProgramNotificationSingleTopicSubscriberService(
messagingService,
cConf,
metricsCollectionService,
provisionerNotifier,
programLifecycleService,
provisioningService,
programStateWriter,
transactionRunner,
store,
runRecordMonitorService,
name,
topicName,
programCompletionNotifiers);
}
}
/**
* Service that receives program status notifications from a single topic and persists to the store.
* No transactions should be started in any of the overrided methods since they are already wrapped
* in a transaction.
*/
class ProgramNotificationSingleTopicSubscriberService
extends AbstractNotificationSubscriberService {
private static final Logger LOG =
LoggerFactory.getLogger(ProgramNotificationSubscriberService.class);
private static final Gson GSON =
ApplicationSpecificationAdapter.addTypeAdapters(new GsonBuilder()).create();
private static final Type STRING_STRING_MAP = new TypeToken>() {}.getType();
private static final String CDAP_VERSION = "cdap.version";
private static final Map STATUS_METRICS_NAME =
ImmutableMap.of(
ProgramRunStatus.COMPLETED, Constants.Metrics.Program.PROGRAM_COMPLETED_RUNS,
ProgramRunStatus.KILLED, Constants.Metrics.Program.PROGRAM_KILLED_RUNS,
ProgramRunStatus.FAILED, Constants.Metrics.Program.PROGRAM_FAILED_RUNS,
ProgramRunStatus.REJECTED, Constants.Metrics.Program.PROGRAM_REJECTED_RUNS);
private static final Map WORKFLOW_INNER_PROGRAM_TYPES =
ImmutableMap.of(
SchedulableProgramType.MAPREDUCE, ProgramType.MAPREDUCE,
SchedulableProgramType.SPARK, ProgramType.SPARK);
private final String recordedProgramStatusPublishTopic;
private final ProvisionerNotifier provisionerNotifier;
private final ProgramLifecycleService programLifecycleService;
private final ProvisioningService provisioningService;
private final ProgramStateWriter programStateWriter;
private final Queue tasks;
private final MetricsCollectionService metricsCollectionService;
private Set programCompletionNotifiers;
private final CConfiguration cConf;
private final Store store;
private final RunRecordMonitorService runRecordMonitorService;
private final boolean checkTxSeparation;
ProgramNotificationSingleTopicSubscriberService(
MessagingService messagingService,
CConfiguration cConf,
MetricsCollectionService metricsCollectionService,
ProvisionerNotifier provisionerNotifier,
ProgramLifecycleService programLifecycleService,
ProvisioningService provisioningService,
ProgramStateWriter programStateWriter,
TransactionRunner transactionRunner,
Store store,
RunRecordMonitorService runRecordMonitorService,
String name,
String topicName,
Set programCompletionNotifiers) {
super(
name,
cConf,
topicName,
cConf.getInt(Constants.AppFabric.STATUS_EVENT_FETCH_SIZE),
cConf.getLong(Constants.AppFabric.STATUS_EVENT_POLL_DELAY_MILLIS),
messagingService,
metricsCollectionService,
transactionRunner,
cConf.getInt(Constants.AppFabric.STATUS_EVENT_TX_SIZE));
this.recordedProgramStatusPublishTopic =
cConf.get(Constants.AppFabric.PROGRAM_STATUS_RECORD_EVENT_TOPIC);
this.provisionerNotifier = provisionerNotifier;
this.programLifecycleService = programLifecycleService;
this.provisioningService = provisioningService;
this.programStateWriter = programStateWriter;
this.tasks = new LinkedList<>();
this.metricsCollectionService = metricsCollectionService;
this.programCompletionNotifiers = programCompletionNotifiers;
this.runRecordMonitorService = runRecordMonitorService;
this.cConf = cConf;
this.store = store;
// If number of partitions equals 1, DB deadlock cannot happen as a result of concurrent
// modifications to
// programCountsTable
this.checkTxSeparation =
cConf.getInt(Constants.AppFabric.PROGRAM_STATUS_EVENT_NUM_PARTITIONS) > 1
&& cConf.getBoolean(Constants.AppFabric.PROGRAM_STATUS_EVENT_TX_SEPARATION);
}
@Override
protected void doStartUp() throws Exception {
super.doStartUp();
int batchSize = cConf.getInt(Constants.RuntimeMonitor.INIT_BATCH_SIZE);
RetryStrategy retryStrategy =
RetryStrategies.fromConfiguration(cConf, Constants.Service.RUNTIME_MONITOR_RETRY_PREFIX);
long startTs = System.currentTimeMillis();
AtomicBoolean launching = new AtomicBoolean(false);
Retries.runWithRetries(
() ->
store.scanActiveRuns(
batchSize,
(runRecordDetail) -> {
if (runRecordDetail.getStartTs() > startTs) {
return;
}
try {
if (runRecordDetail.getStatus() == ProgramRunStatus.PENDING) {
launching.set(true);
runRecordMonitorService.addRequest(runRecordDetail.getProgramRunId());
} else if (runRecordDetail.getStatus() == ProgramRunStatus.STARTING) {
launching.set(true);
runRecordMonitorService.addRequest(runRecordDetail.getProgramRunId());
// It is unknown what is the state of program runs in STARTING state.
// A STARTING message is published again to retry STARTING logic.
ProgramOptions programOptions =
new SimpleProgramOptions(
runRecordDetail.getProgramRunId().getParent(),
new BasicArguments(runRecordDetail.getSystemArgs()),
new BasicArguments(runRecordDetail.getUserArgs()));
LOG.debug("Retrying to start run {}.", runRecordDetail.getProgramRunId());
programStateWriter.start(
runRecordDetail.getProgramRunId(),
programOptions,
null,
this.store.loadProgram(runRecordDetail.getProgramRunId().getParent()));
}
} catch (Exception e) {
ProgramRunId programRunId = runRecordDetail.getProgramRunId();
LOG.warn(
"Retrying to start run {} failed. Marking it as failed.", programRunId, e);
programStateWriter.error(programRunId, e);
}
}),
retryStrategy,
e -> true);
if (!launching.get()) {
// there is no launching pipeline
runRecordMonitorService.emitLaunchingMetrics(0);
}
}
@Nullable
@Override
protected String loadMessageId(StructuredTableContext context)
throws IOException, TableNotFoundException {
return getAppMetadataStore(context).retrieveSubscriberState(getTopicId().getTopic(), "");
}
@Override
protected void storeMessageId(StructuredTableContext context, String messageId)
throws IOException, TableNotFoundException {
getAppMetadataStore(context).persistSubscriberState(getTopicId().getTopic(), "", messageId);
}
@Override
protected boolean shouldRunInSeparateTx(ImmutablePair message) {
if (!checkTxSeparation) {
return false;
}
String clusterStatusStr =
message.getSecond().getProperties().get(ProgramOptionConstants.CLUSTER_STATUS);
if (clusterStatusStr != null
&& ProgramRunClusterStatus.valueOf(clusterStatusStr)
== ProgramRunClusterStatus.PROVISIONING) {
// PROVISIONING notifications result in increment in programCountsTable which may result in DB
// deadlock
// under high load
return true;
}
return false;
}
@Override
protected void processMessages(
StructuredTableContext structuredTableContext,
Iterator> messages)
throws Exception {
ProgramHeartbeatTable heartbeatDataset = new ProgramHeartbeatTable(structuredTableContext);
List tasks = new LinkedList<>();
while (messages.hasNext()) {
ImmutablePair messagePair = messages.next();
List runnables =
processNotification(
heartbeatDataset,
messagePair.getFirst().getBytes(StandardCharsets.UTF_8),
messagePair.getSecond(),
structuredTableContext);
tasks.addAll(runnables);
}
// Only add post processing tasks if all messages are processed. If there is exception in the
// processNotifiation,
// messages will be replayed.
this.tasks.addAll(tasks);
}
@Override
protected void postProcess() {
Runnable task = tasks.poll();
while (task != null) {
task.run();
task = tasks.poll();
}
}
/**
* Process a {@link Notification} received from TMS.
*
* @param programHeartbeatTable the {@link ProgramHeartbeatTable} for writing heart beats and
* program status
* @param messageIdBytes the raw message id in the TMS for the notification
* @param notification the {@link Notification} to process
* @param context context to get the table for operations
* @return a {@link List} of {@link Runnable} tasks to run after the transactional processing of
* the whole messages batch is completed
* @throws Exception if failed to process the given notification
*/
private List processNotification(
ProgramHeartbeatTable programHeartbeatTable,
byte[] messageIdBytes,
Notification notification,
StructuredTableContext context)
throws Exception {
AppMetadataStore appMetadataStore = AppMetadataStore.create(context);
Map properties = notification.getProperties();
// Required parameters
String programRun = properties.get(ProgramOptionConstants.PROGRAM_RUN_ID);
String programStatusStr = properties.get(ProgramOptionConstants.PROGRAM_STATUS);
String clusterStatusStr = properties.get(ProgramOptionConstants.CLUSTER_STATUS);
// Ignore notifications which specify an invalid ProgramRunId, which shouldn't happen
if (programRun == null) {
LOG.warn("Ignore notification that misses program run state information, {}", notification);
return Collections.emptyList();
}
ProgramRunId programRunId = GSON.fromJson(programRun, ProgramRunId.class);
ProgramRunStatus programRunStatus = null;
if (programStatusStr != null) {
try {
programRunStatus = ProgramRunStatus.valueOf(programStatusStr);
} catch (IllegalArgumentException e) {
LOG.warn(
"Ignore notification with invalid program run status {} for program {}, {}",
programStatusStr,
programRun,
notification);
return Collections.emptyList();
}
}
ProgramRunClusterStatus clusterStatus = null;
if (clusterStatusStr != null) {
try {
clusterStatus = ProgramRunClusterStatus.valueOf(clusterStatusStr);
} catch (IllegalArgumentException e) {
LOG.warn(
"Ignore notification with invalid program run cluster status {} for program {}",
clusterStatusStr,
programRun);
return Collections.emptyList();
}
}
if (notification.getNotificationType().equals(Notification.Type.PROGRAM_HEART_BEAT)) {
RunRecordDetail runRecordMeta = appMetadataStore.getRun(programRunId);
long heartBeatTimeInSeconds =
TimeUnit.MILLISECONDS.toSeconds(
Long.parseLong(properties.get(ProgramOptionConstants.HEART_BEAT_TIME)));
writeToHeartBeatTable(runRecordMeta, heartBeatTimeInSeconds, programHeartbeatTable);
// we can return after writing to heart beat table
return Collections.emptyList();
}
List result = new ArrayList<>();
if (programRunStatus != null) {
handleProgramEvent(
programRunId,
programRunStatus,
notification,
messageIdBytes,
appMetadataStore,
programHeartbeatTable,
result);
}
if (clusterStatus == null) {
return result;
}
handleClusterEvent(
programRunId, clusterStatus, notification, messageIdBytes, appMetadataStore, context)
.ifPresent(result::add);
return result;
}
private void handleProgramEvent(
ProgramRunId programRunId,
ProgramRunStatus programRunStatus,
Notification notification,
byte[] messageIdBytes,
AppMetadataStore appMetadataStore,
ProgramHeartbeatTable programHeartbeatTable,
List runnables)
throws Exception {
LOG.trace("Processing program status notification: {}", notification);
Map properties = notification.getProperties();
String twillRunId = notification.getProperties().get(ProgramOptionConstants.TWILL_RUN_ID);
RunRecordDetail recordedRunRecord;
switch (programRunStatus) {
case STARTING:
try {
RunRecordDetail runRecordDetail = appMetadataStore.getRun(programRunId);
if (runRecordDetail != null
&& runRecordDetail.getStatus() != ProgramRunStatus.PENDING
&& runRecordDetail.getStatus() != ProgramRunStatus.STARTING) {
// This is an invalid state transition happening. Valid state transitions are:
// PENDING => STARTING : normal state transition
// STARTING => STARTING : state transition after app-fabric restart
LOG.debug(
"Ignoring unexpected request to transition program run {} from {} state to program "
+ "STARTING state.",
programRunId,
runRecordDetail.getStatus());
return;
}
} catch (IllegalStateException ex) {
LOG.error(
"Request to transition program run {} from non-existent state to program STARTING"
+ " state but multiple run IDs exist.",
programRunId);
}
String systemArgumentsString = properties.get(ProgramOptionConstants.SYSTEM_OVERRIDES);
Map systemArguments =
systemArgumentsString == null
? Collections.emptyMap()
: GSON.fromJson(systemArgumentsString, STRING_STRING_MAP);
ProgramOptions prgOptions = ProgramOptions.fromNotification(notification, GSON);
boolean programStartSkipped = ProgramStatePublisher.isProgramStartSkipped(systemArguments);
if (programStartSkipped) {
ArtifactId artifactId = ProgramStatePublisher.getArtifactId(GSON, properties);
appMetadataStore.recordProgramProvisioning(
programRunId,
prgOptions.getUserArguments().asMap(),
prgOptions.getArguments().asMap(),
messageIdBytes,
artifactId.toApiArtifactId());
appMetadataStore.recordProgramProvisioned(programRunId, 0, messageIdBytes);
} else {
ProgramDescriptor prgDescriptor =
GSON.fromJson(
properties.get(ProgramOptionConstants.PROGRAM_DESCRIPTOR),
ProgramDescriptor.class);
runnables.add(
() -> {
String oldUser = SecurityRequestContext.getUserId();
try {
SecurityRequestContext.setUserId(
prgOptions.getArguments().getOption(ProgramOptionConstants.USER_ID));
try {
programLifecycleService.startInternal(prgDescriptor, prgOptions, programRunId);
} catch (Exception e) {
LOG.error("Failed to start program {}", programRunId, e);
programStateWriter.error(programRunId, e);
}
} finally {
SecurityRequestContext.setUserId(oldUser);
}
});
}
recordedRunRecord =
appMetadataStore.recordProgramStart(
programRunId, twillRunId,
systemArguments, messageIdBytes);
writeToHeartBeatTable(
recordedRunRecord,
RunIds.getTime(programRunId.getRun(), TimeUnit.SECONDS),
programHeartbeatTable);
break;
case RUNNING:
long logicalStartTimeSecs =
getTimeSeconds(
notification.getProperties(), ProgramOptionConstants.LOGICAL_START_TIME, -1);
if (logicalStartTimeSecs == -1) {
LOG.warn(
"Ignore program running notification for program {} without {} specified, {}",
programRunId,
ProgramOptionConstants.LOGICAL_START_TIME,
notification);
return;
}
recordedRunRecord =
appMetadataStore.recordProgramRunning(
programRunId, logicalStartTimeSecs, twillRunId, messageIdBytes);
writeToHeartBeatTable(recordedRunRecord, logicalStartTimeSecs, programHeartbeatTable);
runRecordMonitorService.removeRequest(programRunId, true);
long startDelayTime =
logicalStartTimeSecs - RunIds.getTime(programRunId.getRun(), TimeUnit.SECONDS);
emitStartingTimeMetric(programRunId, startDelayTime, recordedRunRecord);
break;
case SUSPENDED:
long suspendTime =
getTimeSeconds(notification.getProperties(), ProgramOptionConstants.SUSPEND_TIME, -1);
// since we are adding suspend time recently, there might be old suspended notifications for
// which time
// can be -1.
recordedRunRecord =
appMetadataStore.recordProgramSuspend(programRunId, messageIdBytes, suspendTime);
writeToHeartBeatTable(recordedRunRecord, suspendTime, programHeartbeatTable);
break;
case RESUMING:
long resumeTime =
getTimeSeconds(notification.getProperties(), ProgramOptionConstants.RESUME_TIME, -1);
// since we are adding resume time recently, there might be old resumed notifications for
// which time
// can be -1.
recordedRunRecord =
appMetadataStore.recordProgramResumed(programRunId, messageIdBytes, resumeTime);
writeToHeartBeatTable(recordedRunRecord, resumeTime, programHeartbeatTable);
break;
case STOPPING:
Map notificationProperties = notification.getProperties();
long stoppingTsSecs =
getTimeSeconds(notificationProperties, ProgramOptionConstants.STOPPING_TIME, -1);
if (stoppingTsSecs == -1) {
LOG.warn(
"Ignore program stopping notification for program {} without {} specified, {}",
programRunId,
ProgramOptionConstants.STOPPING_TIME,
notification);
return;
}
long terminateTsSecs =
getTimeSeconds(
notificationProperties, ProgramOptionConstants.TERMINATE_TIME, Long.MAX_VALUE);
recordedRunRecord =
appMetadataStore.recordProgramStopping(
programRunId, messageIdBytes, stoppingTsSecs, terminateTsSecs);
writeToHeartBeatTable(recordedRunRecord, stoppingTsSecs, programHeartbeatTable);
break;
case COMPLETED:
case KILLED:
case FAILED:
recordedRunRecord =
handleProgramCompletion(
appMetadataStore,
programHeartbeatTable,
programRunId,
programRunStatus,
notification,
messageIdBytes,
runnables);
break;
case REJECTED:
ProgramOptions programOptions = ProgramOptions.fromNotification(notification, GSON);
ArtifactId artifactId = ProgramStatePublisher.getArtifactId(GSON, properties);
recordedRunRecord =
appMetadataStore.recordProgramRejected(
programRunId,
programOptions.getUserArguments().asMap(),
programOptions.getArguments().asMap(),
messageIdBytes,
artifactId.toApiArtifactId());
writeToHeartBeatTable(
recordedRunRecord,
RunIds.getTime(programRunId.getRun(), TimeUnit.SECONDS),
programHeartbeatTable);
getEmitMetricsRunnable(
programRunId,
recordedRunRecord,
Constants.Metrics.Program.PROGRAM_REJECTED_RUNS,
null)
.ifPresent(runnables::add);
runRecordMonitorService.removeRequest(programRunId, true);
break;
default:
// This should not happen
LOG.error(
"Unsupported program status {} for program {}, {}",
programRunStatus,
programRunId,
notification);
return;
}
if (recordedRunRecord != null) {
// We need to publish the message so that the trigger subscriber can pick it up and start the
// trigger if
// necessary
publishRecordedStatus(notification, programRunId, recordedRunRecord.getStatus());
// for any status that represents completion of a job that was actually started (excludes
// rejected jobs)
// publish the deprovisioning event(s).
if (programRunStatus.isEndState() && programRunStatus != ProgramRunStatus.REJECTED) {
if (ProgramStatePublisher.isProgramStartSkipped(recordedRunRecord.getSystemArgs())) {
appMetadataStore.recordProgramDeprovisioning(programRunId, messageIdBytes);
appMetadataStore.recordProgramDeprovisioned(programRunId, null, messageIdBytes);
} else {
provisionerNotifier.deprovisioning(programRunId);
}
}
}
}
/**
* Handles a program completion notification.
*
* @param appMetadataStore the {@link AppMetadataStore} to write the status to
* @param programHeartbeatTable the {@link ProgramHeartbeatTable} to write the status to
* @param programRunId the program run of the completed program
* @param programRunStatus the status of the completion
* @param notification the {@link Notification} that carries information about the program
* completion
* @param sourceId the source message id of the notification
* @param runnables a {@link List} adding {@link Runnable} to be executed after event handling is
* completed
* @return a {@link RunRecordDetail} that carries the result of updates to {@link
* AppMetadataStore}. If there is no update, {@code null} will be returned
* @throws Exception if failed to update program status
*/
@Nullable
private RunRecordDetail handleProgramCompletion(
AppMetadataStore appMetadataStore,
ProgramHeartbeatTable programHeartbeatTable,
ProgramRunId programRunId,
ProgramRunStatus programRunStatus,
Notification notification,
byte[] sourceId,
List runnables)
throws Exception {
Map properties = notification.getProperties();
long endTimeSecs = getTimeSeconds(properties, ProgramOptionConstants.END_TIME, -1);
if (endTimeSecs == -1) {
LOG.warn(
"Ignore program {} notification for program {} without end time specified, {}",
programRunStatus.name().toLowerCase(),
programRunId,
notification);
return null;
}
BasicThrowable failureCause =
decodeBasicThrowable(properties.get(ProgramOptionConstants.PROGRAM_ERROR));
// If it is a workflow, process the inner program states first
// We expect all inner program states has been received already before receiving the workflow
// state.
// If there is any states missing, it will be handled here.
if (programRunId.getType() == ProgramType.WORKFLOW) {
processWorkflowOnStop(
appMetadataStore,
programHeartbeatTable,
programRunId,
programRunStatus,
notification,
sourceId,
runnables);
}
RunRecordDetailWithExistingStatus recordedRunRecord =
appMetadataStore.recordProgramStop(
programRunId, endTimeSecs, programRunStatus, failureCause, sourceId);
if (recordedRunRecord != null) {
writeToHeartBeatTable(recordedRunRecord, endTimeSecs, programHeartbeatTable);
getEmitMetricsRunnable(
programRunId,
recordedRunRecord,
STATUS_METRICS_NAME.get(programRunStatus),
recordedRunRecord.getExistingStatus())
.ifPresent(runnables::add);
// emit program run time metric.
long runTime = endTimeSecs - RunIds.getTime(programRunId.getRun(), TimeUnit.SECONDS);
SystemArguments.getProfileIdFromArgs(
programRunId.getNamespaceId(), recordedRunRecord.getSystemArgs())
.ifPresent(
profileId -> {
emitRunTimeMetric(programRunId, programRunStatus, runTime, recordedRunRecord);
emitStoppingTimeMetric(programRunId, profileId, recordedRunRecord);
});
runnables.add(
() -> {
programCompletionNotifiers.forEach(
notifier ->
notifier.onProgramCompleted(programRunId, recordedRunRecord.getStatus()));
runRecordMonitorService.removeRequest(programRunId, true);
});
}
return recordedRunRecord;
}
/**
* On workflow program stop, inspects inner program states and adjust them if they are not in end
* state already.
*
* @param appMetadataStore the {@link AppMetadataStore} to write the status to
* @param programHeartbeatTable the {@link ProgramHeartbeatTable} to write the status to
* @param programRunId the program run of the completed program
* @param programRunStatus the status of the completion
* @param notification the {@link Notification} that carries information about the workflow
* completion
* @param sourceId the source message id of the notification
* @param runnables a {@link List} adding {@link Runnable} to be executed after event handling is
* completed
* @throws Exception if failed to update program status
*/
private void processWorkflowOnStop(
AppMetadataStore appMetadataStore,
ProgramHeartbeatTable programHeartbeatTable,
ProgramRunId programRunId,
ProgramRunStatus programRunStatus,
Notification notification,
byte[] sourceId,
List runnables)
throws Exception {
ApplicationId appId = programRunId.getParent().getParent();
WorkflowSpecification workflowSpec =
Optional.ofNullable(appMetadataStore.getApplication(appId))
.map(appMeta -> appMeta.getSpec().getWorkflows().get(programRunId.getProgram()))
.orElse(null);
// If cannot find the workflow spec (e.g. app deleted), then there is nothing we can do.
if (workflowSpec == null) {
return;
}
// For all MR and Spark nodes, we need to update the inner program run status if they are not in
// end state yet.
for (WorkflowNode workflowNode : workflowSpec.getNodeIdMap().values()) {
if (!(workflowNode instanceof WorkflowActionNode)) {
continue;
}
ScheduleProgramInfo programInfo = ((WorkflowActionNode) workflowNode).getProgram();
if (!WORKFLOW_INNER_PROGRAM_TYPES.containsKey(programInfo.getProgramType())) {
continue;
}
// Get all active runs of the inner program. If the parent workflow runId is the same as this
// one,
// set a terminal state for the inner program run.
ProgramId innerProgramId =
appId.program(
WORKFLOW_INNER_PROGRAM_TYPES.get(programInfo.getProgramType()),
programInfo.getProgramName());
Map innerProgramNotifications = new LinkedHashMap<>();
appMetadataStore.scanActiveRuns(
innerProgramId,
runRecord -> {
Map systemArgs = runRecord.getSystemArgs();
String workflowName = systemArgs.get(ProgramOptionConstants.WORKFLOW_NAME);
String workflowRun = systemArgs.get(ProgramOptionConstants.WORKFLOW_RUN_ID);
if (workflowName == null || workflowRun == null) {
return;
}
ProgramRunId workflowRunId =
appId.program(ProgramType.WORKFLOW, workflowName).run(workflowRun);
if (!programRunId.equals(workflowRunId)) {
return;
}
Map notificationProps = new HashMap<>(notification.getProperties());
notificationProps.put(
ProgramOptionConstants.PROGRAM_RUN_ID, GSON.toJson(runRecord.getProgramRunId()));
innerProgramNotifications.put(
runRecord.getProgramRunId(),
new Notification(Notification.Type.PROGRAM_STATUS, notificationProps));
});
for (Map.Entry entry : innerProgramNotifications.entrySet()) {
handleProgramEvent(
entry.getKey(),
programRunStatus,
entry.getValue(),
sourceId,
appMetadataStore,
programHeartbeatTable,
runnables);
}
}
}
/** write to heart beat table if the recordedRunRecord is not null. */
private void writeToHeartBeatTable(
@Nullable RunRecordDetail recordedRunRecord,
long timestampInSeconds,
ProgramHeartbeatTable programHeartbeatTable)
throws IOException {
if (recordedRunRecord != null) {
programHeartbeatTable.writeRunRecordMeta(recordedRunRecord, timestampInSeconds);
}
}
/**
* Handles a notification related to cluster operations.
*
* @param programRunId program run id from the event
* @param clusterStatus cluster status from the event
* @param notification the notification to process
* @param messageIdBytes the unique ID for the notification message
* @param appMetadataStore the data table to use
* @param context the table context for performing table operations
* @return an {@link Optional} of {@link Runnable} to carry a task to execute after handling of
* this event completed. See {@link #postProcess()} for details.
* @throws IOException if failed to read/write to the app metadata store.
*/
private Optional handleClusterEvent(
ProgramRunId programRunId,
ProgramRunClusterStatus clusterStatus,
Notification notification,
byte[] messageIdBytes,
AppMetadataStore appMetadataStore,
StructuredTableContext context)
throws IOException, InterruptedException {
Map properties = notification.getProperties();
ProgramOptions programOptions = ProgramOptions.fromNotification(notification, GSON);
String userId = properties.get(ProgramOptionConstants.USER_ID);
long endTs = getTimeSeconds(properties, ProgramOptionConstants.CLUSTER_END_TIME, -1);
ProgramDescriptor programDescriptor =
GSON.fromJson(
properties.get(ProgramOptionConstants.PROGRAM_DESCRIPTOR), ProgramDescriptor.class);
switch (clusterStatus) {
case PROVISIONING:
appMetadataStore.recordProgramProvisioning(
programRunId,
programOptions.getUserArguments().asMap(),
programOptions.getArguments().asMap(),
messageIdBytes,
programDescriptor.getArtifactId().toApiArtifactId());
ProvisionRequest provisionRequest =
new ProvisionRequest(programRunId, programOptions, programDescriptor, userId);
return Optional.of(provisioningService.provision(provisionRequest, context));
case PROVISIONED:
Cluster cluster =
GSON.fromJson(properties.get(ProgramOptionConstants.CLUSTER), Cluster.class);
RunRecordDetail runRecord =
appMetadataStore.recordProgramProvisioned(
programRunId, cluster.getNodes().size(), messageIdBytes);
// if the run was stopped right before provisioning completes, it is possible for the state
// to transition to stopping or killed, but not in time to cancel the provisioning.
// In that case, we end up with a provisioned message, but we don't want to start the
// program.
if (runRecord == null
|| runRecord.getStatus() == ProgramRunStatus.STOPPING
|| runRecord.getStatus() == ProgramRunStatus.KILLED) {
break;
}
// Update the ProgramOptions system arguments to include information needed for program
// execution
Map systemArgs = new HashMap<>(programOptions.getArguments().asMap());
systemArgs.put(
ProgramOptionConstants.USER_ID, properties.get(ProgramOptionConstants.USER_ID));
systemArgs.put(
ProgramOptionConstants.CLUSTER, properties.get(ProgramOptionConstants.CLUSTER));
systemArgs.put(
ProgramOptionConstants.SECURE_KEYS_DIR,
properties.get(ProgramOptionConstants.SECURE_KEYS_DIR));
ProgramOptions newProgramOptions =
new SimpleProgramOptions(
programOptions.getProgramId(),
new BasicArguments(systemArgs),
programOptions.getUserArguments());
// Publish the program STARTING state before starting the program
programStateWriter.start(programRunId, newProgramOptions, null, programDescriptor);
// emit provisioning time metric
long provisioningTime =
System.currentTimeMillis() / 1000
- RunIds.getTime(programRunId.getRun(), TimeUnit.SECONDS);
SystemArguments.getProfileIdFromArgs(programRunId.getNamespaceId(), systemArgs)
.ifPresent(
profileId ->
emitProvisioningTimeMetric(
programRunId, profileId, programOptions, provisioningTime));
break;
case DEPROVISIONING:
RunRecordDetail recordedMeta =
appMetadataStore.recordProgramDeprovisioning(programRunId, messageIdBytes);
// If we skipped recording the run status, that means this was a duplicate message,
// or an invalid state transition. In both cases, we should not try to deprovision the
// cluster.
if (recordedMeta != null) {
return Optional.of(provisioningService.deprovision(programRunId, context));
}
break;
case DEPROVISIONED:
appMetadataStore.recordProgramDeprovisioned(programRunId, endTs, messageIdBytes);
break;
case ORPHANED:
appMetadataStore.recordProgramOrphaned(programRunId, endTs, messageIdBytes);
break;
}
return Optional.empty();
}
private Optional getEmitMetricsRunnable(
ProgramRunId programRunId,
@Nullable RunRecordDetail recordedRunRecord,
String metricName,
@Nullable ProgramRunStatus existingStatus) {
if (recordedRunRecord == null) {
return Optional.empty();
}
Optional profile =
SystemArguments.getProfileIdFromArgs(
programRunId.getNamespaceId(), recordedRunRecord.getSystemArgs());
Map additionalTags =
getAdditionalTagsForProgramMetrics(recordedRunRecord, existingStatus);
return profile.map(
profileId -> () -> emitProfileMetrics(programRunId, profileId, metricName, additionalTags));
}
private void publishRecordedStatus(
Notification notification, ProgramRunId programRunId, ProgramRunStatus status)
throws Exception {
Map notificationProperties = new HashMap<>(notification.getProperties());
notificationProperties.put(ProgramOptionConstants.PROGRAM_RUN_ID, GSON.toJson(programRunId));
notificationProperties.put(ProgramOptionConstants.PROGRAM_STATUS, status.name());
notificationProperties.put(CDAP_VERSION, ProjectInfo.getVersion().toString());
Notification programStatusNotification =
new Notification(Notification.Type.PROGRAM_STATUS, notificationProperties);
getMessagingContext()
.getMessagePublisher()
.publish(
NamespaceId.SYSTEM.getNamespace(),
recordedProgramStatusPublishTopic,
GSON.toJson(programStatusNotification));
}
/**
* Helper method to extract the time from the given properties map, or return -1 if no value was
* found.
*
* @param properties the properties map
* @param option the key to lookup in the properties map
* @param defaultValue the default value to return if there is no such property.
* @return the time in seconds, or {@code defaultValue} if not found
*/
private long getTimeSeconds(Map properties, String option, long defaultValue) {
String timeString = properties.get(option);
return (timeString == null)
? defaultValue
: TimeUnit.MILLISECONDS.toSeconds(Long.parseLong(timeString));
}
/**
* Decodes a {@link BasicThrowable} from a given json string.
*
* @param encoded the json representing of the {@link BasicThrowable}
* @return the decode {@link BasicThrowable}; A {@code null} will be returned if the encoded
* string is {@code null} or on decode failure.
*/
@Nullable
private BasicThrowable decodeBasicThrowable(@Nullable String encoded) {
try {
return (encoded == null) ? null : GSON.fromJson(encoded, BasicThrowable.class);
} catch (JsonSyntaxException e) {
// This shouldn't happen normally, unless the BasicThrowable changed in an incompatible way
return null;
}
}
/**
* Emit the metrics context for the program, the tags are constructed with the program run id and
* the profile id.
*/
private void emitProfileMetrics(
ProgramRunId programRunId,
ProfileId profileId,
String metricName,
Map additionalTags) {
Map tags =
ImmutableMap.builder()
.put(Constants.Metrics.Tag.PROFILE_SCOPE, profileId.getScope().name())
.put(Constants.Metrics.Tag.PROFILE, profileId.getProfile())
.put(Constants.Metrics.Tag.NAMESPACE, programRunId.getNamespace())
.put(Constants.Metrics.Tag.PROGRAM_TYPE, programRunId.getType().getPrettyName())
.put(Constants.Metrics.Tag.APP, programRunId.getApplication())
.put(Constants.Metrics.Tag.PROGRAM, programRunId.getProgram())
.put(Constants.Metrics.Tag.RUN_ID, programRunId.getRun())
.putAll(additionalTags)
.build();
metricsCollectionService.getContext(tags).increment(metricName, 1L);
}
/**
* Emit the provisioning time metric. The tags are constructed with the program run id, profile id
* and program options.
*/
private void emitProvisioningTimeMetric(
ProgramRunId programRunId,
ProfileId profileId,
ProgramOptions programOptions,
long provisioningTime) {
Map args = programOptions.getArguments().asMap();
String provisioner = SystemArguments.getProfileProvisioner(args);
Map tags =
ImmutableMap.builder()
.put(Constants.Metrics.Tag.PROFILE_SCOPE, profileId.getScope().name())
.put(Constants.Metrics.Tag.PROFILE, profileId.getProfile())
.put(Constants.Metrics.Tag.NAMESPACE, programRunId.getNamespace())
.put(Constants.Metrics.Tag.PROGRAM_TYPE, programRunId.getType().getPrettyName())
.put(Constants.Metrics.Tag.APP, programRunId.getApplication())
.put(Constants.Metrics.Tag.PROGRAM, programRunId.getProgram())
.put(Constants.Metrics.Tag.PROVISIONER, provisioner)
.put(Constants.Metrics.Tag.RUN_ID, programRunId.getRun())
.build();
metricsCollectionService
.getContext(tags)
.gauge(Constants.Metrics.Program.PROGRAM_PROVISIONING_DELAY_SECONDS, provisioningTime);
}
/**
* Emit the program run time metric. The tags are constructed with the program run id and program
* run status.
*/
private void emitRunTimeMetric(
ProgramRunId programRunId,
ProgramRunStatus programRunStatus,
long runTime,
RunRecordDetail runRecord) {
Map tags =
ImmutableMap.builder()
.put(Constants.Metrics.Tag.STATUS, programRunStatus.name())
.put(Constants.Metrics.Tag.PROGRAM, programRunId.getProgram())
.putAll(getAdditionalTagsForProfileMetrics(runRecord, programRunId))
.putAll(getAdditionalTagsForProgramMetrics(runRecord, null))
.build();
MetricsContext metricsContext =
ProgramRunners.createProgramMetricsContext(programRunId, tags, metricsCollectionService);
metricsContext.gauge(Constants.Metrics.Program.RUN_TIME_SECONDS, runTime);
}
/** Emit stopping latency metric. */
private void emitStoppingTimeMetric(
ProgramRunId programRunId, ProfileId profileId, RunRecordDetail runRecord) {
if (runRecord.getStopTs() == null || runRecord.getStoppingTs() == null) {
return;
}
Map additionalTags = new HashMap<>();
// don't want to add the tag if it is not present otherwise it will result in NPE
additionalTags.computeIfAbsent(
Constants.Metrics.Tag.PROVISIONER,
provisioner -> SystemArguments.getProfileProvisioner(runRecord.getSystemArgs()));
Map tags =
ImmutableMap.builder()
.put(Constants.Metrics.Tag.PROFILE_SCOPE, profileId.getScope().name())
.put(Constants.Metrics.Tag.PROFILE, profileId.getProfile())
.put(Constants.Metrics.Tag.PROGRAM, programRunId.getProgram())
.putAll(additionalTags)
.build();
MetricsContext metricsContext =
ProgramRunners.createProgramMetricsContext(programRunId, tags, metricsCollectionService);
metricsContext.gauge(
Constants.Metrics.Program.PROGRAM_STOPPING_DELAY_SECONDS,
runRecord.getStopTs() - runRecord.getStoppingTs());
}
private void emitStartingTimeMetric(
ProgramRunId programRunId, long startDelayTime, @Nullable RunRecordDetail runRecord) {
Map tags =
ImmutableMap.builder()
.put(Constants.Metrics.Tag.PROGRAM, programRunId.getProgram())
.putAll(getAdditionalTagsForProfileMetrics(runRecord, programRunId))
.putAll(getAdditionalTagsForProgramMetrics(runRecord, null))
.build();
MetricsContext metricsContext =
ProgramRunners.createProgramMetricsContext(programRunId, tags, metricsCollectionService);
metricsContext.gauge(Constants.Metrics.Program.PROGRAM_STARTING_DELAY_SECONDS, startDelayTime);
}
private Map getAdditionalTagsForProfileMetrics(
@Nullable RunRecordDetail runRecord, ProgramRunId programRunId) {
Map tags = new HashMap<>(Collections.emptyMap());
if (runRecord == null) {
return tags;
}
SystemArguments.getProfileIdFromArgs(programRunId.getNamespaceId(), runRecord.getSystemArgs())
.ifPresent(
profileId -> {
tags.putIfAbsent(Constants.Metrics.Tag.PROFILE, profileId.getProfile());
tags.putIfAbsent(Constants.Metrics.Tag.PROFILE_SCOPE, profileId.getScope().name());
});
return tags;
}
/** Returns an instance of {@link AppMetadataStore}. */
private AppMetadataStore getAppMetadataStore(StructuredTableContext context) {
return AppMetadataStore.create(context);
}
private Map getAdditionalTagsForProgramMetrics(
@Nullable RunRecordDetail runRecord, @Nullable ProgramRunStatus existingStatus) {
Map additionalTags = new HashMap<>();
if (runRecord == null) {
return additionalTags;
}
// don't want to add the tag if it is not present otherwise it will result in NPE
additionalTags.computeIfAbsent(
Constants.Metrics.Tag.PROVISIONER,
provisioner -> SystemArguments.getProfileProvisioner(runRecord.getSystemArgs()));
additionalTags.computeIfAbsent(
Constants.Metrics.Tag.CLUSTER_STATUS,
clusterStatus -> runRecord.getCluster().getStatus().name());
additionalTags.computeIfAbsent(
Constants.Metrics.Tag.EXISTING_STATUS,
existingProgramStatus -> existingStatus == null ? null : existingStatus.name());
return additionalTags;
}
}