Please wait. This can take some minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price only 1 $
You can buy this project and download/modify it how often you want.
org.apache.flink.runtime.healthmanager.HealthMonitor Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.runtime.healthmanager;
import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.api.common.JobID;
import org.apache.flink.configuration.ConfigOption;
import org.apache.flink.configuration.ConfigOptions;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.metrics.Gauge;
import org.apache.flink.metrics.MetricGroup;
import org.apache.flink.runtime.healthmanager.metrics.HealthMonitorMetricGroup;
import org.apache.flink.runtime.healthmanager.metrics.MetricProvider;
import org.apache.flink.runtime.healthmanager.plugins.Action;
import org.apache.flink.runtime.healthmanager.plugins.ActionSelector;
import org.apache.flink.runtime.healthmanager.plugins.Detector;
import org.apache.flink.runtime.healthmanager.plugins.Resolver;
import org.apache.flink.runtime.healthmanager.plugins.Symptom;
import org.apache.flink.runtime.healthmanager.plugins.actionselectors.RescaleResourcePriorActionSelector;
import org.apache.flink.runtime.healthmanager.plugins.detectors.DelayIncreasingDetector;
import org.apache.flink.runtime.healthmanager.plugins.detectors.DirectOOMDetector;
import org.apache.flink.runtime.healthmanager.plugins.detectors.FailoverDetector;
import org.apache.flink.runtime.healthmanager.plugins.detectors.FrequentFullGCDetector;
import org.apache.flink.runtime.healthmanager.plugins.detectors.HeapOOMDetector;
import org.apache.flink.runtime.healthmanager.plugins.detectors.HighCpuDetector;
import org.apache.flink.runtime.healthmanager.plugins.detectors.HighDelayDetector;
import org.apache.flink.runtime.healthmanager.plugins.detectors.HighNativeMemoryDetector;
import org.apache.flink.runtime.healthmanager.plugins.detectors.JobStableDetector;
import org.apache.flink.runtime.healthmanager.plugins.detectors.JobStuckDetector;
import org.apache.flink.runtime.healthmanager.plugins.detectors.KilledDueToMemoryExceedDetector;
import org.apache.flink.runtime.healthmanager.plugins.detectors.LargeTimerCountDetector;
import org.apache.flink.runtime.healthmanager.plugins.detectors.LongTimeFullGCDetector;
import org.apache.flink.runtime.healthmanager.plugins.detectors.LowCpuDetector;
import org.apache.flink.runtime.healthmanager.plugins.detectors.LowMemoryDetector;
import org.apache.flink.runtime.healthmanager.plugins.detectors.OverParallelizedDetector;
import org.apache.flink.runtime.healthmanager.plugins.resolvers.CpuAdjuster;
import org.apache.flink.runtime.healthmanager.plugins.resolvers.DirectMemoryAdjuster;
import org.apache.flink.runtime.healthmanager.plugins.resolvers.HeapMemoryAdjuster;
import org.apache.flink.runtime.healthmanager.plugins.resolvers.NativeMemoryAdjuster;
import org.apache.flink.runtime.healthmanager.plugins.resolvers.ParallelismScaler;
import org.apache.flink.runtime.healthmanager.plugins.utils.HealthMonitorOptions;
import org.apache.flink.runtime.healthmanager.plugins.utils.JobTopologyAnalyzer;
import org.apache.flink.runtime.healthmanager.plugins.utils.MetricUtils;
import org.apache.flink.runtime.healthmanager.plugins.utils.TaskMetricsSubscriber;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
/**
 * Health monitor which periodically checks the status of a job and tries to resolve abnormal
 * symptoms once they are detected.
 *
 * <p>Every check round runs all loaded {@link Detector}s, passes the detected {@link Symptom}s
 * to all loaded {@link Resolver}s, lets the {@link ActionSelector} choose one of the proposed
 * {@link Action}s and finally executes and validates that action, rolling it back when the
 * validation fails.
 */
public class HealthMonitor {

	private static final Logger LOGGER = LoggerFactory.getLogger(HealthMonitor.class);

	/** Whether the health monitor is enabled for the job. */
	public static final ConfigOption<Boolean> HEALTH_MONITOR_ENABLED =
			ConfigOptions.key("healthmonitor.enabled").defaultValue(true);

	/** Check interval in milliseconds which is used when the standalone mode is enabled. */
	public static final ConfigOption<Long> HEALTH_MONITOR_STANDALONE_CHECK_INTERVAL =
			ConfigOptions.key("healthmonitor.standalone.check.interval.ms").defaultValue(10000L);

	/**
	 * Check interval in milliseconds which is used when the standalone mode is disabled.
	 * (The constant name is kept as is for compatibility, it denotes an interval.)
	 */
	public static final ConfigOption<Long> HEALTH_CHECK_INTERNAL =
			ConfigOptions.key("healthmonitor.health.check.interval.ms").defaultValue(10000L);

	/** Class name of the {@link ActionSelector} which is instantiated via reflection. */
	public static final ConfigOption<String> ACTION_SELECTOR_CLASS =
			ConfigOptions.key("healthmonitor.action.selector.class")
					.defaultValue(RescaleResourcePriorActionSelector.class.getCanonicalName());

	/**
	 * Comma separated class names of the {@link Detector}s to load. When not set, the detectors
	 * are derived from the rescale switches in {@link HealthMonitorOptions}.
	 */
	public static final ConfigOption<String> DETECTOR_CLASSES =
			ConfigOptions.key("healthmonitor.detector.classes").noDefaultValue();

	/**
	 * Comma separated class names of the {@link Resolver}s to load. When not set, the resolvers
	 * are derived from the rescale switches in {@link HealthMonitorOptions}.
	 */
	public static final ConfigOption<String> RESOLVER_CLASSES =
			ConfigOptions.key("healthmonitor.resolver.classes").noDefaultValue();

	/**
	 * Whether the monitor runs in standalone mode. In this mode the monitor is started enabled
	 * regardless of {@link #HEALTH_MONITOR_ENABLED}, is never disabled by the job configuration
	 * and uses {@link #HEALTH_MONITOR_STANDALONE_CHECK_INTERVAL} as its check interval.
	 */
	public static final ConfigOption<Boolean> HEALTH_MONITOR_STANDALONE_MODE_ENABLED =
			ConfigOptions.key("healthmonitor.standalone.mode.enabled").defaultValue(false);

	/** When enabled, the selected action is only logged but never executed. */
	public static final ConfigOption<Boolean> DRY_RUN_ENABLED =
			ConfigOptions.key("healthmonitor.dry-run.enabled").defaultValue(false);

	private final JobID jobID;
	/** Private copy of the configuration given at construction, merged with the job's configuration. */
	private final Configuration config;
	/** Cached job config, lazily fetched by {@link #getJobConfig()} and dropped after every check round. */
	private RestServerClient.JobConfig jobConfig;

	private final MetricProvider metricProvider;
	private final RestServerClient restServerClient;
	/** May be {@code null}, in which case no action metrics are registered. */
	private final HealthMonitorMetricGroup metricGroup;

	private final ScheduledExecutorService executorService;
	private ScheduledFuture<?> timedTaskHandler;

	private List<Detector> detectors;
	private List<Resolver> resolvers;
	private ActionSelector actionSelector;

	/** {@link Long#MAX_VALUE} marks the value as unknown, see {@link #getJobStartExecutionTime()}. */
	private volatile long jobStartExecutionTime = Long.MAX_VALUE;

	private volatile long successActionCount = 0;
	private volatile long failedActionCount = 0;

	private volatile boolean isEnabled;

	/** Task metric subscribers keyed by their subscription interval. */
	private final Map<Long, TaskMetricsSubscriber> taskMetricsSubscribes = new HashMap<>();

	private final JobTopologyAnalyzer jobTopologyAnalyzer = new JobTopologyAnalyzer();

	/**
	 * Creates a health monitor without a metric group.
	 *
	 * @param jobID id of the job to monitor
	 * @param metricProvider provider handed to actions for validation and exposed to plugins
	 * @param visitor client used to fetch the job config and to execute actions
	 * @param executorService executor which runs the periodic health check
	 * @param config base configuration, it is cloned and never modified by the monitor
	 */
	@VisibleForTesting
	public HealthMonitor(
			JobID jobID,
			MetricProvider metricProvider,
			RestServerClient visitor,
			ScheduledExecutorService executorService,
			Configuration config) {
		this(jobID, metricProvider, visitor, null, executorService, config);
	}

	/**
	 * Creates a health monitor.
	 *
	 * @param jobID id of the job to monitor
	 * @param metricProvider provider handed to actions for validation and exposed to plugins
	 * @param visitor client used to fetch the job config and to execute actions
	 * @param metricGroup group to register the action counters in, may be {@code null}
	 * @param executorService executor which runs the periodic health check
	 * @param config base configuration, it is cloned and never modified by the monitor
	 */
	public HealthMonitor(
			JobID jobID,
			MetricProvider metricProvider,
			RestServerClient visitor,
			HealthMonitorMetricGroup metricGroup,
			ScheduledExecutorService executorService,
			Configuration config) {

		this.jobID = jobID;
		this.executorService = executorService;
		this.metricProvider = metricProvider;
		this.restServerClient = visitor;
		this.config = config.clone();
		this.metricGroup = metricGroup;
	}

	/**
	 * Analyzes the job topology, merges the job's configuration into the monitor's configuration,
	 * loads the plugins if the monitor is enabled and schedules the periodic health check.
	 *
	 * <p>The health check is scheduled even if the monitor starts disabled, so that it can be
	 * enabled later through the job configuration. No check is scheduled if the configured
	 * interval is not positive.
	 *
	 * @throws Exception if the plugins cannot be loaded
	 */
	public void start() throws Exception {

		LOGGER.info("Starting to monitor job {}", jobID);

		RestServerClient.JobConfig currentJobConfig = getJobConfig();
		jobTopologyAnalyzer.analyze(currentJobConfig);
		mergeJobConfiguration(currentJobConfig.getConfig());

		boolean standalone = config.getBoolean(HEALTH_MONITOR_STANDALONE_MODE_ENABLED);
		isEnabled = config.getBoolean(HEALTH_MONITOR_ENABLED) || standalone;
		if (isEnabled) {
			loadPlugins();
		}

		long checkInterval = standalone
				? config.getLong(HEALTH_MONITOR_STANDALONE_CHECK_INTERVAL)
				: config.getLong(HEALTH_CHECK_INTERNAL);

		if (checkInterval > 0) {
			timedTaskHandler = executorService.scheduleAtFixedRate(
					new HealthChecker(), 0, checkInterval, TimeUnit.MILLISECONDS);
		}

		if (metricGroup != null) {
			MetricGroup actionMetrics = metricGroup.addGroup("action");
			actionMetrics.gauge("success", new Gauge<Long>() {
				@Override
				public Long getValue() {
					return successActionCount;
				}
			});
			actionMetrics.gauge("failure", new Gauge<Long>() {
				@Override
				public Long getValue() {
					return failedActionCount;
				}
			});
		}
	}

	/**
	 * Closes all currently loaded plugins and loads detectors, resolvers and the action selector
	 * according to the current configuration.
	 */
	@VisibleForTesting
	public void loadPlugins() throws ClassNotFoundException, IllegalAccessException, InstantiationException {
		// try to close old plugins first.
		closePlugins();

		// load new plugins.
		loadDetectors();
		loadResolvers();
		loadActionSelector();
	}

	/** Cancels the periodic health check, if one was scheduled, and closes all plugins. */
	public void stop() {
		if (timedTaskHandler != null) {
			timedTaskHandler.cancel(true);
		}
		closePlugins();
	}

	/**
	 * Closes and forgets the action selector, all detectors, all resolvers and all task metric
	 * subscribers. Calling it without loaded plugins is a no-op.
	 */
	@VisibleForTesting
	public void closePlugins() {
		if (this.actionSelector != null) {
			this.actionSelector.close();
			this.actionSelector = null;
		}

		if (detectors != null) {
			for (Detector detector : detectors) {
				detector.close();
			}
			detectors.clear();
			detectors = null;
		}

		if (resolvers != null) {
			for (Resolver resolver : resolvers) {
				resolver.close();
			}
			resolvers.clear();
			resolvers = null;
		}

		for (TaskMetricsSubscriber taskMetricsSubscriber : taskMetricsSubscribes.values()) {
			taskMetricsSubscriber.close();
		}
		taskMetricsSubscribes.clear();
	}

	/**
	 * Returns the task metrics subscriber for the given interval, creating and opening it on
	 * first request. Subscribers are shared per interval and closed by {@link #closePlugins()}.
	 *
	 * @param interval subscription interval, used as the sharing key
	 * @return the shared subscriber for the interval
	 */
	public TaskMetricsSubscriber subscribeTaskMetrics(long interval) {
		TaskMetricsSubscriber subscriber = taskMetricsSubscribes.get(interval);
		if (subscriber == null) {
			subscriber = new TaskMetricsSubscriber(this, interval);
			subscriber.open();
			taskMetricsSubscribes.put(interval, subscriber);
		}
		return subscriber;
	}

	/** Copies every entry of the given job configuration into the monitor's configuration. */
	private void mergeJobConfiguration(Configuration jobConfiguration) {
		for (String key : jobConfiguration.keySet()) {
			config.setString(key, jobConfiguration.getString(key, null));
		}
	}

	/** Instantiates and opens the action selector configured by {@link #ACTION_SELECTOR_CLASS}. */
	private void loadActionSelector() throws ClassNotFoundException, IllegalAccessException, InstantiationException {
		this.actionSelector =
				(ActionSelector) Class.forName(config.getString(ACTION_SELECTOR_CLASS)).newInstance();
		LOGGER.info("Load action selector:{}", actionSelector);
		this.actionSelector.open(this);
	}

	/**
	 * Instantiates and opens the detectors. Explicitly configured {@link #DETECTOR_CLASSES} take
	 * precedence, otherwise the detectors are chosen by the rescale switches.
	 */
	private void loadDetectors() throws ClassNotFoundException, IllegalAccessException, InstantiationException {
		// a set, since both rescale switches contribute some identical detectors.
		Set<String> detectorClazzs = new HashSet<>();
		String configuredDetectors = config.getString(DETECTOR_CLASSES);
		if (configuredDetectors != null) {
			detectorClazzs.addAll(Arrays.asList(configuredDetectors.split(",")));
		} else {
			if (config.getBoolean(HealthMonitorOptions.ENABLE_PARALLELISM_RESCALE)) {
				// detector which will trigger rescale
				detectorClazzs.add(HighDelayDetector.class.getCanonicalName());
				detectorClazzs.add(DelayIncreasingDetector.class.getCanonicalName());
				detectorClazzs.add(LargeTimerCountDetector.class.getCanonicalName());
				detectorClazzs.add(OverParallelizedDetector.class.getCanonicalName());

				// detectors which will check state of job
				detectorClazzs.add(JobStableDetector.class.getCanonicalName());
				detectorClazzs.add(JobStuckDetector.class.getCanonicalName());
				detectorClazzs.add(FailoverDetector.class.getCanonicalName());
				detectorClazzs.add(FrequentFullGCDetector.class.getCanonicalName());
				detectorClazzs.add(LongTimeFullGCDetector.class.getCanonicalName());
			}

			if (config.getBoolean(HealthMonitorOptions.ENABLE_RESOURCE_RESCALE)) {
				detectorClazzs.add(HighCpuDetector.class.getCanonicalName());
				detectorClazzs.add(LowCpuDetector.class.getCanonicalName());
				detectorClazzs.add(HeapOOMDetector.class.getCanonicalName());
				detectorClazzs.add(FrequentFullGCDetector.class.getCanonicalName());
				detectorClazzs.add(LongTimeFullGCDetector.class.getCanonicalName());
				detectorClazzs.add(DirectOOMDetector.class.getCanonicalName());
				detectorClazzs.add(HighNativeMemoryDetector.class.getCanonicalName());
				detectorClazzs.add(KilledDueToMemoryExceedDetector.class.getCanonicalName());
				detectorClazzs.add(LowMemoryDetector.class.getCanonicalName());
				detectorClazzs.add(JobStableDetector.class.getCanonicalName());
			}
		}

		LOGGER.info("Load detectors:{}", StringUtils.join(detectorClazzs, ","));
		this.detectors = new ArrayList<>(detectorClazzs.size());
		for (String clazz : detectorClazzs) {
			Detector detector = (Detector) Class.forName(clazz.trim()).newInstance();
			// registered before open() so that closePlugins() also reaches it if open() fails.
			detectors.add(detector);
			detector.open(this);
		}
	}

	/**
	 * Instantiates and opens the resolvers. Explicitly configured {@link #RESOLVER_CLASSES} take
	 * precedence, otherwise the resolvers are chosen by the rescale switches.
	 */
	private void loadResolvers() throws ClassNotFoundException, IllegalAccessException, InstantiationException {
		Set<String> resolverClazzs = new HashSet<>();
		String configuredResolvers = config.getString(RESOLVER_CLASSES);
		if (configuredResolvers != null) {
			resolverClazzs.addAll(Arrays.asList(configuredResolvers.split(",")));
		} else {
			if (config.getBoolean(HealthMonitorOptions.ENABLE_PARALLELISM_RESCALE)) {
				resolverClazzs.add(ParallelismScaler.class.getCanonicalName());
			}
			if (config.getBoolean(HealthMonitorOptions.ENABLE_RESOURCE_RESCALE)) {
				resolverClazzs.add(CpuAdjuster.class.getCanonicalName());
				resolverClazzs.add(HeapMemoryAdjuster.class.getCanonicalName());
				resolverClazzs.add(DirectMemoryAdjuster.class.getCanonicalName());
				resolverClazzs.add(NativeMemoryAdjuster.class.getCanonicalName());
			}
		}

		LOGGER.info("Load resolvers:{}", StringUtils.join(resolverClazzs, ","));
		this.resolvers = new ArrayList<>(resolverClazzs.size());
		for (String clazz : resolverClazzs) {
			Resolver resolver = (Resolver) Class.forName(clazz.trim()).newInstance();
			// registered before open() so that closePlugins() also reaches it if open() fails.
			resolvers.add(resolver);
			resolver.open(this);
		}
	}

	public JobID getJobID() {
		return jobID;
	}

	public MetricProvider getMetricProvider() {
		return metricProvider;
	}

	public RestServerClient getRestServerClient() {
		return restServerClient;
	}

	/** Returns the monitor's own configuration, i.e. the base configuration merged with the job's one. */
	public Configuration getConfig() {
		return config;
	}

	public ScheduledExecutorService getExecutorService() {
		return executorService;
	}

	/**
	 * Returns the job config, fetching it through the rest server client if no cached value
	 * exists. The cached value is dropped after every health check round.
	 */
	public RestServerClient.JobConfig getJobConfig() {
		if (jobConfig == null) {
			jobConfig = restServerClient.getJobConfig(jobID);
		}
		return jobConfig;
	}

	public JobTopologyAnalyzer getJobTopologyAnalyzer() {
		return jobTopologyAnalyzer;
	}

	/**
	 * Returns the start execution time of the job. The value is looked up on first access and
	 * looked up again after the beginning of each health check round.
	 */
	public long getJobStartExecutionTime() {
		if (jobStartExecutionTime == Long.MAX_VALUE) {
			// check and set last start execution time.
			jobStartExecutionTime = MetricUtils.getStartExecuteTime(this);
		}
		return jobStartExecutionTime;
	}

	/**
	 * Health check for a job, which detects abnormal symptoms of the job with the loaded detectors
	 * and tries to resolve the abnormal status with the loaded resolvers.
	 */
	public class HealthChecker implements Runnable {

		/**
		 * Runs one check round. Nothing is ever thrown from here, since an exception escaping a
		 * task scheduled at fixed rate would suppress all subsequent executions.
		 */
		@Override
		public void run() {
			try {
				check();
			} catch (Throwable e) {
				LOGGER.warn("Fail to check job status", e);
			} finally {
				// reset job config, also after a failed round: an action may already have
				// changed the job, so the next round must not work on the stale cached config.
				jobConfig = null;
			}
		}

		/**
		 * Executes one check round: applies enable/disable switches from the job configuration,
		 * then detects symptoms, collects actions, selects one of them and executes it.
		 */
		public void check() {
			LOGGER.debug("Start to check job {}.", jobID);

			Configuration newConfig = getJobConfig().getConfig();

			// the job configuration can switch the monitor off, unless running in standalone mode.
			if (isEnabled
					&& !newConfig.getBoolean(HEALTH_MONITOR_ENABLED)
					&& !config.getBoolean(HEALTH_MONITOR_STANDALONE_MODE_ENABLED)) {
				LOGGER.info("Disabling health monitor");
				closePlugins();
				isEnabled = false;
				return;
			}

			if (!isEnabled && newConfig.getBoolean(HEALTH_MONITOR_ENABLED)) {
				try {
					LOGGER.info("Enabling health monitor");
					// reload configuration.
					mergeJobConfiguration(newConfig);
					loadPlugins();
					isEnabled = true;
				} catch (ClassNotFoundException | IllegalAccessException | InstantiationException e) {
					LOGGER.error("Fail to load plugins", e);
					// do not keep plugins open which were loaded before the failure,
					// the monitor stays disabled and would never use them.
					closePlugins();
					return;
				}
			}

			if (!isEnabled) {
				LOGGER.debug("Health monitor disabled.");
				return;
			}

			// force a fresh lookup of the job start time in this round.
			jobStartExecutionTime = Long.MAX_VALUE;

			// 1. check abnormal symptoms.
			List<Symptom> symptoms = detectSymptoms();
			LOGGER.debug("Detected symptoms: {}.", symptoms);

			// 2. diagnose and generate resolve action.
			List<Action> actions = resolveActions(symptoms);
			LOGGER.debug("Generated actions: {}.", actions);

			if (actions.isEmpty()) {
				return;
			}

			// 3. select an action to execute.
			Action action = selectAction(actions);
			if (action == null) {
				LOGGER.debug("No Action selected.");
				return;
			}

			// 4. execute the action.
			executeAction(action, symptoms);
		}

		/** Runs all detectors and collects the symptoms they report, a failing detector is skipped. */
		private List<Symptom> detectSymptoms() {
			List<Symptom> symptoms = new LinkedList<>();
			for (Detector detector : detectors) {
				Symptom symptom = null;
				try {
					symptom = detector.detect();
				} catch (Throwable e) {
					LOGGER.warn("Exception caught in detector " + detector, e);
				}
				if (symptom != null) {
					symptoms.add(symptom);
				}
			}
			return symptoms;
		}

		/** Asks all resolvers for an action for the given symptoms, a failing resolver is skipped. */
		private List<Action> resolveActions(List<Symptom> symptoms) {
			List<Action> actions = new LinkedList<>();
			for (Resolver resolver : resolvers) {
				try {
					Action action = resolver.resolve(symptoms);
					if (action != null) {
						actions.add(action);
					}
				} catch (Throwable e) {
					LOGGER.warn("Exception caught in resolver " + resolver, e);
				}
			}
			return actions;
		}

		/** Lets the action selector pick an action, returns {@code null} if it picks none or fails. */
		private Action selectAction(List<Action> actions) {
			try {
				return actionSelector.accept(actions);
			} catch (Throwable e) {
				LOGGER.warn("Exception caught in action selector", e);
				return null;
			}
		}

		/**
		 * Executes and validates the action, rolls it back if the validation fails, and reports
		 * the outcome to the action selector and the action counters. In dry-run mode the action
		 * is only logged.
		 */
		private void executeAction(Action action, List<Symptom> symptoms) {
			try {
				LOGGER.info("Executing action {}, because of symptom: {}", action, symptoms);
				if (config.getBoolean(DRY_RUN_ENABLED)) {
					return;
				}
				action.execute(restServerClient);

				// validate result of action.
				if (action.validate(metricProvider, restServerClient)) {
					LOGGER.info("Action {} validate succeed.", action);

					// notify success of an action.
					actionSelector.actionSucceed(action);
					successActionCount++;
				} else {
					LOGGER.info("Action {} validate failed, try to roll back", action);

					// execute roll back action if validation failed.
					Action actionToRollback = action.rollback();
					LOGGER.info("Executing roll back action {}", actionToRollback);
					actionToRollback.execute(restServerClient);

					// notify failure of an action.
					actionSelector.actionFailed(action);
					failedActionCount++;
				}
			} catch (Throwable e) {
				LOGGER.warn("Action " + action + " execution failed.", e);
				actionSelector.actionFailed(action);
				failedActionCount++;
			}
		}
	}
}