Please wait. This can take a few minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price only 1 $
You can buy this project and download/modify it as often as you want.
org.apache.flink.runtime.healthmanager.HealthMonitor Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.runtime.healthmanager;
import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.api.common.JobID;
import org.apache.flink.configuration.ConfigOption;
import org.apache.flink.configuration.ConfigOptions;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.metrics.MetricGroup;
import org.apache.flink.runtime.healthmanager.metrics.HealthMonitorMetricGroup;
import org.apache.flink.runtime.healthmanager.metrics.MetricProvider;
import org.apache.flink.runtime.healthmanager.plugins.Action;
import org.apache.flink.runtime.healthmanager.plugins.ActionExecutor;
import org.apache.flink.runtime.healthmanager.plugins.ActionSelector;
import org.apache.flink.runtime.healthmanager.plugins.Detector;
import org.apache.flink.runtime.healthmanager.plugins.Policy;
import org.apache.flink.runtime.healthmanager.plugins.Resolver;
import org.apache.flink.runtime.healthmanager.plugins.Symptom;
import org.apache.flink.runtime.healthmanager.plugins.actionselectors.RescaleResourcePriorActionSelector;
import org.apache.flink.runtime.healthmanager.plugins.executors.DirectActionExecutor;
import org.apache.flink.runtime.healthmanager.plugins.policies.PolicyUtils;
import org.apache.flink.runtime.healthmanager.plugins.utils.JobTopologyAnalyzer;
import org.apache.flink.runtime.healthmanager.plugins.utils.MetricUtils;
import org.apache.flink.runtime.healthmanager.plugins.utils.TaskMetricsSubscriber;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
/**
* Health monitor which periodically checks the status of a job and tries to resolve abnormal symptoms when they are detected.
*/
public class HealthMonitor {
private static final Logger LOGGER = LoggerFactory.getLogger(HealthMonitor.class);

// ------------------------------------------------------------------------
//  Configuration options
// ------------------------------------------------------------------------

/** Switch of the health monitor; defaults to {@code true}. */
public static final ConfigOption<Boolean> HEALTH_MONITOR_ENABLED =
        ConfigOptions.key("healthmonitor.enabled").defaultValue(true);

/** Check interval in milliseconds used by {@link #start()} when standalone mode is enabled. */
public static final ConfigOption<Long> HEALTH_MONITOR_STANDALONE_CHECK_INTERVAL =
        ConfigOptions.key("healthmonitor.standalone.check.interval.ms").defaultValue(10000L);

/**
 * Check interval in milliseconds used by {@link #start()} when standalone mode is disabled.
 * NOTE(review): the constant name presumably was meant to read "INTERVAL"; it is kept as is
 * because it is public.
 */
public static final ConfigOption<Long> HEALTH_CHECK_INTERNAL =
        ConfigOptions.key("healthmonitor.health.check.interval.ms").defaultValue(10000L);

/** Fully qualified class name of the {@link ActionSelector} to instantiate. */
public static final ConfigOption<String> ACTION_SELECTOR_CLASS =
        ConfigOptions.key("healthmonitor.action.selector.class")
                .defaultValue(RescaleResourcePriorActionSelector.class.getCanonicalName());

/** Fully qualified class name of the {@link ActionExecutor} to instantiate. */
public static final ConfigOption<String> ACTION_EXECUTOR_CLASS =
        ConfigOptions.key("healthmonitor.action.executor.class")
                .defaultValue(DirectActionExecutor.class.getCanonicalName());

/** Policy class option; not read directly in this class (presumably consumed by PolicyUtils). */
public static final ConfigOption<String> POLICY_CLASS =
        ConfigOptions.key("healthmonitor.policy.class").noDefaultValue();

/** Comma separated list of {@link Detector} class names. */
public static final ConfigOption<String> DETECTOR_CLASSES =
        ConfigOptions.key("healthmonitor.detector.classes").noDefaultValue();

/** Comma separated list of {@link Resolver} class names. */
public static final ConfigOption<String> RESOLVER_CLASSES =
        ConfigOptions.key("healthmonitor.resolver.classes").noDefaultValue();

/** Switch of the standalone mode; defaults to {@code false}. */
public static final ConfigOption<Boolean> HEALTH_MONITOR_STANDALONE_MODE_ENABLED =
        ConfigOptions.key("healthmonitor.standalone.mode.enabled").defaultValue(false);

/** Dry-run switch; defaults to {@code false}. It is declared here but not read in this class. */
public static final ConfigOption<Boolean> DRY_RUN_ENABLED =
        ConfigOptions.key("healthmonitor.dry-run.enabled").defaultValue(false);

// ------------------------------------------------------------------------
//  Collaborators handed in through the constructor
// ------------------------------------------------------------------------

private JobID jobID;

/** Private copy of the configuration passed in; job level settings are merged into it. */
private Configuration config;

/** Cached job config, fetched lazily by {@link #getJobConfig()} and reset after each check. */
private RestServerClient.JobConfig jobConfig;

private MetricProvider metricProvider;
private RestServerClient restServerClient;

/** May be {@code null}; in that case no action metrics are registered. */
private HealthMonitorMetricGroup metricGroup;

private ScheduledExecutorService executorService;

/** Handle of the scheduled periodic check; {@code null} until one has been scheduled. */
private ScheduledFuture<?> timedTaskHandler;

// ------------------------------------------------------------------------
//  Plugins, all of them {@code null} while unloaded
// ------------------------------------------------------------------------

private Policy policy;
private List<Detector> detectors;
private List<Resolver> resolvers;
private ActionSelector actionSelector;
private ActionExecutor actionExecutor;

// ------------------------------------------------------------------------
//  Runtime state
// ------------------------------------------------------------------------

/** {@code Long.MAX_VALUE} marks "not resolved yet", see {@link #getJobStartExecutionTime()}. */
private volatile long jobStartExecutionTime = Long.MAX_VALUE;

private volatile long successActionCount = 0;
private volatile long failedActionCount = 0;
private volatile boolean isEnabled;

/** Task metric subscribers keyed by their subscription interval. */
private Map<Long, TaskMetricsSubscriber> taskMetricsSubscribes = new HashMap<>();

private JobTopologyAnalyzer jobTopologyAnalyzer = new JobTopologyAnalyzer();
/**
 * Creates a monitor without a metric group, so {@link #start()} registers no action metrics.
 *
 * @param jobID id of the monitored job
 * @param metricProvider metric provider exposed to plugins
 * @param visitor rest client used to fetch the job config
 * @param executorService executor the periodic check is scheduled on
 * @param config base configuration; a copy of it is kept
 */
@VisibleForTesting
public HealthMonitor(
        JobID jobID,
        MetricProvider metricProvider,
        RestServerClient visitor,
        ScheduledExecutorService executorService,
        Configuration config) {

    this(jobID, metricProvider, visitor, /* metricGroup */ null, executorService, config);
}

/**
 * Creates a monitor. Nothing is scheduled and no plugin is loaded until {@link #start()}.
 *
 * @param jobID id of the monitored job
 * @param metricProvider metric provider exposed to plugins
 * @param visitor rest client used to fetch the job config
 * @param metricGroup group the action gauges are registered in, may be {@code null}
 * @param executorService executor the periodic check is scheduled on
 * @param config base configuration; it is cloned, so later changes made by the monitor do not
 *     touch the caller's instance
 */
public HealthMonitor(
        JobID jobID,
        MetricProvider metricProvider,
        RestServerClient visitor,
        HealthMonitorMetricGroup metricGroup,
        ScheduledExecutorService executorService,
        Configuration config) {

    this.jobID = jobID;
    this.metricProvider = metricProvider;
    this.restServerClient = visitor;
    this.metricGroup = metricGroup;
    this.executorService = executorService;
    this.config = config.clone();
}
/**
 * Starts monitoring: analyzes the job topology, merges the job level configuration into the
 * monitor configuration, schedules the periodic {@link HealthChecker} and registers the action
 * gauges.
 *
 * @throws Exception if analyzing the job config fails
 */
public void start() throws Exception {
    LOGGER.info("Starting to monitor job {}", jobID);

    jobTopologyAnalyzer.analyze(getJobConfig());

    // Job level settings override what the monitor was constructed with.
    final Configuration jobLevelConfig = getJobConfig().getConfig();
    for (String key : jobLevelConfig.keySet()) {
        this.config.setString(key, jobLevelConfig.getString(key, null));
    }

    final boolean standaloneMode =
            config.getBoolean(HealthMonitor.HEALTH_MONITOR_STANDALONE_MODE_ENABLED);
    isEnabled = config.getBoolean(HEALTH_MONITOR_ENABLED) || standaloneMode;

    final long checkInterval;
    if (standaloneMode) {
        checkInterval = config.getLong(HEALTH_MONITOR_STANDALONE_CHECK_INTERVAL);
    } else {
        checkInterval = config.getLong(HEALTH_CHECK_INTERNAL);
    }

    // A non-positive interval means that no periodic check is scheduled at all.
    if (checkInterval > 0) {
        timedTaskHandler = executorService.scheduleAtFixedRate(
                new HealthChecker(), 0, checkInterval, TimeUnit.MILLISECONDS);
    }

    if (metricGroup == null) {
        return;
    }
    MetricGroup actionMetrics = metricGroup.addGroup("action");
    actionMetrics.gauge("success", () -> successActionCount);
    actionMetrics.gauge("failure", () -> failedActionCount);
}
/**
 * Closes all currently loaded plugins and loads them again from the current configuration.
 *
 * <p>The policy is loaded first and gets the chance to apply itself to this monitor before
 * detectors, resolvers, the action selector and the action executor are instantiated.
 *
 * @throws ClassNotFoundException if a configured plugin class cannot be found
 * @throws IllegalAccessException if a plugin class or its no-arg constructor is not accessible
 * @throws InstantiationException if a plugin class cannot be instantiated
 */
@VisibleForTesting
public void loadPlugins() throws ClassNotFoundException, IllegalAccessException, InstantiationException {
// try to close old plugins first.
closePlugins();
// apply policy configure.
loadPolicy();
// reload plugins.
loadDetectors();
loadResolvers();
loadActionSelector();
loadActionExecutor();
}
/**
 * Loads the policy through {@code PolicyUtils.load(config)}, opens it with this monitor and
 * then lets it apply itself to this monitor.
 *
 * <p>NOTE(review): the result of {@code PolicyUtils.load} is dereferenced without a null check;
 * confirm that it never returns {@code null} when no policy class is configured.
 */
private void loadPolicy() throws ClassNotFoundException, IllegalAccessException, InstantiationException {
policy = PolicyUtils.load(config);
policy.open(this);
policy.apply(this);
}
/**
 * Stops monitoring: cancels the periodic check (interrupting it if it is running) and closes
 * all loaded plugins.
 */
public void stop() {
    final ScheduledFuture<?> scheduledCheck = timedTaskHandler;
    if (scheduledCheck != null) {
        scheduledCheck.cancel(true);
    }

    closePlugins();
}
/**
 * Closes and forgets all loaded plugins and all task metric subscribers.
 *
 * <p>Plugins are closed in the order action executor, action selector, detectors, resolvers,
 * task metric subscribers. Closing is best effort: a plugin that throws from {@code close()}
 * is logged and skipped, so that the remaining plugins are still closed and no plugin
 * reference is left behind in a half-closed state.
 */
@VisibleForTesting
public void closePlugins() {
    if (this.actionExecutor != null) {
        final ActionExecutor executor = this.actionExecutor;
        // Drop the reference first so it is gone even if close() fails.
        this.actionExecutor = null;
        closePluginQuietly("action executor " + executor, executor::close);
    }

    if (this.actionSelector != null) {
        final ActionSelector selector = this.actionSelector;
        this.actionSelector = null;
        closePluginQuietly("action selector " + selector, selector::close);
    }

    if (detectors != null) {
        for (Detector detector : detectors) {
            closePluginQuietly("detector " + detector, detector::close);
        }
        detectors.clear();
        detectors = null;
    }

    if (resolvers != null) {
        for (Resolver resolver : resolvers) {
            closePluginQuietly("resolver " + resolver, resolver::close);
        }
        resolvers.clear();
        resolvers = null;
    }

    for (TaskMetricsSubscriber taskMetricsSubscriber : taskMetricsSubscribes.values()) {
        closePluginQuietly("task metrics subscriber", taskMetricsSubscriber::close);
    }
    taskMetricsSubscribes.clear();
}

/**
 * Runs one close action and logs, instead of propagating, whatever it throws.
 *
 * @param description text naming the plugin in the log message
 * @param closeAction the close call to run
 */
private void closePluginQuietly(String description, Runnable closeAction) {
    try {
        closeAction.run();
    } catch (Throwable t) {
        // Throwable is caught to match how plugin failures are handled during a health check.
        LOGGER.warn("Fail to close " + description, t);
    }
}
/**
 * Returns the task metrics subscriber for the given interval, creating and opening a new one
 * if none has been registered for that interval yet.
 *
 * @param interval subscription interval, used as the lookup key
 * @return the shared subscriber of that interval
 */
public TaskMetricsSubscriber subscribeTaskMetrics(long interval) {
    TaskMetricsSubscriber existing = taskMetricsSubscribes.get(interval);
    if (existing != null) {
        return existing;
    }

    TaskMetricsSubscriber created = new TaskMetricsSubscriber(this, interval);
    created.open();
    taskMetricsSubscribes.put(interval, created);
    return created;
}
/**
 * Instantiates the action selector named by {@link #ACTION_SELECTOR_CLASS} through its no-arg
 * constructor and opens it with this monitor.
 */
private void loadActionSelector() throws ClassNotFoundException, IllegalAccessException, InstantiationException {
    final String selectorClassName = config.getString(ACTION_SELECTOR_CLASS);
    final Class<?> selectorClass = Class.forName(selectorClassName);

    this.actionSelector = (ActionSelector) selectorClass.newInstance();
    LOGGER.info("Load action selector:" + actionSelector);

    this.actionSelector.open(this);
}
/**
 * Instantiates the action executor named by {@link #ACTION_EXECUTOR_CLASS} through its no-arg
 * constructor and opens it with this monitor.
 */
private void loadActionExecutor() throws ClassNotFoundException, IllegalAccessException, InstantiationException {
    final String executorClassName = config.getString(ACTION_EXECUTOR_CLASS);
    final Class<?> executorClass = Class.forName(executorClassName);

    this.actionExecutor = (ActionExecutor) executorClass.newInstance();
    LOGGER.info("Load action executor:" + actionExecutor);

    this.actionExecutor.open(this);
}
/**
 * Instantiates and opens the detectors listed in {@link #DETECTOR_CLASSES}.
 *
 * <p>Class names are trimmed, blank entries are ignored, duplicates are loaded once and the
 * configured order is kept. A detector is added to the list before it is opened, so one whose
 * {@code open} fails is still closed by {@link #closePlugins()}.
 *
 * @throws ClassNotFoundException if a configured class cannot be found
 * @throws IllegalAccessException if a class or its no-arg constructor is not accessible
 * @throws InstantiationException if a class cannot be instantiated
 */
@VisibleForTesting
public void loadDetectors() throws ClassNotFoundException, IllegalAccessException, InstantiationException {
    final List<String> detectorClassNames =
            parsePluginClassNames(config.getString(DETECTOR_CLASSES));
    LOGGER.info("Load detectors:" + StringUtils.join(detectorClassNames, ","));

    this.detectors = new ArrayList<>(detectorClassNames.size());
    for (String className : detectorClassNames) {
        Detector detector = (Detector) Class.forName(className).newInstance();
        detectors.add(detector);
        detector.open(this);
    }
}

/**
 * Instantiates and opens the resolvers listed in {@link #RESOLVER_CLASSES}.
 *
 * <p>Class names are trimmed, blank entries are ignored, duplicates are loaded once and the
 * configured order is kept. A resolver is added to the list before it is opened, so one whose
 * {@code open} fails is still closed by {@link #closePlugins()}.
 *
 * @throws ClassNotFoundException if a configured class cannot be found
 * @throws IllegalAccessException if a class or its no-arg constructor is not accessible
 * @throws InstantiationException if a class cannot be instantiated
 */
@VisibleForTesting
public void loadResolvers() throws ClassNotFoundException, IllegalAccessException, InstantiationException {
    final List<String> resolverClassNames =
            parsePluginClassNames(config.getString(RESOLVER_CLASSES));
    LOGGER.info("Load resolvers:" + StringUtils.join(resolverClassNames, ","));

    this.resolvers = new ArrayList<>(resolverClassNames.size());
    for (String className : resolverClassNames) {
        Resolver resolver = (Resolver) Class.forName(className).newInstance();
        resolvers.add(resolver);
        resolver.open(this);
    }
}

/**
 * Splits a comma separated class list into trimmed, non-blank, distinct class names.
 *
 * @param classList configured value, may be {@code null} or empty
 * @return the class names in configured order, never {@code null}
 */
private static List<String> parsePluginClassNames(String classList) {
    final List<String> classNames = new ArrayList<>();
    if (StringUtils.isEmpty(classList)) {
        return classNames;
    }

    final Set<String> seen = new HashSet<>();
    for (String entry : classList.split(",")) {
        final String className = entry.trim();
        // Trim before de-duplicating so "a.B" and " a.B" count as the same class.
        if (!className.isEmpty() && seen.add(className)) {
            classNames.add(className);
        }
    }
    return classNames;
}
/** Returns the id of the monitored job. */
public JobID getJobID() {
return jobID;
}
/** Returns the metric provider passed to the constructor. */
public MetricProvider getMetricProvider() {
return metricProvider;
}
/** Returns the rest client passed to the constructor. */
public RestServerClient getRestServerClient() {
return restServerClient;
}
/**
 * Returns the monitor's own configuration instance (not a copy), i.e. the clone of the
 * constructor argument with the job level settings merged in.
 */
public Configuration getConfig() {
return config;
}
/** Returns the executor service passed to the constructor. */
public ScheduledExecutorService getExecutorService() {
return executorService;
}
/**
 * Returns the job config, fetching it through the rest client if no cached value is present.
 * The cache is cleared by the periodic health check, so the config is re-fetched regularly.
 */
public RestServerClient.JobConfig getJobConfig() {
    if (jobConfig != null) {
        return jobConfig;
    }

    jobConfig = restServerClient.getJobConfig(jobID);
    return jobConfig;
}
/** Returns the topology analyzer of this monitor; {@link #start()} feeds it the job config. */
public JobTopologyAnalyzer getJobTopologyAnalyzer() {
return jobTopologyAnalyzer;
}
/**
 * Returns the start execution time of the job.
 *
 * <p>The value is resolved through {@code MetricUtils.getStartExecuteTime} on first use and
 * cached; {@code Long.MAX_VALUE} marks the cache as empty, which each health check re-establishes.
 */
public long getJobStartExecutionTime() {
    if (jobStartExecutionTime != Long.MAX_VALUE) {
        return jobStartExecutionTime;
    }

    // check and set last start execution time.
    jobStartExecutionTime = MetricUtils.getStartExecuteTime(this);
    return jobStartExecutionTime;
}
/**
 * Health check of a job: collects abnormal symptoms from the loaded detectors, lets the loaded
 * resolvers turn them into actions, and executes the action chosen by the action selector.
 */
public class HealthChecker implements Runnable {

    /**
     * Whether plugins have to be (re)loaded on the next check. Starts as {@code true} so that
     * the first check loads them, and stays {@code true} until a reload has completed.
     */
    private boolean forceReload = true;

    /** Job level configuration seen by the last reload, {@code null} before the first one. */
    private Configuration lastJobConfig = null;

    /**
     * Runs one check. Nothing is ever thrown from here: failures are only logged. The cached
     * job config is dropped afterwards, also when the check failed, so that the next check
     * always works on a freshly fetched job config.
     */
    @Override
    public void run() {
        try {
            check();
        } catch (Throwable e) {
            LOGGER.warn("Fail to check job status", e);
        } finally {
            // reset job config.
            jobConfig = null;
        }
    }

    /**
     * Reloads the plugins if required, then detects symptoms, generates actions and executes
     * the selected one. Returns early when the monitor is disabled, when plugin loading failed,
     * or when no action was generated or selected.
     */
    public void check() {
        LOGGER.debug("Start to check job {}.", jobID);

        Configuration newConfig = getJobConfig().getConfig();

        if (!forceReload && policy != null) {
            forceReload = policy.reloadPlugin();
        }

        // A changed job config triggers a reload too, except in standalone mode.
        if (forceReload || (
                lastJobConfig != null
                        && !lastJobConfig.equals(newConfig)
                        && !config.getBoolean(HEALTH_MONITOR_STANDALONE_MODE_ENABLED))) {
            if (!reloadPlugins(newConfig)) {
                return;
            }
        }

        if (!isEnabled) {
            LOGGER.debug("Health monitor disabled.");
            return;
        }

        // Invalidate the cached start time so it is resolved again on its next use.
        jobStartExecutionTime = Long.MAX_VALUE;

        // 1. check abnormal symptoms.
        List<Symptom> symptoms = detectSymptoms();
        LOGGER.debug("Detected symptoms: {}.", symptoms);

        // 2. diagnose and generate resolve action.
        List<Action> actions = generateActions(symptoms);
        LOGGER.debug("Generated actions: {}.", actions);
        if (actions.isEmpty()) {
            return;
        }

        // 3. select an action to execute.
        Action action = selectAction(actions);
        if (action == null) {
            LOGGER.debug("No Action selected.");
            return;
        }
        executeAction(action, symptoms);
    }

    /**
     * Closes the plugins, merges the new job config into the monitor config and, if the
     * monitor is enabled, loads the plugins again.
     *
     * @return {@code false} if loading the plugins failed and the check has to be aborted
     */
    private boolean reloadPlugins(Configuration newConfig) {
        // Keep the reload pending until it has completed, so that a failure anywhere below
        // is retried by the next check instead of leaving the plugins unloaded.
        forceReload = true;
        lastJobConfig = newConfig;

        // try to close plugins.
        closePlugins();

        // config changed, merge new config.
        for (String key : newConfig.keySet()) {
            config.setString(key, newConfig.getString(key, null));
        }

        // NOTE(review): start() also enables the monitor when standalone mode is on, whereas
        // this only looks at HEALTH_MONITOR_ENABLED of the job config - confirm which is meant.
        isEnabled = newConfig.getBoolean(HEALTH_MONITOR_ENABLED);
        if (isEnabled) {
            try {
                loadPlugins();
            } catch (Throwable e) {
                LOGGER.error("Fail to reload plugins", e);
                return false;
            }
        }

        forceReload = false;
        return true;
    }

    /** Asks every detector for a symptom; a failing detector is logged and skipped. */
    private List<Symptom> detectSymptoms() {
        List<Symptom> symptoms = new LinkedList<>();
        for (Detector detector : detectors) {
            Symptom symptom = null;
            try {
                symptom = detector.detect();
            } catch (Throwable e) {
                LOGGER.warn("Exception caught in detector " + detector, e);
            }
            if (symptom != null) {
                symptoms.add(symptom);
            }
        }
        return symptoms;
    }

    /** Asks every resolver for an action; a failing resolver is logged and skipped. */
    private List<Action> generateActions(List<Symptom> symptoms) {
        List<Action> actions = new LinkedList<>();
        for (Resolver resolver : resolvers) {
            try {
                Action action = resolver.resolve(symptoms);
                if (action != null) {
                    actions.add(action);
                }
            } catch (Throwable e) {
                LOGGER.warn("Exception caught in resolver " + resolver, e);
            }
        }
        return actions;
    }

    /** Lets the action selector pick an action; returns {@code null} if it picks none or fails. */
    private Action selectAction(List<Action> actions) {
        try {
            return actionSelector.accept(actions);
        } catch (Throwable e) {
            LOGGER.warn("Exception caught in action selector", e);
            return null;
        }
    }

    /** Executes the action, reports the outcome to the selector and updates the counters. */
    private void executeAction(Action action, List<Symptom> symptoms) {
        LOGGER.info("Executing action {}, because of symptom: {}", action, symptoms);
        try {
            if (actionExecutor.execute(action)) {
                actionSelector.actionSucceed(action);
                successActionCount++;
            } else {
                actionSelector.actionFailed(action);
                failedActionCount++;
            }
        } catch (Throwable e) {
            LOGGER.warn("Action " + action + " execution failed.", e);
            actionSelector.actionFailed(action);
            failedActionCount++;
        }
    }
}
/**
 * Returns the live list of loaded detectors (not a copy).
 *
 * @return the detectors, or {@code null} while no plugins are loaded
 */
@VisibleForTesting
public List<Detector> getDetectors() {
    return detectors;
}

/**
 * Returns the live list of loaded resolvers (not a copy).
 *
 * @return the resolvers, or {@code null} while no plugins are loaded
 */
@VisibleForTesting
public List<Resolver> getResolvers() {
    return resolvers;
}
/** Returns the loaded action executor, or {@code null} while no plugins are loaded. */
public ActionExecutor getActionExecutor() {
return actionExecutor;
}
}