All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.flink.runtime.healthmanager.HealthMonitor Maven / Gradle / Ivy

There is a newer version: 1.5.1
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.runtime.healthmanager;

import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.api.common.JobID;
import org.apache.flink.configuration.ConfigOption;
import org.apache.flink.configuration.ConfigOptions;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.metrics.MetricGroup;
import org.apache.flink.runtime.healthmanager.metrics.HealthMonitorMetricGroup;
import org.apache.flink.runtime.healthmanager.metrics.MetricProvider;
import org.apache.flink.runtime.healthmanager.plugins.Action;
import org.apache.flink.runtime.healthmanager.plugins.ActionExecutor;
import org.apache.flink.runtime.healthmanager.plugins.ActionSelector;
import org.apache.flink.runtime.healthmanager.plugins.Detector;
import org.apache.flink.runtime.healthmanager.plugins.Policy;
import org.apache.flink.runtime.healthmanager.plugins.Resolver;
import org.apache.flink.runtime.healthmanager.plugins.Symptom;
import org.apache.flink.runtime.healthmanager.plugins.actionselectors.RescaleResourcePriorActionSelector;
import org.apache.flink.runtime.healthmanager.plugins.executors.DirectActionExecutor;
import org.apache.flink.runtime.healthmanager.plugins.policies.PolicyUtils;
import org.apache.flink.runtime.healthmanager.plugins.utils.JobTopologyAnalyzer;
import org.apache.flink.runtime.healthmanager.plugins.utils.MetricUtils;
import org.apache.flink.runtime.healthmanager.plugins.utils.TaskMetricsSubscriber;

import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;

/**
 * Health monitor which periodically checks the status of a job and tries to resolve abnormal
 * symptoms once they are detected.
 *
 * <p>A check round (see {@link HealthChecker#check()}) asks every loaded {@link Detector} for a
 * {@link Symptom}, hands the collected symptoms to every loaded {@link Resolver} to obtain
 * candidate {@link Action}s, lets the {@link ActionSelector} pick one of them and finally runs it
 * through the {@link ActionExecutor}.
 */
public class HealthMonitor {

	private static final Logger LOGGER = LoggerFactory.getLogger(HealthMonitor.class);

	/** Whether the health monitor is enabled, {@code true} by default. */
	public static final ConfigOption<Boolean> HEALTH_MONITOR_ENABLED =
			ConfigOptions.key("healthmonitor.enabled").defaultValue(true);

	/** Check interval in milliseconds which is used when standalone mode is enabled. */
	public static final ConfigOption<Long> HEALTH_MONITOR_STANDALONE_CHECK_INTERVAL =
			ConfigOptions.key("healthmonitor.standalone.check.interval.ms").defaultValue(10000L);

	/**
	 * Check interval in milliseconds which is used when standalone mode is not enabled.
	 * The constant name is kept as it is so that existing references keep compiling.
	 */
	public static final ConfigOption<Long> HEALTH_CHECK_INTERNAL =
			ConfigOptions.key("healthmonitor.health.check.interval.ms").defaultValue(10000L);

	/** Class name of the {@link ActionSelector} to instantiate. */
	public static final ConfigOption<String> ACTION_SELECTOR_CLASS =
			ConfigOptions.key("healthmonitor.action.selector.class")
					.defaultValue(RescaleResourcePriorActionSelector.class.getCanonicalName());

	/** Class name of the {@link ActionExecutor} to instantiate. */
	public static final ConfigOption<String> ACTION_EXECUTOR_CLASS =
			ConfigOptions.key("healthmonitor.action.executor.class")
					.defaultValue(DirectActionExecutor.class.getCanonicalName());

	/** Configuration key of the policy class, without default value. */
	public static final ConfigOption<String> POLICY_CLASS =
			ConfigOptions.key("healthmonitor.policy.class").noDefaultValue();

	/** Comma separated class names of the {@link Detector}s to load, without default value. */
	public static final ConfigOption<String> DETECTOR_CLASSES =
			ConfigOptions.key("healthmonitor.detector.classes").noDefaultValue();

	/** Comma separated class names of the {@link Resolver}s to load, without default value. */
	public static final ConfigOption<String> RESOLVER_CLASSES =
			ConfigOptions.key("healthmonitor.resolver.classes").noDefaultValue();

	/** Whether standalone mode is enabled, {@code false} by default. */
	public static final ConfigOption<Boolean> HEALTH_MONITOR_STANDALONE_MODE_ENABLED =
			ConfigOptions.key("healthmonitor.standalone.mode.enabled").defaultValue(false);

	/** Dry-run switch, {@code false} by default; it is not read by this class itself. */
	public static final ConfigOption<Boolean> DRY_RUN_ENABLED =
			ConfigOptions.key("healthmonitor.dry-run.enabled").defaultValue(false);

	private JobID jobID;
	private Configuration config;

	/** Lazily fetched job config, see {@link #getJobConfig()}; dropped after every check round. */
	private RestServerClient.JobConfig jobConfig;

	private MetricProvider metricProvider;
	private RestServerClient restServerClient;
	private HealthMonitorMetricGroup metricGroup;
	private ScheduledExecutorService executorService;

	/** Handle of the periodic {@link HealthChecker} task, {@code null} if none was scheduled. */
	private ScheduledFuture<?> timedTaskHandler;

	private Policy policy;
	private List<Detector> detectors;
	private List<Resolver> resolvers;
	private ActionSelector actionSelector;
	private ActionExecutor actionExecutor;

	/** {@link Long#MAX_VALUE} marks the value as "not fetched yet". */
	private volatile long jobStartExecutionTime = Long.MAX_VALUE;

	// The counters are incremented by HealthChecker and read by the gauges registered in start().
	private volatile long successActionCount = 0;
	private volatile long failedActionCount = 0;

	private volatile boolean isEnabled;

	/** Task metrics subscribers keyed by the interval they were requested with. */
	private Map<Long, TaskMetricsSubscriber> taskMetricsSubscribes = new HashMap<>();

	private JobTopologyAnalyzer jobTopologyAnalyzer = new JobTopologyAnalyzer();

	/**
	 * Creates a health monitor without a metric group, in which case no action metrics are
	 * registered on {@link #start()}.
	 */
	@VisibleForTesting
	public HealthMonitor(
			JobID jobID,
			MetricProvider metricProvider,
			RestServerClient visitor,
			ScheduledExecutorService executorService,
			Configuration config) {
		this(jobID, metricProvider, visitor, null, executorService, config);
	}

	/**
	 * Creates a health monitor for the given job.
	 *
	 * @param jobID           id of the job to monitor
	 * @param metricProvider  metric provider exposed to the plugins
	 * @param visitor         rest client used to fetch the job config
	 * @param metricGroup     group to register the action metrics in, may be {@code null}
	 * @param executorService executor which runs the periodic health check
	 * @param config          base configuration; a clone of it is kept, so later merges of the
	 *                        job config do not modify the instance passed in
	 */
	public HealthMonitor(
			JobID jobID,
			MetricProvider metricProvider,
			RestServerClient visitor,
			HealthMonitorMetricGroup metricGroup,
			ScheduledExecutorService executorService,
			Configuration config) {

		this.jobID = jobID;
		this.executorService = executorService;
		this.metricProvider = metricProvider;
		this.restServerClient = visitor;
		this.config = config.clone();
		this.metricGroup = metricGroup;
	}

	/**
	 * Analyzes the job topology, merges the job config into the monitor config, schedules the
	 * periodic {@link HealthChecker} (only if the configured interval is positive) and registers
	 * the action metrics (only if a metric group was given).
	 *
	 * @throws Exception declared for callers; any failure of the steps above is propagated
	 */
	public void start() throws Exception {

		LOGGER.info("Starting to monitor job {}", jobID);

		RestServerClient.JobConfig currentJobConfig = getJobConfig();
		jobTopologyAnalyzer.analyze(currentJobConfig);

		mergeJobConfig(currentJobConfig.getConfig());

		boolean standaloneMode = config.getBoolean(HEALTH_MONITOR_STANDALONE_MODE_ENABLED);
		isEnabled = config.getBoolean(HEALTH_MONITOR_ENABLED) || standaloneMode;
		long checkInterval = standaloneMode
				? config.getLong(HEALTH_MONITOR_STANDALONE_CHECK_INTERVAL)
				: config.getLong(HEALTH_CHECK_INTERNAL);

		if (checkInterval > 0) {
			timedTaskHandler = executorService.scheduleAtFixedRate(
					new HealthChecker(), 0, checkInterval, TimeUnit.MILLISECONDS);
		}

		if (metricGroup != null) {
			MetricGroup actionMetrics = metricGroup.addGroup("action");
			actionMetrics.gauge("success", () -> successActionCount);
			actionMetrics.gauge("failure", () -> failedActionCount);
		}

	}

	/**
	 * Closes the currently loaded plugins and loads them again in the order policy, detectors,
	 * resolvers, action selector, action executor.
	 *
	 * @throws ClassNotFoundException if a configured plugin class can not be found
	 * @throws IllegalAccessException if a plugin class or its nullary constructor is not accessible
	 * @throws InstantiationException if a plugin class can not be instantiated
	 */
	@VisibleForTesting
	public void loadPlugins() throws ClassNotFoundException, IllegalAccessException, InstantiationException {
		// try to close old plugins first.
		closePlugins();

		// apply policy configure.
		loadPolicy();

		// reload plugins.
		loadDetectors();
		loadResolvers();
		loadActionSelector();
		loadActionExecutor();
	}

	/** Loads the policy via {@link PolicyUtils}, opens it and applies it to this monitor. */
	private void loadPolicy() throws ClassNotFoundException, IllegalAccessException, InstantiationException {
		policy = PolicyUtils.load(config);
		policy.open(this);
		policy.apply(this);
	}

	/** Copies every entry of the given job config into the monitor config as a string value. */
	private void mergeJobConfig(Configuration jobConfiguration) {
		for (String key : jobConfiguration.keySet()) {
			config.setString(key, jobConfiguration.getString(key, null));
		}
	}

	/** Cancels the periodic health check, if one was scheduled, and closes all plugins. */
	public void stop() {

		if (timedTaskHandler != null) {
			timedTaskHandler.cancel(true);
		}

		closePlugins();
	}

	/**
	 * Closes and forgets the action executor, the action selector, all detectors, all resolvers
	 * and all task metrics subscribers. Plugins which are not loaded are skipped.
	 */
	@VisibleForTesting
	public void closePlugins() {

		if (this.actionExecutor != null) {
			this.actionExecutor.close();
			this.actionExecutor = null;
		}

		if (this.actionSelector != null) {
			this.actionSelector.close();
			this.actionSelector = null;
		}

		if (detectors != null) {
			for (Detector detector : detectors) {
				detector.close();
			}
			detectors.clear();
			detectors = null;
		}

		if (resolvers != null) {
			for (Resolver resolver : resolvers) {
				resolver.close();
			}
			resolvers.clear();
			resolvers = null;
		}

		for (TaskMetricsSubscriber taskMetricsSubscriber : taskMetricsSubscribes.values()) {
			taskMetricsSubscriber.close();
		}
		taskMetricsSubscribes.clear();
	}

	/**
	 * Returns the task metrics subscriber for the given interval. The subscriber is created,
	 * opened and remembered on the first request for an interval and reused afterwards.
	 *
	 * @param interval interval the subscriber is created with
	 * @return the subscriber registered for the interval
	 */
	public TaskMetricsSubscriber subscribeTaskMetrics(long interval) {
		// null values are never stored, so a null result means "not subscribed yet".
		TaskMetricsSubscriber subscriber = taskMetricsSubscribes.get(interval);
		if (subscriber == null) {
			subscriber = new TaskMetricsSubscriber(this, interval);
			subscriber.open();
			taskMetricsSubscribes.put(interval, subscriber);
		}
		return subscriber;
	}

	/** Instantiates the class configured by {@link #ACTION_SELECTOR_CLASS} and opens it. */
	private void loadActionSelector() throws ClassNotFoundException, IllegalAccessException, InstantiationException {
		this.actionSelector =
				(ActionSelector) Class.forName(config.getString(ACTION_SELECTOR_CLASS)).newInstance();
		LOGGER.info("Load action selector:{}", actionSelector);
		this.actionSelector.open(this);
	}

	/** Instantiates the class configured by {@link #ACTION_EXECUTOR_CLASS} and opens it. */
	private void loadActionExecutor() throws ClassNotFoundException, IllegalAccessException, InstantiationException {
		this.actionExecutor =
				(ActionExecutor) Class.forName(config.getString(ACTION_EXECUTOR_CLASS)).newInstance();
		LOGGER.info("Load action executor:{}", actionExecutor);
		this.actionExecutor.open(this);
	}

	/**
	 * Instantiates and opens every detector listed in {@link #DETECTOR_CLASSES}. Duplicate class
	 * names are loaded once; an empty or missing option yields an empty detector list.
	 *
	 * @throws ClassNotFoundException if a configured detector class can not be found
	 * @throws IllegalAccessException if a detector class or its nullary constructor is not accessible
	 * @throws InstantiationException if a detector class can not be instantiated
	 */
	@VisibleForTesting
	public void loadDetectors() throws ClassNotFoundException, IllegalAccessException, InstantiationException {
		Set<String> detectorClazzs = new HashSet<>();
		String configuredDetectors = config.getString(DETECTOR_CLASSES);
		if (!StringUtils.isEmpty(configuredDetectors)) {
			detectorClazzs.addAll(Arrays.asList(configuredDetectors.split(",")));
		}
		LOGGER.info("Load detectors:{}", StringUtils.join(detectorClazzs, ","));
		this.detectors = new ArrayList<>(detectorClazzs.size());
		for (String clazz : detectorClazzs) {
			Detector detector = (Detector) Class.forName(clazz.trim()).newInstance();
			// register before opening, so closePlugins() also closes a detector whose open() failed.
			detectors.add(detector);
			detector.open(this);
		}

	}

	/**
	 * Instantiates and opens every resolver listed in {@link #RESOLVER_CLASSES}. Duplicate class
	 * names are loaded once; an empty or missing option yields an empty resolver list.
	 *
	 * @throws ClassNotFoundException if a configured resolver class can not be found
	 * @throws IllegalAccessException if a resolver class or its nullary constructor is not accessible
	 * @throws InstantiationException if a resolver class can not be instantiated
	 */
	@VisibleForTesting
	public void loadResolvers() throws ClassNotFoundException, IllegalAccessException, InstantiationException {
		Set<String> resolverClazzs = new HashSet<>();
		String configuredResolvers = config.getString(RESOLVER_CLASSES);
		if (!StringUtils.isEmpty(configuredResolvers)) {
			resolverClazzs.addAll(Arrays.asList(configuredResolvers.split(",")));
		}
		LOGGER.info("Load resolvers:{}", StringUtils.join(resolverClazzs, ","));
		this.resolvers = new ArrayList<>(resolverClazzs.size());
		for (String clazz : resolverClazzs) {
			Resolver resolver = (Resolver) Class.forName(clazz.trim()).newInstance();
			// register before opening, so closePlugins() also closes a resolver whose open() failed.
			resolvers.add(resolver);
			resolver.open(this);
		}
	}

	public JobID getJobID() {
		return jobID;
	}

	public MetricProvider getMetricProvider() {
		return metricProvider;
	}

	public RestServerClient getRestServerClient() {
		return restServerClient;
	}

	public Configuration getConfig() {
		return config;
	}

	public ScheduledExecutorService getExecutorService() {
		return executorService;
	}

	/**
	 * Returns the job config, fetching it through the rest client if no cached instance exists.
	 * The cached instance is dropped by {@link HealthChecker#run()} after every check round.
	 */
	public RestServerClient.JobConfig getJobConfig() {
		if (jobConfig == null) {
			jobConfig = restServerClient.getJobConfig(jobID);
		}
		return jobConfig;
	}

	public JobTopologyAnalyzer getJobTopologyAnalyzer() {
		return jobTopologyAnalyzer;
	}

	/**
	 * Returns the start execution time of the job, fetching it through {@link MetricUtils} when
	 * it has not been fetched since the last reset. The value is reset in every check round.
	 */
	public long getJobStartExecutionTime() {
		if (jobStartExecutionTime == Long.MAX_VALUE) {
			// check and set last start execution time.
			jobStartExecutionTime = MetricUtils.getStartExecuteTime(this);
		}
		return jobStartExecutionTime;
	}

	/**
	 * Health check for a job, which detects abnormal symptoms of the job with the loaded detectors
	 * and tries to resolve the abnormal status with the loaded resolvers.
	 */
	public class HealthChecker implements Runnable {

		/** Whether the plugins have to be (re)loaded in the next round; true for the first round. */
		private boolean forceReload = true;

		/** Job config seen by the last plugin reload, {@code null} before the first reload. */
		private Configuration lastJobConfig = null;

		/**
		 * Runs one check round. Nothing is allowed to escape, because a periodic task which
		 * throws is not executed again by a {@link ScheduledExecutorService}.
		 */
		@Override
		public void run() {
			try {
				check();
			} catch (Throwable e) {
				LOGGER.warn("Fail to check job status", e);
			} finally {
				// reset job config, also after a failed round, so that the next round fetches a
				// fresh one instead of working on a stale cached instance.
				jobConfig = null;
			}
		}

		/**
		 * Performs one check round: reloads the plugins if required, then detects symptoms,
		 * generates actions, selects one action and executes it. Returns early if the plugins
		 * fail to load, the monitor is disabled or no action was generated.
		 */
		public void check() {

			LOGGER.debug("Start to check job {}.", jobID);
			Configuration newConfig = getJobConfig().getConfig();

			if (!forceReload && policy != null) {
				forceReload = policy.reloadPlugin();
			}

			// a changed job config triggers a reload only outside of standalone mode.
			boolean jobConfigChanged = lastJobConfig != null
					&& !lastJobConfig.equals(newConfig)
					&& !config.getBoolean(HEALTH_MONITOR_STANDALONE_MODE_ENABLED);

			if (forceReload || jobConfigChanged) {

				// first reload succeed.
				forceReload = false;
				lastJobConfig = newConfig;

				// try to close plugins.
				closePlugins();

				// config changed, merge new config.
				mergeJobConfig(newConfig);

				// NOTE(review): start() also treats standalone mode as enabled, while this reads
				// the job config only - confirm that the difference is intended.
				isEnabled = newConfig.getBoolean(HEALTH_MONITOR_ENABLED);
				if (isEnabled) {
					try {
						loadPlugins();
					} catch (Throwable e) {
						LOGGER.error("Fail to reload plugins", e);
						// retry the reload in the next round.
						forceReload = true;
						return;
					}
				}
			}

			if (!isEnabled) {
				LOGGER.debug("Health monitor disabled.");
				return;
			}

			// force a re-fetch of the start execution time on the next access.
			jobStartExecutionTime = Long.MAX_VALUE;

			// 1. check abnormal symptoms.
			List<Symptom> symptoms = detectSymptoms();
			LOGGER.debug("Detected symptoms: {}.", symptoms);

			// 2. diagnose and generate resolve action.
			List<Action> actions = generateActions(symptoms);
			LOGGER.debug("Generated actions: {}.", actions);

			if (actions.isEmpty()) {
				return;
			}

			// 3. select an action to execute.
			Action action = null;
			try {
				action = actionSelector.accept(actions);
			} catch (Throwable e) {
				LOGGER.warn("Exception caught in action selector", e);
			}

			if (action != null) {
				LOGGER.info("Executing action {}, because of symptom: {}", action, symptoms);
				executeAction(action);
			} else {
				LOGGER.debug("No Action selected.");
			}

		}

		/** Collects the non-null symptoms of all detectors; a failing detector is logged and skipped. */
		private List<Symptom> detectSymptoms() {
			List<Symptom> symptoms = new LinkedList<>();
			for (Detector detector : detectors) {
				Symptom symptom = null;
				try {
					symptom = detector.detect();
				} catch (Throwable e) {
					LOGGER.warn("Exception caught in detector {}", detector, e);
				}
				if (symptom != null) {
					symptoms.add(symptom);
				}
			}
			return symptoms;
		}

		/** Collects the non-null actions of all resolvers; a failing resolver is logged and skipped. */
		private List<Action> generateActions(List<Symptom> symptoms) {
			List<Action> actions = new LinkedList<>();
			for (Resolver resolver : resolvers) {
				try {
					Action action = resolver.resolve(symptoms);
					if (action != null) {
						actions.add(action);
					}
				} catch (Throwable e) {
					LOGGER.warn("Exception caught in resolver {}", resolver, e);
				}
			}
			return actions;
		}

		/** Executes the action, reports the outcome to the action selector and counts it. */
		private void executeAction(Action action) {
			try {
				if (actionExecutor.execute(action)) {
					actionSelector.actionSucceed(action);
					successActionCount++;
				} else {
					actionSelector.actionFailed(action);
					failedActionCount++;
				}
			} catch (Throwable e) {
				LOGGER.warn("Action {} execution failed.", action, e);
				actionSelector.actionFailed(action);
				failedActionCount++;
			}
		}
	}

	/** Returns the loaded detectors, {@code null} if none are loaded at the moment. */
	@VisibleForTesting
	public List<Detector> getDetectors() {
		return detectors;
	}

	/** Returns the loaded resolvers, {@code null} if none are loaded at the moment. */
	@VisibleForTesting
	public List<Resolver> getResolvers() {
		return resolvers;
	}

	/** Returns the loaded action executor, {@code null} if none is loaded at the moment. */
	public ActionExecutor getActionExecutor() {
		return actionExecutor;
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy