// All downloads are free. The search and download functionality uses the official Maven repository.

// org.apache.flink.runtime.healthmanager.HealthManager (Maven / Gradle / Ivy)

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.runtime.healthmanager;

import org.apache.flink.api.common.JobID;
import org.apache.flink.configuration.ConfigOption;
import org.apache.flink.configuration.ConfigOptions;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.runtime.healthmanager.metrics.HealthManagerMetricGroup;
import org.apache.flink.runtime.healthmanager.metrics.MetricProvider;
import org.apache.flink.runtime.healthmanager.metrics.RestServerMetricProvider;
import org.apache.flink.runtime.metrics.MetricRegistry;
import org.apache.flink.runtime.util.ExecutorThreadFactory;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

/**
 * Health manager monitors status of the jobs.
 *
 * <p>A periodic task asks the rest server for the job list, starts a {@link HealthMonitor} for
 * every job that is not in a globally terminal state and has no monitor yet, and stops the
 * monitor of every job that is no longer reported as running.
 *
 * <p>NOTE(review): {@code jobMonitors} is a plain {@link HashMap} touched both by the checker
 * task and by {@link #stop()}. {@code stop()} cancels the checker but does not wait for a run
 * that is already in flight, so the two may overlap - confirm whether this needs guarding.
 */
public class HealthManager {

	private static final Logger LOGGER = LoggerFactory.getLogger(HealthManager.class);

	/** Interval (ms) between two job list checks; a non-positive value disables the check. */
	private static final ConfigOption<Long> JOB_CHECK_INTERVAL =
			ConfigOptions.key("healthmanager.job.check.interval.ms").defaultValue(10000L);

	/** Cluster configurations. */
	private final Configuration config;

	/** Executor services. */
	private final ScheduledExecutorService executorService;

	/** Metric Provider. */
	private final MetricProvider metricProvider;

	/** RestServer client. */
	private final RestServerClient restServerClient;

	/** Monitors of all jobs currently tracked as running, keyed by job id. */
	private final Map<JobID, HealthMonitor> jobMonitors = new HashMap<>();

	/** Handler of the job checker timed task; {@code null} until the checker is scheduled. */
	private ScheduledFuture<?> timedTaskHandler;

	/** MetricsGroup for HealthManager. */
	private final HealthManagerMetricGroup metricGroup;

	/**
	 * Creates the health manager together with its executor, metric group, rest server client
	 * and metric provider. Nothing is scheduled until {@link #start()} is called.
	 *
	 * @param restServerAddress address of the rest server to talk to
	 * @param metricRegistry registry used to create the health manager metric group
	 * @param config cluster configuration
	 * @throws Exception if the rest server client or the metric provider cannot be created; the
	 *         executor and the metric group created so far are released before rethrowing
	 */
	public HealthManager(
			String restServerAddress,
			MetricRegistry metricRegistry,
			Configuration config) throws Exception {

		this.config = config;

		this.executorService = new ScheduledThreadPoolExecutor(
				4, new ExecutorThreadFactory("health-manager"));

		this.metricGroup = new HealthManagerMetricGroup(metricRegistry);

		LOGGER.info("Starting Health manager with rest server:" + restServerAddress);

		RestServerClient client;
		MetricProvider provider;
		try {
			client = new RestServerClientImpl(restServerAddress, config, executorService);
			provider = new RestServerMetricProvider(config, client, executorService);
		} catch (Throwable t) {
			// Do not leave the executor or the metric group behind when construction fails.
			executorService.shutdownNow();
			metricGroup.close();
			throw t;
		}

		this.restServerClient = client;
		this.metricProvider = provider;
	}

	/**
	 * Opens the metric provider and, if the configured check interval is positive, schedules
	 * the periodic job list check.
	 */
	public void start() {
		LOGGER.info("Starting health manager.");

		// Open the provider before scheduling: the checker runs with zero initial delay on
		// another thread and hands the provider to every monitor it starts.
		this.metricProvider.open();

		long interval = config.getLong(JOB_CHECK_INTERVAL);
		if (interval > 0) {
			timedTaskHandler = executorService.scheduleAtFixedRate(
					new RunningJobListChecker(), 0, interval, TimeUnit.MILLISECONDS);
		}
	}

	/**
	 * Cancels the job checker, closes the metric provider, stops all job monitors, shuts down
	 * the executor and closes the metric group.
	 *
	 * <p>The executor and the metric group are released even if closing the provider or
	 * stopping a monitor throws; the exception is still propagated to the caller.
	 */
	public void stop() {
		LOGGER.info("Stopping health manager.");
		if (timedTaskHandler != null) {
			timedTaskHandler.cancel(true);
		}

		try {
			metricProvider.close();

			for (HealthMonitor monitor : jobMonitors.values()) {
				monitor.stop();
			}

			jobMonitors.clear();
		} finally {
			executorService.shutdown();
			metricGroup.close();
		}
	}

	/**
	 * Job checker which starts new HealthMonitor for new jobs and stops HealthMonitor for stopped jobs.
	 */
	private class RunningJobListChecker implements Runnable {

		/**
		 * Runs one check round. Never lets an exception escape, because a periodic task that
		 * throws has its subsequent executions suppressed by the scheduled executor.
		 */
		@Override
		public void run() {

			// Job id -> job name of every job that is not in a globally terminal state.
			Map<JobID, String> runningIds = new HashMap<>();

			try {
				restServerClient.listJob()
						.stream()
						.filter(status -> !status.getJobState().isGloballyTerminalState())
						.forEach(status -> runningIds.put(status.getJobId(), status.getJobName()));
			} catch (Throwable e) {
				// skip current round check since some wrong in rest server.
				LOGGER.warn("Wait rest server to be ready", e);
				return;
			}

			try {
				startMonitorsForNewJobs(runningIds);
				stopMonitorsForFinishedJobs(runningIds);
			} catch (Throwable e) {
				LOGGER.warn("Exception caught in job checker", e);
			}
		}

		/** Creates and starts a monitor for every running job that is not tracked yet. */
		private void startMonitorsForNewJobs(Map<JobID, String> runningIds) throws Exception {
			for (Map.Entry<JobID, String> job : runningIds.entrySet()) {
				JobID id = job.getKey();
				if (jobMonitors.containsKey(id)) {
					continue;
				}

				LOGGER.info("New job submitted, id:" + id);
				HealthMonitor newMonitor = new HealthMonitor(
						id,
						metricProvider,
						restServerClient,
						metricGroup.addJob(id, job.getValue()),
						executorService, config);
				try {
					newMonitor.start();
				} catch (Exception e) {
					LOGGER.info("Fail to start monitor for job:" + id, e);
					// The job is not tracked, so undo the metric group registration made
					// above; the next round registers it again when it retries this job.
					metricGroup.removeJob(id);
					continue;
				}

				jobMonitors.put(id, newMonitor);
			}
		}

		/** Unregisters and stops the monitor of every tracked job that is no longer running. */
		private void stopMonitorsForFinishedJobs(Map<JobID, String> runningIds) throws Exception {
			// Collect first: entries must not be removed while iterating over the key set.
			List<JobID> finishedJob = new LinkedList<>();
			for (JobID id : jobMonitors.keySet()) {
				if (!runningIds.containsKey(id)) {
					LOGGER.info("New job finished or failed, id:" + id);
					finishedJob.add(id);
					metricGroup.removeJob(id);
				}
			}

			for (JobID id : finishedJob) {
				jobMonitors.remove(id).stop();
			}
		}
	}
}




// © 2015 - 2024 Weber Informatics LLC | Privacy Policy