// Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price only 1 $
// You can buy this project and download/modify it as often as you want.
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.runtime.resourcemanager.autoscale;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.ConfigOption;
import org.apache.flink.configuration.ConfigOptions;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.configuration.ResourceManagerOptions;
import org.apache.flink.runtime.clusterframework.types.ResourceProfile;
import org.apache.flink.runtime.resourcemanager.ResourceManager;
import org.apache.flink.runtime.resourcemanager.autoscale.hotupdate.HotUpdateConfigRetrievalListener;
import org.apache.flink.runtime.resourcemanager.autoscale.hotupdate.HotUpdateConfigRetrievalService;
import org.apache.flink.runtime.resourcemanager.autoscale.plugins.ResourceCalculator;
import org.apache.flink.runtime.resourcemanager.autoscale.plugins.ResourceDetector;
import org.apache.flink.runtime.resourcemanager.autoscale.plugins.ResourceSymptom;
import org.apache.flink.runtime.resourcemanager.autoscale.plugins.calculators.ExponentiallyClusterScaler;
import org.apache.flink.runtime.resourcemanager.autoscale.plugins.detectors.ClusterBlockRequestDetector;
import org.apache.flink.runtime.resourcemanager.autoscale.plugins.detectors.ClusterCpuUsageDetector;
import org.apache.flink.runtime.resourcemanager.autoscale.plugins.detectors.ClusterMemoryUsageDetector;
import org.apache.flink.runtime.resourcemanager.autoscale.plugins.detectors.ClusterUnfulfilledQueueDetector;
import org.apache.flink.runtime.resourcemanager.autoscale.utils.HotUpdateConfiguration;
import org.apache.flink.runtime.resourcemanager.autoscale.utils.SlotManagerInfo;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.Nullable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
import static org.apache.flink.configuration.ConfigOptions.key;
/**
* ResourceManager plugin to support Auto-Scale in session mode.
*/
public class ResourceAutoScaler {
private static final Logger LOGGER = LoggerFactory.getLogger(ResourceAutoScaler.class);
public static final ConfigOption SCALER_CHECK_INTERNAL =
ConfigOptions.key("session-auto-scale.check.interval.ms").defaultValue(10000L);
private static final ConfigOption METRIC_FETCH_INTERVAL_OPTION =
ConfigOptions.key("session-auto-scale.metric.fetch.interval.ms").defaultValue(10_000L);
private static final ConfigOption WARMUP_PERIOD =
ConfigOptions.key("session-auto-scale.check.warmup.ms").defaultValue(180_000L);
private static final ConfigOption COOL_DOWN_PERIOD =
ConfigOptions.key("session-auto-scale.rescale.cool-down.ms").defaultValue(1000L);
public static final ConfigOption METRIC_UPDATE_INTERVAL =
ConfigOptions.key("session-auto-scale.metric.update.interval.ms").defaultValue(180_000);
public static final ConfigOption RESOURCE_DETECTOR_CLASSES =
ConfigOptions.key("session-auto-scale.detector.classes").noDefaultValue();
public static final ConfigOption RESOURCE_CALCULATOR_CLASS =
ConfigOptions.key("session-auto-scale.calculator.class").noDefaultValue();
public static final ConfigOption RESOURCE_LIMIT_CLUSTER_MAX_TM =
ConfigOptions.key("session-auto-scale.resource.limit.cluster.tm.max").defaultValue(Integer.MAX_VALUE);
public static final ConfigOption RESOURCE_LIMIT_CLUSTER_MIN_TM =
ConfigOptions.key("session-auto-scale.resource.limit.cluster.tm.min").defaultValue(0);
public static final ConfigOption RESOURCE_DESIRE_TM =
ConfigOptions.key("session-auto-scale.resource.cluster.tm.desire").defaultValue(-1);
public static final ConfigOption HOTUPDATE_CONFIG_PATH =
key("session-auto-scale.hot-update.config.key")
.defaultValue("hotupdate");
private Configuration config;
private ScheduledExecutorService executorService;
private ScheduledFuture timedTaskHandler;
private ScheduledFuture fetchTaskHandler;
private final ResourceManager resourceManager;
private final HotUpdateConfigRetrievalService hotUpdateConfigRetrievalService;
private List detectors;
private ResourceCalculator resourceCalculator;
public ResourceAutoScaler(
ResourceManager resourceManager,
ScheduledExecutorService executorService,
Configuration config,
@Nullable HotUpdateConfigRetrievalService hotUpdateConfigRetrievalService) {
this.resourceManager = resourceManager;
this.executorService = executorService;
this.config = config;
this.hotUpdateConfigRetrievalService = hotUpdateConfigRetrievalService;
}
public void start() throws Exception {
LOGGER.info("Starting to monitor cluster resource.");
loadPlugins();
long checkInterval = config.getLong(SCALER_CHECK_INTERNAL);
long fetchIntervalMS = config.getLong(METRIC_FETCH_INTERVAL_OPTION);
long warmUpPeriod = config.getLong(WARMUP_PERIOD);
long coolDownPeriod = config.getLong(COOL_DOWN_PERIOD);
if (checkInterval > 0) {
timedTaskHandler = executorService.scheduleAtFixedRate(
new AutoScaleChecker(coolDownPeriod), warmUpPeriod, checkInterval, TimeUnit.MILLISECONDS);
}
if (fetchIntervalMS > 0) {
fetchTaskHandler = executorService.scheduleAtFixedRate(
new MetricFetcher(), warmUpPeriod, fetchIntervalMS, TimeUnit.MILLISECONDS);
}
if (hotUpdateConfigRetrievalService != null) {
hotUpdateConfigRetrievalService.start(new ResourceAutoScalerHotUpdateConfigRetrievalListener());
}
}
private void loadPlugins() throws ClassNotFoundException, IllegalAccessException, InstantiationException {
// try to close old plugins first.
closePlugins();
// load new plugins.
loadResourceDetectors();
loadResourceCalculator();
}
private void loadResourceDetectors() throws ClassNotFoundException, IllegalAccessException, InstantiationException {
Set detectorClazzs = new HashSet<>();
if (config.getString(RESOURCE_DETECTOR_CLASSES) != null) {
detectorClazzs.addAll(Arrays.asList(config.getString(RESOURCE_DETECTOR_CLASSES).split(",")));
} else {
if (config.getBoolean(ResourceManagerOptions.ENABLE_SESSION_AUTO_SCALE)) {
detectorClazzs.add(ClusterBlockRequestDetector.class.getCanonicalName());
detectorClazzs.add(ClusterCpuUsageDetector.class.getCanonicalName());
detectorClazzs.add(ClusterMemoryUsageDetector.class.getCanonicalName());
detectorClazzs.add(ClusterUnfulfilledQueueDetector.class.getCanonicalName());
}
}
LOGGER.info("Load detectors:" + StringUtils.join(detectorClazzs, ","));
this.detectors = new ArrayList<>(detectorClazzs.size());
for (String clazz : detectorClazzs) {
ResourceDetector detector = (ResourceDetector) Class.forName(clazz.trim()).newInstance();
detectors.add(detector);
detector.open(this);
}
}
private void loadResourceCalculator() throws ClassNotFoundException, IllegalAccessException, InstantiationException {
if (config.getString(RESOURCE_CALCULATOR_CLASS) != null) {
this.resourceCalculator = (ResourceCalculator) Class.forName(config.getString(RESOURCE_CALCULATOR_CLASS)).newInstance();
} else {
this.resourceCalculator = ExponentiallyClusterScaler.class.newInstance();
}
this.resourceCalculator.open(this);
}
public void stop() throws Exception {
if (timedTaskHandler != null) {
timedTaskHandler.cancel(true);
}
if (fetchTaskHandler != null) {
fetchTaskHandler.cancel(true);
}
if (hotUpdateConfigRetrievalService != null) {
hotUpdateConfigRetrievalService.stop();
}
closePlugins();
}
private void closePlugins() {
if (detectors != null) {
for (ResourceDetector detector : detectors) {
detector.close();
}
detectors.clear();
detectors = null;
}
if (resourceCalculator != null) {
resourceCalculator.close();
resourceCalculator = null;
}
}
public Configuration getConfig() {
return config;
}
private int calculateTargetTaskExecutorNumber() {
// We need to trigger rescale when the desire number not set in case the maximum/minimum
// number of taskExecutor changed and current number not fix in new range.
int targetTaskExecutorNumber = config.getInteger(RESOURCE_DESIRE_TM) == -1 ?
resourceManager.getCurrentTaskManagersNumber() : config.getInteger(RESOURCE_DESIRE_TM);
targetTaskExecutorNumber = Math.min(targetTaskExecutorNumber, config.getInteger(ResourceAutoScaler.RESOURCE_LIMIT_CLUSTER_MAX_TM));
targetTaskExecutorNumber = Math.max(targetTaskExecutorNumber, config.getInteger(ResourceAutoScaler.RESOURCE_LIMIT_CLUSTER_MIN_TM));
return targetTaskExecutorNumber;
}
public void hotUpdateConfiguration(HotUpdateConfiguration hotUpdateConfiguration) throws Exception {
int newMaxTaskExecutor = config.getInteger(RESOURCE_LIMIT_CLUSTER_MAX_TM);
if (hotUpdateConfiguration.containMaxTaskExecutors()) {
newMaxTaskExecutor = hotUpdateConfiguration.getMaxTaskExecutors();
}
int newMinTaskExecutor = config.getInteger(RESOURCE_LIMIT_CLUSTER_MIN_TM);
if (hotUpdateConfiguration.containMinTaskExecutors()) {
newMinTaskExecutor = hotUpdateConfiguration.getMinTaskExecutors();
}
if (newMaxTaskExecutor >= newMinTaskExecutor && newMaxTaskExecutor >= 0 && newMinTaskExecutor >= 0) {
config.setInteger(RESOURCE_LIMIT_CLUSTER_MAX_TM, newMaxTaskExecutor);
config.setInteger(RESOURCE_LIMIT_CLUSTER_MIN_TM, newMinTaskExecutor);
} else {
LOGGER.warn("Fail to check sanity of new resource limit configuration, will" +
" not change with new max task executor number {} and new min task executor number {}",
newMaxTaskExecutor, newMinTaskExecutor);
}
if (hotUpdateConfiguration.containDesireTaskExecutors()) {
config.setInteger(RESOURCE_DESIRE_TM, hotUpdateConfiguration.getDesireTaskExecutors());
}
if (hotUpdateConfiguration.shouldTriggerRescale()) {
resourceManager.getRpcService().execute(() -> resourceManager.rescale(calculateTargetTaskExecutorNumber())).get();
}
}
/**
* Health check for the cluster, which detects abnormal symptoms of cluster with resource detectors and tries to
* resolve abnormal status with registered resource calculator.
*/
private class AutoScaleChecker implements Runnable {
long coolDownPeriod;
long lastRescaleTimeStamp;
AutoScaleChecker(long coolDownPeriod) {
this.coolDownPeriod = coolDownPeriod;
this.lastRescaleTimeStamp = 0;
}
@Override
public void run() {
try {
check();
} catch (Throwable e) {
LOGGER.warn("Fail to check job status", e);
}
}
public void check() throws Exception {
LOGGER.debug("Start to check cluster.");
if (System.currentTimeMillis() < lastRescaleTimeStamp + coolDownPeriod) {
LOGGER.info("Still in cool down period, skip the rescale check.");
return;
}
List symptoms = new LinkedList<>();
// 1. check abnormal symptoms.
for (ResourceDetector detector: detectors) {
ResourceSymptom symptom = null;
try {
symptom = detector.detect();
} catch (Throwable e) {
LOGGER.warn("Exception caught in detector " + detector, e);
}
if (symptom != null) {
symptoms.add(symptom);
}
}
LOGGER.debug("Detected symptoms: {}.", symptoms);
// 2. diagnose and calculate target TaskExecutor number.
int currentTaskExecutorNumber = resourceManager.getRpcService().execute(resourceManager::getCurrentTaskManagersNumber).get();
int targetTaskExecutorNumber = resourceCalculator.calculate(symptoms, currentTaskExecutorNumber);
LOGGER.debug("Calculate target TaskExecutor number: {}.", targetTaskExecutorNumber);
if (targetTaskExecutorNumber == currentTaskExecutorNumber) {
return;
}
lastRescaleTimeStamp = System.currentTimeMillis();
LOGGER.info("Rescale the TaskExecutors from {} to {}.", currentTaskExecutorNumber, targetTaskExecutorNumber);
resourceManager.getRpcService().execute(() -> resourceManager.rescale(targetTaskExecutorNumber)).get();
}
}
/**
* Metrics fetcher for the detectors, which fetch metrics and update detectors' state periodically.
*/
private class MetricFetcher implements Runnable {
@Override
public void run() {
SlotManagerInfo slotManagerInfo;
Collection> taskManagersProfiles;
try {
slotManagerInfo =
resourceManager.getRpcService().execute(resourceManager::requestSlotManagerInfo).get();
taskManagersProfiles = resourceManager.getRpcService()
.execute(resourceManager::getTaskManagerResourceProfile).get();
} catch (Exception ignored) {
LOGGER.warn("Fail to fetch cluster metrics", ignored);
return;
}
for (ResourceDetector detector : detectors) {
detector.update(slotManagerInfo, taskManagersProfiles);
}
}
}
private class ResourceAutoScalerHotUpdateConfigRetrievalListener implements HotUpdateConfigRetrievalListener {
@Override
public void notifyUpdateConfig(@Nullable HotUpdateConfiguration hotUpdateConfig) {
try {
hotUpdateConfiguration(hotUpdateConfig);
} catch (Exception exception) {
LOGGER.error("Error occurred when try to hot-update configuration.", exception);
}
}
@Override
public void handleError(Exception exception) {
LOGGER.error("Error occurred when listen to hot-update configuration.", exception);
}
}
}