// org.apache.flink.runtime.resourcemanager.autoscale.plugins.calculators.ExponentiallyClusterScaler (Maven / Gradle / Ivy)
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.runtime.resourcemanager.autoscale.plugins.calculators;
import org.apache.flink.configuration.ConfigOption;
import org.apache.flink.configuration.ConfigOptions;
import org.apache.flink.runtime.resourcemanager.autoscale.ResourceAutoScaler;
import org.apache.flink.runtime.resourcemanager.autoscale.plugins.ResourceCalculator;
import org.apache.flink.runtime.resourcemanager.autoscale.plugins.ResourceSymptom;
import org.apache.flink.runtime.resourcemanager.autoscale.plugins.symptoms.ClusterHighBlockRequests;
import org.apache.flink.runtime.resourcemanager.autoscale.plugins.symptoms.ClusterHighCpu;
import org.apache.flink.runtime.resourcemanager.autoscale.plugins.symptoms.ClusterHighMemory;
import org.apache.flink.runtime.resourcemanager.autoscale.plugins.symptoms.ClusterLongUnfulfilledQueue;
import org.apache.flink.runtime.resourcemanager.autoscale.plugins.symptoms.ClusterLowCpu;
import org.apache.flink.runtime.resourcemanager.autoscale.plugins.symptoms.ClusterLowMemory;
import org.apache.flink.runtime.resourcemanager.autoscale.plugins.symptoms.ClusterShortUnfulfilledQueue;
import org.apache.flink.runtime.resourcemanager.autoscale.plugins.symptoms.ClusterZeroBlockRequests;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.List;
/**
 * Exponential scaler that computes the target TaskManager (TM) count for cluster resource auto-scaling.
 *
 * <p>It increases the TM number exponentially on scale-out (the step doubles on every consecutive
 * scale-out decision), while it decreases the TM number linearly (one TM at a time) on scale-in.
 */
public class ExponentiallyClusterScaler implements ResourceCalculator {

	private static final Logger LOGGER = LoggerFactory.getLogger(ExponentiallyClusterScaler.class);

	/** Initial number of TMs added by a scale-out; the step is reset to this value whenever no scale-out is triggered. */
	public static final ConfigOption<Integer> EXPONENTIAL_SCALE_INIT_STEP =
		ConfigOptions.key("session-auto-scale.exponential-scale.init-step").defaultValue(2);

	private ResourceAutoScaler resourceAutoScaler;

	/** Number of TMs the next scale-out will add. */
	private int nowScaleOutStep;

	// Symptoms found by the most recent diagnose() call; each one is null when it was not reported.
	private ClusterHighCpu clusterHighCpu;
	private ClusterLowCpu clusterLowCpu;
	private ClusterHighMemory clusterHighMemory;
	private ClusterLowMemory clusterLowMemory;
	private ClusterLongUnfulfilledQueue clusterLongUnfulfilledQueue;
	private ClusterShortUnfulfilledQueue clusterShortUnfulfilledQueue;
	private ClusterHighBlockRequests clusterHighBlockRequests;
	private ClusterZeroBlockRequests clusterZeroBlockRequests;

	/**
	 * Stores the auto scaler and initializes the scale-out step from its configuration.
	 *
	 * @param resourceAutoScaler the auto scaler whose configuration supplies the initial step and the TM limits
	 */
	@Override
	public void open(ResourceAutoScaler resourceAutoScaler) {
		this.resourceAutoScaler = resourceAutoScaler;
		nowScaleOutStep = resourceAutoScaler.getConfig().getInteger(EXPONENTIAL_SCALE_INIT_STEP);
	}

	/** Nothing to release. */
	@Override
	public void close() {
	}

	/**
	 * Calculates the target number of TMs from the given symptoms.
	 *
	 * <p>Any "high"/"long" symptom triggers a scale-out by the current step and doubles the step.
	 * Otherwise the step is reset to its initial value, and a scale-in by one TM is triggered only
	 * when all "low"/"short"/"zero" symptoms are present. The result is always clamped to the
	 * configured cluster TM limits.
	 *
	 * @param symptomList the symptoms to evaluate
	 * @param currentTaskManagers the current number of TMs
	 * @return the target number of TMs, clamped to the configured min/max limits
	 */
	@Override
	public int calculate(List<ResourceSymptom> symptomList, int currentTaskManagers) {
		diagnose(symptomList);

		if (clusterHighMemory != null || clusterHighCpu != null ||
			clusterLongUnfulfilledQueue != null || clusterHighBlockRequests != null) {
			LOGGER.info("Detect event {} {} {} {}, trigger scale-out with current tm {} and scale step {}.",
				clusterHighCpu, clusterHighMemory, clusterHighBlockRequests, clusterLongUnfulfilledQueue, currentTaskManagers, nowScaleOutStep);

			// Computed in long so that a huge TM count plus step cannot wrap around before clamping.
			final long scaledOut = (long) currentTaskManagers + nowScaleOutStep;

			// Exponentially update the scale out step, capped at the max TM limit.
			// The doubling is done in long to avoid int overflow turning the step negative.
			final int maxTaskManagers =
				resourceAutoScaler.getConfig().getInteger(ResourceAutoScaler.RESOURCE_LIMIT_CLUSTER_MAX_TM);
			nowScaleOutStep = (int) Math.min((long) nowScaleOutStep * 2, (long) maxTaskManagers);

			return clampToClusterLimits(scaledOut);
		}

		// If not trigger scale-out, we need to reset the scale-out step to the init state.
		nowScaleOutStep = resourceAutoScaler.getConfig().getInteger(EXPONENTIAL_SCALE_INIT_STEP);

		if (clusterZeroBlockRequests != null && clusterShortUnfulfilledQueue != null &&
			clusterLowMemory != null && clusterLowCpu != null) {
			LOGGER.info("Detect event {} {} {} {}, trigger scale-in with current tm {} and scale step {}.",
				clusterZeroBlockRequests, clusterShortUnfulfilledQueue, clusterLowMemory, clusterLowCpu, currentTaskManagers, 1);
			return clampToClusterLimits((long) currentTaskManagers - 1);
		}

		return clampToClusterLimits(currentTaskManagers);
	}

	/**
	 * Resets all remembered symptoms and records those contained in the given list.
	 * If a symptom type occurs more than once, the last occurrence wins.
	 *
	 * @param symptomList the symptoms to classify
	 * @return always {@code true}
	 */
	public boolean diagnose(List<ResourceSymptom> symptomList) {
		clusterHighCpu = null;
		clusterLowCpu = null;
		clusterHighMemory = null;
		clusterLowMemory = null;
		clusterLongUnfulfilledQueue = null;
		clusterShortUnfulfilledQueue = null;
		clusterHighBlockRequests = null;
		clusterZeroBlockRequests = null;

		for (ResourceSymptom symptom : symptomList) {
			if (symptom instanceof ClusterHighMemory) {
				clusterHighMemory = (ClusterHighMemory) symptom;
				LOGGER.debug("Cluster high memory detected.");
			} else if (symptom instanceof ClusterLowMemory) {
				clusterLowMemory = (ClusterLowMemory) symptom;
				LOGGER.debug("Cluster low memory detected.");
			} else if (symptom instanceof ClusterHighCpu) {
				clusterHighCpu = (ClusterHighCpu) symptom;
				LOGGER.debug("Cluster high cpu detected.");
			} else if (symptom instanceof ClusterLowCpu) {
				clusterLowCpu = (ClusterLowCpu) symptom;
				LOGGER.debug("Cluster low cpu detected.");
			} else if (symptom instanceof ClusterZeroBlockRequests) {
				clusterZeroBlockRequests = (ClusterZeroBlockRequests) symptom;
				LOGGER.debug("Cluster zero block request detected.");
			} else if (symptom instanceof ClusterHighBlockRequests) {
				clusterHighBlockRequests = (ClusterHighBlockRequests) symptom;
				LOGGER.debug("Cluster high block request detected.");
			} else if (symptom instanceof ClusterLongUnfulfilledQueue) {
				clusterLongUnfulfilledQueue = (ClusterLongUnfulfilledQueue) symptom;
				LOGGER.debug("Cluster long unfulfilled queue detected.");
			} else if (symptom instanceof ClusterShortUnfulfilledQueue) {
				clusterShortUnfulfilledQueue = (ClusterShortUnfulfilledQueue) symptom;
				LOGGER.debug("Cluster short unfulfilled queue detected.");
			}
		}
		return true;
	}

	/**
	 * Clamps the given TM count to the configured cluster limits: first caps it at the max limit,
	 * then raises it to the min limit (so the min limit wins if the two limits conflict).
	 */
	private int clampToClusterLimits(long taskManagers) {
		final int maxTaskManagers =
			resourceAutoScaler.getConfig().getInteger(ResourceAutoScaler.RESOURCE_LIMIT_CLUSTER_MAX_TM);
		final int minTaskManagers =
			resourceAutoScaler.getConfig().getInteger(ResourceAutoScaler.RESOURCE_LIMIT_CLUSTER_MIN_TM);
		// The result is bounded by two int values, so narrowing back to int is lossless.
		return (int) Math.max(Math.min(taskManagers, (long) maxTaskManagers), (long) minTaskManagers);
	}
}