// org.apache.flink.runtime.resourcemanager.autoscale.plugins.calculators.TargetTrackerScaler
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.runtime.resourcemanager.autoscale.plugins.calculators;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.configuration.TaskManagerOptions;
import org.apache.flink.runtime.resourcemanager.autoscale.ResourceAutoScaler;
import org.apache.flink.runtime.resourcemanager.autoscale.plugins.ResourceCalculator;
import org.apache.flink.runtime.resourcemanager.autoscale.plugins.ResourceSymptom;
import org.apache.flink.runtime.resourcemanager.autoscale.plugins.detectors.ClusterCpuUsageDetector;
import org.apache.flink.runtime.resourcemanager.autoscale.plugins.detectors.ClusterMemoryUsageDetector;
import org.apache.flink.runtime.resourcemanager.autoscale.plugins.detectors.ClusterUnfulfilledQueueDetector;
import org.apache.flink.runtime.resourcemanager.autoscale.plugins.symptoms.ClusterHighBlockRequests;
import org.apache.flink.runtime.resourcemanager.autoscale.plugins.symptoms.ClusterHighCpu;
import org.apache.flink.runtime.resourcemanager.autoscale.plugins.symptoms.ClusterHighMemory;
import org.apache.flink.runtime.resourcemanager.autoscale.plugins.symptoms.ClusterLongUnfulfilledQueue;
import org.apache.flink.runtime.resourcemanager.autoscale.plugins.symptoms.ClusterLowCpu;
import org.apache.flink.runtime.resourcemanager.autoscale.plugins.symptoms.ClusterLowMemory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.List;
/**
* Target tracker scaler that resolves cluster resource auto-scaling.
* It scales the cluster out/in to an estimated TM number when a tracked metric rises above/falls below its expected threshold,
* e.g. when a tracked metric grows above the high threshold, it estimates the TM number needed to bring the metric back down to the high threshold.
* The estimated number is also restricted by {@link ResourceAutoScaler#RESOURCE_LIMIT_CLUSTER_MAX_TM}
* and {@link ResourceAutoScaler#RESOURCE_LIMIT_CLUSTER_MIN_TM}.
* It currently does not support the block request number metric.
*/
public class TargetTrackerScaler implements ResourceCalculator {

    private static final Logger LOGGER = LoggerFactory.getLogger(TargetTrackerScaler.class);

    private ResourceAutoScaler resourceAutoScaler;
    private Configuration configuration;

    // Symptoms picked out of the list given to the most recent diagnose() call; null when absent.
    private ClusterHighCpu clusterHighCpu;
    private ClusterLowCpu clusterLowCpu;
    private ClusterHighMemory clusterHighMemory;
    private ClusterLowMemory clusterLowMemory;
    private ClusterLongUnfulfilledQueue clusterLongUnfulfilledQueue;
    private ClusterHighBlockRequests clusterHighBlockRequests;

    // Thresholds and slot count read from the configuration in open().
    private double cpuHighThreshold;
    private double cpuLowThreshold;
    private double memoryHighThreshold;
    private double memoryLowThreshold;
    private int unfulfilledQueueHighThreshold;
    private int taskManagerSlots;

    /**
     * Stores the auto scaler and caches the thresholds and the slots-per-TM setting
     * from its configuration.
     *
     * @param resourceAutoScaler the auto scaler whose configuration is read
     */
    @Override
    public void open(ResourceAutoScaler resourceAutoScaler) {
        this.resourceAutoScaler = resourceAutoScaler;
        this.configuration = resourceAutoScaler.getConfig();

        this.cpuHighThreshold = configuration.getDouble(ClusterCpuUsageDetector.HIGH_CPU_THRESHOLD);
        this.cpuLowThreshold = configuration.getDouble(ClusterCpuUsageDetector.LOW_CPU_THRESHOLD);
        this.memoryHighThreshold = configuration.getDouble(ClusterMemoryUsageDetector.HIGH_MEM_THRESHOLD);
        this.memoryLowThreshold = configuration.getDouble(ClusterMemoryUsageDetector.LOW_MEM_THRESHOLD);
        this.unfulfilledQueueHighThreshold =
            configuration.getInteger(ClusterUnfulfilledQueueDetector.HIGH_UNFULFILLED_QUEUE_LENGTH_THRESHOLD);
        this.taskManagerSlots = configuration.getInteger(TaskManagerOptions.NUM_TASK_SLOTS);
    }

    /** Nothing to release. */
    @Override
    public void close() {
    }

    /**
     * Estimates the target number of task managers from the given symptoms.
     *
     * <p>If any of high CPU, high memory or long unfulfilled queue is present, the target is the
     * maximum of the current number and the estimate of each present symptom. Otherwise, if no
     * high-block-requests symptom is present and both low CPU and low memory are present, the
     * target is the larger of the two low-usage estimates. In every other case the current number
     * is kept. The result is finally clamped to the configured maximum and minimum TM limits.
     *
     * @param symptomList the symptoms detected for the cluster
     * @param currentTaskManagers the current number of task managers
     * @return the target number of task managers
     */
    @Override
    public int calculate(List<ResourceSymptom> symptomList, int currentTaskManagers) {
        diagnose(symptomList);

        int targetTaskManagers = currentTaskManagers;

        if (clusterHighMemory != null || clusterHighCpu != null || clusterLongUnfulfilledQueue != null) {
            LOGGER.info("Detect event {} {} {}, trigger scale-out with current tm {}.",
                clusterHighCpu, clusterHighMemory, clusterLongUnfulfilledQueue, currentTaskManagers);

            // The target TM number should be the maximum of all estimated numbers.
            if (clusterHighCpu != null) {
                targetTaskManagers = Math.max(targetTaskManagers,
                    (int) Math.ceil(currentTaskManagers * clusterHighCpu.getUtility() / cpuHighThreshold));
            }
            if (clusterHighMemory != null) {
                targetTaskManagers = Math.max(targetTaskManagers,
                    (int) Math.ceil(currentTaskManagers * clusterHighMemory.getUtility() / memoryHighThreshold));
            }
            if (clusterLongUnfulfilledQueue != null) {
                targetTaskManagers = Math.max(targetTaskManagers,
                    currentTaskManagers + estimateExtraTaskManagersForQueue());
            }
        } else if (clusterHighBlockRequests == null && (clusterLowMemory != null && clusterLowCpu != null)) {
            // We trigger scale in only when both cpu and memory are lower than their thresholds.
            LOGGER.info("Detect event {} {}, trigger scale-down with current tm {}.",
                clusterLowMemory, clusterLowCpu, currentTaskManagers);

            // The target TM number should be the maximum of all estimated numbers.
            targetTaskManagers = Math.max(
                (int) Math.ceil(currentTaskManagers * clusterLowMemory.getUtility() / memoryLowThreshold),
                (int) Math.ceil(currentTaskManagers * clusterLowCpu.getUtility() / cpuLowThreshold));
        }

        // The minimum limit is applied last, so it wins if the two limits conflict.
        targetTaskManagers = Math.min(targetTaskManagers,
            resourceAutoScaler.getConfig().getInteger(ResourceAutoScaler.RESOURCE_LIMIT_CLUSTER_MAX_TM));
        targetTaskManagers = Math.max(targetTaskManagers,
            resourceAutoScaler.getConfig().getInteger(ResourceAutoScaler.RESOURCE_LIMIT_CLUSTER_MIN_TM));
        return targetTaskManagers;
    }

    /**
     * Estimates how many additional task managers are needed to absorb the part of the
     * unfulfilled queue that exceeds the configured high threshold.
     *
     * <p>Must only be called while {@code clusterLongUnfulfilledQueue} is non-null.
     */
    private int estimateExtraTaskManagersForQueue() {
        // Guard the divisor: a slot count below 1 would otherwise produce Infinity/NaN.
        int slotsPerTaskManager = Math.max(1, taskManagerSlots);
        double excessRequests = clusterLongUnfulfilledQueue.getLength() - unfulfilledQueueHighThreshold;
        // Divide in floating point so that Math.ceil can round a partially used TM up;
        // an integer quotient would already be truncated before ceil sees it.
        return (int) Math.ceil(excessRequests / slotsPerTaskManager);
    }

    /**
     * Clears the symptoms remembered from the previous call and records the symptoms found in
     * the given list by type. When several symptoms of the same type are present, the last one
     * in the list is kept. Symptoms of other types are ignored.
     *
     * @param symptomList the symptoms to inspect
     * @return always {@code true}
     */
    public boolean diagnose(List<ResourceSymptom> symptomList) {
        clusterHighCpu = null;
        clusterLowCpu = null;
        clusterHighMemory = null;
        clusterLowMemory = null;
        clusterLongUnfulfilledQueue = null;
        clusterHighBlockRequests = null;

        for (ResourceSymptom symptom : symptomList) {
            if (symptom instanceof ClusterHighMemory) {
                clusterHighMemory = (ClusterHighMemory) symptom;
                LOGGER.debug("Cluster high memory detected.");
            } else if (symptom instanceof ClusterLowMemory) {
                clusterLowMemory = (ClusterLowMemory) symptom;
                LOGGER.debug("Cluster low memory detected.");
            } else if (symptom instanceof ClusterHighCpu) {
                clusterHighCpu = (ClusterHighCpu) symptom;
                LOGGER.debug("Cluster high cpu detected.");
            } else if (symptom instanceof ClusterLowCpu) {
                clusterLowCpu = (ClusterLowCpu) symptom;
                LOGGER.debug("Cluster low cpu detected.");
            } else if (symptom instanceof ClusterHighBlockRequests) {
                clusterHighBlockRequests = (ClusterHighBlockRequests) symptom;
                LOGGER.debug("Cluster high block request detected.");
            } else if (symptom instanceof ClusterLongUnfulfilledQueue) {
                clusterLongUnfulfilledQueue = (ClusterLongUnfulfilledQueue) symptom;
                LOGGER.debug("Cluster long unfulfilled queue detected.");
            }
        }
        return true;
    }
}