// org.apache.flink.runtime.healthmanager.plugins.actions.AdjustJobConfig
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.runtime.healthmanager.plugins.actions;
import org.apache.flink.api.common.JobID;
import org.apache.flink.api.common.operators.ResourceSpec;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.runtime.execution.ExecutionState;
import org.apache.flink.runtime.healthmanager.RestServerClient;
import org.apache.flink.runtime.healthmanager.metrics.MetricProvider;
import org.apache.flink.runtime.healthmanager.plugins.Action;
import org.apache.flink.runtime.healthmanager.plugins.utils.HealthMonitorOptions;
import org.apache.flink.runtime.healthmanager.plugins.utils.MaxResourceLimitUtil;
import org.apache.flink.runtime.jobgraph.JobVertexID;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
/**
* Adjust resource and parallelism config for given vertex.
*/
public class AdjustJobConfig implements Action {
private static final Logger LOGGER = LoggerFactory.getLogger(AdjustJobConfig.class);
protected static long maxSleepTime = 60_000L;
protected JobID jobID;
protected Map currentParallelism;
protected Map targetParallelism;
protected Map currentResource;
protected Map targetResource;
protected long timeoutMs;
protected ActionMode actionMode;
public AdjustJobConfig(JobID jobID, long timeoutMs) {
this(jobID, timeoutMs, new HashMap<>(), new HashMap<>(), new HashMap<>(), new HashMap<>(), ActionMode.IMMEDIATE);
}
public AdjustJobConfig(
JobID jobID,
long timeoutMs,
Map currentParallelism,
Map targetParallelism,
Map currentResource,
Map targetResource) {
this(jobID, timeoutMs, currentParallelism, targetParallelism, currentResource, targetResource, ActionMode.IMMEDIATE);
}
public AdjustJobConfig(AdjustJobConfig other) {
this(other.jobID, other.timeoutMs);
for (JobVertexID vertexID : other.currentParallelism.keySet()) {
addVertex(
vertexID,
other.currentParallelism.get(vertexID),
other.targetParallelism.get(vertexID),
other.currentResource.get(vertexID),
other.targetResource.get(vertexID));
}
this.actionMode = other.actionMode;
}
public AdjustJobConfig(
JobID jobID,
long timeoutMs,
Map currentParallelism,
Map targetParallelism,
Map currentResource,
Map targetResource,
ActionMode actionMode) {
this.jobID = jobID;
this.timeoutMs = timeoutMs;
this.currentParallelism = currentParallelism;
this.currentResource = currentResource;
this.targetParallelism = targetParallelism;
this.targetResource = targetResource;
this.actionMode = actionMode;
}
public void addVertex(
JobVertexID jobVertexId,
int currentParallelism,
int targetParallelism,
ResourceSpec currentResource,
ResourceSpec targetResource) {
this.currentParallelism.put(jobVertexId, currentParallelism);
this.targetParallelism.put(jobVertexId, targetParallelism);
this.currentResource.put(jobVertexId, currentResource);
this.targetResource.put(jobVertexId, targetResource);
}
public Set getAffectedVertex() {
return currentParallelism.keySet();
}
public int getTargetParallelism(JobVertexID vertexID) {
return targetParallelism.get(vertexID);
}
public ResourceSpec getTargetResource(JobVertexID vertexID) {
return targetResource.get(vertexID);
}
public boolean isEmpty() {
return currentParallelism.isEmpty();
}
public void excludeMinorDiffVertices(Configuration conf) {
excludeMinorDiffVertices(false, currentParallelism, currentResource, conf);
}
public void excludeMinorDiffVertices(boolean ignoreMinorScaleUp, Configuration conf) {
excludeMinorDiffVertices(true, currentParallelism, currentResource, conf);
}
public void excludeMinorDiffVertices(
boolean ignoreMinorScaleUp,
Map refParallelism,
Map refResource,
Configuration conf) {
double minDiffParallelismRatio = conf.getDouble(HealthMonitorOptions.PARALLELISM_SCALE_MIN_DIFF_RATIO);
double minDiffResourceRatio = conf.getDouble(HealthMonitorOptions.RESOURCE_SCALE_MIN_DIFF_RATIO);
double minDiffCpuCore = conf.getDouble(HealthMonitorOptions.RESOURCE_SCALE_MIN_DIFF_CPU);
int minDiffNativeMemMB = conf.getInteger(HealthMonitorOptions.RESOURCE_SCALE_MIN_DIFF_NATIVE_MEM);
HashSet vertexToRemove = new HashSet<>();
for (JobVertexID vertexID : targetParallelism.keySet()) {
Integer curPara = refParallelism.get(vertexID);
int tarPara = targetParallelism.get(vertexID);
ResourceSpec curRes = refResource.get(vertexID);
ResourceSpec tarRes = targetResource.get(vertexID);
if (curPara == null || curRes == null) {
continue;
}
if (ignoreMinorScaleUp) {
if (Math.abs(tarPara - curPara) > curPara * minDiffParallelismRatio) {
continue;
}
} else {
if (tarPara > curPara || curPara - tarPara > minDiffParallelismRatio * curPara) {
continue;
}
}
if (Math.abs(curRes.getCpuCores() - tarRes.getCpuCores()) > minDiffResourceRatio * curRes.getCpuCores() &&
Math.abs(curRes.getCpuCores() - tarRes.getCpuCores()) > minDiffCpuCore) {
continue;
}
if (Math.abs(curRes.getHeapMemory() - tarRes.getHeapMemory()) > minDiffResourceRatio * curRes.getHeapMemory()) {
continue;
}
if (Math.abs(curRes.getDirectMemory() - tarRes.getDirectMemory()) > minDiffResourceRatio * curRes.getDirectMemory()) {
continue;
}
if (Math.abs(curRes.getNativeMemory() - tarRes.getNativeMemory()) > minDiffResourceRatio * curRes.getNativeMemory() &&
Math.abs(curRes.getNativeMemory() - tarRes.getNativeMemory()) > minDiffNativeMemMB) {
continue;
}
vertexToRemove.add(vertexID);
}
for (JobVertexID vertexID : vertexToRemove) {
LOGGER.debug("Removing vertex with minor difference, vertex id: {}", vertexID);
currentParallelism.remove(vertexID);
targetParallelism.remove(vertexID);
currentResource.remove(vertexID);
targetResource.remove(vertexID);
}
}
@Override
public void execute(RestServerClient restServerClient) throws Exception {
Map> vertexParallelismResource = new HashMap<>();
for (JobVertexID jvId : currentParallelism.keySet()) {
vertexParallelismResource.put(jvId, new Tuple2<>(targetParallelism.get(jvId), targetResource.get(jvId)));
}
boolean triggerCheckpoint = true;
if (actionMode == ActionMode.IMMEDIATE) {
triggerCheckpoint = false;
}
if (!vertexParallelismResource.isEmpty()) {
restServerClient.rescale(jobID, vertexParallelismResource, triggerCheckpoint).get();
}
}
@Override
public boolean validate(MetricProvider provider, RestServerClient restServerClient) throws Exception {
long start = System.currentTimeMillis();
if (timeoutMs < 0) {
// return directly when there is no timeout check.
timeoutMs = Long.MAX_VALUE;
}
while (true) {
Thread.sleep(Math.min(timeoutMs / 10, maxSleepTime));
if (System.currentTimeMillis() - start > timeoutMs) {
return false;
}
RestServerClient.JobStatus jobStatus = restServerClient.getJobStatus(jobID);
int i = 0;
for (Tuple2 time2state: jobStatus.getTaskStatus().values()) {
if (!time2state.f1.equals(ExecutionState.RUNNING)) {
break;
}
i++;
}
// all task running now.
if (i == jobStatus.getTaskStatus().size()) {
break;
}
}
return true;
}
@Override
public Action rollback() {
return new AdjustJobConfig(
jobID, timeoutMs, targetParallelism, currentParallelism, targetResource, currentResource);
}
public void setActionMode(ActionMode actionMode) {
this.actionMode = actionMode;
}
@Override
public ActionMode getActionMode() {
return actionMode;
}
@Override
public String toString() {
String adjustments = currentParallelism.keySet().stream().map(vertexId -> "{JobVertexID:" + vertexId + ", "
+ "parallelism: " + currentParallelism.get(vertexId) + " -> " + targetParallelism.get(vertexId) + ", "
+ "resource: " + currentResource.get(vertexId) + " -> " + targetResource.get(vertexId) + "}").collect(
Collectors.joining(", "));
return "AdjustJobConfig{actionMode: " + actionMode + ", adjustments: " + adjustments + "}";
}
public RestServerClient.JobConfig getAppliedJobConfig(RestServerClient.JobConfig originJobConfig) {
RestServerClient.JobConfig appliedJobConfig = new RestServerClient.JobConfig(originJobConfig);
for (JobVertexID vertexId : targetResource.keySet()) {
RestServerClient.VertexConfig originVertexConfig = originJobConfig.getVertexConfigs().get(vertexId);
RestServerClient.VertexConfig appliedVertexConfig = new RestServerClient.VertexConfig(
originVertexConfig.getName(),
targetParallelism.get(vertexId),
originVertexConfig.getMaxParallelism(),
targetResource.get(vertexId),
originVertexConfig.getOperatorIds(),
originVertexConfig.getColocationGroupId());
appliedJobConfig.getVertexConfigs().put(vertexId, appliedVertexConfig);
}
return appliedJobConfig;
}
public boolean isScaleDown() {
return isScaleDown(currentParallelism, currentResource);
}
private boolean isScaleDown(Map refParallelism, Map refResource) {
for (JobVertexID vertexId : targetResource.keySet()) {
if (targetParallelism.get(vertexId) > refParallelism.get(vertexId)) {
return false;
}
if (!targetResource.get(vertexId).lessThanOrEqual(refResource.get(vertexId))) {
return false;
}
}
return true;
}
public boolean isMinorScaleDown(RestServerClient.JobConfig current, Configuration config) {
Map parallelism = new HashMap<>();
Map resourceSpec = new HashMap<>();
current.getVertexConfigs().entrySet().stream().forEach(entry -> {
parallelism.put(entry.getKey(), entry.getValue().getParallelism());
resourceSpec.put(entry.getKey(), entry.getValue().getResourceSpec());
});
if (isScaleDown(parallelism, resourceSpec)) {
double maxCpuLimit = MaxResourceLimitUtil.getMaxCpu(config);
int maxMemoryLimit = MaxResourceLimitUtil.getMaxMem(config);
if (maxCpuLimit != Double.MAX_VALUE || maxMemoryLimit != Integer.MAX_VALUE) {
double minorRatio = config.getDouble(HealthMonitorOptions.MINOR_RATIO);
double curTotalCpu = current.getJobTotalCpuCores();
int curTotalMem = current.getJobTotalMemoryMb();
RestServerClient.JobConfig targetJobConfig = getAppliedJobConfig(current);
double targetTotalCpu = targetJobConfig.getJobTotalCpuCores();
int targetTotalMem = targetJobConfig.getJobTotalMemoryMb();
if (curTotalCpu - targetTotalCpu < minorRatio * maxCpuLimit && curTotalMem - targetTotalMem < minorRatio * maxMemoryLimit) {
return true;
}
}
}
return false;
}
public void clear() {
currentResource.clear();
currentParallelism.clear();
targetResource.clear();
targetParallelism.clear();
}
}