All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.flink.runtime.healthmanager.plugins.actions.AdjustJobConfig Maven / Gradle / Ivy

There is a newer version: 1.5.1
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.runtime.healthmanager.plugins.actions;

import org.apache.flink.api.common.JobID;
import org.apache.flink.api.common.operators.ResourceSpec;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.runtime.execution.ExecutionState;
import org.apache.flink.runtime.healthmanager.RestServerClient;
import org.apache.flink.runtime.healthmanager.metrics.MetricProvider;
import org.apache.flink.runtime.healthmanager.plugins.Action;
import org.apache.flink.runtime.healthmanager.plugins.utils.HealthMonitorOptions;
import org.apache.flink.runtime.healthmanager.plugins.utils.MaxResourceLimitUtil;
import org.apache.flink.runtime.jobgraph.JobVertexID;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

/**
 * Adjust resource and parallelism config for given vertex.
 */
/**
 * Action that adjusts resource and parallelism configuration for a set of job vertices.
 *
 * <p>For every affected vertex the action records the current and the desired (target)
 * parallelism and {@link ResourceSpec}. {@link #execute} issues a single rescale request via the
 * {@link RestServerClient}; {@link #validate} then polls the job status until every task is
 * {@code RUNNING} again, or the configured timeout elapses.
 */
public class AdjustJobConfig implements Action {

	private static final Logger LOGGER = LoggerFactory.getLogger(AdjustJobConfig.class);

	/** Upper bound on the polling interval used by {@link #validate}, in milliseconds. */
	protected static long maxSleepTime = 60_000L;

	protected JobID jobID;
	/** Parallelism per vertex before the adjustment. */
	protected Map<JobVertexID, Integer> currentParallelism;
	/** Desired parallelism per vertex. */
	protected Map<JobVertexID, Integer> targetParallelism;
	/** Resource spec per vertex before the adjustment. */
	protected Map<JobVertexID, ResourceSpec> currentResource;
	/** Desired resource spec per vertex. */
	protected Map<JobVertexID, ResourceSpec> targetResource;
	/** Maximum time {@link #validate} waits for all tasks to be running; negative disables the deadline. */
	protected long timeoutMs;
	protected ActionMode actionMode;

	public AdjustJobConfig(JobID jobID, long timeoutMs) {
		this(jobID, timeoutMs, new HashMap<>(), new HashMap<>(), new HashMap<>(), new HashMap<>(), ActionMode.IMMEDIATE);
	}

	public AdjustJobConfig(
		JobID jobID,
		long timeoutMs,
		Map<JobVertexID, Integer> currentParallelism,
		Map<JobVertexID, Integer> targetParallelism,
		Map<JobVertexID, ResourceSpec> currentResource,
		Map<JobVertexID, ResourceSpec> targetResource) {
		this(jobID, timeoutMs, currentParallelism, targetParallelism, currentResource, targetResource, ActionMode.IMMEDIATE);
	}

	/**
	 * Copy constructor. Copies the per-vertex entries of {@code other} into fresh maps so that
	 * later mutations of either instance do not affect the other.
	 */
	public AdjustJobConfig(AdjustJobConfig other) {
		this(other.jobID, other.timeoutMs);
		for (JobVertexID vertexID : other.currentParallelism.keySet()) {
			addVertex(
					vertexID,
					other.currentParallelism.get(vertexID),
					other.targetParallelism.get(vertexID),
					other.currentResource.get(vertexID),
					other.targetResource.get(vertexID));
		}
		this.actionMode = other.actionMode;
	}

	public AdjustJobConfig(
		JobID jobID,
		long timeoutMs,
		Map<JobVertexID, Integer> currentParallelism,
		Map<JobVertexID, Integer> targetParallelism,
		Map<JobVertexID, ResourceSpec> currentResource,
		Map<JobVertexID, ResourceSpec> targetResource,
		ActionMode actionMode) {
		this.jobID = jobID;
		this.timeoutMs = timeoutMs;
		this.currentParallelism = currentParallelism;
		this.currentResource = currentResource;
		this.targetParallelism = targetParallelism;
		this.targetResource = targetResource;
		this.actionMode = actionMode;
	}

	/**
	 * Registers an adjustment for the given vertex: its current and target parallelism and
	 * resource spec. An existing entry for the same vertex is overwritten.
	 */
	public void addVertex(
		JobVertexID jobVertexId,
		int currentParallelism,
		int targetParallelism,
		ResourceSpec currentResource,
		ResourceSpec targetResource) {
		this.currentParallelism.put(jobVertexId, currentParallelism);
		this.targetParallelism.put(jobVertexId, targetParallelism);
		this.currentResource.put(jobVertexId, currentResource);
		this.targetResource.put(jobVertexId, targetResource);
	}

	/** Returns the set of vertices this action will adjust (live view, not a copy). */
	public Set<JobVertexID> getAffectedVertex() {
		return currentParallelism.keySet();
	}

	public int getTargetParallelism(JobVertexID vertexID) {
		return targetParallelism.get(vertexID);
	}

	public ResourceSpec getTargetResource(JobVertexID vertexID) {
		return targetResource.get(vertexID);
	}

	/** Returns {@code true} when no vertex adjustment has been registered. */
	public boolean isEmpty() {
		return currentParallelism.isEmpty();
	}

	/** Removes minor-diff vertices, also dropping minor scale-ups is disabled. */
	public void excludeMinorDiffVertices(Configuration conf) {
		excludeMinorDiffVertices(false, currentParallelism, currentResource, conf);
	}

	public void excludeMinorDiffVertices(boolean ignoreMinorScaleUp, Configuration conf) {
		// BUG FIX: previously passed a hard-coded "true" here, silently ignoring the
		// caller's ignoreMinorScaleUp argument.
		excludeMinorDiffVertices(ignoreMinorScaleUp, currentParallelism, currentResource, conf);
	}

	/**
	 * Drops vertices whose target differs only marginally from the given reference values, so
	 * that insignificant rescales are not triggered at all.
	 *
	 * <p>A vertex is removed only when BOTH its parallelism diff and every resource diff
	 * (cpu, heap, direct and native memory) fall below the configured thresholds. Vertices
	 * missing from the reference maps are always kept.
	 *
	 * @param ignoreMinorScaleUp when {@code true}, minor scale-ups are also dropped; otherwise
	 *                           any scale-up is kept and only minor scale-downs are dropped
	 * @param refParallelism     reference parallelism to compare the targets against
	 * @param refResource        reference resource specs to compare the targets against
	 * @param conf               configuration holding the minimum-diff thresholds
	 */
	public void excludeMinorDiffVertices(
			boolean ignoreMinorScaleUp,
			Map<JobVertexID, Integer> refParallelism,
			Map<JobVertexID, ResourceSpec> refResource,
			Configuration conf) {
		double minDiffParallelismRatio = conf.getDouble(HealthMonitorOptions.PARALLELISM_SCALE_MIN_DIFF_RATIO);
		double minDiffResourceRatio = conf.getDouble(HealthMonitorOptions.RESOURCE_SCALE_MIN_DIFF_RATIO);
		double minDiffCpuCore = conf.getDouble(HealthMonitorOptions.RESOURCE_SCALE_MIN_DIFF_CPU);
		int minDiffNativeMemMB = conf.getInteger(HealthMonitorOptions.RESOURCE_SCALE_MIN_DIFF_NATIVE_MEM);

		Set<JobVertexID> vertexToRemove = new HashSet<>();
		for (JobVertexID vertexID : targetParallelism.keySet()) {
			Integer curPara = refParallelism.get(vertexID);
			int tarPara = targetParallelism.get(vertexID);
			ResourceSpec curRes = refResource.get(vertexID);
			ResourceSpec tarRes = targetResource.get(vertexID);

			// No reference to compare against: keep the vertex.
			if (curPara == null || curRes == null) {
				continue;
			}

			if (ignoreMinorScaleUp) {
				// Keep only changes (in either direction) above the ratio threshold.
				if (Math.abs(tarPara - curPara) > curPara * minDiffParallelismRatio) {
					continue;
				}
			} else {
				// Keep every scale-up, and scale-downs above the ratio threshold.
				if (tarPara > curPara || curPara - tarPara > minDiffParallelismRatio * curPara) {
					continue;
				}
			}

			// A cpu diff is significant only when it exceeds BOTH the relative and absolute threshold.
			if (Math.abs(curRes.getCpuCores() - tarRes.getCpuCores()) > minDiffResourceRatio * curRes.getCpuCores() &&
				Math.abs(curRes.getCpuCores() - tarRes.getCpuCores()) > minDiffCpuCore) {
				continue;
			}

			if (Math.abs(curRes.getHeapMemory() - tarRes.getHeapMemory()) > minDiffResourceRatio * curRes.getHeapMemory()) {
				continue;
			}

			if (Math.abs(curRes.getDirectMemory() - tarRes.getDirectMemory()) > minDiffResourceRatio * curRes.getDirectMemory()) {
				continue;
			}

			if (Math.abs(curRes.getNativeMemory() - tarRes.getNativeMemory()) > minDiffResourceRatio * curRes.getNativeMemory() &&
				Math.abs(curRes.getNativeMemory() - tarRes.getNativeMemory()) > minDiffNativeMemMB) {
				continue;
			}

			vertexToRemove.add(vertexID);
		}

		for (JobVertexID vertexID : vertexToRemove) {
			LOGGER.debug("Removing vertex with minor difference, vertex id: {}", vertexID);
			currentParallelism.remove(vertexID);
			targetParallelism.remove(vertexID);
			currentResource.remove(vertexID);
			targetResource.remove(vertexID);
		}
	}

	/**
	 * Sends a single rescale request for all registered vertices and blocks until the request
	 * completes. A no-op when no vertex is registered. IMMEDIATE actions skip the checkpoint
	 * before rescaling.
	 */
	@Override
	public void execute(RestServerClient restServerClient) throws Exception {
		Map<JobVertexID, Tuple2<Integer, ResourceSpec>> vertexParallelismResource = new HashMap<>();
		for (JobVertexID jvId : currentParallelism.keySet()) {
			vertexParallelismResource.put(jvId, new Tuple2<>(targetParallelism.get(jvId), targetResource.get(jvId)));
		}
		boolean triggerCheckpoint = true;
		if (actionMode == ActionMode.IMMEDIATE) {
			triggerCheckpoint = false;
		}
		if (!vertexParallelismResource.isEmpty()) {
			restServerClient.rescale(jobID, vertexParallelismResource, triggerCheckpoint).get();
		}
	}

	/**
	 * Polls the job status until all tasks are RUNNING.
	 *
	 * @return {@code true} once every task is RUNNING, {@code false} when {@code timeoutMs}
	 *         elapses first. Never times out when {@code timeoutMs} is negative.
	 */
	@Override
	public boolean validate(MetricProvider provider, RestServerClient restServerClient) throws Exception {
		long start = System.currentTimeMillis();
		if (timeoutMs < 0) {
			// Negative timeout means no deadline: poll (and sleep up to maxSleepTime) forever.
			timeoutMs = Long.MAX_VALUE;
		}
		while (true) {
			Thread.sleep(Math.min(timeoutMs / 10, maxSleepTime));
			if (System.currentTimeMillis() - start > timeoutMs) {
				return false;
			}

			RestServerClient.JobStatus jobStatus = restServerClient.getJobStatus(jobID);
			int i = 0;
			for (Tuple2<Long, ExecutionState> time2state: jobStatus.getTaskStatus().values()) {
				if (!time2state.f1.equals(ExecutionState.RUNNING)) {
					break;
				}
				i++;
			}

			// All tasks running now.
			if (i == jobStatus.getTaskStatus().size()) {
				break;
			}

		}
		return true;
	}

	/** Returns the inverse action: current and target values swapped. */
	@Override
	public Action rollback() {
		return new AdjustJobConfig(
				jobID, timeoutMs, targetParallelism, currentParallelism, targetResource, currentResource);
	}

	public void setActionMode(ActionMode actionMode) {
		this.actionMode = actionMode;
	}

	@Override
	public ActionMode getActionMode() {
		return actionMode;
	}

	@Override
	public String toString() {
		String adjustments = currentParallelism.keySet().stream().map(vertexId -> "{JobVertexID:" + vertexId + ", "
			+ "parallelism: " + currentParallelism.get(vertexId) + " -> " + targetParallelism.get(vertexId) + ", "
			+ "resource: " + currentResource.get(vertexId) + " -> " + targetResource.get(vertexId) + "}").collect(
			Collectors.joining(", "));
		return "AdjustJobConfig{actionMode: " + actionMode + ", adjustments: " + adjustments + "}";
	}

	/**
	 * Returns a copy of {@code originJobConfig} with this action's target parallelism and
	 * resources applied to every affected vertex. The input config is not modified.
	 */
	public RestServerClient.JobConfig getAppliedJobConfig(RestServerClient.JobConfig originJobConfig) {
		RestServerClient.JobConfig appliedJobConfig = new RestServerClient.JobConfig(originJobConfig);
		for (JobVertexID vertexId : targetResource.keySet()) {
			RestServerClient.VertexConfig originVertexConfig = originJobConfig.getVertexConfigs().get(vertexId);
			RestServerClient.VertexConfig appliedVertexConfig = new RestServerClient.VertexConfig(
				originVertexConfig.getName(),
				targetParallelism.get(vertexId),
				originVertexConfig.getMaxParallelism(),
				targetResource.get(vertexId),
				originVertexConfig.getOperatorIds(),
				originVertexConfig.getColocationGroupId());
			appliedJobConfig.getVertexConfigs().put(vertexId, appliedVertexConfig);
		}
		return appliedJobConfig;
	}

	/** Returns {@code true} when no affected vertex scales up in parallelism or resources. */
	public boolean isScaleDown() {
		return isScaleDown(currentParallelism, currentResource);
	}

	// True iff, for every affected vertex, the target parallelism and resources are
	// less than or equal to the given reference values.
	private boolean isScaleDown(Map<JobVertexID, Integer> refParallelism, Map<JobVertexID, ResourceSpec> refResource) {
		for (JobVertexID vertexId : targetResource.keySet()) {
			if (targetParallelism.get(vertexId) > refParallelism.get(vertexId)) {
				return false;
			}

			if (!targetResource.get(vertexId).lessThanOrEqual(refResource.get(vertexId))) {
				return false;
			}
		}
		return true;
	}

	/**
	 * Returns {@code true} when this action is a pure scale-down whose total cpu and memory
	 * savings are both below {@code MINOR_RATIO} of the configured resource limits. Always
	 * {@code false} when no resource limits are configured.
	 */
	public boolean isMinorScaleDown(RestServerClient.JobConfig current, Configuration config) {
		Map<JobVertexID, Integer> parallelism = new HashMap<>();
		Map<JobVertexID, ResourceSpec> resourceSpec = new HashMap<>();
		current.getVertexConfigs().entrySet().stream().forEach(entry -> {
			parallelism.put(entry.getKey(), entry.getValue().getParallelism());
			resourceSpec.put(entry.getKey(), entry.getValue().getResourceSpec());
		});
		if (isScaleDown(parallelism, resourceSpec)) {
			double maxCpuLimit = MaxResourceLimitUtil.getMaxCpu(config);
			int maxMemoryLimit = MaxResourceLimitUtil.getMaxMem(config);
			if (maxCpuLimit != Double.MAX_VALUE || maxMemoryLimit != Integer.MAX_VALUE) {

				double minorRatio = config.getDouble(HealthMonitorOptions.MINOR_RATIO);
				double curTotalCpu = current.getJobTotalCpuCores();
				int curTotalMem = current.getJobTotalMemoryMb();
				RestServerClient.JobConfig targetJobConfig = getAppliedJobConfig(current);
				double targetTotalCpu = targetJobConfig.getJobTotalCpuCores();
				int targetTotalMem = targetJobConfig.getJobTotalMemoryMb();
				if (curTotalCpu - targetTotalCpu < minorRatio * maxCpuLimit && curTotalMem - targetTotalMem < minorRatio * maxMemoryLimit) {
					return true;
				}
			}
		}
		return false;
	}

	/** Removes all registered vertex adjustments, leaving the action empty. */
	public void clear() {
		currentResource.clear();
		currentParallelism.clear();
		targetResource.clear();
		targetParallelism.clear();
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy