/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.runtime.healthmanager.plugins.resolvers;
import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.api.common.JobID;
import org.apache.flink.api.common.operators.ResourceSpec;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.runtime.healthmanager.HealthMonitor;
import org.apache.flink.runtime.healthmanager.RestServerClient;
import org.apache.flink.runtime.healthmanager.metrics.MetricAggType;
import org.apache.flink.runtime.healthmanager.metrics.MetricProvider;
import org.apache.flink.runtime.healthmanager.metrics.TaskMetricSubscription;
import org.apache.flink.runtime.healthmanager.metrics.timeline.TimelineAggType;
import org.apache.flink.runtime.healthmanager.plugins.Action;
import org.apache.flink.runtime.healthmanager.plugins.Resolver;
import org.apache.flink.runtime.healthmanager.plugins.Symptom;
import org.apache.flink.runtime.healthmanager.plugins.actions.RescaleJobParallelism;
import org.apache.flink.runtime.healthmanager.plugins.detectors.LargeTimerCountDetector;
import org.apache.flink.runtime.healthmanager.plugins.symptoms.JobStable;
import org.apache.flink.runtime.healthmanager.plugins.symptoms.JobStuck;
import org.apache.flink.runtime.healthmanager.plugins.symptoms.JobVertexBackPressure;
import org.apache.flink.runtime.healthmanager.plugins.symptoms.JobVertexDelayIncreasing;
import org.apache.flink.runtime.healthmanager.plugins.symptoms.JobVertexFailover;
import org.apache.flink.runtime.healthmanager.plugins.symptoms.JobVertexFrequentFullGC;
import org.apache.flink.runtime.healthmanager.plugins.symptoms.JobVertexHighDelay;
import org.apache.flink.runtime.healthmanager.plugins.symptoms.JobVertexLargeTimerCount;
import org.apache.flink.runtime.healthmanager.plugins.symptoms.JobVertexLongTimeFullGC;
import org.apache.flink.runtime.healthmanager.plugins.symptoms.JobVertexOverParallelized;
import org.apache.flink.runtime.healthmanager.plugins.utils.HealthMonitorOptions;
import org.apache.flink.runtime.healthmanager.plugins.utils.JobTopologyAnalyzer;
import org.apache.flink.runtime.healthmanager.plugins.utils.MaxResourceLimitUtil;
import org.apache.flink.runtime.healthmanager.plugins.utils.TaskMetrics;
import org.apache.flink.runtime.healthmanager.plugins.utils.TaskMetricsSubscriber;
import org.apache.flink.runtime.jobgraph.JobVertexID;
import org.apache.flink.runtime.rest.messages.checkpoints.CheckpointStatistics;
import org.apache.flink.runtime.rest.messages.checkpoints.TaskCheckpointStatistics;
import org.apache.flink.util.AbstractID;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.math.BigInteger;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import static org.apache.flink.runtime.healthmanager.plugins.utils.HealthMonitorOptions.MAX_PARTITION_PER_TASK;
import static org.apache.flink.runtime.healthmanager.plugins.utils.HealthMonitorOptions.TIMER_SCALE_RATIO;
import static org.apache.flink.runtime.healthmanager.plugins.utils.MetricNames.SOURCE_PARTITION_COUNT;
import static org.apache.flink.runtime.healthmanager.plugins.utils.MetricNames.TASK_TIMER_COUNT;
/**
 * Parallelism scaler resolves job parallelism scaling.
 *
 * <pre>
 * workload = calculation_time_per_record * input_tps
 *          = (latency_per_record - wait_output_per_record * output_tps / input_tps) * input_tps
 * target_parallelism_v1 : target_parallelism_v2 = workload_v1 : workload_v2
 * target_parallelism = workload * tps_scale_ratio
 * </pre>
 */
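// Worked example of the formula above (illustrative, assumed numbers): with
// latency_per_record = 2 ms, wait_output_per_record = 0.5 ms and
// input_tps = output_tps = 1000 records/s,
// workload = (0.002 - 0.0005 * 1000 / 1000) * 1000 = 1.5 busy subtasks;
// with tps_scale_ratio = 1.2 the raw target parallelism is 1.5 * 1.2 = 1.8.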
public class ParallelismScaler implements Resolver {
private static final Logger LOGGER = LoggerFactory.getLogger(ParallelismScaler.class);
private JobID jobID;
private HealthMonitor monitor;
private MetricProvider metricProvider;
private double scaleTpsRatio;
private long timeout;
private long checkInterval;
private int maxPartitionPerTask;
private long stableTime;
private long stateSizeThreshold;
private boolean checkRescale;
private boolean checkAllInScaleDown;
private double multiOutputRatio;
private double timerCountThreshold;
private double maxCpuLimit;
private int maxMemoryLimit;
private long checkpointIntervalThreshold;
private double minDiffParallelismRatio;
// symptoms
private JobVertexHighDelay highDelaySymptom;
private JobVertexDelayIncreasing delayIncreasingSymptom;
private JobVertexBackPressure backPressureSymptom;
private JobVertexOverParallelized overParallelizedSymptom;
private JobStable jobStableSymptom;
private JobVertexFrequentFullGC frequentFullGCSymptom;
private JobVertexLongTimeFullGC longTimeFullGCSymptom;
private JobVertexLargeTimerCount largeTimerCountSymptom;
private JobVertexFailover failoverSymptom;
private JobStuck jobStuckSymptom;
// diagnose
private boolean needScaleUpForDelay;
private boolean needScaleUpForBackpressure;
private boolean needScaleUpForMassiveTimer;
private boolean needScaleDown;
// topology
private TaskMetricsSubscriber taskMetricsSubscriber;
private JobTopologyAnalyzer jobTopologyAnalyzer;
private Map<JobVertexID, TaskMetricSubscription> sourcePartitionCountSubs;
private Map<JobVertexID, TaskMetricSubscription> timerCountSubs;
@Override
public void open(HealthMonitor monitor) {
this.monitor = monitor;
this.jobID = monitor.getJobID();
this.metricProvider = monitor.getMetricProvider();
this.scaleTpsRatio = monitor.getConfig().getDouble(HealthMonitorOptions.PARALLELISM_MIN_RATIO);
this.timeout = monitor.getConfig().getLong(HealthMonitorOptions.PARALLELISM_SCALE_TIME_OUT);
this.checkInterval = monitor.getConfig().getLong(HealthMonitorOptions.PARALLELISM_SCALE_INTERVAL);
this.maxPartitionPerTask = monitor.getConfig().getInteger(MAX_PARTITION_PER_TASK);
this.stableTime = monitor.getConfig().getLong(HealthMonitorOptions.PARALLELISM_SCALE_STABLE_TIME);
this.stateSizeThreshold = monitor.getConfig().getLong(HealthMonitorOptions.PARALLELISM_SCALE_STATE_SIZE_THRESHOLD);
this.checkpointIntervalThreshold = monitor.getConfig().getLong(HealthMonitorOptions.PARALLELISM_SCALE_CHECKPOINT_THRESHOLD);
this.checkRescale = monitor.getConfig().getBoolean(HealthMonitorOptions.PARALLELSIM_SCALE_CHECK_RESCALE);
this.checkAllInScaleDown = monitor.getConfig().getBoolean(HealthMonitorOptions.PARALLELISM_SCALE_CHECK_ALL_IN_SCALE_DOWN);
this.multiOutputRatio = monitor.getConfig().getDouble(HealthMonitorOptions.PARALLELISM_SCALE_MULTI_OUTPUT_RATIO);
this.timerCountThreshold = monitor.getConfig().getLong(LargeTimerCountDetector.LARGE_TIMER_COUNT_THRESHOLD) /
monitor.getConfig().getDouble(TIMER_SCALE_RATIO);
this.maxCpuLimit = MaxResourceLimitUtil.getMaxCpu(monitor.getConfig());
this.maxMemoryLimit = MaxResourceLimitUtil.getMaxMem(monitor.getConfig());
this.minDiffParallelismRatio = monitor.getConfig().getDouble(HealthMonitorOptions.PARALLELISM_SCALE_MIN_DIFF_RATIO);
this.taskMetricsSubscriber = monitor.subscribeTaskMetrics(checkInterval);
RestServerClient.JobConfig jobConfig = monitor.getJobConfig();
// analyze job graph.
this.jobTopologyAnalyzer = monitor.getJobTopologyAnalyzer();
// subscribe metrics.
this.sourcePartitionCountSubs = new HashMap<>();
this.timerCountSubs = new HashMap<>();
for (JobVertexID vertexId : jobConfig.getVertexConfigs().keySet()) {
// timer count
TaskMetricSubscription timerCount = metricProvider.subscribeTaskMetric(
jobID, vertexId, TASK_TIMER_COUNT, MetricAggType.SUM, 1, TimelineAggType.LATEST);
timerCountSubs.put(vertexId, timerCount);
// source partition count
if (jobTopologyAnalyzer.isSource(vertexId)) {
sourcePartitionCountSubs.put(vertexId,
metricProvider.subscribeTaskMetric(
jobID, vertexId, SOURCE_PARTITION_COUNT, MetricAggType.SUM, checkInterval, TimelineAggType.LATEST));
}
}
}
@Override
public void close() {
if (metricProvider == null) {
return;
}
if (sourcePartitionCountSubs != null) {
for (TaskMetricSubscription sub : sourcePartitionCountSubs.values()) {
metricProvider.unsubscribe(sub);
}
}
if (timerCountSubs != null) {
for (TaskMetricSubscription sub: timerCountSubs.values()) {
metricProvider.unsubscribe(sub);
}
}
}
@VisibleForTesting
public void setMonitor(HealthMonitor monitor) {
this.monitor = monitor;
}
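/**
 * Resolution pipeline (a summary of the steps implemented below): parse symptoms,
 * diagnose whether rescaling is warranted, gather task metrics and the latest
 * checkpoint statistics, compute per-vertex min and target parallelisms, apply
 * equal- and proportional-parallelism constraints, and finally emit a
 * RescaleJobParallelism action if any vertex actually changes.
 */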
@Override
public Action resolve(List<Symptom> symptomList) {
LOGGER.debug("Start resolving.");
// Step-1. Diagnose
parseSymptoms(symptomList);
if (!diagnose()) {
return null;
}
// Step-2. Prepare Metrics
Map<JobVertexID, TaskMetrics> taskMetrics = taskMetricsSubscriber.getTaskMetrics();
if (taskMetrics == null && !needScaleUpForMassiveTimer) {
LOGGER.debug("Cannot rescale: metrics are not complete yet.");
return null;
} else {
Map<JobVertexID, TaskCheckpointStatistics> taskCheckpointInfo = null;
CheckpointStatistics completedCheckpointStats = null;
long lastCheckpointTime = 0;
try {
completedCheckpointStats = monitor.getRestServerClient().getLatestCheckPointStates(monitor.getJobID());
if (completedCheckpointStats != null) {
taskCheckpointInfo = completedCheckpointStats.getCheckpointStatisticsPerTask();
lastCheckpointTime = completedCheckpointStats.getLatestAckTimestamp();
}
} catch (Exception e) {
// failed to get checkpoint info; proceed without checkpoint-based constraints.
LOGGER.debug("Failed to fetch latest checkpoint statistics.", e);
}
Map<JobVertexID, Integer> minParallelisms = getVertexMinParallelisms(monitor.getJobConfig(), taskCheckpointInfo, taskMetrics);
LOGGER.debug("Min parallelism for vertices: {}", minParallelisms);
Map<JobVertexID, Integer> targetParallelisms;
// calculate target parallelism for vertices.
if (taskMetrics != null) {
// Step-3. calculate scale-up ratios for sub-dags.
Map<JobVertexID, Double> subDagScaleUpRatio = getSubDagScaleUpRatio(taskMetrics);
// Step-4. get vertices to scale down.
Set<JobVertexID> vertexToDownScale = new HashSet<>();
if (System.currentTimeMillis() - lastCheckpointTime < checkpointIntervalThreshold || !subDagScaleUpRatio.isEmpty()) {
vertexToDownScale = getVertexToScaleDown(monitor.getJobConfig(), minParallelisms);
}
// Step-5. set parallelisms
targetParallelisms = getVertexTargetParallelisms(subDagScaleUpRatio, vertexToDownScale, taskMetrics);
} else {
// only massive-timer symptoms were detected; rescale those vertices only.
// Steps 3-5: calculate target parallelisms for the vertices suffering from massive timers.
Set<JobVertexID> massiveTimerVertices = new HashSet<>(largeTimerCountSymptom.getJobVertexIDs());
targetParallelisms = new HashMap<>();
RestServerClient.JobConfig originJobConfig = monitor.getJobConfig();
for (JobVertexID vertexID: massiveTimerVertices) {
Tuple2<Long, Double> timerCount;
if (!timerCountSubs.containsKey(vertexID) ||
(timerCount = timerCountSubs.get(vertexID).getValue()) == null) {
continue;
}
LOGGER.debug("Current timer count {} for vertex {}.", timerCount.f1, vertexID);
int targetParallelism = (int) (Math.ceil(timerCount.f1 /
monitor.getConfig().getLong(LargeTimerCountDetector.LARGE_TIMER_COUNT_THRESHOLD)
* monitor.getConfig().getDouble(TIMER_SCALE_RATIO)));
// validate target parallelism
targetParallelism = Math.max(targetParallelism, 1);
RestServerClient.VertexConfig originVertexConfig = originJobConfig.getVertexConfigs().get(vertexID);
if (targetParallelism <= originVertexConfig.getParallelism()) {
continue;
}
targetParallelisms.put(vertexID, targetParallelism);
}
}
LOGGER.debug("Target parallelism for vertices before applying constraints: {}.", targetParallelisms);
if (targetParallelisms.isEmpty()) {
return null;
}
updateTargetParallelismsSubjectToConstraints(targetParallelisms, minParallelisms, monitor.getJobConfig());
LOGGER.debug("Target parallelism for vertices after applying constraints: {}.", targetParallelisms);
// Step-6. generate parallelism rescale action
RescaleJobParallelism rescaleJobParallelism = generateRescaleParallelismAction(targetParallelisms, minParallelisms, monitor.getJobConfig());
if (rescaleJobParallelism != null && !rescaleJobParallelism.isEmpty()) {
LOGGER.info("RescaleJobParallelism action generated: {}.", rescaleJobParallelism);
return rescaleJobParallelism;
}
}
return null;
}
@VisibleForTesting
public Set<JobVertexID> getVertexToScaleDown(
RestServerClient.JobConfig jobConfig,
Map<JobVertexID, Integer> minParallelisms) {
Set<JobVertexID> vertexToDownScale = new HashSet<>();
// find sub dags to downscale
if (needScaleDown) {
Set<JobVertexID> verticesToDownScale = new HashSet<>(overParallelizedSymptom.getJobVertexIDs());
for (JobVertexID vertexId : verticesToDownScale) {
if (minParallelisms == null || minParallelisms.get(vertexId) == null ||
minParallelisms.get(vertexId) < jobConfig.getVertexConfigs().get(vertexId).getParallelism()) {
vertexToDownScale.add(vertexId);
}
}
if (checkAllInScaleDown && !vertexToDownScale.isEmpty()) {
vertexToDownScale.addAll(jobConfig.getVertexConfigs().keySet());
}
}
LOGGER.debug("Roots of sub-dags need to scale down: {}.", vertexToDownScale);
return vertexToDownScale;
}
@VisibleForTesting
public void parseSymptoms(List<Symptom> symptomList) {
// clear old symptoms
jobStableSymptom = null;
frequentFullGCSymptom = null;
longTimeFullGCSymptom = null;
largeTimerCountSymptom = null;
failoverSymptom = null;
jobStuckSymptom = null;
highDelaySymptom = null;
delayIncreasingSymptom = null;
backPressureSymptom = null;
overParallelizedSymptom = null;
// read new symptoms
for (Symptom symptom : symptomList) {
if (symptom instanceof JobStable) {
jobStableSymptom = (JobStable) symptom;
}
if (symptom instanceof JobVertexFrequentFullGC) {
frequentFullGCSymptom = (JobVertexFrequentFullGC) symptom;
LOGGER.debug("Frequent full gc detected for vertices {}.", frequentFullGCSymptom.getJobVertexIDs());
continue;
}
if (symptom instanceof JobVertexLongTimeFullGC) {
longTimeFullGCSymptom = (JobVertexLongTimeFullGC) symptom;
LOGGER.debug("Long time full gc detected for vertices {}.", longTimeFullGCSymptom.getJobVertexIDs());
continue;
}
if (symptom instanceof JobVertexLargeTimerCount) {
largeTimerCountSymptom = (JobVertexLargeTimerCount) symptom;
LOGGER.debug("Large timer count detected for vertices {}.", largeTimerCountSymptom.getJobVertexIDs());
continue;
}
if (symptom instanceof JobVertexFailover) {
failoverSymptom = (JobVertexFailover) symptom;
LOGGER.debug("Failover detected for vertices {}.", failoverSymptom.getJobVertexIDs());
continue;
}
if (symptom instanceof JobStuck) {
jobStuckSymptom = (JobStuck) symptom;
LOGGER.debug("Stuck detected for vertices {}.", jobStuckSymptom.getJobVertexIDs());
continue;
}
if (symptom instanceof JobVertexHighDelay) {
highDelaySymptom = (JobVertexHighDelay) symptom;
LOGGER.debug("High delay detected for vertices {}.", highDelaySymptom.getJobVertexIDs());
continue;
}
if (symptom instanceof JobVertexDelayIncreasing) {
delayIncreasingSymptom = (JobVertexDelayIncreasing) symptom;
LOGGER.debug("Delay increasing detected for vertices {}.", delayIncreasingSymptom.getJobVertexIDs());
continue;
}
if (symptom instanceof JobVertexBackPressure) {
backPressureSymptom = (JobVertexBackPressure) symptom;
LOGGER.debug("Back pressure detected for vertices {}.", backPressureSymptom.getJobVertexIDs());
continue;
}
if (symptom instanceof JobVertexOverParallelized) {
overParallelizedSymptom = (JobVertexOverParallelized) symptom;
LOGGER.debug("Over parallelized detected for vertices {}.", overParallelizedSymptom.getJobVertexIDs());
continue;
}
}
}
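/**
 * Decides whether rescaling should be attempted, summarizing the checks below:
 * massive-timer symptoms always trigger rescaling; otherwise the job must have
 * been stable long enough and free of severe full GC and recent failovers, and
 * at least one of high/increasing delay, back pressure or over-parallelization
 * must be present.
 */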
private boolean diagnose() {
needScaleUpForMassiveTimer = largeTimerCountSymptom != null;
if (needScaleUpForMassiveTimer) {
LOGGER.debug("Job exists vertices require large number of timer,"
+ " need rescale parallelism.");
return true;
}
if (jobStableSymptom == null ||
jobStableSymptom.getStableTime() < stableTime ||
(frequentFullGCSymptom != null && frequentFullGCSymptom.isSevere()) ||
(longTimeFullGCSymptom != null && longTimeFullGCSymptom.isSevere()) ||
failoverSymptom != null) {
LOGGER.debug("Job is not stable, should not rescale parallelism.");
return false;
}
needScaleUpForDelay = highDelaySymptom != null || delayIncreasingSymptom != null;
needScaleUpForBackpressure = backPressureSymptom != null;
needScaleDown = overParallelizedSymptom != null && backPressureSymptom == null;
if (!needScaleUpForDelay && !needScaleUpForBackpressure && !needScaleDown) {
LOGGER.debug("No need to rescale parallelism.");
return false;
}
return true;
}
@VisibleForTesting
public Map<JobVertexID, Double> getSubDagScaleUpRatio(
Map<JobVertexID, TaskMetrics> taskMetrics) {
Map<JobVertexID, Double> subDagTargetTpsRatio = new HashMap<>();
// find sub dags to upscale
Set<JobVertexID> subDagRootsToUpScale = new HashSet<>();
if (needScaleUpForBackpressure) {
Set<JobVertexID> backPressureVertices = new HashSet<>(backPressureSymptom.getJobVertexIDs());
for (JobVertexID vertexID : backPressureVertices) {
subDagRootsToUpScale.add(jobTopologyAnalyzer.getSubDagRoot(vertexID));
subDagTargetTpsRatio.put(jobTopologyAnalyzer.getSubDagRoot(vertexID), scaleTpsRatio);
}
}
if (needScaleUpForDelay) {
Set<JobVertexID> verticesToUpScale = new HashSet<>();
if (highDelaySymptom != null) {
verticesToUpScale.addAll(highDelaySymptom.getJobVertexIDs());
}
if (delayIncreasingSymptom != null) {
verticesToUpScale.addAll(delayIncreasingSymptom.getJobVertexIDs());
}
for (JobVertexID vertexId : verticesToUpScale) {
subDagRootsToUpScale.add(jobTopologyAnalyzer.getSubDagRoot(vertexId));
TaskMetrics metric = taskMetrics.get(vertexId);
// initial ratio, chosen so that we can catch up with the accumulated delay.
double ratio = 1 / (1 - metric.getDelayIncreasingRate()) * scaleTpsRatio;
// only when workload > 0
if (metric.isParallelSource() && metric.getWorkload() > 0) {
double maxTps = 1.0 / Math.max(
metric.getPartitionLatency(),
metric.getTaskLatencyPerRecord() - metric.getWaitOutputPerRecord()) * metric.getPartitionCount();
if (highDelaySymptom != null && highDelaySymptom.getSevereJobVertexIDs().contains(vertexId)) {
// use max tps when delay is severe.
ratio = maxTps / metric.getInputTps();
} else if (maxTps / metric.getInputTps() * scaleTpsRatio < ratio) {
// limit target tps to be max tps.
ratio = maxTps / metric.getInputTps() * scaleTpsRatio;
}
}
if (ratio > 1 + minDiffParallelismRatio) {
subDagTargetTpsRatio.put(jobTopologyAnalyzer.getSubDagRoot(vertexId), ratio);
}
}
}
LOGGER.debug("Roots of sub-dags need to scale up: {}.", subDagRootsToUpScale);
// for sub dags that need to rescale, set target scale ratio
LOGGER.debug("Target scale up tps ratio for sub-dags before adjusting: {}", subDagTargetTpsRatio);
// scale up downstream sub dags according to upstream sub dags
boolean hasDagScaleUp = true;
while (hasDagScaleUp) {
hasDagScaleUp = false;
for (JobVertexID root : jobTopologyAnalyzer.getAllSubDagRoots()) {
for (JobVertexID upStream : jobTopologyAnalyzer.getInputs(root)) {
JobVertexID upStreamSubDagRoot = jobTopologyAnalyzer.getSubDagRoot(upStream);
if (!subDagTargetTpsRatio.containsKey(upStreamSubDagRoot)) {
continue;
}
if (!subDagTargetTpsRatio.containsKey(root) ||
subDagTargetTpsRatio.get(root) < subDagTargetTpsRatio.get(upStreamSubDagRoot)) {
subDagTargetTpsRatio.put(root, subDagTargetTpsRatio.get(upStreamSubDagRoot));
hasDagScaleUp = true;
}
}
}
}
if (needScaleUpForMassiveTimer) {
for (JobVertexID vertexID : largeTimerCountSymptom.getJobVertexIDs()) {
double targetParallelism = taskMetrics.get(vertexID).getTimerCount() / timerCountThreshold;
double ratio = targetParallelism / taskMetrics.get(vertexID).getWorkload();
JobVertexID rootID = jobTopologyAnalyzer.getSubDagRoot(vertexID);
if (ratio > subDagTargetTpsRatio.getOrDefault(rootID, 1.0)) {
subDagTargetTpsRatio.put(rootID, ratio);
}
}
}
LOGGER.debug("Target scale up tps ratio for sub-dags after adjusting: {}.", subDagTargetTpsRatio);
return subDagTargetTpsRatio;
}
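/**
 * A sketch of the ratio-to-parallelism mapping implemented below: vertices of a
 * scale-up sub-dag get floor(workload * subDagRatio); scale-down vertices get
 * ceil(workload * scaleTpsRatio); vertices fed by an upstream with multiple
 * outputs are additionally discounted by multiOutputRatio.
 */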
@VisibleForTesting
public Map<JobVertexID, Integer> getVertexTargetParallelisms(
Map<JobVertexID, Double> subDagTargetTpsRatio,
Set<JobVertexID> vertexToDownScale,
Map<JobVertexID, TaskMetrics> taskMetrics) {
Map<JobVertexID, Integer> targetParallelisms = new HashMap<>();
for (JobVertexID subDagRoot : subDagTargetTpsRatio.keySet()) {
double ratio = subDagTargetTpsRatio.get(subDagRoot);
for (JobVertexID vertexId : jobTopologyAnalyzer.getSubDagVertices(subDagRoot)) {
TaskMetrics metric = taskMetrics.get(vertexId);
if (metric.getWorkload() > 0) {
targetParallelisms.put(vertexId, (int) Math.floor(metric.getWorkload() * ratio));
}
}
}
for (JobVertexID vertexID : vertexToDownScale) {
if (!targetParallelisms.containsKey(vertexID) && taskMetrics.get(vertexID).getWorkload() > 0) {
targetParallelisms.put(vertexID, (int) Math.ceil(taskMetrics.get(vertexID).getWorkload() * scaleTpsRatio));
}
}
Set<JobVertexID> adjustedVertex = new HashSet<>();
for (JobVertexID vertexID : targetParallelisms.keySet()) {
for (JobVertexID upStream : jobTopologyAnalyzer.getInputs(vertexID)) {
if (!adjustedVertex.contains(vertexID) && jobTopologyAnalyzer.getOutputs(upStream).size() > 1) {
LOGGER.debug("adjusting vertex {} target parallelism: {} -> {}", vertexID, targetParallelisms.get(vertexID), (int) Math.floor(targetParallelisms.get(vertexID) * multiOutputRatio));
targetParallelisms.put(
vertexID, (int) Math.floor(targetParallelisms.get(vertexID) * multiOutputRatio));
adjustedVertex.add(vertexID);
}
}
}
return targetParallelisms;
}
@VisibleForTesting
public Map<JobVertexID, Integer> getVertexMinParallelisms(
RestServerClient.JobConfig jobConfig,
Map<JobVertexID, TaskCheckpointStatistics> checkpointInfo,
Map<JobVertexID, TaskMetrics> taskMetrics) {
Map<JobVertexID, Integer> minParallelisms = new HashMap<>();
for (JobVertexID vertexId : jobConfig.getVertexConfigs().keySet()) {
minParallelisms.put(vertexId, 1);
if (jobTopologyAnalyzer.isSource(vertexId) && sourcePartitionCountSubs.get(vertexId).getValue() != null) {
double partitionCount = sourcePartitionCountSubs.get(vertexId).getValue().f1;
int minParallelism = (int) Math.ceil(partitionCount / maxPartitionPerTask);
if (minParallelism > minParallelisms.get(vertexId)) {
minParallelisms.put(vertexId, minParallelism);
}
}
if (checkpointInfo != null && checkpointInfo.containsKey(vertexId)) {
TaskCheckpointStatistics taskCheckpointStatistics = checkpointInfo.get(vertexId);
minParallelisms.put(
vertexId,
Math.max(minParallelisms.get(vertexId), (int) Math.ceil(1.0 * taskCheckpointStatistics.getFullStateSize() / stateSizeThreshold)));
}
if (taskMetrics != null && taskMetrics.containsKey(vertexId)) {
TaskMetrics taskMetric = taskMetrics.get(vertexId);
double minParallelism = taskMetric.getTimerCount() / timerCountThreshold;
LOGGER.debug("Timer count constraint works: constraints {}, origin constraints {}.", Math.ceil(minParallelism), minParallelisms.get(vertexId));
minParallelisms.put(vertexId, Math.max(minParallelisms.get(vertexId), (int) Math.ceil(minParallelism)));
}
}
return minParallelisms;
}
public void updateTargetParallelismsSubjectToConstraints(
Map<JobVertexID, Integer> targetParallelisms,
Map<JobVertexID, Integer> minParallelisms,
RestServerClient.JobConfig jobConfig) {
// EqualParallelismGroups (EPG)
// Group vertices that must have equal parallelism.
// A group is identified by its Leader, which could be any vertex in the group.
// All vertices in the group are Members, including the Leader.
// The target parallelism of a group should be the max value among all the Members' target parallelisms.
// Members of a group share the same max parallelism, which is the min value among all the Members' max parallelisms.
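// Example (illustrative): two vertices joined by a FORWARD edge with target
// parallelisms 4 and 6 end up in one group whose target parallelism is max(4, 6) = 6.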
Map<JobVertexID, Set<JobVertexID>> epgLeader2Members = new HashMap<>();
Map<JobVertexID, JobVertexID> epgMember2Leader = new HashMap<>();
Map<JobVertexID, Integer> epgLeader2TargetParallelism = new HashMap<>();
Map<JobVertexID, Integer> epgLeader2MaxParallelism = new HashMap<>();
// Initially, each vertex belongs to a separate group.
for (JobVertexID vertexId : jobConfig.getVertexConfigs().keySet()) {
RestServerClient.VertexConfig vertexConfig = jobConfig.getVertexConfigs().get(vertexId);
int targetParallelism = vertexConfig.getParallelism();
int maxParallelism = vertexConfig.getMaxParallelism();
if (targetParallelisms.containsKey(vertexId)) {
targetParallelism = targetParallelisms.get(vertexId);
if (jobTopologyAnalyzer.isSource(vertexId) && sourcePartitionCountSubs.get(vertexId).getValue() != null) {
// limit the number of partitions handled by each parallel reader.
double partitionCount = sourcePartitionCountSubs.get(vertexId).getValue().f1;
if (partitionCount / targetParallelism > maxPartitionPerTask) {
targetParallelism = (int) Math.ceil(partitionCount / maxPartitionPerTask);
}
if (partitionCount > 0) {
targetParallelism = (int) Math.ceil(partitionCount / Math.max(1, Math.floor(partitionCount / targetParallelism)));
}
if (partitionCount > 0 && maxParallelism > partitionCount) {
maxParallelism = (int) partitionCount;
}
}
}
// do not scale a vertex below its min parallelism (enforced e.g. by a large state size).
if (minParallelisms != null && minParallelisms.containsKey(vertexId) && targetParallelism < minParallelisms.get(vertexId)) {
targetParallelism = minParallelisms.get(vertexId);
}
// parallelism > 0
if (targetParallelism < 1) {
targetParallelism = 1;
}
// parallelism <= max
if (targetParallelism > maxParallelism && maxParallelism > 0) {
targetParallelism = maxParallelism;
}
HashSet<JobVertexID> members = new HashSet<>();
members.add(vertexId);
epgLeader2Members.put(vertexId, members);
epgMember2Leader.put(vertexId, vertexId);
epgLeader2TargetParallelism.put(vertexId, targetParallelism);
epgLeader2MaxParallelism.put(vertexId, maxParallelism);
}
// merge groups according to co-location groups
Map<AbstractID, JobVertexID> colocationGroupId2Leader = new HashMap<>();
for (JobVertexID vertexId : jobConfig.getVertexConfigs().keySet()) {
RestServerClient.VertexConfig vertexConfig = jobConfig.getVertexConfigs().get(vertexId);
AbstractID colocationGroupId = vertexConfig.getColocationGroupId();
if (colocationGroupId == null) {
continue;
}
if (colocationGroupId2Leader.containsKey(colocationGroupId)) {
JobVertexID currentGroupLeader = epgMember2Leader.get(vertexId);
JobVertexID targetGroupLeader = colocationGroupId2Leader.get(colocationGroupId);
mergeEqualParallelismGroups(
currentGroupLeader,
targetGroupLeader,
epgLeader2Members,
epgMember2Leader,
epgLeader2TargetParallelism,
epgLeader2MaxParallelism);
} else {
colocationGroupId2Leader.put(colocationGroupId, epgMember2Leader.get(vertexId));
}
}
// merge groups according to forward streams
for (JobVertexID downStreamVertex : jobConfig.getInputNodes().keySet()) {
for (Tuple2<JobVertexID, String> edge : jobConfig.getInputNodes().get(downStreamVertex)) {
JobVertexID upStreamVertex = edge.f0;
String shipStrategy = edge.f1;
if (shipStrategy.equals("FORWARD")) {
JobVertexID currentGroupLeader = epgMember2Leader.get(upStreamVertex);
JobVertexID targetGroupLeader = epgMember2Leader.get(downStreamVertex);
mergeEqualParallelismGroups(
currentGroupLeader,
targetGroupLeader,
epgLeader2Members,
epgMember2Leader,
epgLeader2TargetParallelism,
epgLeader2MaxParallelism);
}
}
}
if (checkRescale) {
// ProportionalParallelismGroups (PPG)
// Group EqualParallelismGroups whose parallelism must be proportional to each other.
// A group is identified by its Leader, which could be any EPG in the group.
// All EPGs in the group are Members, including the Leader.
// Each PPG has a base, and each EPG in the PPG has a factor.
// The parallelism of the member EPG is base * factor.
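// Example (illustrative): a RESCALE edge between EPGs with parallelisms 6 and 3
// can be kept proportional in one PPG with base 3 and factors 2 and 1.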
Map<JobVertexID, Set<JobVertexID>> ppgLeader2Members = new HashMap<>();
Map<JobVertexID, JobVertexID> ppgMember2Leader = new HashMap<>();
Map<JobVertexID, Integer> ppgLeader2Base = new HashMap<>();
Map<JobVertexID, Integer> ppgMember2Factor = new HashMap<>();
// merge groups according to rescale streams
for (JobVertexID downStreamVertex : jobConfig.getInputNodes().keySet()) {
for (Tuple2<JobVertexID, String> edge : jobConfig.getInputNodes().get(downStreamVertex)) {
JobVertexID upStreamVertex = edge.f0;
String shipStrategy = edge.f1;
if (shipStrategy.equals("RESCALE")) {
JobVertexID upStreamEpg = epgMember2Leader.get(upStreamVertex);
JobVertexID downStreamEpg = epgMember2Leader.get(downStreamVertex);
if (ppgMember2Leader.containsKey(upStreamEpg) &&
ppgMember2Leader.containsKey(downStreamEpg) &&
ppgMember2Leader.get(upStreamEpg).equals(ppgMember2Leader.get(downStreamEpg))) {
continue;
}
if (!ppgMember2Leader.containsKey(upStreamEpg)) {
HashSet<JobVertexID> members = new HashSet<>();
members.add(upStreamEpg);
ppgLeader2Members.put(upStreamEpg, members);
ppgMember2Leader.put(upStreamEpg, upStreamEpg);
ppgLeader2Base.put(upStreamEpg, epgLeader2TargetParallelism.get(upStreamEpg));
ppgMember2Factor.put(upStreamEpg, 1);
}
if (!ppgMember2Leader.containsKey(downStreamEpg)) {
HashSet<JobVertexID> members = new HashSet<>();
members.add(downStreamEpg);
ppgLeader2Members.put(downStreamEpg, members);
ppgMember2Leader.put(downStreamEpg, downStreamEpg);
ppgLeader2Base.put(downStreamEpg, epgLeader2TargetParallelism.get(downStreamEpg));
ppgMember2Factor.put(downStreamEpg, 1);
}
mergeProportionalParallelismGroups(
upStreamEpg,
downStreamEpg,
epgLeader2Members,
epgLeader2TargetParallelism,
epgLeader2MaxParallelism,
ppgLeader2Members,
ppgMember2Leader,
ppgLeader2Base,
ppgMember2Factor,
jobConfig);
}
}
}
}
// update target parallelisms
targetParallelisms.clear();
for (JobVertexID epgLeader : epgLeader2Members.keySet()) {
int targetParallelism = epgLeader2TargetParallelism.get(epgLeader);
for (JobVertexID vertexId : epgLeader2Members.get(epgLeader)) {
targetParallelisms.put(vertexId, targetParallelism);
}
}
}
private void mergeEqualParallelismGroups(
JobVertexID currentGroupLeader,
JobVertexID targetGroupLeader,
Map<JobVertexID, Set<JobVertexID>> leader2Members,
Map<JobVertexID, JobVertexID> member2Leader,
Map<JobVertexID, Integer> leader2TargetParallelism,
Map<JobVertexID, Integer> leader2MaxParallelism) {
if (currentGroupLeader.equals(targetGroupLeader)) {
return;
}
int currentGroupTargetParallelism = leader2TargetParallelism.get(currentGroupLeader);
int currentGroupMaxParallelism = leader2MaxParallelism.get(currentGroupLeader);
int targetGroupTargetParallelism = leader2TargetParallelism.get(targetGroupLeader);
int targetGroupMaxParallelism = leader2MaxParallelism.get(targetGroupLeader);
int targetParallelism = Math.max(currentGroupTargetParallelism, targetGroupTargetParallelism);
int maxParallelism = Math.min(currentGroupMaxParallelism, targetGroupMaxParallelism);
if (targetParallelism > maxParallelism) {
targetParallelism = maxParallelism;
}
leader2Members.get(targetGroupLeader).addAll(leader2Members.get(currentGroupLeader));
leader2Members.get(currentGroupLeader).forEach(member -> member2Leader.put(member, targetGroupLeader));
leader2TargetParallelism.put(targetGroupLeader, targetParallelism);
leader2MaxParallelism.put(targetGroupLeader, maxParallelism);
leader2Members.remove(currentGroupLeader);
leader2TargetParallelism.remove(currentGroupLeader);
leader2MaxParallelism.remove(currentGroupLeader);
}
@VisibleForTesting
void mergeProportionalParallelismGroups(
JobVertexID upStreamEpg,
JobVertexID downStreamEpg,
Map<JobVertexID, Set<JobVertexID>> epgLeader2Members,
Map<JobVertexID, Integer> epgLeader2TargetParallelism,
Map<JobVertexID, Integer> epgLeader2MaxParallelism,
Map<JobVertexID, Set<JobVertexID>> ppgLeader2Members,
Map<JobVertexID, JobVertexID> ppgMember2Leader,
Map<JobVertexID, Integer> ppgLeader2Base,
Map<JobVertexID, Integer> ppgMember2Factor,
RestServerClient.JobConfig jobConfig) {
JobVertexID upStreamPpgLeader = ppgMember2Leader.get(upStreamEpg);
JobVertexID downStreamPpgLeader = ppgMember2Leader.get(downStreamEpg);
int upStreamBase = ppgLeader2Base.get(upStreamPpgLeader);
int upStreamFactor = ppgMember2Factor.get(upStreamEpg);
int downStreamBase = ppgLeader2Base.get(downStreamPpgLeader);
int downStreamFactor = ppgMember2Factor.get(downStreamEpg);
// upstream-greater plan: upstream parallelism becomes a multiple (k) of downstream parallelism:
// (upStreamBase + upStreamBaseIncrease) * upStreamFactor = k * (downStreamBase + downStreamBaseIncrease) * downStreamFactor
int upStreamGreaterUpStreamBaseIncrease = 0;
int upStreamGreaterDownStreamBaseIncrease;
int upStreamGreaterK;
upStreamGreaterUpStreamBaseIncreaseLoop:
while (true) {
int upStreamParallelism = (upStreamBase + upStreamGreaterUpStreamBaseIncrease) * upStreamFactor;
upStreamGreaterK = (int) Math.ceil(1.0 * upStreamParallelism / downStreamFactor / downStreamBase);
upStreamGreaterKLoop:
while (upStreamGreaterK > 0) {
if (upStreamParallelism / downStreamFactor / upStreamGreaterK < downStreamBase) {
upStreamGreaterK--;
continue upStreamGreaterKLoop;
}
if (upStreamParallelism / downStreamFactor / upStreamGreaterK >= downStreamBase * 2) {
break upStreamGreaterKLoop;
}
if (upStreamParallelism / downStreamFactor % upStreamGreaterK == 0) {
upStreamGreaterDownStreamBaseIncrease = upStreamParallelism / downStreamFactor / upStreamGreaterK - downStreamBase;
break upStreamGreaterUpStreamBaseIncreaseLoop;
}
upStreamGreaterK--;
}
upStreamGreaterUpStreamBaseIncrease++;
}
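// Worked example for the search above (assumed numbers): with upStreamBase = 3,
// downStreamBase = 2 and both factors 1, the search yields k = 1 with the
// downstream base raised from 2 to 3, i.e. 3 * 1 = 1 * 3 * 1.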
boolean upStreamGreaterFeasible = true;
for (JobVertexID upStreamPpgMemberEpg : ppgLeader2Members.get(upStreamPpgLeader)) {
int upStreamPpgMemberParallelism = (upStreamBase + upStreamGreaterUpStreamBaseIncrease) * ppgMember2Factor.get(upStreamPpgMemberEpg);
if (upStreamPpgMemberParallelism > epgLeader2MaxParallelism.get(upStreamPpgMemberEpg)) {
upStreamGreaterFeasible = false;
break;
}
}
for (JobVertexID downStreamPpgMemberEpg : ppgLeader2Members.get(downStreamPpgLeader)) {
int downStreamPpgMemberParallelism = (downStreamBase + upStreamGreaterDownStreamBaseIncrease) * ppgMember2Factor.get(downStreamPpgMemberEpg);
if (downStreamPpgMemberParallelism > epgLeader2MaxParallelism.get(downStreamPpgMemberEpg)) {
upStreamGreaterFeasible = false;
break;
}
}
// downstream-greater plan: downstream parallelism becomes a multiple (k) of upstream parallelism:
// k * (upStreamBase + upStreamBaseIncrease) * upStreamFactor = (downStreamBase + downStreamBaseIncrease) * downStreamFactor
int downStreamGreaterDownStreamBaseIncrease = 0;
int downStreamGreaterUpStreamBaseIncrease;
int downStreamGreaterK;
downStreamGreaterDownStreamBaseIncreaseLoop:
while (true) {
int downStreamParallelism = (downStreamBase + downStreamGreaterDownStreamBaseIncrease) * downStreamFactor;
downStreamGreaterK = (int) Math.ceil(1.0 * downStreamParallelism / upStreamFactor / upStreamBase);
downStreamGreaterKLoop:
while (downStreamGreaterK > 0) {
if (downStreamParallelism / upStreamFactor / downStreamGreaterK < upStreamBase) {
downStreamGreaterK--;
continue downStreamGreaterKLoop;
}
if (downStreamParallelism / upStreamFactor / downStreamGreaterK >= upStreamBase * 2) {
break downStreamGreaterKLoop;
}
if (downStreamParallelism / upStreamFactor % downStreamGreaterK == 0) {
downStreamGreaterUpStreamBaseIncrease = downStreamParallelism / upStreamFactor / downStreamGreaterK - upStreamBase;
break downStreamGreaterDownStreamBaseIncreaseLoop;
}
downStreamGreaterK--;
}
downStreamGreaterDownStreamBaseIncrease++;
}
boolean downStreamGreaterFeasible = true;
for (JobVertexID downStreamPpgMemberEpg : ppgLeader2Members.get(downStreamPpgLeader)) {
int downStreamPpgMemberParallelism = (downStreamBase + downStreamGreaterDownStreamBaseIncrease) * ppgMember2Factor.get(downStreamPpgMemberEpg);
if (downStreamPpgMemberParallelism > epgLeader2MaxParallelism.get(downStreamPpgMemberEpg)) {
downStreamGreaterFeasible = false;
break;
}
}
for (JobVertexID upStreamPpgMemberEpg : ppgLeader2Members.get(upStreamPpgLeader)) {
int upStreamPpgMemberParallelism = (upStreamBase + downStreamGreaterUpStreamBaseIncrease) * ppgMember2Factor.get(upStreamPpgMemberEpg);
if (upStreamPpgMemberParallelism > epgLeader2MaxParallelism.get(upStreamPpgMemberEpg)) {
downStreamGreaterFeasible = false;
break;
}
}
// choose plan
boolean useUpStreamGreaterPlan;
if (upStreamGreaterFeasible && downStreamGreaterFeasible) {
// calculate cost
ResourceSpec upStreamBaseIncreaseCost = new ResourceSpec.Builder().build();
for (JobVertexID upStreamPpgMemberEpg : ppgLeader2Members.get(upStreamPpgLeader)) {
ResourceSpec memberEpgIncreaseCost = new ResourceSpec.Builder().build();
for (JobVertexID upStreamPpgMemberVertex : epgLeader2Members.get(upStreamPpgMemberEpg)) {
memberEpgIncreaseCost = memberEpgIncreaseCost.sum(jobConfig.getVertexConfigs().get(upStreamPpgMemberVertex).getResourceSpec());
}
for (int i = 0; i < ppgMember2Factor.get(upStreamPpgMemberEpg); ++i) {
upStreamBaseIncreaseCost = upStreamBaseIncreaseCost.sum(memberEpgIncreaseCost);
}
}
ResourceSpec downStreamBaseIncreaseCost = new ResourceSpec.Builder().build();
for (JobVertexID downStreamPpgMemberEpg : ppgLeader2Members.get(downStreamPpgLeader)) {
ResourceSpec memberEpgIncreaseCost = new ResourceSpec.Builder().build();
for (JobVertexID downStreamPpgMemberVertex : epgLeader2Members.get(downStreamPpgMemberEpg)) {
memberEpgIncreaseCost = memberEpgIncreaseCost.sum(jobConfig.getVertexConfigs().get(downStreamPpgMemberVertex).getResourceSpec());
}
for (int i = 0; i < ppgMember2Factor.get(downStreamPpgMemberEpg); ++i) {
downStreamBaseIncreaseCost = downStreamBaseIncreaseCost.sum(memberEpgIncreaseCost);
}
}
ResourceSpec upStreamGreaterCost = new ResourceSpec.Builder().build();
for (int i = 0; i < upStreamGreaterUpStreamBaseIncrease; ++i) {
upStreamGreaterCost = upStreamGreaterCost.sum(upStreamBaseIncreaseCost);
}
for (int i = 0; i < upStreamGreaterDownStreamBaseIncrease; ++i) {
upStreamGreaterCost = upStreamGreaterCost.sum(downStreamBaseIncreaseCost);
}
ResourceSpec downStreamGreaterCost = new ResourceSpec.Builder().build();
for (int i = 0; i < downStreamGreaterDownStreamBaseIncrease; ++i) {
downStreamGreaterCost = downStreamGreaterCost.sum(downStreamBaseIncreaseCost);
}
for (int i = 0; i < downStreamGreaterUpStreamBaseIncrease; ++i) {
downStreamGreaterCost = downStreamGreaterCost.sum(upStreamBaseIncreaseCost);
}
int upStreamGreaterCostMem = upStreamGreaterCost.getHeapMemory() + upStreamGreaterCost.getDirectMemory() + upStreamGreaterCost.getNativeMemory();
int downStreamGreaterCostMem = downStreamGreaterCost.getHeapMemory() + downStreamGreaterCost.getDirectMemory() + downStreamGreaterCost.getNativeMemory();
// compare cost
if (upStreamGreaterCostMem > downStreamGreaterCostMem) {
useUpStreamGreaterPlan = false;
} else if (upStreamGreaterCostMem < downStreamGreaterCostMem) {
useUpStreamGreaterPlan = true;
} else {
if (upStreamGreaterCost.getCpuCores() > downStreamGreaterCost.getCpuCores()) {
useUpStreamGreaterPlan = false;
} else {
useUpStreamGreaterPlan = true;
}
}
} else if (upStreamGreaterFeasible) {
useUpStreamGreaterPlan = true;
} else if (downStreamGreaterFeasible) {
useUpStreamGreaterPlan = false;
} else {
LOGGER.debug("Could not merge ProportionalParallelism Groups {} and {}.",
ppgLeader2Members.get(upStreamPpgLeader), ppgLeader2Members.get(downStreamPpgLeader));
return;
}
// update Proportional Parallelism Groups
if (useUpStreamGreaterPlan) {
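// Rebase the merged group on the gcd of the two new base parallelisms so that
// member factors stay integral; e.g. (illustrative) new bases 6 and 4 give
// newBase = gcd(6, 4) = 2 with factor ratios 3 and 2 respectively.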
BigInteger upStreamNewBase = BigInteger.valueOf(upStreamBase + upStreamGreaterUpStreamBaseIncrease);
BigInteger downStreamNewBase = BigInteger.valueOf(downStreamBase + upStreamGreaterDownStreamBaseIncrease);
int newBase = upStreamNewBase.gcd(downStreamNewBase).intValue();
int upStreamFactorRatio = (upStreamBase + upStreamGreaterUpStreamBaseIncrease) / newBase;
int downStreamFactorRatio = (downStreamBase + upStreamGreaterDownStreamBaseIncrease) / newBase;
for (JobVertexID upStreamPpgMemberEpg : ppgLeader2Members.get(upStreamPpgLeader)) {
ppgMember2Leader.put(upStreamPpgMemberEpg, downStreamPpgLeader);
ppgMember2Factor.put(upStreamPpgMemberEpg, ppgMember2Factor.get(upStreamPpgMemberEpg) * upStreamFactorRatio);
}
for (JobVertexID downStreamPpgMemberEpg : ppgLeader2Members.get(downStreamPpgLeader)) {
ppgMember2Factor.put(downStreamPpgMemberEpg, ppgMember2Factor.get(downStreamPpgMemberEpg) * downStreamFactorRatio);
}
ppgLeader2Members.get(downStreamPpgLeader).addAll(ppgLeader2Members.get(upStreamPpgLeader));
ppgLeader2Members.remove(upStreamPpgLeader);
ppgLeader2Base.put(downStreamPpgLeader, newBase);
ppgLeader2Base.remove(upStreamPpgLeader);
for (JobVertexID downStreamPpgMemberEpg : ppgLeader2Members.get(downStreamPpgLeader)) {
epgLeader2TargetParallelism.put(downStreamPpgMemberEpg, newBase * ppgMember2Factor.get(downStreamPpgMemberEpg));
}
} else {
BigInteger upStreamNewBase = BigInteger.valueOf(upStreamBase + downStreamGreaterUpStreamBaseIncrease);
BigInteger downStreamNewBase = BigInteger.valueOf(downStreamBase + downStreamGreaterDownStreamBaseIncrease);
int newBase = upStreamNewBase.gcd(downStreamNewBase).intValue();
int upStreamFactorRatio = (upStreamBase + downStreamGreaterUpStreamBaseIncrease) / newBase;
int downStreamFactorRatio = (downStreamBase + downStreamGreaterDownStreamBaseIncrease) / newBase;
for (JobVertexID downStreamPpgMemberEpg : ppgLeader2Members.get(downStreamPpgLeader)) {
ppgMember2Leader.put(downStreamPpgMemberEpg, upStreamPpgLeader);
ppgMember2Factor.put(downStreamPpgMemberEpg, ppgMember2Factor.get(downStreamPpgMemberEpg) * downStreamFactorRatio);
}
for (JobVertexID upStreamPpgMemberEpg : ppgLeader2Members.get(upStreamPpgLeader)) {
ppgMember2Factor.put(upStreamPpgMemberEpg, ppgMember2Factor.get(upStreamPpgMemberEpg) * upStreamFactorRatio);
}
ppgLeader2Members.get(upStreamPpgLeader).addAll(ppgLeader2Members.get(downStreamPpgLeader));
ppgLeader2Members.remove(downStreamPpgLeader);
ppgLeader2Base.put(upStreamPpgLeader, newBase);
ppgLeader2Base.remove(downStreamPpgLeader);
for (JobVertexID upStreamPpgMemberEpg : ppgLeader2Members.get(upStreamPpgLeader)) {
epgLeader2TargetParallelism.put(upStreamPpgMemberEpg, newBase * ppgMember2Factor.get(upStreamPpgMemberEpg));
}
}
}
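/**
 * Builds the RescaleJobParallelism action from the target parallelisms and, if
 * the projected job totals exceed the configured CPU/memory caps, scales the
 * plan back down via MaxResourceLimitUtil before emitting it.
 */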
private RescaleJobParallelism generateRescaleParallelismAction(
Map<JobVertexID, Integer> targetParallelisms,
Map<JobVertexID, Integer> minParallelisms,
RestServerClient.JobConfig jobConfig) {
if (targetParallelisms.isEmpty()) {
return null;
}
// generate rescale action from target parallelisms
RescaleJobParallelism rescaleJobParallelism = new RescaleJobParallelism(jobID, timeout);
for (JobVertexID vertexId : targetParallelisms.keySet()) {
RestServerClient.VertexConfig vertexConfig = jobConfig.getVertexConfigs().get(vertexId);
rescaleJobParallelism.addVertex(
vertexId, vertexConfig.getParallelism(), targetParallelisms.get(vertexId),
vertexConfig.getResourceSpec(), vertexConfig.getResourceSpec());
}
// update rescale action subject to max resource limit
if (maxCpuLimit != Double.MAX_VALUE || maxMemoryLimit != Integer.MAX_VALUE) {
RestServerClient.JobConfig targetJobConfig = rescaleJobParallelism.getAppliedJobConfig(jobConfig);
double targetTotalCpu = targetJobConfig.getJobTotalCpuCores();
int targetTotalMem = targetJobConfig.getJobTotalMemoryMb();
if (targetTotalCpu > maxCpuLimit || targetTotalMem > maxMemoryLimit) {
LOGGER.debug(
"Trying to scale down parallelism: total resource of target job config =<{}, {}> exceeds max limit =<{}, {}>.",
targetTotalCpu, targetTotalMem, maxCpuLimit, maxMemoryLimit);
RestServerClient.JobConfig adjustedJobConfig = MaxResourceLimitUtil
.scaleDownJobConfigToMaxResourceLimit(
targetJobConfig, minParallelisms, maxCpuLimit, maxMemoryLimit);
if (adjustedJobConfig == null) {
LOGGER.debug("Give up adjusting.");
return null;
}
rescaleJobParallelism = new RescaleJobParallelism(jobID, timeout);
for (JobVertexID vertexId : adjustedJobConfig.getVertexConfigs().keySet()) {
RestServerClient.VertexConfig originVertexConfig = jobConfig.getVertexConfigs().get(vertexId);
RestServerClient.VertexConfig adjustedVertexConfig = adjustedJobConfig.getVertexConfigs().get(
vertexId);
rescaleJobParallelism.addVertex(vertexId,
originVertexConfig.getParallelism(),
adjustedVertexConfig.getParallelism(),
originVertexConfig.getResourceSpec(),
adjustedVertexConfig.getResourceSpec());
}
}
}
RestServerClient.JobConfig appliedJobConfig = rescaleJobParallelism.getAppliedJobConfig(jobConfig);
LOGGER.debug("Resource applying generated action: =<{}, {}>.",
appliedJobConfig.getJobTotalCpuCores(), appliedJobConfig.getJobTotalMemoryMb());
rescaleJobParallelism.exculdeMinorDiffVertices(monitor.getConfig());
return rescaleJobParallelism;
}
}