/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.llap.tezplugins;
import com.google.common.io.ByteArrayDataOutput;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.hive.llap.tezplugins.metrics.LlapMetricsCollector;
import org.apache.hadoop.hive.llap.tezplugins.scheduler.StatsPerDag;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.metrics2.MetricsSource;
import org.apache.hadoop.metrics2.MetricsSystem;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.hive.registry.impl.TezAmRegistryImpl;
import org.apache.hadoop.hive.registry.ServiceInstanceStateChangeListener;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Base64;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.NavigableMap;
import java.util.Random;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.DelayQueue;
import java.util.concurrent.Delayed;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import org.apache.commons.lang3.mutable.MutableInt;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.common.JvmPauseMonitor;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
import org.apache.hadoop.hive.llap.metrics.LlapMetricsSystem;
import org.apache.hadoop.hive.llap.metrics.MetricsUtils;
import org.apache.hadoop.hive.llap.metrics.ReadWriteLockMetrics;
import org.apache.hadoop.hive.llap.daemon.rpc.LlapDaemonProtocolProtos.QueryIdentifierProto;
import org.apache.hadoop.hive.llap.plugin.rpc.LlapPluginProtocolProtos.UpdateQueryRequestProto;
import org.apache.hadoop.hive.llap.registry.LlapServiceInstance;
import org.apache.hadoop.hive.llap.registry.LlapServiceInstanceSet;
import org.apache.hadoop.hive.llap.registry.impl.InactiveServiceInstance;
import org.apache.hadoop.hive.llap.registry.impl.LlapRegistryService;
import org.apache.hadoop.hive.llap.tezplugins.LlapTaskCommunicator.OperationCallback;
import org.apache.hadoop.hive.llap.tezplugins.endpoint.LlapPluginServerImpl;
import org.apache.hadoop.hive.llap.tezplugins.helpers.MonotonicClock;
import org.apache.hadoop.hive.llap.tezplugins.metrics.LlapTaskSchedulerMetrics;
import org.apache.hadoop.hive.llap.tezplugins.scheduler.LoggingFutureCallback;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.yarn.api.records.Container;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerState;
import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.api.records.NodeId;
import org.apache.hadoop.yarn.api.records.NodeReport;
import org.apache.hadoop.yarn.api.records.NodeState;
import org.apache.hadoop.yarn.api.records.Priority;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.util.Clock;
import org.apache.hive.common.util.Ref;
import org.apache.tez.common.TezUtils;
import org.apache.tez.common.security.JobTokenIdentifier;
import org.apache.tez.common.security.JobTokenSecretManager;
import org.apache.tez.dag.api.TezUncheckedException;
import org.apache.tez.dag.app.dag.DAG;
import org.apache.tez.dag.app.dag.TaskAttempt;
import org.apache.tez.dag.app.dag.Vertex;
import org.apache.tez.dag.app.dag.impl.Edge;
import org.apache.tez.dag.records.TezDAGID;
import org.apache.tez.dag.records.TezTaskAttemptID;
import org.apache.tez.dag.records.TezVertexID;
import org.apache.tez.serviceplugins.api.DagInfo;
import org.apache.tez.serviceplugins.api.ServicePluginErrorDefaults;
import org.apache.tez.serviceplugins.api.TaskAttemptEndReason;
import org.apache.tez.serviceplugins.api.TaskScheduler;
import org.apache.tez.serviceplugins.api.TaskSchedulerContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.google.common.io.ByteStreams;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.ListenableFuture;
import com.google.common.util.concurrent.ListeningExecutorService;
import com.google.common.util.concurrent.MoreExecutors;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
public class LlapTaskSchedulerService extends TaskScheduler {
private static final Logger LOG = LoggerFactory.getLogger(LlapTaskSchedulerService.class);
private static final Logger WM_LOG = LoggerFactory.getLogger("GuaranteedTasks");
private static final TaskStartComparator TASK_INFO_COMPARATOR = new TaskStartComparator();
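// A numerically lower Tez Priority value means a more urgent task, so ascending order makes
// pendingTasks iterate from the highest-priority level downwards.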
private static final Comparator<Priority> PRIORITY_COMPARATOR = new Comparator<Priority>() {
@Override
public int compare(Priority o1, Priority o2) {
return o1.getPriority() - o2.getPriority();
}
};
private final UpdateOperationCallback UPDATE_CALLBACK = new UpdateOperationCallback();
private final class UpdateOperationCallback implements OperationCallback<Boolean, TaskInfo> {
@Override
public void setDone(TaskInfo ctx, Boolean result) {
handleUpdateResult(ctx, result);
}
@Override
public void setError(TaskInfo ctx, Throwable t) {
// The exception has been logged by the lower layer.
handleUpdateResult(ctx, false);
}
}
private final class RegisterDagCallback implements OperationCallback<QueryIdentifierProto, Void> {
private final LlapServiceInstance llapServiceInstance;
private final NodeInfo nodeInfo;
RegisterDagCallback(NodeInfo nodeInfo, LlapServiceInstance llapServiceInstance) {
this.nodeInfo = nodeInfo;
this.llapServiceInstance = llapServiceInstance;
}
@Override
public void setDone(Void v, QueryIdentifierProto result) {
LOG.info("Dag with"
+ " appId=" + result.getApplicationIdString()
+ " dagId=" + result.getDagIndex()
+ " registered successfully for node " + nodeInfo.getHost());
addNode(nodeInfo, llapServiceInstance);
}
@Override
public void setError(Void v, Throwable t) {
LOG.warn("Error registering dag for node " + nodeInfo.getHost(), t);
// In case we fail to register the dag we add the node anyway
// We will try to register the dag when we schedule the first container
addNode(nodeInfo, llapServiceInstance);
}
}
// Shared singleton MetricsSource instance for the task scheduler's R/W locks
private static final MetricsSource LOCK_METRICS;
static {
// create and register the MetricsSource for lock metrics
MetricsSystem ms = LlapMetricsSystem.instance();
LOCK_METRICS =
ReadWriteLockMetrics.createLockMetricsSource("TaskScheduler");
ms.register("LLAPTaskSchedulerLockMetrics",
"Lock metrics for R/W locks LLAP task scheduler", LOCK_METRICS);
}
// TODO: this is an ugly hack; see the same in LlapTaskCommunicator for discussion.
// This only lives for the duration of the service init.
static LlapTaskSchedulerService instance = null;
private final Configuration conf;
// interface into the registry service
private LlapServiceInstanceSet activeInstances;
// Tracks all instances, including ones which have been disabled in the past.
// LinkedHashMap to provide the same iteration order when selecting a random host.
@VisibleForTesting
final Map<String, NodeInfo> instanceToNodeMap = new LinkedHashMap<>();
// TODO Ideally, remove elements from this once it's known that no tasks are linked to the instance (all deallocated)
// Tracks tasks which could not be allocated immediately.
// Tasks are tracked in the order requests come in, at different priority levels.
// TODO HIVE-13538 For tasks at the same priority level, it may be worth attempting to schedule tasks with
// locality information before those without locality information
private final TreeMap<Priority, List<TaskInfo>> pendingTasks = new TreeMap<>(PRIORITY_COMPARATOR);
// Tracks running and queued (allocated) tasks. Cleared after a task completes.
private final ConcurrentMap<Object, TaskInfo> knownTasks = new ConcurrentHashMap<>();
private final Map<TezTaskAttemptID, TaskInfo> tasksById = new HashMap<>();
// Tracks tasks which are running. Useful for selecting a task to preempt based on when it started.
private final TreeMap<Integer, TreeSet<TaskInfo>> guaranteedTasks = new TreeMap<>(),
speculativeTasks = new TreeMap<>();
private final LlapPluginServerImpl pluginEndpoint;
private final boolean workloadManagementEnabled;
// Queue for disabled nodes. Nodes make it out of this queue when their expiration timeout is hit.
@VisibleForTesting
final DelayQueue<NodeInfo> disabledNodesQueue = new DelayQueue<>();
@VisibleForTesting
final DelayQueue<TaskInfo> delayedTaskQueue = new DelayQueue<>();
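// Queues drained by the scheduler threads below: judging by the names, tasks that need to be
// (re)scheduled at high priority, and running tasks selected as candidates for preemption.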
@VisibleForTesting
final BlockingQueue<TaskInfo> highPriorityTaskQueue = new LinkedBlockingQueue<>();
final BlockingQueue<TaskInfo> preemptionCandidates = new LinkedBlockingQueue<>();
private volatile boolean dagRunning = false;
private final ContainerFactory containerFactory;
@VisibleForTesting
final Clock clock;
private final ListeningExecutorService nodeEnabledExecutor;
private final NodeEnablerCallable nodeEnablerCallable =
new NodeEnablerCallable();
private final ListeningExecutorService delayedTaskSchedulerExecutor;
@VisibleForTesting
final DelayedTaskSchedulerCallable delayedTaskSchedulerCallable;
private final ListeningExecutorService preemptSchedulerExecutor;
final PreemptionSchedulerCallable preemptSchedulerCallable = new PreemptionSchedulerCallable();
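// The main coarse-grained scheduler lock (the "epic lock" referred to in comments elsewhere),
// guarding most of the mutable scheduling state; wrapped with ReadWriteLockMetrics in the constructor.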
private final ReadWriteLock lock;
private final Lock readLock;
private final Lock writeLock;
private final Lock scheduleLock = new ReentrantLock();
private final Condition scheduleCondition = scheduleLock.newCondition();
private final AtomicBoolean isClusterCapacityFull = new AtomicBoolean(false);
private final AtomicBoolean pendingScheduleInvocations = new AtomicBoolean(false);
private final ListeningExecutorService schedulerExecutor;
private final SchedulerCallable schedulerCallable = new SchedulerCallable();
private final AtomicBoolean isStopped = new AtomicBoolean(false);
// Tracks total pending preemptions.
private final AtomicInteger pendingPreemptions = new AtomicInteger(0);
// Tracks pending preemptions per host, keyed by hostname. Always to be accessed inside a lock.
private final Map<String, MutableInt> pendingPreemptionsPerHost = new HashMap<>();
private final NodeBlacklistConf nodeBlacklistConf;
private final LocalityDelayConf localityDelayConf;
private final int numSchedulableTasksPerNode;
// when there are no live nodes in the cluster and this timeout elapses the query is failed
private final long timeout;
private final Lock timeoutLock = new ReentrantLock();
private final ScheduledExecutorService timeoutExecutor;
private final ScheduledExecutorService scheduledLoggingExecutor;
private final SchedulerTimeoutMonitor timeoutMonitor;
private ScheduledFuture<?> timeoutFuture;
private final AtomicReference<ScheduledFuture<?>> timeoutFutureRef = new AtomicReference<>(null);
private final AtomicInteger assignedTaskCounter = new AtomicInteger(0);
private final LlapRegistryService registry = new LlapRegistryService(false);
private final TezAmRegistryImpl amRegistry;
private volatile ListenableFuture<Void> nodeEnablerFuture;
private volatile ListenableFuture<Void> delayedTaskSchedulerFuture;
private volatile ListenableFuture<Void> preemptTaskSchedulerFuture;
private volatile ListenableFuture<Void> schedulerFuture;
@VisibleForTesting
private final AtomicInteger dagCounter = new AtomicInteger(1);
// Statistics to track allocations
// All of stats variables are visible for testing.
@VisibleForTesting
StatsPerDag dagStats = new StatsPerDag();
private final LlapTaskSchedulerMetrics metrics;
private final JvmPauseMonitor pauseMonitor;
private final Random random = new Random();
private int totalGuaranteed = 0, unusedGuaranteed = 0;
private final boolean consistentSplits;
/**
* An internal version to make sure we don't race and overwrite a newer totalGuaranteed count in
* ZK with an older one, without requiring us to make ZK updates under the main writeLock.
* This is updated under writeLock, together with totalGuaranteed.
*/
private long totalGuaranteedVersion = Long.MIN_VALUE;
private final Object registryUpdateLock = new Object(); // The lock for ZK updates.
/** The last totalGuaranteedVersion sent to ZK. Updated under registryUpdateLock. */
private long tgVersionSent = Long.MIN_VALUE;
private LlapTaskCommunicator communicator;
private final int amPort;
private final String serializedToken, jobIdForToken;
// We expect the DAGs to not be super large, so store full dependency set for each vertex to
// avoid traversing the tree later. To save memory, this could be an array (of byte arrays?).
private final Object outputsLock = new Object();
private TezDAGID depsDagId = null;
private Map<Integer, Set<Integer>> transitiveOutputs;
private LlapMetricsCollector llapMetricsCollector;
public LlapTaskSchedulerService(TaskSchedulerContext taskSchedulerContext) {
this(taskSchedulerContext, new MonotonicClock(), true);
}
// The fields that HS2 uses to give AM information about plugin endpoint.
// Some of these will be removed when AM registry is implemented, as AM will generate and publish them.
/** Whether to enable the endpoint. */
public static final String LLAP_PLUGIN_ENDPOINT_ENABLED = "llap.plugin.endpoint.enabled";
@VisibleForTesting
public LlapTaskSchedulerService(TaskSchedulerContext taskSchedulerContext, Clock clock,
boolean initMetrics) {
super(taskSchedulerContext);
this.clock = clock;
this.amPort = taskSchedulerContext.getAppClientPort();
this.delayedTaskSchedulerCallable = createDelayedTaskSchedulerCallable();
try {
this.conf = TezUtils.createConfFromUserPayload(taskSchedulerContext.getInitialUserPayload());
} catch (IOException e) {
throw new TezUncheckedException(
"Failed to parse user payload for " + LlapTaskSchedulerService.class.getSimpleName(), e);
}
lock = ReadWriteLockMetrics.wrap(conf,
new ReentrantReadWriteLock(),
LOCK_METRICS);
readLock = lock.readLock();
writeLock = lock.writeLock();
this.consistentSplits = HiveConf.getBoolVar(conf, ConfVars.LLAP_CLIENT_CONSISTENT_SPLITS);
if (conf.getBoolean(LLAP_PLUGIN_ENDPOINT_ENABLED, false)) {
JobTokenSecretManager sm = null;
if (UserGroupInformation.isSecurityEnabled()) {
// Set up the security for plugin endpoint.
// We will create the token and publish it in the AM registry.
// Note: this application ID is bogus and is only needed for JobTokenSecretManager.
ApplicationId id = ApplicationId.newInstance(
System.nanoTime(), (int)(System.nanoTime() % 100000));
Token<JobTokenIdentifier> token = createAmsToken(id);
serializedToken = serializeToken(token);
jobIdForToken = token.getService().toString();
sm = new JobTokenSecretManager();
sm.addTokenForJob(jobIdForToken, token);
} else {
serializedToken = jobIdForToken = null;
}
pluginEndpoint = new LlapPluginServerImpl(sm,
HiveConf.getIntVar(conf, ConfVars.LLAP_PLUGIN_RPC_NUM_HANDLERS), this, HiveConf.getIntVar(conf, ConfVars.LLAP_PLUGIN_RPC_PORT));
} else {
serializedToken = jobIdForToken = null;
pluginEndpoint = null;
}
// This is called once per AM, so we don't get the starting duck count here.
this.containerFactory = new ContainerFactory(taskSchedulerContext.getApplicationAttemptId(),
taskSchedulerContext.getCustomClusterIdentifier());
// TODO HIVE-13483 Get all of these properties from the registry. This will need to take care of different instances
// publishing potentially different values when we support changing configurations dynamically.
// For now, this can simply be fetched from a single registry instance.
this.nodeBlacklistConf = new NodeBlacklistConf(
HiveConf.getTimeVar(conf, ConfVars.LLAP_TASK_SCHEDULER_NODE_REENABLE_MIN_TIMEOUT_MS,
TimeUnit.MILLISECONDS),
HiveConf.getTimeVar(conf, ConfVars.LLAP_TASK_SCHEDULER_NODE_REENABLE_MAX_TIMEOUT_MS,
TimeUnit.MILLISECONDS),
HiveConf.getFloatVar(conf, ConfVars.LLAP_TASK_SCHEDULER_NODE_DISABLE_BACK_OFF_FACTOR));
this.numSchedulableTasksPerNode =
HiveConf.getIntVar(conf, ConfVars.LLAP_TASK_SCHEDULER_NUM_SCHEDULABLE_TASKS_PER_NODE);
long localityDelayMs = HiveConf
.getTimeVar(conf, ConfVars.LLAP_TASK_SCHEDULER_LOCALITY_DELAY, TimeUnit.MILLISECONDS);
this.localityDelayConf = new LocalityDelayConf(localityDelayMs);
this.timeoutMonitor = new SchedulerTimeoutMonitor();
this.timeout = HiveConf.getTimeVar(conf,
ConfVars.LLAP_DAEMON_TASK_SCHEDULER_TIMEOUT_SECONDS, TimeUnit.MILLISECONDS);
this.timeoutExecutor = Executors.newSingleThreadScheduledExecutor(
new ThreadFactoryBuilder().setDaemon(true).setNameFormat("LlapTaskSchedulerTimeoutMonitor")
.build());
this.timeoutFuture = null;
this.scheduledLoggingExecutor = Executors.newSingleThreadScheduledExecutor(
new ThreadFactoryBuilder().setDaemon(true).setNameFormat("LlapTaskSchedulerTimedLogThread")
.build());
if (HiveConf.getTimeVar(conf,
HiveConf.ConfVars.LLAP_TASK_SCHEDULER_AM_COLLECT_DAEMON_METRICS_MS, TimeUnit.MILLISECONDS) > 0) {
this.llapMetricsCollector = new LlapMetricsCollector(conf, registry);
this.registry.registerServiceListener(llapMetricsCollector);
}
String instanceId = HiveConf.getTrimmedVar(conf, ConfVars.LLAP_DAEMON_SERVICE_HOSTS);
Preconditions.checkNotNull(instanceId, ConfVars.LLAP_DAEMON_SERVICE_HOSTS.varname
+ " must be defined");
ExecutorService executorServiceRaw =
Executors.newSingleThreadExecutor(
new ThreadFactoryBuilder().setDaemon(true).setNameFormat("LlapSchedulerNodeEnabler").build());
nodeEnabledExecutor = MoreExecutors.listeningDecorator(executorServiceRaw);
ExecutorService delayedTaskSchedulerExecutorRaw = Executors.newFixedThreadPool(1,
new ThreadFactoryBuilder().setDaemon(true).setNameFormat("LlapSchedulerDelayedTaskHandler")
.build());
delayedTaskSchedulerExecutor =
MoreExecutors.listeningDecorator(delayedTaskSchedulerExecutorRaw);
ExecutorService preemptTaskSchedulerExecutorRaw = Executors.newFixedThreadPool(1,
new ThreadFactoryBuilder().setDaemon(true).setNameFormat("LlapSchedulerPreemptTaskHandler")
.build());
preemptSchedulerExecutor = MoreExecutors.listeningDecorator(preemptTaskSchedulerExecutorRaw);
ExecutorService schedulerExecutorServiceRaw = Executors.newSingleThreadExecutor(
new ThreadFactoryBuilder().setDaemon(true).setNameFormat("LlapScheduler").build());
schedulerExecutor = MoreExecutors.listeningDecorator(schedulerExecutorServiceRaw);
if (initMetrics && !conf.getBoolean(ConfVars.HIVE_IN_TEST.varname, false)) {
// Initialize the metrics system
LlapMetricsSystem.initialize("LlapTaskScheduler");
this.pauseMonitor = new JvmPauseMonitor(conf);
pauseMonitor.start();
String displayName = "LlapTaskSchedulerMetrics-" + MetricsUtils.getHostName();
String sessionId = conf.get("llap.daemon.metrics.sessionid");
// TODO: Not sure about the use of this. Should we instead use workerIdentity as sessionId?
this.metrics = LlapTaskSchedulerMetrics.create(displayName, sessionId);
} else {
this.metrics = null;
this.pauseMonitor = null;
}
String hostsString = HiveConf.getVar(conf, ConfVars.LLAP_DAEMON_SERVICE_HOSTS);
LOG.info("Running with configuration: hosts={}, numSchedulableTasksPerNode={}, "
+ "nodeBlacklistConf={}, localityConf={} consistentSplits={}",
hostsString, numSchedulableTasksPerNode, nodeBlacklistConf, localityDelayConf, consistentSplits);
this.amRegistry = TezAmRegistryImpl.create(conf, true);
this.workloadManagementEnabled =
!StringUtils.isEmpty(conf.get(ConfVars.HIVE_SERVER2_TEZ_INTERACTIVE_QUEUE.varname, "").trim());
synchronized (LlapTaskCommunicator.pluginInitLock) {
LlapTaskCommunicator peer = LlapTaskCommunicator.instance;
if (peer != null) {
// We are the last to initialize.
this.setTaskCommunicator(peer);
peer.setScheduler(this);
LlapTaskCommunicator.instance = null;
} else {
instance = this;
}
}
}
private Map<Integer, Set<Integer>> getDependencyInfo(TezDAGID depsDagId) {
// This logic assumes one dag at a time; if it was not the case it'd keep rewriting it.
synchronized (outputsLock) {
if (depsDagId == this.depsDagId) return transitiveOutputs;
this.depsDagId = depsDagId;
if (!HiveConf.getBoolVar(conf, ConfVars.LLAP_TASK_SCHEDULER_PREEMPT_INDEPENDENT)) {
this.transitiveOutputs = getTransitiveVertexOutputs(getContext().getCurrentDagInfo());
}
return this.transitiveOutputs;
}
}
private static Map<Integer, Set<Integer>> getTransitiveVertexOutputs(DagInfo info) {
if (!(info instanceof DAG)) {
LOG.warn("DAG info is not a DAG - cannot derive dependencies");
return null;
}
DAG dag = (DAG) info;
int vc = dag.getVertices().size();
// All the vertices belong to the same DAG, so we just use numbers.
Map<Integer, Set<Integer>> result = Maps.newHashMapWithExpectedSize(vc);
LinkedList<TezVertexID> queue = new LinkedList<>();
// We assume the graph is in fact acyclic, and that it's connected. Add direct dependencies.
for (Vertex v : dag.getVertices().values()) {
Map<Vertex, Edge> out = v.getOutputVertices();
if (out == null) {
result.put(v.getVertexId().getId(), Sets.newHashSet());
} else {
Set<Integer> set = Sets.newHashSetWithExpectedSize(vc);
for (Vertex outV : out.keySet()) {
set.add(outV.getVertexId().getId());
}
result.put(v.getVertexId().getId(), set);
}
if (v.getOutputVerticesCount() == 0) {
queue.add(v.getVertexId());
}
}
Set<Integer> processed = Sets.newHashSetWithExpectedSize(vc);
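// Walk up from the leaves. A vertex is only processed once all of its outputs have been
// processed (see the backtracking below), so by the time its set is propagated to its
// inputs it already contains all of its transitive outputs.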
while (!queue.isEmpty()) {
TezVertexID id = queue.poll();
if (processed.contains(id.getId())) continue; // Already processed. See backtracking.
Vertex v = dag.getVertex(id);
Map<Vertex, Edge> out = v.getOutputVertices();
if (out != null) {
// Check that all the outputs have been processed; if not, insert them into the queue
// before the current vertex and try again. This can happen when branches of different
// depths join: a vertex may be enqueued (via one of its outputs) before another of its
// outputs has been processed.
boolean doBacktrack = false;
for (Vertex outV : out.keySet()) {
TezVertexID outId = outV.getVertexId();
int outNum = outId.getId();
if (!processed.contains(outNum)) {
if (!doBacktrack) {
queue.addFirst(id);
doBacktrack = true;
}
queue.addFirst(outId);
}
}
if (doBacktrack) continue;
}
int num = id.getId();
processed.add(num);
Set deps = result.get(num);
Map<Vertex, Edge> in = v.getInputVertices();
if (in != null) {
for (Vertex inV : in.keySet()) {
queue.add(inV.getVertexId());
// Our outputs are the transitive outputs of our inputs.
result.get(inV.getVertexId().getId()).addAll(deps);
}
}
}
return result;
}
private static Token<JobTokenIdentifier> createAmsToken(ApplicationId id) {
if (!UserGroupInformation.isSecurityEnabled()) return null;
JobTokenIdentifier identifier = new JobTokenIdentifier(new Text(id.toString()));
JobTokenSecretManager jobTokenManager = new JobTokenSecretManager();
Token<JobTokenIdentifier> sessionToken = new Token<>(identifier, jobTokenManager);
sessionToken.setService(identifier.getJobId());
return sessionToken;
}
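// Serializes the token to a base64 string so it can be published via the AM registry
// (see the amRegistry.register call in start()).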
private static String serializeToken(Token<JobTokenIdentifier> token) {
byte[] bytes = null;
try {
ByteArrayDataOutput out = ByteStreams.newDataOutput();
token.write(out);
bytes = out.toByteArray();
} catch (IOException e) {
// This shouldn't really happen on a byte array.
throw new RuntimeException(e);
}
return Base64.getEncoder().withoutPadding().encodeToString(bytes);
}
@VisibleForTesting
void updateGuaranteedCount(int newTotalGuaranteed) {
List<TaskInfo> toUpdate = null;
long tgVersionForZk;
writeLock.lock();
try {
// TODO: when this code is a little less hot, change most logs to debug.
// We will determine what to do under lock and then do stuff outside of the lock.
// The approach is state-based. We consider the task to have a duck when we have decided to
// give it one; the sends below merely fix the discrepancy with the actual state. We may add the
// ability to wait for LLAPs to positively ack the revokes in future.
// The "procedural" approach requires that we track the ducks traveling on network,
// concurrent terminations, etc. So, while more precise it's much more complex.
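// Worked example: totalGuaranteed=4, unusedGuaranteed=1, newTotalGuaranteed=2 => delta is -2;
// one duck is taken from the unused pool, and one more is revoked from a running task.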
int delta = newTotalGuaranteed - totalGuaranteed;
tgVersionForZk = ++totalGuaranteedVersion;
WM_LOG.info("Received guaranteed tasks " + newTotalGuaranteed + " (internal version "
+ tgVersionForZk + "); the delta to adjust by is " + delta);
if (delta == 0) return;
totalGuaranteed = newTotalGuaranteed;
if (metrics != null) {
metrics.setWmTotalGuaranteed(totalGuaranteed);
}
if (delta > 0) {
if (unusedGuaranteed == 0) {
// There may be speculative tasks waiting.
toUpdate = new ArrayList<>();
int totalUpdated = distributeGuaranteed(delta, null, toUpdate);
delta -= totalUpdated;
WM_LOG.info("Distributed " + totalUpdated);
}
int result = (unusedGuaranteed += delta);
if (metrics != null) {
metrics.setWmUnusedGuaranteed(result);
}
WM_LOG.info("Setting unused to " + result + " based on remaining delta " + delta);
} else {
delta = -delta;
if (delta <= unusedGuaranteed) {
// Somebody took away our unwanted ducks.
int result = (unusedGuaranteed -= delta);
if (metrics != null) {
metrics.setWmUnusedGuaranteed(result);
}
WM_LOG.info("Setting unused to " + result + " based on full delta " + delta);
return;
} else {
delta -= unusedGuaranteed;
unusedGuaranteed = 0;
toUpdate = new ArrayList<>();
int totalUpdated = revokeGuaranteed(delta, null, toUpdate);
if (metrics != null) {
metrics.setWmUnusedGuaranteed(0);
}
WM_LOG.info("Setting unused to 0; revoked " + totalUpdated + " / " + delta);
// We must be able to take away the requisite number; if we can't, where'd the ducks go?
if (delta != totalUpdated) {
throw new AssertionError("Failed to revoke " + delta + " guaranteed tasks locally");
}
}
}
} finally {
writeLock.unlock();
}
updateGuaranteedInRegistry(tgVersionForZk, newTotalGuaranteed);
if (toUpdate == null) return;
WM_LOG.info("Sending updates to " + toUpdate.size() + " tasks");
for (TaskInfo ti : toUpdate) {
checkAndSendGuaranteedStateUpdate(ti);
}
}
@VisibleForTesting
protected void checkAndSendGuaranteedStateUpdate(TaskInfo ti) {
boolean newState = false;
synchronized (ti) {
assert ti.isPendingUpdate;
if ((ti.lastSetGuaranteed != null && ti.lastSetGuaranteed == ti.isGuaranteed)
|| ti.isGuaranteed == null) {
// Nothing to do - e.g. two messages have canceled each other before we could react,
// or the task was deallocated.
ti.requestedValue = ti.isGuaranteed;
setUpdateDoneUnderTiLock(ti);
WM_LOG.info("Not sending update to " + ti.attemptId);
return;
}
newState = ti.isGuaranteed;
}
// From this point on, the update is in motion - if someone changes the state again, that
// would only apply after the callback for the current message.
sendUpdateMessageAsync(ti, newState);
}
private void setUpdateStartedUnderTiLock(TaskInfo ti) {
ti.isPendingUpdate = true;
ti.requestedValue = ti.isGuaranteed;
// It's ok to update metrics for two tasks in parallel, but not for the same one.
if (metrics != null) {
metrics.setWmPendingStarted(ti.requestedValue);
}
}
private void setUpdateDoneUnderTiLock(TaskInfo ti) {
ti.isPendingUpdate = false;
// It's ok to update metrics for two tasks in parallel, but not for the same one.
// Don't update metrics for the cancelled tasks - already taken care of during cancellation.
if (metrics != null && ti.requestedValue != null) {
metrics.setWmPendingDone(ti.requestedValue);
}
ti.lastSetGuaranteed = ti.requestedValue;
ti.requestedValue = null;
}
@VisibleForTesting
protected void handleUpdateResult(TaskInfo ti, boolean isOk) {
// The update options for outside the lock - see below the synchronized block.
Boolean newStateSameTask = null, newStateAnyTask = null;
WM_LOG.info("Received response for " + ti.attemptId + ", " + isOk);
synchronized (ti) {
assert ti.isPendingUpdate;
if (ti.isGuaranteed == null) {
// The task has been terminated and the duck accounted for based on local state.
// Whatever we were doing is irrelevant. The metrics have also been updated.
ti.isPendingUpdate = false;
ti.requestedValue = null;
return;
}
boolean requestedValue = ti.requestedValue;
if (isOk) {
// We have propagated the value to the task.
setUpdateDoneUnderTiLock(ti);
if (requestedValue == ti.isGuaranteed) return;
// The state has changed during the update. Let's undo what we just did.
newStateSameTask = ti.isGuaranteed;
setUpdateStartedUnderTiLock(ti);
} else {
if (metrics != null) {
metrics.setWmPendingFailed(requestedValue);
}
// An error, or couldn't find the task - lastSetGuaranteed does not change. The logic here
// does not account for one special case - we have updated the task, but the response was
// lost and we have received a network error. The state could be inconsistent, making
// a deadlock possible in extreme cases if not handled. This will be detected by heartbeat.
if (requestedValue != ti.isGuaranteed) {
// We failed to do something that was rendered irrelevant while we were failing.
ti.isPendingUpdate = false;
ti.requestedValue = null;
return;
}
// We failed to update this task. Instead of retrying for this task, find another.
// To change isGuaranteed and modify maps, we'd need the epic lock. So, we will not
// update the pending state for now as we release this lock to take both.
newStateAnyTask = requestedValue;
}
} // End of synchronized (ti)
if (newStateSameTask != null) {
WM_LOG.info("Sending update to the same task in response handling "
+ ti.attemptId + ", " + newStateSameTask);
// We need to send the state update again (the state has changed since the last one).
sendUpdateMessageAsync(ti, newStateSameTask);
}
if (newStateAnyTask == null) return;
// The update is failed and could be retried.
// Instead of retrying with this task, we will try to pick a different suitable task.
List<TaskInfo> toUpdate = new ArrayList<>(1);
writeLock.lock();
try {
synchronized (ti) {
// We have already updated the metrics for the failure; change the state.
ti.isPendingUpdate = false;
ti.requestedValue = null;
if (newStateAnyTask != ti.isGuaranteed) {
// The state has changed between this and previous check within this method.
// The failed update was rendered irrelevant, so we just exit.
return;
}
WM_LOG.info("Sending update to a different task in response handling "
+ ti.attemptId + ", " + newStateAnyTask);
// First, "give up" on this task and put it back in the original list.
boolean isRemoved = removeFromRunningTaskMap(
newStateAnyTask ? guaranteedTasks : speculativeTasks, ti.task, ti);
if (!isRemoved) {
String error = "Couldn't find the task in the correct map after an update " + ti.task;
LOG.error(error);
throw new AssertionError(error);
}
ti.isGuaranteed = !newStateAnyTask;
// Put into the map that this task was in before we decided to update it.
addToRunningTasksMap(newStateAnyTask ? speculativeTasks : guaranteedTasks, ti);
}
// Now try to pick another task to update - or potentially the same task.
int count = 0;
if (newStateAnyTask) {
count = distributeGuaranteed(1, ti, toUpdate);
} else {
count = revokeGuaranteed(1, ti, toUpdate);
}
assert count == 1 && toUpdate.size() == 1; // Must at least be able to return ti back.
} finally {
writeLock.unlock();
}
checkAndSendGuaranteedStateUpdate(toUpdate.get(0));
}
@Override
public void initialize() {
registry.init(conf);
if (pluginEndpoint != null) {
pluginEndpoint.init(conf);
}
}
@Override
public void start() throws IOException {
if (pluginEndpoint != null) {
pluginEndpoint.start();
}
writeLock.lock();
try {
scheduledLoggingExecutor.scheduleAtFixedRate(new Runnable() {
@Override
public void run() {
readLock.lock();
try {
if (dagRunning) {
LOG.info("Stats for current dag: {}", dagStats);
}
} finally {
readLock.unlock();
}
}
}, 0, 10000L, TimeUnit.MILLISECONDS);
nodeEnablerFuture = nodeEnabledExecutor.submit(nodeEnablerCallable);
Futures.addCallback(nodeEnablerFuture, new LoggingFutureCallback("NodeEnablerThread", LOG),
MoreExecutors.directExecutor());
delayedTaskSchedulerFuture =
delayedTaskSchedulerExecutor.submit(delayedTaskSchedulerCallable);
Futures.addCallback(delayedTaskSchedulerFuture, new LoggingFutureCallback("DelayedTaskSchedulerThread", LOG),
MoreExecutors.directExecutor());
preemptTaskSchedulerFuture =
preemptSchedulerExecutor.submit(preemptSchedulerCallable);
Futures.addCallback(preemptTaskSchedulerFuture,
new LoggingFutureCallback("PreemptTaskSchedulerThread", LOG), MoreExecutors.directExecutor());
schedulerFuture = schedulerExecutor.submit(schedulerCallable);
Futures.addCallback(schedulerFuture, new LoggingFutureCallback("SchedulerThread", LOG),
MoreExecutors.directExecutor());
registry.start();
activeInstances = registry.getInstances();
registry.registerStateChangeListener(new NodeStateChangeListener());
for (LlapServiceInstance inst : activeInstances.getAll()) {
registerAndAddNode(new NodeInfo(inst, nodeBlacklistConf, clock,
numSchedulableTasksPerNode, metrics), inst);
}
if (amRegistry != null) {
amRegistry.start();
int pluginPort = pluginEndpoint != null ? pluginEndpoint.getActualPort() : -1;
amRegistry.register(amPort, pluginPort, HiveConf.getVar(conf, ConfVars.HIVE_SESSION_ID),
serializedToken, jobIdForToken, 0);
}
} finally {
writeLock.unlock();
}
}
@VisibleForTesting
protected void setServiceInstanceSet(LlapServiceInstanceSet serviceInstanceSet) {
this.activeInstances = serviceInstanceSet;
}
private class NodeStateChangeListener
implements ServiceInstanceStateChangeListener<LlapServiceInstance> {
private final Logger LOG = LoggerFactory.getLogger(NodeStateChangeListener.class);
@Override
public void onCreate(LlapServiceInstance serviceInstance, int ephSeqVersion) {
LOG.info("Added node with identity: {} as a result of registry callback",
serviceInstance.getWorkerIdentity());
registerAndAddNode(new NodeInfo(serviceInstance, nodeBlacklistConf, clock,
numSchedulableTasksPerNode, metrics), serviceInstance);
}
@Override
public void onUpdate(LlapServiceInstance serviceInstance, int ephSeqVersion) {
NodeInfo nodeInfo = instanceToNodeMap.get(serviceInstance.getWorkerIdentity());
nodeInfo.updateLlapServiceInstance(serviceInstance, numSchedulableTasksPerNode);
LOG.info("Updated node with identity: {} as a result of registry callback",
serviceInstance.getWorkerIdentity());
}
@Override
public void onRemove(LlapServiceInstance serviceInstance, int ephSeqVersion) {
NodeReport nodeReport = constructNodeReport(serviceInstance, false);
LOG.info("Sending out nodeReport for onRemove: {}", nodeReport);
getContext().nodesUpdated(Collections.singletonList(nodeReport));
instanceToNodeMap.remove(serviceInstance.getWorkerIdentity());
LOG.info("Removed node with identity: {} due to RegistryNotification. currentActiveInstances={}",
serviceInstance.getWorkerIdentity(), activeInstances.size());
if (metrics != null) {
metrics.setClusterNodeCount(activeInstances.size());
}
// If there are no more nodes, signal the timeout monitor to start its timer.
if (activeInstances.size() == 0) {
LOG.info("No node found. Signalling scheduler timeout monitor thread to start timer.");
startTimeoutMonitor();
}
}
}
private void startTimeoutMonitor() {
timeoutLock.lock();
try {
// If the timer is null, or has completed during a previous invocation, start a new one.
// If the timer has already started and is not completed, leave it running without resetting it.
if ((timeoutFuture == null || timeoutFuture.isDone())
&& activeInstances.size() == 0) {
timeoutFuture = timeoutExecutor.schedule(timeoutMonitor, timeout, TimeUnit.MILLISECONDS);
timeoutFutureRef.set(timeoutFuture);
LOG.info("Scheduled timeout monitor task to run after {} ms", timeout);
} else {
LOG.info("Timeout monitor task not started. Timeout future state: {}, #instances: {}",
timeoutFuture == null ? "null" : timeoutFuture.isDone(), activeInstances.size());
}
} finally {
timeoutLock.unlock();
}
}
private void stopTimeoutMonitor() {
timeoutLock.lock();
try {
if (timeoutFuture != null && activeInstances.size() != 0 && timeoutFuture.cancel(false)) {
timeoutFutureRef.set(null);
LOG.info("Stopped timeout monitor task");
} else {
LOG.info("Timeout monitor task not stopped. Timeout future state: {}, #instances: {}",
timeoutFuture == null ? "null" : timeoutFuture.isDone(), activeInstances.size());
}
timeoutFuture = null;
} finally {
timeoutLock.unlock();
}
}
@Override
public void shutdown() {
writeLock.lock();
try {
if (!this.isStopped.getAndSet(true)) {
scheduledLoggingExecutor.shutdownNow();
nodeEnablerCallable.shutdown();
if (nodeEnablerFuture != null) {
nodeEnablerFuture.cancel(true);
}
nodeEnabledExecutor.shutdownNow();
timeoutExecutor.shutdown();
if (timeoutFuture != null) {
timeoutFuture.cancel(true);
timeoutFuture = null;
}
timeoutExecutor.shutdownNow();
delayedTaskSchedulerCallable.shutdown();
if (delayedTaskSchedulerFuture != null) {
delayedTaskSchedulerFuture.cancel(true);
}
delayedTaskSchedulerExecutor.shutdownNow();
preemptSchedulerCallable.shutdown();
if (preemptTaskSchedulerFuture != null) {
preemptTaskSchedulerFuture.cancel(true);
}
preemptSchedulerExecutor.shutdownNow();
schedulerCallable.shutdown();
if (schedulerFuture != null) {
schedulerFuture.cancel(true);
}
schedulerExecutor.shutdownNow();
if (registry != null) {
registry.stop();
}
if (amRegistry != null) {
amRegistry.stop();
}
if (pluginEndpoint != null) {
pluginEndpoint.stop();
}
if (pauseMonitor != null) {
pauseMonitor.stop();
}
if (metrics != null) {
LlapMetricsSystem.shutdown();
}
}
} finally {
writeLock.unlock();
}
}
@Override
public Resource getTotalResources() {
int memory = 0;
int vcores = 0;
readLock.lock();
try {
int numInstancesFound = 0;
for (LlapServiceInstance inst : activeInstances.getAll()) {
Resource r = inst.getResource();
memory += r.getMemory();
vcores += r.getVirtualCores();
numInstancesFound++;
}
if (LOG.isDebugEnabled()) {
LOG.debug("GetTotalResources: numInstancesFound={}, totalMem={}, totalVcores={}",
numInstancesFound, memory, vcores);
}
} finally {
readLock.unlock();
}
return Resource.newInstance(memory, vcores);
}
/**
* The difference between this and getTotalResources() is that this only gives currently free
* resource instances, while the other lists all the instances that may become available in a
* while.
*/
@Override
public Resource getAvailableResources() {
// need a state store eventually for current state & measure backoffs
int memory = 0;
int vcores = 0;
readLock.lock();
try {
int numInstancesFound = 0;
for (LlapServiceInstance inst : activeInstances.getAll()) {
NodeInfo nodeInfo = instanceToNodeMap.get(inst.getWorkerIdentity());
if (nodeInfo != null && !nodeInfo.isDisabled()) {
Resource r = inst.getResource();
memory += r.getMemory();
vcores += r.getVirtualCores();
numInstancesFound++;
}
}
if (LOG.isDebugEnabled()) {
LOG.debug("GetAvailableResources: numInstancesFound={}, totalMem={}, totalVcores={}",
numInstancesFound, memory, vcores);
}
} finally {
readLock.unlock();
}
return Resource.newInstance(memory, vcores);
}
@Override
public int getClusterNodeCount() {
readLock.lock();
try {
return activeInstances.getAll().size();
} finally {
readLock.unlock();
}
}
@Override
public void dagComplete() {
// This is effectively DAG completed, and can be used to reset statistics being tracked.
LOG.info("DAG: " + dagCounter.get() + " completed. Scheduling stats: " + dagStats);
dagCounter.incrementAndGet();
if (metrics != null) {
metrics.incrCompletedDagCount();
}
long tgVersionForZk;
writeLock.lock();
try {
dagRunning = false;
dagStats = new StatsPerDag();
int pendingCount = 0;
for (Entry<Priority, List<TaskInfo>> entry : pendingTasks.entrySet()) {
if (entry.getValue() != null) {
pendingCount += entry.getValue().size();
}
}
int runningCount = 0;
// We don't send messages to pending tasks with the flags; they should be killed elsewhere.
for (Entry<Integer, TreeSet<TaskInfo>> entry : guaranteedTasks.entrySet()) {
TreeSet<TaskInfo> set = speculativeTasks.get(entry.getKey());
if (set == null) {
set = new TreeSet<>();
speculativeTasks.put(entry.getKey(), set);
}
for (TaskInfo info : entry.getValue()) {
synchronized (info) {
info.isGuaranteed = false;
}
set.add(info);
}
}
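// All previously guaranteed tasks are now tracked as speculative; the guaranteed totals are
// zeroed below, and the new count (0) is published to the registry after the lock is released.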
guaranteedTasks.clear();
for (Entry<Integer, TreeSet<TaskInfo>> entry : speculativeTasks.entrySet()) {
if (entry.getValue() != null) {
runningCount += entry.getValue().size();
}
}
totalGuaranteed = unusedGuaranteed = 0;
tgVersionForZk = ++totalGuaranteedVersion;
if (metrics != null) {
metrics.setDagId(null);
// We remove the tasks above without state checks so just reset all metrics to 0.
metrics.resetWmMetrics();
}
LOG.info("DAG reset. Current knownTaskCount={}, pendingTaskCount={}, runningTaskCount={}",
knownTasks.size(), pendingCount, runningCount);
} finally {
writeLock.unlock();
}
if (workloadManagementEnabled) {
updateGuaranteedInRegistry(tgVersionForZk, 0);
}
// TODO Cleanup pending tasks etc, so that the next dag is not affected.
}
private void updateGuaranteedInRegistry(long tgVersionForZk, int newTotalGuaranteed) {
if (amRegistry == null) return;
synchronized (registryUpdateLock) {
// Make sure the updates are not sent to ZK out of order compared to how we apply them in AM.
if (tgVersionForZk <= tgVersionSent) return;
try {
amRegistry.updateGuaranteed(newTotalGuaranteed);
tgVersionSent = tgVersionForZk;
} catch (IOException ex) {
// Ignore for now. HS2 will probably try to send us the count we already have again.
// We are assuming here that if we can't talk to ZK we will eventually fail.
LOG.error("Failed to update guaranteed count in registry; ignoring", ex);
}
}
}
@Override
public void blacklistNode(NodeId nodeId) {
LOG.info("BlacklistNode not supported");
// TODO Disable blacklisting in Tez when using LLAP, until this is properly supported.
// Blacklisting can cause containers to move to a terminating state, which can cause attempt to be marked as failed.
// This becomes problematic when we set #allowedFailures to 0
// TODO HIVE-13484 What happens when we try scheduling a task on a node that Tez at this point thinks is blacklisted.
}
@Override
public void unblacklistNode(NodeId nodeId) {
LOG.info("unBlacklistNode not supported");
// TODO: See comments under blacklistNode.
}
@Override
public void allocateTask(Object task, Resource capability, String[] hosts, String[] racks,
Priority priority, Object containerSignature, Object clientCookie) {
TezTaskAttemptID id = getTaskAttemptId(task);
TaskInfo taskInfo = new TaskInfo(localityDelayConf, clock, task, clientCookie, priority,
capability, hosts, racks, clock.getTime(), id);
LOG.info("Received allocateRequest. task={}, priority={}, capability={}",
task, priority, capability);
if (!dagRunning) {
if (metrics != null && id != null) {
metrics.setDagId(id.getDAGID().toString());
}
dagRunning = true;
}
dagStats.registerTaskRequest(hosts, racks);
addPendingTask(taskInfo);
trySchedulingPendingTasks();
}
@Override
public void allocateTask(Object task, Resource capability, ContainerId containerId,
Priority priority, Object containerSignature, Object clientCookie) {
// Container affinity can be implemented as Host affinity for LLAP. Not required until
// 1:1 edges are used in Hive.
TezTaskAttemptID id = getTaskAttemptId(task);
TaskInfo taskInfo = new TaskInfo(localityDelayConf, clock, task, clientCookie, priority,
capability, null, null, clock.getTime(), id);
LOG.info("Received allocateRequest. task={}, priority={}, capability={}, containerId={}",
task, priority, capability, containerId);
if (!dagRunning) {
if (metrics != null && id != null) {
metrics.setDagId(id.getDAGID().toString());
}
dagRunning = true;
}
dagStats.registerTaskRequest(null, null);
addPendingTask(taskInfo);
trySchedulingPendingTasks();
}
protected TezTaskAttemptID getTaskAttemptId(Object task) {
// TODO: why does Tez API use "Object" for this?
if (task instanceof TaskAttempt) {
return ((TaskAttempt)task).getTaskAttemptID();
}
throw new AssertionError("LLAP plugin can only schedule task attempts");
}
// This may be invoked before a container is ever assigned to a task: after allocateTask,
// the app may decide the task is no longer required and ask for a de-allocation.
@Override
public boolean deallocateTask(
Object task, boolean taskSucceeded, TaskAttemptEndReason endReason, String diagnostics) {
if (LOG.isDebugEnabled()) {
LOG.debug("Processing deallocateTask for task={}, taskSucceeded={}, endReason={}", task,
taskSucceeded, endReason);
}
boolean isEarlyExit = false;
TaskInfo toUpdate = null, taskInfo;
writeLock.lock(); // Updating several local structures
try {
taskInfo = unregisterTask(task);
if (taskInfo == null) {
LOG.error("Could not determine ContainerId for task: "
+ task
+ " . Could have hit a race condition. Ignoring."
+ " The query may hang since this \"unknown\" container is now taking up a slot permanently");
return false;
}
boolean isGuaranteedFreed = false;
synchronized (taskInfo) {
if (taskInfo.isGuaranteed == null) {
WM_LOG.error("Task appears to have been deallocated twice: " + task
+ " There may be inconsistencies in guaranteed task counts.");
} else {
if (metrics != null) {
metrics.setWmTaskFinished(taskInfo.isGuaranteed, taskInfo.isPendingUpdate);
}
isGuaranteedFreed = taskInfo.isGuaranteed;
// This tells the pending update (if any) that whatever it is doing is irrelevant,
// and also makes sure we don't take the duck back twice if this is called twice.
taskInfo.isGuaranteed = null;
}
}
// Do not put the unused duck back; we'd run the tasks below, then assign it by priority.
// NOTE: this method MUST call distributeGuaranteedOnTaskCompletion before exiting.
if (taskInfo.containerId == null) {
if (taskInfo.getState() == TaskInfo.State.ASSIGNED) {
LOG.error("Task: "
+ task
+ " assigned, but could not find the corresponding containerId."
+ " The query may hang since this \"unknown\" container is now taking up a slot permanently");
} else {
LOG.info("Ignoring deallocate request for task " + task
+ " which hasn't been assigned to a container");
removePendingTask(taskInfo);
}
if (isGuaranteedFreed) {
toUpdate = distributeGuaranteedOnTaskCompletion();
isEarlyExit = true;
}
return false;
}
NodeInfo nodeInfo = taskInfo.assignedNode;
assert nodeInfo != null;
// endReason shows up as OTHER for CONTAINER_TIME_OUT
LOG.info("Processing de-allocate request for task={}, state={}, endReason={}", taskInfo.task,
taskInfo.getState(), endReason);
// Re-enable the node if preempted
if (taskInfo.getState() == TaskInfo.State.PREEMPTED) {
unregisterPendingPreemption(taskInfo.assignedNode.getHost());
nodeInfo.registerUnsuccessfulTaskEnd(true);
if (nodeInfo.isDisabled()) {
// Re-enable the node, if a task completed due to preemption. Capacity has become available,
// and we may have been able to communicate with the node.
queueNodeForReEnablement(nodeInfo);
}
// In case of success, trigger a scheduling run for pending tasks.
trySchedulingPendingTasks();
} else {
if (taskSucceeded) {
// The node may have been blacklisted at this point - which means it may not be in the
// activeNodeList.
nodeInfo.registerTaskSuccess();
if (nodeInfo.isDisabled()) {
// Re-enable the node. If a task succeeded, a slot may have become available.
// Also reset commFailures since a task was able to communicate back and indicate success.
queueNodeForReEnablement(nodeInfo);
}
// In case of success, trigger a scheduling run for pending tasks.
trySchedulingPendingTasks();
} else { // Task Failed
nodeInfo.registerUnsuccessfulTaskEnd(false);
// TODO Include EXTERNAL_PREEMPTION in this list?
// TODO HIVE-16134. Differentiate between EXTERNAL_PREEMPTION_WAITQUEU vs EXTERNAL_PREEMPTION_FINISHABLE?
if (endReason != null && EnumSet
.of(TaskAttemptEndReason.EXECUTOR_BUSY, TaskAttemptEndReason.COMMUNICATION_ERROR)
.contains(endReason)) {
if (endReason == TaskAttemptEndReason.COMMUNICATION_ERROR) {
dagStats.registerCommFailure(taskInfo.assignedNode.getHost());
} else if (endReason == TaskAttemptEndReason.EXECUTOR_BUSY) {
dagStats.registerTaskRejected(taskInfo.assignedNode.getHost());
}
}
if (endReason != null && endReason == TaskAttemptEndReason.NODE_FAILED) {
LOG.info(
"Task {} ended on {} with a NODE_FAILED message." +
" A message should come in from the registry to disable this node unless" +
" this was a temporary communication failure",
task, nodeInfo.toShortString());
}
boolean commFailure =
endReason != null && endReason == TaskAttemptEndReason.COMMUNICATION_ERROR;
disableNode(nodeInfo, commFailure);
}
}
if (isGuaranteedFreed) {
toUpdate = distributeGuaranteedOnTaskCompletion();
}
} finally {
writeLock.unlock();
if (isEarlyExit) {
// Most of the method got skipped but we still need to handle the duck.
checkAndSendGuaranteedStateUpdate(toUpdate);
}
}
if (toUpdate != null) {
assert !isEarlyExit;
checkAndSendGuaranteedStateUpdate(toUpdate);
}
getContext().containerBeingReleased(taskInfo.containerId);
getContext().containerCompleted(taskInfo.task, ContainerStatus.newInstance(taskInfo.containerId,
ContainerState.COMPLETE, "", 0));
return true;
}
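// Handles a task-start notification by treating it as a successful guaranteed-flag update
// for the attempt; the flag is re-sent if it has changed in the meantime.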
public void notifyStarted(TezTaskAttemptID attemptId) {
TaskInfo info = null;
readLock.lock();
try {
info = tasksById.get(attemptId);
if (info == null) {
WM_LOG.warn("Unknown task start notification " + attemptId);
return;
}
} finally {
readLock.unlock();
}
handleUpdateResult(info, true);
}
// Must be called under the epic lock.
private TaskInfo distributeGuaranteedOnTaskCompletion() {
List<TaskInfo> toUpdate = new ArrayList<>(1);
int updatedCount = distributeGuaranteed(1, null, toUpdate);
assert updatedCount <= 1;
if (updatedCount == 0) {
int result = ++unusedGuaranteed;
if (metrics != null) {
metrics.setWmUnusedGuaranteed(result);
}
WM_LOG.info("Returning the unused duck; unused is now " + result);
}
if (toUpdate.isEmpty()) return null;
assert toUpdate.size() == 1;
return toUpdate.get(0);
}
@Override
public Object deallocateContainer(ContainerId containerId) {
LOG.debug("Ignoring deallocateContainer for containerId: {}", containerId);
// Containers are not being tracked for re-use.
// This is safe to ignore since a deallocate task will come in.
return null;
}
@Override
public void setShouldUnregister() {
}
@Override
public boolean hasUnregistered() {
// Nothing to do. No registration involved.
return true;
}
boolean isRequestedHostPresent(TaskInfo request) {
return (request.requestedHosts != null && request.requestedHosts.length > 0);
}
/**
* @param request the task request, including its preferred hosts (null implies any host)
* @param availableHostMap hosts mapped to the nodes on them that currently have free slots
* @return the selected host, or a delayed result if no suitable host is available yet
*/
private SelectHostResult selectHost(TaskInfo request, Map<String, List<NodeInfo>> availableHostMap) {
// Short-circuit when no active instances exist.
if (availableHostMap.isEmpty()) {
return SELECT_HOST_RESULT_DELAYED_RESOURCES;
}
String[] requestedHosts = request.requestedHosts;
// Note: requestedHostsDebugStr is also used by info-level logging further below,
// so it is assigned unconditionally rather than only when debug logging is enabled.
String requestedHostsDebugStr = Arrays.toString(requestedHosts);
if (LOG.isDebugEnabled()) {
LOG.debug("selectingHost for task={} on hosts={}", request.task,
requestedHostsDebugStr);
}
long schedulerAttemptTime = clock.getTime();
readLock.lock(); // Read-lock. Not updating any stats at the moment.
try {
boolean shouldDelayForLocality = request.shouldDelayForLocality(schedulerAttemptTime);
LOG.debug("ShouldDelayForLocality={} for task={} on hosts={}", shouldDelayForLocality,
request.task, requestedHostsDebugStr);
if (isRequestedHostPresent(request)) {
int prefHostCount = -1;
boolean requestedHostsWillBecomeAvailable = false;
for (String host : requestedHosts) {
prefHostCount++;
// Check if the host is removed from the registry after availableHostMap is created.
Set<LlapServiceInstance> activeInstancesByHost = activeInstances.getByHost(host);
if (activeInstancesByHost == null || activeInstancesByHost.isEmpty()) {
continue;
}
// Pick the first host always. Weak attempt at cache affinity.
if (availableHostMap.containsKey(host)) {
List<NodeInfo> instances = availableHostMap.getOrDefault(host, new ArrayList<>());
// Host is there and there are available resources!
if (!instances.isEmpty()) {
NodeInfo nodeInfo = instances.iterator().next();
// Successfully scheduled.
LOG.info("Assigning {} when looking for {}." + " local=true FirstRequestedHost={}, #prefLocations={}",
nodeInfo.toShortString(), host, (prefHostCount == 0), requestedHosts.length);
return new SelectHostResult(nodeInfo);
} else {
// The node cannot accept a task at the moment. (there is host -- but no resources)
if (shouldDelayForLocality) {
// Perform some checks on whether the node will become available or not.
if (request.shouldForceLocality()) {
requestedHostsWillBecomeAvailable = true;
} else {
for (LlapServiceInstance inst : activeInstancesByHost) {
NodeInfo nodeInfo = instanceToNodeMap.get(inst.getWorkerIdentity());
if (nodeInfo == null) {
LOG.warn("Null NodeInfo when attempting to get host {}", host);
// Leave requestedHostsWillBecomeAvailable as is. If some other host is found, delay;
// otherwise this ends up allocating to a random host immediately.
continue;
}
if (nodeInfo.getEnableTime() > request.getLocalityDelayTimeout()
&& nodeInfo.isDisabled() && nodeInfo.hadCommFailure()) {
LOG.debug("Host={} will not become available within requested timeout", nodeInfo);
// This node will likely be activated after the task timeout expires.
} else {
// Worth waiting for the timeout.
requestedHostsWillBecomeAvailable = true;
}
}
}
}
}
}
}
// Check if forcing the location is required.
if (shouldDelayForLocality) {
if (requestedHostsWillBecomeAvailable) {
if (LOG.isDebugEnabled()) {
LOG.debug("Delaying local allocation for [" + request.task +
"] when trying to allocate on [" + requestedHostsDebugStr + "]" +
". ScheduleAttemptTime=" + schedulerAttemptTime + ", taskDelayTimeout=" +
request.getLocalityDelayTimeout());
}
return SELECT_HOST_RESULT_DELAYED_LOCALITY;
} else {
if (LOG.isDebugEnabled()) {
LOG.debug("Skipping local allocation for [" + request.task +
"] when trying to allocate on [" + requestedHostsDebugStr +
"] since none of these hosts are part of the known list");
}
}
}
}
/* fall through - miss in locality or no locality-requested */
List<NodeInfo> activeNodesWithFreeSlots = new ArrayList<>();
availableHostMap.values().forEach(activeNodesWithFreeSlots::addAll);
// no locality-requested, randomly pick a node containing free slots
if (requestedHosts == null || requestedHosts.length == 0) {
if (LOG.isDebugEnabled()) {
LOG.debug("No-locality requested. Selecting a random host for task={}", request.task);
}
return randomSelection(activeNodesWithFreeSlots);
}
// miss in locality request, try picking consistent location with fallback to random selection
final String firstRequestedHost = requestedHosts[0];
// requested host died or unknown host requested, fallback to random selection.
// TODO: At this point we don't know the slot number of the requested host, so can't rollover to next available
if (!availableHostMap.containsKey(firstRequestedHost)) {
if (LOG.isDebugEnabled()) {
LOG.debug("Requested node [{}] in consistent order does not exist. Falling back to random selection for " +
"request {}", firstRequestedHost, request);
}
return randomSelection(activeNodesWithFreeSlots);
}
// requested host is still alive but cannot accept task, pick the next available host in consistent order
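// E.g. with consistent host order {h1, h2, h3} and firstRequestedHost=h2, the scan below
// returns the first host after h2 with a free slot, rolling over to the head if none follows.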
if (!activeNodesWithFreeSlots.isEmpty()) {
NodeInfo nextSlot = null;
boolean found = false;
for (Entry<String, List<NodeInfo>> entry : availableHostMap.entrySet()) {
if (found && !entry.getValue().isEmpty()) {
nextSlot = entry.getValue().iterator().next();
break;
}
if (entry.getKey().equals(firstRequestedHost)) found = true;
}
// rollover
if (nextSlot == null) nextSlot = activeNodesWithFreeSlots.stream().findFirst().get();
LOG.info("Assigning {} in consistent order when looking for first requested host, from #hosts={},"
+ " requestedHosts={}", nextSlot.toShortString(), availableHostMap.size(),
((requestedHosts == null || requestedHosts.length == 0) ? "null" : requestedHostsDebugStr));
return new SelectHostResult(nextSlot);
}
// When all nodes are busy, reset locality delay
if (request.localityDelayTimeout > 0 && isRequestedHostPresent(request)) {
request.resetLocalityDelayInfo();
}
return SELECT_HOST_RESULT_DELAYED_RESOURCES;
} finally {
readLock.unlock();
}
}
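/**
* Fallback selection: picks a random node among those that currently have free slots,
* or returns DELAYED_RESOURCES when no node can accept a task right now.
*/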
private SelectHostResult randomSelection(final List<NodeInfo> nodesWithFreeSlots) {
if (nodesWithFreeSlots.isEmpty()) {
return SELECT_HOST_RESULT_DELAYED_RESOURCES;
}
NodeInfo randomNode = nodesWithFreeSlots.get(random.nextInt(nodesWithFreeSlots.size()));
LOG.info("Assigning {} when looking for any host, from #hosts={}, requestedHosts=null", randomNode.toShortString(),
nodesWithFreeSlots.size());
return new SelectHostResult(randomNode);
}
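// Registers the current DAG with the newly discovered node via the task communicator before
// exposing it to the scheduler. If registration is not initiated (registerDag returns false),
// the node is added immediately; otherwise the RegisterDagCallback is expected to complete addNode.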
private void registerAndAddNode(NodeInfo node, LlapServiceInstance serviceInstance) {
if (communicator != null) {
boolean registered = communicator
.registerDag(node, new RegisterDagCallback(node, serviceInstance));
if (!registered) {
addNode(node, serviceInstance);
}
} else {
addNode(node, serviceInstance);
}
}
private void addNode(NodeInfo node, LlapServiceInstance serviceInstance) {
// we have just added a new node. Signal timeout monitor to reset timer
if (activeInstances.size() != 0 && timeoutFutureRef.get() != null) {
LOG.info("New node added. Signalling scheduler timeout monitor thread to stop timer.");
stopTimeoutMonitor();
}
NodeReport nodeReport = constructNodeReport(serviceInstance, true);
getContext().nodesUpdated(Collections.singletonList(nodeReport));
// When the same node goes away and comes back... the old entry will be lost - which means
// we don't know how many fragments we have actually scheduled on this node.
// Replacing it is the right thing to do though, since we expect the AM to kill all the fragments running on the node, via timeouts.
// De-allocate messages coming in from the old node are sent to the NodeInfo instance for the old node.
instanceToNodeMap.put(node.getNodeIdentity(), node);
if (metrics != null) {
metrics.setClusterNodeCount(activeInstances.size());
}
// Trigger scheduling since a new node became available.
LOG.info("Adding new node: {}. TotalNodeCount={}. activeInstances.size={}",
node, instanceToNodeMap.size(), activeInstances.size());
trySchedulingPendingTasks();
}
private void reenableDisabledNode(NodeInfo nodeInfo) {
writeLock.lock();
try {
LOG.info("Attempting to re-enable node: " + nodeInfo.toShortString());
if (activeInstances.getInstance(nodeInfo.getNodeIdentity()) != null) {
nodeInfo.enableNode();
if (metrics != null) {
metrics.setDisabledNodeCount(disabledNodesQueue.size());
}
} else {
LOG.info("Not re-enabling node: {}, since it is not present in the RegistryActiveNodeList",
nodeInfo.toShortString());
}
} finally {
writeLock.unlock();
}
}
/**
* Updates relevant structures on the node, and fixes its position in the disabledNodesQueue
* to facilitate the actual re-enablement of the node.
* @param nodeInfo the node to be re-enabled
*/
private void queueNodeForReEnablement(final NodeInfo nodeInfo) {
if (disabledNodesQueue.remove(nodeInfo)) {
LOG.info("Queueing node for re-enablement: {}", nodeInfo.toShortString());
nodeInfo.resetExpireInformation();
disabledNodesQueue.add(nodeInfo);
}
}
private void disableNode(NodeInfo nodeInfo, boolean isCommFailure) {
writeLock.lock();
try {
if (nodeInfo == null || nodeInfo.isDisabled()) {
if (LOG.isDebugEnabled()) {
if (nodeInfo != null) {
LOG.debug("Node: " + nodeInfo.toShortString() +
" already disabled, or invalid. Not doing anything.");
} else {
LOG.debug("Ignoring disableNode invocation for null NodeInfo");
}
}
} else {
nodeInfo.disableNode(isCommFailure);
// TODO: handle task to container map events in case of hard failures
disabledNodesQueue.add(nodeInfo);
if (metrics != null) {
metrics.setDisabledNodeCount(disabledNodesQueue.size());
}
// Trigger a scheduling run - in case there's some task which was waiting for this node to
// become available.
trySchedulingPendingTasks();
}
} finally {
writeLock.unlock();
}
}
private static NodeReport constructNodeReport(LlapServiceInstance serviceInstance,
boolean healthy) {
NodeReport nodeReport = NodeReport.newInstance(NodeId
.newInstance(serviceInstance.getHost(), serviceInstance.getRpcPort()),
healthy ? NodeState.RUNNING : NodeState.LOST,
serviceInstance.getServicesAddress(), null, null,
null, 0, "", 0L);
return nodeReport;
}
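/* Add a task to the pending list at its priority level, and index it by task and attempt id */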
private void addPendingTask(TaskInfo taskInfo) {
writeLock.lock();
try {
List<TaskInfo> tasksAtPriority = pendingTasks.get(taskInfo.priority);
if (tasksAtPriority == null) {
tasksAtPriority = new LinkedList<>();
pendingTasks.put(taskInfo.priority, tasksAtPriority);
}
// Delayed tasks will not kick in right now. That will happen in the scheduling loop.
tasksAtPriority.add(taskInfo);
knownTasks.putIfAbsent(taskInfo.task, taskInfo);
tasksById.put(taskInfo.attemptId, taskInfo);
if (metrics != null) {
metrics.incrPendingTasksCount();
}
LOG.info("PendingTasksInfo={}", constructPendingTaskCountsLogMessage());
} finally {
writeLock.unlock();
}
}
/* Remove a task from the pending list */
private void removePendingTask(TaskInfo taskInfo) {
writeLock.lock();
try {
Priority priority = taskInfo.priority;
List<TaskInfo> taskInfoList = pendingTasks.get(priority);
if (taskInfoList == null || taskInfoList.isEmpty() || !taskInfoList.remove(taskInfo)) {
LOG.warn("Could not find task: " + taskInfo.task + " in pending list, at priority: "
+ priority);
}
} finally {
writeLock.unlock();
}
}
/* Register a running task into the runningTasks structure */
@VisibleForTesting
protected void registerRunningTask(TaskInfo taskInfo) {
boolean isGuaranteed = false;
synchronized (taskInfo) {
assert !taskInfo.isPendingUpdate;
// Update is included with the submit request; callback is via notifyStarted.
isGuaranteed = taskInfo.isGuaranteed;
taskInfo.isPendingUpdate = true;
taskInfo.requestedValue = taskInfo.isGuaranteed;
if (metrics != null) {
metrics.setWmTaskStarted(taskInfo.requestedValue);
}
setUpdateStartedUnderTiLock(taskInfo);
}
TreeMap<Integer, TreeSet<TaskInfo>> runningTasks =
isGuaranteed ? guaranteedTasks : speculativeTasks;
writeLock.lock();
try {
WM_LOG.info("Registering " + taskInfo.attemptId + "; " + taskInfo.isGuaranteed);
addToRunningTasksMap(runningTasks, taskInfo);
if (metrics != null) {
metrics.decrPendingTasksCount();
}
} finally {
writeLock.unlock();
}
}
@VisibleForTesting
protected TaskInfo getTaskInfo(Object task) {
return knownTasks.get(task);
}
/* Unregister a task from the known and running structures */
private TaskInfo unregisterTask(Object task) {
writeLock.lock();
try {
TaskInfo taskInfo = knownTasks.remove(task);
if (taskInfo != null) {
tasksById.remove(taskInfo.attemptId);
WM_LOG.info("Unregistering " + taskInfo.attemptId + "; " + taskInfo.isGuaranteed);
if (taskInfo.getState() == TaskInfo.State.ASSIGNED) {
// Remove from the running list.
if (!removeFromRunningTaskMap(speculativeTasks, task, taskInfo)
&& !removeFromRunningTaskMap(guaranteedTasks, task, taskInfo)) {
Preconditions.checkState(false, "runningTasks should contain an entry if the task" +
" was in running state. Caused by task: %s", task);
}
}
} else {
LOG.warn("Could not find TaskInfo for task: {}. Not removing it from the running set", task);
}
return taskInfo;
} finally {
writeLock.unlock();
}
}
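// Inserts the task into the per-priority TreeSet, creating the priority bucket on first use.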
private static void addToRunningTasksMap(
TreeMap<Integer, TreeSet<TaskInfo>> runningTasks, TaskInfo taskInfo) {
int priority = taskInfo.priority.getPriority();
TreeSet<TaskInfo> tasksAtPriority = runningTasks.get(priority);
if (tasksAtPriority == null) {
tasksAtPriority = new TreeSet<>(TASK_INFO_COMPARATOR);
runningTasks.put(priority, tasksAtPriority);
}
tasksAtPriority.add(taskInfo);
}
private static boolean removeFromRunningTaskMap(TreeMap<Integer, TreeSet<TaskInfo>> runningTasks,
Object task, TaskInfo taskInfo) {
int priority = taskInfo.priority.getPriority();
Set<TaskInfo> tasksAtPriority = runningTasks.get(priority);
if (tasksAtPriority == null) return false;
boolean result = tasksAtPriority.remove(taskInfo);
if (tasksAtPriority.isEmpty()) {
runningTasks.remove(priority);
}
return result;
}
private enum ScheduleResult {
// Successfully scheduled
SCHEDULED,
// Delayed to find a local match
DELAYED_LOCALITY,
// Delayed because resources are temporarily unavailable
DELAYED_RESOURCES,
// Inadequate total resources - will never succeed / wait for new executors to become available
INADEQUATE_TOTAL_RESOURCES,
}
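/**
* Takes a snapshot of cluster capacity: the total memory/vcores across active instances, plus a
* host-to-nodes map (kept in consistent order when consistent splits are used) listing only nodes
* that can currently accept a task. When no free slot exists anywhere, isClusterCapacityFull is set.
*/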
private Pair<Resource, Map<String, List<NodeInfo>>> getResourceAvailability() {
int memory = 0;
int vcores = 0;
int numInstancesFound = 0;
Map<String, List<NodeInfo>> availableHostMap;
readLock.lock();
try {
// maintain insertion order (needed for Next slot in locality miss)
availableHostMap = new LinkedHashMap<>(instanceToNodeMap.size());
Collection<LlapServiceInstance> instances = consistentSplits ?
// might also include inactive instances
activeInstances.getAllInstancesOrdered(true) :
// if consistent splits are NOT used we don't need the ordering as there will be no cache benefit anyway
activeInstances.getAll();
boolean foundSlot = false;
for (LlapServiceInstance inst : instances) {
NodeInfo nodeInfo = instanceToNodeMap.get(inst.getWorkerIdentity());
if (nodeInfo != null) {
List<NodeInfo> hostList = availableHostMap.get(nodeInfo.getHost());
if (hostList == null) {
hostList = new ArrayList<>();
availableHostMap.put(nodeInfo.getHost(), hostList);
}
if (!(inst instanceof InactiveServiceInstance)) {
Resource r = inst.getResource();
memory += r.getMemory();
vcores += r.getVirtualCores();
numInstancesFound++;
// Only add nodes with available resources to the list.
// Host entries, however, exist even for nodes that currently have no free capacity.
if (nodeInfo.canAcceptTask()) {
foundSlot = true;
hostList.add(nodeInfo);
}
}
} else {
LOG.warn("Null NodeInfo when attempting to get available resources for " + inst.getWorkerIdentity());
}
}
// isClusterCapacityFull is reset to false on every trySchedulingPendingTasks call;
// set it to true here so subsequent runs bail out early when we know no resources are available.
if (!foundSlot) {
isClusterCapacityFull.set(true);
}
} finally {
readLock.unlock();
}
if (LOG.isDebugEnabled()) {
LOG.debug("Available resources: numInstancesFound={}, totalMem={}, totalVcores={} availableHosts: {}",
numInstancesFound, memory, vcores, availableHostMap.size());
}
return new ImmutablePair<>(Resource.newInstance(memory, vcores), availableHostMap);
}
/**
* The scheduling loop should keep cycling when:
* 1. there are available resources, OR
* 2. there is a pending task with higher priority than the running ones.
* @param availableHostMap map from host name to nodes that can currently accept tasks
* @return true if scheduling should proceed to the next priority level
*/
private boolean shouldCycle(Map<String, List<NodeInfo>> availableHostMap) {
// short-circuit on resource availability
int nodeCnt = 0;
for (List<NodeInfo> nodes : availableHostMap.values()) {
nodeCnt += nodes.size();
}
if (nodeCnt > 0) return true;
// check if the highest pending priority (lowest value) is more urgent than any running task's priority
int specMax = speculativeTasks.isEmpty() ? Integer.MIN_VALUE : speculativeTasks.lastKey();
int guarMax = guaranteedTasks.isEmpty() ? Integer.MIN_VALUE : guaranteedTasks.lastKey();
return pendingTasks.firstKey().getPriority() < Math.max(specMax, guarMax);
}
@VisibleForTesting
protected void schedulePendingTasks() throws InterruptedException {
// Early exit for scheduler invocations that came in before the end of the previous run,
// when we already know no resources are available.
// Preemption and locality delay will be taken care of in the first full run.
if (isClusterCapacityFull.get()) {
return;
}
Ref<TaskInfo> downgradedTask = new Ref<>(null);
Pair<Resource, Map<String, List<NodeInfo>>> availabilityPair = getResourceAvailability();
writeLock.lock();
try {
if (LOG.isDebugEnabled()) {
LOG.debug("ScheduleRun: {}", constructPendingTaskCountsLogMessage());
}
Iterator<Entry<Priority, List<TaskInfo>>> pendingIterator = pendingTasks.entrySet().iterator();
while (pendingIterator.hasNext() && shouldCycle(availabilityPair.getRight())) {
Entry<Priority, List<TaskInfo>> entry = pendingIterator.next();
List<TaskInfo> taskListAtPriority = entry.getValue();
Iterator<TaskInfo> taskIter = taskListAtPriority.iterator();
boolean scheduledAllAtPriority = true;
while (taskIter.hasNext()) {
TaskInfo taskInfo = taskIter.next();
if (taskInfo.getNumPreviousAssignAttempts() == 1) {
dagStats.registerDelayedAllocation();
}
taskInfo.triedAssigningTask();
ScheduleResult scheduleResult = scheduleTask(taskInfo, availabilityPair, downgradedTask);
// Note: we must handle downgradedTask after this. We do it at the end, outside the lock.
LOG.debug("ScheduleResult for Task: {} = {}", taskInfo, scheduleResult);
if (scheduleResult == ScheduleResult.SCHEDULED) {
taskIter.remove();
} else {
if (scheduleResult == ScheduleResult.INADEQUATE_TOTAL_RESOURCES) {
LOG.info("Inadequate total resources before scheduling pending tasks." +
" Signalling scheduler timeout monitor thread to start timer.");
startTimeoutMonitor();
// TODO Nothing else should be done for this task. Move on.
}
if (scheduleResult == ScheduleResult.DELAYED_LOCALITY) {
// Add the task to the delayed task queue if it does not already exist.
maybeAddToDelayedTaskQueue(taskInfo);
}
// Add to HighPriority queue to potentially preempt a lower priority task and unblock this one
maybeAddToHighPriorityTaskQueue(taskInfo);
// Since there was an allocation failure - don't try assigning tasks at the next priority.
scheduledAllAtPriority = false;
// Don't break if this allocation failure was a result of a LOCALITY_DELAY. Others could still be allocated.
if (scheduleResult != ScheduleResult.DELAYED_LOCALITY) {
break;
}
} // end of else - i.e. could not allocate
} // end of loop over pending tasks
if (taskListAtPriority.isEmpty()) {
// Remove the entry, if there's nothing left at the specific priority level
pendingIterator.remove();
}
if (!scheduledAllAtPriority) {
LOG.debug(
"Unable to schedule all requests at priority={}. Skipping subsequent priority levels",
entry.getKey());
// Don't attempt scheduling for additional priorities
break;
}
}
// Finally take care of preemption requests that can unblock higher-pri tasks.
// This removes preemptable tasks from the runningList and sends out a preempt request to the system.
// Subsequent tasks will be scheduled once the de-allocate request for the preempted task is processed.
while (!preemptionCandidates.isEmpty()) {
TaskInfo toPreempt = preemptionCandidates.take();
// 1. task has not terminated
if (toPreempt.isGuaranteed != null) {
String host = toPreempt.getAssignedNode().getHost();
// 2. is currently assigned 3. no preemption pending on that Host
if (toPreempt.getState() == TaskInfo.State.ASSIGNED &&
(pendingPreemptionsPerHost.get(host) == null || pendingPreemptionsPerHost.get(host).intValue() == 0)) {
LOG.info("Preempting {} running at Host={}", toPreempt, host);
dagStats.registerTaskPreempted(toPreempt.getAssignedNode().getHost());
registerPendingPreemption(toPreempt.getAssignedNode().getHost());
toPreempt.setPreemptedInfo(clock.getTime());
// Task cleanup
TreeMap<Integer, TreeSet<TaskInfo>> taskMap =
toPreempt.isGuaranteed ? guaranteedTasks : speculativeTasks;
taskMap.get(toPreempt.priority.getPriority()).remove(toPreempt);
if (taskMap.get(toPreempt.priority.getPriority()).isEmpty()) {
taskMap.remove(toPreempt.priority.getPriority());
}
// Preemption will finally be registered as a deallocateTask as a result of preemptContainer
// That resets preemption info and allows additional tasks to be preempted at that Host if required
getContext().preemptContainer(toPreempt.containerId);
} else {
// Maybe consider for future preemptions
toPreempt.setPreemptedTime(0L);
}
}
}
} finally {
writeLock.unlock();
}
if (downgradedTask.value != null) {
WM_LOG.info("Downgrading " + downgradedTask.value.attemptId);
checkAndSendGuaranteedStateUpdate(downgradedTask.value);
}
}
private static int vertexNum(TaskInfo taskInfo) {
return taskInfo.getAttemptId().getTaskID().getVertexID().getId(); // Sigh...
}
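// Builds a one-line summary of pending, running and queued task counts for logging.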
private String constructPendingTaskCountsLogMessage() {
StringBuilder sb = new StringBuilder();
int totalCount = 0;
sb.append("numPriorityLevels=").append(pendingTasks.size()).append(". ");
Iterator<Entry<Priority, List<TaskInfo>>> pendingIterator =
pendingTasks.entrySet().iterator();
while (pendingIterator.hasNext()) {
Entry<Priority, List<TaskInfo>> entry = pendingIterator.next();
int count = entry.getValue() == null ? 0 : entry.getValue().size();
sb.append("[p=").append(entry.getKey().toString()).append(",c=").append(count).append("]");
totalCount += count;
}
int runningTasks = guaranteedTasks.values().stream().mapToInt(t -> t.size()).sum() +
speculativeTasks.values().stream().mapToInt(t -> t.size()).sum();
sb.append(". runningTasks=").append(runningTasks);
sb.append(". totalPendingTasks=").append(totalCount);
sb.append(". delayedTaskQueueSize=").append(delayedTaskQueue.size());
sb.append(". highPriTaskQueueSize=").append(highPriorityTaskQueue.size());
sb.append(". preemptTaskQueueSize=").append(preemptionCandidates.size());
sb.append(". pendingPreemptions=").append(pendingPreemptions.get());
return sb.toString();
}
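/**
* Attempts to place a single task: bails out if total cluster capacity is inadequate, otherwise
* selects a host, decides whether the task runs as guaranteed (consuming an unused duck, or
* revoking one from a lower-priority task via downgradedTask), creates the container and
* registers the task as running.
*/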
private ScheduleResult scheduleTask(TaskInfo taskInfo,
Pair<Resource, Map<String, List<NodeInfo>>> availabilityPair, Ref<TaskInfo> downgradedTask) {
Preconditions.checkNotNull(availabilityPair.getLeft(), "totalResource cannot be null");
// If there's no memory available at the cluster, fail
if (availabilityPair.getLeft().getMemory() <= 0) {
return SELECT_HOST_RESULT_INADEQUATE_TOTAL_CAPACITY.scheduleResult;
}
SelectHostResult selectHostResult = selectHost(taskInfo, availabilityPair.getRight());
if (selectHostResult.scheduleResult != ScheduleResult.SCHEDULED) {
return selectHostResult.scheduleResult;
}
boolean isGuaranteed = false;
if (unusedGuaranteed > 0) {
boolean wasGuaranteed;
synchronized (taskInfo) {
assert !taskInfo.isPendingUpdate; // No updates before it's running.
wasGuaranteed = taskInfo.isGuaranteed;
taskInfo.isGuaranteed = true;
isGuaranteed = true;
}
if (wasGuaranteed) {
// This should never happen - we only schedule one attempt once.
WM_LOG.error("The task had guaranteed flag set before scheduling: " + taskInfo);
} else {
int result = --unusedGuaranteed;
if (metrics != null) {
metrics.setWmUnusedGuaranteed(result);
}
WM_LOG.info("Using an unused duck for " + taskInfo.attemptId
+ "; unused is now " + result);
}
} else {
// We could be scheduling a guaranteed task when a higher priority task cannot be
// scheduled. Try to take a duck away from a lower priority task here.
if (findGuaranteedToReallocate(taskInfo, downgradedTask)) {
// We are revoking another duck; don't wait. We could also give the duck
// to this task in the callback instead.
synchronized (taskInfo) {
assert !taskInfo.isPendingUpdate; // No updates before it's running.
taskInfo.isGuaranteed = true;
isGuaranteed = true;
}
// Note: after this, the caller MUST send the downgrade message to downgradedTask
// (outside of the writeLock, preferably), before exiting.
}
}
// in LLAP, the containers are used exactly once for exactly one attempt
// the container ids are entirely arbitrary and have 64 bit space per application
// this is why the isGuaranteed (& any other initial information) can be encoded into the
// bits indicating container id
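// (Illustrative sketch only; the actual encoding is owned by containerFactory. For example, the
// guaranteed flag could occupy the lowest bit: id = (sequence << 1) | (isGuaranteed ? 1 : 0).)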
NodeInfo nodeInfo = selectHostResult.nodeInfo;
Container container =
containerFactory.createContainer(nodeInfo.getResourcePerExecutor(), taskInfo.priority,
nodeInfo.getHost(),
nodeInfo.getRpcPort(),
nodeInfo.getServiceAddress(),
isGuaranteed);
writeLock.lock(); // While updating local structures
// Note: this is actually called under the epic writeLock in schedulePendingTasks
try {
// The canAccept part of this log message does not account for this allocation.
assignedTaskCounter.incrementAndGet();
LOG.info("Assigned #{}, task={} on node={}, to container={}",
assignedTaskCounter.get(),
taskInfo, nodeInfo.toShortString(), container.getId());
dagStats.registerTaskAllocated(taskInfo.requestedHosts, taskInfo.requestedRacks,
nodeInfo.getHost());
taskInfo.setAssignmentInfo(nodeInfo, container.getId(), clock.getTime());
registerRunningTask(taskInfo);
nodeInfo.registerTaskScheduled();
// if no more resources on Node -> remove
if (!nodeInfo.canAcceptTask()) {
availabilityPair.getRight().get(nodeInfo.getHost()).remove(nodeInfo);
}
} finally {
writeLock.unlock();
}
getContext().taskAllocated(taskInfo.task, taskInfo.clientCookie, container);
return selectHostResult.scheduleResult;
}
/**
* Go through the running-task tree in descending priority order and find a candidate to preempt. The candidate must be:
* 1. lower priority than forTask
* 2. below forTask in the DAG
* 3. not already considered for preemption
* A successful candidate is added to the preemptionCandidates queue, to be preempted by the scheduling loop.
* @param runningTasks tree of running tasks, keyed by priority
* @param forTask the task to find a preemption candidate for
* @param preemptHosts preferred hosts to preempt on; null means any host
* @return true when a preemption candidate is found
*/
private boolean addTaskPreemptionCandidate(TreeMap<Integer, TreeSet<TaskInfo>> runningTasks, TaskInfo forTask,
Set<String> preemptHosts) {
NavigableMap<Integer, TreeSet<TaskInfo>> orderedMap = runningTasks.descendingMap();
Iterator<Entry<Integer, TreeSet<TaskInfo>>> iterator = orderedMap.entrySet().iterator();
while (iterator.hasNext()) {
Entry<Integer, TreeSet<TaskInfo>> entryAtPriority = iterator.next();
if (entryAtPriority.getKey() > forTask.priority.getPriority()) {
Iterator<TaskInfo> taskInfoIterator = entryAtPriority.getValue().iterator();
while (taskInfoIterator.hasNext()) {
TaskInfo taskInfo = taskInfoIterator.next();
if (preemptHosts != null && !preemptHosts.contains(taskInfo.getAssignedNode().getHost())) {
continue; // Not the right host.
}
Map<Integer, Set<Integer>> depInfo = getDependencyInfo(taskInfo.getAttemptId().getDAGID());
Set<Integer> vertexDepInfo = null;
if (depInfo != null) {
vertexDepInfo = depInfo.get(vertexNum(forTask));
}
if (depInfo != null && vertexDepInfo == null) {
LOG.warn("Cannot find info for " + vertexNum(forTask) + " " + depInfo);
}
if (vertexDepInfo != null && !vertexDepInfo.contains(vertexNum(taskInfo))) {
// Only preempt if the task being preempted is "below" us in the dag.
continue;
}
// If preemption Candidate is not already considered
if (taskInfo.getPreemptedTime() == 0) {
// Candidate for preemption.
LOG.debug("Preemption candidate={} for Task={} with potentialHosts={}", taskInfo, forTask, preemptHosts);
taskInfo.setPreemptedTime(clock.getTime());
preemptionCandidates.add(taskInfo);
return true;
}
}
} else {
// No tasks qualify as preemptable
LOG.debug("No tasks qualify as preempt candidates for priority {}. Current priority={}",
forTask.priority.getPriority(), entryAtPriority.getKey());
break;
}
}
return false;
}
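/**
* Hands out up to {@code count} guaranteed slots ("ducks") to speculative tasks, walking priority
* levels from the most important downwards, and returns how many were actually distributed.
*/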
// Note: this is called under the epic lock.
private int distributeGuaranteed(int count, TaskInfo failedUpdate, List<TaskInfo> toUpdate) {
WM_LOG.info("Distributing " + count + " among " + speculativeTasks.size() + " levels"
+ (failedUpdate == null ? "" : "; on failure"));
Iterator<Entry<Integer, TreeSet<TaskInfo>>> iterator = speculativeTasks.entrySet().iterator();
int remainingCount = count;
// When done, handleUpdate.. may break the iterator, so the order of these checks is important.
while (remainingCount > 0 && iterator.hasNext()) {
remainingCount = handleUpdateForSinglePriorityLevel(
remainingCount, iterator, failedUpdate, toUpdate, true);
}
return count - remainingCount;
}
// Note: this is called under the epic lock.
private int revokeGuaranteed(int count, TaskInfo failedUpdate, List<TaskInfo> toUpdate) {
WM_LOG.info("Revoking " + count + " from " + guaranteedTasks.size() + " levels"
+ (failedUpdate == null ? "" : "; on failure"));
int remainingCount = count;
Iterator<Entry<Integer, TreeSet<TaskInfo>>> iterator =
guaranteedTasks.descendingMap().entrySet().iterator();
// When done, handleUpdate.. may break the iterator, so the order of these checks is important.
while (remainingCount > 0 && iterator.hasNext()) {
remainingCount = handleUpdateForSinglePriorityLevel(
remainingCount, iterator, failedUpdate, toUpdate, false);
}
return count - remainingCount;
}
// Must be called under the epic lock.
private boolean findGuaranteedToReallocate(TaskInfo candidate, Ref<TaskInfo> toUpdate) {
Iterator<Entry<Integer, TreeSet<TaskInfo>>> iterator =
guaranteedTasks.descendingMap().entrySet().iterator();
while (iterator.hasNext()) {
Entry<Integer, TreeSet<TaskInfo>> entry = iterator.next();
int priority = entry.getKey();
TreeSet<TaskInfo> atPriority = entry.getValue();
if (priority <= candidate.priority.getPriority()) {
return false; // The tasks from now on are more important than the candidate.
}
TaskInfo taskInfo = atPriority.pollLast();
if (taskInfo != null) {
synchronized (taskInfo) {
assert taskInfo.isGuaranteed;
taskInfo.isGuaranteed = false;
// See the comment in handleUpdateForSinglePriorityLevel.
if (!taskInfo.isPendingUpdate) {
setUpdateStartedUnderTiLock(taskInfo);
toUpdate.value = taskInfo;
}
}
addToRunningTasksMap(speculativeTasks, taskInfo);
}
// Remove entire priority level if it's been emptied.
if (atPriority.isEmpty()) {
iterator.remove();
}
if (taskInfo != null) {
return true;
}
}
return false;
}
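/**
* Moves up to {@code remainingCount} tasks at a single priority level between the speculative and
* guaranteed maps (direction chosen by {@code newValue}), queueing an update message for each task
* not already pending one. A failedUpdate at this priority is only re-processed after all its
* peers. Returns the count still left to distribute or revoke.
*/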
private int handleUpdateForSinglePriorityLevel(int remainingCount,
Iterator<Entry<Integer, TreeSet<TaskInfo>>> iterator, TaskInfo failedUpdate,
List<TaskInfo> toUpdate, boolean newValue) {
Entry<Integer, TreeSet<TaskInfo>> entry = iterator.next();
TreeSet<TaskInfo> atPriority = entry.getValue();
WM_LOG.info("At priority " + entry.getKey() + " observing " + entry.getValue().size());
Iterator<TaskInfo> atPriorityIter = newValue ? atPriority.iterator() : atPriority.descendingIterator();
TreeMap<Integer, TreeSet<TaskInfo>> toMap = newValue ? guaranteedTasks : speculativeTasks,
fromMap = newValue ? speculativeTasks : guaranteedTasks;
while (atPriorityIter.hasNext() && remainingCount > 0) {
TaskInfo taskInfo = atPriorityIter.next();
if (taskInfo == failedUpdate) continue;
atPriorityIter.remove();
synchronized (taskInfo) {
assert taskInfo.isGuaranteed != newValue;
taskInfo.isGuaranteed = newValue;
// When we introduce a discrepancy to the state we give the task to an updater unless it
// was already given to one. If the updater is already doing stuff, it would handle the
// changed state when it's done with whatever it's doing. The updater is not going to
// give up until the discrepancies are eliminated.
if (!taskInfo.isPendingUpdate) {
setUpdateStartedUnderTiLock(taskInfo);
WM_LOG.info("Adding " + taskInfo.attemptId + " to update");
toUpdate.add(taskInfo);
} else {
WM_LOG.info("Not adding " + taskInfo.attemptId + " to update - already pending");
}
}
addToRunningTasksMap(toMap, taskInfo);
--remainingCount;
}
// Remove entire priority level if it's been emptied.
// We do this before checking failedUpdate because that might break the iterator.
if (atPriority.isEmpty()) {
iterator.remove();
}
// We include failedUpdate only after looking at all the tasks at the same priority.
if (failedUpdate != null && entry.getKey() == failedUpdate.priority.getPriority()
&& remainingCount > 0) {
// This will break the iterator. However, this is the last task we can add the way this currently
// runs (only one duck is distributed when failedUpdate is present), so that should be ok.
removeFromRunningTaskMap(fromMap, failedUpdate.task, failedUpdate);
synchronized (failedUpdate) {
assert failedUpdate.isGuaranteed != newValue;
failedUpdate.isGuaranteed = newValue;
setUpdateStartedUnderTiLock(failedUpdate);
}
WM_LOG.info("Adding failed " + failedUpdate.attemptId + " to update");
// Do not check the state - this is coming from the updater under epic lock.
toUpdate.add(failedUpdate);
addToRunningTasksMap(toMap, failedUpdate);
--remainingCount;
}
return remainingCount;
}
private void registerPendingPreemption(String host) {
writeLock.lock();
try {
pendingPreemptions.incrementAndGet();
if (metrics != null) {
metrics.incrPendingPreemptionTasksCount();
}
MutableInt val = pendingPreemptionsPerHost.get(host);
if (val == null) {
val = new MutableInt(0);
pendingPreemptionsPerHost.put(host, val);
}
val.increment();
} finally {
writeLock.unlock();
}
}
private void unregisterPendingPreemption(String host) {
writeLock.lock();
try {
pendingPreemptions.decrementAndGet();
if (metrics != null) {
metrics.decrPendingPreemptionTasksCount();
}
MutableInt val = pendingPreemptionsPerHost.get(host);
Preconditions.checkNotNull(val);
val.decrement();
// Not bothering with removing the entry. There's a limited number of hosts, and a good
// chance that the entry will make it back in when the AM is used for a long duration.
} finally {
writeLock.unlock();
}
}
private void maybeAddToDelayedTaskQueue(TaskInfo taskInfo) {
// There's no point adding a task with forceLocality set - since that will never exit the queue.
// Add other tasks if they are not already in the queue.
if (!taskInfo.shouldForceLocality() && !taskInfo.isInDelayedQueue()) {
taskInfo.setInDelayedQueue(true);
delayedTaskQueue.add(taskInfo);
}
}
private void maybeAddToHighPriorityTaskQueue(TaskInfo taskInfo) {
// Only add the task if it's not already in the queue AND there are no more than #hosts tasks there
// already, as we perform at most #hosts preemptions at a time
if (!taskInfo.isInHighPriorityQueue() && highPriorityTaskQueue.size() < activeInstances.size()) {
taskInfo.setInHighPriorityQueue(true);
highPriorityTaskQueue.add(taskInfo);
}
}
// ------ Inner classes defined after this point ------
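/**
* Consumes the highPriorityTaskQueue and tries to line up a lower-priority preemption victim for
* each still-pending task, then nudges the scheduling loop.
*/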
class PreemptionSchedulerCallable implements Callable<Void> {
private final AtomicBoolean isShutdown = new AtomicBoolean(false);
@Override
public Void call() {
while (!isShutdown.get() && !Thread.currentThread().isInterrupted()) {
try {
TaskInfo taskInfo = highPriorityTaskQueue.take();
// Tasks can exist in the queue even after they have been scheduled.
// Process task Preemption only if the task is still in PENDING state.
processTaskPreemption(taskInfo);
} catch (InterruptedException e) {
if (isShutdown.get()) {
LOG.info("PreemptTaskScheduler thread interrupted after shutdown");
break;
} else {
LOG.warn("PreemptTaskScheduler thread interrupted before being shutdown");
throw new RuntimeException("PreemptTaskScheduler thread interrupted without being shutdown", e);
}
}
}
return null;
}
private void processTaskPreemption(TaskInfo taskInfo) {
if (shouldAttemptTask(taskInfo) && tryTaskPreemption(taskInfo)) {
trySchedulingPendingTasks();
}
// Allow the scheduler to re-add the task to the queue if needed
taskInfo.setInHighPriorityQueue(false);
}
private boolean tryTaskPreemption(TaskInfo taskInfo) {
// Find a lower priority task that can be preempted on a particular host.
// ONLY if there's no pending preemptions on that host to avoid preempting twice for a task.
Set<String> potentialHosts = null; // null => preempt on any host.
readLock.lock();
try {
// Protect against a bad location being requested.
if (taskInfo.requestedHosts != null && taskInfo.requestedHosts.length != 0) {
potentialHosts = Sets.newHashSet(taskInfo.requestedHosts);
}
if (potentialHosts != null) {
// Preempt on specific host
boolean shouldPreempt = true;
for (String host : potentialHosts) {
// Preempt only if there are no pending preemptions on the same host
// When the preemption registers, the request at the highest priority will be given the slot,
// even if the initial preemption was caused by some other task.
// TODO Maybe register which task the preemption was for, to avoid a bad non-local allocation.
MutableInt pendingHostPreemptions = pendingPreemptionsPerHost.get(host);
if (pendingHostPreemptions != null && pendingHostPreemptions.intValue() > 0) {
shouldPreempt = false;
LOG.debug("No preempt candidate for task={}. Found an existing preemption request on host={}, pendingPreemptionCount={}",
taskInfo.task, host, pendingHostPreemptions.intValue());
break;
}
}
if (!shouldPreempt) {
LOG.debug("No preempt candidate for {} on potential hosts={}. An existing preemption request exists",
taskInfo.task, potentialHosts);
return false;
}
} else {
// Unknown requested host -- Request for a preemption if there's none pending. If a single preemption is pending,
// and this is the next task to be assigned, it will be assigned once that slot becomes available.
if (pendingPreemptions.get() != 0) {
LOG.debug("Skipping preempt candidate since there are {} pending preemption request. For task={}",
pendingPreemptions.get(), taskInfo);
return false;
}
}
LOG.debug("Attempting preempt candidate for task={}, priority={} on potential hosts={}. pendingPreemptions={}",
taskInfo.task, taskInfo.priority, potentialHosts == null ? "ANY" : potentialHosts, pendingPreemptions.get());
return addTaskPreemptionCandidate(speculativeTasks, taskInfo, potentialHosts) ||
addTaskPreemptionCandidate(guaranteedTasks, taskInfo, potentialHosts);
} finally {
readLock.unlock();
}
}
public void shutdown() {
isShutdown.set(true);
}
public boolean shouldAttemptTask(TaskInfo taskInfo) {
// bail-out when preemptions are pending on every host OR task is not Pending
return taskInfo.getState() == TaskInfo.State.PENDING &&
pendingPreemptions.get() < getClusterNodeCount();
}
}
@VisibleForTesting
class DelayedTaskSchedulerCallable implements Callable<Void> {
private final AtomicBoolean isShutdown = new AtomicBoolean(false);
@Override
public Void call() {
while (!isShutdown.get() && !Thread.currentThread().isInterrupted()) {
try {
TaskInfo taskInfo = getNextTask();
taskInfo.setInDelayedQueue(false);
// Tasks can exist in the delayed queue even after they have been scheduled.
// Trigger scheduling only if the task is still in PENDING state.
processEvictedTask(taskInfo);
} catch (InterruptedException e) {
if (isShutdown.get()) {
LOG.info("DelayedTaskScheduler thread interrupted after shutdown");
break;
} else {
LOG.warn("DelayedTaskScheduler thread interrupted before being shutdown");
throw new RuntimeException(
"DelayedTaskScheduler thread interrupted without being shutdown", e);
}
}
}
return null;
}
public void shutdown() {
isShutdown.set(true);
}
public TaskInfo getNextTask() throws InterruptedException {
return delayedTaskQueue.take();
}
public void processEvictedTask(TaskInfo taskInfo) {
if (shouldScheduleTask(taskInfo)) {
trySchedulingPendingTasks();
}
}
public boolean shouldScheduleTask(TaskInfo taskInfo) {
return taskInfo.getState() == TaskInfo.State.PENDING;
}
}
@VisibleForTesting
DelayedTaskSchedulerCallable createDelayedTaskSchedulerCallable() {
return new DelayedTaskSchedulerCallable();
}
private class NodeEnablerCallable implements Callable<Void> {
private final AtomicBoolean isShutdown = new AtomicBoolean(false);
private static final long POLL_TIMEOUT = 10000L;
@Override
public Void call() {
while (!isShutdown.get() && !Thread.currentThread().isInterrupted()) {
try {
NodeInfo nodeInfo =
disabledNodesQueue.poll(POLL_TIMEOUT, TimeUnit.MILLISECONDS);
if (nodeInfo != null) {
// A node became available. Enable the node and try scheduling.
reenableDisabledNode(nodeInfo);
trySchedulingPendingTasks();
}
} catch (InterruptedException e) {
if (isShutdown.get()) {
LOG.info("NodeEnabler thread interrupted after shutdown");
break;
} else {
LOG.warn("NodeEnabler thread interrupted without being shutdown");
throw new RuntimeException("NodeEnabler thread interrupted without being shutdown", e);
}
}
}
return null;
}
// Call this first, then send in an interrupt to the thread.
public void shutdown() {
isShutdown.set(true);
}
}
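// Wakes up the scheduler loop: clears the capacity-full flag and signals the schedule condition.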
private void trySchedulingPendingTasks() {
scheduleLock.lock();
try {
isClusterCapacityFull.set(false);
pendingScheduleInvocations.set(true);
scheduleCondition.signal();
} finally {
scheduleLock.unlock();
}
}
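// Scheduled via startTimeoutMonitor when total resources are inadequate; if it fires before new
// daemons appear, it reports SERVICE_UNAVAILABLE to the framework.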
private class SchedulerTimeoutMonitor implements Runnable {
private final Logger LOG = LoggerFactory.getLogger(SchedulerTimeoutMonitor.class);
@Override
public void run() {
LOG.info("Reporting SERVICE_UNAVAILABLE error as no instances are running");
try {
getContext().reportError(ServicePluginErrorDefaults.SERVICE_UNAVAILABLE,
"No LLAP Daemons are running", getContext().getCurrentDagInfo());
} catch (Exception e) {
DagInfo currentDagInfo = getContext().getCurrentDagInfo();
LOG.error("Exception when reporting SERVICE_UNAVAILABLE error for dag: {}",
currentDagInfo == null ? "" : currentDagInfo.getName(), e);
}
}
}
private class SchedulerCallable implements Callable<Void> {
private final AtomicBoolean isShutdown = new AtomicBoolean(false);
@Override
public Void call() throws Exception {
while (!isShutdown.get() && !Thread.currentThread().isInterrupted()) {
scheduleLock.lock();
try {
while (!pendingScheduleInvocations.get()) {
scheduleCondition.await();
}
} catch (InterruptedException e) {
if (isShutdown.get()) {
LOG.info("Scheduler thread interrupted after shutdown");
break;
} else {
LOG.warn("Scheduler thread interrupted without being shutdown");
throw new RuntimeException("Scheduler thread interrupted without being shutdown", e);
}
} finally {
scheduleLock.unlock();
}
// Set pending to false since scheduling is about to run. Any triggers up to this point
// will be handled in the next run.
// A new request may come in right after this is set to false, but before the actual scheduling.
// This will be handled in this run, but will cause an immediate run after, which is harmless.
// isClusterCapacityFull helps in such runs bailing out when we know no resources are available.
// pendingScheduleInvocations is mainly to handle a trySchedulingPendingTasks request while in the middle of
// a run - since the event which triggered it may not be processed for all tasks in the run.
pendingScheduleInvocations.set(false);
// Schedule outside of the scheduleLock - which should only be used to wait on the condition.
try {
schedulePendingTasks();
} catch (InterruptedException ie) {
if (isShutdown.get()) {
return null; // We are good.
}
LOG.error("Scheduler thread was interrupte without shutdown and will now exit", ie);
throw ie;
} catch (Throwable t) {
// TODO: we might as well kill the AM at this point. How do we do that from here?
LOG.error("Fatal error: scheduler thread has failed and will now exit", t);
throw (t instanceof Exception) ? (Exception)t : new Exception(t);
}
}
return null;
}
// Call this first, then send in an interrupt to the thread.
public void shutdown() {
isShutdown.set(true);
}
}
// ------ Additional static classes defined after this point ------
@VisibleForTesting
static class NodeInfo implements Delayed {
private final NodeBlacklistConf blacklistConf;
LlapServiceInstance serviceInstance;
private final Clock clock;
long expireTimeMillis = -1;
private long numSuccessfulTasks = 0;
private long numSuccessfulTasksAtLastBlacklist = -1;
float cumulativeBackoffFactor = 1.0f;
// Indicates whether a node had a recent communication failure.
// This is primarily for tracking and logging purposes for the moment.
// TODO At some point, treat task rejection and communication failures differently.
private boolean hadCommFailure = false;
// Indicates whether a node is disabled - for whatever reason - commFailure, busy, etc.
private boolean disabled = false;
private int numScheduledTasks = 0;
private int numSchedulableTasks;
private final LlapTaskSchedulerMetrics metrics;
private Resource resourcePerExecutor;
private String shortStringBase;
/**
* Create a NodeInfo bound to a service instance
* @param serviceInstance the associated serviceInstance
* @param blacklistConf blacklist configuration
* @param clock clock to use to obtain timing information
* @param numSchedulableTasksConf number of schedulable tasks on the node. 0 represents auto
* detect based on the serviceInstance, -1 indicates unlimited capacity
* @param metrics scheduler metrics; may be null
*/
NodeInfo(LlapServiceInstance serviceInstance, NodeBlacklistConf blacklistConf, Clock clock,
int numSchedulableTasksConf, final LlapTaskSchedulerMetrics metrics) {
Preconditions.checkArgument(numSchedulableTasksConf >= -1, "NumSchedulableTasks must be >=-1");
this.blacklistConf = blacklistConf;
this.clock = clock;
this.metrics = metrics;
updateLlapServiceInstance(serviceInstance, numSchedulableTasksConf);
}
String getNodeIdentity() {
return serviceInstance.getWorkerIdentity();
}
String getHost() {
return serviceInstance.getHost();
}
int getRpcPort() {
return serviceInstance.getRpcPort();
}
String getServiceAddress() {
return serviceInstance.getServicesAddress();
}
public Resource getResourcePerExecutor() {
return resourcePerExecutor;
}
void updateLlapServiceInstance(LlapServiceInstance serviceInstance, int numSchedulableTasksConf) {
this.serviceInstance = serviceInstance;
int numVcores = serviceInstance.getResource().getVirtualCores();
int memoryPerInstance = serviceInstance.getResource().getMemory();
int memoryPerExecutor = (int)(memoryPerInstance / (double) numVcores);
resourcePerExecutor = Resource.newInstance(memoryPerExecutor, 1);
int oldNumSchedulableTasks = numSchedulableTasks;
if (numSchedulableTasksConf == 0) {
int pendingQueueCapacity = 0;
String pendingQueueCapacityString = serviceInstance.getProperties()
.get(LlapRegistryService.LLAP_DAEMON_TASK_SCHEDULER_ENABLED_WAIT_QUEUE_SIZE);
if (pendingQueueCapacityString == null) {
pendingQueueCapacityString = serviceInstance.getProperties()
.get(ConfVars.LLAP_DAEMON_TASK_SCHEDULER_WAIT_QUEUE_SIZE.varname);
}
LOG.info("Setting up node: {} with available capacity={}, pendingQueueSize={}, memory={}",
serviceInstance, serviceInstance.getResource().getVirtualCores(),
pendingQueueCapacityString, serviceInstance.getResource().getMemory());
if (pendingQueueCapacityString != null) {
pendingQueueCapacity = Integer.parseInt(pendingQueueCapacityString);
}
this.numSchedulableTasks = numVcores + pendingQueueCapacity;
} else {
this.numSchedulableTasks = numSchedulableTasksConf;
LOG.info("Setting up node: " + serviceInstance + " with schedulableCapacity=" + this.numSchedulableTasks);
}
if (metrics != null) {
metrics.incrSchedulableTasksCount(numSchedulableTasks - oldNumSchedulableTasks);
}
shortStringBase = setupShortStringBase();
}
void resetExpireInformation() {
expireTimeMillis = -1;
hadCommFailure = false;
}
void enableNode() {
resetExpireInformation();
disabled = false;
}
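// Disables the node for an exponentially backed-off interval. Illustrative numbers only, assuming
// minDelay=10s, maxDelay=60s, backoffFactor=2: consecutive blacklists with no successful task in
// between yield 10s, 20s, 40s, then 60s (capped); a success in between resets back to minDelay.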
void disableNode(boolean commFailure) {
long duration = blacklistConf.minDelay;
long currentTime = clock.getTime();
this.hadCommFailure = commFailure;
disabled = true;
if (numSuccessfulTasksAtLastBlacklist == numSuccessfulTasks) {
// Relying on a task succeeding to reset the exponent.
// There's no notifications on whether a task gets accepted or not. That would be ideal to
// reset this.
cumulativeBackoffFactor = cumulativeBackoffFactor * blacklistConf.backoffFactor;
} else {
// Was able to execute something before the last blacklist. Reset the exponent.
cumulativeBackoffFactor = 1.0f;
}
long delayTime = (long) (duration * cumulativeBackoffFactor);
if (delayTime > blacklistConf.maxDelay) {
delayTime = blacklistConf.maxDelay;
}
LOG.info("Disabling instance {} for {} milli-seconds. commFailure={}", toShortString(), delayTime, commFailure);
expireTimeMillis = currentTime + delayTime;
numSuccessfulTasksAtLastBlacklist = numSuccessfulTasks;
}
void registerTaskScheduled() {
numScheduledTasks++;
if (metrics != null) {
metrics.incrRunningTasksCount();
metrics.decrSchedulableTasksCount();
}
}
void registerTaskSuccess() {
numSuccessfulTasks++;
numScheduledTasks--;
if (metrics != null) {
metrics.incrSuccessfulTasksCount();
metrics.decrRunningTasksCount();
metrics.incrSchedulableTasksCount();
}
}
void registerUnsuccessfulTaskEnd(boolean wasPreempted) {
numScheduledTasks--;
if (metrics != null) {
metrics.decrRunningTasksCount();
metrics.incrSchedulableTasksCount();
}
if (wasPreempted) {
if (metrics != null) {
metrics.incrPreemptedTasksCount();
}
}
}
/**
* @return the time at which this node will be re-enabled
*/
long getEnableTime() {
return expireTimeMillis;
}
public boolean isDisabled() {
return disabled;
}
boolean hadCommFailure() {
return hadCommFailure;
}
boolean _canAcceptInternal() {
return !hadCommFailure && !disabled
&& (numSchedulableTasks == -1 || ((numSchedulableTasks - numScheduledTasks) > 0));
}
/* Returning true does not guarantee that the task will run, considering other queries
may be running in the system. Also depends upon the capacity usage configuration
*/
boolean canAcceptTask() {
boolean result = _canAcceptInternal();
if (LOG.isTraceEnabled()) {
LOG.trace(constructCanAcceptLogResult(result));
}
return result;
}
String constructCanAcceptLogResult(boolean result) {
StringBuilder sb = new StringBuilder();
sb.append("Node[").append(serviceInstance.getHost()).append(":").append(serviceInstance.getRpcPort())
.append(", ").append(serviceInstance.getWorkerIdentity()).append("]: ")
.append("canAcceptTask=").append(result)
.append(", numScheduledTasks=").append(numScheduledTasks)
.append(", numSchedulableTasks=").append(numSchedulableTasks)
.append(", hadCommFailure=").append(hadCommFailure)
.append(", disabled=").append(disabled);
return sb.toString();
}
@Override
public long getDelay(TimeUnit unit) {
return unit.convert(expireTimeMillis - clock.getTime(), TimeUnit.MILLISECONDS);
}
@Override
public int compareTo(Delayed o) {
NodeInfo other = (NodeInfo) o;
if (other.expireTimeMillis > this.expireTimeMillis) {
return -1;
} else if (other.expireTimeMillis < this.expireTimeMillis) {
return 1;
} else {
return 0;
}
}
private String setupShortStringBase() {
return "{" + serviceInstance.getHost() + ":" + serviceInstance.getRpcPort() + ", id=" + getNodeIdentity();
}
@Override
public String toString() {
return "NodeInfo{" + "instance=" + serviceInstance
+ ", expireTimeMillis=" + expireTimeMillis + ", numSuccessfulTasks=" + numSuccessfulTasks
+ ", numSuccessfulTasksAtLastBlacklist=" + numSuccessfulTasksAtLastBlacklist
+ ", cumulativeBackoffFactor=" + cumulativeBackoffFactor
+ ", numSchedulableTasks=" + numSchedulableTasks
+ ", numScheduledTasks=" + numScheduledTasks
+ ", disabled=" + disabled
+ ", commFailures=" + hadCommFailure
+'}';
}
private String toShortString() {
StringBuilder sb = new StringBuilder();
sb.append(", canAcceptTask=").append(_canAcceptInternal());
sb.append(", st=").append(numScheduledTasks);
sb.append(", ac=").append((numSchedulableTasks - numScheduledTasks));
sb.append(", commF=").append(hadCommFailure);
sb.append(", disabled=").append(disabled);
sb.append("}");
return shortStringBase + sb.toString();
}
}
// TODO There needs to be a mechanism to figure out different attempts for the same task. Delays
// could potentially be changed based on this.
@VisibleForTesting
static class TaskInfo implements Delayed {
enum State {
PENDING, ASSIGNED, PREEMPTED
}
// IDs used to ensure two TaskInfos are different without using the underlying task instance.
// Required for insertion into a TreeMap
static final AtomicLong ID_GEN = new AtomicLong(0);
final long uniqueId;
final LocalityDelayConf localityDelayConf;
final Clock clock;
final Object task;
final Object clientCookie;
final Priority priority;
final Resource capability;
final String[] requestedHosts;
final String[] requestedRacks;
final long requestTime;
long localityDelayTimeout;
// Adjust locality delay based on scheduling time instead of init time of task info.
boolean adjustedLocalityDelay;
long startTime;
long preemptTime;
ContainerId containerId;
NodeInfo assignedNode;
private State state = State.PENDING;
boolean inDelayedQueue = false;
boolean inHighPriorityQueue = false;
private final TezTaskAttemptID attemptId;
// The state for guaranteed task tracking. Synchronized on 'this'.
// In addition, "isGuaranteed" is only modified under the epic lock (because it involves
// modifying the corresponding structures that contain the task objects, at the same time).
/** Local state in the AM; true/false are what they say, null means terminated and irrelevant. */
private Boolean isGuaranteed = false;
/** The last state positively propagated to the task. Set by the updater. */
private Boolean lastSetGuaranteed = null;
private Boolean requestedValue = null;
/** Whether there's an update in progress for this TaskInfo. */
private boolean isPendingUpdate = false;
private int numAssignAttempts = 0;
// TaskInfo instances for two different tasks will not be the same. Only a single instance should
// ever be created for a taskAttempt
public TaskInfo(LocalityDelayConf localityDelayConf, Clock clock, Object task, Object clientCookie, Priority priority, Resource capability,
String[] hosts, String[] racks, long requestTime, TezTaskAttemptID id) {
this.localityDelayConf = localityDelayConf;
this.clock = clock;
this.task = task;
this.clientCookie = clientCookie;
this.priority = priority;
this.capability = capability;
this.requestedHosts = hosts;
this.requestedRacks = racks;
this.requestTime = requestTime;
if (localityDelayConf.getNodeLocalityDelay() == -1) {
localityDelayTimeout = Long.MAX_VALUE;
} else if (localityDelayConf.getNodeLocalityDelay() == 0) {
localityDelayTimeout = 0L;
} else {
localityDelayTimeout = requestTime + localityDelayConf.getNodeLocalityDelay();
}
this.uniqueId = ID_GEN.getAndIncrement();
this.attemptId = id;
}
// TODO: these appear to always be called under write lock. Do they need sync?
synchronized void setAssignmentInfo(NodeInfo nodeInfo, ContainerId containerId, long startTime) {
this.assignedNode = nodeInfo;
this.containerId = containerId;
this.startTime = startTime;
this.state = State.ASSIGNED;
}
synchronized void setPreemptedInfo(long preemptTime) {
this.state = State.PREEMPTED;
setPreemptedTime(preemptTime);
// Give an opportunity for preempted task to get better locality next time.
this.adjustedLocalityDelay = false;
}
synchronized void setPreemptedTime(long preemptTime) {
this.preemptTime = preemptTime;
}
synchronized long getPreemptedTime() {
return this.preemptTime;
}
synchronized NodeInfo getAssignedNode() {
return assignedNode;
}
synchronized void setInDelayedQueue(boolean val) {
this.inDelayedQueue = val;
}
synchronized void setInHighPriorityQueue(boolean val) {
this.inHighPriorityQueue = val;
}
synchronized void triedAssigningTask() {
numAssignAttempts++;
}
synchronized int getNumPreviousAssignAttempts() {
return numAssignAttempts;
}
synchronized State getState() {
return state;
}
synchronized boolean isInDelayedQueue() {
return inDelayedQueue;
}
synchronized boolean isInHighPriorityQueue() {
return inHighPriorityQueue;
}
boolean shouldDelayForLocality(long schedulerAttemptTime) {
adjustLocalityDelayInfo();
// getDelay <=0 means the task will be evicted from the queue.
return localityDelayTimeout > schedulerAttemptTime;
}
void adjustLocalityDelayInfo() {
if (localityDelayTimeout > 0 && localityDelayTimeout != Long.MAX_VALUE && !adjustedLocalityDelay) {
adjustedLocalityDelay = true;
resetLocalityDelayInfo();
}
}
void resetLocalityDelayInfo() {
localityDelayTimeout = clock.getTime() + localityDelayConf.getNodeLocalityDelay();
}
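// Long.MAX_VALUE is the sentinel set in the constructor when nodeLocalityDelay == -1: locality is
// forced and the task never falls back to a non-local node.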
boolean shouldForceLocality() {
return localityDelayTimeout == Long.MAX_VALUE;
}
long getLocalityDelayTimeout() {
return localityDelayTimeout;
}
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
TaskInfo taskInfo = (TaskInfo) o;
if (uniqueId != taskInfo.uniqueId) {
return false;
}
return task.equals(taskInfo.task);
}
@Override
public int hashCode() {
int result = (int) (uniqueId ^ (uniqueId >>> 32));
result = 31 * result + task.hashCode();
return result;
}
@Override
public String toString() {
return "TaskInfo{" +
"task=" + task +
", priority=" + priority +
", startTime=" + startTime +
", containerId=" + containerId +
(assignedNode != null ? ", assignedNode=" + assignedNode.toShortString() : "") +
", uniqueId=" + uniqueId +
", localityDelayTimeout=" + localityDelayTimeout +
'}';
}
@Override
public long getDelay(TimeUnit unit) {
return unit.convert(localityDelayTimeout - clock.getTime(), TimeUnit.MILLISECONDS);
}
@Override
public int compareTo(Delayed o) {
TaskInfo other = (TaskInfo) o;
if (other.localityDelayTimeout > this.localityDelayTimeout) {
return -1;
} else if (other.localityDelayTimeout < this.localityDelayTimeout) {
return 1;
} else {
return 0;
}
}
@VisibleForTesting
boolean isGuaranteed() {
return isGuaranteed;
}
@VisibleForTesting
boolean getLastSetGuaranteed() {
return lastSetGuaranteed;
}
@VisibleForTesting
boolean isUpdateInProgress() {
return isPendingUpdate;
}
TezTaskAttemptID getAttemptId() {
return attemptId;
}
}
// Newer tasks first.
private static class TaskStartComparator implements Comparator<TaskInfo> {
@Override
public int compare(TaskInfo o1, TaskInfo o2) {
if (o1.startTime > o2.startTime) {
return -1;
} else if (o1.startTime < o2.startTime) {
return 1;
} else {
// Comparing on time is not sufficient since two may be created at the same time,
// in which case inserting into a TreeSet/Map would break
if (o1.uniqueId > o2.uniqueId) {
return -1;
} else if (o1.uniqueId < o2.uniqueId) {
return 1;
} else {
return 0;
}
}
}
}
private static class SelectHostResult {
final NodeInfo nodeInfo;
final ScheduleResult scheduleResult;
SelectHostResult(NodeInfo nodeInfo) {
this.nodeInfo = nodeInfo;
this.scheduleResult = ScheduleResult.SCHEDULED;
}
SelectHostResult(ScheduleResult scheduleResult) {
this.nodeInfo = null;
this.scheduleResult = scheduleResult;
}
}
private static final SelectHostResult SELECT_HOST_RESULT_INADEQUATE_TOTAL_CAPACITY =
new SelectHostResult(ScheduleResult.INADEQUATE_TOTAL_RESOURCES);
private static final SelectHostResult SELECT_HOST_RESULT_DELAYED_LOCALITY =
new SelectHostResult(ScheduleResult.DELAYED_LOCALITY);
private static final SelectHostResult SELECT_HOST_RESULT_DELAYED_RESOURCES =
new SelectHostResult(ScheduleResult.DELAYED_RESOURCES);
private static final class NodeBlacklistConf {
private final long minDelay;
private final long maxDelay;
private final float backoffFactor;
public NodeBlacklistConf(long minDelay, long maxDelay, float backoffFactor) {
this.minDelay = minDelay;
this.maxDelay = maxDelay;
this.backoffFactor = backoffFactor;
}
@Override
public String toString() {
return "NodeBlacklistConf{" +
"minDelay=" + minDelay +
", maxDelay=" + maxDelay +
", backoffFactor=" + backoffFactor +
'}';
}
}
@VisibleForTesting
static final class LocalityDelayConf {
private final long nodeLocalityDelay;
public LocalityDelayConf(long nodeLocalityDelay) {
this.nodeLocalityDelay = nodeLocalityDelay;
}
public long getNodeLocalityDelay() {
return nodeLocalityDelay;
}
@Override
public String toString() {
return "LocalityDelayConf{" +
"nodeLocalityDelay=" + nodeLocalityDelay +
'}';
}
}
public void updateQuery(UpdateQueryRequestProto request) {
if (request.hasGuaranteedTaskCount()) {
updateGuaranteedCount(request.getGuaranteedTaskCount());
}
}
void setTaskCommunicator(LlapTaskCommunicator communicator) {
this.communicator = communicator;
}
protected void sendUpdateMessageAsync(TaskInfo ti, boolean newState) {
WM_LOG.info("Sending message to " + ti.attemptId + ": " + newState);
communicator.startUpdateGuaranteed(ti.attemptId, ti.assignedNode, newState, UPDATE_CALLBACK, ti);
}
@VisibleForTesting
int getUnusedGuaranteedCount() {
return unusedGuaranteed;
}
/**
* A direct call from communicator to scheduler to propagate data that cannot be passed via Tez.
*/
public void taskInfoUpdated(TezTaskAttemptID attemptId, boolean isGuaranteed) {
TaskInfo ti = null;
writeLock.lock();
try {
ti = tasksById.get(attemptId);
if (ti == null) {
WM_LOG.warn("Unknown task from heartbeat " + attemptId);
return;
}
} finally {
writeLock.unlock();
}
boolean newState = false;
synchronized (ti) {
if (ti.isPendingUpdate) return; // A pending update is not done.
if (ti.isGuaranteed == null) return; // The task has terminated, out of date heartbeat.
if (ti.lastSetGuaranteed != null && ti.lastSetGuaranteed == isGuaranteed) {
return; // The heartbeat is consistent with what we have.
}
ti.lastSetGuaranteed = isGuaranteed;
if (isGuaranteed == ti.isGuaranteed) return; // Already consistent. Can happen w/null lSG.
// There could be races here, e.g. heartbeat delivered us the old value just after we have
// received a successful confirmation from the API, so we are about to overwrite the latter.
// We could solve this by adding a version or something like that; or by ignoring discrepancies
// unless we have previously received an update error for this task; however, the only effect
// right now are a few cheap redundant update calls; let's just do the simple thing.
newState = ti.isGuaranteed;
setUpdateStartedUnderTiLock(ti);
} // End of synchronized (ti)
WM_LOG.info("Sending an update based on inconsistent state from heartbeat for "
+ attemptId + ", " + newState);
sendUpdateMessageAsync(ti, newState);
}
}