/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/
package org.elasticsearch.cluster.service;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.message.ParameterizedMessage;
import org.elasticsearch.Assertions;
import org.elasticsearch.action.support.PlainActionFuture;
import org.elasticsearch.cluster.AckedClusterStateTaskListener;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.ClusterState.Builder;
import org.elasticsearch.cluster.ClusterStatePublicationEvent;
import org.elasticsearch.cluster.ClusterStateTaskConfig;
import org.elasticsearch.cluster.ClusterStateTaskExecutor;
import org.elasticsearch.cluster.ClusterStateTaskExecutor.ClusterTasksResult;
import org.elasticsearch.cluster.ClusterStateTaskListener;
import org.elasticsearch.cluster.coordination.ClusterStatePublisher;
import org.elasticsearch.cluster.coordination.FailedToCommitClusterStateException;
import org.elasticsearch.cluster.metadata.IndexAbstraction;
import org.elasticsearch.cluster.metadata.ProcessClusterEventTimeoutException;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.node.DiscoveryNodes;
import org.elasticsearch.common.Priority;
import org.elasticsearch.common.component.AbstractLifecycleComponent;
import org.elasticsearch.common.settings.ClusterSettings;
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.text.Text;
import org.elasticsearch.common.util.concurrent.CountDown;
import org.elasticsearch.common.util.concurrent.EsExecutors;
import org.elasticsearch.common.util.concurrent.EsRejectedExecutionException;
import org.elasticsearch.common.util.concurrent.FutureUtils;
import org.elasticsearch.common.util.concurrent.PrioritizedEsThreadPoolExecutor;
import org.elasticsearch.common.util.concurrent.ThreadContext;
import org.elasticsearch.core.Nullable;
import org.elasticsearch.core.TimeValue;
import org.elasticsearch.discovery.Discovery;
import org.elasticsearch.node.Node;
import org.elasticsearch.threadpool.Scheduler;
import org.elasticsearch.threadpool.ThreadPool;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.SortedMap;
import java.util.concurrent.TimeUnit;
import java.util.function.LongSupplier;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import static org.elasticsearch.common.util.concurrent.EsExecutors.daemonThreadFactory;
public class MasterService extends AbstractLifecycleComponent {
private static final Logger logger = LogManager.getLogger(MasterService.class);
public static final Setting<TimeValue> MASTER_SERVICE_SLOW_TASK_LOGGING_THRESHOLD_SETTING = Setting.positiveTimeSetting(
"cluster.service.slow_master_task_logging_threshold",
TimeValue.timeValueSeconds(10),
Setting.Property.Dynamic,
Setting.Property.NodeScope
);
public static final Setting<TimeValue> MASTER_SERVICE_STARVATION_LOGGING_THRESHOLD_SETTING = Setting.positiveTimeSetting(
"cluster.service.master_service_starvation_logging_threshold",
TimeValue.timeValueMinutes(5),
Setting.Property.NodeScope
);
static final String MASTER_UPDATE_THREAD_NAME = "masterService#updateTask";
ClusterStatePublisher clusterStatePublisher;
private final String nodeName;
private java.util.function.Supplier<ClusterState> clusterStateSupplier;
private volatile TimeValue slowTaskLoggingThreshold;
private final TimeValue starvationLoggingThreshold;
protected final ThreadPool threadPool;
private volatile PrioritizedEsThreadPoolExecutor threadPoolExecutor;
private volatile Batcher taskBatcher;
private final ClusterStateUpdateStatsTracker clusterStateUpdateStatsTracker = new ClusterStateUpdateStatsTracker();
public MasterService(Settings settings, ClusterSettings clusterSettings, ThreadPool threadPool) {
this.nodeName = Objects.requireNonNull(Node.NODE_NAME_SETTING.get(settings));
this.slowTaskLoggingThreshold = MASTER_SERVICE_SLOW_TASK_LOGGING_THRESHOLD_SETTING.get(settings);
clusterSettings.addSettingsUpdateConsumer(MASTER_SERVICE_SLOW_TASK_LOGGING_THRESHOLD_SETTING, this::setSlowTaskLoggingThreshold);
this.starvationLoggingThreshold = MASTER_SERVICE_STARVATION_LOGGING_THRESHOLD_SETTING.get(settings);
this.threadPool = threadPool;
}
private void setSlowTaskLoggingThreshold(TimeValue slowTaskLoggingThreshold) {
this.slowTaskLoggingThreshold = slowTaskLoggingThreshold;
}
public synchronized void setClusterStatePublisher(ClusterStatePublisher publisher) {
clusterStatePublisher = publisher;
}
public synchronized void setClusterStateSupplier(java.util.function.Supplier<ClusterState> clusterStateSupplier) {
this.clusterStateSupplier = clusterStateSupplier;
}
@Override
protected synchronized void doStart() {
Objects.requireNonNull(clusterStatePublisher, "please set a cluster state publisher before starting");
Objects.requireNonNull(clusterStateSupplier, "please set a cluster state supplier before starting");
threadPoolExecutor = createThreadPoolExecutor();
taskBatcher = new Batcher(logger, threadPoolExecutor);
}
protected PrioritizedEsThreadPoolExecutor createThreadPoolExecutor() {
return EsExecutors.newSinglePrioritizing(
nodeName + "/" + MASTER_UPDATE_THREAD_NAME,
daemonThreadFactory(nodeName, MASTER_UPDATE_THREAD_NAME),
threadPool.getThreadContext(),
threadPool.scheduler(),
new MasterServiceStarvationWatcher(
starvationLoggingThreshold.getMillis(),
threadPool::relativeTimeInMillis,
() -> threadPoolExecutor
)
);
}
public ClusterStateUpdateStats getClusterStateUpdateStats() {
return clusterStateUpdateStatsTracker.getStatistics();
}
@SuppressWarnings("unchecked")
class Batcher extends TaskBatcher {
Batcher(Logger logger, PrioritizedEsThreadPoolExecutor threadExecutor) {
super(logger, threadExecutor);
}
@Override
protected void onTimeout(List<? extends BatchedTask> tasks, TimeValue timeout) {
threadPool.generic()
.execute(
() -> tasks.forEach(
task -> ((UpdateTask) task).listener.onFailure(
task.source,
new ProcessClusterEventTimeoutException(timeout, task.source)
)
)
);
}
@Override
protected void run(Object batchingKey, List<? extends BatchedTask> tasks, String tasksSummary) {
ClusterStateTaskExecutor<Object> taskExecutor = (ClusterStateTaskExecutor<Object>) batchingKey;
List<UpdateTask> updateTasks = (List<UpdateTask>) tasks;
runTasks(new TaskInputs(taskExecutor, updateTasks, tasksSummary));
}
class UpdateTask extends BatchedTask {
final ClusterStateTaskListener listener;
UpdateTask(
Priority priority,
String source,
Object task,
ClusterStateTaskListener listener,
ClusterStateTaskExecutor<?> executor
) {
super(priority, source, executor, task);
this.listener = listener;
}
@Override
public String describeTasks(List<? extends BatchedTask> tasks) {
return ((ClusterStateTaskExecutor<Object>) batchingKey).describeTasks(
tasks.stream().map(BatchedTask::getTask).collect(Collectors.toList())
);
}
}
}
@Override
protected synchronized void doStop() {
ThreadPool.terminate(threadPoolExecutor, 10, TimeUnit.SECONDS);
}
@Override
protected synchronized void doClose() {}
/**
* The current cluster state exposed by the discovery layer. Package-visible for tests.
*/
ClusterState state() {
return clusterStateSupplier.get();
}
public static boolean isMasterUpdateThread() {
return Thread.currentThread().getName().contains('[' + MASTER_UPDATE_THREAD_NAME + ']');
}
public static boolean assertMasterUpdateThread() {
assert isMasterUpdateThread() : "not called from the master service thread";
return true;
}
public static boolean assertNotMasterUpdateThread(String reason) {
assert isMasterUpdateThread() == false
: "Expected current thread [" + Thread.currentThread() + "] to not be the master service thread. Reason: [" + reason + "]";
return true;
}
private void runTasks(TaskInputs taskInputs) {
final String summary = taskInputs.summary;
if (lifecycle.started() == false) {
logger.debug("processing [{}]: ignoring, master service not started", summary);
return;
}
logger.debug("executing cluster state update for [{}]", summary);
final ClusterState previousClusterState = state();
if (previousClusterState.nodes().isLocalNodeElectedMaster() == false && taskInputs.runOnlyWhenMaster()) {
logger.debug("failing [{}]: local node is no longer master", summary);
taskInputs.onNoLongerMaster();
return;
}
final long computationStartTime = threadPool.rawRelativeTimeInMillis();
final TaskOutputs taskOutputs = calculateTaskOutputs(taskInputs, previousClusterState);
taskOutputs.notifyFailedTasks();
final TimeValue computationTime = getTimeSince(computationStartTime);
logExecutionTime(computationTime, "compute cluster state update", summary);
if (taskOutputs.clusterStateUnchanged()) {
final long notificationStartTime = threadPool.rawRelativeTimeInMillis();
taskOutputs.notifySuccessfulTasksOnUnchangedClusterState();
final TimeValue executionTime = getTimeSince(notificationStartTime);
logExecutionTime(executionTime, "notify listeners on unchanged cluster state", summary);
clusterStateUpdateStatsTracker.onUnchangedClusterState(computationTime.millis(), executionTime.millis());
} else {
final ClusterState newClusterState = taskOutputs.newClusterState;
if (logger.isTraceEnabled()) {
logger.trace("cluster state updated, source [{}]\n{}", summary, newClusterState);
} else {
logger.debug("cluster state updated, version [{}], source [{}]", newClusterState.version(), summary);
}
final long publicationStartTime = threadPool.rawRelativeTimeInMillis();
try {
final ClusterStatePublicationEvent clusterStatePublicationEvent = new ClusterStatePublicationEvent(
summary,
previousClusterState,
newClusterState,
computationTime.millis(),
publicationStartTime
);
// new cluster state, notify all listeners
final DiscoveryNodes.Delta nodesDelta = newClusterState.nodes().delta(previousClusterState.nodes());
if (nodesDelta.hasChanges() && logger.isInfoEnabled()) {
String nodesDeltaSummary = nodesDelta.shortSummary();
if (nodesDeltaSummary.length() > 0) {
logger.info(
"{}, term: {}, version: {}, delta: {}",
summary,
newClusterState.term(),
newClusterState.version(),
nodesDeltaSummary
);
}
}
logger.debug("publishing cluster state version [{}]", newClusterState.version());
publish(clusterStatePublicationEvent, taskOutputs);
} catch (Exception e) {
handleException(summary, publicationStartTime, newClusterState, e);
}
}
}
private TimeValue getTimeSince(long startTimeMillis) {
return TimeValue.timeValueMillis(Math.max(0, threadPool.rawRelativeTimeInMillis() - startTimeMillis));
}
protected void publish(ClusterStatePublicationEvent clusterStatePublicationEvent, TaskOutputs taskOutputs) {
final PlainActionFuture<Void> fut = new PlainActionFuture<Void>() {
@Override
protected boolean blockingAllowed() {
return isMasterUpdateThread() || super.blockingAllowed();
}
};
clusterStatePublisher.publish(
clusterStatePublicationEvent,
fut,
taskOutputs.createAckListener(threadPool, clusterStatePublicationEvent.getNewState())
);
// indefinitely wait for publication to complete
try {
FutureUtils.get(fut);
onPublicationSuccess(clusterStatePublicationEvent, taskOutputs);
} catch (Exception e) {
onPublicationFailed(clusterStatePublicationEvent, taskOutputs, e);
}
}
void onPublicationSuccess(ClusterStatePublicationEvent clusterStatePublicationEvent, TaskOutputs taskOutputs) {
final long notificationStartTime = threadPool.rawRelativeTimeInMillis();
taskOutputs.processedDifferentClusterState(clusterStatePublicationEvent.getOldState(), clusterStatePublicationEvent.getNewState());
try {
taskOutputs.clusterStatePublished(clusterStatePublicationEvent);
} catch (Exception e) {
logger.error(
() -> new ParameterizedMessage(
"exception thrown while notifying executor of new cluster state publication [{}]",
clusterStatePublicationEvent.getSummary()
),
e
);
}
final TimeValue executionTime = getTimeSince(notificationStartTime);
logExecutionTime(
executionTime,
"notify listeners on successful publication of cluster state (version: "
+ clusterStatePublicationEvent.getNewState().version()
+ ", uuid: "
+ clusterStatePublicationEvent.getNewState().stateUUID()
+ ')',
clusterStatePublicationEvent.getSummary()
);
clusterStateUpdateStatsTracker.onPublicationSuccess(
threadPool.rawRelativeTimeInMillis(),
clusterStatePublicationEvent,
executionTime.millis()
);
}
void onPublicationFailed(ClusterStatePublicationEvent clusterStatePublicationEvent, TaskOutputs taskOutputs, Exception exception) {
if (exception instanceof FailedToCommitClusterStateException) {
final long notificationStartTime = threadPool.rawRelativeTimeInMillis();
final long version = clusterStatePublicationEvent.getNewState().version();
logger.warn(
() -> new ParameterizedMessage(
"failing [{}]: failed to commit cluster state version [{}]",
clusterStatePublicationEvent.getSummary(),
version
),
exception
);
taskOutputs.publishingFailed((FailedToCommitClusterStateException) exception);
final long notificationMillis = threadPool.rawRelativeTimeInMillis() - notificationStartTime;
clusterStateUpdateStatsTracker.onPublicationFailure(
threadPool.rawRelativeTimeInMillis(),
clusterStatePublicationEvent,
notificationMillis
);
} else {
assert publicationMayFail() : exception;
clusterStateUpdateStatsTracker.onPublicationFailure(threadPool.rawRelativeTimeInMillis(), clusterStatePublicationEvent, 0L);
handleException(
clusterStatePublicationEvent.getSummary(),
clusterStatePublicationEvent.getPublicationStartTimeMillis(),
clusterStatePublicationEvent.getNewState(),
exception
);
}
}
protected boolean publicationMayFail() {
return false;
}
private void handleException(String summary, long startTimeMillis, ClusterState newClusterState, Exception e) {
final TimeValue executionTime = getTimeSince(startTimeMillis);
final long version = newClusterState.version();
final String stateUUID = newClusterState.stateUUID();
final String fullState = newClusterState.toString();
logger.warn(
new ParameterizedMessage(
"took [{}] and then failed to publish updated cluster state (version: {}, uuid: {}) for [{}]:\n{}",
executionTime,
version,
stateUUID,
summary,
fullState
),
e
);
// TODO: do we want to call updateTask.onFailure here?
}
private TaskOutputs calculateTaskOutputs(TaskInputs taskInputs, ClusterState previousClusterState) {
ClusterTasksResult<Object> clusterTasksResult = executeTasks(taskInputs, previousClusterState);
ClusterState newClusterState = patchVersions(previousClusterState, clusterTasksResult);
return new TaskOutputs(
taskInputs,
previousClusterState,
newClusterState,
getNonFailedTasks(taskInputs, clusterTasksResult),
clusterTasksResult.executionResults
);
}
private ClusterState patchVersions(ClusterState previousClusterState, ClusterTasksResult<?> executionResult) {
ClusterState newClusterState = executionResult.resultingState;
if (previousClusterState != newClusterState) {
// only the master controls the version numbers
final SortedMap<String, IndexAbstraction> previousIndicesLookup = newClusterState.metadata().getIndicesLookup();
Builder builder = incrementVersion(newClusterState);
if (previousClusterState.routingTable() != newClusterState.routingTable()) {
builder.routingTable(newClusterState.routingTable().withIncrementedVersion());
}
if (previousClusterState.metadata() != newClusterState.metadata()) {
builder.metadata(newClusterState.metadata().withIncrementedVersion());
}
newClusterState = builder.build();
assert previousIndicesLookup == newClusterState.metadata().getIndicesLookup();
}
return newClusterState;
}
public Builder incrementVersion(ClusterState clusterState) {
return ClusterState.builder(clusterState).incrementVersion();
}
/**
* Submits a cluster state update task; unlike {@link #submitStateUpdateTask(String, Object, ClusterStateTaskConfig,
* ClusterStateTaskExecutor, ClusterStateTaskListener)}, submitted updates will not be batched.
*
* @param source the source of the cluster state update task
* @param updateTask the full context for the cluster state update
* task
*
*/
public <T extends ClusterStateTaskConfig & ClusterStateTaskExecutor<T> & ClusterStateTaskListener> void submitStateUpdateTask(
String source,
T updateTask
) {
submitStateUpdateTask(source, updateTask, updateTask, updateTask, updateTask);
}
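// Illustrative sketch, not part of the original file: callers typically pass a single object that
// fills all three roles (config, executor, listener), e.g. an org.elasticsearch.cluster.ClusterStateUpdateTask.
// The `masterService` reference below is an assumption for the example.
//
//     masterService.submitStateUpdateTask("example-noop-update", new ClusterStateUpdateTask() {
//         @Override
//         public ClusterState execute(ClusterState currentState) {
//             // return the same instance to signal "no change", or a rebuilt state to publish
//             return currentState;
//         }
//
//         @Override
//         public void onFailure(String source, Exception e) {
//             // e.g. log the failure; also invoked with ProcessClusterEventTimeoutException on timeout
//         }
//     });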
/**
* Submits a cluster state update task; submitted updates will be
* batched across the same instance of executor. The exact batching
* semantics depend on the underlying implementation but a rough
* guideline is that if the update task is submitted while there
* are pending update tasks for the same executor, these update
* tasks will all be executed on the executor in a single batch
*
* @param source the source of the cluster state update task
* @param task the state needed for the cluster state update task
* @param config the cluster state update task configuration
* @param executor the cluster state update task executor; tasks
* that share the same executor will be executed
* in batches on this executor
* @param listener callback after the cluster state update task
* completes
* @param <T> the type of the cluster state update task state
*
*/
public <T> void submitStateUpdateTask(
String source,
T task,
ClusterStateTaskConfig config,
ClusterStateTaskExecutor<T> executor,
ClusterStateTaskListener listener
) {
submitStateUpdateTasks(source, Collections.singletonMap(task, listener), config, executor);
}
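// Illustrative sketch with assumed names (`masterService`, `MyTask`, `sharedMyTaskExecutor`): the
// executor instance is the batching key, so tasks submitted against it while others are still
// pending may be folded into one batch on the master update thread.
//
//     MyTask task = new MyTask();
//     masterService.submitStateUpdateTask(
//         "example-batched-update",
//         task,
//         ClusterStateTaskConfig.build(Priority.NORMAL),
//         sharedMyTaskExecutor,   // ClusterStateTaskExecutor<MyTask> shared across submissions
//         new ClusterStateTaskListener() {
//             @Override
//             public void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState) {}
//
//             @Override
//             public void onFailure(String source, Exception e) {}
//         }
//     );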
/**
* Output created by executing a set of tasks provided as TaskInputs
*/
class TaskOutputs {
final TaskInputs taskInputs;
final ClusterState previousClusterState;
final ClusterState newClusterState;
final List<Batcher.UpdateTask> nonFailedTasks;
final Map<Object, ClusterStateTaskExecutor.TaskResult> executionResults;
TaskOutputs(
TaskInputs taskInputs,
ClusterState previousClusterState,
ClusterState newClusterState,
List<Batcher.UpdateTask> nonFailedTasks,
Map<Object, ClusterStateTaskExecutor.TaskResult> executionResults
) {
this.taskInputs = taskInputs;
this.previousClusterState = previousClusterState;
this.newClusterState = newClusterState;
this.nonFailedTasks = nonFailedTasks;
this.executionResults = executionResults;
}
void publishingFailed(FailedToCommitClusterStateException t) {
nonFailedTasks.forEach(task -> task.listener.onFailure(task.source(), t));
}
void processedDifferentClusterState(ClusterState previousClusterState, ClusterState newClusterState) {
nonFailedTasks.forEach(task -> task.listener.clusterStateProcessed(task.source(), previousClusterState, newClusterState));
}
void clusterStatePublished(ClusterStatePublicationEvent clusterStatePublicationEvent) {
taskInputs.executor.clusterStatePublished(clusterStatePublicationEvent);
}
Discovery.AckListener createAckListener(ThreadPool threadPool, ClusterState newClusterState) {
return new DelegatingAckListener(
nonFailedTasks.stream()
.filter(task -> task.listener instanceof AckedClusterStateTaskListener)
.map(
task -> new AckCountDownListener(
(AckedClusterStateTaskListener) task.listener,
newClusterState.version(),
newClusterState.nodes(),
threadPool
)
)
.collect(Collectors.toList())
);
}
boolean clusterStateUnchanged() {
return previousClusterState == newClusterState;
}
void notifyFailedTasks() {
// fail all tasks that have failed
for (Batcher.UpdateTask updateTask : taskInputs.updateTasks) {
assert executionResults.containsKey(updateTask.task) : "missing " + updateTask;
final ClusterStateTaskExecutor.TaskResult taskResult = executionResults.get(updateTask.task);
if (taskResult.isSuccess() == false) {
updateTask.listener.onFailure(updateTask.source(), taskResult.getFailure());
}
}
}
void notifySuccessfulTasksOnUnchangedClusterState() {
nonFailedTasks.forEach(task -> {
if (task.listener instanceof AckedClusterStateTaskListener) {
// no need to wait for ack if nothing changed, the update can be counted as acknowledged
((AckedClusterStateTaskListener) task.listener).onAllNodesAcked(null);
}
task.listener.clusterStateProcessed(task.source(), newClusterState, newClusterState);
});
}
}
/**
* Returns the tasks that are pending.
*/
public List<PendingClusterTask> pendingTasks() {
return Arrays.stream(threadPoolExecutor.getPending()).map(pending -> {
assert pending.task instanceof SourcePrioritizedRunnable
: "thread pool executor should only use SourcePrioritizedRunnable instances but found: "
+ pending.task.getClass().getName();
SourcePrioritizedRunnable task = (SourcePrioritizedRunnable) pending.task;
return new PendingClusterTask(
pending.insertionOrder,
pending.priority,
new Text(task.source()),
task.getAgeInMillis(),
pending.executing
);
}).collect(Collectors.toList());
}
/**
* Returns the number of currently pending tasks.
*/
public int numberOfPendingTasks() {
return threadPoolExecutor.getNumberOfPendingTasks();
}
/**
* Returns the maximum wait time for tasks in the queue
*
* @return A zero time value if the queue is empty, otherwise the time value oldest task waiting in the queue
*/
public TimeValue getMaxTaskWaitTime() {
return threadPoolExecutor.getMaxTaskWaitTime();
}
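// Illustrative sketch (assumes a `masterService` reference): the pending-task accessors above can
// back a simple diagnostic dump of the master task queue.
//
//     int pendingCount = masterService.numberOfPendingTasks();
//     TimeValue oldestWait = masterService.getMaxTaskWaitTime();   // zero when the queue is empty
//     for (PendingClusterTask pendingTask : masterService.pendingTasks()) {
//         logger.info("pending task [{}] priority [{}] in queue for [{}ms]",
//             pendingTask.getSource(), pendingTask.getPriority(), pendingTask.getTimeInQueueInMillis());
//     }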
private SafeClusterStateTaskListener safe(ClusterStateTaskListener listener, Supplier<ThreadContext.StoredContext> contextSupplier) {
if (listener instanceof AckedClusterStateTaskListener) {
return new SafeAckedClusterStateTaskListener((AckedClusterStateTaskListener) listener, contextSupplier, logger);
} else {
return new SafeClusterStateTaskListener(listener, contextSupplier, logger);
}
}
private static class SafeClusterStateTaskListener implements ClusterStateTaskListener {
private final ClusterStateTaskListener listener;
protected final Supplier<ThreadContext.StoredContext> context;
private final Logger logger;
SafeClusterStateTaskListener(ClusterStateTaskListener listener, Supplier<ThreadContext.StoredContext> context, Logger logger) {
this.listener = listener;
this.context = context;
this.logger = logger;
}
@Override
public void onFailure(String source, Exception e) {
try (ThreadContext.StoredContext ignore = context.get()) {
listener.onFailure(source, e);
} catch (Exception inner) {
inner.addSuppressed(e);
logger.error(() -> new ParameterizedMessage("exception thrown by listener notifying of failure from [{}]", source), inner);
}
}
@Override
public void onNoLongerMaster(String source) {
try (ThreadContext.StoredContext ignore = context.get()) {
listener.onNoLongerMaster(source);
} catch (Exception e) {
logger.error(
() -> new ParameterizedMessage("exception thrown by listener while notifying no longer master from [{}]", source),
e
);
}
}
@Override
public void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState) {
try (ThreadContext.StoredContext ignore = context.get()) {
listener.clusterStateProcessed(source, oldState, newState);
} catch (Exception e) {
logger.error(
() -> new ParameterizedMessage(
"exception thrown by listener while notifying of cluster state processed from [{}], old cluster state:\n"
+ "{}\nnew cluster state:\n{}",
source,
oldState,
newState
),
e
);
}
}
}
private static class SafeAckedClusterStateTaskListener extends SafeClusterStateTaskListener implements AckedClusterStateTaskListener {
private final AckedClusterStateTaskListener listener;
private final Logger logger;
SafeAckedClusterStateTaskListener(
AckedClusterStateTaskListener listener,
Supplier<ThreadContext.StoredContext> context,
Logger logger
) {
super(listener, context, logger);
this.listener = listener;
this.logger = logger;
}
@Override
public boolean mustAck(DiscoveryNode discoveryNode) {
return listener.mustAck(discoveryNode);
}
@Override
public void onAllNodesAcked(@Nullable Exception e) {
try (ThreadContext.StoredContext ignore = context.get()) {
listener.onAllNodesAcked(e);
} catch (Exception inner) {
inner.addSuppressed(e);
logger.error("exception thrown by listener while notifying on all nodes acked", inner);
}
}
@Override
public void onAckTimeout() {
try (ThreadContext.StoredContext ignore = context.get()) {
listener.onAckTimeout();
} catch (Exception e) {
logger.error("exception thrown by listener while notifying on ack timeout", e);
}
}
@Override
public TimeValue ackTimeout() {
return listener.ackTimeout();
}
}
private void logExecutionTime(TimeValue executionTime, String activity, String summary) {
if (executionTime.getMillis() > slowTaskLoggingThreshold.getMillis()) {
logger.warn(
"took [{}/{}ms] to {} for [{}], which exceeds the warn threshold of [{}]",
executionTime,
executionTime.getMillis(),
activity,
summary,
slowTaskLoggingThreshold
);
} else {
logger.debug("took [{}] to {} for [{}]", executionTime, activity, summary);
}
}
private static class DelegatingAckListener implements Discovery.AckListener {
private final List<Discovery.AckListener> listeners;
private DelegatingAckListener(List<Discovery.AckListener> listeners) {
this.listeners = listeners;
}
@Override
public void onCommit(TimeValue commitTime) {
for (Discovery.AckListener listener : listeners) {
listener.onCommit(commitTime);
}
}
@Override
public void onNodeAck(DiscoveryNode node, @Nullable Exception e) {
for (Discovery.AckListener listener : listeners) {
listener.onNodeAck(node, e);
}
}
}
private static class AckCountDownListener implements Discovery.AckListener {
private static final Logger logger = LogManager.getLogger(AckCountDownListener.class);
private final AckedClusterStateTaskListener ackedTaskListener;
private final CountDown countDown;
private final DiscoveryNode masterNode;
private final ThreadPool threadPool;
private final long clusterStateVersion;
private volatile Scheduler.Cancellable ackTimeoutCallback;
private Exception lastFailure;
AckCountDownListener(
AckedClusterStateTaskListener ackedTaskListener,
long clusterStateVersion,
DiscoveryNodes nodes,
ThreadPool threadPool
) {
this.ackedTaskListener = ackedTaskListener;
this.clusterStateVersion = clusterStateVersion;
this.threadPool = threadPool;
this.masterNode = nodes.getMasterNode();
int countDown = 0;
for (DiscoveryNode node : nodes) {
// we always wait for at least the master node
if (node.equals(masterNode) || ackedTaskListener.mustAck(node)) {
countDown++;
}
}
logger.trace("expecting {} acknowledgements for cluster_state update (version: {})", countDown, clusterStateVersion);
this.countDown = new CountDown(countDown + 1); // we also wait for onCommit to be called
}
@Override
public void onCommit(TimeValue commitTime) {
TimeValue ackTimeout = ackedTaskListener.ackTimeout();
if (ackTimeout == null) {
ackTimeout = TimeValue.ZERO;
}
final TimeValue timeLeft = TimeValue.timeValueNanos(Math.max(0, ackTimeout.nanos() - commitTime.nanos()));
if (timeLeft.nanos() == 0L) {
onTimeout();
} else if (countDown.countDown()) {
finish();
} else {
this.ackTimeoutCallback = threadPool.schedule(this::onTimeout, timeLeft, ThreadPool.Names.GENERIC);
// re-check if onNodeAck has not completed while we were scheduling the timeout
if (countDown.isCountedDown()) {
ackTimeoutCallback.cancel();
}
}
}
@Override
public void onNodeAck(DiscoveryNode node, @Nullable Exception e) {
if (node.equals(masterNode) == false && ackedTaskListener.mustAck(node) == false) {
return;
}
if (e == null) {
logger.trace("ack received from node [{}], cluster_state update (version: {})", node, clusterStateVersion);
} else {
this.lastFailure = e;
logger.debug(
() -> new ParameterizedMessage(
"ack received from node [{}], cluster_state update (version: {})",
node,
clusterStateVersion
),
e
);
}
if (countDown.countDown()) {
finish();
}
}
private void finish() {
logger.trace("all expected nodes acknowledged cluster_state update (version: {})", clusterStateVersion);
if (ackTimeoutCallback != null) {
ackTimeoutCallback.cancel();
}
ackedTaskListener.onAllNodesAcked(lastFailure);
}
public void onTimeout() {
if (countDown.fastForward()) {
logger.trace("timeout waiting for acknowledgement for cluster_state update (version: {})", clusterStateVersion);
ackedTaskListener.onAckTimeout();
}
}
}
private ClusterTasksResult<Object> executeTasks(TaskInputs taskInputs, ClusterState previousClusterState) {
ClusterTasksResult<Object> clusterTasksResult;
try {
List<Object> inputs = taskInputs.updateTasks.stream().map(tUpdateTask -> tUpdateTask.task).collect(Collectors.toList());
clusterTasksResult = taskInputs.executor.execute(previousClusterState, inputs);
if (previousClusterState != clusterTasksResult.resultingState
&& previousClusterState.nodes().isLocalNodeElectedMaster()
&& (clusterTasksResult.resultingState.nodes().isLocalNodeElectedMaster() == false)) {
throw new AssertionError("update task submitted to MasterService cannot remove master");
}
} catch (Exception e) {
logger.trace(
() -> new ParameterizedMessage(
"failed to execute cluster state update (on version: [{}], uuid: [{}]) for [{}]\n{}{}{}",
previousClusterState.version(),
previousClusterState.stateUUID(),
taskInputs.summary,
previousClusterState.nodes(),
previousClusterState.routingTable(),
previousClusterState.getRoutingNodes()
), // may be expensive => construct message lazily
e
);
clusterTasksResult = ClusterTasksResult.builder()
.failures(taskInputs.updateTasks.stream().map(updateTask -> updateTask.task)::iterator, e)
.build(previousClusterState);
}
assert clusterTasksResult.executionResults != null;
assert clusterTasksResult.executionResults.size() == taskInputs.updateTasks.size()
: String.format(
Locale.ROOT,
"expected [%d] task result%s but was [%d]",
taskInputs.updateTasks.size(),
taskInputs.updateTasks.size() == 1 ? "" : "s",
clusterTasksResult.executionResults.size()
);
if (Assertions.ENABLED) {
ClusterTasksResult<Object> finalClusterTasksResult = clusterTasksResult;
taskInputs.updateTasks.forEach(
updateTask -> {
assert finalClusterTasksResult.executionResults.containsKey(updateTask.task) : "missing task result for " + updateTask;
}
);
}
return clusterTasksResult;
}
private List<Batcher.UpdateTask> getNonFailedTasks(TaskInputs taskInputs, ClusterTasksResult<Object> clusterTasksResult) {
return taskInputs.updateTasks.stream().filter(updateTask -> {
assert clusterTasksResult.executionResults.containsKey(updateTask.task) : "missing " + updateTask;
final ClusterStateTaskExecutor.TaskResult taskResult = clusterTasksResult.executionResults.get(updateTask.task);
return taskResult.isSuccess();
}).collect(Collectors.toList());
}
/**
* Represents a set of tasks to be processed together with their executor
*/
private class TaskInputs {
final String summary;
final List<Batcher.UpdateTask> updateTasks;
final ClusterStateTaskExecutor<Object> executor;
TaskInputs(ClusterStateTaskExecutor<Object> executor, List<Batcher.UpdateTask> updateTasks, String summary) {
this.summary = summary;
this.executor = executor;
this.updateTasks = updateTasks;
}
boolean runOnlyWhenMaster() {
return executor.runOnlyOnMaster();
}
void onNoLongerMaster() {
updateTasks.forEach(task -> task.listener.onNoLongerMaster(task.source()));
}
}
/**
* Submits a batch of cluster state update tasks; submitted updates are guaranteed to be processed together,
* potentially with more tasks of the same executor.
*
* @param source the source of the cluster state update task
* @param tasks a map of update tasks and their corresponding listeners
* @param config the cluster state update task configuration
* @param executor the cluster state update task executor; tasks
* that share the same executor will be executed
* in batches on this executor
* @param <T> the type of the cluster state update task state
*
*/
public <T> void submitStateUpdateTasks(
final String source,
final Map<T, ClusterStateTaskListener> tasks,
final ClusterStateTaskConfig config,
final ClusterStateTaskExecutor<T> executor
) {
if (lifecycle.started() == false) {
return;
}
final ThreadContext threadContext = threadPool.getThreadContext();
final Supplier<ThreadContext.StoredContext> supplier = threadContext.newRestorableContext(true);
try (ThreadContext.StoredContext ignore = threadContext.stashContext()) {
threadContext.markAsSystemContext();
List<Batcher.UpdateTask> safeTasks = tasks.entrySet()
.stream()
.map(e -> taskBatcher.new UpdateTask(config.priority(), source, e.getKey(), safe(e.getValue(), supplier), executor))
.collect(Collectors.toList());
taskBatcher.submitTasks(safeTasks, config.timeout());
} catch (EsRejectedExecutionException e) {
// ignore cases where we are shutting down..., there is really nothing interesting
// to be done here...
if (lifecycle.stoppedOrClosed() == false) {
throw e;
}
}
}
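// Illustrative sketch with assumed names (`masterService`, `RolloverTask`, `rolloverExecutor`,
// listener variables): several tasks handed over in one call share a config and executor, and the
// executor may see them (plus any compatible pending tasks) as a single batch.
//
//     Map<RolloverTask, ClusterStateTaskListener> tasks = new HashMap<>();
//     tasks.put(new RolloverTask("logs-1"), listenerForLogs1);
//     tasks.put(new RolloverTask("logs-2"), listenerForLogs2);
//     masterService.submitStateUpdateTasks(
//         "example-rollover",
//         tasks,
//         ClusterStateTaskConfig.build(Priority.URGENT),
//         rolloverExecutor   // shared ClusterStateTaskExecutor<RolloverTask>
//     );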
private static class MasterServiceStarvationWatcher implements PrioritizedEsThreadPoolExecutor.StarvationWatcher {
private final long warnThreshold;
private final LongSupplier nowMillisSupplier;
private final Supplier<PrioritizedEsThreadPoolExecutor> threadPoolExecutorSupplier;
// accesses of these mutable fields are synchronized (on this)
private long lastLogMillis;
private long nonemptySinceMillis;
private boolean isEmpty = true;
MasterServiceStarvationWatcher(
long warnThreshold,
LongSupplier nowMillisSupplier,
Supplier<PrioritizedEsThreadPoolExecutor> threadPoolExecutorSupplier
) {
this.nowMillisSupplier = nowMillisSupplier;
this.threadPoolExecutorSupplier = threadPoolExecutorSupplier;
this.warnThreshold = warnThreshold;
}
@Override
public synchronized void onEmptyQueue() {
isEmpty = true;
}
@Override
public void onNonemptyQueue() {
final long nowMillis = nowMillisSupplier.getAsLong();
final long nonemptyDurationMillis;
synchronized (this) {
if (isEmpty) {
isEmpty = false;
nonemptySinceMillis = nowMillis;
lastLogMillis = nowMillis;
return;
}
if (nowMillis - lastLogMillis < warnThreshold) {
return;
}
lastLogMillis = nowMillis;
nonemptyDurationMillis = nowMillis - nonemptySinceMillis;
}
final PrioritizedEsThreadPoolExecutor threadPoolExecutor = threadPoolExecutorSupplier.get();
final TimeValue maxTaskWaitTime = threadPoolExecutor.getMaxTaskWaitTime();
logger.warn(
"pending task queue has been nonempty for [{}/{}ms] which is longer than the warn threshold of [{}ms];"
+ " there are currently [{}] pending tasks, the oldest of which has age [{}/{}ms]",
TimeValue.timeValueMillis(nonemptyDurationMillis),
nonemptyDurationMillis,
warnThreshold,
threadPoolExecutor.getNumberOfPendingTasks(),
maxTaskWaitTime,
maxTaskWaitTime.millis()
);
}
}
private static class ClusterStateUpdateStatsTracker {
private long unchangedTaskCount;
private long publicationSuccessCount;
private long publicationFailureCount;
private long unchangedComputationElapsedMillis;
private long unchangedNotificationElapsedMillis;
private long successfulComputationElapsedMillis;
private long successfulPublicationElapsedMillis;
private long successfulContextConstructionElapsedMillis;
private long successfulCommitElapsedMillis;
private long successfulCompletionElapsedMillis;
private long successfulMasterApplyElapsedMillis;
private long successfulNotificationElapsedMillis;
private long failedComputationElapsedMillis;
private long failedPublicationElapsedMillis;
private long failedContextConstructionElapsedMillis;
private long failedCommitElapsedMillis;
private long failedCompletionElapsedMillis;
private long failedMasterApplyElapsedMillis;
private long failedNotificationElapsedMillis;
synchronized void onUnchangedClusterState(long computationElapsedMillis, long notificationElapsedMillis) {
unchangedTaskCount += 1;
unchangedComputationElapsedMillis += computationElapsedMillis;
unchangedNotificationElapsedMillis += notificationElapsedMillis;
}
synchronized void onPublicationSuccess(
long currentTimeMillis,
ClusterStatePublicationEvent clusterStatePublicationEvent,
long notificationElapsedMillis
) {
publicationSuccessCount += 1;
successfulComputationElapsedMillis += clusterStatePublicationEvent.getComputationTimeMillis();
successfulPublicationElapsedMillis += currentTimeMillis - clusterStatePublicationEvent.getPublicationStartTimeMillis();
successfulContextConstructionElapsedMillis += clusterStatePublicationEvent.getPublicationContextConstructionElapsedMillis();
successfulCommitElapsedMillis += clusterStatePublicationEvent.getPublicationCommitElapsedMillis();
successfulCompletionElapsedMillis += clusterStatePublicationEvent.getPublicationCompletionElapsedMillis();
successfulMasterApplyElapsedMillis += clusterStatePublicationEvent.getMasterApplyElapsedMillis();
successfulNotificationElapsedMillis += notificationElapsedMillis;
}
synchronized void onPublicationFailure(
long currentTimeMillis,
ClusterStatePublicationEvent clusterStatePublicationEvent,
long notificationMillis
) {
publicationFailureCount += 1;
failedComputationElapsedMillis += clusterStatePublicationEvent.getComputationTimeMillis();
failedPublicationElapsedMillis += currentTimeMillis - clusterStatePublicationEvent.getPublicationStartTimeMillis();
failedContextConstructionElapsedMillis += clusterStatePublicationEvent.maybeGetPublicationContextConstructionElapsedMillis();
failedCommitElapsedMillis += clusterStatePublicationEvent.maybeGetPublicationCommitElapsedMillis();
failedCompletionElapsedMillis += clusterStatePublicationEvent.maybeGetPublicationCompletionElapsedMillis();
failedMasterApplyElapsedMillis += clusterStatePublicationEvent.maybeGetMasterApplyElapsedMillis();
failedNotificationElapsedMillis += notificationMillis;
}
synchronized ClusterStateUpdateStats getStatistics() {
return new ClusterStateUpdateStats(
unchangedTaskCount,
publicationSuccessCount,
publicationFailureCount,
unchangedComputationElapsedMillis,
unchangedNotificationElapsedMillis,
successfulComputationElapsedMillis,
successfulPublicationElapsedMillis,
successfulContextConstructionElapsedMillis,
successfulCommitElapsedMillis,
successfulCompletionElapsedMillis,
successfulMasterApplyElapsedMillis,
successfulNotificationElapsedMillis,
failedComputationElapsedMillis,
failedPublicationElapsedMillis,
failedContextConstructionElapsedMillis,
failedCommitElapsedMillis,
failedCompletionElapsedMillis,
failedMasterApplyElapsedMillis,
failedNotificationElapsedMillis
);
}
}
}