/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/
package org.elasticsearch.cluster.coordination;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.message.ParameterizedMessage;
import org.apache.logging.log4j.util.MessageSupplier;
import org.elasticsearch.ExceptionsHelper;
import org.elasticsearch.cluster.ClusterName;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.node.DiscoveryNodes;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.core.Nullable;
import org.elasticsearch.core.Releasable;
import org.elasticsearch.core.TimeValue;
import org.elasticsearch.discovery.zen.MasterFaultDetection;
import org.elasticsearch.monitor.NodeHealthService;
import org.elasticsearch.monitor.StatusInfo;
import org.elasticsearch.threadpool.ThreadPool.Names;
import org.elasticsearch.transport.ConnectTransportException;
import org.elasticsearch.transport.NodeDisconnectedException;
import org.elasticsearch.transport.ReceiveTimeoutTransportException;
import org.elasticsearch.transport.Transport;
import org.elasticsearch.transport.TransportConnectionListener;
import org.elasticsearch.transport.TransportException;
import org.elasticsearch.transport.TransportRequest;
import org.elasticsearch.transport.TransportRequestOptions;
import org.elasticsearch.transport.TransportRequestOptions.Type;
import org.elasticsearch.transport.TransportResponse;
import org.elasticsearch.transport.TransportResponse.Empty;
import org.elasticsearch.transport.TransportResponseHandler;
import org.elasticsearch.transport.TransportService;
import java.io.IOException;
import java.util.Objects;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicReference;
import static org.elasticsearch.monitor.StatusInfo.Status.UNHEALTHY;
/**
* The LeaderChecker is responsible for allowing followers to check that the currently elected leader is still connected and healthy. We are
* fairly lenient, possibly allowing multiple checks to fail before considering the leader to be faulty, to allow the leader to
* temporarily stand down on occasion, e.g. if it needs to move to a higher term. On deciding that the leader has failed, a follower will
* become a candidate and attempt to become a leader itself.
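*
* The checks are governed by the three node settings declared below; an illustrative elasticsearch.yml
* snippet showing them with their default values:
*
* <pre>{@code
* cluster.fault_detection.leader_check.interval: 1s
* cluster.fault_detection.leader_check.timeout: 10s
* cluster.fault_detection.leader_check.retry_count: 3
* }</pre>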
*/
public class LeaderChecker {
private static final Logger logger = LogManager.getLogger(LeaderChecker.class);
static final String LEADER_CHECK_ACTION_NAME = "internal:coordination/fault_detection/leader_check";
// the time between checks sent to the leader
public static final Setting<TimeValue> LEADER_CHECK_INTERVAL_SETTING = Setting.timeSetting(
"cluster.fault_detection.leader_check.interval",
TimeValue.timeValueMillis(1000),
TimeValue.timeValueMillis(100),
Setting.Property.NodeScope
);
// the timeout for each check sent to the leader
public static final Setting<TimeValue> LEADER_CHECK_TIMEOUT_SETTING = Setting.timeSetting(
"cluster.fault_detection.leader_check.timeout",
TimeValue.timeValueMillis(10000),
TimeValue.timeValueMillis(1),
Setting.Property.NodeScope
);
// the number of failed checks that must happen before the leader is considered to have failed.
public static final Setting<Integer> LEADER_CHECK_RETRY_COUNT_SETTING = Setting.intSetting(
"cluster.fault_detection.leader_check.retry_count",
3,
1,
Setting.Property.NodeScope
);
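// Illustrative worst case with the defaults above, assuming every failed check runs to its timeout:
// retry_count * timeout + (retry_count - 1) * interval = 3 * 10s + 2 * 1s = 32s of unresponsiveness
// before a follower deems the leader failed. Rejected checks fail faster than the timeout.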
private final Settings settings;
private final TimeValue leaderCheckInterval;
private final TimeValue leaderCheckTimeout;
private final int leaderCheckRetryCount;
private final TransportService transportService;
private final LeaderFailureListener leaderFailureListener;
private final NodeHealthService nodeHealthService;
private final AtomicReference<CheckScheduler> currentChecker = new AtomicReference<>();
private volatile DiscoveryNodes discoveryNodes;
LeaderChecker(
final Settings settings,
final TransportService transportService,
final LeaderFailureListener leaderFailureListener,
final NodeHealthService nodeHealthService
) {
this.settings = settings;
leaderCheckInterval = LEADER_CHECK_INTERVAL_SETTING.get(settings);
leaderCheckTimeout = LEADER_CHECK_TIMEOUT_SETTING.get(settings);
leaderCheckRetryCount = LEADER_CHECK_RETRY_COUNT_SETTING.get(settings);
this.transportService = transportService;
this.leaderFailureListener = leaderFailureListener;
this.nodeHealthService = nodeHealthService;
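// Answer leader checks from followers: handleLeaderCheck throws if this node cannot currently act as
// master, and that exception is sent back to the checking node in place of the empty response.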
transportService.registerRequestHandler(
LEADER_CHECK_ACTION_NAME,
Names.SAME,
false,
false,
LeaderCheckRequest::new,
(request, channel, task) -> {
handleLeaderCheck(request);
channel.sendResponse(Empty.INSTANCE);
}
);
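// Also answer the legacy Zen1 master ping so that followers still running Zen discovery can check this
// node during a rolling upgrade, translating rejections into the exception those followers expect.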
transportService.registerRequestHandler(
MasterFaultDetection.MASTER_PING_ACTION_NAME,
Names.SAME,
false,
false,
MasterFaultDetection.MasterPingRequest::new,
(request, channel, task) -> {
try {
handleLeaderCheck(new LeaderCheckRequest(request.sourceNode));
} catch (CoordinationStateRejectedException e) {
throw new MasterFaultDetection.ThisIsNotTheMasterYouAreLookingForException(e.getMessage());
}
channel.sendResponse(new MasterFaultDetection.MasterPingResponseResponse());
}
);
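// A disconnection from the leader fails it immediately, without waiting for the next check to time out.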
transportService.addConnectionListener(new TransportConnectionListener() {
@Override
public void onNodeDisconnected(DiscoveryNode node, Transport.Connection connection) {
handleDisconnectedNode(node);
}
});
}
public DiscoveryNode leader() {
CheckScheduler checkScheduler = currentChecker.get();
return checkScheduler == null ? null : checkScheduler.leader;
}
/**
* Starts and/or stops a leader checker for the given leader. Should only be called after successfully joining this leader.
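*
* <pre>{@code
* // illustrative call sequence (simplified, hypothetical):
* leaderChecker.updateLeader(joinedLeader); // start checking the leader we just joined
* leaderChecker.updateLeader(null);         // stop checking, e.g. on becoming a candidate
* }</pre>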
*
* @param leader the node to be checked as leader, or null if checks should be disabled
*/
void updateLeader(@Nullable final DiscoveryNode leader) {
assert transportService.getLocalNode().equals(leader) == false;
final CheckScheduler checkScheduler;
if (leader != null) {
checkScheduler = new CheckScheduler(leader);
} else {
checkScheduler = null;
}
CheckScheduler previousChecker = currentChecker.getAndSet(checkScheduler);
if (previousChecker != null) {
previousChecker.close();
}
if (checkScheduler != null) {
checkScheduler.handleWakeUp();
}
}
/**
* Update the "known" discovery nodes. Should be called on the leader before a new cluster state is published to reflect the new
* publication targets, and also called if a leader becomes a non-leader.
*/
void setCurrentNodes(DiscoveryNodes discoveryNodes) {
logger.trace("setCurrentNodes: {}", discoveryNodes);
this.discoveryNodes = discoveryNodes;
}
// For assertions
boolean currentNodeIsMaster() {
return discoveryNodes.isLocalNodeElectedMaster();
}
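// Rejection rules, in order: an unhealthy node rejects every check; a healthy node that is not the
// elected master rejects the check; the elected master rejects checks from nodes it has removed from
// the cluster.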
private void handleLeaderCheck(LeaderCheckRequest request) {
final DiscoveryNodes discoveryNodes = this.discoveryNodes;
assert discoveryNodes != null;
final StatusInfo statusInfo = nodeHealthService.getHealth();
if (statusInfo.getStatus() == UNHEALTHY) {
logger.debug("this node is unhealthy [{}], rejecting leader check: {}", statusInfo.getInfo(), request);
throw new NodeHealthCheckFailureException(statusInfo.getInfo());
} else if (discoveryNodes.isLocalNodeElectedMaster() == false) {
logger.debug("rejecting leader check on non-master: {}", request);
throw new CoordinationStateRejectedException("no longer the elected master");
} else if (discoveryNodes.nodeExists(request.getSender()) == false) {
logger.debug("rejecting leader check from removed node: {}", request);
throw new CoordinationStateRejectedException(
"rejecting check since [" + request.getSender().descriptionWithoutAttributes() + "] has been removed from the cluster"
);
} else {
logger.trace("handling {}", request);
}
}
private void handleDisconnectedNode(DiscoveryNode discoveryNode) {
CheckScheduler checkScheduler = currentChecker.get();
if (checkScheduler != null) {
checkScheduler.handleDisconnectedNode(discoveryNode);
} else {
logger.trace("disconnect event ignored for {}, no check scheduler", discoveryNode);
}
}
private static final String RESTARTING_DISCOVERY_TEXT = "restarting discovery; more details may be available in the master node logs";
private class CheckScheduler implements Releasable {
private final DiscoveryNode leader;
private final AtomicBoolean isClosed = new AtomicBoolean();
private int rejectedCountSinceLastSuccess;
private int timeoutCountSinceLastSuccess;
CheckScheduler(final DiscoveryNode leader) {
this.leader = leader;
}
@Override
public void close() {
if (isClosed.compareAndSet(false, true) == false) {
logger.trace("already closed, doing nothing");
} else {
logger.debug("closed");
}
}
void handleWakeUp() {
if (isClosed.get()) {
logger.trace("closed check scheduler woken up, doing nothing");
return;
}
logger.trace("checking {} with [{}] = {}", leader, LEADER_CHECK_TIMEOUT_SETTING.getKey(), leaderCheckTimeout);
final String actionName;
final TransportRequest transportRequest;
if (Coordinator.isZen1Node(leader)) {
actionName = MasterFaultDetection.MASTER_PING_ACTION_NAME;
transportRequest = new MasterFaultDetection.MasterPingRequest(
transportService.getLocalNode(),
leader,
ClusterName.CLUSTER_NAME_SETTING.get(settings)
);
} else {
actionName = LEADER_CHECK_ACTION_NAME;
transportRequest = new LeaderCheckRequest(transportService.getLocalNode());
}
// TODO lag detection:
// In the PoC, the leader sent its current version to the follower in the response to a LeaderCheck, so the follower
// could detect if it was lagging. We'd prefer this to be implemented on the leader, so the response is just
// TransportResponse.Empty here.
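// Each check is a single request on the dedicated PING channel with the configured timeout; the
// response handler either resets the failure counters and schedules the next check, or tallies the
// failure and decides whether the retry limit has been reached.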
transportService.sendRequest(
leader,
actionName,
transportRequest,
TransportRequestOptions.of(leaderCheckTimeout, Type.PING),
new TransportResponseHandler.Empty() {
@Override
public void handleResponse(TransportResponse.Empty response) {
if (isClosed.get()) {
logger.debug("closed check scheduler received a response, doing nothing");
return;
}
rejectedCountSinceLastSuccess = 0;
timeoutCountSinceLastSuccess = 0;
scheduleNextWakeUp(); // logs trace message indicating success
}
@Override
public void handleException(TransportException exp) {
if (isClosed.get()) {
logger.debug("closed check scheduler received a response, doing nothing");
return;
}
if (exp instanceof ConnectTransportException || exp.getCause() instanceof ConnectTransportException) {
logger.debug(new ParameterizedMessage("leader [{}] disconnected during check", leader), exp);
leaderFailed(
() -> new ParameterizedMessage(
"master node [{}] disconnected, restarting discovery [{}]",
leader.descriptionWithoutAttributes(),
ExceptionsHelper.unwrapCause(exp).getMessage()
),
exp
);
return;
} else if (exp.getCause() instanceof NodeHealthCheckFailureException) {
logger.debug(new ParameterizedMessage("leader [{}] health check failed", leader), exp);
leaderFailed(
() -> new ParameterizedMessage(
"master node [{}] reported itself as unhealthy [{}], {}",
leader.descriptionWithoutAttributes(),
exp.getCause().getMessage(),
RESTARTING_DISCOVERY_TEXT
),
exp
);
return;
}
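// Timeouts and rejections are tallied separately so the logs can distinguish them, but they count
// towards the same retry limit.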
if (exp instanceof ReceiveTimeoutTransportException) {
timeoutCountSinceLastSuccess += 1;
} else {
rejectedCountSinceLastSuccess += 1;
}
long failureCount = rejectedCountSinceLastSuccess + timeoutCountSinceLastSuccess;
if (failureCount >= leaderCheckRetryCount) {
logger.debug(
new ParameterizedMessage(
"leader [{}] failed {} consecutive checks (rejected [{}], timed out [{}], limit [{}] is {})",
leader,
failureCount,
rejectedCountSinceLastSuccess,
timeoutCountSinceLastSuccess,
LEADER_CHECK_RETRY_COUNT_SETTING.getKey(),
leaderCheckRetryCount
),
exp
);
leaderFailed(
() -> new ParameterizedMessage(
"[{}] consecutive checks of the master node [{}] were unsuccessful ([{}] rejected, [{}] timed out), "
+ "{} [last unsuccessful check: {}]",
failureCount,
leader.descriptionWithoutAttributes(),
rejectedCountSinceLastSuccess,
timeoutCountSinceLastSuccess,
RESTARTING_DISCOVERY_TEXT,
ExceptionsHelper.unwrapCause(exp).getMessage()
),
exp
);
return;
}
logger.debug(
new ParameterizedMessage(
"{} consecutive failures (limit [{}] is {}) with leader [{}]",
failureCount,
LEADER_CHECK_RETRY_COUNT_SETTING.getKey(),
leaderCheckRetryCount,
leader
),
exp
);
scheduleNextWakeUp();
}
}
);
}
void leaderFailed(MessageSupplier messageSupplier, Exception e) {
if (isClosed.compareAndSet(false, true)) {
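// Notify on the GENERIC pool (as documented on LeaderFailureListener) rather than on the thread
// that detected the failure.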
transportService.getThreadPool().generic().execute(new Runnable() {
@Override
public void run() {
leaderFailureListener.onLeaderFailure(messageSupplier, e);
}
@Override
public String toString() {
return "notification of leader failure: " + e.getMessage();
}
});
} else {
logger.trace("already closed, not failing leader");
}
}
void handleDisconnectedNode(DiscoveryNode discoveryNode) {
if (discoveryNode.equals(leader)) {
logger.debug("leader [{}] disconnected", leader);
leaderFailed(
() -> new ParameterizedMessage(
"master node [{}] disconnected, restarting discovery",
leader.descriptionWithoutAttributes()
),
new NodeDisconnectedException(discoveryNode, "disconnected")
);
}
}
private void scheduleNextWakeUp() {
logger.trace("scheduling next check of {} for [{}] = {}", leader, LEADER_CHECK_INTERVAL_SETTING.getKey(), leaderCheckInterval);
transportService.getThreadPool().schedule(new Runnable() {
@Override
public void run() {
handleWakeUp();
}
@Override
public String toString() {
return "scheduled check of leader " + leader;
}
}, leaderCheckInterval, Names.SAME);
}
}
static class LeaderCheckRequest extends TransportRequest {
private final DiscoveryNode sender;
LeaderCheckRequest(final DiscoveryNode sender) {
this.sender = sender;
}
LeaderCheckRequest(final StreamInput in) throws IOException {
super(in);
sender = new DiscoveryNode(in);
}
@Override
public void writeTo(final StreamOutput out) throws IOException {
super.writeTo(out);
sender.writeTo(out);
}
public DiscoveryNode getSender() {
return sender;
}
@Override
public boolean equals(final Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
final LeaderCheckRequest that = (LeaderCheckRequest) o;
return Objects.equals(sender, that.sender);
}
@Override
public int hashCode() {
return Objects.hash(sender);
}
@Override
public String toString() {
return "LeaderCheckRequest{" + "sender=" + sender + '}';
}
}
@FunctionalInterface
interface LeaderFailureListener {
/**
* Called when a leader failure is detected. Checking the leader health is somewhat asynchronous, so this method may report a leader
* failure after the node has already decided there's no known leader for some other reason. This method is called on the {@code
* GENERIC} thread pool.
*
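* <pre>{@code
* // illustrative lambda (hypothetical; a real listener would typically restart discovery):
* LeaderFailureListener onFailure = (message, e) -> logger.warn(message.get(), e);
* }</pre>
*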
* @param messageSupplier The message to log if prior to this failure there was a known master in the cluster.
* @param exception An exception that gives more detail of the leader failure.
*/
void onLeaderFailure(MessageSupplier messageSupplier, Exception exception);
}
}