
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/
package org.elasticsearch.cluster.coordination;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.message.ParameterizedMessage;
import org.elasticsearch.cluster.ClusterName;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.coordination.Coordinator.Mode;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.node.DiscoveryNodes;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.util.concurrent.AbstractRunnable;
import org.elasticsearch.core.TimeValue;
import org.elasticsearch.discovery.zen.NodesFaultDetection;
import org.elasticsearch.monitor.NodeHealthService;
import org.elasticsearch.monitor.StatusInfo;
import org.elasticsearch.threadpool.ThreadPool.Names;
import org.elasticsearch.transport.ConnectTransportException;
import org.elasticsearch.transport.ReceiveTimeoutTransportException;
import org.elasticsearch.transport.Transport;
import org.elasticsearch.transport.TransportChannel;
import org.elasticsearch.transport.TransportConnectionListener;
import org.elasticsearch.transport.TransportException;
import org.elasticsearch.transport.TransportRequest;
import org.elasticsearch.transport.TransportRequestOptions;
import org.elasticsearch.transport.TransportRequestOptions.Type;
import org.elasticsearch.transport.TransportResponse;
import org.elasticsearch.transport.TransportResponse.Empty;
import org.elasticsearch.transport.TransportResponseHandler;
import org.elasticsearch.transport.TransportService;
import java.io.IOException;
import java.util.HashSet;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.function.BiConsumer;
import java.util.function.Consumer;
import java.util.function.Predicate;
import static org.elasticsearch.common.util.concurrent.ConcurrentCollections.newConcurrentMap;
import static org.elasticsearch.monitor.StatusInfo.Status.UNHEALTHY;
/**
* The FollowersChecker is responsible for allowing a leader to check that its followers are still connected and healthy. On deciding that a
* follower has failed the leader will remove it from the cluster. We are fairly lenient, possibly allowing multiple checks to fail before
* considering a follower to be faulty, to allow for a brief network partition or a long GC cycle to occur without triggering the removal of
* a node and the consequent shard reallocation.
*/
public class FollowersChecker {
private static final Logger logger = LogManager.getLogger(FollowersChecker.class);
public static final String FOLLOWER_CHECK_ACTION_NAME = "internal:coordination/fault_detection/follower_check";
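// For illustration, the three fault-detection settings declared below could be tuned in
// elasticsearch.yml (a hypothetical snippet; the values shown are simply this class's defaults):
//
// cluster.fault_detection.follower_check.interval: 1s
// cluster.fault_detection.follower_check.timeout: 10s
// cluster.fault_detection.follower_check.retry_count: 3
//
// With these defaults an unresponsive follower is removed after roughly
// retry_count * (interval + timeout), i.e. on the order of 30 seconds.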
// the time between checks sent to each node
public static final Setting<TimeValue> FOLLOWER_CHECK_INTERVAL_SETTING = Setting.timeSetting(
"cluster.fault_detection.follower_check.interval",
TimeValue.timeValueMillis(1000),
TimeValue.timeValueMillis(100),
Setting.Property.NodeScope
);
// the timeout for each check sent to each node
public static final Setting<TimeValue> FOLLOWER_CHECK_TIMEOUT_SETTING = Setting.timeSetting(
"cluster.fault_detection.follower_check.timeout",
TimeValue.timeValueMillis(10000),
TimeValue.timeValueMillis(1),
Setting.Property.NodeScope
);
// the number of failed checks that must happen before the follower is considered to have failed.
public static final Setting<Integer> FOLLOWER_CHECK_RETRY_COUNT_SETTING = Setting.intSetting(
"cluster.fault_detection.follower_check.retry_count",
3,
1,
Setting.Property.NodeScope
);
private final Settings settings;
private final TimeValue followerCheckInterval;
private final TimeValue followerCheckTimeout;
private final int followerCheckRetryCount;
private final BiConsumer<DiscoveryNode, String> onNodeFailure;
private final Consumer<FollowerCheckRequest> handleRequestAndUpdateState;
private final Object mutex = new Object(); // protects writes to this state; read access does not need sync
private final Map<DiscoveryNode, FollowerChecker> followerCheckers = newConcurrentMap();
private final Set<DiscoveryNode> faultyNodes = new HashSet<>();
private final TransportService transportService;
private final NodeHealthService nodeHealthService;
private volatile FastResponseState fastResponseState;
public FollowersChecker(
Settings settings,
TransportService transportService,
Consumer<FollowerCheckRequest> handleRequestAndUpdateState,
BiConsumer<DiscoveryNode, String> onNodeFailure,
NodeHealthService nodeHealthService
) {
this.settings = settings;
this.transportService = transportService;
this.handleRequestAndUpdateState = handleRequestAndUpdateState;
this.onNodeFailure = onNodeFailure;
this.nodeHealthService = nodeHealthService;
followerCheckInterval = FOLLOWER_CHECK_INTERVAL_SETTING.get(settings);
followerCheckTimeout = FOLLOWER_CHECK_TIMEOUT_SETTING.get(settings);
followerCheckRetryCount = FOLLOWER_CHECK_RETRY_COUNT_SETTING.get(settings);
updateFastResponseState(0, Mode.CANDIDATE);
transportService.registerRequestHandler(
FOLLOWER_CHECK_ACTION_NAME,
Names.SAME,
false,
false,
FollowerCheckRequest::new,
(request, transportChannel, task) -> handleFollowerCheck(request, transportChannel)
);
transportService.registerRequestHandler(
NodesFaultDetection.PING_ACTION_NAME,
Names.SAME,
false,
false,
NodesFaultDetection.PingRequest::new,
(request, channel, task) -> // TODO: check that we're a follower of the requesting node?
channel.sendResponse(new NodesFaultDetection.PingResponse())
);
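// The PING_ACTION_NAME handler registered above answers legacy Zen1 fault-detection pings,
// allowing a mixed-version cluster to keep checking nodes that still run the old Zen
// discovery; the matching outbound case is the Coordinator.isZen1Node branch in
// FollowerChecker#handleWakeUp below.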
transportService.addConnectionListener(new TransportConnectionListener() {
@Override
public void onNodeDisconnected(DiscoveryNode node, Transport.Connection connection) {
handleDisconnectedNode(node);
}
});
}
/**
* Update the set of known nodes, starting to check any new ones and stopping checking any previously-known-but-now-unknown ones.
*/
public void setCurrentNodes(DiscoveryNodes discoveryNodes) {
synchronized (mutex) {
final Predicate<DiscoveryNode> isUnknownNode = n -> discoveryNodes.nodeExists(n) == false;
followerCheckers.keySet().removeIf(isUnknownNode);
faultyNodes.removeIf(isUnknownNode);
discoveryNodes.mastersFirstStream().forEach(discoveryNode -> {
if (discoveryNode.equals(discoveryNodes.getLocalNode()) == false
&& followerCheckers.containsKey(discoveryNode) == false
&& faultyNodes.contains(discoveryNode) == false) {
final FollowerChecker followerChecker = new FollowerChecker(discoveryNode);
followerCheckers.put(discoveryNode, followerChecker);
followerChecker.start();
}
});
}
}
/**
* Clear the set of known nodes, stopping all checks.
*/
public void clearCurrentNodes() {
setCurrentNodes(DiscoveryNodes.EMPTY_NODES);
}
/**
* The system is normally in a state in which every follower remains a follower of a stable leader in a single term for an extended
* period of time, and therefore our response to every follower check is the same. We handle this case with a single volatile read
* entirely on the network thread, and only if the fast path fails do we perform some work in the background, by notifying the
* FollowersChecker whenever our term or mode changes here.
*/
public void updateFastResponseState(final long term, final Mode mode) {
fastResponseState = new FastResponseState(term, mode);
}
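// A minimal sketch of how this class is driven by its caller (illustrative only, assuming
// a Coordinator-like owner; not copied from the actual Coordinator):
//
// // on becoming leader in term t:
// followersChecker.updateFastResponseState(t, Mode.LEADER);
// followersChecker.setCurrentNodes(clusterState.nodes());
//
// // on standing down:
// followersChecker.clearCurrentNodes();
// followersChecker.updateFastResponseState(t, Mode.CANDIDATE);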
private void handleFollowerCheck(FollowerCheckRequest request, TransportChannel transportChannel) throws IOException {
final StatusInfo statusInfo = nodeHealthService.getHealth();
if (statusInfo.getStatus() == UNHEALTHY) {
final String message = "handleFollowerCheck: node is unhealthy ["
+ statusInfo.getInfo()
+ "], rejecting "
+ request;
logger.debug(message);
throw new NodeHealthCheckFailureException(message);
}
final FastResponseState responder = this.fastResponseState;
if (responder.mode == Mode.FOLLOWER && responder.term == request.term) {
logger.trace("responding to {} on fast path", request);
transportChannel.sendResponse(Empty.INSTANCE);
return;
}
if (request.term < responder.term) {
throw new CoordinationStateRejectedException("rejecting " + request + " since local state is " + this);
}
transportService.getThreadPool().generic().execute(new AbstractRunnable() {
@Override
protected void doRun() throws IOException {
logger.trace("responding to {} on slow path", request);
try {
handleRequestAndUpdateState.accept(request);
} catch (Exception e) {
transportChannel.sendResponse(e);
return;
}
transportChannel.sendResponse(Empty.INSTANCE);
}
@Override
public void onFailure(Exception e) {
logger.debug(new ParameterizedMessage("exception while responding to {}", request), e);
}
@Override
public String toString() {
return "slow path response to " + request;
}
});
}
/**
* @return nodes in the current cluster state which have failed their follower checks.
*/
public Set<DiscoveryNode> getFaultyNodes() {
synchronized (mutex) {
return new HashSet<>(this.faultyNodes);
}
}
@Override
public String toString() {
return "FollowersChecker{"
+ "followerCheckInterval="
+ followerCheckInterval
+ ", followerCheckTimeout="
+ followerCheckTimeout
+ ", followerCheckRetryCount="
+ followerCheckRetryCount
+ ", followerCheckers="
+ followerCheckers
+ ", faultyNodes="
+ faultyNodes
+ ", fastResponseState="
+ fastResponseState
+ '}';
}
// For assertions
FastResponseState getFastResponseState() {
return fastResponseState;
}
// For assertions
Set<DiscoveryNode> getKnownFollowers() {
synchronized (mutex) {
final Set<DiscoveryNode> knownFollowers = new HashSet<>(faultyNodes);
knownFollowers.addAll(followerCheckers.keySet());
return knownFollowers;
}
}
private void handleDisconnectedNode(DiscoveryNode discoveryNode) {
FollowerChecker followerChecker = followerCheckers.get(discoveryNode);
if (followerChecker != null) {
followerChecker.failNode("disconnected");
}
}
static class FastResponseState {
final long term;
final Mode mode;
FastResponseState(final long term, final Mode mode) {
this.term = term;
this.mode = mode;
}
@Override
public String toString() {
return "FastResponseState{" + "term=" + term + ", mode=" + mode + '}';
}
}
/**
* A checker for an individual follower.
*/
private class FollowerChecker {
private final DiscoveryNode discoveryNode;
private int failureCountSinceLastSuccess;
private int timeoutCountSinceLastSuccess;
FollowerChecker(DiscoveryNode discoveryNode) {
this.discoveryNode = discoveryNode;
}
private boolean running() {
return this == followerCheckers.get(discoveryNode);
}
void start() {
assert running();
handleWakeUp();
}
private void handleWakeUp() {
if (running() == false) {
logger.trace("handleWakeUp: not running");
return;
}
final FollowerCheckRequest request = new FollowerCheckRequest(fastResponseState.term, transportService.getLocalNode());
logger.trace("handleWakeUp: checking {} with {}", discoveryNode, request);
final String actionName;
final TransportRequest transportRequest;
if (Coordinator.isZen1Node(discoveryNode)) {
actionName = NodesFaultDetection.PING_ACTION_NAME;
transportRequest = new NodesFaultDetection.PingRequest(
discoveryNode,
ClusterName.CLUSTER_NAME_SETTING.get(settings),
transportService.getLocalNode(),
ClusterState.UNKNOWN_VERSION
);
} else {
actionName = FOLLOWER_CHECK_ACTION_NAME;
transportRequest = request;
}
transportService.sendRequest(
discoveryNode,
actionName,
transportRequest,
TransportRequestOptions.of(followerCheckTimeout, Type.PING),
new TransportResponseHandler.Empty() {
@Override
public void handleResponse(TransportResponse.Empty response) {
if (running() == false) {
logger.trace("{} no longer running", FollowerChecker.this);
return;
}
failureCountSinceLastSuccess = 0;
timeoutCountSinceLastSuccess = 0;
logger.trace("{} check successful", FollowerChecker.this);
scheduleNextWakeUp();
}
@Override
public void handleException(TransportException exp) {
if (running() == false) {
logger.debug(new ParameterizedMessage("{} no longer running", FollowerChecker.this), exp);
return;
}
if (exp instanceof ReceiveTimeoutTransportException) {
timeoutCountSinceLastSuccess++;
} else {
failureCountSinceLastSuccess++;
}
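// Timeouts and other failures are tallied separately for the log message below, but it is
// their sum that exhausts the retry budget: e.g. with the default retry_count of 3, two
// timeouts followed by any other failure trips the threshold (2 + 1 >= 3).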
final String reason;
if (exp instanceof ConnectTransportException || exp.getCause() instanceof ConnectTransportException) {
logger.debug(() -> new ParameterizedMessage("{} disconnected", FollowerChecker.this), exp);
reason = "disconnected";
} else if (exp.getCause() instanceof NodeHealthCheckFailureException) {
logger.debug(() -> new ParameterizedMessage("{} health check failed", FollowerChecker.this), exp);
reason = "health check failed";
} else if (failureCountSinceLastSuccess + timeoutCountSinceLastSuccess >= followerCheckRetryCount) {
logger.debug(() -> new ParameterizedMessage("{} failed too many times", FollowerChecker.this), exp);
reason = "followers check retry count exceeded [timeouts="
+ timeoutCountSinceLastSuccess
+ ", failures="
+ failureCountSinceLastSuccess
+ "]";
} else {
logger.debug(() -> new ParameterizedMessage("{} failed, retrying", FollowerChecker.this), exp);
scheduleNextWakeUp();
return;
}
failNode(reason);
}
}
);
}
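// Marking a node faulty is dispatched to the generic thread pool: this keeps the mutex and
// the onNodeFailure callback (which may go on to submit a node-removal cluster state
// update) off the network thread that delivered the failed check response.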
void failNode(String reason) {
transportService.getThreadPool().generic().execute(new Runnable() {
@Override
public void run() {
synchronized (mutex) {
if (running() == false) {
logger.trace("{} no longer running, not marking faulty", FollowerChecker.this);
return;
}
logger.debug("{} marking node as faulty", FollowerChecker.this);
faultyNodes.add(discoveryNode);
followerCheckers.remove(discoveryNode);
}
onNodeFailure.accept(discoveryNode, reason);
}
@Override
public String toString() {
return "detected failure of " + discoveryNode;
}
});
}
private void scheduleNextWakeUp() {
transportService.getThreadPool().schedule(new Runnable() {
@Override
public void run() {
handleWakeUp();
}
@Override
public String toString() {
return FollowerChecker.this + "::handleWakeUp";
}
}, followerCheckInterval, Names.SAME);
}
@Override
public String toString() {
return "FollowerChecker{"
+ "discoveryNode="
+ discoveryNode
+ ", failureCountSinceLastSuccess="
+ failureCountSinceLastSuccess
+ ", timeoutCountSinceLastSuccess="
+ timeoutCountSinceLastSuccess
+ ", ["
+ FOLLOWER_CHECK_RETRY_COUNT_SETTING.getKey()
+ "]="
+ followerCheckRetryCount
+ '}';
}
}
public static class FollowerCheckRequest extends TransportRequest {
private final long term;
private final DiscoveryNode sender;
public long getTerm() {
return term;
}
public DiscoveryNode getSender() {
return sender;
}
public FollowerCheckRequest(final long term, final DiscoveryNode sender) {
this.term = term;
this.sender = sender;
}
public FollowerCheckRequest(final StreamInput in) throws IOException {
super(in);
term = in.readLong();
sender = new DiscoveryNode(in);
}
@Override
public void writeTo(final StreamOutput out) throws IOException {
super.writeTo(out);
out.writeLong(term);
sender.writeTo(out);
}
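// A minimal wire round-trip sketch, assuming a test-style BytesStreamOutput
// (org.elasticsearch.common.io.stream.BytesStreamOutput) and any DiscoveryNode localNode:
//
// FollowerCheckRequest original = new FollowerCheckRequest(5L, localNode);
// BytesStreamOutput out = new BytesStreamOutput();
// original.writeTo(out);
// FollowerCheckRequest copy = new FollowerCheckRequest(out.bytes().streamInput());
// assert original.equals(copy); // term and sender survive the round trip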
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
FollowerCheckRequest that = (FollowerCheckRequest) o;
return term == that.term && Objects.equals(sender, that.sender);
}
@Override
public String toString() {
return "FollowerCheckRequest{" + "term=" + term + ", sender=" + sender + '}';
}
@Override
public int hashCode() {
return Objects.hash(term, sender);
}
}
}