// org.elasticsearch.cluster.coordination.JoinHelper (Elasticsearch subproject :server)
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/
package org.elasticsearch.cluster.coordination;
import org.apache.logging.log4j.Level;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.message.ParameterizedMessage;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.support.ChannelActionListener;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.ClusterStateTaskConfig;
import org.elasticsearch.cluster.ClusterStateTaskListener;
import org.elasticsearch.cluster.NotMasterException;
import org.elasticsearch.cluster.coordination.Coordinator.Mode;
import org.elasticsearch.cluster.metadata.Metadata;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.routing.RerouteService;
import org.elasticsearch.cluster.routing.allocation.AllocationService;
import org.elasticsearch.cluster.service.MasterService;
import org.elasticsearch.common.Priority;
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.core.Releasable;
import org.elasticsearch.core.Releasables;
import org.elasticsearch.core.TimeValue;
import org.elasticsearch.core.Tuple;
import org.elasticsearch.discovery.zen.MembershipAction;
import org.elasticsearch.discovery.zen.ZenDiscovery;
import org.elasticsearch.env.Environment;
import org.elasticsearch.env.NodeEnvironment;
import org.elasticsearch.monitor.NodeHealthService;
import org.elasticsearch.monitor.StatusInfo;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.threadpool.ThreadPool.Names;
import org.elasticsearch.transport.ConnectTransportException;
import org.elasticsearch.transport.TransportException;
import org.elasticsearch.transport.TransportRequest;
import org.elasticsearch.transport.TransportRequestOptions;
import org.elasticsearch.transport.TransportResponse;
import org.elasticsearch.transport.TransportResponse.Empty;
import org.elasticsearch.transport.TransportResponseHandler;
import org.elasticsearch.transport.TransportService;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.BiConsumer;
import java.util.function.Function;
import java.util.function.LongSupplier;
import java.util.function.Supplier;
import static org.elasticsearch.monitor.StatusInfo.Status.UNHEALTHY;
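/**
 * Helper used by {@link Coordinator} for joining the cluster: it sends join requests to a master-eligible node,
 * de-duplicates joins that are already in flight, keeps the transport connection used for joining alive until the
 * resulting cluster state has been applied, and accumulates incoming joins via a {@link JoinAccumulator} whose
 * behaviour depends on the current coordinator mode (candidate, leader or follower).
 */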
public class JoinHelper {
private static final Logger logger = LogManager.getLogger(JoinHelper.class);
public static final String START_JOIN_ACTION_NAME = "internal:cluster/coordination/start_join";
public static final String JOIN_ACTION_NAME = "internal:cluster/coordination/join";
public static final String JOIN_VALIDATE_ACTION_NAME = "internal:cluster/coordination/join/validate";
public static final String JOIN_PING_ACTION_NAME = "internal:cluster/coordination/join/ping";
// the timeout for Zen1 join attempts
public static final Setting<TimeValue> JOIN_TIMEOUT_SETTING = Setting.timeSetting(
"cluster.join.timeout",
TimeValue.timeValueMillis(60000),
TimeValue.timeValueMillis(1),
Setting.Property.NodeScope,
Setting.Property.Deprecated
);
private final MasterService masterService;
private final TransportService transportService;
private volatile JoinTaskExecutor joinTaskExecutor;
private final TimeValue joinTimeout; // only used for Zen1 joining
private final NodeHealthService nodeHealthService;
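// joins currently in flight, keyed by (destination, join request) so that an identical join to the same node is not sent twice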
private final Set<Tuple<DiscoveryNode, JoinRequest>> pendingOutgoingJoins = Collections.synchronizedSet(new HashSet<>());
private final AtomicReference<FailedJoinAttempt> lastFailedJoinAttempt = new AtomicReference<>();
private final Map<DiscoveryNode, Releasable> joinConnections = new HashMap<>(); // synchronized on itself
private final Supplier<JoinTaskExecutor> joinTaskExecutorGenerator;
JoinHelper(
Settings settings,
AllocationService allocationService,
MasterService masterService,
TransportService transportService,
LongSupplier currentTermSupplier,
Supplier<ClusterState> currentStateSupplier,
BiConsumer<JoinRequest, ActionListener<Void>> joinHandler,
Function<StartJoinRequest, Join> joinLeaderInTerm,
Collection<BiConsumer<DiscoveryNode, ClusterState>> joinValidators,
RerouteService rerouteService,
NodeHealthService nodeHealthService
) {
this.masterService = masterService;
this.transportService = transportService;
this.nodeHealthService = nodeHealthService;
this.joinTimeout = JOIN_TIMEOUT_SETTING.get(settings);
this.joinTaskExecutorGenerator = () -> new JoinTaskExecutor(settings, allocationService, logger, rerouteService) {
private final long term = currentTermSupplier.getAsLong();
@Override
public ClusterTasksResult<Task> execute(ClusterState currentState, List<Task> joiningTasks)
throws Exception {
// The current state that MasterService uses might have been updated by a (different) master in a higher term already
// Stop processing the current cluster state update, as there's no point in continuing to compute it as
// it will later be rejected by Coordinator.publish(...) anyhow
if (currentState.term() > term) {
logger.trace("encountered higher term {} than current {}, there is a newer master", currentState.term(), term);
throw new NotMasterException(
"Higher term encountered (current: " + currentState.term() + " > used: " + term + "), there is a newer master"
);
} else if (currentState.nodes().getMasterNodeId() == null && joiningTasks.stream().anyMatch(Task::isBecomeMasterTask)) {
assert currentState.term() < term : "there should be at most one become master task per election (= by term)";
final CoordinationMetadata coordinationMetadata = CoordinationMetadata.builder(currentState.coordinationMetadata())
.term(term)
.build();
final Metadata metadata = Metadata.builder(currentState.metadata()).coordinationMetadata(coordinationMetadata).build();
currentState = ClusterState.builder(currentState).metadata(metadata).build();
} else if (currentState.nodes().isLocalNodeElectedMaster()) {
assert currentState.term() == term : "term should be stable for the same master";
}
return super.execute(currentState, joiningTasks);
}
};
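// Register the handlers for the join flow (join, start_join, validate and ping), together with their Zen1 equivalents
// (the MembershipAction and ZenDiscovery actions below), which appear to be retained for compatibility with pre-7.x
// Zen1 nodes, e.g. during a rolling upgrade.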
transportService.registerRequestHandler(
JOIN_ACTION_NAME,
ThreadPool.Names.GENERIC,
false,
false,
JoinRequest::new,
(request, channel, task) -> joinHandler.accept(
request,
new ChannelActionListener<Empty, JoinRequest>(channel, JOIN_ACTION_NAME, request).map(ignored -> Empty.INSTANCE)
)
);
transportService.registerRequestHandler(
MembershipAction.DISCOVERY_JOIN_ACTION_NAME,
ThreadPool.Names.GENERIC,
false,
false,
MembershipAction.JoinRequest::new,
(request, channel, task) -> joinHandler.accept(
new JoinRequest(request.getNode(), 0L, Optional.empty()), // treat as non-voting join
new ChannelActionListener<Empty, MembershipAction.JoinRequest>(
channel,
MembershipAction.DISCOVERY_JOIN_ACTION_NAME,
request
).map(ignored -> Empty.INSTANCE)
)
);
transportService.registerRequestHandler(
START_JOIN_ACTION_NAME,
Names.GENERIC,
false,
false,
StartJoinRequest::new,
(request, channel, task) -> {
final DiscoveryNode destination = request.getSourceNode();
sendJoinRequest(destination, currentTermSupplier.getAsLong(), Optional.of(joinLeaderInTerm.apply(request)));
channel.sendResponse(Empty.INSTANCE);
}
);
transportService.registerRequestHandler(
JOIN_PING_ACTION_NAME,
ThreadPool.Names.SAME,
false,
false,
TransportRequest.Empty::new,
(request, channel, task) -> channel.sendResponse(Empty.INSTANCE)
);
final List<String> dataPaths = Environment.PATH_DATA_SETTING.get(settings);
final int maxLocalStorageNodes = NodeEnvironment.MAX_LOCAL_STORAGE_NODES_SETTING.get(settings);
transportService.registerRequestHandler(
JOIN_VALIDATE_ACTION_NAME,
ThreadPool.Names.GENERIC,
ValidateJoinRequest::new,
(request, channel, task) -> {
final ClusterState localState = currentStateSupplier.get();
if (localState.metadata().clusterUUIDCommitted()
&& localState.metadata().clusterUUID().equals(request.getState().metadata().clusterUUID()) == false) {
throw new CoordinationStateRejectedException(
"This node previously joined a cluster with UUID ["
+ localState.metadata().clusterUUID()
+ "] and is now trying to join a different cluster with UUID ["
+ request.getState().metadata().clusterUUID()
+ "]. "
+ getClusterUuidMismatchExplanation(dataPaths, maxLocalStorageNodes)
);
}
joinValidators.forEach(action -> action.accept(transportService.getLocalNode(), request.getState()));
channel.sendResponse(Empty.INSTANCE);
}
);
transportService.registerRequestHandler(
MembershipAction.DISCOVERY_JOIN_VALIDATE_ACTION_NAME,
ThreadPool.Names.GENERIC,
ValidateJoinRequest::new,
(request, channel, task) -> {
final ClusterState localState = currentStateSupplier.get();
if (localState.metadata().clusterUUIDCommitted()
&& localState.metadata().clusterUUID().equals(request.getState().metadata().clusterUUID()) == false) {
throw new CoordinationStateRejectedException(
"This node previously joined a cluster with UUID ["
+ localState.metadata().clusterUUID()
+ "] and is now trying to join a different cluster with UUID ["
+ request.getState().metadata().clusterUUID()
+ "] and a mix of versions. "
+ getClusterUuidMismatchExplanation(dataPaths, maxLocalStorageNodes)
);
}
joinValidators.forEach(action -> action.accept(transportService.getLocalNode(), request.getState()));
channel.sendResponse(Empty.INSTANCE);
}
);
transportService.registerRequestHandler(
ZenDiscovery.DISCOVERY_REJOIN_ACTION_NAME,
ThreadPool.Names.SAME,
ZenDiscovery.RejoinClusterRequest::new,
(request, channel, task) -> channel.sendResponse(Empty.INSTANCE)
); // TODO: do we need to implement anything here?
transportService.registerRequestHandler(
MembershipAction.DISCOVERY_LEAVE_ACTION_NAME,
ThreadPool.Names.SAME,
MembershipAction.LeaveRequest::new,
(request, channel, task) -> channel.sendResponse(Empty.INSTANCE)
); // TODO: do we need to implement anything here?
}
static String getClusterUuidMismatchExplanation(List<String> dataPaths, int maxLocalStorageNodes) {
return "This is forbidden and usually indicates an incorrect discovery or cluster bootstrapping configuration. Note that the "
+ "cluster UUID persists across restarts and can only be changed by deleting the contents of the node's data "
+ (dataPaths.size() == 1 ? "path " : "paths ")
+ dataPaths
+ " which will also remove any data held by "
+ (maxLocalStorageNodes == 1
? "this node."
: "all nodes that use " + (dataPaths.size() == 1 ? "this data path." : "these data paths."));
}
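// For illustration (hypothetical path): with a single data path and maxLocalStorageNodes == 1 the explanation ends with
// "... the node's data path [/var/lib/elasticsearch] which will also remove any data held by this node."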
boolean isJoinPending() {
return pendingOutgoingJoins.isEmpty() == false;
}
public void sendJoinRequest(DiscoveryNode destination, long term, Optional<Join> optionalJoin) {
sendJoinRequest(destination, term, optionalJoin, () -> {});
}
public void onClusterStateApplied() {
// we applied a cluster state as LEADER or FOLLOWER which means the NodeConnectionsService has taken ownership of any connections to
// nodes in the cluster and therefore we can release the connection(s) that we were using for joining
final List<Releasable> releasables;
synchronized (joinConnections) {
if (joinConnections.isEmpty()) {
return;
}
releasables = new ArrayList<>(joinConnections.values());
joinConnections.clear();
}
logger.debug("releasing [{}] connections on successful cluster state application", releasables.size());
releasables.forEach(Releasables::close);
}
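// Retains the connection reference acquired for joining this destination; if a reference was already registered for the
// same node (e.g. from an earlier join attempt) the older one is released, so at most one reference is held per node.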
private void registerConnection(DiscoveryNode destination, Releasable connectionReference) {
final Releasable previousConnection;
synchronized (joinConnections) {
previousConnection = joinConnections.put(destination, connectionReference);
}
Releasables.close(previousConnection);
}
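// Releases the given connection reference, removing it from joinConnections only if the map still holds this exact
// reference, so that a newer connection registered for the same node is left untouched.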
private void unregisterAndReleaseConnection(DiscoveryNode destination, Releasable connectionReference) {
synchronized (joinConnections) {
joinConnections.remove(destination, connectionReference);
}
Releasables.close(connectionReference);
}
// package-private for testing
static class FailedJoinAttempt {
private final DiscoveryNode destination;
private final JoinRequest joinRequest;
private final TransportException exception;
private final long timestamp;
FailedJoinAttempt(DiscoveryNode destination, JoinRequest joinRequest, TransportException exception) {
this.destination = destination;
this.joinRequest = joinRequest;
this.exception = exception;
this.timestamp = System.nanoTime();
}
void logNow() {
logger.log(
getLogLevel(exception),
() -> new ParameterizedMessage("failed to join {} with {}", destination, joinRequest),
exception
);
}
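// Rejections at the coordination layer (join rejected, publication failed, or the target is no longer master) are
// generally expected during elections, so they are logged at DEBUG; any other transport failure is surfaced at INFO.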
static Level getLogLevel(TransportException e) {
Throwable cause = e.unwrapCause();
if (cause instanceof CoordinationStateRejectedException
|| cause instanceof FailedToCommitClusterStateException
|| cause instanceof NotMasterException) {
return Level.DEBUG;
}
return Level.INFO;
}
void logWarnWithTimestamp() {
logger.warn(
() -> new ParameterizedMessage(
"last failed join attempt was {} ago, failed to join {} with {}",
TimeValue.timeValueMillis(TimeValue.nsecToMSec(System.nanoTime() - timestamp)),
destination,
joinRequest
),
exception
);
}
}
void logLastFailedJoinAttempt() {
FailedJoinAttempt attempt = lastFailedJoinAttempt.get();
if (attempt != null) {
attempt.logWarnWithTimestamp();
lastFailedJoinAttempt.compareAndSet(attempt, null);
}
}
public void sendJoinRequest(DiscoveryNode destination, long term, Optional<Join> optionalJoin, Runnable onCompletion) {
assert destination.isMasterNode() : "trying to join master-ineligible " + destination;
final StatusInfo statusInfo = nodeHealthService.getHealth();
if (statusInfo.getStatus() == UNHEALTHY) {
logger.debug("dropping join request to [{}]: [{}]", destination, statusInfo.getInfo());
return;
}
final JoinRequest joinRequest = new JoinRequest(transportService.getLocalNode(), term, optionalJoin);
final Tuple<DiscoveryNode, JoinRequest> dedupKey = Tuple.tuple(destination, joinRequest);
if (pendingOutgoingJoins.add(dedupKey)) {
logger.debug("attempting to join {} with {}", destination, joinRequest);
// Typically we're already connected to the destination at this point, the PeerFinder holds a reference to this connection to
// keep it open, but we need to acquire our own reference to keep the connection alive through the joining process.
transportService.connectToNode(destination, new ActionListener<Releasable>() {
@Override
public void onResponse(Releasable connectionReference) {
logger.trace("acquired connection for joining join {} with {}", destination, joinRequest);
// Register the connection in joinConnections so it can be released once we successfully apply the cluster state, at
// which point the NodeConnectionsService will have taken ownership of it.
registerConnection(destination, connectionReference);
if (Coordinator.isZen1Node(destination)) {
transportService.sendRequest(
destination,
MembershipAction.DISCOVERY_JOIN_ACTION_NAME,
new MembershipAction.JoinRequest(transportService.getLocalNode()),
TransportRequestOptions.timeout(joinTimeout),
new TransportResponseHandler.Empty() {
@Override
public void handleResponse(TransportResponse.Empty response) {
pendingOutgoingJoins.remove(dedupKey);
logger.debug("successfully joined {} with {}", destination, joinRequest);
lastFailedJoinAttempt.set(null);
onCompletion.run();
}
@Override
public void handleException(TransportException exp) {
pendingOutgoingJoins.remove(dedupKey);
logger.info(() -> new ParameterizedMessage("failed to join {} with {}", destination, joinRequest), exp);
FailedJoinAttempt attempt = new FailedJoinAttempt(destination, joinRequest, exp);
attempt.logNow();
lastFailedJoinAttempt.set(attempt);
unregisterAndReleaseConnection(destination, connectionReference);
onCompletion.run();
}
}
);
return;
}
transportService.sendRequest(
destination,
JOIN_ACTION_NAME,
joinRequest,
TransportRequestOptions.of(null, TransportRequestOptions.Type.PING),
new TransportResponseHandler.Empty() {
@Override
public void handleResponse(TransportResponse.Empty response) {
pendingOutgoingJoins.remove(dedupKey);
logger.debug("successfully joined {} with {}", destination, joinRequest);
lastFailedJoinAttempt.set(null);
}
@Override
public void handleException(TransportException exp) {
pendingOutgoingJoins.remove(dedupKey);
FailedJoinAttempt attempt = new FailedJoinAttempt(destination, joinRequest, exp);
attempt.logNow();
lastFailedJoinAttempt.set(attempt);
unregisterAndReleaseConnection(destination, connectionReference);
}
}
);
}
@Override
public void onFailure(Exception e) {
pendingOutgoingJoins.remove(dedupKey);
FailedJoinAttempt attempt = new FailedJoinAttempt(
destination,
joinRequest,
new ConnectTransportException(destination, "failed to acquire connection", e)
);
attempt.logNow();
lastFailedJoinAttempt.set(attempt);
onCompletion.run();
}
});
} else {
logger.debug("already attempting to join {} with request {}, not sending request", destination, joinRequest);
}
}
public void sendStartJoinRequest(final StartJoinRequest startJoinRequest, final DiscoveryNode destination) {
assert startJoinRequest.getSourceNode().isMasterNode()
: "sending start-join request for master-ineligible " + startJoinRequest.getSourceNode();
transportService.sendRequest(destination, START_JOIN_ACTION_NAME, startJoinRequest, new TransportResponseHandler.Empty() {
@Override
public void handleResponse(TransportResponse.Empty response) {
logger.debug("successful response to {} from {}", startJoinRequest, destination);
}
@Override
public void handleException(TransportException exp) {
logger.debug(new ParameterizedMessage("failure in response to {} from {}", startJoinRequest, destination), exp);
}
});
}
static class JoinTaskListener implements ClusterStateTaskListener {
private final JoinTaskExecutor.Task task;
private final ActionListener<Void> joinListener;
JoinTaskListener(JoinTaskExecutor.Task task, ActionListener<Void> joinListener) {
this.task = task;
this.joinListener = joinListener;
}
@Override
public void onFailure(String source, Exception e) {
joinListener.onFailure(e);
}
@Override
public void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState) {
joinListener.onResponse(null);
}
@Override
public String toString() {
return "JoinTaskListener{task=" + task + "}";
}
}
interface JoinAccumulator {
void handleJoinRequest(DiscoveryNode sender, ActionListener<Void> joinListener);
default void close(Mode newMode) {}
}
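// One accumulator per coordinator mode: the leader submits each join as a cluster state update straight away, the
// initial and follower accumulators reject joins, and a candidate buffers joins until close(Mode) decides the election,
// at which point they are either submitted as one batch (LEADER) or failed (FOLLOWER).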
class LeaderJoinAccumulator implements JoinAccumulator {
@Override
public void handleJoinRequest(DiscoveryNode sender, ActionListener<Void> joinListener) {
final JoinTaskExecutor.Task task = new JoinTaskExecutor.Task(sender, "join existing leader");
assert joinTaskExecutor != null;
masterService.submitStateUpdateTask(
"node-join",
task,
ClusterStateTaskConfig.build(Priority.URGENT),
joinTaskExecutor,
new JoinTaskListener(task, joinListener)
);
}
@Override
public String toString() {
return "LeaderJoinAccumulator";
}
}
static class InitialJoinAccumulator implements JoinAccumulator {
@Override
public void handleJoinRequest(DiscoveryNode sender, ActionListener<Void> joinListener) {
assert false : "unexpected join from " + sender + " during initialisation";
joinListener.onFailure(new CoordinationStateRejectedException("join target is not initialised yet"));
}
@Override
public String toString() {
return "InitialJoinAccumulator";
}
}
static class FollowerJoinAccumulator implements JoinAccumulator {
@Override
public void handleJoinRequest(DiscoveryNode sender, ActionListener<Void> joinListener) {
joinListener.onFailure(new CoordinationStateRejectedException("join target is a follower"));
}
@Override
public String toString() {
return "FollowerJoinAccumulator";
}
}
class CandidateJoinAccumulator implements JoinAccumulator {
private final Map<DiscoveryNode, ActionListener<Void>> joinRequestAccumulator = new HashMap<>();
boolean closed;
@Override
public void handleJoinRequest(DiscoveryNode sender, ActionListener<Void> joinListener) {
assert closed == false : "CandidateJoinAccumulator closed";
ActionListener<Void> prev = joinRequestAccumulator.put(sender, joinListener);
if (prev != null) {
prev.onFailure(new CoordinationStateRejectedException("received a newer join from " + sender));
}
}
@Override
public void close(Mode newMode) {
assert closed == false : "CandidateJoinAccumulator closed";
closed = true;
if (newMode == Mode.LEADER) {
final Map<JoinTaskExecutor.Task, ClusterStateTaskListener> pendingAsTasks = new LinkedHashMap<>();
joinRequestAccumulator.forEach((key, value) -> {
final JoinTaskExecutor.Task task = new JoinTaskExecutor.Task(key, "elect leader");
pendingAsTasks.put(task, new JoinTaskListener(task, value));
});
final String stateUpdateSource = "elected-as-master ([" + pendingAsTasks.size() + "] nodes joined)";
pendingAsTasks.put(JoinTaskExecutor.newBecomeMasterTask(), (source, e) -> {});
pendingAsTasks.put(JoinTaskExecutor.newFinishElectionTask(), (source, e) -> {});
joinTaskExecutor = joinTaskExecutorGenerator.get();
masterService.submitStateUpdateTasks(
stateUpdateSource,
pendingAsTasks,
ClusterStateTaskConfig.build(Priority.URGENT),
joinTaskExecutor
);
} else {
assert newMode == Mode.FOLLOWER : newMode;
joinTaskExecutor = null;
joinRequestAccumulator.values()
.forEach(joinCallback -> joinCallback.onFailure(new CoordinationStateRejectedException("became follower")));
}
// CandidateJoinAccumulator is only closed when becoming leader or follower, otherwise it accumulates all joins received
// regardless of term.
}
@Override
public String toString() {
return "CandidateJoinAccumulator{" + joinRequestAccumulator.keySet() + ", closed=" + closed + '}';
}
}
}