
com.hazelcast.internal.cluster.impl.ClusterJoinManager Maven / Gradle / Ivy
/*
* Copyright (c) 2008-2024, Hazelcast, Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.hazelcast.internal.cluster.impl;
import com.hazelcast.auditlog.AuditlogTypeIds;
import com.hazelcast.cluster.Address;
import com.hazelcast.cluster.ClusterState;
import com.hazelcast.cluster.Member;
import com.hazelcast.cluster.impl.MemberImpl;
import com.hazelcast.instance.BuildInfo;
import com.hazelcast.instance.impl.Node;
import com.hazelcast.instance.impl.NodeExtension;
import com.hazelcast.internal.cluster.Joiner;
import com.hazelcast.internal.cluster.MemberInfo;
import com.hazelcast.internal.cluster.impl.operations.AuthenticationFailureOp;
import com.hazelcast.internal.cluster.impl.operations.BeforeJoinCheckFailureOp;
import com.hazelcast.internal.cluster.impl.operations.ClusterMismatchOp;
import com.hazelcast.internal.cluster.impl.operations.ConfigMismatchOp;
import com.hazelcast.internal.cluster.impl.operations.FinalizeJoinOp;
import com.hazelcast.internal.cluster.impl.operations.JoinRequestOp;
import com.hazelcast.internal.cluster.impl.operations.MasterResponseOp;
import com.hazelcast.internal.cluster.impl.operations.MembersUpdateOp;
import com.hazelcast.internal.cluster.impl.operations.OnJoinOp;
import com.hazelcast.internal.cluster.impl.operations.WhoisMasterOp;
import com.hazelcast.internal.hotrestart.InternalHotRestartService;
import com.hazelcast.internal.nio.Connection;
import com.hazelcast.internal.nio.Packet;
import com.hazelcast.internal.partition.InternalPartitionService;
import com.hazelcast.internal.partition.PartitionRuntimeState;
import com.hazelcast.internal.server.ServerConnection;
import com.hazelcast.internal.util.BiTuple;
import com.hazelcast.internal.util.Clock;
import com.hazelcast.internal.util.UuidUtil;
import com.hazelcast.logging.ILogger;
import com.hazelcast.security.Credentials;
import com.hazelcast.spi.impl.NodeEngineImpl;
import com.hazelcast.spi.impl.operationservice.Operation;
import com.hazelcast.spi.impl.operationservice.OperationService;
import com.hazelcast.spi.properties.ClusterProperty;
import com.hazelcast.version.MemberVersion;
import com.hazelcast.version.Version;
import javax.security.auth.login.LoginContext;
import javax.security.auth.login.LoginException;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.Future;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.locks.Lock;
import java.util.stream.Collectors;
import static com.hazelcast.cluster.memberselector.MemberSelectors.DATA_MEMBER_SELECTOR;
import static com.hazelcast.instance.EndpointQualifier.MEMBER;
import static com.hazelcast.internal.cluster.impl.MemberMap.SINGLETON_MEMBER_LIST_VERSION;
import static com.hazelcast.internal.cluster.impl.SplitBrainJoinMessage.SplitBrainMergeCheckResult.CANNOT_MERGE;
import static com.hazelcast.internal.cluster.impl.SplitBrainJoinMessage.SplitBrainMergeCheckResult.LOCAL_NODE_SHOULD_MERGE;
import static com.hazelcast.internal.cluster.impl.SplitBrainJoinMessage.SplitBrainMergeCheckResult.REMOTE_NODE_SHOULD_MERGE;
import static com.hazelcast.internal.hotrestart.InternalHotRestartService.PERSISTENCE_ENABLED_ATTRIBUTE;
import static com.hazelcast.internal.util.Preconditions.checkNotNull;
import static java.lang.String.format;
/**
* ClusterJoinManager manages member join process.
*
* If this node is not the master, it answers a join request by sending back
* the master node's address.
*
* If this node is the master, it handles the join request and notifies all
* other members about the newly joined member.
*/
@SuppressWarnings({"checkstyle:methodcount", "checkstyle:classfanoutcomplexity", "checkstyle:npathcomplexity"})
public class ClusterJoinManager {
public static final String STALE_JOIN_PREVENTION_DURATION_PROP = "hazelcast.stale.join.prevention.duration.seconds";
private static final int DEFAULT_STALE_JOIN_PREVENTION_DURATION_IN_SECS = 30;
private static final int CLUSTER_OPERATION_RETRY_COUNT = 100;
// Map of members batched to join, along with their passed OnJoinOp pre-join operation
final Map
> joiningMembers = new LinkedHashMap<>();
private final ILogger logger;
private final Node node;
private final NodeEngineImpl nodeEngine;
private final ClusterServiceImpl clusterService;
private final Lock clusterServiceLock;
private final ClusterClockImpl clusterClock;
private final ClusterStateManager clusterStateManager;
private final Map recentlyJoinedMemberUuids = new HashMap<>();
/**
* Recently left member UUIDs: when a recently crashed member is joining
* with same UUID, typically it will have Persistence feature enabled
* (otherwise it will restart probably on the same address but definitely
* with a new random UUID). In order to support crashed members recovery
* with Persistence, partition table validation does not expect an
* identical partition table.
*
* Accessed by operation & cluster heartbeat threads
*/
private final ConcurrentMap leftMembersUuids = new ConcurrentHashMap<>();
private final long maxWaitMillisBeforeJoin;
private final long waitMillisBeforeJoin;
private final long staleJoinPreventionDurationInMillis;
private final AtomicBoolean migrationDelayActive = new AtomicBoolean();
private final ClusterJoinManagerSyncJoinStrategy syncJoinStrategy;
private volatile boolean joinInProgress;
private ScheduledFuture> minDelayFuture;
private ScheduledFuture> maxDelayFuture;
ClusterJoinManager(Node node, ClusterServiceImpl clusterService, Lock clusterServiceLock) {
this.node = node;
this.clusterService = clusterService;
this.clusterServiceLock = clusterServiceLock;
this.nodeEngine = clusterService.getNodeEngine();
logger = node.getLogger(getClass());
clusterStateManager = clusterService.getClusterStateManager();
clusterClock = clusterService.getClusterClock();
maxWaitMillisBeforeJoin = node.getProperties().getMillis(ClusterProperty.MAX_WAIT_SECONDS_BEFORE_JOIN);
waitMillisBeforeJoin = node.getProperties().getMillis(ClusterProperty.WAIT_SECONDS_BEFORE_JOIN);
staleJoinPreventionDurationInMillis = TimeUnit.SECONDS.toMillis(
Integer.getInteger(STALE_JOIN_PREVENTION_DURATION_PROP, DEFAULT_STALE_JOIN_PREVENTION_DURATION_IN_SECS));
syncJoinStrategy = node.getProperties().getBoolean(ClusterProperty.ASYNC_JOIN_STRATEGY_ENABLED)
? null : new ClusterJoinManagerSyncJoinStrategy(this, logger, maxWaitMillisBeforeJoin, waitMillisBeforeJoin);
}
// Returns true when the join flag is set or, checked under the cluster service lock, members are queued to join.
boolean isJoinInProgress() {
    // fast path: the flag is volatile, so it can be read without the lock
    boolean inProgress = joinInProgress;
    if (!inProgress) {
        clusterServiceLock.lock();
        try {
            inProgress = joinInProgress || !joiningMembers.isEmpty();
        } finally {
            clusterServiceLock.unlock();
        }
    }
    return inProgress;
}
// A mastership claim is signalled by the join flag being set while no members are queued to join.
boolean isMastershipClaimInProgress() {
    clusterServiceLock.lock();
    try {
        if (!joinInProgress) {
            return false;
        }
        return joiningMembers.isEmpty();
    } finally {
        clusterServiceLock.unlock();
    }
}
/**
 * Entry point for an incoming {@link JoinRequestOp}.
 * <p>
 * The request is dropped when this node is not ready or the sender's configuration is rejected. A node that
 * is not master replies with the master address (see {@link MasterResponseOp}), unless the request comes from
 * the node it currently considers master. Otherwise the {@link JoinRequest} is executed, provided no join or
 * mastership claim is already in progress.
 *
 * @param joinRequest the join request
 * @param connection  the connection to the joining node
 * @see JoinRequestOp
 */
public void handleJoinRequest(JoinRequest joinRequest, ServerConnection connection) {
    if (!ensureNodeIsReady() || !ensureValidConfiguration(joinRequest)) {
        return;
    }

    Address joinerAddress = joinRequest.getAddress();
    boolean fromCurrentMaster = joinerAddress.equals(clusterService.getMasterAddress());
    boolean thisNodeIsMaster = clusterService.isMaster();
    // a request coming from the current master is not answered with a master response:
    // the master may have dropped its connection and be trying to join back
    if (!(thisNodeIsMaster || fromCurrentMaster)) {
        sendMasterAnswer(joinerAddress);
        return;
    }

    if (!joinInProgress) {
        executeJoinRequest(joinRequest, connection);
        return;
    }
    if (logger.isFineEnabled()) {
        logger.fine(format("Join or membership claim is in progress, cannot handle join request from %s at the moment",
                joinerAddress));
    }
}
// This node can process join requests only once it has joined a cluster and is running.
private boolean ensureNodeIsReady() {
    boolean ready = clusterService.isJoined() && node.isRunning();
    if (!ready && logger.isFineEnabled()) {
        logger.fine("Node is not ready to process join request...");
    }
    return ready;
}
// Validates the sender's join message; on rejection logs a warning and notifies the sender with a
// ClusterMismatchOp (incompatible message) or a ConfigMismatchOp (configuration mismatch).
private boolean ensureValidConfiguration(JoinMessage joinMessage) {
    Address sender = joinMessage.getAddress();
    boolean valid;
    try {
        valid = isValidJoinMessage(joinMessage);
    } catch (ConfigMismatchException e) {
        logger.warning(format("Received an invalid join request from %s, cause: %s", sender, e.getMessage()));
        OperationService operationService = nodeEngine.getOperationService();
        operationService.send(new ConfigMismatchOp(e.getMessage()), sender);
        return false;
    }
    if (!valid) {
        logger.warning(format("Received an invalid join request from %s, cause: members part of different cluster",
                sender));
        nodeEngine.getOperationService().send(new ClusterMismatchOp(), sender);
    }
    return valid;
}
// Delegates to validateJoinMessage to compare this node's configuration with the joining node's.
// A ConfigMismatchException is propagated to the caller; any other exception is treated as "not valid".
private boolean isValidJoinMessage(JoinMessage joinMessage) {
    try {
        return validateJoinMessage(joinMessage);
    } catch (Exception e) {
        if (e instanceof ConfigMismatchException) {
            throw (ConfigMismatchException) e;
        }
        return false;
    }
}
/**
 * Checks whether the remote node described by {@code joinMessage} is compatible with this node.
 * <p>
 * The packet version is compared first; only if it matches are the two {@link ConfigCheck}s compared.
 * Any exception raised by the comparison is logged as a warning and rethrown unchanged.
 *
 * @param joinMessage the {@link JoinMessage} received from another node.
 * @return {@code true} if the packet version of the join message matches this node's packet version and the
 * configurations are found to be compatible, otherwise {@code false}.
 * @see ConfigCheck
 */
public boolean validateJoinMessage(JoinMessage joinMessage) {
    boolean samePacketVersion = joinMessage.getPacketVersion() == Packet.VERSION;
    if (!samePacketVersion) {
        return false;
    }
    try {
        ConfigCheck remoteConfigCheck = joinMessage.getConfigCheck();
        ConfigCheck localConfigCheck = node.createConfigCheck();
        return localConfigCheck.isCompatible(remoteConfigCheck);
    } catch (Exception e) {
        logger.warning(format("Invalid join request from %s, cause: %s", joinMessage.getAddress(), e.getMessage()));
        throw e;
    }
}
/**
 * Processes, under the cluster service lock, a {@link JoinRequest} sent by a node attempting to join.
 * The request goes through the pre-checks, authentication and validation in that order; the first failing
 * step ends processing. An accepted request is handed to the sync join strategy when one is configured,
 * otherwise the join is started directly.
 *
 * @param joinRequest the join request from a node attempting to join
 * @param connection  the connection of this node to the joining node
 */
private void executeJoinRequest(JoinRequest joinRequest, ServerConnection connection) {
    clusterServiceLock.lock();
    try {
        // short-circuit evaluation keeps the original order: check, authenticate, validate
        boolean rejected = checkJoinRequest(joinRequest, connection)
                || !authenticate(joinRequest, connection)
                || !validateJoinRequest(joinRequest, joinRequest.getAddress());
        if (rejected) {
            return;
        }

        MemberInfo joiningMember = joinRequest.toMemberInfo();
        OnJoinOp providedPreJoinOp = joinRequest.getPreJoinOperation();
        if (syncJoinStrategy != null) {
            syncJoinStrategy.startJoinRequest(joiningMember, providedPreJoinOp);
        } else {
            startJoin(joiningMember, providedPreJoinOp);
        }
    } finally {
        clusterServiceLock.unlock();
    }
}
// Pre-checks for a join request. Returns true when the request must NOT be processed further
// (already handled as an existing member, exclusion on either side, or rejected by the cluster state check).
@SuppressWarnings("checkstyle:npathcomplexity")
private boolean checkJoinRequest(JoinRequest joinRequest, ServerConnection connection) {
    if (checkIfJoinRequestFromAnExistingMember(joinRequest, connection)) {
        return true;
    }

    final InternalHotRestartService hotRestart = node.getNodeExtension().getInternalHotRestartService();
    Address joinerAddress = joinRequest.getAddress();
    UUID joinerUuid = joinRequest.getUuid();

    // the joining member is excluded on this side
    if (hotRestart.isMemberExcluded(joinerAddress, joinerUuid)) {
        logger.fine("cannot join " + joinerAddress + " because it is excluded in cluster start.");
        hotRestart.notifyExcludedMember(joinerAddress);
        return true;
    }

    // this node is listed as excluded by the joining member
    boolean thisNodeExcludedByJoiner = joinRequest.getExcludedMemberUuids().contains(clusterService.getThisUuid());
    if (thisNodeExcludedByJoiner) {
        logger.warning("cannot join " + joinerAddress + " since this node is excluded in its list...");
        hotRestart.handleExcludedMemberUuids(joinerAddress, joinRequest.getExcludedMemberUuids());
        return true;
    }

    return checkClusterStateBeforeJoin(joinerAddress, joinerUuid);
}
// Decides from the cluster state whether the join must be rejected (true) or may proceed (false).
// A known missing member may proceed even when the state does not allow joins.
private boolean checkClusterStateBeforeJoin(Address target, UUID uuid) {
    ClusterState currentState = clusterStateManager.getState();
    if (currentState == ClusterState.IN_TRANSITION) {
        logger.warning("Cluster state is in transition process. Join is not allowed until "
                + "transaction is completed -> "
                + clusterStateManager.stateToString());
        return true;
    }
    if (currentState.isJoinAllowed()) {
        return checkRecentlyJoinedMemberUuidBeforeJoin(target, uuid);
    }
    if (clusterService.isMissingMember(target, uuid)) {
        return false;
    }

    boolean startCompleted = node.getNodeExtension().isStartCompleted();
    String rejection = "Cluster state either is locked or doesn't allow new members to join -> "
            + clusterStateManager.stateToString();
    if (startCompleted) {
        // tell the joiner why it was rejected
        logger.warning(rejection);
        OperationService operationService = nodeEngine.getOperationService();
        operationService.send(new BeforeJoinCheckFailureOp(rejection), target);
    } else {
        // start has not completed: only log, nothing is sent to the joiner
        logger.warning(rejection + ". Silently ignored join request of " + target
                + " because start not completed.");
    }
    return true;
}
void insertIntoRecentlyJoinedMemberSet(Collection extends Member> members) {
cleanupRecentlyJoinedMemberUuids();
if (clusterService.getClusterState().isJoinAllowed()) {
long localTime = Clock.currentTimeMillis();
for (Member member : members) {
recentlyJoinedMemberUuids.put(member.getUuid(), localTime);
}
}
}
// Returns true (reject) when the UUID is still tracked as recently joined, after pruning expired entries.
private boolean checkRecentlyJoinedMemberUuidBeforeJoin(Address target, UUID uuid) {
    cleanupRecentlyJoinedMemberUuids();
    if (!recentlyJoinedMemberUuids.containsKey(uuid)) {
        return false;
    }
    logger.warning("Cannot allow join request from " + target + ", since it has been already joined with " + uuid);
    return true;
}
// Drops every recently-joined entry whose age has reached the stale join prevention duration.
private void cleanupRecentlyJoinedMemberUuids() {
    long now = Clock.currentTimeMillis();
    recentlyJoinedMemberUuids.entrySet()
            .removeIf(entry -> (now - entry.getValue()) >= staleJoinPreventionDurationInMillis);
}
// Authenticates the joining member unless its address is already queued in joiningMembers.
// On failure an AuthenticationFailureOp is sent to the joiner and the error is logged on the security logger.
private boolean authenticate(JoinRequest joinRequest, Connection connection) {
    if (joiningMembers.containsKey(joinRequest.getAddress())) {
        return true;
    }
    try {
        secureLogin(joinRequest, connection);
        return true;
    } catch (Exception e) {
        ILogger securityLogger = node.loggingService.getLogger("com.hazelcast.security");
        nodeEngine.getOperationService().send(new AuthenticationFailureOp(), joinRequest.getAddress());
        securityLogger.severe(e);
        return false;
    }
}
/**
 * Performs the member login for a join request when a security context is configured; does nothing otherwise.
 * <p>
 * On success the {@link LoginContext} is stored in the connection's attribute map. An audit log event carrying
 * the outcome is emitted whether the login passes or fails.
 *
 * @param joinRequest the join request carrying the credentials
 * @param connection  the connection to the joining node
 * @throws SecurityException if the request carries no credentials, or if the login fails (the
 *                           {@link LoginException} is preserved as the cause)
 */
private void secureLogin(JoinRequest joinRequest, Connection connection) {
    if (node.securityContext == null) {
        return;
    }
    Credentials credentials = joinRequest.getCredentials();
    if (credentials == null) {
        throw new SecurityException("Expecting security credentials, but credentials could not be found in join request");
    }
    String endpoint = joinRequest.getAddress().getHost();
    Boolean passed = Boolean.FALSE;
    try {
        String remoteClusterName = joinRequest.getConfigCheck().getClusterName();
        LoginContext loginContext = node.securityContext.createMemberLoginContext(remoteClusterName, credentials,
                connection);
        loginContext.login();
        connection.attributeMap().put(LoginContext.class, loginContext);
        passed = Boolean.TRUE;
    } catch (LoginException e) {
        // keep the original exception as the cause so its stack trace is not lost
        throw new SecurityException(format("Authentication has failed for %s @%s, cause: %s",
                String.valueOf(credentials), endpoint, e.getMessage()), e);
    } finally {
        // runs for both outcomes; 'passed' is TRUE only if every step in the try block completed
        Address remoteAddr = connection == null ? null : connection.getRemoteAddress();
        nodeEngine.getNode().getNodeExtension().getAuditlogService()
                .eventBuilder(AuditlogTypeIds.AUTHENTICATION_MEMBER)
                .message("Member connection authentication.")
                .addParameter("credentials", credentials)
                .addParameter("remoteAddress", remoteAddr)
                .addParameter("endpoint", endpoint)
                .addParameter("passed", passed)
                .log();
    }
}
/**
 * Validates a join request while it is being executed, delegating to
 * {@link NodeExtension#validateJoinRequest(JoinMessage)}. The check only runs on the master; on any other
 * node the request is accepted as-is. A failed validation is logged and reported to {@code target} with a
 * {@link BeforeJoinCheckFailureOp}.
 */
private boolean validateJoinRequest(JoinRequest joinRequest, Address target) {
    if (!clusterService.isMaster()) {
        return true;
    }
    try {
        node.getNodeExtension().validateJoinRequest(joinRequest);
        return true;
    } catch (Exception e) {
        logger.warning(e.getMessage());
        nodeEngine.getOperationService().send(new BeforeJoinCheckFailureOp(e.getMessage()), target);
        return false;
    }
}
/**
 * Sends a join request to {@code toAddress}, falling back to the currently known master address when
 * {@code toAddress} is {@code null}.
 *
 * @param toAddress the currently known master address.
 * @return {@code true} if join request was sent successfully, otherwise {@code false}.
 */
public boolean sendJoinRequest(Address toAddress) {
    Address destination = toAddress != null ? toAddress : clusterService.getMasterAddress();
    JoinRequestOp joinRequestOp = new JoinRequestOp(node.createJoinRequest(destination));
    return nodeEngine.getOperationService().send(joinRequestOp, destination);
}
// Makes this node the master of a new single-member cluster, under the cluster service lock.
// Returns false without changing anything when the node has already joined a cluster.
public boolean setThisMemberAsMaster() {
    clusterServiceLock.lock();
    try {
        boolean alreadyJoined = clusterService.isJoined();
        if (alreadyJoined) {
            logger.warning("Cannot set as master because node is already joined!");
            return false;
        }

        logger.finest("This node is being set as the master");
        Address localAddress = node.getThisAddress();
        MemberVersion localVersion = node.getVersion();

        clusterService.setMasterAddress(localAddress);
        // adopt this member's version as the cluster version only when none is known yet
        if (clusterService.getClusterVersion().isUnknown()) {
            clusterService.getClusterStateManager().setClusterVersion(localVersion.asVersion());
        }
        clusterService.getClusterClock().setClusterStartTime(Clock.currentTimeMillis());
        clusterService.setClusterId(UuidUtil.newUnsecureUUID());
        clusterService.getMembershipManager().setLocalMemberListJoinVersion(SINGLETON_MEMBER_LIST_VERSION);
        clusterService.setJoined(true);
        return true;
    } finally {
        clusterServiceLock.unlock();
    }
}
/**
* Handles a master address received in a {@link MasterResponseOp} and sets the master address, if required.
* The whole method runs under the cluster service lock.
*
* @param masterAddress address of cluster's master, as provided in {@link MasterResponseOp}
* @param callerAddress address of node that sent the {@link MasterResponseOp}
* @see MasterResponseOp
*/
public void handleMasterResponse(Address masterAddress, Address callerAddress) {
clusterServiceLock.lock();
try {
if (logger.isFineEnabled()) {
logger.fine(format("Handling master response %s from %s", masterAddress, callerAddress));
}
// already joined: the master is not changed, the address is only offered to the joiner configuration
if (clusterService.isJoined()) {
if (logger.isFineEnabled()) {
logger.fine(format("Master address information (%s) came from %s. This node is already joined. "
+ "The received master address will be suggested as a temporary member address "
+ "in the TCP joiner configuration.", masterAddress, callerAddress));
}
suggestAddressToKnownMembers(masterAddress);
return;
}
// the caller claims that this node itself is the master: ignore the response
if (node.getThisAddress().equals(masterAddress)) {
logger.warning("Received my address as master address from " + callerAddress);
return;
}
Address currentMaster = clusterService.getMasterAddress();
// no master known yet, or the response confirms the known one: (re)send the join request to it
if (currentMaster == null || currentMaster.equals(masterAddress)) {
setMasterAndJoin(masterAddress);
return;
}
// the node we consider master is itself pointing at a different master: follow it
if (currentMaster.equals(callerAddress)) {
logger.warning(format("Setting master to %s since %s says it is not master anymore", masterAddress,
currentMaster));
setMasterAndJoin(masterAddress);
return;
}
// a third node reports a different master than the one known here; decide by the state of the
// connection to the currently known master
Connection conn = node.getServer().getConnectionManager(MEMBER).get(currentMaster);
if (conn != null && conn.isAlive()) {
logger.info(format("Ignoring master response %s from %s since this node has an active master %s",
masterAddress, callerAddress, currentMaster));
sendJoinRequest(currentMaster);
} else {
logger.warning(format("Ambiguous master response! Received master response %s from %s. "
+ "This node has a master %s, but does not have an active connection to it. "
+ "Master field will be unset now.",
masterAddress, callerAddress, currentMaster));
clusterService.setMasterAddress(null);
}
} finally {
clusterServiceLock.unlock();
}
}
// Registers the given master address as a temporary member address with the TCP-IP joiner.
// Does nothing for this node's own address or when the joiner is not exactly a TcpIpJoiner.
private void suggestAddressToKnownMembers(Address masterAddress) {
    if (node.getThisAddress().equals(masterAddress)) {
        return;
    }
    Joiner joiner = node.getJoiner();
    if (joiner == null || joiner.getClass() != TcpIpJoiner.class) {
        return;
    }
    logger.info(
            format("The address (%s) will be added as a temporary member address to the TCP-IP joiner configuration.",
                    masterAddress));
    TcpIpJoiner tcpIpJoiner = (TcpIpJoiner) joiner;
    tcpIpJoiner.addTemporaryMemberAddress(masterAddress);
}
// Records the master address, gets or opens a member connection to it and sends it a join request.
private void setMasterAndJoin(Address masterAddress) {
    clusterService.setMasterAddress(masterAddress);
    node.getServer().getConnectionManager(MEMBER).getOrConnect(masterAddress);
    boolean joinRequestSent = sendJoinRequest(masterAddress);
    if (!joinRequestSent) {
        logger.warning("Could not create connection to possible master " + masterAddress);
    }
}
/**
 * Asks the node at {@code toAddress} who the master is by sending it a {@link WhoisMasterOp} that
 * carries a {@link JoinMessage} describing this node.
 *
 * @param toAddress the address to which the operation will be sent.
 * @return {@code true} if the operation was sent, otherwise {@code false}.
 */
public boolean sendMasterQuestion(Address toAddress) {
    checkNotNull(toAddress, "No endpoint is specified!");

    JoinMessage selfDescription = new JoinMessage(
            Packet.VERSION,
            node.getBuildInfo().getBuildNumber(),
            node.getVersion(),
            node.getThisAddress(),
            clusterService.getThisUuid(),
            node.isLiteMember(),
            node.createConfigCheck());
    WhoisMasterOp whoisMasterOp = new WhoisMasterOp(selfDescription);
    return nodeEngine.getOperationService().send(whoisMasterOp, toAddress);
}
/**
 * Answers a {@link WhoisMasterOp}. The sender's configuration is validated first. A joined node then replies
 * with the master address, unless the sender was already handled as an existing member; a node that has not
 * joined only logs the question.
 *
 * @param joinMessage the {@code JoinMessage} from the request.
 * @param connection  the connection to operation caller, to which response will be sent.
 * @see WhoisMasterOp
 */
public void answerWhoisMasterQuestion(JoinMessage joinMessage, ServerConnection connection) {
    if (!ensureValidConfiguration(joinMessage)) {
        return;
    }
    if (!clusterService.isJoined()) {
        if (logger.isFineEnabled()) {
            logger.fine(format("Received a master question from %s,"
                    + " but this node is not master itself or doesn't have a master yet!", joinMessage.getAddress()));
        }
        return;
    }
    boolean handledAsExistingMember = checkIfJoinRequestFromAnExistingMember(joinMessage, connection);
    if (!handledAsExistingMember) {
        sendMasterAnswer(joinMessage.getAddress());
    }
}
/**
 * Sends the known master address to {@code target} in a {@link MasterResponseOp}. This is used when this node
 * receives a join request but is not the cluster's master. Nothing is sent when the master is unknown, when
 * this node is the master but is itself excluded according to the hot restart service, or when
 * {@code target} is the known master.
 *
 * @param target the node receiving the master answer
 */
void sendMasterAnswer(Address target) {
    Address knownMaster = clusterService.getMasterAddress();
    if (knownMaster == null) {
        logger.info(format("Cannot send master answer to %s since master node is not known yet", target));
        return;
    }

    if (knownMaster.equals(node.getThisAddress())) {
        InternalHotRestartService hotRestart = node.getNodeExtension().getInternalHotRestartService();
        if (hotRestart.isMemberExcluded(knownMaster, clusterService.getThisUuid())) {
            // I already know that I will do a force-start, so I will not allow target to join me
            logger.info("Cannot send master answer because " + target + " should not join to this master node.");
            return;
        }
    }

    if (knownMaster.equals(target)) {
        logger.fine("Cannot send master answer to " + target + " since it is the known master");
        return;
    }

    nodeEngine.getOperationService().send(new MasterResponseOp(knownMaster), target);
}
// Checks whether the join message comes from an address (or, failing that, a UUID) that already belongs to a
// member of this cluster. Returns true when the message has been dealt with here and must not be processed
// as a fresh join; returns false when the sender is unknown.
@SuppressWarnings("checkstyle:cyclomaticcomplexity")
private boolean checkIfJoinRequestFromAnExistingMember(JoinMessage joinMessage, ServerConnection connection) {
Address targetAddress = joinMessage.getAddress();
MemberImpl member = clusterService.getMember(targetAddress);
// no member at this address: fall back to the UUID-based check
if (member == null) {
return checkIfUsingAnExistingMemberUuid(joinMessage);
}
// same address and same UUID: the sender is the member we already know
if (joinMessage.getUuid().equals(member.getUuid())) {
sendMasterAnswer(targetAddress);
if (clusterService.isMaster() && !isMastershipClaimInProgress()) {
if (logger.isFineEnabled()) {
logger.fine(format("Ignoring join request, member already exists: %s", joinMessage));
}
// send the current member list back to the node that is trying to join again
MemberMap memberMap = clusterService.getMembershipManager().getMemberMap();
boolean deferPartitionProcessing = isMemberRestartingWithPersistence(member.getAttributes())
&& isMemberRejoining(memberMap, member.getAddress(), member.getUuid());
OnJoinOp preJoinOp = preparePreJoinOps();
OnJoinOp postJoinOp = preparePostJoinOp();
PartitionRuntimeState partitionRuntimeState = node.getPartitionService().createPartitionState();
Operation op = new FinalizeJoinOp(member.getUuid(),
clusterService.getMembershipManager().getMembersView(), preJoinOp, postJoinOp,
clusterClock.getClusterTime(), clusterService.getClusterId(),
clusterClock.getClusterStartTime(), clusterStateManager.getState(),
clusterService.getClusterVersion(), partitionRuntimeState, deferPartitionProcessing,
node.getClusterTopologyIntent());
op.setCallerUuid(clusterService.getThisUuid());
invokeClusterOp(op, targetAddress);
}
return true;
}
// Same address but a different UUID.
// If I am the master, I will just suspect the target. If it sends a new join request, it will be processed.
// If I am not the current master, I can turn into the new master and start the claim process
// after I suspect the target.
if (!hasMemberLeft(joinMessage.getUuid())
&& (clusterService.isMaster() || targetAddress.equals(clusterService.getMasterAddress()))) {
String msg = format("New join request has been received from an existing endpoint %s."
+ " Removing old member and processing join request with UUID %s", member, joinMessage.getUuid());
logger.warning(msg);
clusterService.suspectMember(member, msg, false);
// replace the connection registered for this address when it is not the one the message arrived on
ServerConnection existing = node.getServer().getConnectionManager(MEMBER).get(targetAddress);
if (existing != connection) {
if (existing != null) {
existing.close(msg, null);
}
node.getServer().getConnectionManager(MEMBER).register(targetAddress, joinMessage.getUuid(), connection);
}
}
return true;
}
/**
 * Checks whether a member is joining with persistence enabled.
 *
 * @param attributes the member attributes to inspect
 * @return {@code true} only if the persistence-enabled attribute is present with the value {@code "true"}
 */
private boolean isMemberRestartingWithPersistence(Map<String, String> attributes) {
    // constant-first equals covers the missing-attribute case with a single lookup
    return "true".equals(attributes.get(PERSISTENCE_ENABLED_ATTRIBUTE));
}
// A member counts as rejoining when it is known to the cluster in one of three ways AND the partition
// service still holds a left-member snapshot for its UUID.
private boolean isMemberRejoining(MemberMap previousMembersMap, Address address, UUID memberUuid) {
    boolean knownToCluster =
            // may be already detected as crashed member
            hasMemberLeft(memberUuid)
            // or it is still in member list because connection timeout hasn't been reached yet
            || previousMembersMap.contains(memberUuid)
            // or it is a known missing member
            || clusterService.getMembershipManager().isMissingMember(address, memberUuid);
    if (!knownToCluster) {
        return false;
    }
    return node.getPartitionService().getLeftMemberSnapshot(memberUuid) != null;
}
// Detects a join message whose UUID already belongs to a member at a different address.
// Returns true when such a conflict exists: the master (outside a mastership claim) logs a warning,
// any other node answers with the master address. Returns false when there is no conflict.
private boolean checkIfUsingAnExistingMemberUuid(JoinMessage joinMessage) {
    Member sameUuidMember = clusterService.getMember(joinMessage.getUuid());
    Address target = joinMessage.getAddress();
    if (sameUuidMember == null || sameUuidMember.getAddress().equals(target)) {
        return false;
    }

    boolean actingAsMaster = clusterService.isMaster() && !isMastershipClaimInProgress();
    if (!actingAsMaster) {
        sendMasterAnswer(target);
    } else {
        logger.warning("There's already an existing member " + sameUuidMember + " with the same UUID. "
                + target + " is not allowed to join.");
    }
    return true;
}
// Marks a mastership claim as in progress: under the cluster service lock, sets the join flag and
// discards any members queued to join. This is the state isMastershipClaimInProgress() reports as true
// (flag set, no joining members).
void setMastershipClaimInProgress() {
clusterServiceLock.lock();
try {
joinInProgress = true;
joiningMembers.clear();
} finally {
clusterServiceLock.unlock();
}
}
/**
 * Starts join process on master member.
 * <p>
 * With the async strategy (no sync join strategy configured) only {@code memberInfo} is joined and
 * {@code preJoinOperation} is run locally; with the sync strategy every member queued in
 * {@code joiningMembers} is joined in one batch. In both cases the member list is updated on this node first,
 * a {@link FinalizeJoinOp} is sent to each joining member and a {@link MembersUpdateOp} to the other
 * existing members.
 *
 * @param memberInfo       the joining member (used by the async strategy)
 * @param preJoinOperation the pre-join operation provided by the joining member, may be {@code null}
 */
void startJoin(MemberInfo memberInfo, OnJoinOp preJoinOperation) {
    logger.fine("Starting join...");
    if (syncJoinStrategy == null) {
        sendMasterAnswer(memberInfo.getAddress());
    }
    clusterServiceLock.lock();
    try {
        InternalPartitionService partitionService = node.getPartitionService();
        boolean migrationPaused = false;
        try {
            joinInProgress = true;
            if (syncJoinStrategy == null && (maxWaitMillisBeforeJoin > 0 && waitMillisBeforeJoin > 0)) {
                migrationPaused = scheduleMigrationDelay();
            } else {
                // pause migrations until join, member-update and post-join operations are completed
                partitionService.pauseMigration();
                migrationPaused = true;
            }
            MemberMap memberMap = clusterService.getMembershipManager().getMemberMap();
            MembersView newMembersView = MembersView.cloneAdding(memberMap.toMembersView(),
                    syncJoinStrategy == null ? Collections.singletonList(memberInfo)
                            : joiningMembers.values().stream().map(BiTuple::element1).collect(Collectors.toList()));
            long time = clusterClock.getClusterTime();
            // member list must be updated on master before preparation of pre-/post-join ops so other operations which have
            // to be executed on stable cluster can detect the member list version change and retry in case of topology change
            UUID thisUuid = clusterService.getThisUuid();
            if (!clusterService.updateMembers(newMembersView, node.getThisAddress(), thisUuid, thisUuid)) {
                return;
            }
            if (syncJoinStrategy != null) {
                // Run all joining members' provided pre join operations now, but only
                // execute them locally and on existing members of this cluster (do
                // not broadcast to other members joining within this batch)
                runProvidedPostJoinOpsWithoutBroadcastSyncStrategyOnly();
            } else {
                if (preJoinOperation != null) {
                    nodeEngine.getOperationService().run(preJoinOperation);
                }
            }
            // Prepare our normal pre-join operations, which will be broadcast remotely;
            // this must be done AFTER pre-join ops from all joining members are applied
            // to master, via #runProvidedPostJoinOpsWithoutBroadcastSyncStrategyOnly() above
            OnJoinOp preJoinOp = preparePreJoinOps();
            // post join operations must be lock free, that means no locks at all:
            // no partition locks, no key-based locks, no service level locks!
            OnJoinOp postJoinOp = preparePostJoinOp();
            // this is the current partition assignment state, not taking into account the
            // currently joining members
            PartitionRuntimeState partitionRuntimeState = partitionService.createPartitionState();
            // when repartitioning must be delayed the flag is cleared, so the finally block below
            // does not resume migration
            migrationPaused &= shouldTriggerRepartition(memberMap, memberInfo);
            if (syncJoinStrategy != null) {
                for (BiTuple<MemberInfo, OnJoinOp> tuple : joiningMembers.values()) {
                    MemberInfo member = tuple.element1();
                    sendFinalizeJoinOp(member, thisUuid, newMembersView, preJoinOp, postJoinOp,
                            partitionRuntimeState, time, migrationPaused);
                }
            } else {
                sendFinalizeJoinOp(memberInfo, thisUuid, newMembersView, preJoinOp, postJoinOp,
                        partitionRuntimeState, time, migrationPaused);
            }
            updateMembers(memberInfo, memberMap, newMembersView, thisUuid, time, partitionRuntimeState);
        } finally {
            reset();
            if (migrationPaused) {
                partitionService.resumeMigration();
            }
        }
    } finally {
        clusterServiceLock.unlock();
    }
}
/**
 * Decides whether repartitioning may be triggered for the current join.
 *
 * @param memberMap         the member map as it was before the join
 * @param joiningMemberInfo the joining member; only consulted when no sync join strategy is configured
 * @return {@code false} if any joining member requires repartitioning to be delayed for hot restart
 * recovery, otherwise {@code true}
 */
private boolean shouldTriggerRepartition(MemberMap memberMap, MemberInfo joiningMemberInfo) {
    if (syncJoinStrategy == null) {
        return !delayRepartitionForHotRestartRecovery(memberMap, joiningMemberInfo);
    }
    // sync strategy: check the whole batch and stop at the first member that needs the delay
    for (BiTuple<MemberInfo, OnJoinOp> tuple : joiningMembers.values()) {
        if (delayRepartitionForHotRestartRecovery(memberMap, tuple.element1())) {
            return false;
        }
    }
    return true;
}
// Returns true when the member is rejoining with persistence enabled, in which case repartitioning
// should not be triggered immediately: the joining member first has to load its hot-restart data.
private boolean delayRepartitionForHotRestartRecovery(MemberMap memberMap, MemberInfo member) {
    boolean rejoiningWithPersistence = isMemberRestartingWithPersistence(member.getAttributes())
            && isMemberRejoining(memberMap, member.getAddress(), member.getUuid());
    if (rejoiningWithPersistence) {
        logger.info(member + " is rejoining the cluster");
    }
    return rejoiningWithPersistence;
}
// Builds a FinalizeJoinOp for the joining member and invokes it on that member's address.
// The op's defer flag is the negation of shouldTriggerRepartition.
private void sendFinalizeJoinOp(MemberInfo member, UUID thisUuid, MembersView newMembersView,
                                OnJoinOp preJoinOp, OnJoinOp postJoinOp,
                                PartitionRuntimeState partitionRuntimeState, long time,
                                boolean shouldTriggerRepartition) {
    long clusterStartTime = clusterClock.getClusterStartTime();
    boolean deferPartitionProcessing = !shouldTriggerRepartition;
    Operation finalizeJoinOp = new FinalizeJoinOp(
            member.getUuid(),
            newMembersView,
            preJoinOp,
            postJoinOp,
            time,
            clusterService.getClusterId(),
            clusterStartTime,
            clusterStateManager.getState(),
            clusterService.getClusterVersion(),
            partitionRuntimeState,
            deferPartitionProcessing,
            node.getClusterTopologyIntent());
    finalizeJoinOp.setCallerUuid(thisUuid);
    invokeClusterOp(finalizeJoinOp, member.getAddress());
}
/**
 * Sends a {@code MembersUpdateOp} with the new members view to the members in {@code memberMap}.
 * <p>
 * Skipped are: the local member, the member whose address equals {@code memberInfo}'s address,
 * and every member whose address is a key of {@code joiningMembers}.
 *
 * @param memberInfo            the joining member whose address is excluded
 * @param memberMap             members to iterate over
 * @param newMembersView        members view to publish
 * @param thisUuid              UUID set as the caller UUID of each operation
 * @param time                  time value forwarded to each operation
 * @param partitionRuntimeState partition runtime state forwarded to each operation
 */
private void updateMembers(MemberInfo memberInfo, MemberMap memberMap, MembersView newMembersView, UUID thisUuid,
                           long time, PartitionRuntimeState partitionRuntimeState) {
    for (MemberImpl member : memberMap.getMembers()) {
        boolean localOrJoiner = member.localMember()
                || memberInfo.getAddress().equals(member.getAddress());
        boolean excluded = localOrJoiner || joiningMembers.containsKey(member.getAddress());
        if (!excluded) {
            Operation membersUpdate =
                    new MembersUpdateOp(member.getUuid(), newMembersView, time, partitionRuntimeState, true);
            membersUpdate.setCallerUuid(thisUuid);
            invokeClusterOp(membersUpdate, member.getAddress());
        }
    }
}
/**
 * Accessible for testing.
 *
 * @param now the time to compare against the sync join strategy's {@code timeToStartJoin}
 * @return {@code true} when a sync join strategy is set and {@code now} is before its
 *         {@code timeToStartJoin}; {@code false} otherwise
 */
boolean isBatchingJoins(long now) {
    if (syncJoinStrategy == null) {
        return false;
    }
    return now < syncJoinStrategy.timeToStartJoin;
}
/**
 * Runs the {@code OnJoinOp} stored with each entry of {@code joiningMembers}, skipping entries
 * whose second tuple element is {@code null}.
 * <p>
 * For each operation {@code beforeRun()} is called first, then
 * {@code runWithoutBroadcastTo(joiningMembers.keySet())}.
 *
 * @throws RuntimeException wrapping any exception thrown by {@code beforeRun()}
 */
private void runProvidedPostJoinOpsWithoutBroadcastSyncStrategyOnly() {
    // Parameterized (not raw) tuple type, so element2() is statically an OnJoinOp.
    for (BiTuple<?, OnJoinOp> joining : joiningMembers.values()) {
        OnJoinOp providedOp = joining.element2();
        if (providedOp == null) {
            continue;
        }
        try {
            providedOp.beforeRun();
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
        providedOp.runWithoutBroadcastTo(joiningMembers.keySet());
    }
}
/**
 * Wraps the node engine's post-join operations into a single {@code OnJoinOp}.
 *
 * @return the wrapping operation, or {@code null} when the node engine returns {@code null}
 *         or an empty collection
 */
private OnJoinOp preparePostJoinOp() {
    Collection operations = nodeEngine.getPostJoinOperations();
    if (operations == null || operations.isEmpty()) {
        return null;
    }
    return new OnJoinOp(operations);
}
/**
 * Wraps the node engine's pre-join operations into a single {@code OnJoinOp}.
 *
 * @return the wrapping operation, or {@code null} when the node engine returns {@code null}
 *         or an empty collection
 */
private OnJoinOp preparePreJoinOps() {
    Collection operations = nodeEngine.getPreJoinOperations();
    if (operations == null || operations.isEmpty()) {
        return null;
    }
    return new OnJoinOp(operations);
}
/**
 * Invokes the given operation on {@code target} under {@code ClusterServiceImpl.SERVICE_NAME},
 * with the try count set to {@code CLUSTER_OPERATION_RETRY_COUNT}.
 *
 * @param op     operation to invoke
 * @param target address to invoke the operation on
 * @return the future returned by the invocation
 */
private Future invokeClusterOp(Operation op, Address target) {
    Future invocation = nodeEngine.getOperationService()
            .createInvocationBuilder(ClusterServiceImpl.SERVICE_NAME, op, target)
            .setTryCount(CLUSTER_OPERATION_RETRY_COUNT)
            .invoke();
    return invocation;
}
/**
 * Decides which side of a split should merge, based on the given split-brain join message.
 * <p>
 * {@code CANNOT_MERGE} is returned for a {@code null} message or when any of the precondition
 * checks fails; the checks run in a fixed order and stop at the first failure. Otherwise the
 * side with fewer data members merges into the other one, and on equal counts
 * {@code shouldMergeTo} breaks the tie. Whenever the local node is the one to merge, the
 * sender's address is passed to {@code suggestAddressToKnownMembers}.
 *
 * @param joinMessage the split-brain join message received from the other side, may be {@code null}
 * @return {@code CANNOT_MERGE}, {@code LOCAL_NODE_SHOULD_MERGE} or {@code REMOTE_NODE_SHOULD_MERGE}
 */
@SuppressWarnings({"checkstyle:returncount", "checkstyle:npathcomplexity"})
public SplitBrainJoinMessage.SplitBrainMergeCheckResult shouldMerge(SplitBrainJoinMessage joinMessage) {
    if (joinMessage == null) {
        return CANNOT_MERGE;
    }
    if (logger.isFineEnabled()) {
        logger.fine("Checking if we should merge to: " + joinMessage);
    }

    // evaluated left to right, stopping at the first check that fails
    boolean mergeable = checkValidSplitBrainJoinMessage(joinMessage)
            && checkCompatibleSplitBrainJoinMessage(joinMessage)
            && checkMergeTargetIsNotMember(joinMessage)
            && checkClusterStateAllowsJoinBeforeMerge(joinMessage)
            && checkMembershipIntersectionSetEmpty(joinMessage);
    if (!mergeable) {
        return CANNOT_MERGE;
    }

    int theirCount = joinMessage.getDataMemberCount();
    int ourCount = clusterService.getSize(DATA_MEMBER_SELECTOR);

    if (theirCount != ourCount) {
        if (theirCount > ourCount) {
            logger.info("We should merge to " + joinMessage.getAddress()
                    + " because their data member count is bigger than ours ["
                    + theirCount + " > " + ourCount + ']');
            suggestAddressToKnownMembers(joinMessage.getAddress());
            return LOCAL_NODE_SHOULD_MERGE;
        }
        logger.info(joinMessage.getAddress() + " should merge to us "
                + "because our data member count is bigger than theirs ["
                + ourCount + " > " + theirCount + ']');
        return REMOTE_NODE_SHOULD_MERGE;
    }

    // equal data member counts: the address comparison decides
    boolean localNodeMerges = shouldMergeTo(node.getThisAddress(), joinMessage.getAddress());
    if (!localNodeMerges) {
        logger.info(joinMessage.getAddress() + " should merge to us"
                + ", both have the same data member count: " + ourCount);
        return REMOTE_NODE_SHOULD_MERGE;
    }
    logger.info("We should merge to " + joinMessage.getAddress()
            + ", both have the same data member count: " + ourCount);
    suggestAddressToKnownMembers(joinMessage.getAddress());
    return LOCAL_NODE_SHOULD_MERGE;
}
/**
 * Validates the join message through {@code validateJoinMessage}.
 *
 * @param joinMessage the split-brain join message to validate
 * @return {@code true} only when validation succeeds; {@code false} when it fails or throws
 *         (both cases are logged at fine level)
 */
private boolean checkValidSplitBrainJoinMessage(SplitBrainJoinMessage joinMessage) {
    boolean valid;
    try {
        valid = validateJoinMessage(joinMessage);
        if (!valid) {
            logger.fine("Cannot process split brain merge message from " + joinMessage.getAddress()
                    + ", since join-message could not be validated.");
        }
    } catch (Exception e) {
        logger.fine("failure during validating join message", e);
        valid = false;
    }
    return valid;
}
/**
 * Checks that the sender's cluster version equals this cluster's version.
 *
 * @param joinMessage the split-brain join message carrying the other cluster's version
 * @return {@code true} when both cluster versions are equal; {@code false} otherwise
 *         (logged at fine level when enabled)
 */
private boolean checkCompatibleSplitBrainJoinMessage(SplitBrainJoinMessage joinMessage) {
    Version localVersion = clusterService.getClusterVersion();
    if (localVersion.isEqualTo(joinMessage.getClusterVersion())) {
        return true;
    }
    if (logger.isFineEnabled()) {
        logger.fine("Should not merge to " + joinMessage.getAddress() + " because other cluster version is "
                + joinMessage.getClusterVersion() + " while this cluster version is "
                + localVersion);
    }
    return false;
}
/**
 * Checks that the sender of the join message is not a member of this cluster.
 *
 * @param joinMessage the split-brain join message whose sender address is looked up
 * @return {@code true} when the cluster service knows no member at the sender's address;
 *         {@code false} otherwise (logged at fine level when enabled)
 */
private boolean checkMergeTargetIsNotMember(SplitBrainJoinMessage joinMessage) {
    boolean unknownToThisCluster = clusterService.getMember(joinMessage.getAddress()) == null;
    if (unknownToThisCluster) {
        return true;
    }
    if (logger.isFineEnabled()) {
        logger.fine("Should not merge to " + joinMessage.getAddress()
                + ", because it is already member of this cluster.");
    }
    return false;
}
/**
 * Checks that this cluster's current state allows joins.
 *
 * @param joinMessage the split-brain join message; only its address is used, for logging
 * @return {@code true} when the current cluster state reports that join is allowed;
 *         {@code false} otherwise (logged at fine level when enabled)
 */
private boolean checkClusterStateAllowsJoinBeforeMerge(SplitBrainJoinMessage joinMessage) {
    ClusterState currentState = clusterService.getClusterState();
    if (currentState.isJoinAllowed()) {
        return true;
    }
    if (logger.isFineEnabled()) {
        logger.fine("Should not merge to " + joinMessage.getAddress() + ", because this cluster is in "
                + currentState + " state.");
    }
    return false;
}
/**
 * Checks that the member addresses listed in the join message do not overlap with this cluster.
 * <p>
 * If the listed addresses contain this node's address, an explicit suspicion is sent to the
 * sender and the check fails. The check also fails when any address of this cluster's members
 * is listed in the message.
 *
 * @param joinMessage the split-brain join message carrying the other side's member addresses
 * @return {@code true} when no overlap is found, {@code false} otherwise (logged at info level)
 */
private boolean checkMembershipIntersectionSetEmpty(SplitBrainJoinMessage joinMessage) {
    Collection remoteMemberAddresses = joinMessage.getMemberAddresses();
    Address senderAddress = joinMessage.getAddress();

    if (remoteMemberAddresses.contains(node.getThisAddress())) {
        // The request comes from the master of the split, which still lists this node as its member.
        // This is the partial split case; to turn it into a full split, the sender has to remove
        // this node from its cluster.
        clusterService.sendExplicitSuspicion(new MembersViewMetadata(senderAddress, joinMessage.getUuid(),
                senderAddress, joinMessage.getMemberListVersion()));
        logger.info(node.getThisAddress() + " CANNOT merge to " + senderAddress
                + ", because it thinks this-node as its member.");
        return false;
    }

    for (Address localMemberAddress : clusterService.getMemberAddresses()) {
        if (!remoteMemberAddresses.contains(localMemberAddress)) {
            continue;
        }
        logger.info(node.getThisAddress() + " CANNOT merge to " + senderAddress
                + ", because it thinks " + localMemberAddress + " is its member. "
                + "But " + localMemberAddress + " is member of this cluster.");
        return false;
    }
    return true;
}
/**
 * Tie-breaker that tells whether this address should merge to the target address; called when
 * both sides are equal on all other aspects.
 * <p>
 * This is a pure function: the same parameters must always produce the same output.
 * The logic must not be changed, otherwise compatibility will be broken.
 *
 * @param thisAddress   this address
 * @param targetAddress target address
 * @return {@code true} if this address should merge to target, {@code false} otherwise
 * @throws IllegalArgumentException if both addresses have the same host and port
 */
private boolean shouldMergeTo(Address thisAddress, Address targetAddress) {
    String thisKey = bracketedHostAndPort(thisAddress);
    String targetKey = bracketedHostAndPort(targetAddress);
    if (thisKey.equals(targetKey)) {
        throw new IllegalArgumentException("Addresses must be different! This: "
                + thisAddress + ", Target: " + targetAddress);
    }
    // The keys differ at this point, so the comparison result is never zero.
    return thisKey.compareTo(targetKey) > 0;
}

/** Renders an address as {@code [host]:port}, the form compared by {@code shouldMergeTo}. */
private static String bracketedHostAndPort(Address address) {
    return "[" + address.getHost() + "]:" + address.getPort();
}
/**
 * Clears the join state while holding {@code clusterServiceLock}.
 * <p>
 * {@code joinInProgress} is set to {@code false} and {@code joiningMembers} is emptied. With a
 * sync join strategy, the strategy is reset. Without one, the migration delay is cancelled and,
 * if a delay was active, migration is resumed on the partition service.
 */
void reset() {
    clusterServiceLock.lock();
    try {
        joinInProgress = false;
        joiningMembers.clear();
        if (syncJoinStrategy != null) {
            syncJoinStrategy.reset();
        } else if (cancelMigrationTimeout()) {
            node.getPartitionService().resumeMigration();
        }
    } finally {
        clusterServiceLock.unlock();
    }
}
/**
 * Clears the {@code migrationDelayActive} flag and, if it was set, cancels both delay futures.
 * <p>
 * Assumes clusterServiceLock is locked.
 *
 * @return true if timeout needed to be canceled
 */
private boolean cancelMigrationTimeout() {
    boolean delayWasActive = migrationDelayActive.getAndSet(false);
    if (!delayWasActive) {
        return false;
    }
    minDelayFuture.cancel(false);
    maxDelayFuture.cancel(false);
    // only for posterity's sake
    minDelayFuture = null;
    maxDelayFuture = null;
    return true;
}
/**
 * Starts the migration delay, or extends it when one is already active.
 * <p>
 * On the first attempt migration is paused and {@code reset} is scheduled twice: after
 * {@code waitMillisBeforeJoin} (min delay) and after {@code maxWaitMillisBeforeJoin} (max delay).
 * On a subsequent attempt the previous min-delay task is cancelled and a new one is scheduled;
 * the max-delay task is left untouched.
 * <p>
 * Assumes clusterServiceLock is locked.
 *
 * @return {@code true} if this call activated the delay (first attempt), {@code false} if a
 *         delay was already active
 */
private boolean scheduleMigrationDelay() {
    boolean subsequentJoinAttempt = migrationDelayActive.getAndSet(true);
    if (subsequentJoinAttempt) {
        // The cancel must run unconditionally. An expression inside `assert` is not evaluated
        // when assertions are disabled, which would leave the previous min-delay task scheduled.
        boolean cancelled = minDelayFuture.cancel(false);
        assert cancelled : "Something went wrong canceling min delay future";
    }
    minDelayFuture = nodeEngine.getExecutionService().schedule(this::reset,
            waitMillisBeforeJoin, TimeUnit.MILLISECONDS);
    if (!subsequentJoinAttempt) {
        // pause migrations until no more members are trying to join in the same period
        node.getPartitionService().pauseMigration();
        maxDelayFuture = nodeEngine.getExecutionService().schedule(this::reset,
                maxWaitMillisBeforeJoin, TimeUnit.MILLISECONDS);
        return true;
    }
    return false;
}
/**
 * Removes the {@code joiningMembers} entry stored under the given address.
 * Only used for sync join strategy.
 *
 * @param address key of the entry to remove
 */
void removeJoin(Address address) {
joiningMembers.remove(address);
}
/**
 * Records the given member in {@code leftMembersUuids}, keyed by its UUID and mapped to the
 * current {@code Clock} time in milliseconds.
 *
 * @param member the member to record
 */
void addLeftMember(Member member) {
    UUID leftMemberUuid = member.getUuid();
    long recordedAtMillis = Clock.currentTimeMillis();
    leftMembersUuids.put(leftMemberUuid, recordedAtMillis);
}
/**
 * Tells whether the given UUID is currently recorded in {@code leftMembersUuids}.
 *
 * @param memberUuid UUID to look up
 * @return {@code true} if {@code leftMembersUuids} contains the UUID as a key, {@code false} otherwise
 */
public boolean hasMemberLeft(UUID memberUuid) {
return leftMembersUuids.containsKey(memberUuid);
}
/**
 * Removes the entry for the given UUID from {@code leftMembersUuids}.
 *
 * @param memberUuid UUID whose entry is removed
 */
public void removeLeftMember(UUID memberUuid) {
leftMembersUuids.remove(memberUuid);
}
}