// org.infinispan.topology.ClusterTopologyManagerImpl Maven / Gradle / Ivy
package org.infinispan.topology;
import static java.lang.String.format;
import static org.infinispan.factories.KnownComponentNames.ASYNC_TRANSPORT_EXECUTOR;
import static org.infinispan.factories.KnownComponentNames.STATE_TRANSFER_EXECUTOR;
import static org.infinispan.util.logging.LogFactory.CLUSTER;
import static;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import org.infinispan.commands.ReplicableCommand;
import org.infinispan.commons.CacheException;
import org.infinispan.commons.util.CollectionFactory;
import org.infinispan.commons.util.InfinispanCollections;
import org.infinispan.commons.util.ProcessorInfo;
import org.infinispan.commons.util.Util;
import org.infinispan.configuration.cache.CacheMode;
import org.infinispan.configuration.cache.Configuration;
import org.infinispan.executors.LimitedExecutor;
import org.infinispan.factories.GlobalComponentRegistry;
import org.infinispan.factories.annotations.ComponentName;
import org.infinispan.factories.annotations.Inject;
import org.infinispan.factories.annotations.Start;
import org.infinispan.factories.annotations.Stop;
import org.infinispan.globalstate.GlobalStateManager;
import org.infinispan.globalstate.ScopedPersistentState;
import org.infinispan.manager.EmbeddedCacheManager;
import org.infinispan.notifications.Listener;
import org.infinispan.notifications.cachemanagerlistener.CacheManagerNotifier;
import org.infinispan.notifications.cachemanagerlistener.annotation.Merged;
import org.infinispan.notifications.cachemanagerlistener.annotation.ViewChanged;
import org.infinispan.notifications.cachemanagerlistener.event.ViewChangedEvent;
import org.infinispan.partitionhandling.AvailabilityMode;
import org.infinispan.partitionhandling.PartitionHandling;
import org.infinispan.partitionhandling.impl.AvailabilityStrategy;
import org.infinispan.partitionhandling.impl.LostDataCheck;
import org.infinispan.partitionhandling.impl.PreferAvailabilityStrategy;
import org.infinispan.partitionhandling.impl.PreferConsistencyStrategy;
import org.infinispan.remoting.inboundhandler.DeliverOrder;
import org.infinispan.remoting.responses.CacheNotFoundResponse;
import org.infinispan.remoting.responses.ExceptionResponse;
import org.infinispan.remoting.responses.Response;
import org.infinispan.remoting.responses.SuccessfulResponse;
import org.infinispan.remoting.rpc.ResponseFilter;
import org.infinispan.remoting.rpc.ResponseMode;
import org.infinispan.remoting.transport.Address;
import org.infinispan.remoting.transport.Transport;
import org.infinispan.remoting.transport.jgroups.SuspectException;
import org.infinispan.statetransfer.RebalanceType;
import org.infinispan.util.concurrent.CompletableFutures;
import org.infinispan.util.concurrent.TimeoutException;
import org.infinispan.util.logging.Log;
import org.infinispan.util.logging.LogFactory;
import net.jcip.annotations.GuardedBy;
/**
 * The {@code ClusterTopologyManager} implementation.
 *
 * @author Dan Berindei
 * @author Pedro Ruivo
 * @since 5.2
 */
public class ClusterTopologyManagerImpl implements ClusterTopologyManager {
public static final int INITIAL_CONNECTION_ATTEMPTS = 10;
public static final int CLUSTER_RECOVERY_ATTEMPTS = 10;
private static final Log log = LogFactory.getLog(ClusterTopologyManagerImpl.class);
private static final boolean trace = log.isTraceEnabled();
@Inject private Transport transport;
@Inject private GlobalConfiguration globalConfiguration;
@Inject private GlobalComponentRegistry gcr;
@Inject private CacheManagerNotifier cacheManagerNotifier;
@Inject private EmbeddedCacheManager cacheManager;
private ExecutorService asyncTransportExecutor;
private ExecutorService stateTransferExecutor;
@Inject private EventLogManager eventLogManager;
@Inject private PersistentUUIDManager persistentUUIDManager;
// These need to be volatile because they are sometimes read without holding the view handling lock.
private volatile int viewId = -1;
private volatile ClusterManagerStatus clusterManagerStatus = ClusterManagerStatus.INITIALIZING;
private final Lock clusterManagerLock = new ReentrantLock();
private final Condition clusterStateChanged = clusterManagerLock.newCondition();
private final ConcurrentMap cacheStatusMap = CollectionFactory.makeConcurrentMap();
private ClusterViewListener viewListener;
private LimitedExecutor viewHandlingExecutor;
// The global rebalancing status
private volatile boolean globalRebalancingEnabled = true;
@Start(priority = 100)
public void start() {
viewHandlingExecutor = new LimitedExecutor("ViewHandling", asyncTransportExecutor, 1);
viewListener = new ClusterViewListener();
// The listener already missed the initial view
viewHandlingExecutor.execute(() -> handleClusterView(false, transport.getViewId()));
protected void fetchRebalancingStatusFromCoordinator() {
if (!transport.isCoordinator()) {
// Assume any timeout is because the coordinator doesn't have a CommandAwareRpcDispatcher yet
// (possible with a JGroupsChannelLookup and shouldConnect = false), and retry.
ReplicableCommand command = new CacheTopologyControlCommand(null,
CacheTopologyControlCommand.Type.POLICY_GET_STATUS, transport.getAddress(), -1);
Address coordinator = null;
Response response = null;
for (int i = INITIAL_CONNECTION_ATTEMPTS - 1; i >= 0; i--) {
try {
coordinator = transport.getCoordinator();
Map responseMap = transport
.invokeRemotely(Collections.singleton(coordinator), command, ResponseMode.SYNCHRONOUS,
getGlobalTimeout() / INITIAL_CONNECTION_ATTEMPTS, null, DeliverOrder.NONE, false);
response = responseMap.get(coordinator);
} catch (Exception e) {
if (i == 0 || !(e instanceof TimeoutException)) {
log.errorReadingRebalancingStatus(coordinator, e);
response = SuccessfulResponse.create(Boolean.TRUE);
log.debug("Timed out waiting for rebalancing status from coordinator, trying again");
if (response instanceof SuccessfulResponse) {
globalRebalancingEnabled = ((Boolean) ((SuccessfulResponse) response).getResponseValue());
} else {
log.errorReadingRebalancingStatus(coordinator, new CacheException(Objects.toString(response)));
@Stop(priority = 100)
public void stop() {
// Stop blocking cache topology commands.
try {
clusterManagerStatus = ClusterManagerStatus.STOPPING;
} finally {
if (viewListener != null) {
if (viewHandlingExecutor != null) {
public ClusterManagerStatus getStatus() {
return clusterManagerStatus;
public CacheStatusResponse handleJoin(String cacheName, Address joiner, CacheJoinInfo joinInfo,
int joinerViewId) throws Exception {
ClusterCacheStatus cacheStatus;
try {
waitForJoinerView(joiner, joinerViewId, joinInfo.getTimeout());
if (!clusterManagerStatus.isRunning()) {
log.debugf("Ignoring join request from %s for cache %s, the local cache manager is shutting down",
joiner, cacheName);
return null;
if (joinerViewId < viewId) {
log.debugf("Ignoring join request from %s for cache %s, joiner's view id is too old: %d", joiner,
cacheName, joinerViewId);
return null;
cacheStatus = initCacheStatusIfAbsent(cacheName, joinInfo.getCacheMode());
} finally {
return cacheStatus.doJoin(joiner, joinInfo);
public void handleLeave(String cacheName, Address leaver, int viewId) throws Exception {
if (!clusterManagerStatus.isRunning()) {
log.debugf("Ignoring leave request from %s for cache %s, the local cache manager is shutting down",
leaver, cacheName);
ClusterCacheStatus cacheStatus = cacheStatusMap.get(cacheName);
if (cacheStatus == null) {
// This can happen if we've just become coordinator
log.tracef("Ignoring leave request from %s for cache %s because it doesn't have a cache status entry", leaver, cacheName);
if (cacheStatus.doLeave(leaver)) {
public void handleRebalancePhaseConfirm(String cacheName, Address node, int topologyId, Throwable throwable, int viewId) throws Exception {
if (throwable != null) {
// TODO We could try to update the pending CH such that nodes reporting errors are not considered to hold any state
// For now we are just logging the error and proceeding as if the rebalance was successful everywhere
log.rebalanceError(cacheName, node, topologyId, throwable);
ClusterCacheStatus cacheStatus = cacheStatusMap.get(cacheName);
if (cacheStatus == null) {
log.debugf("Ignoring rebalance confirmation from %s " +
"for cache %s because it doesn't have a cache status entry", node, cacheName);
cacheStatus.confirmRebalancePhase(node, topologyId);
private static class CacheTopologyFilterReuser implements ResponseFilter {
Map seenTopologies = new HashMap<>();
Map seenInfos = new HashMap<>();
public boolean isAcceptable(Response response, Address sender) {
if (response.isSuccessful()) {
ManagerStatusResponse value = (ManagerStatusResponse) ((SuccessfulResponse)response).getResponseValue();
for (Entry entry : value.getCaches().entrySet()) {
CacheStatusResponse csr = entry.getValue();
CacheTopology cacheTopology = csr.getCacheTopology();
CacheTopology stableTopology = csr.getStableTopology();
CacheTopology replaceCacheTopology = seenTopologies.get(cacheTopology);
if (replaceCacheTopology == null) {
seenTopologies.put(cacheTopology, cacheTopology);
replaceCacheTopology = cacheTopology;
CacheTopology replaceStableTopology;
// If the don't equal check if we replace - note stableTopology can be null
if (!Objects.equals(cacheTopology, stableTopology)) {
replaceStableTopology = seenTopologies.get(stableTopology);
if (replaceStableTopology == null) {
seenTopologies.put(stableTopology, stableTopology);
replaceStableTopology = stableTopology;
} else {
// Since they were equal replace it with the cache topology we are going to use
replaceStableTopology = replaceCacheTopology;
CacheJoinInfo info = csr.getCacheJoinInfo();
CacheJoinInfo replaceInfo = seenInfos.get(info);
if (replaceInfo == null) {
seenInfos.put(info, info);
if (replaceCacheTopology != null || replaceStableTopology != null || replaceInfo != null) {
entry.setValue(new CacheStatusResponse(replaceInfo != null ? replaceInfo : info,
replaceCacheTopology, replaceStableTopology, csr.getAvailabilityMode()));
return true;
public boolean needMoreResponses() {
return true;
private void handleClusterView(boolean mergeView, int newViewId) {
try {
if (!updateClusterState(mergeView, newViewId)) {
// The LimitedExecutor acts as a critical section, so we don't need to worry about multiple threads.
if (clusterManagerStatus == ClusterManagerStatus.RECOVERING_CLUSTER) {
if (!becomeCoordinator(newViewId)) {
if (clusterManagerStatus == ClusterManagerStatus.COORDINATOR) {
// If we have recovered the cluster status, we rebalance the caches to include minor partitions
// If we processed a regular view, we prune members that left.
} catch (Throwable t) {
log.viewHandlingError(newViewId, t);
private boolean becomeCoordinator(int newViewId) {
// Clean up leftover cache status information from the last time we were coordinator.
// E.g. if the local node was coordinator, started a rebalance, and then lost coordinator
// status because of a merge, the existing cache statuses may have a rebalance in progress.
try {
recoverClusterStatus(newViewId, transport.getMembers());
try {
if (viewId != newViewId) {
log.debugf("View updated while we were recovering the cluster for view %d", newViewId);
return false;
clusterManagerStatus = ClusterManagerStatus.COORDINATOR;
// notify threads that might be waiting to join
} finally {
} catch (InterruptedException e) {
if (trace)
log.tracef("Cluster state recovery interrupted because the coordinator is shutting down");
} catch (SuspectException e) {
if (trace)
log.tracef("Cluster state recovery interrupted because a member was lost. Will retry.");
} catch (Exception e) {
if (clusterManagerStatus.isRunning()) {
.fatal(EventLogCategory.CLUSTER, MESSAGES.clusterRecoveryFailed(transport.getMembers()));
} else {
log.tracef("Cluster state recovery failed because the coordinator is shutting down");
return true;
private boolean updateClusterState(boolean mergeView, int newViewId) {
try {
if (newViewId < transport.getViewId()) {
log.tracef("Ignoring old cluster view notification: %s", newViewId);
return false;
boolean isCoordinator = transport.isCoordinator();
boolean becameCoordinator = isCoordinator && !clusterManagerStatus.isCoordinator();
if (trace) {
log.tracef("Received new cluster view: %d, isCoordinator = %s, old status = %s", (Object) newViewId,
isCoordinator, clusterManagerStatus);
if (!isCoordinator) {
clusterManagerStatus = ClusterManagerStatus.REGULAR_MEMBER;
return false;
if (becameCoordinator || mergeView) {
clusterManagerStatus = ClusterManagerStatus.RECOVERING_CLUSTER;
// notify threads that might be waiting to join
viewId = newViewId;
} finally {
return true;
private ClusterCacheStatus initCacheStatusIfAbsent(String cacheName, CacheMode cacheMode) {
return cacheStatusMap.computeIfAbsent(cacheName, (name) -> {
// We assume that any cache with partition handling configured is already defined on all the nodes
// (including the coordinator) before it starts on any node.
LostDataCheck lostDataCheck;
if (cacheMode.isScattered()) {
lostDataCheck = ClusterTopologyManagerImpl::scatteredLostDataCheck;
} else {
lostDataCheck = ClusterTopologyManagerImpl::distLostDataCheck;
AvailabilityStrategy availabilityStrategy;
Configuration config = cacheManager.getCacheConfiguration(cacheName);
PartitionHandling partitionHandling = config != null ? config.clustering().partitionHandling().whenSplit() : null;
boolean resolveConflictsOnMerge = resolveConflictsOnMerge(config, cacheMode);
if (partitionHandling != null && partitionHandling != PartitionHandling.ALLOW_READ_WRITES) {
availabilityStrategy = new PreferConsistencyStrategy(eventLogManager, persistentUUIDManager, lostDataCheck);
} else {
availabilityStrategy = new PreferAvailabilityStrategy(eventLogManager, persistentUUIDManager, lostDataCheck);
Optional globalStateManager = gcr.getOptionalComponent(GlobalStateManager.class);
Optional persistedState = globalStateManager.flatMap(gsm -> gsm.readScopedState(cacheName));
return new ClusterCacheStatus(cacheManager, cacheName, availabilityStrategy, RebalanceType.from(cacheMode),
this, transport,
persistentUUIDManager, eventLogManager, persistedState, resolveConflictsOnMerge);
private boolean resolveConflictsOnMerge(Configuration config, CacheMode cacheMode) {
if (config == null || cacheMode.isScattered() || cacheMode.isInvalidation())
return false;
return config.clustering().partitionHandling().resolveConflictsOnMerge();
public void broadcastRebalanceStart(String cacheName, CacheTopology cacheTopology, boolean totalOrder, boolean distributed) {
ReplicableCommand command = new CacheTopologyControlCommand(cacheName,
CacheTopologyControlCommand.Type.REBALANCE_START, transport.getAddress(), cacheTopology, null,
executeOnClusterAsync(command, getGlobalTimeout(), totalOrder, distributed);
private void recoverClusterStatus(int newViewId, final List clusterMembers) throws Exception {
log.debugf("Recovering cluster status for view %d", newViewId);
ReplicableCommand command = new CacheTopologyControlCommand(null,
CacheTopologyControlCommand.Type.GET_STATUS, transport.getAddress(), newViewId);
Map statusResponses = null;
// Assume any timeout is because one of the nodes didn't have a CommandAwareRpcDispatcher
// installed at the time (possible with JGroupsChannelLookup and shouldConnect == false), and retry.
for (int i = CLUSTER_RECOVERY_ATTEMPTS - 1; i >= 0; i--) {
try {
statusResponses =
executeOnClusterSync(command, getGlobalTimeout() / CLUSTER_RECOVERY_ATTEMPTS, false, false,
new CacheTopologyFilterReuser());
} catch (ExecutionException e) {
if (i != 0) {
if (e.getCause() instanceof TimeoutException) {
log.debug("Timed out waiting for cluster status responses, trying again");
} else if (e.getCause() instanceof SuspectException) {
if (transport.getMembers().containsAll(clusterMembers)) {
int sleepTime = getGlobalTimeout() / CLUSTER_RECOVERY_ATTEMPTS / 2;
log.debugf(e, "Received an exception from one of the members, will try again after %d ms", sleepTime);
throw e;
log.debugf("Got %d status responses. members are %s", statusResponses.size(), clusterMembers);
Map> responsesByCache = new HashMap<>();
boolean recoveredRebalancingStatus = true;
for (Map.Entry responseEntry : statusResponses.entrySet()) {
Address sender = responseEntry.getKey();
ManagerStatusResponse nodeStatus = (ManagerStatusResponse) responseEntry.getValue();
recoveredRebalancingStatus &= nodeStatus.isRebalancingEnabled();
for (Map.Entry statusEntry : nodeStatus.getCaches().entrySet()) {
String cacheName = statusEntry.getKey();
Map cacheResponses = responsesByCache.computeIfAbsent(cacheName, k -> new HashMap<>());
cacheResponses.put(sender, statusEntry.getValue());
globalRebalancingEnabled = recoveredRebalancingStatus;
// Compute the new consistent hashes on separate threads
int maxThreads = ProcessorInfo.availableProcessors() / 2 + 1;
CountDownLatch latch = new CountDownLatch(responsesByCache.size());
LimitedExecutor cs = new LimitedExecutor("Merge-" + newViewId, stateTransferExecutor, maxThreads);
for (final Map.Entry> e : responsesByCache.entrySet()) {
CacheJoinInfo joinInfo = e.getValue().values().iterator().next().getCacheJoinInfo();
ClusterCacheStatus cacheStatus = initCacheStatusIfAbsent(e.getKey(), joinInfo.getCacheMode());
cs.execute(() -> {
try {
} finally {
latch.await(getGlobalTimeout(), TimeUnit.MILLISECONDS);
public void updateCacheMembers(List newClusterMembers) {
try {
log.tracef("Updating cluster members for all the caches. New list is %s", newClusterMembers);
try {
// If we get a SuspectException here, it means we will have a new view soon and we can ignore this one.
} catch (SuspectException e) {
log.tracef("Node %s left while updating cache members", e.getSuspect());
for (ClusterCacheStatus cacheStatus : cacheStatusMap.values()) {
} catch (Exception e) {
if (clusterManagerStatus.isRunning()) {
private void confirmMembersAvailable() throws Exception {
transport.invokeRemotely(null, HeartBeatCommand.INSTANCE, ResponseMode.SYNCHRONOUS, getGlobalTimeout(), null, DeliverOrder.NONE, false);
/**
 * Wait until we have received view {@code joinerViewId} and we have finished recovering the cluster state.
 * <p>
 * Returns early if the node is shutting down.
 * <p>
 * This method should be invoked with the lock held.
 *
 * @throws TimeoutException if the timeout expired.
 */
private void waitForJoinerView(Address joiner, int joinerViewId, long timeout)
throws InterruptedException {
if (joinerViewId > viewId || clusterManagerStatus == ClusterManagerStatus.RECOVERING_CLUSTER) {
if (trace) {
if (joinerViewId > viewId) {
log.tracef("Waiting to install view %s before processing join request from %s",
joinerViewId, joiner);
} else {
log.tracef("Waiting to recover cluster status before processing join request from %s", joiner);
long nanosTimeout = TimeUnit.MILLISECONDS.toNanos(timeout);
while ((viewId < joinerViewId || clusterManagerStatus == ClusterManagerStatus.RECOVERING_CLUSTER) &&
clusterManagerStatus.isRunning()) {
if (nanosTimeout <= 0) {
throw log.coordinatorTimeoutWaitingForView(joinerViewId, transport.getViewId(), clusterManagerStatus);
nanosTimeout = clusterStateChanged.awaitNanos(nanosTimeout);
private Map
executeOnClusterSync(final ReplicableCommand command, final int timeout,
boolean totalOrder, boolean distributed, final ResponseFilter filter)
throws Exception {
// first invoke remotely
if (totalOrder) {
Map responseMap = transport.invokeRemotely(transport.getMembers(), command,
timeout, filter, DeliverOrder.TOTAL, distributed);
return extractResponseValues(responseMap, null);