
com.hazelcast.instance.impl.KubernetesTopologyIntentTracker

/*
 * Copyright (c) 2008-2024, Hazelcast, Inc. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.hazelcast.instance.impl;

import com.hazelcast.cluster.ClusterState;
import com.hazelcast.config.InvalidConfigurationException;
import com.hazelcast.internal.cluster.impl.ClusterServiceImpl;
import com.hazelcast.internal.partition.impl.InternalPartitionServiceImpl;
import com.hazelcast.internal.util.BiTuple;
import com.hazelcast.internal.util.ExceptionUtil;
import com.hazelcast.logging.ILogger;
import com.hazelcast.spi.utils.RetryUtils;

import javax.annotation.Nullable;
import java.time.Duration;
import java.time.Instant;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicReference;

import static com.hazelcast.spi.properties.ClusterProperty.CLUSTER_SHUTDOWN_TIMEOUT_SECONDS;
import static com.hazelcast.spi.properties.ClusterProperty.PERSISTENCE_AUTO_CLUSTER_STATE_STRATEGY;
import static java.lang.Thread.sleep;

/**
 * Implementation of {@link ClusterTopologyIntentTracker} that automates cluster state management
 * according to the intent of topology changes detected in the kubernetes environment.
 * <p>
 * Example flow of change in detected intent and associated {@code currentClusterSpecSize} on a cluster's
 * master member (user action on the left, detected intent with cluster spec size in parentheses on the right):
 * <pre>
 * {@code
 * +-----------------------------------------------------------------+---------------------------------+
 * | $ helm install hz --set cluster.memberCount=3 \                 | NOT_IN_MANAGED_CONTEXT(-1) ->   |
 * |             hazelcast-enterprise-5.3.1-gcs.tgz                  | CLUSTER_START(3) ->             |
 * |                                                                 | (after pods are started)        |
 * |                                                                 | STABLE(3)                       |
 * +-----------------------------------------------------------------+---------------------------------+
 * | $ kubectl scale sts hz-hazelcast-enterprise --replicas 5        | SCALING(5) ->                   |
 * |                                                                 | (after new pods are started)    |
 * |                                                                 | STABLE(5)                       |
 * +-----------------------------------------------------------------+---------------------------------+
 * | $ kubectl delete pod hz-hazelcast-enterprise-2                  | MISSING_MEMBERS(5) ->           |
 * |   (simulating kubernetes deleted a pod)                         | (after pod is restarted)        |
 * |                                                                 | STABLE(5)                       |
 * +-----------------------------------------------------------------+---------------------------------+
 * | $ kubectl scale sts hz-hazelcast-enterprise --replicas 0        | CLUSTER_SHUTDOWN(0)             |
 * +-----------------------------------------------------------------+---------------------------------+
 * }
 * </pre>
 */
public class KubernetesTopologyIntentTracker implements ClusterTopologyIntentTracker {

    /**
     * Currently detected cluster topology intent.
     */
    private final AtomicReference<ClusterTopologyIntent> clusterTopologyIntent
            = new AtomicReference<>(ClusterTopologyIntent.NOT_IN_MANAGED_CONTEXT);
    // single-threaded executor for actions in response to cluster topology intent changes
    private final ExecutorService clusterTopologyExecutor;
    // applies when automatic cluster state management is enabled (with persistence in kubernetes)
    private final ClusterState clusterStateForMissingMembers;
    private final ILogger logger;
    private final Node node;

    /**
     * The desired number of members, as specified in the runtime environment. e.g. in kubernetes
     * {@code kubectl scale sts hz --replicas 5} means {@code currentClusterSpecSize} is 5.
     */
    private volatile int currentClusterSpecSize = UNKNOWN;

    /**
     * The last known cluster spec size while cluster was detected in {@link ClusterTopologyIntent#CLUSTER_STABLE}
     * intent. Used during cluster-wide shutdown, while {@link #currentClusterSpecSize} is {@code 0}.
     */
    private volatile int lastKnownStableClusterSpecSize = UNKNOWN;

    /**
     * Current Hazelcast cluster size, as observed by {@link com.hazelcast.internal.cluster.ClusterService}.
     */
    private volatile int currentClusterSize;
    private volatile boolean enabled;

    public KubernetesTopologyIntentTracker(Node node) {
        this.clusterStateForMissingMembers = node.getProperties()
                .getEnum(PERSISTENCE_AUTO_CLUSTER_STATE_STRATEGY, ClusterState.class);
        if (clusterStateForMissingMembers != ClusterState.FROZEN
                && clusterStateForMissingMembers != ClusterState.NO_MIGRATION) {
            throw new InvalidConfigurationException("Value of property "
                    + PERSISTENCE_AUTO_CLUSTER_STATE_STRATEGY.getName() + " was " + clusterStateForMissingMembers
                    + " but should be one of FROZEN, NO_MIGRATION.");
        }
        this.clusterTopologyExecutor = Executors.newSingleThreadExecutor();
        this.logger = node.getLogger(ClusterTopologyIntentTracker.class);
        this.node = node;
    }

    @Override
    public void initialize() {
        enabled = true;
    }

    @Override
    public void destroy() {
        clusterTopologyExecutor.shutdown();
    }

    @Override
    public void update(int previousSpecifiedReplicas, int updatedSpecifiedReplicas,
                       int previousReadyReplicas, int updatedReadyReplicas,
                       int previousCurrentReplicas, int updatedCurrentReplicas) {
        final int previousClusterSpecSizeValue = this.currentClusterSpecSize;
        this.currentClusterSpecSize = updatedSpecifiedReplicas;
        if (previousSpecifiedReplicas == UNKNOWN) {
            handleInitialUpdate(updatedSpecifiedReplicas, updatedReadyReplicas);
            return;
        }
        final ClusterTopologyIntent previous = clusterTopologyIntent.get();
        ClusterTopologyIntent newTopologyIntent;
        Runnable postUpdateActionOnMaster = null;
        if (updatedSpecifiedReplicas == 0) {
            newTopologyIntent = handleShutdownUpdate(previousClusterSpecSizeValue, previous);
        } else if (previousSpecifiedReplicas == updatedSpecifiedReplicas) {
            if (ignoreUpdateWhenClusterSpecEqual(previous, updatedReadyReplicas)) {
                return;
            }
            BiTuple<ClusterTopologyIntent, Runnable> t = nextIntentWhenClusterSpecEqual(previous,
                    previousReadyReplicas, updatedReadyReplicas,
                    previousCurrentReplicas, updatedCurrentReplicas);
            newTopologyIntent = t.element1;
            postUpdateActionOnMaster = t.element2;
        } else {
            newTopologyIntent = ClusterTopologyIntent.SCALING;
            postUpdateActionOnMaster = () -> changeClusterState(ClusterState.ACTIVE);
        }
        if (clusterTopologyIntent.compareAndSet(previous, newTopologyIntent)) {
            onClusterTopologyIntentUpdate(previous, newTopologyIntent, postUpdateActionOnMaster);
        }
    }
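    // Illustrative trace (added for exposition, not part of the original source) of update() on the master
    // member during a scale-up from 3 to 5 replicas, with arguments
    // (prevSpec, newSpec, prevReady, newReady, prevCurrent, newCurrent):
    //   update(3, 5, 3, 3, 3, 3) -> spec size changed: intent becomes SCALING, cluster state is set to ACTIVE
    //   update(5, 5, 3, 5, 3, 5) -> ready count now matches the spec: intent settles on CLUSTER_STABLE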
    private void handleInitialUpdate(int currentSpecifiedReplicaCount, int readyReplicasCount) {
        if (currentSpecifiedReplicaCount > 0
                && (readyReplicasCount == UNKNOWN || readyReplicasCount == 0)) {
            // startup of first member of new cluster
            logger.info("Cluster starting in managed context");
            clusterTopologyIntent.set(ClusterTopologyIntent.CLUSTER_START);
        } else {
            logger.info("Member starting in managed context");
            clusterTopologyIntent.set(ClusterTopologyIntent.IN_MANAGED_CONTEXT_UNKNOWN);
        }
    }

    private ClusterTopologyIntent handleShutdownUpdate(int previousClusterSpecSizeValue,
                                                       ClusterTopologyIntent previous) {
        ClusterTopologyIntent newTopologyIntent;
        if (previousClusterSpecSizeValue > 0) {
            this.lastKnownStableClusterSpecSize = previousClusterSpecSizeValue;
        }
        newTopologyIntent = nextIntentWhenShuttingDown(previous);
        return newTopologyIntent;
    }

    private void onClusterTopologyIntentUpdate(ClusterTopologyIntent previous, ClusterTopologyIntent newTopologyIntent,
                                               @Nullable Runnable actionOnMaster) {
        logger.info("Cluster topology intent: " + previous + " -> " + newTopologyIntent);
        clusterTopologyExecutor.submit(() -> {
            node.getNodeExtension().getInternalHotRestartService().onClusterTopologyIntentChange();
            if (!node.isMaster()) {
                return;
            }
            if (actionOnMaster != null) {
                actionOnMaster.run();
            }
        });
    }

    private ClusterTopologyIntent nextIntentWhenShuttingDown(ClusterTopologyIntent previous) {
        // if members were previously missing, next intent is CLUSTER_SHUTDOWN_WITH_MISSING_MEMBERS,
        // otherwise plain CLUSTER_SHUTDOWN
        return previous == ClusterTopologyIntent.CLUSTER_STABLE_WITH_MISSING_MEMBERS
                || previous == ClusterTopologyIntent.CLUSTER_SHUTDOWN_WITH_MISSING_MEMBERS
                ? ClusterTopologyIntent.CLUSTER_SHUTDOWN_WITH_MISSING_MEMBERS
                : ClusterTopologyIntent.CLUSTER_SHUTDOWN;
    }

    /**
     * Decide whether an update from managed context should be ignored when cluster spec size stays the same.
     *
     * @return {@code true} if update should be ignored, otherwise {@code false}.
     */
    private boolean ignoreUpdateWhenClusterSpecEqual(ClusterTopologyIntent previous, int readyNodesCount) {
        if (readyNodesCount != currentClusterSpecSize
                && (previous == ClusterTopologyIntent.SCALING
                    || previous == ClusterTopologyIntent.IN_MANAGED_CONTEXT_UNKNOWN
                    || previous == ClusterTopologyIntent.CLUSTER_START)) {
            logger.info("Ignoring update because readyNodesCount is " + readyNodesCount + ", while spec requires "
                    + currentClusterSpecSize + " and previous cluster topology intent is " + previous);
            return true;
        }
        return false;
    }
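    // nextIntentWhenClusterSpecEqual() below distinguishes a rollout restart (pods going down and coming
    // back while the spec size stays the same) from a genuinely stable cluster, using only the ready and
    // current replica counts that Kubernetes reports; it returns the next intent paired with an optional
    // follow-up action that runs on the master member.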
    /**
     * @return next {@code ClusterTopologyIntent} according to rules applicable while there is no change
     * in cluster size specification in managed context (i.e. StatefulSetSpec.size remains the same, so
     * the user does not intend a change in cluster size).
     */
    private BiTuple<ClusterTopologyIntent, Runnable> nextIntentWhenClusterSpecEqual(ClusterTopologyIntent previous,
            int previousReadyReplicas, int updatedReadyReplicas,
            int previousCurrentReplicas, int updatedCurrentReplicas) {
        ClusterTopologyIntent next = previous;
        Runnable action = null;
        if (updatedReadyReplicas == currentClusterSpecSize) {
            if (updatedCurrentReplicas < previousCurrentReplicas) {
                if (updatedReadyReplicas == previousReadyReplicas) {
                    // If updatedReady == previousReady and updatedCurrent < previousCurrent, then this is the
                    // beginning of a rollout restart and the readiness probe hasn't yet noticed.
                    next = ClusterTopologyIntent.CLUSTER_STABLE_WITH_MISSING_MEMBERS;
                } else if (updatedReadyReplicas > previousReadyReplicas) {
                    // If updatedReady > previousReady and updatedCurrent < previousCurrent, then this is a coalesced
                    // Kubernetes event in the middle of a rollout restart. It marks at the same time that a
                    // previously restarted member is now ready and the next pod is going down for restart.
                    next = ClusterTopologyIntent.CLUSTER_STABLE_WITH_MISSING_MEMBERS;
                    // need to fix partition table before allowing next pod to restart
                    action = clusterStateForMissingMembers == ClusterState.NO_MIGRATION
                            ? () -> changeClusterState(ClusterState.ACTIVE)
                            : null;
                }
            } else if (previous != ClusterTopologyIntent.CLUSTER_STABLE) {
                next = ClusterTopologyIntent.CLUSTER_STABLE;
                action = () -> {
                    if (getClusterService().getClusterState() != ClusterState.ACTIVE) {
                        tryExecuteOrSetDeferredClusterStateChange(ClusterState.ACTIVE);
                    } else if (!getPartitionService().isPartitionTableSafe()) {
                        getPartitionService().getMigrationManager().triggerControlTask();
                    }
                };
            }
        } else if (previous == ClusterTopologyIntent.CLUSTER_STABLE
                && updatedCurrentReplicas < currentClusterSpecSize) {
            next = ClusterTopologyIntent.CLUSTER_STABLE_WITH_MISSING_MEMBERS;
        }
        return BiTuple.of(next, action);
    }

    @Override
    public ClusterTopologyIntent getClusterTopologyIntent() {
        return clusterTopologyIntent.get();
    }

    @Override
    public void initializeClusterTopologyIntent(ClusterTopologyIntent clusterTopologyIntent) {
        ClusterTopologyIntent current = this.clusterTopologyIntent.get();
        logger.info("Current node cluster topology intent is " + current);
        // if not UNKNOWN, then it was already initialized
        if (current == ClusterTopologyIntent.IN_MANAGED_CONTEXT_UNKNOWN) {
            logger.info("Initializing this node's cluster topology to " + clusterTopologyIntent);
            this.clusterTopologyIntent.set(clusterTopologyIntent);
        }
    }
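    // The shutdown handling below runs before the member itself shuts down: the detected intent decides
    // whether the member merely switches to the configured state for missing members (rollout restart),
    // performs a plain cluster-wide shutdown, or first waits for missing members to rejoin so that
    // partition replica sync can complete.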
    @Override
    public void shutdownWithIntent(ClusterTopologyIntent shutdownIntent) {
        // consider the detected shutdown intent before triggering node shutdown
        if (shutdownIntent == ClusterTopologyIntent.CLUSTER_STABLE
                || shutdownIntent == ClusterTopologyIntent.CLUSTER_STABLE_WITH_MISSING_MEMBERS) {
            try {
                // wait for partition table to be healthy before switching to NO_MIGRATION
                // e.g. in "rollout restart" case, node is shut down in NO_MIGRATION state
                waitCallableWithShutdownTimeout(() -> getPartitionService().isPartitionTableSafe());
                changeClusterState(clusterStateForMissingMembers);
            } catch (Throwable t) {
                // let shutdown proceed even though we failed to switch to desired state
                logger.warning("Could not switch to transient " + clusterStateForMissingMembers
                        + " state while cluster shutdown intent was " + shutdownIntent, t);
            }
        } else if (shutdownIntent == ClusterTopologyIntent.CLUSTER_SHUTDOWN) {
            clusterWideShutdown();
        } else if (shutdownIntent == ClusterTopologyIntent.CLUSTER_SHUTDOWN_WITH_MISSING_MEMBERS) {
            // If cluster is shutting down with missing members, it is possible that a member might
            // rejoin while attempting the graceful shutdown. In this case, races may occur, which may
            // lead to lack of progress on the shutting-down member, so we orchestrate shutdown:
            // - shutting-down member waits for the missing member to rejoin
            // - we ensure partition table is healthy (required for next step) and switch to ACTIVE cluster state to
            //   allow for partition rebalancing to fix potentially missing partition replica assignments
            // - finally switch to PASSIVE cluster state and wait for partition replica sync (similar to normal
            //   CLUSTER_SHUTDOWN case)
            long remainingNanosForShutdown = waitForMissingMember();
            clusterWideShutdownWithMissingMember(shutdownIntent, remainingNanosForShutdown);
        }
    }

    private void clusterWideShutdownWithMissingMember(ClusterTopologyIntent shutdownIntent,
                                                      long remainingNanosForShutdown) {
        try {
            // The do-while loop ensures that we wait for the partition table to be healthy after successfully
            // switching to PASSIVE cluster state (during which attempts of missing members to rejoin will
            // be denied), otherwise we retry by switching to ACTIVE cluster state.
            do {
                logger.info("Waiting for partition table to be healthy");
                if (!getPartitionService().isPartitionTableSafe()) {
                    logger.warning("Switching to ACTIVE state in order to allow for partition table to be healthy");
                    changeClusterState(ClusterState.ACTIVE);
                    waitCallableWithShutdownTimeout(() -> getPartitionService().isPartitionTableSafe());
                }
                changeClusterState(ClusterState.PASSIVE);
            } while (!getPartitionService().isPartitionTableSafe());
        } catch (Throwable t) {
            // let shutdown proceed even though we failed to switch to PASSIVE state
            // and wait for replica sync
            logger.warning("Could not switch to transient PASSIVE state while cluster shutdown intent was "
                    + shutdownIntent, t);
        }
        try {
            getNodeExtension().getInternalHotRestartService()
                    .waitPartitionReplicaSyncOnCluster(remainingNanosForShutdown, TimeUnit.NANOSECONDS);
        } catch (IllegalStateException e) {
            logger.severe("Failure while waiting for partition replica sync before shutdown", e);
        }
    }

    private void clusterWideShutdown() {
        Instant start = Instant.now();
        logger.info("cluster-wide-shutdown, Starting");
        try {
            changeClusterState(ClusterState.PASSIVE);
        } catch (Throwable t) {
            // let shutdown proceed even though we failed to switch to PASSIVE state
            logger.warning("cluster-wide-shutdown, Could not switch to transient PASSIVE state while cluster "
                    + "shutdown intent was CLUSTER_SHUTDOWN.", t);
        }
        long timeoutNanos = node.getProperties().getNanos(CLUSTER_SHUTDOWN_TIMEOUT_SECONDS);
        logger.info("cluster-wide-shutdown, Starting partition replica sync, Timeout(s): "
                + node.getProperties().getSeconds(CLUSTER_SHUTDOWN_TIMEOUT_SECONDS));
        Instant partitionSyncStart = Instant.now();
        try {
            // wait for replica sync
            getNodeExtension().getInternalHotRestartService()
                    .waitPartitionReplicaSyncOnCluster(timeoutNanos, TimeUnit.NANOSECONDS);
            logger.info("cluster-wide-shutdown, Completed partition replica sync, Took(ms): "
                    + Duration.between(partitionSyncStart, Instant.now()).toMillis());
        } catch (IllegalStateException e) {
            logger.severe("cluster-wide-shutdown, Failure while waiting for partition replica sync before shutdown, "
                    + "Took(ms): " + Duration.between(partitionSyncStart, Instant.now()).toMillis(), e);
        }
        logger.info("cluster-wide-shutdown, Completed, Took(ms): "
                + Duration.between(start, Instant.now()).toMillis());
    }
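    // waitForMissingMember() and the waitCallable* helpers below poll once per second and return the
    // unconsumed portion of CLUSTER_SHUTDOWN_TIMEOUT_SECONDS, so the replica-sync wait that follows in
    // clusterWideShutdownWithMissingMember() shares the same overall time budget.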
    /**
     * Wait for cluster size (as observed by Hazelcast's ClusterService) to become equal to the
     * last known cluster size as specified in Kubernetes {@code StatefulSetSpec.size} before
     * cluster-wide shutdown was requested.
     *
     * @return nanos remaining until cluster shutdown timeout
     */
    long waitForMissingMember() {
        long nanosRemaining = node.getProperties().getNanos(CLUSTER_SHUTDOWN_TIMEOUT_SECONDS);
        if (getClusterService().getClusterState() == ClusterState.PASSIVE) {
            // cluster is already in PASSIVE state and shutting down, so don't wait
            return nanosRemaining;
        }
        if (lastKnownStableClusterSpecSize == currentClusterSize) {
            return nanosRemaining;
        }
        logger.info("Waiting for missing members: lastKnownStableClusterSpecSize: " + lastKnownStableClusterSpecSize
                + ", currentClusterSize " + currentClusterSize);
        return waitCallableWithTimeout(() -> lastKnownStableClusterSpecSize == currentClusterSize, nanosRemaining);
    }

    @Override
    public boolean isEnabled() {
        return enabled;
    }

    @Override
    public int getCurrentSpecifiedReplicaCount() {
        return currentClusterSpecSize;
    }

    @Override
    public void onMembershipChange() {
        currentClusterSize = getClusterService().getSize();
    }

    /**
     * If recovery from persistence is completed, then immediately changes cluster state to given
     * {@code newClusterState}. Otherwise, the given {@code newClusterState} is passed on to the hot-restart
     * service and may be applied as final cluster state after recovery is done.
     *
     * @param newClusterState the new cluster state
     * @see com.hazelcast.internal.hotrestart.InternalHotRestartService#trySetDeferredClusterState(ClusterState)
     */
    private void tryExecuteOrSetDeferredClusterStateChange(ClusterState newClusterState) {
        if (!getNodeExtension().getInternalHotRestartService().trySetDeferredClusterState(newClusterState)) {
            // hot restart recovery is completed, just apply the new cluster state here
            changeClusterState(newClusterState);
        }
    }

    /**
     * @param callable {@link Callable} that returns {@code true} when its condition is completed and
     *                 control should return to caller.
     * @return nanos remaining until the cluster shutdown timeout elapses
     */
    private long waitCallableWithShutdownTimeout(Callable<Boolean> callable) {
        return waitCallableWithTimeout(callable, node.getProperties().getNanos(CLUSTER_SHUTDOWN_TIMEOUT_SECONDS));
    }

    long waitCallableWithTimeout(Callable<Boolean> callable, long timeoutNanos) {
        boolean callableCompleted;
        long start = System.nanoTime();
        do {
            try {
                callableCompleted = callable.call();
            } catch (Exception e) {
                throw ExceptionUtil.rethrow(e);
            }
            try {
                sleep(TimeUnit.SECONDS.toMillis(1));
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                break;
            }
            // subtract only the time elapsed since the previous iteration,
            // then restart the measurement for the next one
            long now = System.nanoTime();
            timeoutNanos -= (now - start);
            start = now;
        } while (!callableCompleted && timeoutNanos > 0);
        return timeoutNanos;
    }

    /**
     * Change cluster state, if current state is not already the desired one.
     * Retries up to 3 times. The cluster state change is transient, so if persistence
     * is enabled, the new cluster state is not persisted to disk.
     *
     * @param newClusterState the desired cluster state
     */
    private void changeClusterState(ClusterState newClusterState) {
        RetryUtils.retry(
                () -> {
                    getClusterService().changeClusterState(newClusterState, true);
                    return null;
                }, 3);
    }

    private NodeExtension getNodeExtension() {
        return node.getNodeExtension();
    }

    private InternalPartitionServiceImpl getPartitionService() {
        return node.partitionService;
    }

    private ClusterServiceImpl getClusterService() {
        return node.getClusterService();
    }
}
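For readers who want to experiment with the intent rules without a running Hazelcast member, here is a
minimal, self-contained sketch that mirrors only the spec-size branching of update(). All names in it
(IntentModel, next, the simplified Intent enum) are hypothetical illustrations, not part of the Hazelcast
API, and it deliberately ignores the rollout-restart subtleties handled by nextIntentWhenClusterSpecEqual.

/** Hypothetical stand-alone model of the intent decision rules; for illustration only. */
public class IntentModel {

    enum Intent { CLUSTER_STABLE, CLUSTER_STABLE_WITH_MISSING_MEMBERS, SCALING, CLUSTER_SHUTDOWN }

    /** Simplified decision mirroring update()'s branching on spec and ready counts. */
    static Intent next(int prevSpec, int newSpec, int readyReplicas) {
        if (newSpec == 0) {
            return Intent.CLUSTER_SHUTDOWN;                       // replicas scaled to zero
        }
        if (prevSpec == newSpec) {
            return readyReplicas == newSpec
                    ? Intent.CLUSTER_STABLE                       // every requested pod is ready
                    : Intent.CLUSTER_STABLE_WITH_MISSING_MEMBERS; // same spec, but a pod is missing
        }
        return Intent.SCALING;                                    // user changed the requested size
    }

    public static void main(String[] args) {
        // replay the example flow from the class javadoc: start with 3, scale to 5, lose a pod, scale to 0;
        // each event is (prevSpec, newSpec, readyReplicas)
        int[][] events = {{3, 3, 3}, {3, 5, 3}, {5, 5, 5}, {5, 5, 4}, {5, 5, 5}, {5, 0, 5}};
        for (int[] e : events) {
            System.out.printf("spec %d -> %d, ready %d => %s%n", e[0], e[1], e[2], next(e[0], e[1], e[2]));
        }
    }
}

Running the sketch prints the same progression as the javadoc table: stable at 3, scaling to 5, stable at 5,
stable with missing members while a pod restarts, stable again, then cluster shutdown.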



