All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.activemq.artemis.core.server.impl.quorum.ActivationSequenceStateMachine Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.activemq.artemis.core.server.impl.quorum;

import java.util.Objects;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

import org.apache.activemq.artemis.api.core.ActiveMQException;
import org.apache.activemq.artemis.core.server.ActiveMQServer;
import org.apache.activemq.artemis.core.server.NodeManager;
import org.apache.activemq.artemis.quorum.DistributedLock;
import org.apache.activemq.artemis.quorum.DistributedPrimitiveManager;
import org.apache.activemq.artemis.quorum.MutableLong;
import org.apache.activemq.artemis.quorum.UnavailableStateException;
import org.jboss.logging.Logger;

/**
 * This class contains the activation sequence logic of the pluggable quorum vote:
 * it should be used by {@link org.apache.activemq.artemis.core.server.impl.ReplicationBackupActivation}
 * and {@link org.apache.activemq.artemis.core.server.impl.ReplicationPrimaryActivation} to coordinate
 * for replication.
 */
public final class ActivationSequenceStateMachine {

   private static final long CHECK_ACTIVATION_SEQUENCE_WAIT_MILLIS = 200;
   private static final long CHECK_REPAIRED_ACTIVATION_SEQUENCE_WAIT_MILLIS = 2000;
   private static final long LIVE_LOCK_ACQUIRE_TIMEOUT_MILLIS = 2000;

   private ActivationSequenceStateMachine() {

   }

   /**
    * It loops if the data of the broker is still valuable, but cannot become live.
    * It loops (temporarly) if data is in sync or can self-heal, but cannot yet acquire the live lock.
    * 

* It stops loop and return: *

    *
  • {@code null}: if data is stale (and there are no rights to become live) *
  • {@code !=null}: if data is in sync and the {@link DistributedLock} is correctly acquired *

*

* After successfully returning from this method ie not null return value, a broker should use * {@link #ensureSequentialAccessToNodeData(ActiveMQServer, DistributedPrimitiveManager, Logger)} to complete * the activation and guarantee the initial not-replicated ownership of data. */ public static DistributedLock tryActivate(final String nodeId, final long nodeActivationSequence, final DistributedPrimitiveManager distributedManager, final Logger logger) throws InterruptedException, ExecutionException, TimeoutException, UnavailableStateException { final DistributedLock activationLock = distributedManager.getDistributedLock(nodeId); try (MutableLong coordinatedNodeSequence = distributedManager.getMutableLong(nodeId)) { while (true) { // dirty read is sufficient to know if we are *not* an in sync replica // typically the lock owner will increment to signal our data is stale and we are happy without any // further coordination at this point switch (validateActivationSequence(coordinatedNodeSequence, activationLock, nodeId, nodeActivationSequence, logger)) { case Stale: activationLock.close(); return null; case SelfRepair: case InSync: break; case MaybeInSync: if (activationLock.tryLock()) { // BAD: where's the broker that should commit it? activationLock.unlock(); logger.warnf("Cannot assume live role for NodeID = %s: claimed activation sequence need to be repaired", nodeId); TimeUnit.MILLISECONDS.sleep(CHECK_REPAIRED_ACTIVATION_SEQUENCE_WAIT_MILLIS); continue; } // quick path while data is still valuable: wait until something change (commit/repair) TimeUnit.MILLISECONDS.sleep(CHECK_ACTIVATION_SEQUENCE_WAIT_MILLIS); continue; } // SelfRepair, InSync if (!activationLock.tryLock(LIVE_LOCK_ACQUIRE_TIMEOUT_MILLIS, TimeUnit.MILLISECONDS)) { logger.debugf("Candidate for Node ID = %s, with local activation sequence: %d, cannot acquire live lock within %dms; retrying", nodeId, nodeActivationSequence, LIVE_LOCK_ACQUIRE_TIMEOUT_MILLIS); continue; } switch (validateActivationSequence(coordinatedNodeSequence, activationLock, nodeId, nodeActivationSequence, logger)) { case Stale: activationLock.close(); return null; case SelfRepair: // Self-repair sequence ie we were the only one with the most up to date data. // NOTE: We cannot move the sequence now, let's delay it on ensureSequentialAccessToNodeData logger.infof("Assuming live role for NodeID = %s: local activation sequence %d matches claimed coordinated activation sequence %d. Repairing sequence", nodeId, nodeActivationSequence, nodeActivationSequence); return activationLock; case InSync: // we are an in_sync_replica, good to go live as UNREPLICATED logger.infof("Assuming live role for NodeID = %s, local activation sequence %d matches current coordinated activation sequence %d", nodeId, nodeActivationSequence, nodeActivationSequence); return activationLock; case MaybeInSync: activationLock.unlock(); logger.warnf("Cannot assume live role for NodeID = %s: claimed activation sequence need to be repaired", nodeId); TimeUnit.MILLISECONDS.sleep(CHECK_REPAIRED_ACTIVATION_SEQUENCE_WAIT_MILLIS); continue; } } } } private enum ValidationResult { /** * coordinated activation sequence (claimed/committed) is far beyond the local one: data is not valuable anymore **/ Stale, /** * coordinated activation sequence is the same as local one: data is in sync **/ InSync, /** * next coordinated activation sequence is not committed yet: maybe data is in sync **/ MaybeInSync, /** * next coordinated activation sequence is not committed yet, but this broker can self-repair: data is in sync **/ SelfRepair } private static ValidationResult validateActivationSequence(final MutableLong coordinatedNodeSequence, final DistributedLock activationLock, final String lockAndLongId, final long nodeActivationSequence, final Logger logger) throws UnavailableStateException { assert coordinatedNodeSequence.getMutableLongId().equals(lockAndLongId); assert activationLock.getLockId().equals(lockAndLongId); final long currentCoordinatedNodeSequence = coordinatedNodeSequence.get(); if (nodeActivationSequence == currentCoordinatedNodeSequence) { return ValidationResult.InSync; } if (currentCoordinatedNodeSequence > 0) { logger.infof("Not a candidate for NodeID = %s activation, local activation sequence %d does not match coordinated activation sequence %d", lockAndLongId, nodeActivationSequence, currentCoordinatedNodeSequence); return ValidationResult.Stale; } // claimed activation sequence final long claimedCoordinatedNodeSequence = -currentCoordinatedNodeSequence; final long sequenceGap = claimedCoordinatedNodeSequence - nodeActivationSequence; if (sequenceGap == 0) { return ValidationResult.SelfRepair; } if (sequenceGap == 1) { // maybe data is still valuable return ValidationResult.MaybeInSync; } assert sequenceGap > 1; // sequence is moved so much that data is no longer valuable logger.infof("Not a candidate for NodeID = %s activation, local activation sequence %d does not match coordinated activation sequence %d", lockAndLongId, nodeActivationSequence, claimedCoordinatedNodeSequence); return ValidationResult.Stale; } /** * It wait until {@code timeoutMillis ms} has passed or the coordinated activation sequence has progressed enough */ public static boolean awaitNextCommittedActivationSequence(final DistributedPrimitiveManager distributedManager, final String coordinatedLockAndNodeId, final long activationSequence, final long timeoutMills, final Logger logger) throws ExecutionException, InterruptedException, TimeoutException, UnavailableStateException { Objects.requireNonNull(distributedManager); Objects.requireNonNull(logger); Objects.requireNonNull(coordinatedLockAndNodeId); if (activationSequence < 0) { throw new IllegalArgumentException("activationSequence must be >= 0, while is " + activationSequence); } if (!distributedManager.isStarted()) { throw new IllegalStateException("manager must be started"); } final MutableLong coordinatedActivationSequence = distributedManager.getMutableLong(coordinatedLockAndNodeId); // wait for the live to activate and run un replicated with a sequence > inSyncReplicaActivation // this read can be dirty b/c we are just looking for an increment. boolean anyNext = false; final long timeoutNs = TimeUnit.MILLISECONDS.toNanos(timeoutMills); final long started = System.nanoTime(); long elapsedNs; do { final long coordinatedValue = coordinatedActivationSequence.get(); if (coordinatedValue > activationSequence) { // all good, some activation has gone ahead logger.infof("Detected a new activation sequence with NodeID = %s: and sequence: %d", coordinatedLockAndNodeId, coordinatedValue); anyNext = true; break; } if (coordinatedValue < 0) { // commit claim final long claimedSequence = -coordinatedValue; final long activationsGap = claimedSequence - activationSequence; if (activationsGap > 1) { // all good, some activation has gone ahead logger.infof("Detected furthers sequential server activations from sequence %d, with NodeID = %s: and claimed sequence: %d", activationSequence, coordinatedLockAndNodeId, claimedSequence); anyNext = true; break; } // activation is still in progress logger.debugf("Detected claiming of activation sequence = %d for NodeID = %s", claimedSequence, coordinatedLockAndNodeId); } try { TimeUnit.MILLISECONDS.sleep(CHECK_ACTIVATION_SEQUENCE_WAIT_MILLIS); } catch (InterruptedException ignored) { } elapsedNs = System.nanoTime() - started; } while (elapsedNs < timeoutNs); return anyNext; } /** * This is going to increment the coordinated activation sequence while holding the live lock, failing with some exception otherwise.
*

* The acceptable states are {@link ValidationResult#InSync} and {@link ValidationResult#SelfRepair}, throwing some exception otherwise. *

* This must be used while holding a live lock to ensure not-exclusive ownership of data ie can be both used * while loosing connectivity with a replica or after successfully {@link #tryActivate(String, long, DistributedPrimitiveManager, Logger)}. */ public static void ensureSequentialAccessToNodeData(ActiveMQServer activeMQServer, DistributedPrimitiveManager distributedPrimitiveManager, final Logger logger) throws ActiveMQException, InterruptedException, UnavailableStateException, ExecutionException, TimeoutException { final NodeManager nodeManager = activeMQServer.getNodeManager(); final String lockAndLongId = nodeManager.getNodeId().toString(); final DistributedLock liveLock = distributedPrimitiveManager.getDistributedLock(lockAndLongId); if (!liveLock.isHeldByCaller()) { final String message = String.format("Server [%s], live lock for NodeID = %s, not held, activation sequence cannot be safely changed", activeMQServer, lockAndLongId); logger.info(message); throw new UnavailableStateException(message); } final long nodeActivationSequence = nodeManager.readNodeActivationSequence(); final MutableLong coordinatedNodeActivationSequence = distributedPrimitiveManager.getMutableLong(lockAndLongId); final long currentCoordinatedActivationSequence = coordinatedNodeActivationSequence.get(); final long nextActivationSequence; if (currentCoordinatedActivationSequence < 0) { // Check Self-Repair if (nodeActivationSequence != -currentCoordinatedActivationSequence) { final String message = String.format("Server [%s], cannot assume live role for NodeID = %s, local activation sequence %d does not match current claimed coordinated sequence %d: need repair", activeMQServer, lockAndLongId, nodeActivationSequence, -currentCoordinatedActivationSequence); logger.info(message); throw new ActiveMQException(message); } // auto-repair: this is the same server that failed to commit its claimed sequence nextActivationSequence = nodeActivationSequence; } else { // Check InSync if (nodeActivationSequence != currentCoordinatedActivationSequence) { final String message = String.format("Server [%s], cannot assume live role for NodeID = %s, local activation sequence %d does not match current coordinated sequence %d", activeMQServer, lockAndLongId, nodeActivationSequence, currentCoordinatedActivationSequence); logger.info(message); throw new ActiveMQException(message); } nextActivationSequence = nodeActivationSequence + 1; } // UN_REPLICATED STATE ENTER: auto-repair doesn't need to claim and write locally if (nodeActivationSequence != nextActivationSequence) { // claim if (!coordinatedNodeActivationSequence.compareAndSet(nodeActivationSequence, -nextActivationSequence)) { final String message = String.format("Server [%s], cannot assume live role for NodeID = %s, activation sequence claim failed, local activation sequence %d no longer matches current coordinated sequence %d", activeMQServer, lockAndLongId, nodeActivationSequence, coordinatedNodeActivationSequence.get()); logger.infof(message); throw new ActiveMQException(message); } // claim success: write locally try { nodeManager.writeNodeActivationSequence(nextActivationSequence); } catch (NodeManager.NodeManagerException fatal) { logger.errorf("Server [%s] failed to set local activation sequence to: %d for NodeId =%s. Cannot continue committing coordinated activation sequence: REQUIRES ADMIN INTERVENTION", activeMQServer, nextActivationSequence, lockAndLongId); throw new UnavailableStateException(fatal); } logger.infof("Server [%s], incremented local activation sequence to: %d for NodeId = %s", activeMQServer, nextActivationSequence, lockAndLongId); } else { // self-heal need to update the in-memory sequence, because no writes will do it nodeManager.setNodeActivationSequence(nextActivationSequence); } // commit if (!coordinatedNodeActivationSequence.compareAndSet(-nextActivationSequence, nextActivationSequence)) { final String message = String.format("Server [%s], cannot assume live role for NodeID = %s, activation sequence commit failed, local activation sequence %d no longer matches current coordinated sequence %d", activeMQServer, lockAndLongId, nodeActivationSequence, coordinatedNodeActivationSequence.get()); logger.infof(message); throw new ActiveMQException(message); } logger.infof("Server [%s], incremented coordinated activation sequence to: %d for NodeId = %s", activeMQServer, nextActivationSequence, lockAndLongId); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy