All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.activemq.artemis.core.server.impl.ReplicationBackupActivation Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.activemq.artemis.core.server.impl;

import javax.annotation.concurrent.GuardedBy;

import java.util.Objects;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.function.Consumer;

import org.apache.activemq.artemis.api.core.ActiveMQException;
import org.apache.activemq.artemis.api.core.ActiveMQIllegalStateException;
import org.apache.activemq.artemis.api.core.Pair;
import org.apache.activemq.artemis.api.core.SimpleString;
import org.apache.activemq.artemis.api.core.TransportConfiguration;
import org.apache.activemq.artemis.core.protocol.core.Channel;
import org.apache.activemq.artemis.core.replication.ReplicationEndpoint;
import org.apache.activemq.artemis.core.server.ActiveMQServer;
import org.apache.activemq.artemis.core.server.ActiveMQServerLogger;
import org.apache.activemq.artemis.core.server.LiveNodeLocator;
import org.apache.activemq.artemis.core.server.NodeManager;
import org.apache.activemq.artemis.core.server.cluster.ClusterControl;
import org.apache.activemq.artemis.core.server.cluster.ClusterController;
import org.apache.activemq.artemis.core.server.cluster.ha.ReplicationBackupPolicy;
import org.apache.activemq.artemis.quorum.DistributedLock;
import org.apache.activemq.artemis.quorum.DistributedPrimitiveManager;
import org.apache.activemq.artemis.quorum.UnavailableStateException;
import org.jboss.logging.Logger;

import static org.apache.activemq.artemis.core.server.impl.ReplicationObserver.ReplicationFailure;
import static org.apache.activemq.artemis.core.server.impl.quorum.ActivationSequenceStateMachine.ensureSequentialAccessToNodeData;
import static org.apache.activemq.artemis.core.server.impl.quorum.ActivationSequenceStateMachine.tryActivate;

/**
 * This activation can be used by a primary while trying to fail-back ie {@code failback == true} or
 * by a natural-born backup ie {@code failback == false}.
*/ public final class ReplicationBackupActivation extends Activation implements DistributedPrimitiveManager.UnavailableManagerListener { private static final Logger LOGGER = Logger.getLogger(ReplicationBackupActivation.class); private final ReplicationBackupPolicy policy; private final ActiveMQServerImpl activeMQServer; // This field is != null iff this node is a primary during a fail-back ie acting as a backup in order to become live again. private final String expectedNodeID; @GuardedBy("this") private boolean closed; private final DistributedPrimitiveManager distributedManager; // Used for monitoring purposes private volatile ReplicationObserver replicationObserver; // Used for testing purposes private volatile ReplicationEndpoint replicationEndpoint; // Used for testing purposes private Consumer onReplicationEndpointCreation; // Used to arbiter one-shot server stop/restart private final AtomicBoolean stopping; public ReplicationBackupActivation(final ActiveMQServerImpl activeMQServer, final DistributedPrimitiveManager distributedManager, final ReplicationBackupPolicy policy) { this.activeMQServer = activeMQServer; if (policy.isTryFailback()) { final SimpleString serverNodeID = activeMQServer.getNodeID(); if (serverNodeID == null || serverNodeID.isEmpty()) { throw new IllegalStateException("A failback activation must be biased around a specific NodeID"); } this.expectedNodeID = serverNodeID.toString(); } else { this.expectedNodeID = null; } this.distributedManager = distributedManager; this.policy = policy; this.replicationObserver = null; this.replicationEndpoint = null; this.stopping = new AtomicBoolean(false); } /** * used for testing purposes. */ public DistributedPrimitiveManager getDistributedManager() { return distributedManager; } @Override public void onUnavailableManagerEvent() { synchronized (this) { if (closed) { return; } } LOGGER.info("Unavailable quorum service detected: try restart server"); asyncRestartServer(activeMQServer, true); } /** * This util class exists because {@link LiveNodeLocator} need a {@link LiveNodeLocator.BackupRegistrationListener} * to forward backup registration failure events: this is used to switch on/off backup registration event listening * on an existing locator. */ private static final class RegistrationFailureForwarder implements LiveNodeLocator.BackupRegistrationListener, AutoCloseable { private static final LiveNodeLocator.BackupRegistrationListener NOOP_LISTENER = ignore -> { }; private volatile LiveNodeLocator.BackupRegistrationListener listener = NOOP_LISTENER; public RegistrationFailureForwarder to(LiveNodeLocator.BackupRegistrationListener listener) { this.listener = listener; return this; } @Override public void onBackupRegistrationFailed(boolean alreadyReplicating) { listener.onBackupRegistrationFailed(alreadyReplicating); } @Override public void close() { listener = NOOP_LISTENER; } } @Override public void run() { synchronized (this) { if (closed) { return; } } try { distributedManager.start(); final long nodeActivationSequence = activeMQServer.getNodeManager().readNodeActivationSequence(); // only a backup with positive local activation sequence could contain valuable data if (nodeActivationSequence > 0) { final String nodeId = activeMQServer.getNodeManager().getNodeId().toString(); DistributedLock liveLockWithInSyncReplica; while (true) { distributedManager.start(); try { liveLockWithInSyncReplica = tryActivate(nodeId, nodeActivationSequence, distributedManager, LOGGER); break; } catch (UnavailableStateException canRecoverEx) { distributedManager.stop(); } } if (liveLockWithInSyncReplica != null) { // retain state and start as live if (!activeMQServer.initialisePart1(false)) { return; } activeMQServer.setState(ActiveMQServerImpl.SERVER_STATE.STARTED); startAsLive(liveLockWithInSyncReplica); return; } } distributedManager.addUnavailableManagerListener(this); // Stop the previous node manager and create a new one with NodeManager::replicatedBackup == true: // NodeManager::start skip setup lock file with NodeID, until NodeManager::stopBackup is called. activeMQServer.resetNodeManager(); // A primary need to preserve NodeID across runs activeMQServer.moveServerData(policy.getMaxSavedReplicatedJournalsSize(), policy.isTryFailback()); activeMQServer.getNodeManager().start(); if (!activeMQServer.initialisePart1(false)) { return; } synchronized (this) { if (closed) return; } final ClusterController clusterController = activeMQServer.getClusterManager().getClusterController(); LOGGER.infof("Apache ActiveMQ Artemis Backup Server version %s [%s] started, awaiting connection to a live cluster member to start replication", activeMQServer.getVersion().getFullVersion(), activeMQServer.toString()); clusterController.awaitConnectionToReplicationCluster(); activeMQServer.getBackupManager().start(); activeMQServer.setState(ActiveMQServerImpl.SERVER_STATE.STARTED); final DistributedLock liveLock = replicateAndFailover(clusterController); if (liveLock == null) { return; } startAsLive(liveLock); } catch (Exception e) { if ((e instanceof InterruptedException || e instanceof IllegalStateException) && !activeMQServer.isStarted()) { // do not log these errors if the server is being stopped. return; } ActiveMQServerLogger.LOGGER.initializationError(e); } } private void startAsLive(final DistributedLock liveLock) throws Exception { activeMQServer.setHAPolicy(policy.getLivePolicy()); synchronized (activeMQServer) { if (!activeMQServer.isStarted()) { liveLock.close(); return; } try { ensureSequentialAccessToNodeData(activeMQServer, distributedManager, LOGGER); } catch (Throwable fatal) { LOGGER.warn(fatal); // policy is already live one, but there's no activation yet: we can just stop asyncRestartServer(activeMQServer, false, false); throw new ActiveMQIllegalStateException("This server cannot ensure sequential access to broker data: activation is failed"); } ActiveMQServerLogger.LOGGER.becomingLive(activeMQServer); // stopBackup is going to write the NodeID previously set on the NodeManager, // because activeMQServer.resetNodeManager() has created a NodeManager with replicatedBackup == true. activeMQServer.getNodeManager().stopBackup(); activeMQServer.getStorageManager().start(); activeMQServer.getBackupManager().activated(); // IMPORTANT: // we're setting this activation JUST because it would allow the server to use its // getActivationChannelHandler to handle replication final ReplicationPrimaryActivation primaryActivation = new ReplicationPrimaryActivation(activeMQServer, distributedManager, policy.getLivePolicy()); liveLock.addListener(primaryActivation); activeMQServer.setActivation(primaryActivation); activeMQServer.initialisePart2(false); // calling primaryActivation.stateChanged !isHelByCaller is necessary in case the lock was unavailable // before liveLock.addListener: just throwing an exception won't stop the broker. final boolean stillLive; try { stillLive = liveLock.isHeldByCaller(); } catch (UnavailableStateException e) { LOGGER.warn(e); primaryActivation.onUnavailableLockEvent(); throw new ActiveMQIllegalStateException("This server cannot check its role as a live: activation is failed"); } if (!stillLive) { primaryActivation.onUnavailableLockEvent(); throw new ActiveMQIllegalStateException("This server is not live anymore: activation is failed"); } if (activeMQServer.getIdentity() != null) { ActiveMQServerLogger.LOGGER.serverIsLive(activeMQServer.getIdentity()); } else { ActiveMQServerLogger.LOGGER.serverIsLive(); } activeMQServer.completeActivation(true); } } private LiveNodeLocator createLiveNodeLocator(final LiveNodeLocator.BackupRegistrationListener registrationListener) { if (expectedNodeID != null) { assert policy.isTryFailback(); return new NamedLiveNodeIdLocatorForReplication(expectedNodeID, registrationListener, policy.getRetryReplicationWait()); } return policy.getGroupName() == null ? new AnyLiveNodeLocatorForReplication(registrationListener, activeMQServer, policy.getRetryReplicationWait()) : new NamedLiveNodeLocatorForReplication(policy.getGroupName(), registrationListener, policy.getRetryReplicationWait()); } private DistributedLock replicateAndFailover(final ClusterController clusterController) throws ActiveMQException, InterruptedException { final RegistrationFailureForwarder registrationFailureForwarder = new RegistrationFailureForwarder(); // node locator isn't stateless and contains a live-list of candidate nodes to connect too, hence // it MUST be reused for each replicateLive attempt final LiveNodeLocator nodeLocator = createLiveNodeLocator(registrationFailureForwarder); clusterController.addClusterTopologyListenerForReplication(nodeLocator); try { while (true) { synchronized (this) { if (closed) { return null; } } final ReplicationFailure failure = replicateLive(clusterController, nodeLocator, registrationFailureForwarder); if (failure == null) { Thread.sleep(clusterController.getRetryIntervalForReplicatedCluster()); continue; } if (!activeMQServer.isStarted()) { return null; } LOGGER.debugf("ReplicationFailure = %s", failure); switch (failure) { case VoluntaryFailOver: case NonVoluntaryFailover: // from now on we're meant to stop: // - due to failover // - due to restart/stop if (!stopping.compareAndSet(false, true)) { return null; } // no more interested into these events: handling it manually from here distributedManager.removeUnavailableManagerListener(this); final long nodeActivationSequence = activeMQServer.getNodeManager().readNodeActivationSequence(); final String nodeId = activeMQServer.getNodeManager().getNodeId().toString(); DistributedLock liveLockWithInSyncReplica = null; if (nodeActivationSequence > 0) { try { liveLockWithInSyncReplica = tryActivate(nodeId, nodeActivationSequence, distributedManager, LOGGER); } catch (Throwable error) { // no need to retry here, can just restart as backup that will handle a more resilient tryActivate LOGGER.warn("Errored while attempting failover", error); liveLockWithInSyncReplica = null; } } else { LOGGER.warnf("We expect local activation sequence for NodeID = %s to be > 0 on a fail-over, while is %d", nodeId, nodeActivationSequence); } assert stopping.get(); if (liveLockWithInSyncReplica != null) { return liveLockWithInSyncReplica; } ActiveMQServerLogger.LOGGER.restartingAsBackupBasedOnQuorumVoteResults(); // let's ignore the stopping flag here, we're already in control of it asyncRestartServer(activeMQServer, true, false); return null; case RegistrationError: LOGGER.error("Stopping broker because of critical registration error"); asyncRestartServer(activeMQServer, false); return null; case AlreadyReplicating: // can just retry here, data should be clean and nodeLocator // should remove the live node that has answered this LOGGER.info("Live broker was already replicating: retry sync with another live"); continue; case ClosedObserver: return null; case BackupNotInSync: // cleanup any dirty activation sequence to save a leaked activation sequence/nodeID to cause activation final long activationSequence = activeMQServer.getNodeManager().getNodeActivationSequence(); boolean restart = true; if (activationSequence != 0) { final SimpleString syncNodeId = activeMQServer.getNodeManager().getNodeId(); try { activeMQServer.getNodeManager().writeNodeActivationSequence(0); } catch (Throwable fatal) { LOGGER.errorf(fatal, "Errored while resetting local activation sequence %d for NodeID = %s: stopping broker", activationSequence, syncNodeId); restart = false; } } if (restart) { LOGGER.info("Replication failure while initial sync not yet completed: restart as backup"); } asyncRestartServer(activeMQServer, restart); return null; case WrongNodeId: LOGGER.error("Stopping broker because of wrong node ID communication from live: maybe a misbehaving live?"); asyncRestartServer(activeMQServer, false); return null; default: throw new AssertionError("Unsupported failure " + failure); } } } finally { silentExecution("Error on cluster topology listener for replication cleanup", () -> clusterController.removeClusterTopologyListenerForReplication(nodeLocator)); } } private ReplicationObserver replicationObserver() { if (policy.isTryFailback()) { return ReplicationObserver.failbackObserver(activeMQServer.getNodeManager(), activeMQServer.getBackupManager(), activeMQServer.getScheduledPool(), expectedNodeID); } return ReplicationObserver.failoverObserver(activeMQServer.getNodeManager(), activeMQServer.getBackupManager(), activeMQServer.getScheduledPool()); } private ReplicationFailure replicateLive(final ClusterController clusterController, final LiveNodeLocator liveLocator, final RegistrationFailureForwarder registrationFailureForwarder) throws ActiveMQException { try (ReplicationObserver replicationObserver = replicationObserver(); RegistrationFailureForwarder ignored = registrationFailureForwarder.to(replicationObserver)) { this.replicationObserver = replicationObserver; clusterController.addClusterTopologyListener(replicationObserver); // ReplicationError notifies backup registration failures to live locator -> forwarder -> observer final ReplicationError replicationError = new ReplicationError(liveLocator); clusterController.addIncomingInterceptorForReplication(replicationError); try { final ClusterControl liveControl = tryLocateAndConnectToLive(liveLocator, clusterController); if (liveControl == null) { return null; } try { final ReplicationEndpoint replicationEndpoint = tryAuthorizeAndAsyncRegisterAsBackupToLive(liveControl, replicationObserver); if (replicationEndpoint == null) { return ReplicationFailure.RegistrationError; } this.replicationEndpoint = replicationEndpoint; try { return replicationObserver.awaitReplicationFailure(); } finally { this.replicationEndpoint = null; ActiveMQServerImpl.stopComponent(replicationEndpoint); closeChannelOf(replicationEndpoint); } } finally { silentExecution("Error on live control close", liveControl::close); } } finally { silentExecution("Error on cluster topology listener cleanup", () -> clusterController.removeClusterTopologyListener(replicationObserver)); silentExecution("Error while removing incoming interceptor for replication", () -> clusterController.removeIncomingInterceptorForReplication(replicationError)); } } finally { this.replicationObserver = null; } } private static void silentExecution(String debugErrorMessage, Runnable task) { try { task.run(); } catch (Throwable ignore) { LOGGER.debug(debugErrorMessage, ignore); } } private static void closeChannelOf(final ReplicationEndpoint replicationEndpoint) { if (replicationEndpoint == null) { return; } if (replicationEndpoint.getChannel() != null) { silentExecution("Error while closing replication endpoint channel", () -> replicationEndpoint.getChannel().close()); replicationEndpoint.setChannel(null); } } private boolean asyncRestartServer(final ActiveMQServer server, boolean restart) { return asyncRestartServer(server, restart, true); } private boolean asyncRestartServer(final ActiveMQServer server, boolean restart, boolean checkStopping) { if (checkStopping) { if (!stopping.compareAndSet(false, true)) { return false; } } new Thread(() -> { if (server.getState() != ActiveMQServer.SERVER_STATE.STOPPED && server.getState() != ActiveMQServer.SERVER_STATE.STOPPING) { // this is necessary to make the restart an atomic operation from the server perspective synchronized (server) { if (server.getState() == ActiveMQServer.SERVER_STATE.STOPPED) { return; } try { server.stop(!restart); if (restart) { server.start(); } } catch (Exception e) { if (restart) { ActiveMQServerLogger.LOGGER.errorRestartingBackupServer(e, server); } else { ActiveMQServerLogger.LOGGER.errorStoppingServer(e); } } } } }).start(); return true; } private ClusterControl tryLocateAndConnectToLive(final LiveNodeLocator liveLocator, final ClusterController clusterController) throws ActiveMQException { liveLocator.locateNode(); final Pair possibleLive = liveLocator.getLiveConfiguration(); final String nodeID = liveLocator.getNodeID(); if (nodeID == null) { throw new RuntimeException("Could not establish the connection with any live"); } if (!policy.isTryFailback()) { assert expectedNodeID == null; activeMQServer.getNodeManager().setNodeID(nodeID); } else { assert expectedNodeID.equals(nodeID); } if (possibleLive == null) { return null; } final ClusterControl liveControl = tryConnectToNodeInReplicatedCluster(clusterController, possibleLive.getA()); if (liveControl != null) { return liveControl; } return tryConnectToNodeInReplicatedCluster(clusterController, possibleLive.getB()); } private static ClusterControl tryConnectToNodeInReplicatedCluster(final ClusterController clusterController, final TransportConfiguration tc) { try { if (tc != null) { return clusterController.connectToNodeInReplicatedCluster(tc); } } catch (Exception e) { LOGGER.debug(e.getMessage(), e); } return null; } @Override public void close(final boolean permanently, final boolean restarting) throws Exception { synchronized (this) { closed = true; final ReplicationObserver replicationObserver = this.replicationObserver; if (replicationObserver != null) { replicationObserver.close(); } } //we have to check as the server policy may have changed try { if (activeMQServer.getHAPolicy().isBackup()) { // To avoid a NPE cause by the stop final NodeManager nodeManager = activeMQServer.getNodeManager(); activeMQServer.interruptActivationThread(nodeManager); if (nodeManager != null) { nodeManager.stopBackup(); } } } finally { // this one need to happen after interrupting the activation thread // in order to unblock distributedManager::start distributedManager.stop(); } } @Override public void preStorageClose() throws Exception { // TODO replication endpoint close? } private ReplicationEndpoint tryAuthorizeAndAsyncRegisterAsBackupToLive(final ClusterControl liveControl, final ReplicationObserver liveObserver) { ReplicationEndpoint replicationEndpoint = null; try { liveControl.getSessionFactory().setReconnectAttempts(1); liveObserver.listenConnectionFailuresOf(liveControl.getSessionFactory()); liveControl.authorize(); replicationEndpoint = new ReplicationEndpoint(activeMQServer, policy.isTryFailback(), liveObserver); final Consumer onReplicationEndpointCreation = this.onReplicationEndpointCreation; if (onReplicationEndpointCreation != null) { onReplicationEndpointCreation.accept(replicationEndpoint); } replicationEndpoint.setExecutor(activeMQServer.getExecutorFactory().getExecutor()); connectToReplicationEndpoint(liveControl, replicationEndpoint); replicationEndpoint.start(); liveControl.announceReplicatingBackupToLive(policy.isTryFailback(), policy.getClusterName()); return replicationEndpoint; } catch (Exception e) { ActiveMQServerLogger.LOGGER.replicationStartProblem(e); ActiveMQServerImpl.stopComponent(replicationEndpoint); closeChannelOf(replicationEndpoint); return null; } } private static boolean connectToReplicationEndpoint(final ClusterControl liveControl, final ReplicationEndpoint replicationEndpoint) { final Channel replicationChannel = liveControl.createReplicationChannel(); replicationChannel.setHandler(replicationEndpoint); replicationEndpoint.setChannel(replicationChannel); return true; } @Override public boolean isReplicaSync() { // NOTE: this method is just for monitoring purposes, not suitable to perform logic! // During a failover this backup won't have any active liveObserver and will report `false`!! final ReplicationObserver liveObserver = this.replicationObserver; if (liveObserver == null) { return false; } return liveObserver.isBackupUpToDate(); } public ReplicationEndpoint getReplicationEndpoint() { return replicationEndpoint; } /** * This must be used just for testing purposes. */ public void spyReplicationEndpointCreation(Consumer onReplicationEndpointCreation) { Objects.requireNonNull(onReplicationEndpointCreation); this.onReplicationEndpointCreation = onReplicationEndpointCreation; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy