All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.activemq.artemis.core.server.impl.SharedNothingBackupActivation Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.activemq.artemis.core.server.impl;

import java.util.Map;
import java.util.concurrent.TimeUnit;

import org.apache.activemq.artemis.api.core.ActiveMQException;
import org.apache.activemq.artemis.api.core.ActiveMQInternalErrorException;
import org.apache.activemq.artemis.api.core.Pair;
import org.apache.activemq.artemis.api.core.TransportConfiguration;
import org.apache.activemq.artemis.api.core.client.TopologyMember;
import org.apache.activemq.artemis.core.config.Configuration;
import org.apache.activemq.artemis.core.io.IOCriticalErrorListener;
import org.apache.activemq.artemis.core.paging.PagingManager;
import org.apache.activemq.artemis.core.persistence.StorageManager;
import org.apache.activemq.artemis.core.postoffice.PostOffice;
import org.apache.activemq.artemis.core.protocol.core.Channel;
import org.apache.activemq.artemis.core.protocol.core.impl.wireformat.ReplicationLiveIsStoppingMessage;
import org.apache.activemq.artemis.core.replication.ReplicationEndpoint;
import org.apache.activemq.artemis.core.replication.ReplicationEndpoint.ReplicationEndpointEventListener;
import org.apache.activemq.artemis.core.server.ActivationParams;
import org.apache.activemq.artemis.core.server.ActiveMQMessageBundle;
import org.apache.activemq.artemis.core.server.ActiveMQServer;
import org.apache.activemq.artemis.core.server.ActiveMQServerLogger;
import org.apache.activemq.artemis.core.server.LiveNodeLocator;
import org.apache.activemq.artemis.core.server.NetworkHealthCheck;
import org.apache.activemq.artemis.core.server.NodeManager;
import org.apache.activemq.artemis.core.server.QueueFactory;
import org.apache.activemq.artemis.core.server.cluster.ClusterControl;
import org.apache.activemq.artemis.core.server.cluster.ClusterController;
import org.apache.activemq.artemis.core.server.cluster.ha.ReplicaPolicy;
import org.apache.activemq.artemis.core.server.cluster.ha.ScaleDownPolicy;
import org.apache.activemq.artemis.core.server.cluster.qourum.SharedNothingBackupQuorum;
import org.apache.activemq.artemis.core.server.group.GroupingHandler;
import org.apache.activemq.artemis.core.server.management.ManagementService;
import org.apache.activemq.artemis.utils.ReusableLatch;
import org.jboss.logging.Logger;

import static org.apache.activemq.artemis.core.server.cluster.qourum.SharedNothingBackupQuorum.BACKUP_ACTIVATION.FAILURE_REPLICATING;
import static org.apache.activemq.artemis.core.server.cluster.qourum.SharedNothingBackupQuorum.BACKUP_ACTIVATION.FAIL_OVER;
import static org.apache.activemq.artemis.core.server.cluster.qourum.SharedNothingBackupQuorum.BACKUP_ACTIVATION.STOP;

public final class SharedNothingBackupActivation extends Activation implements ReplicationEndpointEventListener {

   private static final Logger logger = Logger.getLogger(SharedNothingBackupActivation.class);

   // this is how we act when we start as a backup; assigned once in the constructor
   private final ReplicaPolicy replicaPolicy;

   // this is the endpoint where we replicate to; created by init() and set to null by close()
   private ReplicationEndpoint replicationEndpoint;

   // the server this activation belongs to
   private final ActiveMQServerImpl activeMQServer;
   // created inside run() once initialisePart1 has succeeded; null until then
   private SharedNothingBackupQuorum backupQuorum;
   // handed to the replication endpoint, the quorum and the "announce backup" call
   private final boolean attemptFailBack;
   // run() looks up ActivationParams.REPLICATION_ENDPOINT in this map to pick the node locator
   private final Map activationParams;
   // stored by the constructor; not read anywhere else in this class
   private final IOCriticalErrorListener ioCriticalErrorListener;
   // node id reported by the node locator during the last locate attempt in run()
   private String nodeID;
   // connection to the live currently being tried; shared between run() and EndpointConnector
   ClusterControl clusterControl;
   // Set by close() while holding this object's monitor, but polled by the retry loop in run()
   // without any lock, so it has to be volatile for that read to be guaranteed to see the update.
   private volatile boolean closed;
   // false after init(), flipped to true by onRemoteBackupUpToDate()
   private volatile boolean backupUpToDate = true;
   // passed through to the SharedNothingBackupQuorum created in run()
   private final NetworkHealthCheck networkHealthCheck;

   // count is raised to 1 by the constructor and counted down by onRemoteBackupUpToDate()
   private final ReusableLatch backupSyncLatch = new ReusableLatch(0);

   /**
    * Creates the activation used when the server starts as a replicating backup.
    * <p>
    * Only stores its arguments and arms the backup-sync latch with a count of one; the
    * replication endpoint itself is created later by {@link #init()}.
    *
    * @param activeMQServer          the server being activated
    * @param attemptFailBack         flag later passed to the replication endpoint and the quorum
    * @param activationParams        activation parameters, consulted by {@link #run()} for
    *                                {@code ActivationParams.REPLICATION_ENDPOINT}
    * @param ioCriticalErrorListener listener kept for the lifetime of this activation
    * @param replicaPolicy           the replica HA policy driving this activation
    * @param networkHealthCheck      health check handed to the backup quorum
    */
   public SharedNothingBackupActivation(ActiveMQServerImpl activeMQServer,
                                        boolean attemptFailBack,
                                        Map activationParams,
                                        IOCriticalErrorListener ioCriticalErrorListener,
                                        ReplicaPolicy replicaPolicy,
                                        NetworkHealthCheck networkHealthCheck) {
      // collaborators
      this.activeMQServer = activeMQServer;
      this.replicaPolicy = replicaPolicy;
      this.networkHealthCheck = networkHealthCheck;
      this.ioCriticalErrorListener = ioCriticalErrorListener;

      // activation settings
      this.attemptFailBack = attemptFailBack;
      this.activationParams = activationParams;

      // waitForBackupSync() blocks until onRemoteBackupUpToDate() counts this down
      this.backupSyncLatch.setCount(1);
   }

   /**
    * Prepares this activation: resets the server's node manager, flags the backup as not yet
    * up to date and creates the {@link ReplicationEndpoint} that will receive replicated data.
    * <p>
    * With assertions enabled, calling this while an endpoint already exists fails.
    *
    * @throws Exception if resetting the node manager or creating the endpoint fails
    */
   public void init() throws Exception {
      assert this.replicationEndpoint == null;
      this.activeMQServer.resetNodeManager();
      this.backupUpToDate = false;
      this.replicationEndpoint = new ReplicationEndpoint(this.activeMQServer, this.attemptFailBack, this);
   }

   /**
    * Runs the backup activation on the activation thread.
    * <p>
    * The method goes through these phases:
    * <ol>
    * <li>marks the server as STARTED, stops the node manager, moves the existing server data away
    * and starts the node manager again;</li>
    * <li>runs {@code initialisePart1} and registers a {@link SharedNothingBackupQuorum} plus a
    * {@code ServerConnectVoteHandler} with the quorum manager;</li>
    * <li>picks a {@link LiveNodeLocator}, waits for the replication cluster connection and starts
    * the backup manager;</li>
    * <li>loops: locates a live, connects to it, runs an {@code EndpointConnector} and waits for the
    * quorum to signal what to do next. {@code ALREADY_REPLICATING} retries, {@code STOP} gives up,
    * {@code FAILURE_REPLICATING}/{@code FAILURE_RETRY} stop (and for the latter restart) the server
    * from a separate thread, {@code FAIL_OVER} leaves the loop;</li>
    * <li>after the loop: unregisters the quorum, refuses to continue when the backup is not in
    * sync, switches the HA policy to the replicated (live) policy and completes activation.</li>
    * </ol>
    * Exceptions never escape: they are reported through
    * {@code ActiveMQServerLogger.LOGGER.initializationError}, except for
    * {@link InterruptedException} and {@link IllegalStateException} raised while the server is no
    * longer started, which are swallowed silently.
    */
   @Override
   public void run() {
      try {

         logger.trace("SharedNothingBackupActivation..start");
         synchronized (activeMQServer) {
            activeMQServer.setState(ActiveMQServerImpl.SERVER_STATE.STARTED);
         }
         // move all data away:
         activeMQServer.getNodeManager().stop();
         activeMQServer.moveServerData(replicaPolicy.getMaxSavedReplicatedJournalsSize());
         activeMQServer.getNodeManager().start();
         synchronized (this) {
            // close() may have been called while the data was being moved
            if (closed) {
               logger.trace("SharedNothingBackupActivation is closed, ignoring activation!");
               return;
            }
         }

         boolean scalingDown = replicaPolicy.getScaleDownPolicy() != null && replicaPolicy.getScaleDownPolicy().isEnabled();

         if (!activeMQServer.initialisePart1(scalingDown)) {
            if (logger.isTraceEnabled()) {
               logger.trace("could not initialize part1 " + scalingDown);
            }
            return;
         }

         synchronized (this) {
            if (closed)
               return;
            // created under the same monitor close() uses, so close() either sees the quorum and
            // tells it to exit, or this block sees closed == true and never creates it
            backupQuorum = new SharedNothingBackupQuorum(activeMQServer.getNodeManager(), activeMQServer.getScheduledPool(), networkHealthCheck, replicaPolicy.getQuorumSize(), replicaPolicy.getVoteRetries(), replicaPolicy.getVoteRetryWait(), replicaPolicy.getQuorumVoteWait(), attemptFailBack);
            activeMQServer.getClusterManager().getQuorumManager().registerQuorum(backupQuorum);
            activeMQServer.getClusterManager().getQuorumManager().registerQuorumHandler(new ServerConnectVoteHandler(activeMQServer));
         }

         //use a Node Locator to connect to the cluster
         LiveNodeLocator nodeLocator;
         if (activationParams.get(ActivationParams.REPLICATION_ENDPOINT) != null) {
            // an explicit topology member was supplied: replicate from exactly that node
            TopologyMember member = (TopologyMember) activationParams.get(ActivationParams.REPLICATION_ENDPOINT);
            nodeLocator = new NamedNodeIdNodeLocator(member.getNodeId(), new Pair<>(member.getLive(), member.getBackup()));
         } else {
            // otherwise locate by group name when one is configured, or take any live
            nodeLocator = replicaPolicy.getGroupName() == null ? new AnyLiveNodeLocatorForReplication(backupQuorum, activeMQServer, replicaPolicy.getRetryReplicationWait()) : new NamedLiveNodeLocatorForReplication(replicaPolicy.getGroupName(), backupQuorum, replicaPolicy.getRetryReplicationWait());
         }
         ClusterController clusterController = activeMQServer.getClusterManager().getClusterController();
         clusterController.addClusterTopologyListenerForReplication(nodeLocator);

         logger.trace("Waiting on cluster connection");
         clusterController.awaitConnectionToReplicationCluster();

         logger.trace("Cluster Connected");

         clusterController.addIncomingInterceptorForReplication(new ReplicationError(nodeLocator));

         logger.debug("Starting backup manager");
         activeMQServer.getBackupManager().start();

         replicationEndpoint.setExecutor(activeMQServer.getExecutorFactory().getExecutor());
         EndpointConnector endpointConnector = new EndpointConnector();

         logger.debug("Starting Backup Server");
         ActiveMQServerLogger.LOGGER.backupServerStarted(activeMQServer.getVersion().getFullVersion(), activeMQServer.getNodeManager().getNodeId());
         // NOTE(review): unlike the first setState call above, this one is not made while holding
         // the server monitor - confirm that is intentional.
         activeMQServer.setState(ActiveMQServerImpl.SERVER_STATE.STARTED);

         if (logger.isTraceEnabled())
            logger.trace("Setting server state as started");

         SharedNothingBackupQuorum.BACKUP_ACTIVATION signal;
         do {
            // polled without holding this object's monitor
            if (closed) {
               logger.debug("Activation is closed, so giving up");
               return;
            }

            if (logger.isTraceEnabled()) {
               logger.trace("looking up the node through nodeLocator.locateNode()");
            }
            //locate the first live server to try to replicate
            nodeLocator.locateNode();
            // The type arguments are required: getA()/getB() are passed below to a method that
            // takes a TransportConfiguration, which a raw Pair (returning Object) cannot satisfy.
            Pair<TransportConfiguration, TransportConfiguration> possibleLive = nodeLocator.getLiveConfiguration();
            nodeID = nodeLocator.getNodeID();
            if (logger.isDebugEnabled()) {
               logger.debug("Connecting towards a possible live, connection information=" + possibleLive + ", nodeID=" + nodeID);
            }

            //in a normal (non failback) scenario if we couldn't find our live server we should fail
            if (!attemptFailBack) {
               logger.debug("attemptFailback=false, nodeID=" + nodeID);

               //this shouldn't happen
               if (nodeID == null) {
                  logger.debug("Throwing a RuntimeException as nodeID==null ant attemptFailback=false");
                  throw new RuntimeException("Could not establish the connection");
               }
               activeMQServer.getNodeManager().setNodeID(nodeID);
            }

            if (possibleLive != null) {
               // try the first connector of the pair, then fall back to the second one
               clusterControl = tryConnectToNodeInReplicatedCluster(clusterController, possibleLive.getA());
               if (clusterControl == null) {
                  clusterControl = tryConnectToNodeInReplicatedCluster(clusterController, possibleLive.getB());
               }
            } else {
               clusterControl = null;
            }
            if (clusterControl == null) {

               if (logger.isTraceEnabled()) {
                  logger.trace("sleeping " + clusterController.getRetryIntervalForReplicatedCluster() + " it should retry");
               }
               //its ok to retry here since we haven't started replication yet
               //it may just be the server has gone since discovery
               Thread.sleep(clusterController.getRetryIntervalForReplicatedCluster());
               // in a do/while, 'continue' jumps to the loop condition; setting the signal to
               // ALREADY_REPLICATING is what makes that condition true so the loop runs again
               signal = SharedNothingBackupQuorum.BACKUP_ACTIVATION.ALREADY_REPLICATING;
               continue;
            }

            activeMQServer.getThreadPool().execute(endpointConnector);
            /**
             * Wait for a signal from the quorum manager, at this point if replication has been successful we can
             * fail over or if there is an error trying to replicate (such as already replicating) we try the
             * process again on the next live server.  All the action happens inside {@link SharedNothingBackupQuorum}
             */
            signal = backupQuorum.waitForStatusChange();

            if (logger.isTraceEnabled()) {
               logger.trace("Got a signal " + signal + " through backupQuorum.waitForStatusChange()");
            }

            /**
             * replicationEndpoint will be holding lots of open files. Make sure they get
             * closed/sync'ed.
             */
            ActiveMQServerImpl.stopComponent(replicationEndpoint);
            // time to give up
            if (!activeMQServer.isStarted() || signal == STOP) {
               if (logger.isTraceEnabled()) {
                  logger.trace("giving up on the activation:: activemqServer.isStarted=" + activeMQServer.isStarted() + " while signal = " + signal);
               }
               return;
            } else if (signal == FAIL_OVER) {
               // time to fail over
               if (logger.isTraceEnabled()) {
                  logger.trace("signal == FAIL_OVER, breaking the loop");
               }
               break;
            } else if (signal == SharedNothingBackupQuorum.BACKUP_ACTIVATION.FAILURE_REPLICATING || signal == SharedNothingBackupQuorum.BACKUP_ACTIVATION.FAILURE_RETRY) {
               // something has gone badly run restart from scratch
               if (logger.isTraceEnabled()) {
                  logger.trace("Starting a new thread to stop the server!");
               }

               final SharedNothingBackupQuorum.BACKUP_ACTIVATION signalToStop = signal;

               // the stop (and optional restart) is issued from a separate thread; this
               // activation thread returns right after starting it
               Thread startThread = new Thread(new Runnable() {
                  @Override
                  public void run() {
                     try {
                        if (logger.isTraceEnabled()) {
                           logger.trace("Calling activeMQServer.stop() as initialization failed");
                        }
                        if (activeMQServer.getState() != ActiveMQServer.SERVER_STATE.STOPPED &&
                            activeMQServer.getState() != ActiveMQServer.SERVER_STATE.STOPPING) {

                           if (signalToStop == SharedNothingBackupQuorum.BACKUP_ACTIVATION.FAILURE_RETRY) {
                              // FAILURE_RETRY: stop, then start the server again
                              activeMQServer.stop(false);
                              logger.trace("The server was shutdown for a network isolation, we keep retrying");
                              activeMQServer.start();
                           } else {
                              // FAILURE_REPLICATING: stop and stay down
                              activeMQServer.stop();
                           }
                        }
                     } catch (Exception e) {
                        ActiveMQServerLogger.LOGGER.errorRestartingBackupServer(e, activeMQServer);
                     }
                  }
               });
               startThread.start();
               return;
            }
            //ok, this live is no good, let's reset and try again
            //close this session factory, we're done with it
            clusterControl.close();
            backupQuorum.reset();
            if (replicationEndpoint.getChannel() != null) {
               replicationEndpoint.getChannel().close();
               replicationEndpoint.setChannel(null);
            }
         }
         while (signal == SharedNothingBackupQuorum.BACKUP_ACTIVATION.ALREADY_REPLICATING);

         if (logger.isTraceEnabled()) {
            logger.trace("Activation loop finished, current signal = " + signal);
         }

         activeMQServer.getClusterManager().getQuorumManager().unRegisterQuorum(backupQuorum);

         // a backup that never finished its initial synchronization must not become live
         if (!isRemoteBackupUpToDate()) {
            logger.debug("throwing exception for !isRemoteBackupUptoDate");
            throw ActiveMQMessageBundle.BUNDLE.backupServerNotInSync();
         }

         if (logger.isTraceEnabled()) {
            logger.trace("@@@ setReplicaPolicy::" + replicaPolicy);
         }

         // from here on the server uses the replicated (live side) policy, which keeps a
         // reference back to this replica policy
         replicaPolicy.getReplicatedPolicy().setReplicaPolicy(replicaPolicy);
         activeMQServer.setHAPolicy(replicaPolicy.getReplicatedPolicy());

         synchronized (activeMQServer) {
            if (!activeMQServer.isStarted()) {
               logger.trace("Server is stopped, giving up right before becomingLive");
               return;
            }
            ActiveMQServerLogger.LOGGER.becomingLive(activeMQServer);
            logger.trace("stop backup");
            activeMQServer.getNodeManager().stopBackup();
            logger.trace("start store manager");
            activeMQServer.getStorageManager().start();
            logger.trace("activated");
            activeMQServer.getBackupManager().activated();
            if (scalingDown) {
               logger.trace("Scalling down...");
               activeMQServer.initialisePart2(true);
            } else {
               logger.trace("Setting up new activation");
               activeMQServer.setActivation(new SharedNothingLiveActivation(activeMQServer, replicaPolicy.getReplicatedPolicy()));
               logger.trace("initialize part 2");
               activeMQServer.initialisePart2(false);

               if (activeMQServer.getIdentity() != null) {
                  ActiveMQServerLogger.LOGGER.serverIsLive(activeMQServer.getIdentity());
               } else {
                  ActiveMQServerLogger.LOGGER.serverIsLive();
               }

            }

            logger.trace("completeActivation at the end");

            activeMQServer.completeActivation(true);
         }
      } catch (Exception e) {
         if (logger.isTraceEnabled()) {
            logger.trace(e.getMessage() + ", serverStarted=" + activeMQServer.isStarted(), e);
         }
         if ((e instanceof InterruptedException || e instanceof IllegalStateException) && !activeMQServer.isStarted())
            // do not log these errors if the server is being stopped.
            return;
         ActiveMQServerLogger.LOGGER.initializationError(e);
      }
   }

   /**
    * Attempts to open a cluster control connection to the given node.
    * <p>
    * This is a best-effort call: any exception raised while connecting is logged at debug level
    * and reported to the caller as {@code null}.
    *
    * @param clusterController controller used to open the connection
    * @param tc                connector of the node to try, may be {@code null}
    * @return the connection, or {@code null} when {@code tc} is {@code null} or connecting failed
    */
   private static ClusterControl tryConnectToNodeInReplicatedCluster(ClusterController clusterController, TransportConfiguration tc) {
      ClusterControl connection = null;
      try {
         if (logger.isTraceEnabled()) {
            logger.trace("Calling clusterController.connectToNodeInReplicatedCluster(" + tc + ")");
         }
         if (tc != null) {
            connection = clusterController.connectToNodeInReplicatedCluster(tc);
         }
      } catch (Exception failure) {
         logger.debug(failure.getMessage(), failure);
      }
      return connection;
   }

   /**
    * Closes this activation.
    * <p>
    * While holding this object's monitor it tells the backup quorum (if one was created) to exit
    * with {@code STOP}, drops the replication endpoint reference and marks the activation as
    * closed. If the server's HA policy is still a backup policy, the activation thread is then
    * interrupted and the node manager's backup is stopped.
    *
    * @param permanently not used by this implementation
    * @param restarting  not used by this implementation
    * @throws Exception if interrupting the activation thread or stopping the backup fails
    */
   @Override
   public void close(final boolean permanently, boolean restarting) throws Exception {
      synchronized (this) {
         if (backupQuorum != null) {
            backupQuorum.causeExit(STOP);
         }
         replicationEndpoint = null;
         closed = true;
      }

      // the server policy may have changed in the meantime, so it has to be checked here
      if (!activeMQServer.getHAPolicy().isBackup()) {
         return;
      }

      // take the node manager once so the null check below guards the same instance we stop
      final NodeManager currentNodeManager = activeMQServer.getNodeManager();
      activeMQServer.interruptActivationThread(currentNodeManager);
      if (currentNodeManager == null) {
         return;
      }
      currentNodeManager.stopBackup();
   }

   /**
    * Stops the replication endpoint, when there is one, before the storage is closed.
    *
    * @throws Exception if stopping the endpoint fails
    */
   @Override
   public void preStorageClose() throws Exception {
      if (replicationEndpoint == null) {
         // nothing to stop: init() has not run yet or close() already dropped the endpoint
         return;
      }
      replicationEndpoint.stop();
   }

   /**
    * Chooses the journal loader for this activation.
    * <p>
    * When the replica policy has an enabled scale-down policy a
    * {@code BackupRecoveryJournalLoader} is returned, built with the scale-down connector and the
    * cluster controller; otherwise the loader from the superclass is used.
    *
    * @return the journal loader to use
    * @throws ActiveMQException if the loader cannot be created
    */
   @Override
   public JournalLoader createJournalLoader(PostOffice postOffice,
                                            PagingManager pagingManager,
                                            StorageManager storageManager,
                                            QueueFactory queueFactory,
                                            NodeManager nodeManager,
                                            ManagementService managementService,
                                            GroupingHandler groupingHandler,
                                            Configuration configuration,
                                            ActiveMQServer parentServer) throws ActiveMQException {
      final boolean scaleDownEnabled = replicaPolicy.getScaleDownPolicy() != null && replicaPolicy.getScaleDownPolicy().isEnabled();

      if (!scaleDownEnabled) {
         return super.createJournalLoader(postOffice, pagingManager, storageManager, queueFactory, nodeManager, managementService, groupingHandler, configuration, parentServer);
      }

      return new BackupRecoveryJournalLoader(postOffice, pagingManager, storageManager, queueFactory, nodeManager, managementService, groupingHandler, configuration, parentServer, ScaleDownPolicy.getScaleDownConnector(replicaPolicy.getScaleDownPolicy(), activeMQServer), activeMQServer.getClusterManager().getClusterController());
   }

   /**
    * Passes the cluster name configured on the replica policy to the cluster controller as the
    * replicated cluster name.
    */
   @Override
   public void haStarted() {
      final ClusterController controller = activeMQServer.getClusterManager().getClusterController();
      controller.setReplicatedClusterName(replicaPolicy.getClusterName());
   }

   /**
    * Blocks until the backup has been synchronized or the timeout elapses.
    * <p>
    * The wait is released by {@code onRemoteBackupUpToDate()}, which counts the internal latch
    * down.
    *
    * @param timeout the maximum time to wait
    * @param unit    the unit of {@code timeout}
    * @return the result of awaiting the latch: {@code true} if it was released within the
    * timeout period, {@code false} otherwise
    * @throws InterruptedException if the calling thread is interrupted while waiting
    * @see java.util.concurrent.CountDownLatch#await(long, TimeUnit)
    */
   public boolean waitForBackupSync(long timeout, TimeUnit unit) throws InterruptedException {
      final boolean synced = this.backupSyncLatch.await(timeout, unit);
      return synced;
   }

   /**
    * Live has notified this server that it is going to stop.
    *
    * @param finalMessage the stopping message from the live; {@code null} makes the quorum exit
    *                     with {@code FAILURE_REPLICATING} instead of failing over
    */
   public void failOver(final ReplicationLiveIsStoppingMessage.LiveStopping finalMessage) {
      if (finalMessage != null) {
         backupQuorum.failOver(finalMessage);
         return;
      }
      backupQuorum.causeExit(FAILURE_REPLICATING);
   }

   /**
    * @return the current replication endpoint; {@code null} before {@link #init()} has created it
    * and again after {@code close} has dropped it
    */
   public ReplicationEndpoint getReplicationEndpoint() {
      return this.replicationEndpoint;
   }

   /**
    * Whether a remote backup server was in sync with its live server. If it was not in sync, it may
    * not take over the live's functions.
    * 

* A local backup server or a live server should always return {@code true} * * @return whether the backup is up-to-date, if the server is not a backup it always returns * {@code true}. */ public boolean isRemoteBackupUpToDate() { return backupUpToDate; } @Override public void onLiveNodeId(String nodeId) { backupQuorum.liveIDSet(nodeId); } @Override public void onRemoteBackupUpToDate() { activeMQServer.getBackupManager().announceBackup(); backupUpToDate = true; backupSyncLatch.countDown(); } /** * @throws ActiveMQException */ @Override public void onLiveStopping(ReplicationLiveIsStoppingMessage.LiveStopping finalMessage) throws ActiveMQException { if (logger.isTraceEnabled()) { logger.trace("Remote fail-over, got message=" + finalMessage + ", backupUpToDate=" + backupUpToDate); } if (!activeMQServer.getHAPolicy().isBackup() || activeMQServer.getHAPolicy().isSharedStore()) { throw new ActiveMQInternalErrorException(); } if (!backupUpToDate) { failOver(null); } else { failOver(finalMessage); } } private class EndpointConnector implements Runnable { @Override public void run() { try { //we should only try once, if its not there we should move on. 
clusterControl.getSessionFactory().setReconnectAttempts(1); backupQuorum.setSessionFactory(clusterControl.getSessionFactory()); //get the connection and request replication to live clusterControl.authorize(); connectToReplicationEndpoint(clusterControl); replicationEndpoint.start(); clusterControl.announceReplicatingBackupToLive(attemptFailBack, replicaPolicy.getClusterName()); } catch (Exception e) { //we shouldn't stop the server just mark the connector as tried and unavailable ActiveMQServerLogger.LOGGER.replicationStartProblem(e); backupQuorum.causeExit(FAILURE_REPLICATING); } } private synchronized ReplicationEndpoint connectToReplicationEndpoint(final ClusterControl control) throws Exception { if (!activeMQServer.isStarted()) return null; if (!activeMQServer.getHAPolicy().isBackup()) { throw ActiveMQMessageBundle.BUNDLE.serverNotBackupServer(); } Channel replicationChannel = control.createReplicationChannel(); replicationChannel.setHandler(replicationEndpoint); if (replicationEndpoint.getChannel() != null) { throw ActiveMQMessageBundle.BUNDLE.alreadyHaveReplicationServer(); } replicationEndpoint.setChannel(replicationChannel); return replicationEndpoint; } } @Override public boolean isReplicaSync() { return isRemoteBackupUpToDate(); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy