All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hadoop.ha.ZKFailoverController Maven / Gradle / Ivy

There is a newer version: 3.4.1
Show newest version
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.shaded.org.apache.hadoop.ha;

import java.org.apache.hadoop.shaded.io.IOException;
import java.org.apache.hadoop.shaded.net.InetSocketAddress;
import java.security.PrivilegedAction;
import java.security.PrivilegedExceptionAction;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

import org.apache.hadoop.shaded.org.apache.hadoop.HadoopIllegalArgumentException;
import org.apache.hadoop.shaded.org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.shaded.org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.shaded.org.apache.hadoop.fs.CommonConfigurationKeys;
import org.apache.hadoop.shaded.org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.shaded.org.apache.hadoop.fs.UnsupportedFileSystemException;
import org.apache.hadoop.shaded.org.apache.hadoop.ha.ActiveStandbyElector.ActiveNotFoundException;
import org.apache.hadoop.shaded.org.apache.hadoop.ha.ActiveStandbyElector.ActiveStandbyElectorCallback;
import org.apache.hadoop.shaded.org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
import org.apache.hadoop.shaded.org.apache.hadoop.ha.HAServiceProtocol.StateChangeRequestInfo;
import org.apache.hadoop.shaded.org.apache.hadoop.ha.HAServiceProtocol.RequestSource;
import org.apache.hadoop.shaded.org.apache.hadoop.security.ProviderUtils;
import org.apache.hadoop.shaded.org.apache.hadoop.util.ZKUtil;
import org.apache.hadoop.shaded.org.apache.hadoop.util.ZKUtil.ZKAuthInfo;
import org.apache.hadoop.shaded.org.apache.hadoop.ha.HealthMonitor.State;
import org.apache.hadoop.shaded.org.apache.hadoop.ipc.Server;
import org.apache.hadoop.shaded.org.apache.hadoop.security.AccessControlException;
import org.apache.hadoop.shaded.org.apache.hadoop.security.SecurityUtil;
import org.apache.hadoop.shaded.org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.shaded.org.apache.hadoop.security.authorize.PolicyProvider;
import org.apache.hadoop.shaded.org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.shaded.org.apache.zookeeper.KeeperException;
import org.apache.hadoop.shaded.org.apache.zookeeper.ZooDefs.Ids;
import org.apache.hadoop.shaded.org.apache.hadoop.util.ToolRunner;
import org.apache.hadoop.shaded.org.apache.zookeeper.data.ACL;

import org.apache.hadoop.shaded.org.apache.hadoop.thirdparty.org.apache.hadoop.shaded.com.google.org.apache.hadoop.shaded.com.on.annotations.VisibleForTesting;
import org.apache.hadoop.shaded.org.apache.hadoop.thirdparty.org.apache.hadoop.shaded.com.google.org.apache.hadoop.shaded.com.on.base.Preconditions;
import org.apache.hadoop.shaded.org.apache.hadoop.thirdparty.org.apache.hadoop.shaded.com.google.org.apache.hadoop.shaded.com.on.util.concurrent.ThreadFactoryBuilder;
import org.apache.hadoop.shaded.org.slf4j.Logger;
import org.apache.hadoop.shaded.org.slf4j.LoggerFactory;

@InterfaceAudience.LimitedPrivate("HDFS")
public abstract class ZKFailoverController {

  /** Logger shared by this class and its nested callback classes. */
  static final Logger LOG = LoggerFactory.getLogger(ZKFailoverController.class);
  
  /** Conf key holding the ZooKeeper quorum string; required (checked in initZK). */
  public static final String ZK_QUORUM_KEY = "ha.zookeeper.quorum";
  /** Conf key for the ZooKeeper session timeout. */
  private static final String ZK_SESSION_TIMEOUT_KEY = "ha.zookeeper.session-timeout.ms";
  /** Session timeout used when {@link #ZK_SESSION_TIMEOUT_KEY} is not set. */
  private static final int ZK_SESSION_TIMEOUT_DEFAULT = 10*1000;
  /** Conf key for the parent znode under which the ZKFC works. */
  private static final String ZK_PARENT_ZNODE_KEY = "ha.zookeeper.parent-znode";
  /** Conf key for the ACL specification applied to the znodes. */
  public static final String ZK_ACL_KEY = "ha.zookeeper.acl";
  /** ACL specification used when {@link #ZK_ACL_KEY} is not set. */
  private static final String ZK_ACL_DEFAULT = "world:anyone:rwcda";
  /** Conf key for the ZooKeeper authentication infos. */
  public static final String ZK_AUTH_KEY = "ha.zookeeper.auth";
  /** Parent znode used when {@link #ZK_PARENT_ZNODE_KEY} is not set. */
  static final String ZK_PARENT_ZNODE_DEFAULT = "/hadoop-ha";

  /**
   * All of the conf keys used by the ZKFC. This is used in order to allow
   * them to be overridden on a per-nameservice or per-namenode basis.
   */
  protected static final String[] ZKFC_CONF_KEYS = new String[] {
    ZK_QUORUM_KEY,
    ZK_SESSION_TIMEOUT_KEY,
    ZK_PARENT_ZNODE_KEY,
    ZK_ACL_KEY,
    ZK_AUTH_KEY
  };
  
  /** Usage text printed to stderr when an unrecognized argument is passed. */
  protected static final String USAGE =
      "Usage: hdfs zkfc [ -formatZK [-force] [-nonInteractive] ]\n"
      + "\t-force: formats the znode if the znode exists.\n"
      + "\t-nonInteractive: formats the znode aborts if the znode exists,\n"
      + "\tunless -force option is specified.";

  /** Unable to format the parent znode in ZK */
  static final int ERR_CODE_FORMAT_DENIED = 2;
  /** The parent znode doesn't exist in ZK */
  static final int ERR_CODE_NO_PARENT_ZNODE = 3;
  /** Fencing is not properly configured */
  static final int ERR_CODE_NO_FENCER = 4;
  /** Automatic failover is not enabled */
  static final int ERR_CODE_AUTO_FAILOVER_NOT_ENABLED = 5;
  /** Cannot connect to ZooKeeper */
  static final int ERR_CODE_NO_ZK = 6;
  
  /** Configuration from which all ZKFC settings are read. */
  protected Configuration conf;
  /** ZooKeeper quorum string; populated by initZK() from {@link #ZK_QUORUM_KEY}. */
  private String zkQuorum;
  /** The local HA service this controller manages. */
  protected final HAServiceTarget localTarget;

  /** Created and started by initHM(); null until then. */
  private HealthMonitor healthMonitor;
  /** Created by initZK(); null until then. */
  private ActiveStandbyElector elector;
  /** Created by initRPC(); null until then. */
  protected ZKFCRpcServer rpcServer;

  /** Last health state reported by the health monitor; guarded by {@code this}. */
  private State lastHealthState = State.INITIALIZING;

  /** The HA state this controller believes the local service to be in. */
  private volatile HAServiceState serviceState = HAServiceState.INITIALIZING;

  /** Set if a fatal error occurs */
  private String fatalError = null;

  /**
   * A future nanotime before which the ZKFC will not join the election.
   * This is used during graceful failover.
   */
  private long delayJoiningUntilNanotime = 0;

  /** Executor on which {@link #scheduleRecheck(long)} schedules events */
  private final ScheduledExecutorService delayExecutor =
    Executors.newScheduledThreadPool(1,
        new ThreadFactoryBuilder().setDaemon(true)
            .setNameFormat("ZKFC Delay timer #%d")
            .build());

  /** Result of the most recent attempt to become active; guarded by the lock below. */
  private ActiveAttemptRecord lastActiveAttemptRecord;
  /**
   * Monitor guarding {@link #lastActiveAttemptRecord}. Final so that every
   * thread is guaranteed to synchronize and wait on the same object.
   */
  private final Object activeAttemptRecordLock = new Object();

  /**
   * Creates a controller for the given local service.
   *
   * @param conf configuration the controller reads its settings from
   * @param localTarget the local HA service this controller manages
   */
  protected ZKFailoverController(Configuration conf, HAServiceTarget localTarget) {
    this.conf = conf;
    this.localTarget = localTarget;
  }
  

  /**
   * Serializes a target into the byte payload handed to the elector when
   * joining the election.
   */
  protected abstract byte[] targetToData(HAServiceTarget target);

  /**
   * Inverse of {@link #targetToData(HAServiceTarget)}: rebuilds a target from
   * the active-node data obtained from the elector.
   */
  protected abstract HAServiceTarget dataToTarget(byte[] data);

  /** Performs the login that {@link #run(String[])} requires before doing any work. */
  protected abstract void loginAsFCUser() throws IOException;

  /**
   * Access check hook for administrative RPCs.
   * NOTE(review): not invoked from this class itself; presumably called by
   * the RPC server - confirm against ZKFCRpcServer.
   */
  protected abstract void checkRpcAdminAccess()
      throws AccessControlException, IOException;

  /** @return the address the ZKFC RPC server is bound to. */
  protected abstract InetSocketAddress getRpcAddressToBindTo();

  /** @return the policy provider passed to the ZKFC RPC server. */
  protected abstract PolicyProvider getPolicyProvider();

  /**
   * @return the other nodes of the HA group; they are asked to cede the
   *         election during a graceful failover.
   */
  protected abstract List<HAServiceTarget> getAllOtherNodes();

  /**
   * Return the name of a znode inside the configured parent znode in which
   * the ZKFC will do all of its work. This is so that multiple federated
   * nameservices can run on the same ZK quorum without having to manually
   * configure them to separate subdirectories.
   */
  protected abstract String getScopeInsideParentNode();

  /** @return the local HA service this controller manages. */
  public HAServiceTarget getLocalTarget() {
    return this.localTarget;
  }

  /** @return the HA state this controller currently records for the local service. */
  @VisibleForTesting
  public HAServiceState getServiceState() {
    final HAServiceState current = this.serviceState;
    return current;
  }

  /**
   * Entry point of the failover controller.
   *
   * Refuses to start when automatic failover is disabled for the local
   * target. Otherwise logs in via {@link #loginAsFCUser()} and executes
   * the real work as the login user. The elector's ZooKeeper connection,
   * if one was created, is terminated when that work finishes.
   *
   * @param args command line arguments
   * @return 0 or one of the {@code ERR_CODE_*} values
   * @throws Exception whatever the underlying work threw
   */
  public int run(final String[] args) throws Exception {
    if (!localTarget.isAutoFailoverEnabled()) {
      LOG.error("Automatic failover is not enabled for " + localTarget + "." +
          " Please ensure that automatic failover is enabled in the " +
          "configuration before running the ZK failover controller.");
      return ERR_CODE_AUTO_FAILOVER_NOT_ENABLED;
    }
    loginAsFCUser();
    try {
      return SecurityUtil.doAsLoginUserOrFatal(new PrivilegedAction<Integer>() {
        @Override
        public Integer run() {
          try {
            return doRun(args);
          } catch (Exception t) {
            // PrivilegedAction cannot throw checked exceptions; tunnel it out.
            throw new RuntimeException(t);
          } finally {
            if (elector != null) {
              elector.terminateConnection();
            }
          }
        }
      });
    } catch (RuntimeException rte) {
      // Unwrap the tunnelled exception. A RuntimeException raised elsewhere
      // (e.g. during connection teardown) may carry no cause, or a cause
      // that is an Error; rethrow it unchanged rather than failing with a
      // NullPointerException or ClassCastException that hides the problem.
      final Throwable cause = rte.getCause();
      if (cause instanceof Exception) {
        throw (Exception) cause;
      }
      throw rte;
    }
  }
  

  /**
   * Performs the work of {@link #run(String[])} as the login user.
   *
   * Connects to ZooKeeper, handles the optional {@code -formatZK} command,
   * verifies that the parent znode exists and fencing is configured, then
   * starts the RPC server and health monitor and blocks in the main loop.
   *
   * @param args command line arguments
   * @return 0 or one of the {@code ERR_CODE_*} values
   * @throws Exception on a bad argument or a runtime failure
   */
  private int doRun(String[] args)
      throws Exception {
    try {
      initZK();
    } catch (KeeperException ke) {
      LOG.error("Unable to start failover controller. Unable to connect "
          + "to ZooKeeper quorum at " + zkQuorum + ". Please check the "
          + "configured value for " + ZK_QUORUM_KEY + " and ensure that "
          + "ZooKeeper is running.", ke);
      return ERR_CODE_NO_ZK;
    }
    try {
      if (args.length > 0) {
        if ("-formatZK".equals(args[0])) {
          boolean force = false;
          boolean interactive = true;
          for (int i = 1; i < args.length; i++) {
            if ("-force".equals(args[i])) {
              force = true;
            } else if ("-nonInteractive".equals(args[i])) {
              interactive = false;
            } else {
              badArg(args[i]);
            }
          }
          return formatZK(force, interactive);
        }
        else {
          badArg(args[0]);
        }
      }
    } catch (Exception e){
      LOG.error("The failover controller encounters runtime error", e);
      throw e;
    }

    if (!elector.parentZNodeExists()) {
      LOG.error("Unable to start failover controller. "
          + "Parent znode does not exist.\n"
          + "Run with -formatZK flag to initialize ZooKeeper.");
      return ERR_CODE_NO_PARENT_ZNODE;
    }

    try {
      localTarget.checkFencingConfigured();
    } catch (BadFencingConfigurationException e) {
      LOG.error("Fencing is not configured for " + localTarget + ".\n" +
          "You must configure a fencing method before using automatic " +
          "failover.", e);
      return ERR_CODE_NO_FENCER;
    }

    try {
      initRPC();
      initHM();
      startRPC();
      mainLoop();
    } catch (Exception e) {
      LOG.error("The failover controller encounters runtime error: ", e);
      throw e;
    } finally {
      // initRPC()/initHM() may have failed before assigning their fields.
      // Only tear down what was actually created, so a NullPointerException
      // here cannot replace the exception that caused the shutdown.
      if (rpcServer != null) {
        rpcServer.stopAndJoin();
      }

      elector.quitElection(true);
      if (healthMonitor != null) {
        healthMonitor.shutdown();
        healthMonitor.join();
      }
    }
    return 0;
  }

  /**
   * Prints the usage text and rejects the given argument.
   *
   * @param arg the unrecognized command line argument
   * @throws HadoopIllegalArgumentException always
   */
  private void badArg(String arg) {
    final String message = "Bad argument: " + arg;
    printUsage();
    throw new HadoopIllegalArgumentException(message);
  }

  /** Writes the usage text, followed by an empty line, to stderr. */
  private void printUsage() {
    final String usageWithBlankLine = USAGE + "\n";
    System.err.println(usageWithBlankLine);
  }

  /**
   * Clears (if present) and recreates the parent znode.
   *
   * An existing znode is only cleared when {@code force} is set, or when
   * running interactively and the user confirms the prompt.
   *
   * @param force clear an existing znode without asking
   * @param interactive whether the user may be prompted for confirmation
   * @return 0 on success, {@link #ERR_CODE_FORMAT_DENIED} if clearing was
   *         not permitted, or 1 if clearing failed
   */
  private int formatZK(boolean force, boolean interactive)
      throws IOException, InterruptedException, KeeperException {
    final boolean alreadyPresent = elector.parentZNodeExists();
    if (alreadyPresent) {
      // confirmFormat() is only consulted when not forced and interactive.
      final boolean permitted = force || (interactive && confirmFormat());
      if (!permitted) {
        return ERR_CODE_FORMAT_DENIED;
      }

      try {
        elector.clearParentZNode();
      } catch (IOException clearFailure) {
        LOG.error("Unable to clear zk parent znode", clearFailure);
        return 1;
      }
    }

    elector.ensureParentZNode();
    return 0;
  }

  /**
   * Warns on stderr that the parent znode already exists and asks the user
   * whether to proceed with formatting.
   *
   * @return the user's answer, or false if the prompt could not be read
   */
  private boolean confirmFormat() {
    final String znodePath = getParentZnode();
    final String warning =
        "===============================================\n"
        + "The configured parent znode " + znodePath + " already exists.\n"
        + "Are you sure you want to clear all failover information from\n"
        + "ZooKeeper?\n"
        + "WARNING: Before proceeding, ensure that all HDFS services and\n"
        + "failover controllers are stopped!\n"
        + "===============================================";
    System.err.println(warning);

    boolean confirmed;
    try {
      confirmed = ToolRunner.confirmPrompt("Proceed formatting " + znodePath + "?");
    } catch (IOException promptFailure) {
      LOG.debug("Failed to confirm", promptFailure);
      confirmed = false;
    }
    return confirmed;
  }

  // ------------------------------------------
  // Begin actual guts of failover controller
  // ------------------------------------------
  
  /**
   * Creates the health monitor for the local target, registers the health
   * and service-state callbacks of this controller, and starts it.
   */
  private void initHM() {
    final HealthMonitor monitor = new HealthMonitor(conf, localTarget);
    healthMonitor = monitor;
    monitor.addCallback(new HealthCallbacks());
    monitor.addServiceStateCallback(new ServiceStateCallBacks());
    monitor.start();
  }

  /**
   * Creates (but does not start) the ZKFC RPC server on the address given
   * by {@link #getRpcAddressToBindTo()}.
   */
  protected void initRPC() throws IOException {
    final InetSocketAddress address = getRpcAddressToBindTo();
    LOG.info("ZKFC RpcServer binding to {}", address);
    final PolicyProvider policy = getPolicyProvider();
    rpcServer = new ZKFCRpcServer(conf, address, this, policy);
  }

  /** Starts the RPC server previously created by {@link #initRPC()}. */
  protected void startRPC() throws IOException {
    this.rpcServer.start();
  }


  /**
   * Reads the ZooKeeper related settings from the configuration and creates
   * the {@link ActiveStandbyElector}.
   *
   * @throws HadoopIllegalArgumentException per the signature; which calls
   *         raise it is not visible from here
   * @throws IllegalArgumentException from the sanity checks when the quorum
   *         is not configured or the session timeout is not positive
   * @throws IOException per the signature, from the configuration helpers
   *         or the elector
   * @throws KeeperException per the signature, from the elector
   */
  private void initZK() throws HadoopIllegalArgumentException, IOException,
      KeeperException {
    zkQuorum = conf.get(ZK_QUORUM_KEY);
    int zkTimeout = conf.getInt(ZK_SESSION_TIMEOUT_KEY,
        ZK_SESSION_TIMEOUT_DEFAULT);
    // Parse ACLs from configuration.
    String zkAclConf = conf.get(ZK_ACL_KEY, ZK_ACL_DEFAULT);
    zkAclConf = ZKUtil.resolveConfIndirection(zkAclConf);
    List<ACL> zkAcls = ZKUtil.parseACLs(zkAclConf);
    if (zkAcls.isEmpty()) {
      zkAcls = Ids.CREATOR_ALL_ACL;
    }
    
    // Parse authentication from configuration. Exclude any Credential providers
    // using the hdfs scheme to avoid a circular dependency. As HDFS is likely
    // not started when ZKFC is started, we cannot read the credentials from it.
    Configuration c = conf;
    try {
      c = ProviderUtils.excludeIncompatibleCredentialProviders(
          conf, FileSystem.getFileSystemClass("hdfs", conf));
    } catch (UnsupportedFileSystemException e) {
      // Should not happen in a real cluster, as the hdfs FS will always be
      // present. Inside tests, the hdfs filesystem will not be present
      LOG.debug("No filesystem found for the hdfs scheme", e);
    }
    List<ZKAuthInfo> zkAuths = SecurityUtil.getZKAuthInfos(c, ZK_AUTH_KEY);

    // Sanity check configuration.
    Preconditions.checkArgument(zkQuorum != null,
        "Missing required configuration '%s' for ZooKeeper quorum",
        ZK_QUORUM_KEY);
    Preconditions.checkArgument(zkTimeout > 0,
        "Invalid ZK session timeout %s", zkTimeout);
    
    int maxRetryNum = conf.getInt(
        CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_KEY,
        CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_DEFAULT);
    elector = new ActiveStandbyElector(zkQuorum,
        zkTimeout, getParentZnode(), zkAcls, zkAuths,
        new ElectorCallbacks(), maxRetryNum);
  }
  
  /**
   * @return the configured parent znode (or its default), terminated by a
   *         single separating "/", followed by the scope returned from
   *         {@link #getScopeInsideParentNode()}
   */
  private String getParentZnode() {
    final String configured = conf.get(ZK_PARENT_ZNODE_KEY,
        ZK_PARENT_ZNODE_DEFAULT);
    final String prefix = configured.endsWith("/") ? configured : configured + "/";
    return prefix + getScopeInsideParentNode();
  }

  /**
   * Blocks the calling thread until a fatal error has been reported via
   * {@link #fatalError(String)}, then fails with that error.
   *
   * @throws InterruptedException if interrupted while waiting
   * @throws RuntimeException always, once a fatal error has been recorded
   */
  private synchronized void mainLoop() throws InterruptedException {
    for (;;) {
      if (fatalError != null) {
        break;
      }
      // Woken by notifyAll() in fatalError(); re-check to tolerate spurious wakeups.
      wait();
    }
    assert fatalError != null; // only get here on fatal
    final String reason = "ZK Failover Controller failed: " + fatalError;
    throw new RuntimeException(reason);
  }
  
  /**
   * Logs and records a fatal error, then wakes every thread waiting on this
   * controller's monitor (notably the main loop).
   *
   * @param err description of the failure
   */
  private synchronized void fatalError(String err) {
    final String logLine = "Fatal error occurred:" + err;
    LOG.error(logLine);
    this.fatalError = err;
    this.notifyAll();
  }
  
  /**
   * Asks the local service to transition to active and records the outcome
   * of the attempt, whether it succeeded or not.
   *
   * TODO:
   * we need to make sure that if we get fenced and then quickly restarted,
   * none of these calls will retry across the restart boundary
   * perhaps the solution is that, whenever the nn starts, it gets a unique
   * ID, and when we start becoming active, we record it, and then any future
   * calls use the same ID
   *
   * @throws ServiceFailedException if the transition fails for any reason;
   *         other failures are wrapped with the original as the cause
   */
  private synchronized void becomeActive() throws ServiceFailedException {
    LOG.info("Trying to make " + localTarget + " active...");
    try {
      final int rpcTimeout = FailoverController.getRpcTimeoutToNewActive(conf);
      HAServiceProtocolHelper.transitionToActive(
          localTarget.getProxy(conf, rpcTimeout), createReqInfo());

      final String successMsg =
          "Successfully transitioned " + localTarget + " to active state";
      LOG.info(successMsg);
      serviceState = HAServiceState.ACTIVE;
      recordActiveAttempt(new ActiveAttemptRecord(true, successMsg));
    } catch (Throwable failure) {
      final String failureMsg = "Couldn't make " + localTarget + " active";
      LOG.error(failureMsg, failure);

      final String details =
          failureMsg + "\n" + StringUtils.stringifyException(failure);
      recordActiveAttempt(new ActiveAttemptRecord(false, details));

      if (!(failure instanceof ServiceFailedException)) {
        throw new ServiceFailedException("Couldn't transition to active",
            failure);
      }
      throw (ServiceFailedException) failure;
    }
  }

  /**
   * Publish the outcome of the most recent attempt to become active and
   * wake any thread waiting for it. During a manually initiated failover
   * this is how the result of the attempt is reported back to the
   * initiator of the failover.
   *
   * @param record the outcome to publish
   */
  private void recordActiveAttempt(
      ActiveAttemptRecord record) {
    final Object monitor = activeAttemptRecordLock;
    synchronized (monitor) {
      this.lastActiveAttemptRecord = record;
      monitor.notifyAll();
    }
  }

  /**
   * Wait until one of the following events:
   * 
    *
  • Another thread publishes the results of an attempt to become active * using {@link #recordActiveAttempt(ActiveAttemptRecord)}
  • *
  • The node enters bad health status
  • *
  • The specified timeout elapses
  • *
* * @param timeoutMillis number of millis to wait * @param onlyAfterNanoTime accept attempt records only after a given * timestamp. Use this parameter to ignore the old attempt records from a * previous fail-over attempt. * @return the published record, or null if the timeout elapses or the * service becomes unhealthy * @throws InterruptedException if the thread is interrupted. */ private ActiveAttemptRecord waitForActiveAttempt(int timeoutMillis, long onlyAfterNanoTime) throws InterruptedException { long waitUntil = onlyAfterNanoTime + TimeUnit.NANOSECONDS.convert( timeoutMillis, TimeUnit.MILLISECONDS); do { // periodically check health state, because entering an // unhealthy state could prevent us from ever attempting to // become active. We can detect this and respond to the user // immediately. synchronized (this) { if (lastHealthState != State.SERVICE_HEALTHY) { // early out if service became unhealthy return null; } } synchronized (activeAttemptRecordLock) { if ((lastActiveAttemptRecord != null && lastActiveAttemptRecord.nanoTime >= onlyAfterNanoTime)) { return lastActiveAttemptRecord; } // Only wait 1sec so that we periodically recheck the health state // above. activeAttemptRecordLock.wait(1000); } } while (System.nanoTime() < waitUntil); // Timeout elapsed. LOG.warn(timeoutMillis + "ms timeout elapsed waiting for an attempt " + "to become active"); return null; } private StateChangeRequestInfo createReqInfo() { return new StateChangeRequestInfo(RequestSource.REQUEST_BY_ZKFC); } private synchronized void becomeStandby() { LOG.info("ZK Election indicated that " + localTarget + " should become standby"); try { int timeout = FailoverController.getGracefulFenceTimeout(conf); localTarget.getProxy(conf, timeout).transitionToStandby(createReqInfo()); LOG.info("Successfully transitioned " + localTarget + " to standby state"); } catch (Exception e) { LOG.error("Couldn't transition " + localTarget + " to standby state", e); // TODO handle this. 
It's a likely case since we probably got fenced // at the same time. } serviceState = HAServiceState.STANDBY; } private synchronized void fenceOldActive(byte[] data) { HAServiceTarget target = dataToTarget(data); try { doFence(target); } catch (Throwable t) { recordActiveAttempt(new ActiveAttemptRecord(false, "Unable to fence old active: " + StringUtils.stringifyException(t))); throw t; } } private void doFence(HAServiceTarget target) { LOG.info("Should fence: " + target); boolean gracefulWorked = new FailoverController(conf, RequestSource.REQUEST_BY_ZKFC).tryGracefulFence(target); if (gracefulWorked) { // It's possible that it's in standby but just about to go into active, // no? Is there some race here? LOG.info("Successfully transitioned " + target + " to standby " + "state without fencing"); return; } try { target.checkFencingConfigured(); } catch (BadFencingConfigurationException e) { LOG.error("Couldn't fence old active " + target, e); recordActiveAttempt(new ActiveAttemptRecord(false, "Unable to fence old active")); throw new RuntimeException(e); } if (!target.getFencer().fence(target)) { throw new RuntimeException("Unable to fence " + target); } } /** * Request from graceful failover to cede active role. Causes * this ZKFC to transition its local node to standby, then quit * the election for the specified period of time, after which it * will rejoin iff it is healthy. 
*/ void cedeActive(final int millisToCede) throws AccessControlException, ServiceFailedException, IOException { try { UserGroupInformation.getLoginUser().doAs(new PrivilegedExceptionAction() { @Override public Void run() throws Exception { doCedeActive(millisToCede); return null; } }); } catch (InterruptedException e) { throw new IOException(e); } } private void doCedeActive(int millisToCede) throws AccessControlException, ServiceFailedException, IOException { int timeout = FailoverController.getGracefulFenceTimeout(conf); // Lock elector to maintain lock ordering of elector -> ZKFC synchronized (elector) { synchronized (this) { if (millisToCede <= 0) { delayJoiningUntilNanotime = 0; recheckElectability(); return; } LOG.info("Requested by " + UserGroupInformation.getCurrentUser() + " at " + Server.getRemoteAddress() + " to cede active role."); boolean needFence = false; try { localTarget.getProxy(conf, timeout).transitionToStandby(createReqInfo()); LOG.info("Successfully ensured local node is in standby mode"); } catch (IOException org.apache.hadoop.shaded.io.) { LOG.warn("Unable to transition local node to standby: " + org.apache.hadoop.shaded.io..getLocalizedMessage()); LOG.warn("Quitting election but indicating that fencing is " + "necessary"); needFence = true; } delayJoiningUntilNanotime = System.nanoTime() + TimeUnit.MILLISECONDS.toNanos(millisToCede); elector.quitElection(needFence); serviceState = HAServiceState.INITIALIZING; } } recheckElectability(); } /** * Coordinate a graceful failover to this node. 
* @throws ServiceFailedException if the node fails to become active * @throws IOException some other error occurs */ void gracefulFailoverToYou() throws ServiceFailedException, IOException { try { UserGroupInformation.getLoginUser().doAs(new PrivilegedExceptionAction() { @Override public Void run() throws Exception { doGracefulFailover(); return null; } }); } catch (InterruptedException e) { throw new IOException(e); } } /** * Coordinate a graceful failover. This proceeds in several phases: * 1) Pre-flight checks: ensure that the local node is healthy, and * thus a candidate for failover. * 2a) Determine the current active node. If it is the local node, no * need to failover - return success. * 2b) Get the other nodes * 3a) Ask the other nodes to yield from election for a number of seconds * 3b) Ask the active node to yield from the election for a number of seconds. * 4) Allow the normal election path to run in other threads. Wait until * we either become unhealthy or we see an election attempt recorded by * the normal code path. * 5) Allow the old active to rejoin the election, so a future * failback is possible. */ private void doGracefulFailover() throws ServiceFailedException, IOException, InterruptedException { int timeout = FailoverController.getGracefulFenceTimeout(conf) * 2; // Phase 1: pre-flight checks checkEligibleForFailover(); // Phase 2: determine old/current active node. Check that we're not // ourselves active, etc. HAServiceTarget oldActive = getCurrentActive(); if (oldActive == null) { // No node is currently active. So, if we aren't already // active ourselves by means of a normal election, then there's // probably something preventing us from becoming active. throw new ServiceFailedException( "No other node is currently active."); } if (oldActive.getAddress().equals(localTarget.getAddress())) { LOG.info("Local node " + localTarget + " is already active. " + "No need to failover. 
Returning success."); return; } // Phase 2b: get the other nodes List otherNodes = getAllOtherNodes(); List otherZkfcs = new ArrayList(otherNodes.size()); // Phase 3: ask the other nodes to yield from the election. long st = System.nanoTime(); HAServiceTarget activeNode = null; for (HAServiceTarget remote : otherNodes) { // same location, same node - may not always be == equality if (remote.getAddress().equals(oldActive.getAddress())) { activeNode = remote; continue; } otherZkfcs.add(cedeRemoteActive(remote, timeout)); } assert activeNode != null : "Active node does not match any known remote node"; // Phase 3b: ask the old active to yield otherZkfcs.add(cedeRemoteActive(activeNode, timeout)); // Phase 4: wait for the normal election to make the local node // active. ActiveAttemptRecord attempt = waitForActiveAttempt(timeout + 60000, st); if (attempt == null) { // We didn't even make an attempt to become active. synchronized(this) { if (lastHealthState != State.SERVICE_HEALTHY) { throw new ServiceFailedException("Unable to become active. " + "Service became unhealthy while trying to failover."); } } throw new ServiceFailedException("Unable to become active. " + "Local node did not get an opportunity to do so from ZooKeeper, " + "or the local node took too long to transition to active."); } // Phase 5. At this point, we made some attempt to become active. So we // can tell the old active to rejoin if it wants. This allows a quick // fail-back if we immediately crash. for (ZKFCProtocol zkfc : otherZkfcs) { zkfc.cedeActive(-1); } if (attempt.succeeded) { LOG.info("Successfully became active. " + attempt.status); } else { // Propagate failure String msg = "Failed to become active. " + attempt.status; throw new ServiceFailedException(msg); } } /** * Ask the remote zkfc to cede its active status and wait for the specified * timeout before attempting to claim leader status. 
* @param remote node to ask * @param timeout amount of time to cede * @return the {@link ZKFCProtocol} used to talk to the ndoe * @throws IOException */ private ZKFCProtocol cedeRemoteActive(HAServiceTarget remote, int timeout) throws IOException { LOG.info("Asking " + remote + " to cede its active state for " + timeout + "ms"); ZKFCProtocol oldZkfc = remote.getZKFCProxy(conf, timeout); oldZkfc.cedeActive(timeout); return oldZkfc; } /** * If the local node is an observer or is unhealthy it * is not eligible for graceful failover. * @throws ServiceFailedException if the node is an observer or unhealthy */ private synchronized void checkEligibleForFailover() throws ServiceFailedException { // Check health if (this.getLastHealthState() != State.SERVICE_HEALTHY) { throw new ServiceFailedException( localTarget + " is not currently healthy. " + "Cannot be failover target"); } if (serviceState == HAServiceState.OBSERVER) { throw new ServiceFailedException( localTarget + " is in observer state. " + "Cannot be failover target"); } } /** * @return an {@link HAServiceTarget} for the current active node * in the cluster, or null if no node is active. * @throws IOException if a ZK-related issue occurs * @throws InterruptedException if thread is interrupted */ private HAServiceTarget getCurrentActive() throws IOException, InterruptedException { synchronized (elector) { synchronized (this) { byte[] activeData; try { activeData = elector.getActiveData(); } catch (ActiveNotFoundException e) { return null; } catch (KeeperException ke) { throw new IOException( "Unexpected ZooKeeper issue fetching active node info", ke); } HAServiceTarget oldActive = dataToTarget(activeData); return oldActive; } } } /** * Check the current state of the service, and join the election * if it should be in the election. 
*/ private void recheckElectability() { // Maintain lock ordering of elector -> ZKFC synchronized (elector) { synchronized (this) { boolean healthy = lastHealthState == State.SERVICE_HEALTHY; long remainingDelay = delayJoiningUntilNanotime - System.nanoTime(); if (remainingDelay > 0) { if (healthy) { LOG.info("Would have joined master election, but this node is " + "prohibited from doing so for " + TimeUnit.NANOSECONDS.toMillis(remainingDelay) + " more ms"); } scheduleRecheck(remainingDelay); return; } switch (lastHealthState) { case SERVICE_HEALTHY: if(serviceState != HAServiceState.OBSERVER) { elector.joinElection(targetToData(localTarget)); } if (quitElectionOnBadState) { quitElectionOnBadState = false; } break; case INITIALIZING: LOG.info("Ensuring that " + localTarget + " does not " + "participate in active master election"); elector.quitElection(false); serviceState = HAServiceState.INITIALIZING; break; case SERVICE_UNHEALTHY: case SERVICE_NOT_RESPONDING: LOG.info("Quitting master election for " + localTarget + " and marking that fencing is necessary"); elector.quitElection(true); serviceState = HAServiceState.INITIALIZING; break; case HEALTH_MONITOR_FAILED: fatalError("Health monitor failed!"); break; default: throw new IllegalArgumentException("Unhandled state:" + lastHealthState); } } } } /** * Schedule a call to {@link #recheckElectability()} in the future. 
*/ private void scheduleRecheck(long whenNanos) { delayExecutor.schedule( new Runnable() { @Override public void run() { try { recheckElectability(); } catch (Throwable t) { fatalError("Failed to recheck electability: " + StringUtils.stringifyException(t)); } } }, whenNanos, TimeUnit.NANOSECONDS); } int serviceStateMismatchCount = 0; boolean quitElectionOnBadState = false; void verifyChangedServiceState(HAServiceState changedState) { synchronized (elector) { synchronized (this) { if (serviceState == HAServiceState.INITIALIZING) { if (quitElectionOnBadState) { LOG.debug("rechecking for electability from bad state"); recheckElectability(); } return; } if (changedState == HAServiceState.OBSERVER) { elector.quitElection(true); serviceState = HAServiceState.OBSERVER; return; } if (changedState == serviceState) { serviceStateMismatchCount = 0; return; } if (serviceStateMismatchCount == 0) { // recheck one more time. As this might be due to parallel transition. serviceStateMismatchCount++; return; } // quit the election as the expected state and reported state // mismatches. LOG.error("Local service " + localTarget + " has changed the serviceState to " + changedState + ". Expected was " + serviceState + ". Quitting election marking fencing necessary."); delayJoiningUntilNanotime = System.nanoTime() + TimeUnit.MILLISECONDS.toNanos(1000); elector.quitElection(true); quitElectionOnBadState = true; serviceStateMismatchCount = 0; serviceState = HAServiceState.INITIALIZING; } } } /** * @return the last health state passed to the FC * by the HealthMonitor. 
*/ protected synchronized State getLastHealthState() { return lastHealthState; } protected synchronized void setLastHealthState(HealthMonitor.State newState) { LOG.info("Local service " + localTarget + " entered state: " + newState); lastHealthState = newState; } @VisibleForTesting public ActiveStandbyElector getElectorForTests() { return elector; } @VisibleForTesting ZKFCRpcServer getRpcServerForTests() { return rpcServer; } /** * Callbacks from elector */ class ElectorCallbacks implements ActiveStandbyElectorCallback { @Override public void becomeActive() throws ServiceFailedException { ZKFailoverController.this.becomeActive(); } @Override public void becomeStandby() { ZKFailoverController.this.becomeStandby(); } @Override public void enterNeutralMode() { } @Override public void notifyFatalError(String errorMessage) { fatalError(errorMessage); } @Override public void fenceOldActive(byte[] data) { ZKFailoverController.this.fenceOldActive(data); } @Override public String toString() { synchronized (ZKFailoverController.this) { return "Elector callbacks for " + localTarget; } } } /** * Callbacks from HealthMonitor */ class HealthCallbacks implements HealthMonitor.Callback { @Override public void enteredState(HealthMonitor.State newState) { setLastHealthState(newState); recheckElectability(); } } /** * Callbacks for HAServiceStatus */ class ServiceStateCallBacks implements HealthMonitor.ServiceStateCallback { @Override public void reportServiceStatus(HAServiceStatus status) { verifyChangedServiceState(status.getState()); } } private static class ActiveAttemptRecord { private final boolean succeeded; private final String status; private final long nanoTime; public ActiveAttemptRecord(boolean succeeded, String status) { this.succeeded = succeeded; this.status = status; this.nanoTime = System.nanoTime(); } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy