org.apache.hadoop.ha.ZKFailoverController Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of hadoop-common Show documentation
Apache Hadoop Common
There is a newer version: 3.4.0
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.ha;

import java.io.IOException;
import java.net.InetSocketAddress;
import java.security.PrivilegedAction;
import java.security.PrivilegedExceptionAction;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

import org.apache.hadoop.HadoopIllegalArgumentException;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.CommonConfigurationKeys;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.UnsupportedFileSystemException;
import org.apache.hadoop.ha.ActiveStandbyElector.ActiveNotFoundException;
import org.apache.hadoop.ha.ActiveStandbyElector.ActiveStandbyElectorCallback;
import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
import org.apache.hadoop.ha.HAServiceProtocol.StateChangeRequestInfo;
import org.apache.hadoop.ha.HAServiceProtocol.RequestSource;
import org.apache.hadoop.security.ProviderUtils;
import org.apache.hadoop.util.ZKUtil;
import org.apache.hadoop.util.ZKUtil.ZKAuthInfo;
import org.apache.hadoop.ha.HealthMonitor.State;
import org.apache.hadoop.ipc.Server;
import org.apache.hadoop.security.AccessControlException;
import org.apache.hadoop.security.SecurityUtil;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.authorize.PolicyProvider;
import org.apache.hadoop.util.StringUtils;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.ZooDefs.Ids;
import org.apache.hadoop.util.ToolRunner;
import org.apache.zookeeper.data.ACL;

import org.apache.hadoop.thirdparty.com.google.common.annotations.VisibleForTesting;
import org.apache.hadoop.thirdparty.com.google.common.base.Preconditions;
import org.apache.hadoop.thirdparty.com.google.common.util.concurrent.ThreadFactoryBuilder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@InterfaceAudience.LimitedPrivate("HDFS")
public abstract class ZKFailoverController {

  static final Logger LOG = LoggerFactory.getLogger(ZKFailoverController.class);
  
  public static final String ZK_QUORUM_KEY = "ha.zookeeper.quorum";
  private static final String ZK_SESSION_TIMEOUT_KEY = "ha.zookeeper.session-timeout.ms";
  private static final int ZK_SESSION_TIMEOUT_DEFAULT = 10*1000;
  private static final String ZK_PARENT_ZNODE_KEY = "ha.zookeeper.parent-znode";
  public static final String ZK_ACL_KEY = "ha.zookeeper.acl";
  private static final String ZK_ACL_DEFAULT = "world:anyone:rwcda";
  public static final String ZK_AUTH_KEY = "ha.zookeeper.auth";
  static final String ZK_PARENT_ZNODE_DEFAULT = "/hadoop-ha";

  /**
   * All of the conf keys used by the ZKFC. This is used in order to allow
   * them to be overridden on a per-nameservice or per-namenode basis.
   */
  protected static final String[] ZKFC_CONF_KEYS = new String[] {
    ZK_QUORUM_KEY,
    ZK_SESSION_TIMEOUT_KEY,
    ZK_PARENT_ZNODE_KEY,
    ZK_ACL_KEY,
    ZK_AUTH_KEY
  };
  
  protected static final String USAGE =
      "Usage: hdfs zkfc [ -formatZK [-force] [-nonInteractive] ]\n"
      + "\t-force: formats the znode if the znode exists.\n"
      + "\t-nonInteractive: formats the znode aborts if the znode exists,\n"
      + "\tunless -force option is specified.";

  /** Unable to format the parent znode in ZK */
  static final int ERR_CODE_FORMAT_DENIED = 2;
  /** The parent znode doesn't exist in ZK */
  static final int ERR_CODE_NO_PARENT_ZNODE = 3;
  /** Fencing is not properly configured */
  static final int ERR_CODE_NO_FENCER = 4;
  /** Automatic failover is not enabled */
  static final int ERR_CODE_AUTO_FAILOVER_NOT_ENABLED = 5;
  /** Cannot connect to ZooKeeper */
  static final int ERR_CODE_NO_ZK = 6;
  
  protected Configuration conf;
  private String zkQuorum;
  protected final HAServiceTarget localTarget;

  private HealthMonitor healthMonitor;
  private ActiveStandbyElector elector;
  protected ZKFCRpcServer rpcServer;

  private State lastHealthState = State.INITIALIZING;

  private volatile HAServiceState serviceState = HAServiceState.INITIALIZING;

  /** Set if a fatal error occurs */
  private String fatalError = null;

  /**
   * A future nanotime before which the ZKFC will not join the election.
   * This is used during graceful failover.
   */
  private long delayJoiningUntilNanotime = 0;

  /** Executor on which {@link #scheduleRecheck(long)} schedules events */
  private ScheduledExecutorService delayExecutor =
    Executors.newScheduledThreadPool(1,
        new ThreadFactoryBuilder().setDaemon(true)
            .setNameFormat("ZKFC Delay timer #%d")
            .build());

  private ActiveAttemptRecord lastActiveAttemptRecord;
  private Object activeAttemptRecordLock = new Object();

  protected ZKFailoverController(Configuration conf, HAServiceTarget localTarget) {
    this.localTarget = localTarget;
    this.conf = conf;
  }
  

  protected abstract byte[] targetToData(HAServiceTarget target);
  protected abstract HAServiceTarget dataToTarget(byte[] data);
  protected abstract void loginAsFCUser() throws IOException;
  protected abstract void checkRpcAdminAccess()
      throws AccessControlException, IOException;
  protected abstract InetSocketAddress getRpcAddressToBindTo();
  protected abstract PolicyProvider getPolicyProvider();
  protected abstract List getAllOtherNodes();

  /**
   * Return the name of a znode inside the configured parent znode in which
   * the ZKFC will do all of its work. This is so that multiple federated
   * nameservices can run on the same ZK quorum without having to manually
   * configure them to separate subdirectories.
   */
  protected abstract String getScopeInsideParentNode();

  public HAServiceTarget getLocalTarget() {
    return localTarget;
  }

  @VisibleForTesting
  public HAServiceState getServiceState() {
    return serviceState;
  }

  public int run(final String[] args) throws Exception {
    if (!localTarget.isAutoFailoverEnabled()) {
      LOG.error("Automatic failover is not enabled for " + localTarget + "." +
          " Please ensure that automatic failover is enabled in the " +
          "configuration before running the ZK failover controller.");
      return ERR_CODE_AUTO_FAILOVER_NOT_ENABLED;
    }
    loginAsFCUser();
    try {
      return SecurityUtil.doAsLoginUserOrFatal(new PrivilegedAction() {
        @Override
        public Integer run() {
          try {
            return doRun(args);
          } catch (Exception t) {
            throw new RuntimeException(t);
          } finally {
            if (elector != null) {
              elector.terminateConnection();
            }
          }
        }
      });
    } catch (RuntimeException rte) {
      throw (Exception)rte.getCause();
    }
  }
  

  private int doRun(String[] args)
      throws Exception {
    try {
      initZK();
    } catch (KeeperException ke) {
      LOG.error("Unable to start failover controller. Unable to connect "
          + "to ZooKeeper quorum at " + zkQuorum + ". Please check the "
          + "configured value for " + ZK_QUORUM_KEY + " and ensure that "
          + "ZooKeeper is running.", ke);
      return ERR_CODE_NO_ZK;
    }
    try {
      if (args.length > 0) {
        if ("-formatZK".equals(args[0])) {
          boolean force = false;
          boolean interactive = true;
          for (int i = 1; i < args.length; i++) {
            if ("-force".equals(args[i])) {
              force = true;
            } else if ("-nonInteractive".equals(args[i])) {
              interactive = false;
            } else {
              badArg(args[i]);
            }
          }
          return formatZK(force, interactive);
        }
        else {
          badArg(args[0]);
        }
      }
    } catch (Exception e){
      LOG.error("The failover controller encounters runtime error", e);
      throw e;
    }

    if (!elector.parentZNodeExists()) {
      LOG.error("Unable to start failover controller. "
          + "Parent znode does not exist.\n"
          + "Run with -formatZK flag to initialize ZooKeeper.");
      return ERR_CODE_NO_PARENT_ZNODE;
    }

    try {
      localTarget.checkFencingConfigured();
    } catch (BadFencingConfigurationException e) {
      LOG.error("Fencing is not configured for " + localTarget + ".\n" +
          "You must configure a fencing method before using automatic " +
          "failover.", e);
      return ERR_CODE_NO_FENCER;
    }

    try {
      initRPC();
      initHM();
      startRPC();
      mainLoop();
    } catch (Exception e) {
      LOG.error("The failover controller encounters runtime error: ", e);
      throw e;
    } finally {
      rpcServer.stopAndJoin();
      
      elector.quitElection(true);
      healthMonitor.shutdown();
      healthMonitor.join();
    }
    return 0;
  }

  private void badArg(String arg) {
    printUsage();
    throw new HadoopIllegalArgumentException(
        "Bad argument: " + arg);
  }

  private void printUsage() {
    System.err.println(USAGE + "\n");
  }

  private int formatZK(boolean force, boolean interactive)
      throws IOException, InterruptedException, KeeperException {
    if (elector.parentZNodeExists()) {
      if (!force && (!interactive || !confirmFormat())) {
        return ERR_CODE_FORMAT_DENIED;
      }
      
      try {
        elector.clearParentZNode();
      } catch (IOException e) {
        LOG.error("Unable to clear zk parent znode", e);
        return 1;
      }
    }
    
    elector.ensureParentZNode();
    return 0;
  }

  private boolean confirmFormat() {
    String parentZnode = getParentZnode();
    System.err.println(
        "===============================================\n" +
        "The configured parent znode " + parentZnode + " already exists.\n" +
        "Are you sure you want to clear all failover information from\n" +
        "ZooKeeper?\n" +
        "WARNING: Before proceeding, ensure that all HDFS services and\n" +
        "failover controllers are stopped!\n" +
        "===============================================");
    try {
      return ToolRunner.confirmPrompt("Proceed formatting " + parentZnode + "?");
    } catch (IOException e) {
      LOG.debug("Failed to confirm", e);
      return false;
    }
  }

  // ------------------------------------------
  // Begin actual guts of failover controller
  // ------------------------------------------
  
  private void initHM() {
    healthMonitor = new HealthMonitor(conf, localTarget);
    healthMonitor.addCallback(new HealthCallbacks());
    healthMonitor.addServiceStateCallback(new ServiceStateCallBacks());
    healthMonitor.start();
  }

  protected void initRPC() throws IOException {
    InetSocketAddress bindAddr = getRpcAddressToBindTo();
    LOG.info("ZKFC RpcServer binding to {}", bindAddr);
    rpcServer = new ZKFCRpcServer(conf, bindAddr, this, getPolicyProvider());
  }

  protected void startRPC() throws IOException {
    rpcServer.start();
  }


  private void initZK() throws HadoopIllegalArgumentException, IOException,
      KeeperException {
    zkQuorum = conf.get(ZK_QUORUM_KEY);
    int zkTimeout = conf.getInt(ZK_SESSION_TIMEOUT_KEY,
        ZK_SESSION_TIMEOUT_DEFAULT);
    // Parse ACLs from configuration.
    String zkAclConf = conf.get(ZK_ACL_KEY, ZK_ACL_DEFAULT);
    zkAclConf = ZKUtil.resolveConfIndirection(zkAclConf);
    List zkAcls = ZKUtil.parseACLs(zkAclConf);
    if (zkAcls.isEmpty()) {
      zkAcls = Ids.CREATOR_ALL_ACL;
    }
    
    // Parse authentication from configuration. Exclude any Credential providers
    // using the hdfs scheme to avoid a circular dependency. As HDFS is likely
    // not started when ZKFC is started, we cannot read the credentials from it.
    Configuration c = conf;
    try {
      c = ProviderUtils.excludeIncompatibleCredentialProviders(
          conf, FileSystem.getFileSystemClass("hdfs", conf));
    } catch (UnsupportedFileSystemException e) {
      // Should not happen in a real cluster, as the hdfs FS will always be
      // present. Inside tests, the hdfs filesystem will not be present
      LOG.debug("No filesystem found for the hdfs scheme", e);
    }
    List zkAuths = SecurityUtil.getZKAuthInfos(c, ZK_AUTH_KEY);

    // Sanity check configuration.
    Preconditions.checkArgument(zkQuorum != null,
        "Missing required configuration '%s' for ZooKeeper quorum",
        ZK_QUORUM_KEY);
    Preconditions.checkArgument(zkTimeout > 0,
        "Invalid ZK session timeout %s", zkTimeout);
    
    int maxRetryNum = conf.getInt(
        CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_KEY,
        CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_DEFAULT);
    elector = new ActiveStandbyElector(zkQuorum,
        zkTimeout, getParentZnode(), zkAcls, zkAuths,
        new ElectorCallbacks(), maxRetryNum);
  }
  
  private String getParentZnode() {
    String znode = conf.get(ZK_PARENT_ZNODE_KEY,
        ZK_PARENT_ZNODE_DEFAULT);
    if (!znode.endsWith("/")) {
      znode += "/";
    }
    return znode + getScopeInsideParentNode();
  }

  private synchronized void mainLoop() throws InterruptedException {
    while (fatalError == null) {
      wait();
    }
    assert fatalError != null; // only get here on fatal
    throw new RuntimeException(
        "ZK Failover Controller failed: " + fatalError);
  }
  
  private synchronized void fatalError(String err) {
    LOG.error("Fatal error occurred:" + err);
    fatalError = err;
    notifyAll();
  }
  
  private synchronized void becomeActive() throws ServiceFailedException {
    LOG.info("Trying to make " + localTarget + " active...");
    try {
      HAServiceProtocolHelper.transitionToActive(localTarget.getProxy(
          conf, FailoverController.getRpcTimeoutToNewActive(conf)),
          createReqInfo());
      String msg = "Successfully transitioned " + localTarget +
          " to active state";
      LOG.info(msg);
      serviceState = HAServiceState.ACTIVE;
      recordActiveAttempt(new ActiveAttemptRecord(true, msg));

    } catch (Throwable t) {
      String msg = "Couldn't make " + localTarget + " active";
      LOG.error(msg, t);
      
      recordActiveAttempt(new ActiveAttemptRecord(false, msg + "\n" +
          StringUtils.stringifyException(t)));

      if (t instanceof ServiceFailedException) {
        throw (ServiceFailedException)t;
      } else {
        throw new ServiceFailedException("Couldn't transition to active",
            t);
      }
/*
* TODO:
* we need to make sure that if we get fenced and then quickly restarted,
* none of these calls will retry across the restart boundary
* perhaps the solution is that, whenever the nn starts, it gets a unique
* ID, and when we start becoming active, we record it, and then any future
* calls use the same ID
*/
      
    }
  }

  /**
   * Store the results of the last attempt to become active.
   * This is used so that, during manually initiated failover,
   * we can report back the results of the attempt to become active
   * to the initiator of the failover.
   */
  private void recordActiveAttempt(
      ActiveAttemptRecord record) {
    synchronized (activeAttemptRecordLock) {
      lastActiveAttemptRecord = record;
      activeAttemptRecordLock.notifyAll();
    }
  }

  /**
   * Wait until one of the following events:
   * 
   * Another thread publishes the results of an attempt to become active
   * using {@link #recordActiveAttempt(ActiveAttemptRecord)}
   * The node enters bad health status
   * The specified timeout elapses
   * 
   * 
   * @param timeoutMillis number of millis to wait
   * @param onlyAfterNanoTime accept attempt records only after a given
   * timestamp. Use this parameter to ignore the old attempt records from a
   * previous fail-over attempt.
   * @return the published record, or null if the timeout elapses or the
   * service becomes unhealthy 
   * @throws InterruptedException if the thread is interrupted.
   */
  private ActiveAttemptRecord waitForActiveAttempt(int timeoutMillis,
      long onlyAfterNanoTime) throws InterruptedException {
    long waitUntil = onlyAfterNanoTime + TimeUnit.NANOSECONDS.convert(
        timeoutMillis, TimeUnit.MILLISECONDS);
    
    do {
      // periodically check health state, because entering an
      // unhealthy state could prevent us from ever attempting to
      // become active. We can detect this and respond to the user
      // immediately.
      synchronized (this) {
        if (lastHealthState != State.SERVICE_HEALTHY) {
          // early out if service became unhealthy
          return null;
        }
      }

      synchronized (activeAttemptRecordLock) {
        if ((lastActiveAttemptRecord != null &&
            lastActiveAttemptRecord.nanoTime >= onlyAfterNanoTime)) {
          return lastActiveAttemptRecord;
        }
        // Only wait 1sec so that we periodically recheck the health state
        // above.
        activeAttemptRecordLock.wait(1000);
      }
    } while (System.nanoTime() < waitUntil);
    
    // Timeout elapsed.
    LOG.warn(timeoutMillis + "ms timeout elapsed waiting for an attempt " +
        "to become active");
    return null;
  }

  private StateChangeRequestInfo createReqInfo() {
    return new StateChangeRequestInfo(RequestSource.REQUEST_BY_ZKFC);
  }

  private synchronized void becomeStandby() {
    LOG.info("ZK Election indicated that " + localTarget +
        " should become standby");
    try {
      int timeout = FailoverController.getGracefulFenceTimeout(conf);
      localTarget.getProxy(conf, timeout).transitionToStandby(createReqInfo());
      LOG.info("Successfully transitioned " + localTarget +
          " to standby state");
    } catch (Exception e) {
      LOG.error("Couldn't transition " + localTarget + " to standby state",
          e);
      // TODO handle this. It's a likely case since we probably got fenced
      // at the same time.
    }
    serviceState = HAServiceState.STANDBY;
  }
  

  private synchronized void fenceOldActive(byte[] data) {
    HAServiceTarget target = dataToTarget(data);
    
    try {
      doFence(target);
    } catch (Throwable t) {
      recordActiveAttempt(new ActiveAttemptRecord(false, "Unable to fence old active: " + StringUtils.stringifyException(t)));
      throw t;
    }
  }
  
  private void doFence(HAServiceTarget target) {
    LOG.info("Should fence: " + target);
    boolean gracefulWorked = new FailoverController(conf,
        RequestSource.REQUEST_BY_ZKFC).tryGracefulFence(target);
    if (gracefulWorked) {
      // It's possible that it's in standby but just about to go into active,
      // no? Is there some race here?
      LOG.info("Successfully transitioned " + target + " to standby " +
          "state without fencing");
      return;
    }
    
    try {
      target.checkFencingConfigured();
    } catch (BadFencingConfigurationException e) {
      LOG.error("Couldn't fence old active " + target, e);
      recordActiveAttempt(new ActiveAttemptRecord(false, "Unable to fence old active"));
      throw new RuntimeException(e);
    }
    
    if (!target.getFencer().fence(target)) {
      throw new RuntimeException("Unable to fence " + target);
    }
  }


  /**
   * Request from graceful failover to cede active role. Causes
   * this ZKFC to transition its local node to standby, then quit
   * the election for the specified period of time, after which it
   * will rejoin iff it is healthy.
   */
  void cedeActive(final int millisToCede)
      throws AccessControlException, ServiceFailedException, IOException {
    try {
      UserGroupInformation.getLoginUser().doAs(new PrivilegedExceptionAction() {
        @Override
        public Void run() throws Exception {
          doCedeActive(millisToCede);
          return null;
        }
      });
    } catch (InterruptedException e) {
      throw new IOException(e);
    }
  }
  
  private void doCedeActive(int millisToCede) 
      throws AccessControlException, ServiceFailedException, IOException {
    int timeout = FailoverController.getGracefulFenceTimeout(conf);

    // Lock elector to maintain lock ordering of elector -> ZKFC
    synchronized (elector) {
      synchronized (this) {
        if (millisToCede <= 0) {
          delayJoiningUntilNanotime = 0;
          recheckElectability();
          return;
        }
  
        LOG.info("Requested by " + UserGroupInformation.getCurrentUser() +
            " at " + Server.getRemoteAddress() + " to cede active role.");
        boolean needFence = false;
        try {
          localTarget.getProxy(conf, timeout).transitionToStandby(createReqInfo());
          LOG.info("Successfully ensured local node is in standby mode");
        } catch (IOException ioe) {
          LOG.warn("Unable to transition local node to standby: " +
              ioe.getLocalizedMessage());
          LOG.warn("Quitting election but indicating that fencing is " +
              "necessary");
          needFence = true;
        }
        delayJoiningUntilNanotime = System.nanoTime() +
            TimeUnit.MILLISECONDS.toNanos(millisToCede);
        elector.quitElection(needFence);
        serviceState = HAServiceState.INITIALIZING;
      }
    }
    recheckElectability();
  }
  
  /**
   * Coordinate a graceful failover to this node.
   * @throws ServiceFailedException if the node fails to become active
   * @throws IOException some other error occurs
   */
  void gracefulFailoverToYou() throws ServiceFailedException, IOException {
    try {
      UserGroupInformation.getLoginUser().doAs(new PrivilegedExceptionAction() {
        @Override
        public Void run() throws Exception {
          doGracefulFailover();
          return null;
        }
        
      });
    } catch (InterruptedException e) {
      throw new IOException(e);
    }
  }

  /**
   * Coordinate a graceful failover. This proceeds in several phases:
   * 1) Pre-flight checks: ensure that the local node is healthy, and
   * thus a candidate for failover.
   * 2a) Determine the current active node. If it is the local node, no
   * need to failover - return success.
   * 2b) Get the other nodes
   * 3a) Ask the other nodes to yield from election for a number of seconds
   * 3b) Ask the active node to yield from the election for a number of seconds.
   * 4) Allow the normal election path to run in other threads. Wait until
   * we either become unhealthy or we see an election attempt recorded by
   * the normal code path.
   * 5) Allow the old active to rejoin the election, so a future
   * failback is possible.
   */
  private void doGracefulFailover()
      throws ServiceFailedException, IOException, InterruptedException {
    int timeout = FailoverController.getGracefulFenceTimeout(conf) * 2;
    
    // Phase 1: pre-flight checks
    checkEligibleForFailover();
    
    // Phase 2: determine old/current active node. Check that we're not
    // ourselves active, etc.
    HAServiceTarget oldActive = getCurrentActive();
    if (oldActive == null) {
      // No node is currently active. So, if we aren't already
      // active ourselves by means of a normal election, then there's
      // probably something preventing us from becoming active.
      throw new ServiceFailedException(
          "No other node is currently active.");
    }
    
    if (oldActive.getAddress().equals(localTarget.getAddress())) {
      LOG.info("Local node " + localTarget + " is already active. " +
          "No need to failover. Returning success.");
      return;
    }

    // Phase 2b: get the other nodes
    List otherNodes = getAllOtherNodes();
    List otherZkfcs = new ArrayList(otherNodes.size());

    // Phase 3: ask the other nodes to yield from the election.
    long st = System.nanoTime();
    HAServiceTarget activeNode = null;
    for (HAServiceTarget remote : otherNodes) {
      // same location, same node - may not always be == equality
      if (remote.getAddress().equals(oldActive.getAddress())) {
        activeNode = remote;
        continue;
      }
      otherZkfcs.add(cedeRemoteActive(remote, timeout));
    }

    assert
      activeNode != null : "Active node does not match any known remote node";

    // Phase 3b: ask the old active to yield
    otherZkfcs.add(cedeRemoteActive(activeNode, timeout));

    // Phase 4: wait for the normal election to make the local node
    // active.
    ActiveAttemptRecord attempt = waitForActiveAttempt(timeout + 60000, st);
    
    if (attempt == null) {
      // We didn't even make an attempt to become active.
      synchronized(this) {
        if (lastHealthState != State.SERVICE_HEALTHY) {
          throw new ServiceFailedException("Unable to become active. " +
            "Service became unhealthy while trying to failover.");          
        }
      }
      
      throw new ServiceFailedException("Unable to become active. " +
          "Local node did not get an opportunity to do so from ZooKeeper, " +
          "or the local node took too long to transition to active.");
    }

    // Phase 5. At this point, we made some attempt to become active. So we
    // can tell the old active to rejoin if it wants. This allows a quick
    // fail-back if we immediately crash.
    for (ZKFCProtocol zkfc : otherZkfcs) {
      zkfc.cedeActive(-1);
    }

    if (attempt.succeeded) {
      LOG.info("Successfully became active. " + attempt.status);
    } else {
      // Propagate failure
      String msg = "Failed to become active. " + attempt.status;
      throw new ServiceFailedException(msg);
    }
  }

  /**
   * Ask the remote zkfc to cede its active status and wait for the specified
   * timeout before attempting to claim leader status.
   * @param remote node to ask
   * @param timeout amount of time to cede
   * @return the {@link ZKFCProtocol} used to talk to the ndoe
   * @throws IOException
   */
  private ZKFCProtocol cedeRemoteActive(HAServiceTarget remote, int timeout)
    throws IOException {
    LOG.info("Asking " + remote + " to cede its active state for "
               + timeout + "ms");
    ZKFCProtocol oldZkfc = remote.getZKFCProxy(conf, timeout);
    oldZkfc.cedeActive(timeout);
    return oldZkfc;
  }

  /**
   * If the local node is an observer or is unhealthy it
   * is not eligible for graceful failover.
   * @throws ServiceFailedException if the node is an observer or unhealthy
   */
  private synchronized void checkEligibleForFailover()
      throws ServiceFailedException {
    // Check health
    if (this.getLastHealthState() != State.SERVICE_HEALTHY) {
      throw new ServiceFailedException(
          localTarget + " is not currently healthy. " +
          "Cannot be failover target");
    }
    if (serviceState == HAServiceState.OBSERVER) {
      throw new ServiceFailedException(
          localTarget + " is in observer state. " +
          "Cannot be failover target");
    }
  }

  /**
   * @return an {@link HAServiceTarget} for the current active node
   * in the cluster, or null if no node is active.
   * @throws IOException if a ZK-related issue occurs
   * @throws InterruptedException if thread is interrupted 
   */
  private HAServiceTarget getCurrentActive()
      throws IOException, InterruptedException {
    synchronized (elector) {
      synchronized (this) {
        byte[] activeData;
        try {
          activeData = elector.getActiveData();
        } catch (ActiveNotFoundException e) {
          return null;
        } catch (KeeperException ke) {
          throw new IOException(
              "Unexpected ZooKeeper issue fetching active node info", ke);
        }
        
        HAServiceTarget oldActive = dataToTarget(activeData);
        return oldActive;
      }
    }
  }

  /**
   * Check the current state of the service, and join the election
   * if it should be in the election.
   */
  private void recheckElectability() {
    // Maintain lock ordering of elector -> ZKFC
    synchronized (elector) {
      synchronized (this) {
        boolean healthy = lastHealthState == State.SERVICE_HEALTHY;
    
        long remainingDelay = delayJoiningUntilNanotime - System.nanoTime(); 
        if (remainingDelay > 0) {
          if (healthy) {
            LOG.info("Would have joined master election, but this node is " +
                "prohibited from doing so for " +
                TimeUnit.NANOSECONDS.toMillis(remainingDelay) + " more ms");
          }
          scheduleRecheck(remainingDelay);
          return;
        }
    
        switch (lastHealthState) {
        case SERVICE_HEALTHY:
          if(serviceState != HAServiceState.OBSERVER) {
            elector.joinElection(targetToData(localTarget));
          }
          if (quitElectionOnBadState) {
            quitElectionOnBadState = false;
          }
          break;
          
        case INITIALIZING:
          LOG.info("Ensuring that " + localTarget + " does not " +
              "participate in active master election");
          elector.quitElection(false);
          serviceState = HAServiceState.INITIALIZING;
          break;
    
        case SERVICE_UNHEALTHY:
        case SERVICE_NOT_RESPONDING:
          LOG.info("Quitting master election for " + localTarget +
              " and marking that fencing is necessary");
          elector.quitElection(true);
          serviceState = HAServiceState.INITIALIZING;
          break;
          
        case HEALTH_MONITOR_FAILED:
          fatalError("Health monitor failed!");
          break;
          
        default:
          throw new IllegalArgumentException("Unhandled state:"
                                               + lastHealthState);
        }
      }
    }
  }
  
  /**
   * Schedule a call to {@link #recheckElectability()} in the future.
   */
  private void scheduleRecheck(long whenNanos) {
    delayExecutor.schedule(
        new Runnable() {
          @Override
          public void run() {
            try {
              recheckElectability();
            } catch (Throwable t) {
              fatalError("Failed to recheck electability: " +
                  StringUtils.stringifyException(t));
            }
          }
        },
        whenNanos, TimeUnit.NANOSECONDS);
  }

  int serviceStateMismatchCount = 0;
  boolean quitElectionOnBadState = false;

  void verifyChangedServiceState(HAServiceState changedState) {
    synchronized (elector) {
      synchronized (this) {
        if (serviceState == HAServiceState.INITIALIZING) {
          if (quitElectionOnBadState) {
            LOG.debug("rechecking for electability from bad state");
            recheckElectability();
          }
          return;
        }
        if (changedState == HAServiceState.OBSERVER) {
          elector.quitElection(true);
          serviceState = HAServiceState.OBSERVER;
          return;
        }
        if (changedState == serviceState) {
          serviceStateMismatchCount = 0;
          return;
        }
        if (serviceStateMismatchCount == 0) {
          // recheck one more time. As this might be due to parallel transition.
          serviceStateMismatchCount++;
          return;
        }
        // quit the election as the expected state and reported state
        // mismatches.
        LOG.error("Local service " + localTarget
            + " has changed the serviceState to " + changedState
            + ". Expected was " + serviceState
            + ". Quitting election marking fencing necessary.");
        delayJoiningUntilNanotime = System.nanoTime()
            + TimeUnit.MILLISECONDS.toNanos(1000);
        elector.quitElection(true);
        quitElectionOnBadState = true;
        serviceStateMismatchCount = 0;
        serviceState = HAServiceState.INITIALIZING;
      }
    }
  }

  /**
   * @return the last health state passed to the FC
   * by the HealthMonitor.
   */
  protected synchronized State getLastHealthState() {
    return lastHealthState;
  }

  protected synchronized void setLastHealthState(HealthMonitor.State newState) {
    LOG.info("Local service " + localTarget +
        " entered state: " + newState);
    lastHealthState = newState;
  }
  
  @VisibleForTesting
  public ActiveStandbyElector getElectorForTests() {
    return elector;
  }
  
  @VisibleForTesting
  ZKFCRpcServer getRpcServerForTests() {
    return rpcServer;
  }

  /**
   * Callbacks from elector
   */
  class ElectorCallbacks implements ActiveStandbyElectorCallback {
    @Override
    public void becomeActive() throws ServiceFailedException {
      ZKFailoverController.this.becomeActive();
    }

    @Override
    public void becomeStandby() {
      ZKFailoverController.this.becomeStandby();
    }

    @Override
    public void enterNeutralMode() {
    }

    @Override
    public void notifyFatalError(String errorMessage) {
      fatalError(errorMessage);
    }

    @Override
    public void fenceOldActive(byte[] data) {
      ZKFailoverController.this.fenceOldActive(data);
    }
    
    @Override
    public String toString() {
      synchronized (ZKFailoverController.this) {
        return "Elector callbacks for " + localTarget;
      }
    }
  }
  
  /**
   * Callbacks from HealthMonitor
   */
  class HealthCallbacks implements HealthMonitor.Callback {
    @Override
    public void enteredState(HealthMonitor.State newState) {
      setLastHealthState(newState);
      recheckElectability();
    }
  }

  /**
   * Callbacks for HAServiceStatus
   */
  class ServiceStateCallBacks implements HealthMonitor.ServiceStateCallback {
    @Override
    public void reportServiceStatus(HAServiceStatus status) {
      verifyChangedServiceState(status.getState());
    }
  }

  private static class ActiveAttemptRecord {
    private final boolean succeeded;
    private final String status;
    private final long nanoTime;
    
    public ActiveAttemptRecord(boolean succeeded, String status) {
      this.succeeded = succeeded;
      this.status = status;
      this.nanoTime = System.nanoTime();
    }
  }

}