/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.master.procedure;

import java.io.IOException;
import java.io.InputStream;
import java.io.InterruptedIOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.locks.Lock;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.client.ClusterConnection;
import org.apache.hadoop.hbase.client.RegionReplicaUtil;
import org.apache.hadoop.hbase.master.AssignmentManager;
import org.apache.hadoop.hbase.master.MasterFileSystem;
import org.apache.hadoop.hbase.master.MasterServices;
import org.apache.hadoop.hbase.master.RegionState;
import org.apache.hadoop.hbase.master.RegionStates;
import org.apache.hadoop.hbase.procedure2.ProcedureYieldException;
import org.apache.hadoop.hbase.procedure2.StateMachineProcedure;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.RegionInfo;
import org.apache.hadoop.hbase.protobuf.generated.MasterProcedureProtos;
import org.apache.hadoop.hbase.protobuf.generated.MasterProcedureProtos.ServerCrashState;
import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos;
import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos.SplitLogTask.RecoveryMode;
import org.apache.hadoop.hbase.zookeeper.MetaTableLocator;
import org.apache.hadoop.hbase.zookeeper.ZKAssign;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
import org.apache.hadoop.util.StringUtils;
import org.apache.zookeeper.KeeperException;

/**
 * Handle crashed server. This is a port to ProcedureV2 of what used to be euphemistically called
 * ServerShutdownHandler.
 *
 * <p>The procedure flow varies dependent on whether meta is assigned, if we are
 * doing distributed log replay versus distributed log splitting, and if we are to split logs at
 * all.
 *
 * <p>This procedure asks that all crashed servers get processed equally; we yield after the
 * completion of each successful flow step. We do this so that we do not 'deadlock' waiting on
 * a region assignment so we can replay edits, which could happen if a region moved and there are
 * edits on two servers to replay.
 *
 * <p>TODO: ASSIGN and WAIT_ON_ASSIGN (at least) are not idempotent. Revisit when assign is pv2.
 * TODO: We do not have special handling for system tables.
 */
public class ServerCrashProcedure
    extends StateMachineProcedure<MasterProcedureEnv, ServerCrashState>
    implements ServerProcedureInterface {
  private static final Log LOG = LogFactory.getLog(ServerCrashProcedure.class);

  /**
   * Configuration key to set how long to wait in ms doing a quick check on meta state.
   */
  public static final String KEY_SHORT_WAIT_ON_META =
      "hbase.master.servercrash.short.wait.on.meta.ms";
  public static final int DEFAULT_SHORT_WAIT_ON_META = 1000;

  /**
   * Configuration key to set how many retries to cycle before we give up on meta.
   * Each attempt will wait at least {@link #KEY_SHORT_WAIT_ON_META} milliseconds.
   */
  public static final String KEY_RETRIES_ON_META =
      "hbase.master.servercrash.meta.retries";
  public static final int DEFAULT_RETRIES_ON_META = 10;

  /**
   * Configuration key to set how long to wait in ms on regions in transition.
   */
  public static final String KEY_WAIT_ON_RIT =
      "hbase.master.servercrash.wait.on.rit.ms";
  public static final int DEFAULT_WAIT_ON_RIT = 30000;

  private static final Set<HRegionInfo> META_REGION_SET = new HashSet<HRegionInfo>();
  static {
    META_REGION_SET.add(HRegionInfo.FIRST_META_REGIONINFO);
  }

  /**
   * Name of the crashed server to process.
   */
  private ServerName serverName;

  /**
   * Whether DeadServer knows that we are processing it.
   */
  private boolean notifiedDeadServer = false;

  /**
   * Regions that were on the crashed server.
   */
  private Set<HRegionInfo> regionsOnCrashedServer;

  /**
   * Regions assigned. Usually some subset of {@link #regionsOnCrashedServer}.
   */
  private List<HRegionInfo> regionsAssigned;

  private boolean distributedLogReplay = false;
  private boolean carryingMeta = false;
  private boolean shouldSplitWal;

  /**
   * Cycles on same state. Good for figuring if we are stuck.
   */
  private int cycles = 0;

  /**
   * Ordinal of the previous state. So we can tell if we are progressing or not. TODO: if useful,
   * move this back up into StateMachineProcedure
   */
  private int previousState;

  /**
   * Call this constructor queuing up a Procedure.
   * @param serverName Name of the crashed server.
   * @param shouldSplitWal True if we should split WALs as part of crashed server processing.
   * @param carryingMeta True if carrying hbase:meta table region.
   */
  public ServerCrashProcedure(
      final MasterProcedureEnv env,
      final ServerName serverName,
      final boolean shouldSplitWal,
      final boolean carryingMeta) {
    this.serverName = serverName;
    this.shouldSplitWal = shouldSplitWal;
    this.carryingMeta = carryingMeta;
    this.setOwner(env.getRequestUser().getShortName());
  }

  /**
   * Used when deserializing from a procedure store; we'll construct one of these then call
   * {@link #deserializeStateData(InputStream)}. Do not use directly.
   */
  public ServerCrashProcedure() {
    super();
  }

  private void throwProcedureYieldException(final String msg) throws ProcedureYieldException {
    String logMsg = msg + "; cycle=" + this.cycles + ", running for " +
        StringUtils.formatTimeDiff(System.currentTimeMillis(), getStartTime());
    // The procedure executor logs ProcedureYieldException at trace level. For now, log these
    // yields for server crash processing at DEBUG. Revisit when stable.
    if (LOG.isDebugEnabled()) LOG.debug(logMsg);
    throw new ProcedureYieldException(logMsg);
  }

  @Override
  protected Flow executeFromState(MasterProcedureEnv env, ServerCrashState state)
      throws ProcedureYieldException {
    if (LOG.isTraceEnabled()) {
      LOG.trace(state);
    }
    // Keep running count of cycles
    if (state.ordinal() != this.previousState) {
      this.previousState = state.ordinal();
      this.cycles = 0;
    } else {
      this.cycles++;
    }
    MasterServices services = env.getMasterServices();
    // Is master fully online? If not, yield. No processing of servers unless master is up
    if (!services.getAssignmentManager().isFailoverCleanupDone()) {
      throwProcedureYieldException("Waiting on master failover to complete");
    }
    // HBASE-14802
    // If we have not yet notified that we are processing a dead server, we should do now.
    if (!notifiedDeadServer) {
      services.getServerManager().getDeadServers().notifyServer(serverName);
      notifiedDeadServer = true;
    }

    try {
      switch (state) {
      case SERVER_CRASH_START:
        LOG.info("Start processing crashed " + this.serverName);
        start(env);
        // If carrying meta, process it first. Else, get list of regions on crashed server.
        if (this.carryingMeta) setNextState(ServerCrashState.SERVER_CRASH_PROCESS_META);
        else setNextState(ServerCrashState.SERVER_CRASH_GET_REGIONS);
        break;

      case SERVER_CRASH_GET_REGIONS:
        // If hbase:meta is not assigned, yield.
        if (!isMetaAssignedQuickTest(env)) {
          // isMetaAssignedQuickTest does not really wait. Let's delay a little before
          // another round of execution.
          long wait = env.getMasterConfiguration().getLong(KEY_SHORT_WAIT_ON_META,
            DEFAULT_SHORT_WAIT_ON_META);
          wait = wait / 10;
          Thread.sleep(wait);
          throwProcedureYieldException("Waiting on hbase:meta assignment");
        }
        this.regionsOnCrashedServer =
            services.getAssignmentManager().getRegionStates().getServerRegions(this.serverName);
        // Where to go next? Depends on whether we should split logs at all or if we should do
        // distributed log splitting (DLS) vs distributed log replay (DLR).
        if (!this.shouldSplitWal) {
          setNextState(ServerCrashState.SERVER_CRASH_ASSIGN);
        } else if (this.distributedLogReplay) {
          setNextState(ServerCrashState.SERVER_CRASH_PREPARE_LOG_REPLAY);
        } else {
          setNextState(ServerCrashState.SERVER_CRASH_SPLIT_LOGS);
        }
        break;

      case SERVER_CRASH_PROCESS_META:
        // If we fail processing hbase:meta, yield.
        if (!processMeta(env)) {
          throwProcedureYieldException("Waiting on regions-in-transition to clear");
        }
        setNextState(ServerCrashState.SERVER_CRASH_GET_REGIONS);
        break;

      case SERVER_CRASH_PREPARE_LOG_REPLAY:
        prepareLogReplay(env, this.regionsOnCrashedServer);
        setNextState(ServerCrashState.SERVER_CRASH_ASSIGN);
        break;

      case SERVER_CRASH_SPLIT_LOGS:
        splitLogs(env);
        // If DLR, go to FINISH. Otherwise, if DLS, go to SERVER_CRASH_ASSIGN
        if (this.distributedLogReplay) setNextState(ServerCrashState.SERVER_CRASH_FINISH);
        else setNextState(ServerCrashState.SERVER_CRASH_ASSIGN);
        break;

      case SERVER_CRASH_ASSIGN:
        List<HRegionInfo> regionsToAssign = calcRegionsToAssign(env);

        // Assign may not be idempotent. SSH used to requeue the SSH if we got an IOE assigning
        // which is what we are mimicing here but it looks prone to double assignment if assign
        // fails midway. TODO: Test.

        // If no regions to assign, skip assign and skip to the finish.
        boolean regions = regionsToAssign != null && !regionsToAssign.isEmpty();
        if (regions) {
          this.regionsAssigned = regionsToAssign;
          if (!assign(env, regionsToAssign)) {
            throwProcedureYieldException("Failed assign; will retry");
          }
        }
        if (this.shouldSplitWal && distributedLogReplay) {
          // Take this route even if there are apparently no regions assigned. This may be our
          // second time through here; i.e. we assigned and crashed just about here. On second
          // time through, there will be no regions because we assigned them in the previous step.
          // Even though no regions, we need to go through here to clean up the DLR zk markers.
          setNextState(ServerCrashState.SERVER_CRASH_WAIT_ON_ASSIGN);
        } else {
          setNextState(ServerCrashState.SERVER_CRASH_FINISH);
        }
        break;

      case SERVER_CRASH_WAIT_ON_ASSIGN:
        // TODO: The list of regionsAssigned may be more than we actually assigned. See down in
        // AM #1629 around 'if (regionStates.wasRegionOnDeadServer(encodedName)) {' where we
        // will skip assigning a region because it is/was on a dead server. Should never happen!
        // It was on this server. Worst comes to worst, we'll still wait here till other server is
        // processed.

        // If the wait on assign failed, yield -- if we have regions to assign.
        if (this.regionsAssigned != null && !this.regionsAssigned.isEmpty()) {
          if (!waitOnAssign(env, this.regionsAssigned)) {
            throwProcedureYieldException("Waiting on region assign");
          }
        }
        setNextState(ServerCrashState.SERVER_CRASH_SPLIT_LOGS);
        break;

      case SERVER_CRASH_FINISH:
        LOG.info("Finished processing of crashed " + serverName);
        services.getServerManager().getDeadServers().finish(serverName);
        return Flow.NO_MORE_STATE;

      default:
        throw new UnsupportedOperationException("unhandled state=" + state);
      }
    } catch (ProcedureYieldException e) {
      LOG.warn("Failed serverName=" + this.serverName + ", state=" + state + "; retry "
          + e.getMessage());
      throw e;
    } catch (IOException e) {
      LOG.warn("Failed serverName=" + this.serverName + ", state=" + state + "; retry", e);
    } catch (InterruptedException e) {
      // TODO: Make executor allow IEs coming up out of execute.
      LOG.warn("Interrupted serverName=" + this.serverName + ", state=" + state + "; retry", e);
      Thread.currentThread().interrupt();
    }
    return Flow.HAS_MORE_STATE;
  }

  /**
   * Start processing of crashed server. In here we'll just set configs and return.
   * @param env
   * @throws IOException
   */
  private void start(final MasterProcedureEnv env) throws IOException {
    MasterFileSystem mfs = env.getMasterServices().getMasterFileSystem();
    // Set recovery mode late. This is what the old ServerShutdownHandler used to do.
    mfs.setLogRecoveryMode();
    this.distributedLogReplay = mfs.getLogRecoveryMode() == RecoveryMode.LOG_REPLAY;
  }

  /**
   * @param env
   * @return False if we fail to assign and split logs on meta ('process').
   * @throws IOException
   * @throws InterruptedException
   */
  private boolean processMeta(final MasterProcedureEnv env) throws IOException {
    if (LOG.isDebugEnabled()) LOG.debug("Processing hbase:meta that was on " + this.serverName);
    MasterServices services = env.getMasterServices();
    MasterFileSystem mfs = services.getMasterFileSystem();
    AssignmentManager am = services.getAssignmentManager();
    HRegionInfo metaHRI = HRegionInfo.FIRST_META_REGIONINFO;
    if (this.shouldSplitWal) {
      if (this.distributedLogReplay) {
        prepareLogReplay(env, META_REGION_SET);
      } else {
        // TODO: Matteo. We BLOCK here but most important thing to be doing at this moment.
        mfs.splitMetaLog(serverName);
        am.getRegionStates().logSplit(metaHRI);
      }
    }

    // Assign meta if still carrying it.
    // Check again: region may be assigned because of RIT timeout.
    boolean processed = true;
    boolean shouldAssignMeta = false;
    AssignmentManager.ServerHostRegion rsCarryingMetaRegion = am.isCarryingMeta(serverName);
    switch (rsCarryingMetaRegion) {
      case HOSTING_REGION:
        LOG.info("Server " + serverName + " was carrying META. Trying to assign.");
        am.regionOffline(HRegionInfo.FIRST_META_REGIONINFO);
        shouldAssignMeta = true;
        break;
      case UNKNOWN:
        if (!services.getMetaTableLocator().isLocationAvailable(services.getZooKeeper())) {
          // the meta location as per master is null. This could happen in case when meta
          // assignment in previous run failed, while meta znode has been updated to null.
          // We should try to assign the meta again.
          shouldAssignMeta = true;
          break;
        }
        // fall through
      case NOT_HOSTING_REGION:
        LOG.info("META has been assigned elsewhere, skip assigning.");
        break;
      default:
        throw new IOException("Unsupported action in MetaServerShutdownHandler");
    }
    if (shouldAssignMeta) {
      // TODO: May block here if hard time figuring state of meta.
      verifyAndAssignMetaWithRetries(env);
      if (this.shouldSplitWal && distributedLogReplay) {
        int timeout = env.getMasterConfiguration().getInt(KEY_WAIT_ON_RIT, DEFAULT_WAIT_ON_RIT);
        if (!waitOnRegionToClearRegionsInTransition(am, metaHRI, timeout)) {
          processed = false;
        } else {
          // TODO: Matteo. We BLOCK here but most important thing to be doing at this moment.
          mfs.splitMetaLog(serverName);
        }
      }
    }
    return processed;
  }

  /**
   * @return True if region cleared RIT, else false if we timed out waiting.
   * @throws InterruptedIOException
   */
  private boolean waitOnRegionToClearRegionsInTransition(AssignmentManager am,
      final HRegionInfo hri, final int timeout)
      throws InterruptedIOException {
    try {
      if (!am.waitOnRegionToClearRegionsInTransition(hri, timeout)) {
        // Wait here is to avoid log replay hits current dead server and incur a RPC timeout
        // when replay happens before region assignment completes.
        LOG.warn("Region " + hri.getEncodedName() + " didn't complete assignment in time");
        return false;
      }
    } catch (InterruptedException ie) {
      throw new InterruptedIOException("Caught " + ie +
          " during waitOnRegionToClearRegionsInTransition for " + hri);
    }
    return true;
  }

  private void prepareLogReplay(final MasterProcedureEnv env, final Set<HRegionInfo> regions)
      throws IOException {
    if (LOG.isDebugEnabled()) {
      LOG.debug("Mark " + size(this.regionsOnCrashedServer) + " regions-in-recovery from " +
        this.serverName);
    }
    MasterFileSystem mfs = env.getMasterServices().getMasterFileSystem();
    AssignmentManager am = env.getMasterServices().getAssignmentManager();
    mfs.prepareLogReplay(this.serverName, regions);
    am.getRegionStates().logSplit(this.serverName);
  }

  private void splitLogs(final MasterProcedureEnv env) throws IOException {
    if (LOG.isDebugEnabled()) {
      LOG.debug("Splitting logs from " + serverName + "; region count=" +
        size(this.regionsOnCrashedServer));
    }
    MasterFileSystem mfs = env.getMasterServices().getMasterFileSystem();
    AssignmentManager am = env.getMasterServices().getAssignmentManager();
    // TODO: For Matteo. Below BLOCKs!!!! Redo so can relinquish executor while it is running.
    mfs.splitLog(this.serverName);
    if (!carryingMeta) {
      mfs.archiveMetaLog(this.serverName);
    }
    am.getRegionStates().logSplit(this.serverName);
  }

  static int size(final Collection<HRegionInfo> hris) {
    return hris == null ? 0 : hris.size();
  }

  /**
   * Figure out what we need to assign. Should be idempotent.
   * @param env
   * @return List of calculated regions to assign; may be empty or null.
   * @throws IOException
   */
  private List<HRegionInfo> calcRegionsToAssign(final MasterProcedureEnv env) throws IOException {
    AssignmentManager am = env.getMasterServices().getAssignmentManager();
    List<HRegionInfo> regionsToAssignAggregator = new ArrayList<HRegionInfo>();
    int replicaCount = env.getMasterConfiguration().getInt(HConstants.META_REPLICAS_NUM,
      HConstants.DEFAULT_META_REPLICA_NUM);
    for (int i = 1; i < replicaCount; i++) {
      HRegionInfo metaHri =
          RegionReplicaUtil.getRegionInfoForReplica(HRegionInfo.FIRST_META_REGIONINFO, i);
      if (am.isCarryingMetaReplica(this.serverName, metaHri) ==
          AssignmentManager.ServerHostRegion.HOSTING_REGION) {
        if (LOG.isDebugEnabled()) {
          LOG.debug("Reassigning meta replica " + metaHri + " that was on " + this.serverName);
        }
        regionsToAssignAggregator.add(metaHri);
      }
    }
    // Clean out anything in regions in transition.
    List<HRegionInfo> regionsInTransition = am.cleanOutCrashedServerReferences(serverName);
    if (LOG.isDebugEnabled()) {
      LOG.debug("Reassigning " + size(this.regionsOnCrashedServer) +
        " region(s) that " + (serverName == null ? "null" : serverName) +
        " was carrying (and " + regionsInTransition.size() +
        " region(s) that were opening on this server)");
    }
    regionsToAssignAggregator.addAll(regionsInTransition);

    // Iterate regions that were on this server and figure which of these we need to reassign
    if (this.regionsOnCrashedServer != null && !this.regionsOnCrashedServer.isEmpty()) {
      RegionStates regionStates = am.getRegionStates();
      for (HRegionInfo hri: this.regionsOnCrashedServer) {
        if (regionsInTransition.contains(hri)) continue;
        String encodedName = hri.getEncodedName();
        Lock lock = am.acquireRegionLock(encodedName);
        try {
          RegionState rit = regionStates.getRegionTransitionState(hri);
          if (processDeadRegion(hri, am)) {
            ServerName addressFromAM = regionStates.getRegionServerOfRegion(hri);
            if (addressFromAM != null && !addressFromAM.equals(this.serverName)) {
              // If this region is in transition on the dead server, it must be
              // opening or pending_open, which should have been covered by
              // AM#cleanOutCrashedServerReferences
              LOG.info("Skip assigning " + hri.getRegionNameAsString() +
                " because opened on " + addressFromAM.getServerName());
              continue;
            }
            if (rit != null) {
              if (rit.getServerName() != null && !rit.isOnServer(this.serverName)) {
                // Skip regions that are in transition on other server
                LOG.info("Skip assigning region in transition on other server" + rit);
                continue;
              }
              LOG.info("Reassigning region " + rit + " and clearing zknode if exists");
              try {
                // This clears out any RIT that might be sticking around.
                ZKAssign.deleteNodeFailSilent(env.getMasterServices().getZooKeeper(), hri);
              } catch (KeeperException e) {
                // TODO: FIX!!!! ABORTING SERVER BECAUSE COULDN'T PURGE ZNODE. This is what we
                // used to do but that doesn't make it right!!!
                env.getMasterServices().abort("Unexpected error deleting RIT " + hri, e);
                throw new IOException(e);
              }
              regionStates.updateRegionState(hri, RegionState.State.OFFLINE);
            } else if (regionStates.isRegionInState(
                hri, RegionState.State.SPLITTING_NEW, RegionState.State.MERGING_NEW)) {
              regionStates.updateRegionState(hri, RegionState.State.OFFLINE);
            }
            regionsToAssignAggregator.add(hri);
          // TODO: The below else if is different in branch-1 from master branch.
          } else if (rit != null) {
            if ((rit.isPendingCloseOrClosing() || rit.isOffline())
                && am.getTableStateManager().isTableState(hri.getTable(),
                ZooKeeperProtos.Table.State.DISABLED, ZooKeeperProtos.Table.State.DISABLING) ||
                am.getReplicasToClose().contains(hri)) {
              // If the table was partially disabled and the RS went down, we should clear the
              // RIT and remove the node for the region.
              // The rit that we use may be stale in case the table was in DISABLING state
              // but though we did assign we will not be clearing the znode in CLOSING state.
              // Doing this will have no harm. See HBASE-5927
              regionStates.updateRegionState(hri, RegionState.State.OFFLINE);
              am.deleteClosingOrClosedNode(hri, rit.getServerName());
              am.offlineDisabledRegion(hri);
            } else {
              LOG.warn("THIS SHOULD NOT HAPPEN: unexpected region in transition "
                + rit + " not to be assigned by SSH of server " + serverName);
            }
          }
        } finally {
          lock.unlock();
        }
      }
    }
    return regionsToAssignAggregator;
  }

  private boolean assign(final MasterProcedureEnv env, final List<HRegionInfo> hris)
      throws InterruptedIOException {
    MasterServices masterServices = env.getMasterServices();
    AssignmentManager am = masterServices.getAssignmentManager();
    // Determine what type of assignment to do if the dead server already restarted.
    boolean retainAssignment =
      (masterServices.getConfiguration().getBoolean("hbase.master.retain.assignment", true) &&
       masterServices.getServerManager().isServerWithSameHostnamePortOnline(serverName)) ?
           true : false;
    try {
      if (retainAssignment) {
        Map<HRegionInfo, ServerName> hriServerMap =
            new HashMap<HRegionInfo, ServerName>(hris.size());
        for (HRegionInfo hri: hris) {
          hriServerMap.put(hri, serverName);
        }
        LOG.info("Best effort in SSH to retain assignment of " + hris.size()
          + " regions from the dead server " + serverName);
        am.assign(hriServerMap);
      } else {
        LOG.info("Using round robin in SSH to assign " + hris.size()
          + " regions from the dead server " + serverName);
        am.assign(hris);
      }
    } catch (InterruptedException ie) {
      LOG.error("Caught " + ie + " during " + (retainAssignment ? "retaining" : "round-robin")
        + " assignment");
      throw (InterruptedIOException)new InterruptedIOException().initCause(ie);
    } catch (IOException ioe) {
      LOG.warn("Caught " + ioe + " during region assignment, will retry");
      return false;
    }
    return true;
  }

  private boolean waitOnAssign(final MasterProcedureEnv env, final List<HRegionInfo> hris)
      throws InterruptedIOException {
    int timeout = env.getMasterConfiguration().getInt(KEY_WAIT_ON_RIT, DEFAULT_WAIT_ON_RIT);
    for (HRegionInfo hri: hris) {
      // TODO: Blocks here.
      if (!waitOnRegionToClearRegionsInTransition(env.getMasterServices().getAssignmentManager(),
          hri, timeout)) {
        return false;
      }
    }
    return true;
  }

  @Override
  protected void rollbackState(MasterProcedureEnv env, ServerCrashState state)
      throws IOException {
    // Can't rollback.
    throw new UnsupportedOperationException("unhandled state=" + state);
  }

  @Override
  protected ServerCrashState getState(int stateId) {
    return ServerCrashState.valueOf(stateId);
  }

  @Override
  protected int getStateId(ServerCrashState state) {
    return state.getNumber();
  }

  @Override
  protected ServerCrashState getInitialState() {
    return ServerCrashState.SERVER_CRASH_START;
  }

  @Override
  protected boolean abort(MasterProcedureEnv env) {
    // TODO
    return false;
  }

  @Override
  protected boolean acquireLock(final MasterProcedureEnv env) {
    if (env.waitServerCrashProcessingEnabled(this)) return false;
    return env.getProcedureQueue().tryAcquireServerExclusiveLock(this, getServerName());
  }

  @Override
  protected void releaseLock(final MasterProcedureEnv env) {
    env.getProcedureQueue().releaseServerExclusiveLock(this, getServerName());
  }

  @Override
  public void toStringClassDetails(StringBuilder sb) {
    sb.append(getClass().getSimpleName());
    sb.append(" serverName=");
    sb.append(this.serverName);
    sb.append(", shouldSplitWal=");
    sb.append(shouldSplitWal);
    sb.append(", carryingMeta=");
    sb.append(carryingMeta);
  }

  @Override
  public void serializeStateData(final OutputStream stream) throws IOException {
    super.serializeStateData(stream);

    MasterProcedureProtos.ServerCrashStateData.Builder state =
      MasterProcedureProtos.ServerCrashStateData.newBuilder().
      setServerName(ProtobufUtil.toServerName(this.serverName)).
      setDistributedLogReplay(this.distributedLogReplay).
      setCarryingMeta(this.carryingMeta).
      setShouldSplitWal(this.shouldSplitWal);
    if (this.regionsOnCrashedServer != null && !this.regionsOnCrashedServer.isEmpty()) {
      for (HRegionInfo hri: this.regionsOnCrashedServer) {
        state.addRegionsOnCrashedServer(HRegionInfo.convert(hri));
      }
    }
    if (this.regionsAssigned != null && !this.regionsAssigned.isEmpty()) {
      for (HRegionInfo hri: this.regionsAssigned) {
        state.addRegionsAssigned(HRegionInfo.convert(hri));
      }
    }
    state.build().writeDelimitedTo(stream);
  }

  @Override
  public void deserializeStateData(final InputStream stream) throws IOException {
    super.deserializeStateData(stream);

    MasterProcedureProtos.ServerCrashStateData state =
      MasterProcedureProtos.ServerCrashStateData.parseDelimitedFrom(stream);
    this.serverName = ProtobufUtil.toServerName(state.getServerName());
    this.distributedLogReplay = state.hasDistributedLogReplay() ?
      state.getDistributedLogReplay() : false;
    this.carryingMeta = state.hasCarryingMeta() ? state.getCarryingMeta() : false;
    // shouldSplitWAL has a default over in pb so this invocation will always work.
    this.shouldSplitWal = state.getShouldSplitWal();
    int size = state.getRegionsOnCrashedServerCount();
    if (size > 0) {
      this.regionsOnCrashedServer = new HashSet<HRegionInfo>(size);
      for (RegionInfo ri: state.getRegionsOnCrashedServerList()) {
        this.regionsOnCrashedServer.add(HRegionInfo.convert(ri));
      }
    }
    size = state.getRegionsAssignedCount();
    if (size > 0) {
      this.regionsAssigned = new ArrayList<HRegionInfo>(size);
      // Read back the regions-assigned list (not the regions-on-crashed-server list).
      for (RegionInfo ri: state.getRegionsAssignedList()) {
        this.regionsAssigned.add(HRegionInfo.convert(ri));
      }
    }
  }

  /**
   * Process a dead region from a dead RS. Checks if the region is disabled or
   * disabling or if the region has a partially completed split.
   * @param hri
   * @param assignmentManager
   * @return Returns true if specified region should be assigned, false if not.
   * @throws IOException
   */
  private static boolean processDeadRegion(HRegionInfo hri, AssignmentManager assignmentManager)
      throws IOException {
    boolean tablePresent = assignmentManager.getTableStateManager().isTablePresent(hri.getTable());
    if (!tablePresent) {
      LOG.info("The table " + hri.getTable() + " was deleted. Hence not proceeding.");
      return false;
    }
    // If table is not disabled but the region is offlined,
    boolean disabled = assignmentManager.getTableStateManager().isTableState(hri.getTable(),
      ZooKeeperProtos.Table.State.DISABLED);
    if (disabled) {
      LOG.info("The table " + hri.getTable() + " was disabled. Hence not proceeding.");
      return false;
    }
    if (hri.isOffline() && hri.isSplit()) {
      // HBASE-7721: Split parent and daughters are inserted into hbase:meta as an atomic
      // operation. If the meta scanner saw the parent split, then it should see the daughters
      // as assigned to the dead server. We don't have to do anything.
      return false;
    }
    boolean disabling = assignmentManager.getTableStateManager().isTableState(hri.getTable(),
      ZooKeeperProtos.Table.State.DISABLING);
    if (disabling) {
      LOG.info("The table " + hri.getTable() + " is disabled. Hence not assigning region " +
        hri.getEncodedName());
      return false;
    }
    return true;
  }

  /**
   * If hbase:meta is not assigned already, assign.
   * @throws IOException
   */
  private void verifyAndAssignMetaWithRetries(final MasterProcedureEnv env) throws IOException {
    MasterServices services = env.getMasterServices();
    int iTimes = services.getConfiguration().getInt(KEY_RETRIES_ON_META, DEFAULT_RETRIES_ON_META);
    // Just reuse same time as we have for short wait on meta. Adding another config is overkill.
    long waitTime =
      services.getConfiguration().getLong(KEY_SHORT_WAIT_ON_META, DEFAULT_SHORT_WAIT_ON_META);
    int iFlag = 0;
    while (true) {
      try {
        verifyAndAssignMeta(env);
        break;
      } catch (KeeperException e) {
        services.abort("In server shutdown processing, assigning meta", e);
        throw new IOException("Aborting", e);
      } catch (Exception e) {
        if (iFlag >= iTimes) {
          services.abort("verifyAndAssignMeta failed after " + iTimes + " retries, aborting", e);
          throw new IOException("Aborting", e);
        }
        try {
          Thread.sleep(waitTime);
        } catch (InterruptedException e1) {
          LOG.warn("Interrupted while sleeping between meta assignment retries", e1);
          Thread.currentThread().interrupt();
          throw (InterruptedIOException)new InterruptedIOException().initCause(e1);
        }
        iFlag++;
      }
    }
  }

  /**
   * If hbase:meta is not assigned already, assign.
   * @throws InterruptedException
   * @throws IOException
   * @throws KeeperException
   */
  private void verifyAndAssignMeta(final MasterProcedureEnv env)
      throws InterruptedException, IOException, KeeperException {
    MasterServices services = env.getMasterServices();
    if (!isMetaAssignedQuickTest(env)) {
      services.getAssignmentManager().assignMeta(HRegionInfo.FIRST_META_REGIONINFO);
    } else if (serverName.equals(services.getMetaTableLocator().
        getMetaRegionLocation(services.getZooKeeper()))) {
      // hbase:meta seems to be still alive on the server whom master is expiring
      // and thinks is dying. Let's re-assign the hbase:meta anyway.
      services.getAssignmentManager().assignMeta(HRegionInfo.FIRST_META_REGIONINFO);
    } else {
      LOG.info("Skip assigning hbase:meta because it is online at " +
        services.getMetaTableLocator().getMetaRegionLocation(services.getZooKeeper()));
    }
  }

  /**
   * A quick test that hbase:meta is assigned; blocks for short time only.
   * @return True if hbase:meta location is available and verified as good.
   * @throws InterruptedException
   * @throws IOException
   */
  private boolean isMetaAssignedQuickTest(final MasterProcedureEnv env)
      throws InterruptedException, IOException {
    ZooKeeperWatcher zkw = env.getMasterServices().getZooKeeper();
    MetaTableLocator mtl = env.getMasterServices().getMetaTableLocator();
    boolean metaAssigned = false;
    // Is hbase:meta location available yet?
    if (mtl.isLocationAvailable(zkw)) {
      ClusterConnection connection = env.getMasterServices().getConnection();
      // Is hbase:meta location good yet?
      long timeout =
        env.getMasterConfiguration().getLong(KEY_SHORT_WAIT_ON_META, DEFAULT_SHORT_WAIT_ON_META);
      if (mtl.verifyMetaRegionLocation(connection, zkw, timeout)) {
        metaAssigned = true;
      }
    }
    return metaAssigned;
  }

  @Override
  public ServerName getServerName() {
    return this.serverName;
  }

  @Override
  public boolean hasMetaTableRegion() {
    return this.carryingMeta;
  }

  @Override
  public ServerOperationType getServerOperationType() {
    return ServerOperationType.CRASH_HANDLER;
  }

  /**
   * For this procedure, yield at end of each successful flow step so that all crashed servers
   * can make progress rather than do the default which has each procedure running to completion
   * before we move to the next. For crashed servers, especially if running with distributed log
   * replay, we will want all servers to come along; we do not want the scenario where a server is
   * stuck waiting for regions to online so it can replay edits.
   */
  @Override
  protected boolean isYieldBeforeExecuteFromState(MasterProcedureEnv env, ServerCrashState state) {
    return true;
  }

  @Override
  protected boolean shouldWaitClientAck(MasterProcedureEnv env) {
    // The operation is triggered internally on the server;
    // the client does not know about this procedure.
    return false;
  }
}
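
ServerCrashProcedure is never submitted by clients; the active master queues it when a region
server is declared dead, and the ProcedureExecutor then drives it through the state machine
above, yielding after each step. The snippet below is a minimal sketch of such a submission,
assuming a MasterServices handle that exposes getMasterProcedureExecutor() and the branch-1
AssignmentManager.ServerHostRegion API used elsewhere in this class; the helper class name and
its placement in this package are hypothetical, for illustration only.

package org.apache.hadoop.hbase.master.procedure; // hypothetical placement, illustration only

import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.master.AssignmentManager;
import org.apache.hadoop.hbase.master.MasterServices;
import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;

public class ServerCrashSubmitterSketch { // hypothetical helper, not part of HBase
  /**
   * Queue crash processing for a server the master has just expired.
   * @return the procedure id assigned by the executor.
   */
  public static long submit(MasterServices master, ServerName crashedServer) {
    // Was the dead server carrying hbase:meta? That decides whether the procedure starts in
    // SERVER_CRASH_PROCESS_META or SERVER_CRASH_GET_REGIONS.
    boolean carryingMeta = master.getAssignmentManager().isCarryingMeta(crashedServer)
        == AssignmentManager.ServerHostRegion.HOSTING_REGION;
    ProcedureExecutor<MasterProcedureEnv> procExec = master.getMasterProcedureExecutor();
    // shouldSplitWal=true: recover the dead server's WALs before its regions are reassigned.
    return procExec.submitProcedure(new ServerCrashProcedure(
        procExec.getEnvironment(), crashedServer, true /* shouldSplitWal */, carryingMeta));
  }
}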