All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure Maven / Gradle / Ivy

There is a newer version: 3.0.0-beta-1
Show newest version
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.master.procedure;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.RegionReplicaUtil;
import org.apache.hadoop.hbase.master.MasterServices;
import org.apache.hadoop.hbase.master.MasterWalManager;
import org.apache.hadoop.hbase.master.assignment.AssignProcedure;
import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
import org.apache.hadoop.hbase.master.assignment.RegionTransitionProcedure;
import org.apache.hadoop.hbase.procedure2.ProcedureMetrics;
import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
import org.apache.hadoop.hbase.procedure2.ProcedureYieldException;
import org.apache.hadoop.hbase.procedure2.StateMachineProcedure;
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.shaded.protobuf.generated.HBaseProtos.RegionInfo;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.ServerCrashState;

/**
 * Handle crashed server. This is a port to ProcedureV2 of what used to be euphemistically called
 * ServerShutdownHandler.
 *
 * 

The procedure flow varies dependent on whether meta is assigned and if we are to split logs. * *

We come in here after ServerManager has noticed a server has expired. Procedures * queued on the rpc should have been notified about fail and should be concurrently * getting themselves ready to assign elsewhere. */ public class ServerCrashProcedure extends StateMachineProcedure implements ServerProcedureInterface { private static final Log LOG = LogFactory.getLog(ServerCrashProcedure.class); /** * Name of the crashed server to process. */ private ServerName serverName; /** * Whether DeadServer knows that we are processing it. */ private boolean notifiedDeadServer = false; /** * Regions that were on the crashed server. */ private List regionsOnCrashedServer; private boolean carryingMeta = false; private boolean shouldSplitWal; /** * Cycles on same state. Good for figuring if we are stuck. */ private int cycles = 0; /** * Ordinal of the previous state. So we can tell if we are progressing or not. TODO: if useful, * move this back up into StateMachineProcedure */ private int previousState; /** * Call this constructor queuing up a Procedure. * @param serverName Name of the crashed server. * @param shouldSplitWal True if we should split WALs as part of crashed server processing. * @param carryingMeta True if carrying hbase:meta table region. */ public ServerCrashProcedure( final MasterProcedureEnv env, final ServerName serverName, final boolean shouldSplitWal, final boolean carryingMeta) { this.serverName = serverName; this.shouldSplitWal = shouldSplitWal; this.carryingMeta = carryingMeta; this.setOwner(env.getRequestUser()); } /** * Used when deserializing from a procedure store; we'll construct one of these then call * {@link #deserializeStateData(InputStream)}. Do not use directly. */ public ServerCrashProcedure() { super(); } @Override protected Flow executeFromState(MasterProcedureEnv env, ServerCrashState state) throws ProcedureSuspendedException, ProcedureYieldException { if (LOG.isTraceEnabled()) { LOG.trace(state + " " + this + "; cycles=" + this.cycles); } // Keep running count of cycles if (state.ordinal() != this.previousState) { this.previousState = state.ordinal(); this.cycles = 0; } else { this.cycles++; } final MasterServices services = env.getMasterServices(); // HBASE-14802 // If we have not yet notified that we are processing a dead server, we should do now. if (!notifiedDeadServer) { services.getServerManager().getDeadServers().notifyServer(serverName); notifiedDeadServer = true; } try { switch (state) { case SERVER_CRASH_START: LOG.info("Start " + this); start(env); // If carrying meta, process it first. Else, get list of regions on crashed server. if (this.carryingMeta) { setNextState(ServerCrashState.SERVER_CRASH_PROCESS_META); } else { setNextState(ServerCrashState.SERVER_CRASH_GET_REGIONS); } break; case SERVER_CRASH_GET_REGIONS: // If hbase:meta is not assigned, yield. if (env.getAssignmentManager().waitMetaInitialized(this)) { throw new ProcedureSuspendedException(); } this.regionsOnCrashedServer = services.getAssignmentManager().getRegionStates() .getServerRegionInfoSet(serverName); // Where to go next? Depends on whether we should split logs at all or // if we should do distributed log splitting. if (!this.shouldSplitWal) { setNextState(ServerCrashState.SERVER_CRASH_ASSIGN); } else { setNextState(ServerCrashState.SERVER_CRASH_SPLIT_LOGS); } break; case SERVER_CRASH_PROCESS_META: processMeta(env); setNextState(ServerCrashState.SERVER_CRASH_GET_REGIONS); break; case SERVER_CRASH_SPLIT_LOGS: splitLogs(env); setNextState(ServerCrashState.SERVER_CRASH_ASSIGN); break; case SERVER_CRASH_ASSIGN: // If no regions to assign, skip assign and skip to the finish. // Filter out meta regions. Those are handled elsewhere in this procedure. // Filter changes this.regionsOnCrashedServer. if (filterDefaultMetaRegions(regionsOnCrashedServer)) { if (LOG.isTraceEnabled()) { LOG.trace("Assigning regions " + HRegionInfo.getShortNameToLog(regionsOnCrashedServer) + ", " + this + "; cycles=" + this.cycles); } handleRIT(env, regionsOnCrashedServer); addChildProcedure(env.getAssignmentManager(). createAssignProcedures(regionsOnCrashedServer, true)); } setNextState(ServerCrashState.SERVER_CRASH_FINISH); break; case SERVER_CRASH_FINISH: services.getServerManager().getDeadServers().finish(serverName); return Flow.NO_MORE_STATE; default: throw new UnsupportedOperationException("unhandled state=" + state); } } catch (IOException e) { LOG.warn("Failed state=" + state + ", retry " + this + "; cycles=" + this.cycles, e); } return Flow.HAS_MORE_STATE; } /** * Start processing of crashed server. In here we'll just set configs. and return. * @param env * @throws IOException */ private void start(final MasterProcedureEnv env) throws IOException {} /** * @param env * @throws IOException * @throws InterruptedException */ private void processMeta(final MasterProcedureEnv env) throws IOException { if (LOG.isDebugEnabled()) LOG.debug("Processing hbase:meta that was on " + this.serverName); if (this.shouldSplitWal) { // TODO: Matteo. We BLOCK here but most important thing to be doing at this moment. env.getMasterServices().getMasterWalManager().splitMetaLog(serverName); } // Assign meta if still carrying it. Check again: region may be assigned because of RIT timeout final AssignmentManager am = env.getMasterServices().getAssignmentManager(); for (HRegionInfo hri: am.getRegionStates().getServerRegionInfoSet(serverName)) { if (!isDefaultMetaRegion(hri)) continue; am.offlineRegion(hri); addChildProcedure(am.createAssignProcedure(hri, true)); } } private boolean filterDefaultMetaRegions(final List regions) { if (regions == null) return false; final Iterator it = regions.iterator(); while (it.hasNext()) { final HRegionInfo hri = it.next(); if (isDefaultMetaRegion(hri)) { it.remove(); } } return !regions.isEmpty(); } private boolean isDefaultMetaRegion(final HRegionInfo hri) { return hri.getTable().equals(TableName.META_TABLE_NAME) && RegionReplicaUtil.isDefaultReplica(hri); } private void splitLogs(final MasterProcedureEnv env) throws IOException { if (LOG.isDebugEnabled()) { LOG.debug("Splitting WALs " + this); } MasterWalManager mwm = env.getMasterServices().getMasterWalManager(); AssignmentManager am = env.getMasterServices().getAssignmentManager(); // TODO: For Matteo. Below BLOCKs!!!! Redo so can relinquish executor while it is running. // PROBLEM!!! WE BLOCK HERE. mwm.splitLog(this.serverName); if (LOG.isDebugEnabled()) { LOG.debug("Done splitting WALs " + this); } am.getRegionStates().logSplit(this.serverName); } static int size(final Collection hris) { return hris == null? 0: hris.size(); } @Override protected void rollbackState(MasterProcedureEnv env, ServerCrashState state) throws IOException { // Can't rollback. throw new UnsupportedOperationException("unhandled state=" + state); } @Override protected ServerCrashState getState(int stateId) { return ServerCrashState.valueOf(stateId); } @Override protected int getStateId(ServerCrashState state) { return state.getNumber(); } @Override protected ServerCrashState getInitialState() { return ServerCrashState.SERVER_CRASH_START; } @Override protected boolean abort(MasterProcedureEnv env) { // TODO return false; } @Override protected LockState acquireLock(final MasterProcedureEnv env) { // TODO: Put this BACK AFTER AMv2 goes in!!!! // if (env.waitFailoverCleanup(this)) return LockState.LOCK_EVENT_WAIT; if (env.waitServerCrashProcessingEnabled(this)) return LockState.LOCK_EVENT_WAIT; if (env.getProcedureScheduler().waitServerExclusiveLock(this, getServerName())) { return LockState.LOCK_EVENT_WAIT; } return LockState.LOCK_ACQUIRED; } @Override protected void releaseLock(final MasterProcedureEnv env) { env.getProcedureScheduler().wakeServerExclusiveLock(this, getServerName()); } @Override public void toStringClassDetails(StringBuilder sb) { sb.append(getClass().getSimpleName()); sb.append(" server="); sb.append(serverName); sb.append(", splitWal="); sb.append(shouldSplitWal); sb.append(", meta="); sb.append(carryingMeta); } @Override public void serializeStateData(final OutputStream stream) throws IOException { super.serializeStateData(stream); MasterProcedureProtos.ServerCrashStateData.Builder state = MasterProcedureProtos.ServerCrashStateData.newBuilder(). setServerName(ProtobufUtil.toServerName(this.serverName)). setCarryingMeta(this.carryingMeta). setShouldSplitWal(this.shouldSplitWal); if (this.regionsOnCrashedServer != null && !this.regionsOnCrashedServer.isEmpty()) { for (HRegionInfo hri: this.regionsOnCrashedServer) { state.addRegionsOnCrashedServer(HRegionInfo.convert(hri)); } } state.build().writeDelimitedTo(stream); } @Override public void deserializeStateData(final InputStream stream) throws IOException { super.deserializeStateData(stream); MasterProcedureProtos.ServerCrashStateData state = MasterProcedureProtos.ServerCrashStateData.parseDelimitedFrom(stream); this.serverName = ProtobufUtil.toServerName(state.getServerName()); this.carryingMeta = state.hasCarryingMeta()? state.getCarryingMeta(): false; // shouldSplitWAL has a default over in pb so this invocation will always work. this.shouldSplitWal = state.getShouldSplitWal(); int size = state.getRegionsOnCrashedServerCount(); if (size > 0) { this.regionsOnCrashedServer = new ArrayList(size); for (RegionInfo ri: state.getRegionsOnCrashedServerList()) { this.regionsOnCrashedServer.add(HRegionInfo.convert(ri)); } } } @Override public ServerName getServerName() { return this.serverName; } @Override public boolean hasMetaTableRegion() { return this.carryingMeta; } @Override public ServerOperationType getServerOperationType() { return ServerOperationType.CRASH_HANDLER; } /** * For this procedure, yield at end of each successful flow step so that all crashed servers * can make progress rather than do the default which has each procedure running to completion * before we move to the next. For crashed servers, especially if running with distributed log * replay, we will want all servers to come along; we do not want the scenario where a server is * stuck waiting for regions to online so it can replay edits. */ @Override protected boolean isYieldBeforeExecuteFromState(MasterProcedureEnv env, ServerCrashState state) { return true; } @Override protected boolean shouldWaitClientAck(MasterProcedureEnv env) { // The operation is triggered internally on the server // the client does not know about this procedure. return false; } /** * Handle any outstanding RIT that are up against this.serverName, the crashed server. * Notify them of crash. Remove assign entries from the passed in regions * otherwise we have two assigns going on and they will fight over who has lock. * Notify Unassigns also. * @param crashedServer Server that crashed. * @param regions Regions that were on crashed server * @return Subset of regions that were RIT against crashedServer */ private void handleRIT(final MasterProcedureEnv env, final List regions) { if (regions == null) return; AssignmentManager am = env.getMasterServices().getAssignmentManager(); final Iterator it = regions.iterator(); ServerCrashException sce = null; while (it.hasNext()) { final HRegionInfo hri = it.next(); RegionTransitionProcedure rtp = am.getRegionStates().getRegionTransitionProcedure(hri); if (rtp == null) continue; // Make sure the RIT is against this crashed server. In the case where there are many // processings of a crashed server -- backed up for whatever reason (slow WAL split) -- // then a previous SCP may have already failed an assign, etc., and it may have a new // location target; DO NOT fail these else we make for assign flux. ServerName rtpServerName = rtp.getServer(env); if (rtpServerName == null) { LOG.warn("RIT with ServerName null! " + rtp); continue; } if (!rtpServerName.equals(this.serverName)) continue; LOG.info("pid=" + getProcId() + " found RIT " + rtp + "; " + rtp.getRegionState(env).toShortString()); // Notify RIT on server crash. if (sce == null) { sce = new ServerCrashException(getProcId(), getServerName()); } rtp.remoteCallFailed(env, this.serverName, sce); if (rtp instanceof AssignProcedure) { // If an assign, include it in our return and remove from passed-in list of regions. it.remove(); } } } @Override protected ProcedureMetrics getProcedureMetrics(MasterProcedureEnv env) { return env.getMasterServices().getMasterMetrics().getServerCrashProcMetrics(); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy