All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.hadoop.hbase.master.RegionStates Maven / Gradle / Ivy

The newest version!
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.master;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Comparator;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.MetaTableAccessor;
import org.apache.hadoop.hbase.RegionTransition;
import org.apache.hadoop.hbase.ServerLoad;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.TableStateManager;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.client.Mutation;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.RegionReplicaUtil;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.master.RegionState.State;
import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.ConfigUtil;
import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.hbase.util.PairOfSameType;
import org.apache.hadoop.hbase.zookeeper.ZKAssign;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
import org.apache.zookeeper.KeeperException;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;

/**
 * Region state accountant. It holds the states of all regions in memory.
 * Under normal conditions, it should match the meta table and the true region states.
 *
 * This class is used by AssignmentManager to track region states.
 */
@InterfaceAudience.Private
public class RegionStates {
  private static final Log LOG = LogFactory.getLog(RegionStates.class);

  public static final RegionStateStampComparator REGION_STATE_COMPARATOR =
    new RegionStateStampComparator();

  /**
   * Sorts RegionStates by time stamp first, then by region name.
   * Comparing by timestamp alone can lead us to discard different RegionStates that happen
   * to share a timestamp.
   */
  private static class RegionStateStampComparator implements Comparator<RegionState> {
    @Override
    public int compare(RegionState l, RegionState r) {
      int stampOrder = Long.compare(l.getStamp(), r.getStamp());
      if (stampOrder != 0) {
        return stampOrder;
      }
      // Same stamp: fall back to the region name so distinct regions never compare as equal.
      return Bytes.compareTo(l.getRegion().getRegionName(), r.getRegion().getRegionName());
    }
  }

  /**
   * Regions currently in transition.
   */
  final HashMap regionsInTransition =
    new HashMap();

  /**
   * Region encoded name to state map.
   * All the regions should be in this map.
   */
  private final Map regionStates =
    new HashMap();

  /**
   * Holds mapping of table -> region state
   */
  private final Map> regionStatesTableIndex =
      new HashMap>();

  /**
   * Server to regions assignment map.
   * Contains the set of regions currently assigned to a given server.
   */
  private final Map> serverHoldings =
    new HashMap>();

  /**
   * Maintains the mapping from the default region to the replica regions.
   */
  private final Map> defaultReplicaToOtherReplicas =
    new HashMap>();

  /**
   * Region to server assignment map.
   * Contains the server a given region is currently assigned to.
   */
  private final TreeMap regionAssignments =
    new TreeMap();

  /**
   * Encoded region name to server assignment map for re-assignment
   * purpose. Contains the server a given region is last known assigned
   * to, which has not completed log splitting, so not assignable.
   * If a region is currently assigned, this server info in this
   * map should be the same as that in regionAssignments.
   * However the info in regionAssignments is cleared when the region
   * is offline while the info in lastAssignments is cleared when
   * the region is closed or the server is dead and processed.
   */
  private final HashMap lastAssignments =
    new HashMap();

  /**
   * Encoded region name to server assignment map for the
   * purpose to clean up serverHoldings when a region is online
   * on a new server. When the region is offline from the previous
   * server, we cleaned up regionAssignments so that it has the
   * latest assignment map. But we didn't clean up serverHoldings
   * to match the meta. We need this map to find out the old server
   * whose serverHoldings needs cleanup, given a moved region.
   */
  private final HashMap oldAssignments =
    new HashMap();

  /**
   * Map a host port pair string to the latest start code
   * of a region server which is known to be dead. It is dead
   * to us, but server manager may not know it yet.
   */
  private final HashMap deadServers =
    new HashMap();

  /**
   * Map a dead servers to the time when log split is done.
   * Since log splitting is not ordered, we have to remember
   * all processed instances. The map is cleaned up based
   * on a configured time. By default, we assume a dead
   * server should be done with log splitting in two hours.
   */
  private final HashMap processedServers =
    new HashMap();
  private long lastProcessedServerCleanTime;

  private final TableStateManager tableStateManager;
  private final RegionStateStore regionStateStore;
  private final ServerManager serverManager;
  private final MasterServices server;
  private final boolean useZK; // Is it ZK based assignment?

  // The maximum time to keep a log split info in region states map
  static final String LOG_SPLIT_TIME = "hbase.master.maximum.logsplit.keeptime";
  static final long DEFAULT_LOG_SPLIT_TIME = 7200000L; // 2 hours

  /**
   * Builds the region state accountant around the master's collaborating services.
   *
   * @param master the master services; also supplies the configuration used to decide
   *   whether assignment is ZooKeeper based
   * @param tableStateManager used to query table states
   * @param serverManager used to check whether a region server is online
   * @param regionStateStore the region state store kept by this instance
   */
  RegionStates(final MasterServices master, final TableStateManager tableStateManager,
      final ServerManager serverManager, final RegionStateStore regionStateStore) {
    this.server = master;
    this.serverManager = serverManager;
    this.tableStateManager = tableStateManager;
    this.regionStateStore = regionStateStore;
    // Resolve the assignment mode once, from the master's configuration.
    this.useZK = ConfigUtil.useZKForAssignment(master.getConfiguration());
  }

  /**
   * @return a copy of the region assignment map (region to the server it is assigned to)
   */
  public synchronized Map<HRegionInfo, ServerName> getRegionAssignments() {
    return new TreeMap<HRegionInfo, ServerName>(regionAssignments);
  }

  /**
   * Return the replicas (including default) for the regions grouped by ServerName
   * @param regions
   * @return a pair containing the groupings as a map
   */
  synchronized Map> getRegionAssignments(
    Collection regions) {
    Map> map = new HashMap>();
    for (HRegionInfo region : regions) {
      HRegionInfo defaultReplica = RegionReplicaUtil.getRegionInfoForDefaultReplica(region);
      Set allReplicas = defaultReplicaToOtherReplicas.get(defaultReplica);
      if (allReplicas != null) {
        for (HRegionInfo hri : allReplicas) {
          ServerName server = regionAssignments.get(hri);
          if (server != null) {
            List regionsOnServer = map.get(server);
            if (regionsOnServer == null) {
              regionsOnServer = new ArrayList(1);
              map.put(server, regionsOnServer);
            }
            regionsOnServer.add(hri);
          }
        }
      }
    }
    return map;
  }

  /**
   * Looks up the server a region is currently assigned to.
   *
   * @param hri the region to look up
   * @return the server recorded in the region assignment map, or null if there is none
   */
  public synchronized ServerName getRegionServerOfRegion(HRegionInfo hri) {
    final ServerName assignedServer = regionAssignments.get(hri);
    return assignedServer;
  }

  /**
   * Get regions in transition and their states
   *
   * @return a new set holding the states of all regions currently in transition
   */
  public synchronized Set<RegionState> getRegionsInTransition() {
    return new HashSet<RegionState>(regionsInTransition.values());
  }

  /**
   * Get all regions and their states
   *
   * @return a new set holding the states of every region known to this instance
   */
  public synchronized Set<RegionState> getAllRegions() {
    return new HashSet<RegionState>(regionStates.values());
  }

  /**
   * @return a set of the regions in transition that are sorted by timestamp
   *   (ordering is defined by REGION_STATE_COMPARATOR)
   */
  public synchronized SortedSet<RegionState> getRegionsInTransitionOrderedByTimestamp() {
    final TreeSet<RegionState> rit = new TreeSet<RegionState>(REGION_STATE_COMPARATOR);
    rit.addAll(regionsInTransition.values());
    return rit;
  }

  /**
   * Counts the regions that are currently in transition.
   *
   * @return the size of the regions-in-transition map
   */
  public synchronized int getRegionsInTransitionCount() {
    final int ritCount = regionsInTransition.size();
    return ritCount;
  }

  /**
   * Checks whether the given region is in transition, using its encoded name as the key.
   *
   * @param hri the region to check
   * @return True if specified region in transition.
   */
  public synchronized boolean isRegionInTransition(final HRegionInfo hri) {
    final String encodedName = hri.getEncodedName();
    return regionsInTransition.containsKey(encodedName);
  }

  /**
   * Checks whether the region with the given encoded name is in transition.
   *
   * @param encodedName the encoded region name to check
   * @return True if specified region in transition.
   */
  public synchronized boolean isRegionInTransition(final String encodedName) {
    final boolean inTransition = regionsInTransition.containsKey(encodedName);
    return inTransition;
  }

  /**
   * Checks whether at least one region is in transition.
   *
   * @return True if any region in transition.
   */
  public synchronized boolean isRegionsInTransition() {
    if (regionsInTransition.isEmpty()) {
      return false;
    }
    return true;
  }

  /**
   * Scans the regions in transition looking for a meta region.
   *
   * @return True if hbase:meta table region is in transition.
   */
  public synchronized boolean isMetaRegionInTransition() {
    for (RegionState ritState : regionsInTransition.values()) {
      final HRegionInfo region = ritState.getRegion();
      if (region.isMetaRegion()) {
        return true;
      }
    }
    return false;
  }

  /**
   * A region counts as online when it is not in transition and has an entry in the
   * region assignment map.
   *
   * @param hri the region to check
   * @return True if specified region assigned, and not in transition.
   */
  public synchronized boolean isRegionOnline(final HRegionInfo hri) {
    if (isRegionInTransition(hri)) {
      return false;
    }
    return regionAssignments.containsKey(hri);
  }

  /**
   * A region is offline to us when we hold no state for it, or when it is not in
   * transition and its state is OFFLINE or CLOSED.
   *
   * @param hri the region to check
   * @return True if specified region offline/closed, but not in transition.
   */
  public synchronized boolean isRegionOffline(final HRegionInfo hri) {
    // Unknown region: it is offline as far as we can tell.
    if (getRegionState(hri) == null) {
      return true;
    }
    if (isRegionInTransition(hri)) {
      return false;
    }
    return isRegionInState(hri, State.OFFLINE, State.CLOSED);
  }

  /**
   * Convenience overload that resolves the region by its encoded name.
   *
   * @param hri the region to check
   * @param states the candidate states
   * @return True if specified region is in one of the specified states.
   */
  public boolean isRegionInState(
      final HRegionInfo hri, final State... states) {
    final String encodedName = hri.getEncodedName();
    return isRegionInState(encodedName, states);
  }

  /**
   * Fetches the current state of the region and tests it against the given states.
   *
   * @param encodedName the encoded name of the region to check
   * @param states the candidate states
   * @return True if specified region is in one of the specified states.
   */
  public boolean isRegionInState(
      final String encodedName, final State... states) {
    return isOneOfStates(getRegionState(encodedName), states);
  }

  /**
   * Blocks on this object's monitor until notified or until the timeout elapses, so
   * callers can wait for the state map to be updated by assignment manager.
   *
   * @param timeout the maximum time to wait, in milliseconds
   * @throws InterruptedException if the waiting thread is interrupted
   */
  public synchronized void waitForUpdate(
      final long timeout) throws InterruptedException {
    wait(timeout);
  }

  /**
   * Get region transition state, looked up by the region's encoded name.
   *
   * @param hri the region to look up
   * @return the transition state of the region, or null if it is not in transition
   */
  public RegionState getRegionTransitionState(final HRegionInfo hri) {
    final String encodedName = hri.getEncodedName();
    return getRegionTransitionState(encodedName);
  }

  /**
   * Get region transition state from the regions-in-transition map.
   *
   * @param encodedName the encoded name of the region to look up
   * @return the transition state of the region, or null if it is not in transition
   */
  public synchronized RegionState
      getRegionTransitionState(final String encodedName) {
    final RegionState transitionState = regionsInTransition.get(encodedName);
    return transitionState;
  }

  /**
   * Add a list of regions to RegionStates. If a region is split
   * and offline, its state will be SPLIT. Otherwise, its state will
   * be OFFLINE. Region already in RegionStates will be skipped.
   *
   * @param hris the regions to create states for
   */
  public void createRegionStates(
      final List<HRegionInfo> hris) {
    for (HRegionInfo hri: hris) {
      createRegionState(hri);
    }
  }

  /**
   * Registers a region with RegionStates using the default state. A region that is
   * split and offline ends up SPLIT; any other region ends up OFFLINE. When the
   * region is already registered nothing changes and the existing state is returned.
   *
   * @param hri the region info to create a state for
   * @return the current state of the region
   */
  public RegionState createRegionState(final HRegionInfo hri) {
    // No explicit state, hosting server or last host: the full overload picks the defaults.
    final State unspecifiedState = null;
    final ServerName noServer = null;
    return createRegionState(hri, unspecifiedState, noServer, noServer);
  }

  /**
   * Registers a region with RegionStates in the requested state.
   * A region that is already registered is left untouched and its existing
   * state is returned.
   *
   * @param hri the region info to create a state for
   * @param newState the state the region is set to; null, or OPEN without a server,
   *   is treated as OFFLINE, and a split+offline region is always SPLIT
   * @param serverName the server the region is transitioning on
   * @param lastHost the last server that hosts the region
   * @return the current state
   */
  public synchronized RegionState createRegionState(final HRegionInfo hri,
      State newState, ServerName serverName, ServerName lastHost) {
    // Normalize the requested state before looking anything up.
    if (newState == null || (newState == State.OPEN && serverName == null)) {
      newState =  State.OFFLINE;
    }
    if (hri.isOffline() && hri.isSplit()) {
      newState = State.SPLIT;
      serverName = null;
    }

    final String regionKey = hri.getEncodedName();
    final RegionState existing = regionStates.get(regionKey);
    if (existing != null) {
      LOG.warn("Tried to create a state for a region already in RegionStates, "
        + "used existing: " + existing + ", ignored new: " + newState);
      return existing;
    }

    final RegionState created = new RegionState(hri, newState, serverName);
    putRegionState(created);
    if (newState == State.OPEN) {
      // An open region is hosted by its current server; the last host must agree.
      if (!serverName.equals(lastHost)) {
        LOG.warn("Open region's last host " + lastHost
          + " should be the same as the current one " + serverName
          + ", ignored the last and used the current one");
        lastHost = serverName;
      }
      lastAssignments.put(regionKey, lastHost);
      regionAssignments.put(hri, lastHost);
    } else if (!created.isUnassignable()) {
      regionsInTransition.put(regionKey, created);
    }

    if (lastHost != null && newState != State.SPLIT) {
      addToReplicaMapping(hri);
      addToServerHoldings(lastHost, hri);
      if (newState != State.OPEN) {
        // Remember the previous host so its holdings can be cleaned up later.
        oldAssignments.put(regionKey, lastHost);
      }
    }
    return created;
  }

  /**
   * Stores a region state in both the global map and the per-table index.
   *
   * @param regionState the state to store
   * @return the state previously stored for the region, or null if there was none
   */
  private RegionState putRegionState(RegionState regionState) {
    HRegionInfo hri = regionState.getRegion();
    String encodedName = hri.getEncodedName();
    TableName table = hri.getTable();
    RegionState oldState = regionStates.put(encodedName, regionState);
    Map<String, RegionState> map = regionStatesTableIndex.get(table);
    if (map == null) {
      // First region seen for this table: create its index entry.
      map = new HashMap<String, RegionState>();
      regionStatesTableIndex.put(table, map);
    }
    map.put(encodedName, regionState);
    return oldState;
  }

  /**
   * Set the region state to CLOSED, resolving the region from its name first.
   *
   * @param regionName the full region name
   * @param serverName the server the region was on, may be null
   * @return the updated region state
   */
  public RegionState setRegionStateTOCLOSED(
      final byte[] regionName,
      final ServerName serverName) {
    return setRegionStateTOCLOSED(getRegionInfo(regionName), serverName);
  }

  /**
   * Set the region state to CLOSED. When no server is given, the server recorded
   * in the region's current state (if any) is used instead.
   *
   * @param regionInfo the region to close
   * @param serverName the server the region was on, may be null
   * @return the updated region state
   */
  public RegionState setRegionStateTOCLOSED(
      final HRegionInfo regionInfo,
      final ServerName serverName) {
    ServerName closingServer = serverName;
    if (closingServer == null) {
      final RegionState current = getRegionState(regionInfo.getEncodedName());
      closingServer = (current == null) ? null : current.getServerName();
      // TODO: if closingServer is still null, should we dig into
      // lastAssignments.get(regionInfo.getEncodedName() to get the server name?
      // For now, I just keep the same logic that works in the past
    }
    // The last region server must be set to the same server as the current RS.
    // Otherwise both AM and SSH could think the other one does the assignment
    // work; in the end neither does and the region remains RIT.
    // See HBASE-13330 and HBASE-17023
    setLastRegionServerOfRegion(closingServer, regionInfo.getEncodedName());
    return updateRegionState(regionInfo, State.CLOSED, closingServer);
  }

  /**
   * Update a region state, keeping the server recorded in its current state (or
   * null when the region has no state yet). It will be put in transition if not
   * already there.
   *
   * @param hri the region to update
   * @param state the new state
   * @return the updated region state
   */
  public RegionState updateRegionState(
      final HRegionInfo hri, final State state) {
    final RegionState current = getRegionState(hri.getEncodedName());
    final ServerName host;
    if (current == null) {
      host = null;
    } else {
      host = current.getServerName();
    }
    return updateRegionState(hri, state, host);
  }

  /**
   * Update a region state from a region transition. It will be put in transition
   * if not already there.
   *
   * If the region info cannot be found from the region name carried by the
   * transition, a warning is logged and null is returned.
   *
   * @param transition the region transition carrying the region name and server
   * @param state the new state
   * @return the updated region state, or null if the region is unknown
   */
  public RegionState updateRegionState(
      final RegionTransition transition, final State state) {
    final byte[] regionName = transition.getRegionName();
    final HRegionInfo regionInfo = getRegionInfo(regionName);
    if (regionInfo != null) {
      return updateRegionState(regionInfo, state,
        transition.getServerName());
    }
    // Unknown region: report it with a readable name and give up.
    final String prettyRegionName = HRegionInfo.prettyPrint(
      HRegionInfo.encodeRegionName(regionName));
    LOG.warn("Failed to find region " + prettyRegionName
      + " in updating its state to " + state
      + " based on region transition " + transition);
    return null;
  }

  /**
   * Transition a region state to OPEN from OPENING/PENDING_OPEN
   *
   * @param transition the region transition to apply
   * @param fromState the state the region is expected to be leaving
   * @param sn the server the region must be pending open or opening on
   * @return the updated region state, or null if fromState is not pending open or
   *   opening on the given server
   */
  public synchronized RegionState transitionOpenFromPendingOpenOrOpeningOnServer(
      final RegionTransition transition, final RegionState fromState, final ServerName sn) {
    if (!fromState.isPendingOpenOrOpeningOnServer(sn)) {
      return null;
    }
    return updateRegionState(transition, State.OPEN);
  }

  /**
   * Update a region state without an open sequence number
   * (HConstants.NO_SEQNUM is passed on). It will be put in transition if not
   * already there.
   *
   * @param hri the region to update
   * @param state the new state
   * @param serverName the server associated with the new state
   * @return the updated region state
   */
  public RegionState updateRegionState(
      final HRegionInfo hri, final State state, final ServerName serverName) {
    final long noSeqNum = HConstants.NO_SEQNUM;
    return updateRegionState(hri, state, serverName, noSeqNum);
  }

  /**
   * Marks a region online on the given server without an open sequence number
   * (HConstants.NO_SEQNUM is passed on).
   *
   * @param hri the region that came online
   * @param serverName the server hosting the region
   */
  public void regionOnline(final HRegionInfo hri, final ServerName serverName) {
    final long noSeqNum = HConstants.NO_SEQNUM;
    regionOnline(hri, serverName, noSeqNum);
  }

  /**
   * A region is online, won't be in transition any more.
   * We can't confirm it is really online on specified region server
   * because it hasn't been put in region server's online region list yet.
   *
   * @param hri the region that came online
   * @param serverName the server hosting the region
   * @param openSeqNum the open sequence number handed to the state update
   */
  public void regionOnline(final HRegionInfo hri, final ServerName serverName, long openSeqNum) {
    final String regionKey = hri.getEncodedName();
    final boolean hostAlive = serverManager.isServerOnline(serverName);
    if (!hostAlive) {
      // This is possible if the region server dies before master gets a
      // chance to handle ZK event in time. At this time, if the dead server
      // is already processed by SSH, we should ignore this event.
      // If not processed yet, ignore and let SSH deal with it.
      LOG.warn("Ignored, " + regionKey + " was opened on a dead server: " + serverName);
      return;
    }
    updateRegionState(hri, State.OPEN, serverName, openSeqNum);

    synchronized (this) {
      final RegionState finishedTransition = regionsInTransition.remove(regionKey);
      // The region just left regionsInTransition: report how long it was in
      // transition to the assignment manager metrics.
      if (finishedTransition != null && this.server.getAssignmentManager() != null) {
        final long sinceStamp = System.currentTimeMillis() - finishedTransition.getStamp();
        final long ritDuration = sinceStamp + finishedTransition.getRitDuration();
        this.server.getAssignmentManager().getAssignmentManagerMetrics()
            .updateRitDuration(ritDuration);
      }

      ServerName previousHost = regionAssignments.put(hri, serverName);
      if (serverName.equals(previousHost)) {
        // Already assigned to this server; holdings are up to date.
        return;
      }
      if (LOG.isDebugEnabled()) {
        LOG.debug("Onlined " + hri.getShortNameToLog() + " on " + serverName);
      }
      addToServerHoldings(serverName, hri);
      addToReplicaMapping(hri);
      if (previousHost == null) {
        // Not currently assigned: fall back to the host remembered when it went offline.
        previousHost = oldAssignments.remove(regionKey);
      }
      final boolean leftTrackedServer = previousHost != null
          && !previousHost.equals(serverName)
          && serverHoldings.containsKey(previousHost);
      if (leftTrackedServer) {
        LOG.info("Offlined " + hri.getShortNameToLog() + " from " + previousHost);
        removeFromServerHoldings(previousHost, hri);
      }
    }
  }

  /**
   * Records that the given server holds the given region, creating the server's
   * region set on first use.
   *
   * @param serverName the server holding the region
   * @param hri the region held
   */
  private void addToServerHoldings(ServerName serverName, HRegionInfo hri) {
    Set<HRegionInfo> regions = serverHoldings.get(serverName);
    if (regions == null) {
      regions = new HashSet<HRegionInfo>();
      serverHoldings.put(serverName, regions);
    }
    regions.add(hri);
  }

  /**
   * Registers the region under its default replica in the replica mapping,
   * creating the replica set on first use.
   *
   * @param hri the region (default or other replica) to register
   */
  private void addToReplicaMapping(HRegionInfo hri) {
    HRegionInfo defaultReplica = RegionReplicaUtil.getRegionInfoForDefaultReplica(hri);
    Set<HRegionInfo> replicas =
        defaultReplicaToOtherReplicas.get(defaultReplica);
    if (replicas == null) {
      replicas = new HashSet<HRegionInfo>();
      defaultReplicaToOtherReplicas.put(defaultReplica, replicas);
    }
    replicas.add(hri);
  }

  /**
   * Removes the region from the set of regions held by the given server, dropping
   * the server's entry once it holds nothing. Does nothing if the server has no
   * holdings entry.
   *
   * @param serverName the server that no longer holds the region
   * @param hri the region to remove
   */
  private void removeFromServerHoldings(ServerName serverName, HRegionInfo hri) {
    Set<HRegionInfo> oldRegions = serverHoldings.get(serverName);
    if (oldRegions == null) {
      // Callers check containsKey first today; stay safe if one ever does not.
      return;
    }
    oldRegions.remove(hri);
    if (oldRegions.isEmpty()) {
      serverHoldings.remove(serverName);
    }
  }

  /**
   * Removes the region from the replica set of its default replica, dropping the
   * set once it is empty.
   *
   * @param hri the region (default or other replica) to remove
   */
  private void removeFromReplicaMapping(HRegionInfo hri) {
    HRegionInfo defaultReplica = RegionReplicaUtil.getRegionInfoForDefaultReplica(hri);
    Set<HRegionInfo> replicas = defaultReplicaToOtherReplicas.get(defaultReplica);
    if (replicas != null) {
      replicas.remove(hri);
      if (replicas.isEmpty()) {
        defaultReplicaToOtherReplicas.remove(defaultReplica);
      }
    }
  }

  /**
   * Used in some unit tests
   *
   * @param serverName the server whose holdings are inspected
   * @param hri the region to look for
   * @return true if the server has a holdings entry that contains the region
   */
  @VisibleForTesting
  synchronized boolean existsInServerHoldings(final ServerName serverName,
      final HRegionInfo hri) {
    Set<HRegionInfo> oldRegions = serverHoldings.get(serverName);
    return oldRegions != null && oldRegions.contains(hri);
  }

  /**
   * A dead server's wals have been split so that all the regions
   * used to be open on it can be safely assigned now. Mark them assignable.
   * The server is also recorded as processed, and processed entries older than
   * the configured keep time are swept at most once per keep-time interval.
   *
   * @param serverName the dead server whose log splitting is done
   */
  public synchronized void logSplit(final ServerName serverName) {
    for (Iterator<Map.Entry<String, ServerName>> it
        = lastAssignments.entrySet().iterator(); it.hasNext();) {
      ServerName lastServer = it.next().getValue();
      // NOTE(review): setRegionStateTOCLOSED can hand a null server to
      // setLastRegionServerOfRegion, so presumably a null value may be stored here;
      // compare null-safely instead of failing the whole sweep.
      if (lastServer != null && lastServer.equals(serverName)) {
        it.remove();
      }
    }
    long now = System.currentTimeMillis();
    if (LOG.isDebugEnabled()) {
      LOG.debug("Adding to log splitting servers " + serverName);
    }
    processedServers.put(serverName, Long.valueOf(now));
    Configuration conf = server.getConfiguration();
    long obsoleteTime = conf.getLong(LOG_SPLIT_TIME, DEFAULT_LOG_SPLIT_TIME);
    // Doesn't have to be very accurate about the clean up time
    if (now > lastProcessedServerCleanTime + obsoleteTime) {
      lastProcessedServerCleanTime = now;
      long cutoff = now - obsoleteTime;
      for (Iterator<Map.Entry<ServerName, Long>> it
          = processedServers.entrySet().iterator(); it.hasNext();) {
        Map.Entry<ServerName, Long> e = it.next();
        if (e.getValue().longValue() < cutoff) {
          if (LOG.isDebugEnabled()) {
            LOG.debug("Removed from log splitting servers " + e.getKey());
          }
          it.remove();
        }
      }
    }
  }

  /**
   * Log split is done for a given region, so it is assignable now: its last
   * assignment record is cleared.
   *
   * @param region the region whose log split finished
   */
  public void logSplit(final HRegionInfo region) {
    this.clearLastAssignment(region);
  }

  /**
   * Forgets the last known assignment of a region.
   *
   * @param region the region whose last assignment is removed
   */
  public synchronized void clearLastAssignment(final HRegionInfo region) {
    final String encodedName = region.getEncodedName();
    lastAssignments.remove(encodedName);
  }

  /**
   * A region is offline, won't be in transition any more. No expected state is
   * given, so the two-argument overload decides the resulting state.
   *
   * @param hri the region that went offline
   */
  public void regionOffline(final HRegionInfo hri) {
    final State noExpectedState = null;
    regionOffline(hri, noExpectedState);
  }

  /**
   * A region is offline, won't be in transition any more. Its state
   * should be the specified expected state, which can only be
   * Split/Merged/Offline/null(=Offline)/SplittingNew/MergingNew.
   *
   * @param hri the region that went offline
   * @param expectedState the state to record, or null for OFFLINE
   * @throws IllegalArgumentException if expectedState is neither null nor unassignable
   */
  public void regionOffline(
      final HRegionInfo hri, final State expectedState) {
    Preconditions.checkArgument(expectedState == null
      || RegionState.isUnassignable(expectedState),
        "Offlined region should not be " + expectedState);
    if (isRegionInState(hri, State.SPLITTING_NEW, State.MERGING_NEW)) {
      // Remove it from all region maps
      deleteRegion(hri);
      return;
    }

    /*
     * One tricky case, if region here is a replica region and its parent is at
     * SPLIT state, its newState should be same as its parent, not OFFLINE.
     */
    State newState =
        expectedState == null ? State.OFFLINE : expectedState;

    if ((expectedState == null) && !RegionReplicaUtil.isDefaultReplica(hri)) {
      HRegionInfo primaryRegion = RegionReplicaUtil.getRegionInfoForDefaultReplica(hri);
      RegionState primaryState = getRegionState(primaryRegion);
      if ((primaryState != null) && (primaryState.getState() == State.SPLIT)) {
        if (LOG.isDebugEnabled()) {
          LOG.debug("Update region " + hri + " to SPLIT, from primary region " +
              primaryRegion);
        }
        newState = State.SPLIT;
      }
    }

    updateRegionState(hri, newState);
    String encodedName = hri.getEncodedName();
    synchronized (this) {
      regionsInTransition.remove(encodedName);
      ServerName oldServerName = regionAssignments.remove(hri);
      if (oldServerName != null && serverHoldings.containsKey(oldServerName)) {
        if (newState == State.MERGED || newState == State.SPLIT
            || hri.isMetaRegion() || tableStateManager.isTableState(hri.getTable(),
              ZooKeeperProtos.Table.State.DISABLED, ZooKeeperProtos.Table.State.DISABLING)) {
          // Offline the region only if it's merged/split, or the table is disabled/disabling.
          // Otherwise, offline it from this server only when it is online on a different server.
          LOG.info("Offlined " + hri.getShortNameToLog() + " from " + oldServerName);
          removeFromServerHoldings(oldServerName, hri);
          removeFromReplicaMapping(hri);
        } else {
          // Need to remember it so that we can offline it from this
          // server when it is online on a different server.
          oldAssignments.put(encodedName, oldServerName);
        }
      }
    }
  }

  /**
   * A server is offline, all regions on it are dead.
   */
  public List serverOffline(final ZooKeeperWatcher watcher, final ServerName sn) {
    // Offline all regions on this server not already in transition.
    List rits = new ArrayList();
    Set> regionsToClean =
      new HashSet>();
    // Offline regions outside the loop and synchronized block to avoid
    // ConcurrentModificationException and deadlock in case of meta anassigned,
    // but RegionState a blocked.
    Set regionsToOffline = new HashSet();
    Map daughter2Parent = new HashMap<>();
    synchronized (this) {
      Set assignedRegions = serverHoldings.get(sn);
      if (assignedRegions == null) {
        assignedRegions = new HashSet();
      }

      for (HRegionInfo region : assignedRegions) {
        // Offline open regions, no need to offline if SPLIT/MERGED/OFFLINE
        if (isRegionOnline(region)) {
          regionsToOffline.add(region);
        } else if (isRegionInState(region, State.SPLITTING, State.MERGING)) {
          LOG.debug("Offline splitting/merging region " + getRegionState(region));
          try {
            // Delete the ZNode if exists
            ZKAssign.deleteNodeFailSilent(watcher, region);
            regionsToOffline.add(region);
            PairOfSameType daughterRegions =
              MetaTableAccessor.getDaughterRegionsFromParent(this.server.getConnection(), region);
            if (daughterRegions != null) {
              if (daughterRegions.getFirst() != null) {
                daughter2Parent.put(daughterRegions.getFirst().getEncodedName(), region);
              }
              if (daughterRegions.getSecond() != null) {
                daughter2Parent.put(daughterRegions.getSecond().getEncodedName(), region);
              }
            }
          } catch (KeeperException ke) {
            server.abort("Unexpected ZK exception deleting node " + region, ke);
          } catch (IOException e) {
            LOG.warn("get daughter from meta exception " + region, e);
          }
        }
      }

      for (RegionState state : regionsInTransition.values()) {
        HRegionInfo hri = state.getRegion();
        if (assignedRegions.contains(hri)) {
          // Region is open on this region server, but in transition.
          // This region must be moving away from this server, or splitting/merging.
          // SSH will handle it, either skip assigning, or re-assign.
          LOG.info("Transitioning " + state + " will be handled by ServerCrashProcedure for " + sn);
        } else if (sn.equals(state.getServerName())) {
          // Region is in transition on this region server, and this
          // region is not open on this server. So the region must be
          // moving to this server from another one (i.e. opening or
          // pending open on this server, was open on another one.
          // Offline state is also kind of pending open if the region is in
          // transition. The region could be in failed_close state too if we have
          // tried several times to open it while this region server is not reachable)
          if (state.isPendingOpenOrOpening() || state.isFailedClose() || state.isOffline()) {
            LOG.info("Found region in " + state +
              " to be reassigned by ServerCrashProcedure for " + sn);
            rits.add(hri);
          } else if (state.isSplittingNew() || state.isMergingNew()) {
            LOG.info(
              "Offline/Cleanup region if no meta entry exists, hri: " + hri + " state: " + state);
            if (daughter2Parent.containsKey(hri.getEncodedName())) {
              HRegionInfo parent = daughter2Parent.get(hri.getEncodedName());
              HRegionInfo info = getHRIFromMeta(parent);
              if (info != null && info.isSplit() && info.isOffline()) {
                regionsToClean.add(Pair.newPair(state.getRegion(), info));
              } else {
                regionsToClean.add(Pair.newPair(state.getRegion(), null));
              }
            } else {
              regionsToClean.add(Pair.newPair(state.getRegion(), null));
            }
          } else {
            LOG.warn("THIS SHOULD NOT HAPPEN: unexpected " + state);
          }
        }
      }
      this.notifyAll();
    }

    for (HRegionInfo hri : regionsToOffline) {
      regionOffline(hri);
    }

    cleanFailedSplitMergeRegions(regionsToClean);
    return rits;
  }

  /**
   * Reads the region info of the given region from hbase:meta.
   *
   * @param parent the region whose meta row is fetched by region name
   * @return the region info parsed from the meta row, or null if the lookup
   *   failed with an IOException
   */
  private HRegionInfo getHRIFromMeta(HRegionInfo parent) {
    try {
      final Result metaRow =
        MetaTableAccessor.getRegionResult(this.server.getConnection(), parent.getRegionName());
      return MetaTableAccessor.getHRegionInfo(metaRow);
    } catch (IOException e) {
      LOG.error("got exception when query meta with region " + parent.getEncodedName(), e);
      return null;
    }
  }

  /**
   * This method does an RPC to hbase:meta. Do not call this method with a lock/synchronize held.
   * In ZK mode we rollback and hence cleanup daughters/merged region. We also cleanup if
   * meta doesn't have these regions.
   *
   * @param hris The hris to check if empty in hbase:meta and if so, clean them up.
   */
  private void cleanFailedSplitMergeRegions(Set> hris) {
    if (hris.isEmpty()) {
      return;
    }

    for (Pair hriPair : hris) {
      HRegionInfo hri = hriPair.getFirst();
      HRegionInfo parentInfo = hriPair.getSecond();
      // This is RPC to meta table. It is done while we have a synchronize on
      // regionstates. No progress will be made if meta is not available at this time.
      // This is a cleanup task. Not critical.
      try {
        Pair regionPair =
            MetaTableAccessor.getRegion(server.getConnection(), hri.getRegionName());
        if (regionPair == null || useZK) {
          regionOffline(hri);

          // If we use ZK, then we can cleanup entries from meta, since we roll back.
          if (regionPair != null) {
            MetaTableAccessor.deleteRegion(this.server.getConnection(), hri);
          }
          if (parentInfo != null) {
            List mutations = new ArrayList();
            HRegionInfo copyOfParent = new HRegionInfo(parentInfo);
            copyOfParent.setOffline(false);
            copyOfParent.setSplit(false);
            Put putParent = MetaTableAccessor.makePutFromRegionInfo(copyOfParent);
            mutations.add(putParent);
            MetaTableAccessor.mutateMetaTable(this.server.getConnection(), mutations);
          }
          LOG.debug("Cleaning up HDFS since no meta entry exists, hri: " + hri);
          FSUtils.deleteRegionDir(server.getConfiguration(), hri);
        }
      } catch (IOException e) {
        LOG.warn("Got exception while cleaning up region " + hri, e);
      }
    }
  }

  /**
   * Gets the online regions of the specified table.
   * This method looks at the in-memory state.  It does not go to <code>hbase:meta</code>.
   * Only returns <em>online</em> regions.  If a region on this table has been
   * closed during a disable, etc., it will not be included in the returned list.
   * So, the returned list may not necessarily be ALL regions in this table, it is
   * all the ONLINE regions in the table.
   * @param tableName table whose currently assigned regions are wanted
   * @return Online regions from <code>tableName</code>; empty if none are assigned
   */
  public synchronized List<HRegionInfo> getRegionsOfTable(TableName tableName) {
    List<HRegionInfo> tableRegions = new ArrayList<HRegionInfo>();
    // boundary needs to have table's name but regionID 0 so that it is sorted
    // before all table's regions.
    HRegionInfo boundary = new HRegionInfo(tableName, null, null, false, 0L);
    for (HRegionInfo hri: regionAssignments.tailMap(boundary).keySet()) {
      // The tail map is walked in key order; stop at the first region of another table.
      if (!hri.getTable().equals(tableName)) {
        break;
      }
      tableRegions.add(hri);
    }
    return tableRegions;
  }

  /**
   * Gets current state of all regions of the table.
   * This method looks at the in-memory state.  It does not go to hbase:meta.
   * Method guaranteed to return keys for all states
   * in {@link org.apache.hadoop.hbase.master.RegionState.State}
   *
   * @param tableName
   * @return Online regions from tableName
   */
  public synchronized Map>
  getRegionByStateOfTable(TableName tableName) {
    Map> tableRegions =
        new HashMap>();
    for (State state : State.values()) {
      tableRegions.put(state, new ArrayList());
    }
    Map indexMap = regionStatesTableIndex.get(tableName);
    if (indexMap == null)
      return tableRegions;
    for (RegionState regionState : indexMap.values()) {
      tableRegions.get(regionState.getState()).add(regionState.getRegion());
    }
    return tableRegions;
  }

  /**
   * Wait on region to clear regions-in-transition.
   * 

* If the region isn't in transition, returns immediately. Otherwise, method * blocks until the region is out of transition. */ public synchronized void waitOnRegionToClearRegionsInTransition( final HRegionInfo hri) throws InterruptedException { if (!isRegionInTransition(hri)) return; while(!server.isStopped() && isRegionInTransition(hri)) { RegionState rs = getRegionState(hri); LOG.info("Waiting on " + rs + " to clear regions-in-transition"); waitForUpdate(100); } if (server.isStopped()) { LOG.info("Giving up wait on region in " + "transition because stoppable.isStopped is set"); } } /** * A table is deleted. Remove its regions from all internal maps. * We loop through all regions assuming we don't delete tables too much. */ public void tableDeleted(final TableName tableName) { Set regionsToDelete = new HashSet(); synchronized (this) { for (RegionState state: regionStates.values()) { HRegionInfo region = state.getRegion(); if (region.getTable().equals(tableName)) { regionsToDelete.add(region); } } } for (HRegionInfo region: regionsToDelete) { deleteRegion(region); } } /** * Get a copy of all regions assigned to a server */ public synchronized Set getServerRegions(ServerName serverName) { Set regions = serverHoldings.get(serverName); if (regions == null) return null; return new HashSet(regions); } /** * Remove a region from all state maps. 
*/ @VisibleForTesting public synchronized void deleteRegion(final HRegionInfo hri) { String encodedName = hri.getEncodedName(); regionsInTransition.remove(encodedName); regionStates.remove(encodedName); TableName table = hri.getTable(); Map indexMap = regionStatesTableIndex.get(table); indexMap.remove(encodedName); if (indexMap.size() == 0) regionStatesTableIndex.remove(table); lastAssignments.remove(encodedName); ServerName sn = regionAssignments.remove(hri); if (sn != null) { Set regions = serverHoldings.get(sn); regions.remove(hri); } } @VisibleForTesting public boolean isRegionInRegionStates(final HRegionInfo hri) { return (getRegionState(hri) != null || isRegionOnline(hri)) || isRegionInTransition(hri) || isRegionInState(hri, State.OFFLINE, State.CLOSED); } /** * Checking if a region was assigned to a server which is not online now. * If so, we should hold re-assign this region till SSH has split its wals. * Once logs are split, the last assignment of this region will be reset, * which means a null last assignment server is ok for re-assigning. * * A region server could be dead but we don't know it yet. We may * think it's online falsely. Therefore if a server is online, we still * need to confirm it reachable and having the expected start code. */ synchronized boolean wasRegionOnDeadServer(final String encodedName) { ServerName server = lastAssignments.get(encodedName); return isServerDeadAndNotProcessed(server); } synchronized boolean isServerDeadAndNotProcessed(ServerName server) { if (server == null) return false; if (serverManager.isServerOnline(server)) { String hostAndPort = server.getHostAndPort(); long startCode = server.getStartcode(); Long deadCode = deadServers.get(hostAndPort); if (deadCode == null || startCode > deadCode.longValue()) { if (serverManager.isServerReachable(server)) { return false; } // The size of deadServers won't grow unbounded. deadServers.put(hostAndPort, Long.valueOf(startCode)); } // Watch out! 
If the server is not dead, the region could // remain unassigned. That's why ServerManager#isServerReachable // should use some retry. // // We cache this info since it is very unlikely for that // instance to come back up later on. We don't want to expire // the server since we prefer to let it die naturally. LOG.warn("Couldn't reach online server " + server); } // Now, we know it's dead. Check if it's processed return !processedServers.containsKey(server); } /** * Get the last region server a region was on for purpose of re-assignment, * i.e. should the re-assignment be held back till log split is done? */ synchronized ServerName getLastRegionServerOfRegion(final String encodedName) { return lastAssignments.get(encodedName); } synchronized void setLastRegionServerOfRegions( final ServerName serverName, final List regionInfos) { for (HRegionInfo hri: regionInfos) { setLastRegionServerOfRegion(serverName, hri.getEncodedName()); } } synchronized void setLastRegionServerOfRegion( final ServerName serverName, final String encodedName) { lastAssignments.put(encodedName, serverName); } void splitRegion(HRegionInfo p, HRegionInfo a, HRegionInfo b, ServerName sn) throws IOException { regionStateStore.splitRegion(p, a, b, sn, getRegionReplication(p)); synchronized (this) { // After PONR, split is considered to be done. // Update server holdings to be aligned with the meta. Set regions = serverHoldings.get(sn); if (regions == null) { throw new IllegalStateException(sn + " should host some regions"); } regions.remove(p); regions.add(a); regions.add(b); } } void mergeRegions(HRegionInfo p, HRegionInfo a, HRegionInfo b, ServerName sn) throws IOException { regionStateStore.mergeRegions(p, a, b, sn, getRegionReplication(a)); synchronized (this) { // After PONR, merge is considered to be done. // Update server holdings to be aligned with the meta. 
Set regions = serverHoldings.get(sn); if (regions == null) { throw new IllegalStateException(sn + " should host some regions"); } regions.remove(a); regions.remove(b); regions.add(p); } } private int getRegionReplication(HRegionInfo r) throws IOException { if (tableStateManager != null) { HTableDescriptor htd = ((MasterServices)server).getTableDescriptors().get(r.getTable()); if (htd != null) { return htd.getRegionReplication(); } } return 1; } /** * At cluster clean re/start, mark all user regions closed except those of tables * that are excluded, such as disabled/disabling/enabling tables. All user regions * and their previous locations are returned. */ synchronized Map closeAllUserRegions(Set excludedTables) { boolean noExcludeTables = excludedTables == null || excludedTables.isEmpty(); Set toBeClosed = new HashSet(regionStates.size()); for(RegionState state: regionStates.values()) { HRegionInfo hri = state.getRegion(); if (state.isSplit() || hri.isSplit()) { continue; } TableName tableName = hri.getTable(); if (!TableName.META_TABLE_NAME.equals(tableName) && (noExcludeTables || !excludedTables.contains(tableName))) { toBeClosed.add(hri); } } Map allUserRegions = new HashMap(toBeClosed.size()); for (HRegionInfo hri: toBeClosed) { RegionState regionState = updateRegionState(hri, State.CLOSED); allUserRegions.put(hri, regionState.getServerName()); } return allUserRegions; } /** * Compute the average load across all region servers. * Currently, this uses a very naive computation - just uses the number of * regions being served, ignoring stats about number of requests. 
* @return the average load */ protected synchronized double getAverageLoad() { int numServers = 0, totalLoad = 0; for (Map.Entry> e: serverHoldings.entrySet()) { Set regions = e.getValue(); ServerName serverName = e.getKey(); int regionCount = regions.size(); if (serverManager.isServerOnline(serverName)) { totalLoad += regionCount; numServers++; } } if (numServers > 1) { // The master region server holds only a couple regions. // Don't consider this server in calculating the average load // if there are other region servers to avoid possible confusion. Set hris = serverHoldings.get(server.getServerName()); if (hris != null) { totalLoad -= hris.size(); numServers--; } } return numServers == 0 ? 0.0 : (double)totalLoad / (double)numServers; } /** * This is an EXPENSIVE clone. Cloning though is the safest thing to do. * Can't let out original since it can change and at least the load balancer * wants to iterate this exported list. We need to synchronize on regions * since all access to this.servers is under a lock on this.regions. * * @return A clone of current assignments by table. 
*/ protected Map>> getAssignmentsByTable() { Map>> result = new HashMap>>(); synchronized (this) { if (!server.getConfiguration().getBoolean( HConstants.HBASE_MASTER_LOADBALANCE_BYTABLE, false)) { Map> svrToRegions = new HashMap>(serverHoldings.size()); for (Map.Entry> e: serverHoldings.entrySet()) { svrToRegions.put(e.getKey(), new ArrayList(e.getValue())); } result.put(TableName.valueOf(HConstants.ENSEMBLE_TABLE_NAME), svrToRegions); } else { for (Map.Entry> e: serverHoldings.entrySet()) { for (HRegionInfo hri: e.getValue()) { if (hri.isMetaRegion()) continue; TableName tablename = hri.getTable(); Map> svrToRegions = result.get(tablename); if (svrToRegions == null) { svrToRegions = new HashMap>(serverHoldings.size()); result.put(tablename, svrToRegions); } List regions = svrToRegions.get(e.getKey()); if (regions == null) { regions = new ArrayList(); svrToRegions.put(e.getKey(), regions); } regions.add(hri); } } } } Map onlineSvrs = serverManager.getOnlineServers(); // Take care of servers w/o assignments, and remove servers in draining mode List drainingServers = this.serverManager.getDrainingServersList(); for (Map> map: result.values()) { for (ServerName svr: onlineSvrs.keySet()) { if (!map.containsKey(svr)) { map.put(svr, new ArrayList()); } } map.keySet().removeAll(drainingServers); } return result; } protected RegionState getRegionState(final HRegionInfo hri) { return getRegionState(hri.getEncodedName()); } /** * Returns a clone of region assignments per server * @return a Map of ServerName to a List of HRegionInfo's */ protected synchronized Map> getRegionAssignmentsByServer() { Map> regionsByServer = new HashMap>(serverHoldings.size()); for (Map.Entry> e: serverHoldings.entrySet()) { regionsByServer.put(e.getKey(), new ArrayList(e.getValue())); } return regionsByServer; } protected synchronized RegionState getRegionState(final String encodedName) { return regionStates.get(encodedName); } /** * Get the HRegionInfo from cache, if not there, from the 
hbase:meta table * @param regionName * @return HRegionInfo for the region */ @SuppressWarnings("deprecation") protected HRegionInfo getRegionInfo(final byte [] regionName) { String encodedName = HRegionInfo.encodeRegionName(regionName); RegionState regionState = getRegionState(encodedName); if (regionState != null) { return regionState.getRegion(); } try { Pair p = MetaTableAccessor.getRegion(server.getConnection(), regionName); HRegionInfo hri = p == null ? null : p.getFirst(); if (hri != null) { createRegionState(hri); } return hri; } catch (IOException e) { server.abort("Aborting because error occurred while reading " + Bytes.toStringBinary(regionName) + " from hbase:meta", e); return null; } } static boolean isOneOfStates(RegionState regionState, State... states) { State s = regionState != null ? regionState.getState() : null; for (State state: states) { if (s == state) return true; } return false; } /** * Update a region state. It will be put in transition if not already there. */ private RegionState updateRegionState(final HRegionInfo hri, final State state, final ServerName serverName, long openSeqNum) { if (state == State.FAILED_CLOSE || state == State.FAILED_OPEN) { LOG.warn("Failed to open/close " + hri.getShortNameToLog() + " on " + serverName + ", set to " + state); } String encodedName = hri.getEncodedName(); RegionState regionState = new RegionState( hri, state, System.currentTimeMillis(), serverName); RegionState oldState = getRegionState(encodedName); if (!regionState.equals(oldState)) { LOG.info("Transition " + oldState + " to " + regionState); // Persist region state before updating in-memory info, if needed regionStateStore.updateRegionState(openSeqNum, regionState, oldState); } synchronized (this) { RegionState oldRegionState = regionsInTransition.put(encodedName, regionState); // When region transform old region state to new region state, // accumulate the RIT duration to new region state. 
if (oldRegionState != null) { regionState.updateRitDuration(oldRegionState.getStamp()); } putRegionState(regionState); // For these states, region should be properly closed. // There should be no log splitting issue. if ((state == State.CLOSED || state == State.MERGED || state == State.SPLIT) && lastAssignments.containsKey(encodedName)) { ServerName last = lastAssignments.get(encodedName); if (last.equals(serverName)) { lastAssignments.remove(encodedName); } else { LOG.warn(encodedName + " moved to " + state + " on " + serverName + ", expected " + last); } } // Once a region is opened, record its last assignment right away. if (serverName != null && state == State.OPEN) { ServerName last = lastAssignments.get(encodedName); if (!serverName.equals(last)) { lastAssignments.put(encodedName, serverName); if (last != null && isServerDeadAndNotProcessed(last)) { LOG.warn(encodedName + " moved to " + serverName + ", while it's previous host " + last + " is dead but not processed yet"); } } } // notify the change this.notifyAll(); } return regionState; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy