/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.master.replication;
import static org.apache.hadoop.hbase.replication.ReplicationUtils.getPeerRemoteWALDir;
import static org.apache.hadoop.hbase.replication.ReplicationUtils.getPeerReplayWALDir;
import static org.apache.hadoop.hbase.replication.ReplicationUtils.getPeerSnapshotWALDir;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.master.MasterServices;
import org.apache.hadoop.hbase.master.ServerListener;
import org.apache.hadoop.hbase.master.ServerManager;
import org.apache.hadoop.hbase.master.procedure.MasterProcedureScheduler;
import org.apache.hadoop.hbase.procedure2.Procedure;
import org.apache.hadoop.hbase.procedure2.ProcedureEvent;
import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
import org.apache.hadoop.hbase.replication.ReplicationException;
import org.apache.hadoop.hbase.replication.ReplicationUtils;
import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * The manager for replaying remote wals.
 *
 * First, it is used to balance the replay work across all the region servers. We record the
 * region servers which are already replaying wals, and do not send new replay work to them until
 * their previous replay work is done, at which point we remove them from the used worker set. See
 * the comment on {@code UsedReplayWorkersForPeer} for more details.
 *
 * Second, the logic for managing the remote wal directory is kept here. Before replaying the
 * wals, we rename the remote wal directory to the 'replay' directory, see
 * {@link #renameToPeerReplayWALDir(String)}. This prevents further writing of remote wals, which
 * is very important for keeping consistency. Then we start replaying all the wals; once a wal has
 * been replayed, we truncate the file, so that if a crash happens we do not need to replay all
 * the wals again, see {@link #finishReplayWAL(String)} and {@link #isReplayWALFinished(String)}.
 * After replaying all the wals, we rename the 'replay' directory to the 'snapshot' directory. In
 * that directory we keep the names of all the wals that have been replayed, since all the files
 * should have been truncated. Later, when the original ACTIVE cluster is transited to STANDBY and
 * there are region server crashes, we check the wals in this directory to determine whether a wal
 * should be split and replayed or not. See the code in
 * {@link org.apache.hadoop.hbase.regionserver.SplitLogWorker} for more details.
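 *
 * A simplified sketch of the flow described above, assuming a {@code manager} instance of this
 * class; the real orchestration lives in the related procedures, and worker scheduling and error
 * handling are omitted here:
 *
 * <pre>
 * manager.renameToPeerReplayWALDir(peerId); // remote dir becomes the 'replay' dir
 * for (Path wal : manager.getReplayWALsAndCleanUpUnusedFiles(peerId)) {
 *   String relativeWAL = manager.removeWALRootPath(wal);
 *   if (!manager.isReplayWALFinished(relativeWAL)) {
 *     // dispatch the wal to a region server for replaying, and once it is done:
 *     manager.finishReplayWAL(relativeWAL); // truncate so we will not replay it again after a crash
 *   }
 * }
 * manager.renameToPeerSnapshotWALDir(peerId); // 'replay' dir becomes the 'snapshot' dir
 * </pre>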
*/
@InterfaceAudience.Private
public class SyncReplicationReplayWALManager {
private static final Logger LOG = LoggerFactory.getLogger(SyncReplicationReplayWALManager.class);
private final ServerManager serverManager;
private final FileSystem fs;
private final Path walRootDir;
private final Path remoteWALDir;
/**
 * This class is used to record the used workers (region servers) for a replication peer. To
 * balance the remote wal replay work, we only schedule one replay procedure on a region server at
 * a time. So when acquiring a worker, we first get all the region servers of this cluster, and
 * then filter out the used ones.
 *
 * The {@link ProcedureEvent} is used to notify procedures that there are available workers again.
 * We used to sleep and retry instead, but if the interval is too large, for example with
 * exponential backoff, it is not responsive enough, and if the interval is too small, we will
 * flood the procedure wal.
 *
 * The state is only kept in memory, so when restarting we need to reconstruct it from the
 * information stored in the related procedures. See the {@code afterReplay} method in
 * {@link RecoverStandbyProcedure} and {@link SyncReplicationReplayWALProcedure} for more
 * details.
*/
private static final class UsedReplayWorkersForPeer {
private final Set<ServerName> usedWorkers = new HashSet<>();
private final ProcedureEvent<?> event;
public UsedReplayWorkersForPeer(String peerId) {
this.event = new ProcedureEvent<>(peerId);
}
public void used(ServerName worker) {
usedWorkers.add(worker);
}
public Optional<ServerName> acquire(ServerManager serverManager) {
Optional<ServerName> worker = serverManager.getOnlineServers().keySet().stream()
.filter(server -> !usedWorkers.contains(server)).findAny();
worker.ifPresent(usedWorkers::add);
return worker;
}
public void release(ServerName worker) {
usedWorkers.remove(worker);
}
public void suspend(Procedure<?> proc) {
event.suspend();
event.suspendIfNotReady(proc);
}
public void wake(MasterProcedureScheduler scheduler) {
if (!event.isReady()) {
event.wake(scheduler);
}
}
}
private final ConcurrentMap<String, UsedReplayWorkersForPeer> usedWorkersByPeer =
new ConcurrentHashMap<>();
public SyncReplicationReplayWALManager(MasterServices services)
throws IOException, ReplicationException {
this.serverManager = services.getServerManager();
this.fs = services.getMasterFileSystem().getWALFileSystem();
this.walRootDir = services.getMasterFileSystem().getWALRootDir();
this.remoteWALDir = new Path(this.walRootDir, ReplicationUtils.REMOTE_WAL_DIR_NAME);
serverManager.registerListener(new ServerListener() {
@Override
public void serverAdded(ServerName serverName) {
MasterProcedureScheduler scheduler =
services.getMasterProcedureExecutor().getEnvironment().getProcedureScheduler();
for (UsedReplayWorkersForPeer usedWorkers : usedWorkersByPeer.values()) {
synchronized (usedWorkers) {
usedWorkers.wake(scheduler);
}
}
}
});
}
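/**
 * Create the in memory record of used replay workers for the given peer, so that replay work can
 * be scheduled for it.
 */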
public void registerPeer(String peerId) {
usedWorkersByPeer.put(peerId, new UsedReplayWorkersForPeer(peerId));
}
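/**
 * Remove the in memory record of used replay workers for the given peer.
 */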
public void unregisterPeer(String peerId) {
usedWorkersByPeer.remove(peerId);
}
/**
 * Get a worker for replaying remote wal for a given peer. If no worker is available, i.e., all
 * the region servers have already been used for other replay work, a
 * {@link ProcedureSuspendedException} will be thrown to suspend the procedure. It will be woken
 * up later when there are available workers, either because another procedure releases a worker,
 * or because a new region server joins the cluster.
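 *
 * A rough sketch of the expected acquire/release pairing from a procedure, assuming the manager
 * and the scheduler are obtained from the master environment (variable names are illustrative
 * only):
 *
 * <pre>
 * ServerName worker = manager.acquirePeerWorker(peerId, this); // may throw ProcedureSuspendedException
 * try {
 *   // dispatch the replay work for this peer to the worker
 * } finally {
 *   manager.releasePeerWorker(peerId, worker, scheduler);
 * }
 * </pre>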
*/
public ServerName acquirePeerWorker(String peerId, Procedure<?> proc)
throws ProcedureSuspendedException {
UsedReplayWorkersForPeer usedWorkers = usedWorkersByPeer.get(peerId);
synchronized (usedWorkers) {
Optional<ServerName> worker = usedWorkers.acquire(serverManager);
if (worker.isPresent()) {
return worker.get();
}
// no worker available right now, suspend the procedure
usedWorkers.suspend(proc);
}
throw new ProcedureSuspendedException();
}
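/**
 * Return a worker to the pool of the given peer and wake up the procedures which are waiting for
 * a worker.
 */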
public void releasePeerWorker(String peerId, ServerName worker,
MasterProcedureScheduler scheduler) {
UsedReplayWorkersForPeer usedWorkers = usedWorkersByPeer.get(peerId);
synchronized (usedWorkers) {
usedWorkers.release(worker);
usedWorkers.wake(scheduler);
}
}
/**
* Will only be called when loading procedures, where we need to construct the used worker set for
* each peer.
*/
public void addUsedPeerWorker(String peerId, ServerName worker) {
usedWorkersByPeer.get(peerId).used(worker);
}
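/**
 * Create the remote wal directory for the given peer if it does not exist yet.
 */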
public void createPeerRemoteWALDir(String peerId) throws IOException {
Path peerRemoteWALDir = getPeerRemoteWALDir(remoteWALDir, peerId);
if (!fs.exists(peerRemoteWALDir) && !fs.mkdirs(peerRemoteWALDir)) {
throw new IOException("Unable to mkdir " + peerRemoteWALDir);
}
}
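/**
 * Rename a directory for the given peer. If the source directory is gone but the destination
 * already exists, the rename is assumed to have been done by a previous attempt and the call
 * succeeds, which keeps the operation idempotent.
 */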
private void rename(Path src, Path dst, String peerId) throws IOException {
if (fs.exists(src)) {
deleteDir(dst, peerId);
if (!fs.rename(src, dst)) {
throw new IOException(
"Failed to rename dir from " + src + " to " + dst + " for peer id=" + peerId);
}
LOG.info("Renamed dir from {} to {} for peer id={}", src, dst, peerId);
} else if (!fs.exists(dst)) {
throw new IOException(
"Want to rename from " + src + " to " + dst + ", but they both do not exist");
}
}
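/**
 * Rename the remote wal directory of the given peer to the 'replay' directory, so that no more
 * remote wals can be written into it.
 */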
public void renameToPeerReplayWALDir(String peerId) throws IOException {
rename(getPeerRemoteWALDir(remoteWALDir, peerId), getPeerReplayWALDir(remoteWALDir, peerId),
peerId);
}
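/**
 * Rename the 'replay' directory of the given peer to the 'snapshot' directory, after all the
 * wals in it have been replayed.
 */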
public void renameToPeerSnapshotWALDir(String peerId) throws IOException {
rename(getPeerReplayWALDir(remoteWALDir, peerId), getPeerSnapshotWALDir(remoteWALDir, peerId),
peerId);
}
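/**
 * Return the wals to be replayed under the 'replay' directory of the given peer. Files ending
 * with {@code ReplicationUtils.RENAME_WAL_SUFFIX} are first renamed back by stripping the suffix,
 * then all files ending with {@code ReplicationUtils.SYNC_WAL_SUFFIX} are returned and any other
 * files are deleted.
 */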
public List<Path> getReplayWALsAndCleanUpUnusedFiles(String peerId) throws IOException {
Path peerReplayWALDir = getPeerReplayWALDir(remoteWALDir, peerId);
for (FileStatus status : fs.listStatus(peerReplayWALDir,
p -> p.getName().endsWith(ReplicationUtils.RENAME_WAL_SUFFIX))) {
Path src = status.getPath();
String srcName = src.getName();
String dstName =
srcName.substring(0, srcName.length() - ReplicationUtils.RENAME_WAL_SUFFIX.length());
FSUtils.renameFile(fs, src, new Path(src.getParent(), dstName));
}
List<Path> wals = new ArrayList<>();
for (FileStatus status : fs.listStatus(peerReplayWALDir)) {
Path path = status.getPath();
if (path.getName().endsWith(ReplicationUtils.SYNC_WAL_SUFFIX)) {
wals.add(path);
} else {
if (!fs.delete(path, true)) {
LOG.warn("Can not delete unused file: " + path);
}
}
}
return wals;
}
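/**
 * Delete a directory for the given peer, treating an already missing directory as a success.
 */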
private void deleteDir(Path dir, String peerId) throws IOException {
if (!fs.delete(dir, true) && fs.exists(dir)) {
throw new IOException("Failed to remove dir " + dir + " for peer id=" + peerId);
}
}
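/**
 * Remove the remote wal, 'replay' and 'snapshot' directories of the given peer.
 */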
public void removePeerRemoteWALs(String peerId) throws IOException {
deleteDir(getPeerRemoteWALDir(remoteWALDir, peerId), peerId);
deleteDir(getPeerReplayWALDir(remoteWALDir, peerId), peerId);
deleteDir(getPeerSnapshotWALDir(remoteWALDir, peerId), peerId);
}
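/**
 * Convert an absolute wal path to a path relative to the wal root directory, also removing the
 * leading path separator.
 */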
public String removeWALRootPath(Path path) {
String pathStr = path.toString();
// remove the "/" too.
return pathStr.substring(walRootDir.toString().length() + 1);
}
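/**
 * Mark a wal as successfully replayed by truncating it to zero length.
 */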
public void finishReplayWAL(String wal) throws IOException {
Path walPath = new Path(walRootDir, wal);
fs.truncate(walPath, 0);
}
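/**
 * Return whether the given wal has already been replayed, i.e. whether it has been truncated to
 * zero length.
 */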
public boolean isReplayWALFinished(String wal) throws IOException {
Path walPath = new Path(walRootDir, wal);
return fs.getFileStatus(walPath).getLen() == 0;
}
public Path getRemoteWALDir() {
return remoteWALDir;
}
}