/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.replication.master;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.hbase.HBaseInterfaceAudience;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.master.HMaster;
import org.apache.hadoop.hbase.master.MasterServices;
import org.apache.hadoop.hbase.master.cleaner.BaseLogCleanerDelegate;
import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
import org.apache.hadoop.hbase.master.replication.ReplicationPeerManager;
import org.apache.hadoop.hbase.replication.ReplicationException;
import org.apache.hadoop.hbase.replication.ReplicationGroupOffset;
import org.apache.hadoop.hbase.replication.ReplicationOffsetUtil;
import org.apache.hadoop.hbase.replication.ReplicationPeerDescription;
import org.apache.hadoop.hbase.replication.ReplicationQueueData;
import org.apache.hadoop.hbase.replication.ReplicationQueueId;
import org.apache.hadoop.hbase.wal.AbstractFSWALProvider;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hbase.thirdparty.com.google.common.base.Predicate;
import org.apache.hbase.thirdparty.com.google.common.collect.Iterables;
import org.apache.hbase.thirdparty.org.apache.commons.collections4.MapUtils;
/**
* Implementation of a log cleaner that checks if a log is still scheduled for replication before
* deleting it when its TTL is over.
*
* The logic is a bit more complicated since we switched to table based replication queue storage;
* see the design doc in HBASE-27109 and the comments in HBASE-27214 for more details.
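*
* A hedged configuration sketch (an assumption, not taken from this class): the cleaner is normally
* wired into the master log cleaner chore through the {@code hbase.master.logcleaner.plugins}
* property, together with the default TTL based cleaner, for example:
*
* <pre>
* // a minimal sketch, assuming a plain HBase Configuration object
* Configuration conf = HBaseConfiguration.create();
* conf.set("hbase.master.logcleaner.plugins",
*   "org.apache.hadoop.hbase.master.cleaner.TimeToLiveLogCleaner,"
*     + "org.apache.hadoop.hbase.replication.master.ReplicationLogCleaner");
* </pre>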
*/
@InterfaceAudience.LimitedPrivate(HBaseInterfaceAudience.CONFIG)
public class ReplicationLogCleaner extends BaseLogCleanerDelegate {
private static final Logger LOG = LoggerFactory.getLogger(ReplicationLogCleaner.class);
private Set<ServerName> notFullyDeadServers;
private Set<String> peerIds;
// ServerName -> PeerId -> WalGroup -> Offset
// Here the server name is the source server name, so we can make sure that there is only one
// queue for a given peer, that's why we can use a String peerId as the key instead of a
// ReplicationQueueId.
private Map<ServerName, Map<String, Map<String, ReplicationGroupOffset>>> replicationOffsets;
private ReplicationPeerManager rpm;
private Supplier<Set<ServerName>> getNotFullyDeadServers;
private boolean canFilter;
private boolean stopped = false;
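// Runs once before each cleaner chore iteration: tries to take the replication log cleaner barrier
// (this fails if an AddPeerProcedure is running), then snapshots the not fully dead servers, the
// peer ids and all replication queue offsets used by the filter methods below.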
@Override
public void preClean() {
if (this.getConf() == null) {
return;
}
try {
if (!rpm.getQueueStorage().hasData()) {
return;
}
} catch (ReplicationException e) {
LOG.error("Error occurred while executing queueStorage.hasData()", e);
return;
}
canFilter = rpm.getReplicationLogCleanerBarrier().start();
if (canFilter) {
notFullyDeadServers = getNotFullyDeadServers.get();
peerIds = rpm.listPeers(null).stream().map(ReplicationPeerDescription::getPeerId)
.collect(Collectors.toSet());
// must get the not fully dead servers first and then get the replication queue data, in this
// way we can make sure that we have already added the missing replication queues for the dead
// region servers recorded in the above set, otherwise the logic in the
// filterForDeadRegionServer method may lead us to delete a wal that is still in use.
List<ReplicationQueueData> allQueueData;
try {
allQueueData = rpm.getQueueStorage().listAllQueues();
} catch (ReplicationException e) {
LOG.error("Can not list all replication queues, give up cleaning", e);
rpm.getReplicationLogCleanerBarrier().stop();
canFilter = false;
notFullyDeadServers = null;
peerIds = null;
return;
}
replicationOffsets = new HashMap<>();
for (ReplicationQueueData queueData : allQueueData) {
ReplicationQueueId queueId = queueData.getId();
ServerName serverName = queueId.getServerWALsBelongTo();
Map<String, Map<String, ReplicationGroupOffset>> peerId2Offsets =
replicationOffsets.computeIfAbsent(serverName, k -> new HashMap<>());
Map<String, ReplicationGroupOffset> offsets =
peerId2Offsets.computeIfAbsent(queueId.getPeerId(), k -> new HashMap<>());
offsets.putAll(queueData.getOffsets());
}
} else {
LOG.info("Skip replication log cleaner because an AddPeerProcedure is running");
}
}
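// Runs once after each cleaner chore iteration: releases the barrier taken in preClean and drops
// the cached snapshots so they can be garbage collected.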
@Override
public void postClean() {
if (canFilter) {
rpm.getReplicationLogCleanerBarrier().stop();
canFilter = false;
// release memory
notFullyDeadServers = null;
peerIds = null;
replicationOffsets = null;
}
}
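// A wal can be deleted for a given group offset only when the offset shows that replication of the
// wal has already finished.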
private boolean shouldDelete(ReplicationGroupOffset offset, FileStatus file) {
return !ReplicationOffsetUtil.shouldReplicate(offset, file.getPath().getName());
}
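// For a not fully dead server, every peer must have a replication queue recorded for this server
// and every one of those queues must be past the given wal, otherwise the wal is kept.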
private boolean filterForLiveRegionServer(ServerName serverName, FileStatus file) {
Map<String, Map<String, ReplicationGroupOffset>> peerId2Offsets =
replicationOffsets.get(serverName);
if (peerId2Offsets == null) {
// if there are replication queues missing, we can not delete the wal
return false;
}
for (String peerId : peerIds) {
Map<String, ReplicationGroupOffset> offsets = peerId2Offsets.get(peerId);
// if no replication queue for a peer, we can not delete the wal
if (offsets == null) {
return false;
}
String walGroupId = AbstractFSWALProvider.getWALPrefixFromWALName(file.getPath().getName());
ReplicationGroupOffset offset = offsets.get(walGroupId);
// if a replication queue still needs to replicate this wal, we can not delete it
if (!shouldDelete(offset, file)) {
return false;
}
}
// if all replication queues have already finished replicating this wal, we can delete it.
return true;
}
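// For a fully dead server, a missing queue means it has already been replicated and removed, so
// only the queues that still exist are checked against the given wal.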
private boolean filterForDeadRegionServer(ServerName serverName, FileStatus file) {
Map<String, Map<String, ReplicationGroupOffset>> peerId2Offsets =
replicationOffsets.get(serverName);
if (peerId2Offsets == null) {
// no replication queue for this dead rs, we can delete all wal files for it
return true;
}
for (String peerId : peerIds) {
Map<String, ReplicationGroupOffset> offsets = peerId2Offsets.get(peerId);
if (offsets == null) {
// for dead server, we only care about existing replication queues, as we will delete a
// queue after we finish replicating it.
continue;
}
String walGroupId = AbstractFSWALProvider.getWALPrefixFromWALName(file.getPath().getName());
ReplicationGroupOffset offset = offsets.get(walGroupId);
// if a replication queue still needs to replicate this wal, we can not delete it
if (!shouldDelete(offset, file)) {
return false;
}
}
// if all replication queues have already finished replicating this wal, we can delete it.
return true;
}
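// Entry point called by the cleaner chore: returns the subset of the given wal files that are safe
// to delete, based on the snapshots built in preClean.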
@Override
public Iterable<FileStatus> getDeletableFiles(Iterable<FileStatus> files) {
// all members of this class are null if replication is disabled,
// so we cannot filter the files
if (this.getConf() == null) {
return files;
}
if (!canFilter) {
// We can not delete anything if there is an AddPeerProcedure running at the same time
// See HBASE-27214 for more details.
return Collections.emptyList();
}
return Iterables.filter(files, new Predicate<FileStatus>() {
@Override
public boolean apply(FileStatus file) {
// just for overriding the findbugs NP warnings, as the parameter is marked as Nullable in
// the guava Predicate.
if (file == null) {
return false;
}
if (peerIds.isEmpty()) {
// no peer, can always delete
return true;
}
// not a valid wal file name, delete
if (!AbstractFSWALProvider.validateWALFilename(file.getPath().getName())) {
return true;
}
// meta wal is always deletable as we will never replicate it
if (AbstractFSWALProvider.isMetaFile(file.getPath())) {
return true;
}
ServerName serverName =
AbstractFSWALProvider.parseServerNameFromWALName(file.getPath().getName());
if (notFullyDeadServers.contains(serverName)) {
return filterForLiveRegionServer(serverName, file);
} else {
return filterForDeadRegionServer(serverName, file);
}
}
});
}
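// A server is considered not fully dead while it is still online or while there is an unfinished
// ServerCrashProcedure for it, i.e. its replication queues may not have been fully claimed yet.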
private Set<ServerName> getNotFullyDeadServers(MasterServices services) {
List<ServerName> onlineServers = services.getServerManager().getOnlineServersList();
return Stream.concat(onlineServers.stream(),
services.getMasterProcedureExecutor().getProcedures().stream()
.filter(p -> p instanceof ServerCrashProcedure).filter(p -> !p.isFinished())
.map(p -> ((ServerCrashProcedure) p).getServerName()))
.collect(Collectors.toSet());
}
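// The cleaner is only usable inside the active master process: it needs the MasterServices
// instance to reach the ReplicationPeerManager, the ServerManager and the procedure executor.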
@Override
public void init(Map<String, Object> params) {
super.init(params);
if (MapUtils.isNotEmpty(params)) {
Object master = params.get(HMaster.MASTER);
if (master != null && master instanceof MasterServices) {
MasterServices m = (MasterServices) master;
rpm = m.getReplicationPeerManager();
getNotFullyDeadServers = () -> getNotFullyDeadServers(m);
return;
}
}
throw new IllegalArgumentException("Missing " + HMaster.MASTER + " parameter");
}
@Override
public void stop(String why) {
this.stopped = true;
}
@Override
public boolean isStopped() {
return this.stopped;
}
}