/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.replication.master;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.hbase.HBaseInterfaceAudience;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.master.HMaster;
import org.apache.hadoop.hbase.master.MasterServices;
import org.apache.hadoop.hbase.master.cleaner.BaseLogCleanerDelegate;
import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
import org.apache.hadoop.hbase.master.replication.ReplicationPeerManager;
import org.apache.hadoop.hbase.replication.ReplicationException;
import org.apache.hadoop.hbase.replication.ReplicationGroupOffset;
import org.apache.hadoop.hbase.replication.ReplicationOffsetUtil;
import org.apache.hadoop.hbase.replication.ReplicationPeerDescription;
import org.apache.hadoop.hbase.replication.ReplicationQueueData;
import org.apache.hadoop.hbase.replication.ReplicationQueueId;
import org.apache.hadoop.hbase.wal.AbstractFSWALProvider;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hbase.thirdparty.com.google.common.base.Predicate;
import org.apache.hbase.thirdparty.com.google.common.collect.Iterables;
import org.apache.hbase.thirdparty.org.apache.commons.collections4.MapUtils;
/**
* Implementation of a log cleaner that checks if a log is still scheduled for replication before
* deleting it when its TTL is over.
*
* The logic is a bit more complicated since we switched to table based replication queue storage;
* see the design doc in HBASE-27109 and the comments in HBASE-27214 for more details.
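*
* A hedged configuration sketch (an assumption, not taken from this class): the cleaner is normally
* wired into the master log cleaner chore through the {@code hbase.master.logcleaner.plugins}
* property, together with the default TTL based cleaner, for example:
*
* <pre>
* // a minimal sketch, assuming a plain HBase Configuration object
* Configuration conf = HBaseConfiguration.create();
* conf.set("hbase.master.logcleaner.plugins",
*   "org.apache.hadoop.hbase.master.cleaner.TimeToLiveLogCleaner,"
*     + "org.apache.hadoop.hbase.replication.master.ReplicationLogCleaner");
* </pre>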
*/
@InterfaceAudience.LimitedPrivate(HBaseInterfaceAudience.CONFIG)
public class ReplicationLogCleaner extends BaseLogCleanerDelegate {
private static final Logger LOG = LoggerFactory.getLogger(ReplicationLogCleaner.class);
private Set<ServerName> notFullyDeadServers;
private Set<String> peerIds;
// ServerName -> PeerId -> WalGroup -> Offset
// Here the server name is the source server name, so we can make sure that there is only one
// queue for a given peer, that's why we can use a String peerId as the key instead of a
// ReplicationQueueId.
private Map<ServerName, Map<String, Map<String, ReplicationGroupOffset>>> replicationOffsets;
private ReplicationPeerManager rpm;
private Supplier<Set<ServerName>> getNotFullyDeadServers;
private boolean canFilter;
private boolean stopped = false;
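// Runs once before each cleaner chore iteration: tries to take the replication log cleaner barrier
// (this fails if an AddPeerProcedure is running), then snapshots the not fully dead servers, the
// peer ids and all replication queue offsets used by the filter methods below.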
@Override
public void preClean() {
if (this.getConf() == null) {
return;
}
try {
if (!rpm.getQueueStorage().hasData()) {
return;
}
} catch (ReplicationException e) {
LOG.error("Error occurred while executing queueStorage.hasData()", e);
return;
}
canFilter = rpm.getReplicationLogCleanerBarrier().start();
if (canFilter) {
notFullyDeadServers = getNotFullyDeadServers.get();
peerIds = rpm.listPeers(null).stream().map(ReplicationPeerDescription::getPeerId)
.collect(Collectors.toSet());
// must get the not fully dead servers first and then get the replication queue data, in this
// way we can make sure that we have already added the missing replication queues for the dead
// region servers recorded in the above set, otherwise the logic in the
// filterForDeadRegionServer method may lead us to delete a wal that is still in use.
List<ReplicationQueueData> allQueueData;
try {
allQueueData = rpm.getQueueStorage().listAllQueues();
} catch (ReplicationException e) {
LOG.error("Can not list all replication queues, give up cleaning", e);
rpm.getReplicationLogCleanerBarrier().stop();
canFilter = false;
notFullyDeadServers = null;
peerIds = null;
return;
}
replicationOffsets = new HashMap<>();
for (ReplicationQueueData queueData : allQueueData) {
ReplicationQueueId queueId = queueData.getId();
ServerName serverName = queueId.getServerWALsBelongTo();
Map<String, Map<String, ReplicationGroupOffset>> peerId2Offsets =
replicationOffsets.computeIfAbsent(serverName, k -> new HashMap<>());
Map<String, ReplicationGroupOffset> offsets =
peerId2Offsets.computeIfAbsent(queueId.getPeerId(), k -> new HashMap<>());
offsets.putAll(queueData.getOffsets());
}
} else {
LOG.info("Skip replication log cleaner because an AddPeerProcedure is running");
}
}
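// Runs once after each cleaner chore iteration: releases the barrier taken in preClean and drops
// the cached snapshots so they can be garbage collected.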
@Override
public void postClean() {
if (canFilter) {
rpm.getReplicationLogCleanerBarrier().stop();
canFilter = false;
// release memory
notFullyDeadServers = null;
peerIds = null;
replicationOffsets = null;
}
}
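// A wal can be deleted for a given group offset only when the offset shows that replication of the
// wal has already finished.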
private boolean shouldDelete(ReplicationGroupOffset offset, FileStatus file) {
return !ReplicationOffsetUtil.shouldReplicate(offset, file.getPath().getName());
}
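// For a not fully dead server, every peer must have a replication queue recorded for this server
// and every one of those queues must be past the given wal, otherwise the wal is kept.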
private boolean filterForLiveRegionServer(ServerName serverName, FileStatus file) {
Map<String, Map<String, ReplicationGroupOffset>> peerId2Offsets =
replicationOffsets.get(serverName);
if (peerId2Offsets == null) {
// if there are replication queues missing, we can not delete the wal
return false;
}
for (String peerId : peerIds) {
Map<String, ReplicationGroupOffset> offsets = peerId2Offsets.get(peerId);
// if no replication queue for a peer, we can not delete the wal
if (offsets == null) {
return false;
}
String walGroupId = AbstractFSWALProvider.getWALPrefixFromWALName(file.getPath().getName());
ReplicationGroupOffset offset = offsets.get(walGroupId);
// if a replication queue still needs to replicate this wal, we can not delete it
if (!shouldDelete(offset, file)) {
return false;
}
}
// if all replication queues have already finished replicating this wal, we can delete it.
return true;
}
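// For a fully dead server, a missing queue means it has already been replicated and removed, so
// only the queues that still exist are checked against the given wal.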
private boolean filterForDeadRegionServer(ServerName serverName, FileStatus file) {
Map<String, Map<String, ReplicationGroupOffset>> peerId2Offsets =
replicationOffsets.get(serverName);
if (peerId2Offsets == null) {
// no replication queue for this dead rs, we can delete all wal files for it
return true;
}
for (String peerId : peerIds) {
Map<String, ReplicationGroupOffset> offsets = peerId2Offsets.get(peerId);
if (offsets == null) {
// for dead server, we only care about existing replication queues, as we will delete a
// queue after we finish replicating it.
continue;
}
String walGroupId = AbstractFSWALProvider.getWALPrefixFromWALName(file.getPath().getName());
ReplicationGroupOffset offset = offsets.get(walGroupId);
// if a replication queue still needs to replicate this wal, we can not delete it
if (!shouldDelete(offset, file)) {
return false;
}
}
// if all replication queues have already finished replicating this wal, we can delete it.
return true;
}
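// Entry point called by the cleaner chore: returns the subset of the given wal files that are safe
// to delete, based on the snapshots built in preClean.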
@Override
public Iterable<FileStatus> getDeletableFiles(Iterable<FileStatus> files) {
// all members of this class are null if replication is disabled,
// so we cannot filter the files
if (this.getConf() == null) {
return files;
}
if (!canFilter) {
// We can not delete anything if there is an AddPeerProcedure running at the same time
// See HBASE-27214 for more details.
return Collections.emptyList();
}
return Iterables.filter(files, new Predicate<FileStatus>() {
@Override
public boolean apply(FileStatus file) {
// just for overriding the findbugs NP warnings, as the parameter is marked as Nullable in
// the guava Predicate.
if (file == null) {
return false;
}
if (peerIds.isEmpty()) {
// no peer, can always delete
return true;
}
// not a valid wal file name, delete
if (!AbstractFSWALProvider.validateWALFilename(file.getPath().getName())) {
return true;
}
// meta wal is always deletable as we will never replicate it
if (AbstractFSWALProvider.isMetaFile(file.getPath())) {
return true;
}
ServerName serverName =
AbstractFSWALProvider.parseServerNameFromWALName(file.getPath().getName());
if (notFullyDeadServers.contains(serverName)) {
return filterForLiveRegionServer(serverName, file);
} else {
return filterForDeadRegionServer(serverName, file);
}
}
});
}
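// A server is considered not fully dead while it is still online or while there is an unfinished
// ServerCrashProcedure for it, i.e. its replication queues may not have been fully claimed yet.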
private Set<ServerName> getNotFullyDeadServers(MasterServices services) {
List<ServerName> onlineServers = services.getServerManager().getOnlineServersList();
return Stream.concat(onlineServers.stream(),
services.getMasterProcedureExecutor().getProcedures().stream()
.filter(p -> p instanceof ServerCrashProcedure).filter(p -> !p.isFinished())
.map(p -> ((ServerCrashProcedure) p).getServerName()))
.collect(Collectors.toSet());
}
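// The cleaner is only usable inside the active master process: it needs the MasterServices
// instance to reach the ReplicationPeerManager, the ServerManager and the procedure executor.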
@Override
public void init(Map<String, Object> params) {
super.init(params);
if (MapUtils.isNotEmpty(params)) {
Object master = params.get(HMaster.MASTER);
if (master != null && master instanceof MasterServices) {
MasterServices m = (MasterServices) master;
rpm = m.getReplicationPeerManager();
getNotFullyDeadServers = () -> getNotFullyDeadServers(m);
return;
}
}
throw new IllegalArgumentException("Missing " + HMaster.MASTER + " parameter");
}
@Override
public void stop(String why) {
this.stopped = true;
}
@Override
public boolean isStopped() {
return this.stopped;
}
}