/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.master;
import static org.apache.hadoop.hbase.master.SplitLogManager.ResubmitDirective.CHECK;
import static org.apache.hadoop.hbase.master.SplitLogManager.ResubmitDirective.FORCE;
import static org.apache.hadoop.hbase.master.SplitLogManager.TerminationStatus.DELETED;
import static org.apache.hadoop.hbase.master.SplitLogManager.TerminationStatus.FAILURE;
import static org.apache.hadoop.hbase.master.SplitLogManager.TerminationStatus.IN_PROGRESS;
import static org.apache.hadoop.hbase.master.SplitLogManager.TerminationStatus.SUCCESS;
import java.io.IOException;
import java.io.InterruptedIOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.ReentrantLock;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.hbase.Chore;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.SplitLogCounters;
import org.apache.hadoop.hbase.SplitLogTask;
import org.apache.hadoop.hbase.Stoppable;
import org.apache.hadoop.hbase.exceptions.DeserializationException;
import org.apache.hadoop.hbase.io.hfile.HFile;
import org.apache.hadoop.hbase.master.SplitLogManager.TaskFinisher.Status;
import org.apache.hadoop.hbase.monitoring.MonitoredTask;
import org.apache.hadoop.hbase.monitoring.TaskMonitor;
import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos.RegionStoreSequenceIds;
import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos.SplitLogTask.RecoveryMode;
import org.apache.hadoop.hbase.regionserver.SplitLogWorker;
import org.apache.hadoop.hbase.regionserver.wal.HLogSplitter;
import org.apache.hadoop.hbase.regionserver.wal.HLogUtil;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.hbase.util.Threads;
import org.apache.hadoop.hbase.zookeeper.ZKSplitLog;
import org.apache.hadoop.hbase.zookeeper.ZKUtil;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperListener;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
import org.apache.hadoop.util.StringUtils;
import org.apache.zookeeper.AsyncCallback;
import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.KeeperException.NoNodeException;
import org.apache.zookeeper.ZooDefs.Ids;
import org.apache.zookeeper.data.Stat;
import com.google.common.annotations.VisibleForTesting;
/**
* Distributes the task of log splitting to the available region servers.
* Coordination happens via zookeeper. For every log file that has to be split a
* znode is created under /hbase/splitlog. SplitLogWorkers race to grab a task.
*
* <p>SplitLogManager monitors the task znodes that it creates using the
* timeoutMonitor thread. If a task's progress is slow then
* {@link #resubmit(String, Task, ResubmitDirective)} will take away the task from the owner
* {@link SplitLogWorker} and the task will be up for grabs again. When the task is done then the
* task's znode is deleted by SplitLogManager.
*
* <p>Clients call {@link #splitLogDistributed(Path)} to split a region server's
* log files. The caller thread waits in this method until all the log files
* have been split.
*
* <p>All the zookeeper calls made by this class are asynchronous. This is mainly
* to help reduce the response time seen by the callers.
*
* <p>There is a race in this design between the SplitLogManager and the
* SplitLogWorker. SplitLogManager might re-queue a task that has in reality
* already been completed by a SplitLogWorker. We rely on the idempotency of
* the log splitting task for correctness.
*
* <p>It is also assumed that every log splitting task is unique and once
* completed (either with success or with error) it will not be submitted
* again. If a task is resubmitted then there is a risk that the old "delete task"
* can delete the re-submission.
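*
* <p>A minimal usage sketch (illustrative only; in practice the master's file system
* layer constructs the manager, and {@code zkw}, {@code stopper}, {@code masterServices}
* and {@code logDir} are assumed to come from that context):
* <pre>{@code
* SplitLogManager slm = new SplitLogManager(zkw, conf, stopper, masterServices,
*     serverName, false);
* // blocks until every log file under logDir has been split by some worker
* long bytesSplit = slm.splitLogDistributed(logDir);
* }</pre>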
*/
@InterfaceAudience.Private
public class SplitLogManager extends ZooKeeperListener {
private static final Log LOG = LogFactory.getLog(SplitLogManager.class);
public static final int DEFAULT_TIMEOUT = 120000;
public static final int DEFAULT_ZK_RETRIES = 3;
public static final int DEFAULT_MAX_RESUBMIT = 3;
public static final int DEFAULT_UNASSIGNED_TIMEOUT = (3 * 60 * 1000); //3 min
private final Stoppable stopper;
private final MasterServices master;
private final ServerName serverName;
private final TaskFinisher taskFinisher;
private FileSystem fs;
private Configuration conf;
private long zkretries;
private long resubmit_threshold;
private long timeout;
private long unassignedTimeout;
private long lastTaskCreateTime = Long.MAX_VALUE;
public boolean ignoreZKDeleteForTesting = false;
private volatile long lastRecoveringNodeCreationTime = 0;
// When lastRecoveringNodeCreationTime is older than the following threshold, we'll check
// whether to GC stale recovering znodes
private long checkRecoveringTimeThreshold = 15000; // 15 seconds
private final List<Pair<Set<ServerName>, Boolean>> failedRecoveringRegionDeletions = Collections
.synchronizedList(new ArrayList<Pair<Set<ServerName>, Boolean>>());
/**
* In distributedLogReplay mode, we need touch both splitlog and recovering-regions znodes in one
* operation. So the lock is used to guard such cases.
*/
protected final ReentrantLock recoveringRegionLock = new ReentrantLock();
private volatile RecoveryMode recoveryMode;
private volatile boolean isDrainingDone = false;
private final ConcurrentMap<String, Task> tasks = new ConcurrentHashMap<String, Task>();
private TimeoutMonitor timeoutMonitor;
private volatile Set<ServerName> deadWorkers = null;
private final Object deadWorkersLock = new Object();
private Set<String> failedDeletions = null;
/**
* Wrapper around {@link #SplitLogManager(ZooKeeperWatcher zkw, Configuration conf,
* Stoppable stopper, MasterServices master, ServerName serverName,
* boolean masterRecovery, TaskFinisher tf)}
* that provides a task finisher for copying recovered edits to their final destination.
* The task finisher has to be robust because it can be arbitrarily restarted or called
* multiple times.
*
* @param zkw the ZK watcher
* @param conf the HBase configuration
* @param stopper the stoppable in case anything is wrong
* @param master the master services
* @param serverName the master server name
* @param masterRecovery an indication if the master is in recovery
* @throws KeeperException
* @throws InterruptedIOException
*/
public SplitLogManager(ZooKeeperWatcher zkw, final Configuration conf,
Stoppable stopper, MasterServices master, ServerName serverName, boolean masterRecovery)
throws InterruptedIOException, KeeperException {
this(zkw, conf, stopper, master, serverName, masterRecovery, new TaskFinisher() {
@Override
public Status finish(ServerName workerName, String logfile) {
try {
HLogSplitter.finishSplitLogFile(logfile, conf);
} catch (IOException e) {
LOG.warn("Could not finish splitting of log file " + logfile, e);
return Status.ERR;
}
return Status.DONE;
}
});
}
/**
* It's OK to construct this object even when region-servers are not online. It
* does look up orphan tasks in zk but it doesn't block waiting for them
* to be done.
*
* @param zkw the ZK watcher
* @param conf the HBase configuration
* @param stopper the stoppable in case anything is wrong
* @param master the master services
* @param serverName the master server name
* @param masterRecovery an indication if the master is in recovery
* @param tf task finisher
* @throws KeeperException
* @throws InterruptedIOException
*/
public SplitLogManager(ZooKeeperWatcher zkw, Configuration conf,
Stoppable stopper, MasterServices master,
ServerName serverName, boolean masterRecovery, TaskFinisher tf)
throws InterruptedIOException, KeeperException {
super(zkw);
this.taskFinisher = tf;
this.conf = conf;
this.stopper = stopper;
this.master = master;
this.zkretries = conf.getLong("hbase.splitlog.zk.retries", DEFAULT_ZK_RETRIES);
this.resubmit_threshold = conf.getLong("hbase.splitlog.max.resubmit", DEFAULT_MAX_RESUBMIT);
this.timeout = conf.getInt("hbase.splitlog.manager.timeout", DEFAULT_TIMEOUT);
this.unassignedTimeout =
conf.getInt("hbase.splitlog.manager.unassigned.timeout", DEFAULT_UNASSIGNED_TIMEOUT);
// Determine recovery mode
setRecoveryMode(true);
LOG.info("Timeout=" + timeout + ", unassigned timeout=" + unassignedTimeout +
", distributedLogReplay=" + (this.recoveryMode == RecoveryMode.LOG_REPLAY));
this.serverName = serverName;
this.timeoutMonitor = new TimeoutMonitor(
conf.getInt("hbase.splitlog.manager.timeoutmonitor.period", 1000), stopper);
this.failedDeletions = Collections.synchronizedSet(new HashSet<String>());
if (!masterRecovery) {
Threads.setDaemonThreadRunning(timeoutMonitor.getThread(), serverName
+ ".splitLogManagerTimeoutMonitor");
}
// Watcher can be null during tests with Mock'd servers.
if (this.watcher != null) {
this.watcher.registerListener(this);
lookForOrphans();
}
}
private FileStatus[] getFileList(List<Path> logDirs, PathFilter filter) throws IOException {
List<FileStatus> fileStatus = new ArrayList<FileStatus>();
for (Path hLogDir : logDirs) {
this.fs = hLogDir.getFileSystem(conf);
if (!fs.exists(hLogDir)) {
LOG.warn(hLogDir + " doesn't exist. Nothing to do!");
continue;
}
FileStatus[] logfiles = FSUtils.listStatus(fs, hLogDir, filter);
if (logfiles == null || logfiles.length == 0) {
LOG.info(hLogDir + " is empty dir, no logs to split");
} else {
Collections.addAll(fileStatus, logfiles);
}
}
FileStatus[] a = new FileStatus[fileStatus.size()];
return fileStatus.toArray(a);
}
/**
* @param logDir one region server hlog dir path in .logs
* @throws IOException if there was an error while splitting any log file
* @return cumulative size of the logfiles split
*/
public long splitLogDistributed(final Path logDir) throws IOException {
List<Path> logDirs = new ArrayList<Path>();
logDirs.add(logDir);
return splitLogDistributed(logDirs);
}
/**
* The caller will block until all the log files of the given region server
* have been processed - successfully split or an error is encountered - by an
* available worker region server. This method must only be called after the
* region servers have been brought online.
*
* @param logDirs List of log dirs to split
* @throws IOException If there was an error while splitting any log file
* @return cumulative size of the logfiles split
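*
* <p>An illustrative call, assuming the caller has collected the WAL directories of the
* failed servers (the paths shown are hypothetical):
* <pre>{@code
* List<Path> logDirs = new ArrayList<Path>();
* logDirs.add(new Path("/hbase/.logs/serverA,16020,1400000000000"));
* logDirs.add(new Path("/hbase/.logs/serverB,16020,1400000000000"));
* long bytesSplit = splitLogDistributed(logDirs);
* }</pre>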
*/
public long splitLogDistributed(final List<Path> logDirs) throws IOException {
if (logDirs.isEmpty()) {
return 0;
}
Set<ServerName> serverNames = new HashSet<ServerName>();
for (Path logDir : logDirs) {
try {
ServerName serverName = HLogUtil.getServerNameFromHLogDirectoryName(logDir);
if (serverName != null) {
serverNames.add(serverName);
}
} catch (IllegalArgumentException e) {
// ignore invalid format error.
LOG.warn("Cannot parse server name from " + logDir);
}
}
return splitLogDistributed(serverNames, logDirs, null);
}
/**
* The caller will block until all the log files selected by the given filter for the given
* region servers have been processed - successfully split or an error is encountered - by an
* available worker region server. This method must only be called after the
* region servers have been brought online.
*
* @param serverNames a set of failed region server names whose logs are being split
* @param logDirs List of log dirs to split
* @param filter the Path filter to select specific files for considering
* @throws IOException If there was an error while splitting any log file
* @return cumulative size of the logfiles split
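*
* <p>For example, meta logs can be recovered ahead of user-region logs by passing the meta
* filter (a sketch; {@code serverNames} and {@code logDirs} as computed by the caller):
* <pre>{@code
* long bytes = splitLogDistributed(serverNames, logDirs, MasterFileSystem.META_FILTER);
* }</pre>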
*/
public long splitLogDistributed(final Set<ServerName> serverNames, final List<Path> logDirs,
PathFilter filter) throws IOException {
MonitoredTask status = TaskMonitor.get().createStatus(
"Doing distributed log split in " + logDirs);
FileStatus[] logfiles = getFileList(logDirs, filter);
status.setStatus("Checking directory contents...");
LOG.debug("Scheduling batch of logs to split");
SplitLogCounters.tot_mgr_log_split_batch_start.incrementAndGet();
LOG.info("started splitting " + logfiles.length + " logs in " + logDirs);
long t = EnvironmentEdgeManager.currentTimeMillis();
long totalSize = 0;
TaskBatch batch = new TaskBatch();
Boolean isMetaRecovery = (filter == null) ? null : false;
for (FileStatus lf : logfiles) {
// TODO If the log file is still being written to - which is most likely
// the case for the last log file - then its length will show up here
// as zero. The size of such a file can only be retrieved after
// recover-lease is done. totalSize will be under in most cases and the
// metrics that it drives will also be under-reported.
totalSize += lf.getLen();
String pathToLog = FSUtils.removeRootPath(lf.getPath(), conf);
if (!enqueueSplitTask(pathToLog, batch)) {
throw new IOException("duplicate log split scheduled for " + lf.getPath());
}
}
waitForSplittingCompletion(batch, status);
// remove recovering regions from ZK
if (filter == MasterFileSystem.META_FILTER /* reference comparison */) {
// we split meta regions and user regions separately therefore logfiles are either all for
// meta or user regions but won't be for both (we could have mixed situations in tests)
isMetaRecovery = true;
}
this.removeRecoveringRegionsFromZK(serverNames, isMetaRecovery);
if (batch.done != batch.installed) {
batch.isDead = true;
SplitLogCounters.tot_mgr_log_split_batch_err.incrementAndGet();
LOG.warn("error while splitting logs in " + logDirs +
" installed = " + batch.installed + " but only " + batch.done + " done");
String msg = "error or interrupted while splitting logs in "
+ logDirs + " Task = " + batch;
status.abort(msg);
throw new IOException(msg);
}
for (Path logDir : logDirs) {
status.setStatus("Cleaning up log directory...");
try {
if (fs.exists(logDir) && !fs.delete(logDir, false)) {
LOG.warn("Unable to delete log src dir. Ignoring. " + logDir);
}
} catch (IOException ioe) {
FileStatus[] files = fs.listStatus(logDir);
if (files != null && files.length > 0) {
LOG.warn("returning success without actually splitting and " +
"deleting all the log files in path " + logDir);
} else {
LOG.warn("Unable to delete log src dir. Ignoring. " + logDir, ioe);
}
}
SplitLogCounters.tot_mgr_log_split_batch_success.incrementAndGet();
}
String msg = "finished splitting (more than or equal to) " + totalSize +
" bytes in " + batch.installed + " log files in " + logDirs + " in " +
(EnvironmentEdgeManager.currentTimeMillis() - t) + "ms";
status.markComplete(msg);
LOG.info(msg);
return totalSize;
}
/**
* Add a task entry to splitlog znode if it is not already there.
*
* @param taskname the path of the log to be split
* @param batch the batch this task belongs to
* @return true if a new entry is created, false if it is already there.
*/
boolean enqueueSplitTask(String taskname, TaskBatch batch) {
SplitLogCounters.tot_mgr_log_split_start.incrementAndGet();
// This is a znode path under the splitlog dir with the rest of the path made up of an
// url encoding of the passed in log to split.
String path = ZKSplitLog.getEncodedNodeName(watcher, taskname);
lastTaskCreateTime = EnvironmentEdgeManager.currentTimeMillis();
Task oldtask = createTaskIfAbsent(path, batch);
if (oldtask == null) {
// publish the task in zk
createNode(path, zkretries);
return true;
}
return false;
}
private void waitForSplittingCompletion(TaskBatch batch, MonitoredTask status) {
synchronized (batch) {
while ((batch.done + batch.error) != batch.installed) {
try {
status.setStatus("Waiting for distributed tasks to finish. "
+ " scheduled=" + batch.installed
+ " done=" + batch.done
+ " error=" + batch.error);
int remaining = batch.installed - (batch.done + batch.error);
int actual = activeTasks(batch);
if (remaining != actual) {
LOG.warn("Expected " + remaining
+ " active tasks, but actually there are " + actual);
}
int remainingInZK = remainingTasksInZK();
if (remainingInZK >= 0 && actual > remainingInZK) {
LOG.warn("Expected at least" + actual
+ " tasks in ZK, but actually there are " + remainingInZK);
}
if (remainingInZK == 0 || actual == 0) {
LOG.warn("No more task remaining (ZK or task map), splitting "
+ "should have completed. Remaining tasks in ZK " + remainingInZK
+ ", active tasks in map " + actual);
if (remainingInZK == 0 && actual == 0) {
return;
}
}
batch.wait(100);
if (stopper.isStopped()) {
LOG.warn("Stopped while waiting for log splits to be completed");
return;
}
} catch (InterruptedException e) {
LOG.warn("Interrupted while waiting for log splits to be completed");
Thread.currentThread().interrupt();
return;
}
}
}
}
@VisibleForTesting
ConcurrentMap<String, Task> getTasks() {
return tasks;
}
private int activeTasks(final TaskBatch batch) {
int count = 0;
for (Task t: tasks.values()) {
if (t.batch == batch && t.status == TerminationStatus.IN_PROGRESS) {
count++;
}
}
return count;
}
private int remainingTasksInZK() {
int count = 0;
try {
List<String> tasks =
ZKUtil.listChildrenNoWatch(watcher, watcher.splitLogZNode);
if (tasks != null) {
for (String t: tasks) {
if (!ZKSplitLog.isRescanNode(t)) {
count++;
}
}
}
} catch (KeeperException ke) {
LOG.warn("Failed to check remaining tasks", ke);
count = -1;
}
return count;
}
/**
* It removes recovering regions under /hbase/recovering-regions/[encoded region name] so that the
* region server hosting the region can allow reads to the recovered region
* @param serverNames servers which are just recovered
* @param isMetaRecovery whether current recovery is for the meta region on
* serverNames
*/
private void
removeRecoveringRegionsFromZK(final Set<ServerName> serverNames, Boolean isMetaRecovery) {
if (this.recoveryMode != RecoveryMode.LOG_REPLAY) {
// the function is only used in WALEdit direct replay mode
return;
}
final String metaEncodeRegionName = HRegionInfo.FIRST_META_REGIONINFO.getEncodedName();
int count = 0;
Set<String> recoveredServerNameSet = new HashSet<String>();
if (serverNames != null) {
for (ServerName tmpServerName : serverNames) {
recoveredServerNameSet.add(tmpServerName.getServerName());
}
}
try {
this.recoveringRegionLock.lock();
List<String> tasks = ZKUtil.listChildrenNoWatch(watcher, watcher.splitLogZNode);
if (tasks != null) {
for (String t : tasks) {
if (!ZKSplitLog.isRescanNode(t)) {
count++;
}
}
}
if (count == 0 && this.master.isInitialized()
&& !this.master.getServerManager().areDeadServersInProgress()) {
// no splitting work items left
deleteRecoveringRegionZNodes(watcher, null);
// reset lastRecoveringNodeCreationTime because we cleared all recovering znodes at
// this point.
lastRecoveringNodeCreationTime = Long.MAX_VALUE;
} else if (!recoveredServerNameSet.isEmpty()) {
// remove recovering regions which don't have any RS associated with them
List<String> regions = ZKUtil.listChildrenNoWatch(watcher, watcher.recoveringRegionsZNode);
if (regions != null) {
for (String region : regions) {
if(isMetaRecovery != null) {
if ((isMetaRecovery && !region.equalsIgnoreCase(metaEncodeRegionName))
|| (!isMetaRecovery && region.equalsIgnoreCase(metaEncodeRegionName))) {
// skip non-meta regions when recovering the meta region or
// skip the meta region when recovering user regions
continue;
}
}
String nodePath = ZKUtil.joinZNode(watcher.recoveringRegionsZNode, region);
List<String> failedServers = ZKUtil.listChildrenNoWatch(watcher, nodePath);
if (failedServers == null || failedServers.isEmpty()) {
ZKUtil.deleteNode(watcher, nodePath);
continue;
}
if (recoveredServerNameSet.containsAll(failedServers)) {
ZKUtil.deleteNodeRecursively(watcher, nodePath);
} else {
for (String failedServer : failedServers) {
if (recoveredServerNameSet.contains(failedServer)) {
String tmpPath = ZKUtil.joinZNode(nodePath, failedServer);
ZKUtil.deleteNode(watcher, tmpPath);
}
}
}
}
}
}
} catch (KeeperException ke) {
LOG.warn("removeRecoveringRegionsFromZK got zookeeper exception. Will retry", ke);
if (serverNames != null && !serverNames.isEmpty()) {
this.failedRecoveringRegionDeletions.add(new Pair<Set<ServerName>, Boolean>(serverNames,
isMetaRecovery));
}
} finally {
this.recoveringRegionLock.unlock();
}
}
/**
* It removes stale recovering regions under /hbase/recovering-regions/[encoded region name]
* during master initialization phase.
* @param failedServers A set of known failed servers
* @throws KeeperException
*/
void removeStaleRecoveringRegionsFromZK(final Set<ServerName> failedServers)
throws KeeperException {
Set<String> knownFailedServers = new HashSet<String>();
if (failedServers != null) {
for (ServerName tmpServerName : failedServers) {
knownFailedServers.add(tmpServerName.getServerName());
}
}
this.recoveringRegionLock.lock();
try {
List<String> tasks = ZKUtil.listChildrenNoWatch(watcher, watcher.splitLogZNode);
if (tasks != null) {
for (String t : tasks) {
byte[] data = ZKUtil.getData(this.watcher, ZKUtil.joinZNode(watcher.splitLogZNode, t));
if (data != null) {
SplitLogTask slt = null;
try {
slt = SplitLogTask.parseFrom(data);
} catch (DeserializationException e) {
LOG.warn("Failed parse data for znode " + t, e);
}
if (slt != null && slt.isDone()) {
continue;
}
}
// decode the file name
t = ZKSplitLog.getFileName(t);
ServerName serverName = HLogUtil.getServerNameFromHLogDirectoryName(new Path(t));
if (serverName != null) {
knownFailedServers.add(serverName.getServerName());
} else {
LOG.warn("Found invalid WAL log file name:" + t);
}
}
}
// remove recovering regions which don't have any RS associated with them
List<String> regions = ZKUtil.listChildrenNoWatch(watcher, watcher.recoveringRegionsZNode);
if (regions != null) {
for (String region : regions) {
String nodePath = ZKUtil.joinZNode(watcher.recoveringRegionsZNode, region);
List<String> regionFailedServers = ZKUtil.listChildrenNoWatch(watcher, nodePath);
if (regionFailedServers == null || regionFailedServers.isEmpty()) {
ZKUtil.deleteNode(watcher, nodePath);
continue;
}
boolean needMoreRecovery = false;
for (String tmpFailedServer : regionFailedServers) {
if (knownFailedServers.contains(tmpFailedServer)) {
needMoreRecovery = true;
break;
}
}
if (!needMoreRecovery) {
ZKUtil.deleteNodeRecursively(watcher, nodePath);
}
}
}
} finally {
this.recoveringRegionLock.unlock();
}
}
public static void deleteRecoveringRegionZNodes(ZooKeeperWatcher watcher, List<String> regions) {
try {
if (regions == null) {
// remove all children under /hbase/recovering-regions
LOG.info("Garbage collecting all recovering regions.");
ZKUtil.deleteChildrenRecursively(watcher, watcher.recoveringRegionsZNode);
} else {
for (String curRegion : regions) {
String nodePath = ZKUtil.joinZNode(watcher.recoveringRegionsZNode, curRegion);
ZKUtil.deleteNodeRecursively(watcher, nodePath);
}
}
} catch (KeeperException e) {
LOG.warn("Cannot remove recovering regions from ZooKeeper", e);
}
}
private void setDone(String path, TerminationStatus status) {
Task task = tasks.get(path);
if (task == null) {
if (!ZKSplitLog.isRescanNode(watcher, path)) {
SplitLogCounters.tot_mgr_unacquired_orphan_done.incrementAndGet();
LOG.debug("unacquired orphan task is done " + path);
}
} else {
synchronized (task) {
if (task.status == IN_PROGRESS) {
if (status == SUCCESS) {
SplitLogCounters.tot_mgr_log_split_success.incrementAndGet();
LOG.info("Done splitting " + path);
} else {
SplitLogCounters.tot_mgr_log_split_err.incrementAndGet();
LOG.warn("Error splitting " + path);
}
task.status = status;
if (task.batch != null) {
synchronized (task.batch) {
if (status == SUCCESS) {
task.batch.done++;
} else {
task.batch.error++;
}
task.batch.notify();
}
}
}
}
}
// delete the task node in zk. It's an async
// call and no one is blocked waiting for this node to be deleted. All
// task names are unique (they encode the log file path) so there is no risk
// of deleting a future task.
// if a deletion fails, TimeoutMonitor will retry the same deletion later
deleteNode(path, zkretries);
return;
}
private void createNode(String path, Long retry_count) {
SplitLogTask slt = new SplitLogTask.Unassigned(serverName, this.recoveryMode);
ZKUtil.asyncCreate(this.watcher, path, slt.toByteArray(), new CreateAsyncCallback(), retry_count);
SplitLogCounters.tot_mgr_node_create_queued.incrementAndGet();
return;
}
private void createNodeSuccess(String path) {
LOG.debug("put up splitlog task at znode " + path);
getDataSetWatch(path, zkretries);
}
private void createNodeFailure(String path) {
// TODO the Manager should split the log locally instead of giving up
LOG.warn("failed to create task node" + path);
setDone(path, FAILURE);
}
private void getDataSetWatch(String path, Long retry_count) {
this.watcher.getRecoverableZooKeeper().getZooKeeper().
getData(path, this.watcher,
new GetDataAsyncCallback(true), retry_count);
SplitLogCounters.tot_mgr_get_data_queued.incrementAndGet();
}
private void tryGetDataSetWatch(String path) {
// A negative retry count will lead to ignoring all error processing.
this.watcher.getRecoverableZooKeeper().getZooKeeper().
getData(path, this.watcher,
new GetDataAsyncCallback(false), Long.valueOf(-1) /* retry count */);
SplitLogCounters.tot_mgr_get_data_queued.incrementAndGet();
}
private void getDataSetWatchSuccess(String path, byte[] data, int version)
throws DeserializationException {
if (data == null) {
if (version == Integer.MIN_VALUE) {
// assume all done. The task znode suddenly disappeared.
setDone(path, SUCCESS);
return;
}
SplitLogCounters.tot_mgr_null_data.incrementAndGet();
LOG.fatal("logic error - got null data " + path);
setDone(path, FAILURE);
return;
}
data = this.watcher.getRecoverableZooKeeper().removeMetaData(data);
SplitLogTask slt = SplitLogTask.parseFrom(data);
if (slt.isUnassigned()) {
LOG.debug("task not yet acquired " + path + " ver = " + version);
handleUnassignedTask(path);
} else if (slt.isOwned()) {
heartbeat(path, version, slt.getServerName());
} else if (slt.isResigned()) {
LOG.info("task " + path + " entered state: " + slt.toString());
resubmitOrFail(path, FORCE);
} else if (slt.isDone()) {
LOG.info("task " + path + " entered state: " + slt.toString());
if (taskFinisher != null && !ZKSplitLog.isRescanNode(watcher, path)) {
if (taskFinisher.finish(slt.getServerName(), ZKSplitLog.getFileName(path)) == Status.DONE) {
setDone(path, SUCCESS);
} else {
resubmitOrFail(path, CHECK);
}
} else {
setDone(path, SUCCESS);
}
} else if (slt.isErr()) {
LOG.info("task " + path + " entered state: " + slt.toString());
resubmitOrFail(path, CHECK);
} else {
LOG.fatal("logic error - unexpected zk state for path = " + path + " data = " + slt.toString());
setDone(path, FAILURE);
}
}
private void getDataSetWatchFailure(String path) {
LOG.warn("failed to set data watch " + path);
setDone(path, FAILURE);
}
/**
* It is possible for a task to stay in UNASSIGNED state indefinitely - say
* SplitLogManager wants to resubmit a task. It forces the task to UNASSIGNED
* state but it dies before it could create the RESCAN task node to signal
* the SplitLogWorkers to pick up the task. To prevent this scenario the
* SplitLogManager resubmits all orphan and UNASSIGNED tasks at startup.
*
* @param path
*/
private void handleUnassignedTask(String path) {
if (ZKSplitLog.isRescanNode(watcher, path)) {
return;
}
Task task = findOrCreateOrphanTask(path);
if (task.isOrphan() && (task.incarnation == 0)) {
LOG.info("resubmitting unassigned orphan task " + path);
// ignore failure to resubmit. The timeout-monitor will handle it later
// albeit in a more crude fashion
resubmit(path, task, FORCE);
}
}
/**
* Helper function to check whether to abandon retries in ZooKeeper AsyncCallback functions
* @param statusCode integer value of a ZooKeeper exception code
* @param action description message about the retried action
* @return true when retries should be abandoned, otherwise false
*/
private boolean needAbandonRetries(int statusCode, String action) {
if (statusCode == KeeperException.Code.SESSIONEXPIRED.intValue()) {
LOG.error("ZK session expired. Master is expected to shut down. Abandoning retries for "
+ "action=" + action);
return true;
}
return false;
}
private void heartbeat(String path, int new_version, ServerName workerName) {
Task task = findOrCreateOrphanTask(path);
if (new_version != task.last_version) {
if (task.isUnassigned()) {
LOG.info("task " + path + " acquired by " + workerName);
}
task.heartbeat(EnvironmentEdgeManager.currentTimeMillis(), new_version, workerName);
SplitLogCounters.tot_mgr_heartbeat.incrementAndGet();
} else {
// duplicate heartbeats - heartbeats w/o zk node version
// changing - are possible. The timeout thread does
// getDataSetWatch() just to check whether a node still
// exists or not
}
return;
}
private boolean resubmit(String path, Task task, ResubmitDirective directive) {
// it's OK if this thread misses the update to task.deleted. It will fail later
if (task.status != IN_PROGRESS) {
return false;
}
int version;
if (directive != FORCE) {
// We're going to resubmit:
// 1) immediately if the worker server is now marked as dead
// 2) after a configurable timeout if the server is not marked as dead but has still not
// finished the task. This allows us to continue if the worker cannot actually handle it,
// for any reason.
final long time = EnvironmentEdgeManager.currentTimeMillis() - task.last_update;
final boolean alive = master.getServerManager() != null ?
master.getServerManager().isServerOnline(task.cur_worker_name) : true;
if (alive && time < timeout) {
LOG.trace("Skipping the resubmit of " + task.toString() + " because the server " +
task.cur_worker_name + " is not marked as dead, we waited for " + time +
" while the timeout is " + timeout);
return false;
}
if (task.unforcedResubmits.get() >= resubmit_threshold) {
if (!task.resubmitThresholdReached) {
task.resubmitThresholdReached = true;
SplitLogCounters.tot_mgr_resubmit_threshold_reached.incrementAndGet();
LOG.info("Skipping resubmissions of task " + path +
" because threshold " + resubmit_threshold + " reached");
}
return false;
}
// race with heartbeat() that might be changing last_version
version = task.last_version;
} else {
SplitLogCounters.tot_mgr_resubmit_force.incrementAndGet();
version = -1;
}
LOG.info("resubmitting task " + path);
task.incarnation++;
try {
// blocking zk call but this is done from the timeout thread
SplitLogTask slt = new SplitLogTask.Unassigned(this.serverName, this.recoveryMode);
if (ZKUtil.setData(this.watcher, path, slt.toByteArray(), version) == false) {
LOG.debug("failed to resubmit task " + path +
" version changed");
task.heartbeatNoDetails(EnvironmentEdgeManager.currentTimeMillis());
return false;
}
} catch (NoNodeException e) {
LOG.warn("failed to resubmit because znode doesn't exist " + path +
" task done (or forced done by removing the znode)");
try {
getDataSetWatchSuccess(path, null, Integer.MIN_VALUE);
} catch (DeserializationException e1) {
LOG.debug("Failed to re-resubmit task " + path + " because of deserialization issue", e1);
task.heartbeatNoDetails(EnvironmentEdgeManager.currentTimeMillis());
return false;
}
return false;
} catch (KeeperException.BadVersionException e) {
LOG.debug("failed to resubmit task " + path + " version changed");
task.heartbeatNoDetails(EnvironmentEdgeManager.currentTimeMillis());
return false;
} catch (KeeperException e) {
SplitLogCounters.tot_mgr_resubmit_failed.incrementAndGet();
LOG.warn("failed to resubmit " + path, e);
return false;
}
// don't count forced resubmits
if (directive != FORCE) {
task.unforcedResubmits.incrementAndGet();
}
task.setUnassigned();
createRescanNode(Long.MAX_VALUE);
SplitLogCounters.tot_mgr_resubmit.incrementAndGet();
return true;
}
private void resubmitOrFail(String path, ResubmitDirective directive) {
if (resubmit(path, findOrCreateOrphanTask(path), directive) == false) {
setDone(path, FAILURE);
}
}
private void deleteNode(String path, Long retries) {
SplitLogCounters.tot_mgr_node_delete_queued.incrementAndGet();
// Once a task znode is ready for delete, that is it is in the TASK_DONE
// state, then no one should be writing to it anymore. That is no one
// will be updating the znode version any more.
this.watcher.getRecoverableZooKeeper().getZooKeeper().
delete(path, -1, new DeleteAsyncCallback(),
retries);
}
private void deleteNodeSuccess(String path) {
if (ignoreZKDeleteForTesting) {
return;
}
Task task;
task = tasks.remove(path);
if (task == null) {
if (ZKSplitLog.isRescanNode(watcher, path)) {
SplitLogCounters.tot_mgr_rescan_deleted.incrementAndGet();
}
SplitLogCounters.tot_mgr_missing_state_in_delete.incrementAndGet();
LOG.debug("deleted task without in memory state " + path);
return;
}
synchronized (task) {
task.status = DELETED;
task.notify();
}
SplitLogCounters.tot_mgr_task_deleted.incrementAndGet();
}
private void deleteNodeFailure(String path) {
LOG.info("Failed to delete node " + path + " and will retry soon.");
return;
}
/**
* signal the workers that a task was resubmitted by creating the
* RESCAN node.
* @throws KeeperException
*/
private void createRescanNode(long retries) {
// The RESCAN node will be deleted almost immediately by the
// SplitLogManager as soon as it is created because it is being
// created in the DONE state. This behavior prevents a buildup
// of RESCAN nodes. But there is also a chance that a SplitLogWorker
// might miss the watch-trigger that creation of RESCAN node provides.
// Since the TimeoutMonitor will keep resubmitting UNASSIGNED tasks,
// this behavior is safe.
lastTaskCreateTime = EnvironmentEdgeManager.currentTimeMillis();
SplitLogTask slt = new SplitLogTask.Done(this.serverName, this.recoveryMode);
this.watcher.getRecoverableZooKeeper().getZooKeeper().
create(ZKSplitLog.getRescanNode(watcher), slt.toByteArray(),
Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL_SEQUENTIAL,
new CreateRescanAsyncCallback(), Long.valueOf(retries));
}
private void createRescanSuccess(String path) {
SplitLogCounters.tot_mgr_rescan.incrementAndGet();
getDataSetWatch(path, zkretries);
}
private void createRescanFailure() {
LOG.fatal("logic failure, rescan failure must not happen");
}
/**
* @param path
* @param batch
* @return null on success, existing task on error
*/
private Task createTaskIfAbsent(String path, TaskBatch batch) {
Task oldtask;
// batch.installed is only changed via this function and
// a single thread touches batch.installed.
Task newtask = new Task();
newtask.batch = batch;
oldtask = tasks.putIfAbsent(path, newtask);
if (oldtask == null) {
batch.installed++;
return null;
}
// new task was not used.
synchronized (oldtask) {
if (oldtask.isOrphan()) {
if (oldtask.status == SUCCESS) {
// The task is already done. Do not install the batch for this
// task because it might be too late for setDone() to update
// batch.done. There is no need for the batch creator to wait for
// this task to complete.
return (null);
}
if (oldtask.status == IN_PROGRESS) {
oldtask.batch = batch;
batch.installed++;
LOG.debug("Previously orphan task " + path + " is now being waited upon");
return null;
}
while (oldtask.status == FAILURE) {
LOG.debug("wait for status of task " + path + " to change to DELETED");
SplitLogCounters.tot_mgr_wait_for_zk_delete.incrementAndGet();
try {
oldtask.wait();
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
LOG.warn("Interrupted when waiting for znode delete callback");
// fall through to return failure
break;
}
}
if (oldtask.status != DELETED) {
LOG.warn("Failure because previously failed task" +
" state still present. Waiting for znode delete callback" +
" path=" + path);
return oldtask;
}
// reinsert the newTask and it must succeed this time
Task t = tasks.putIfAbsent(path, newtask);
if (t == null) {
batch.installed++;
return null;
}
LOG.fatal("Logic error. Deleted task still present in tasks map");
assert false : "Deleted task still present in tasks map";
return t;
}
LOG.warn("Failure because two threads can't wait for the same task; path=" + path);
return oldtask;
}
}
Task findOrCreateOrphanTask(String path) {
Task orphanTask = new Task();
Task task;
task = tasks.putIfAbsent(path, orphanTask);
if (task == null) {
LOG.info("creating orphan task " + path);
SplitLogCounters.tot_mgr_orphan_task_acquired.incrementAndGet();
task = orphanTask;
}
return task;
}
@Override
public void nodeDataChanged(String path) {
Task task;
task = tasks.get(path);
if (task != null || ZKSplitLog.isRescanNode(watcher, path)) {
if (task != null) {
task.heartbeatNoDetails(EnvironmentEdgeManager.currentTimeMillis());
}
getDataSetWatch(path, zkretries);
}
}
public void stop() {
if (timeoutMonitor != null) {
timeoutMonitor.interrupt();
}
}
private void lookForOrphans() {
List<String> orphans;
try {
orphans = ZKUtil.listChildrenNoWatch(this.watcher,
this.watcher.splitLogZNode);
if (orphans == null) {
LOG.warn("could not get children of " + this.watcher.splitLogZNode);
return;
}
} catch (KeeperException e) {
LOG.warn("could not get children of " + this.watcher.splitLogZNode +
" " + StringUtils.stringifyException(e));
return;
}
int rescan_nodes = 0;
for (String path : orphans) {
String nodepath = ZKUtil.joinZNode(watcher.splitLogZNode, path);
if (ZKSplitLog.isRescanNode(watcher, nodepath)) {
rescan_nodes++;
LOG.debug("found orphan rescan node " + path);
} else {
LOG.info("found orphan task " + path);
}
getDataSetWatch(nodepath, zkretries);
}
LOG.info("Found " + (orphans.size() - rescan_nodes) + " orphan tasks and " +
rescan_nodes + " rescan nodes");
}
/**
* Create znodes /hbase/recovering-regions/[region_ids...]/[failed region server names ...] for
* all regions of the passed in region servers
* @param serverName the name of a region server
* @param userRegions user regions assigned on the region server
*/
void markRegionsRecoveringInZK(final ServerName serverName, Set<HRegionInfo> userRegions)
throws KeeperException {
if (userRegions == null || (this.recoveryMode != RecoveryMode.LOG_REPLAY)) {
return;
}
try {
this.recoveringRegionLock.lock();
// mark that we're creating recovering znodes
this.lastRecoveringNodeCreationTime = EnvironmentEdgeManager.currentTimeMillis();
for (HRegionInfo region : userRegions) {
String regionEncodeName = region.getEncodedName();
long retries = this.zkretries;
do {
String nodePath = ZKUtil.joinZNode(watcher.recoveringRegionsZNode, regionEncodeName);
long lastRecordedFlushedSequenceId = -1;
try {
long lastSequenceId = this.master.getServerManager().getLastFlushedSequenceId(
regionEncodeName.getBytes());
/*
* znode layout: .../region_id[last known flushed sequence id]/failed server[last known
* flushed sequence id for the server]
*/
byte[] data = ZKUtil.getData(this.watcher, nodePath);
if (data == null) {
ZKUtil.createSetData(this.watcher, nodePath,
ZKUtil.positionToByteArray(lastSequenceId));
} else {
lastRecordedFlushedSequenceId = SplitLogManager.parseLastFlushedSequenceIdFrom(data);
if (lastRecordedFlushedSequenceId < lastSequenceId) {
// update last flushed sequence id in the region level
ZKUtil.setData(this.watcher, nodePath, ZKUtil.positionToByteArray(lastSequenceId));
}
}
// go one level deeper with server name
nodePath = ZKUtil.joinZNode(nodePath, serverName.getServerName());
if (lastSequenceId <= lastRecordedFlushedSequenceId) {
// the newly assigned RS failed even before any flush to the region
lastSequenceId = lastRecordedFlushedSequenceId;
}
ZKUtil.createSetData(this.watcher, nodePath,
ZKUtil.regionSequenceIdsToByteArray(lastSequenceId, null));
LOG.debug("Mark region " + regionEncodeName + " recovering from failed region server "
+ serverName);
// break retry loop
break;
} catch (KeeperException e) {
// ignore ZooKeeper exceptions inside retry loop
if (retries <= 1) {
throw e;
}
// wait a little bit for retry
try {
Thread.sleep(20);
} catch (Exception ignoreE) {
// ignore
}
}
} while ((--retries) > 0 && (!this.stopper.isStopped()));
}
} finally {
this.recoveringRegionLock.unlock();
}
}
/**
* @param bytes - Content of a failed region server or recovering region znode.
* @return long - The last flushed sequence Id for the region server
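*
* <p>Illustrative use, assuming {@code data} was read from such a znode:
* <pre>{@code
* long seqId = SplitLogManager.parseLastFlushedSequenceIdFrom(data);
* if (seqId == -1) {
*   // the znode content could not be parsed; treat the id as unknown
* }
* }</pre>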
*/
public static long parseLastFlushedSequenceIdFrom(final byte[] bytes) {
long lastRecordedFlushedSequenceId = -1L;
try {
lastRecordedFlushedSequenceId = ZKUtil.parseHLogPositionFrom(bytes);
} catch (DeserializationException e) {
lastRecordedFlushedSequenceId = -1L;
LOG.warn("Can't parse last flushed sequence Id", e);
}
return lastRecordedFlushedSequenceId;
}
/**
* check if /hbase/recovering-regions/<region encoded name> exists. Returns true if it exists
* and sets a watcher as well.
* @param zkw
* @param regionEncodedName region encoded name
* @return true when /hbase/recovering-regions/<region encoded name> exists
* @throws KeeperException
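*
* <p>An illustrative check, e.g. before allowing reads on a recovering region:
* <pre>{@code
* if (SplitLogManager.isRegionMarkedRecoveringInZK(zkw, regionEncodedName)) {
*   // region is still being recovered; reads should be rejected or deferred
* }
* }</pre>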
*/
public static boolean
isRegionMarkedRecoveringInZK(ZooKeeperWatcher zkw, String regionEncodedName)
throws KeeperException {
boolean result = false;
String nodePath = ZKUtil.joinZNode(zkw.recoveringRegionsZNode, regionEncodedName);
byte[] node = ZKUtil.getDataAndWatch(zkw, nodePath);
if (node != null) {
result = true;
}
return result;
}
/**
* This function is used in distributedLogReplay to fetch last flushed sequence id from ZK
* @param zkw
* @param serverName
* @param encodedRegionName
* @return the last flushed sequence ids recorded in ZK of the region for serverName
* @throws IOException
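*
* <p>An illustrative lookup ({@code failedServerName} and {@code encodedRegionName} are
* assumed to come from the recovery context):
* <pre>{@code
* RegionStoreSequenceIds ids =
*     SplitLogManager.getRegionFlushedSequenceId(zkw, failedServerName, encodedRegionName);
* if (ids == null) {
*   // nothing recorded in ZK for this region/server pair
* }
* }</pre>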
*/
public static RegionStoreSequenceIds getRegionFlushedSequenceId(ZooKeeperWatcher zkw,
String serverName, String encodedRegionName) throws IOException {
// when SplitLogWorker recovers a region by directly replaying unflushed WAL edits,
// the last flushed sequence Id changes when the newly assigned RS flushes writes to the region.
// If the newly assigned RS fails again (a chained RS failure scenario), the last flushed
// sequence Id name space (a sequence Id is only valid for a particular RS instance) changes
// again when a different newly assigned RS flushes the region.
// Therefore, in this mode we need to fetch last sequence Ids from ZK where we keep history of
// last flushed sequence Id for each failed RS instance.
RegionStoreSequenceIds result = null;
String nodePath = ZKUtil.joinZNode(zkw.recoveringRegionsZNode, encodedRegionName);
nodePath = ZKUtil.joinZNode(nodePath, serverName);
try {
byte[] data = ZKUtil.getData(zkw, nodePath);
if (data != null) {
result = ZKUtil.parseRegionStoreSequenceIds(data);
}
} catch (KeeperException e) {
throw new IOException("Cannot get lastFlushedSequenceId from ZooKeeper for server="
+ serverName + "; region=" + encodedRegionName, e);
} catch (DeserializationException e) {
LOG.warn("Can't parse last flushed sequence Id from znode:" + nodePath, e);
}
return result;
}
private List<String> listSplitLogTasks() throws KeeperException {
List<String> taskOrRescanList = ZKUtil.listChildrenNoWatch(watcher, watcher.splitLogZNode);
if (taskOrRescanList == null || taskOrRescanList.isEmpty()) {
return Collections.<String> emptyList();
}
List<String> taskList = new ArrayList<String>();
for (String taskOrRescan : taskOrRescanList) {
// Remove rescan nodes
if (!ZKSplitLog.isRescanNode(taskOrRescan)) {
taskList.add(taskOrRescan);
}
}
return taskList;
}
/**
* Sets the recovery mode, either from outstanding split log tasks left over from a previous
* master or from the current configuration setting.
* @param isForInitialization
* @throws KeeperException
*/
public void setRecoveryMode(boolean isForInitialization) throws KeeperException {
if(this.isDrainingDone) {
// when there is no outstanding splitlogtask after master startup, we already have the
// up-to-date recovery mode
return;
}
if(this.watcher == null) {
// when the watcher is null (testing code), the recovery mode can only be LOG_SPLITTING
this.isDrainingDone = true;
this.recoveryMode = RecoveryMode.LOG_SPLITTING;
return;
}
boolean hasSplitLogTask = false;
boolean hasRecoveringRegions = false;
RecoveryMode previousRecoveryMode = RecoveryMode.UNKNOWN;
RecoveryMode recoveryModeInConfig = (isDistributedLogReplay(conf)) ?
RecoveryMode.LOG_REPLAY : RecoveryMode.LOG_SPLITTING;
// Firstly check if there are outstanding recovering regions
List<String> regions = ZKUtil.listChildrenNoWatch(watcher, watcher.recoveringRegionsZNode);
if (regions != null && !regions.isEmpty()) {
hasRecoveringRegions = true;
previousRecoveryMode = RecoveryMode.LOG_REPLAY;
}
if (previousRecoveryMode == RecoveryMode.UNKNOWN) {
// Secondly check if there are outstanding split log tasks
List<String> tasks = listSplitLogTasks();
if (!tasks.isEmpty()) {
hasSplitLogTask = true;
if (isForInitialization) {
// during initialization, try to get recovery mode from splitlogtask
for (String task : tasks) {
try {
byte[] data = ZKUtil.getData(this.watcher,
ZKUtil.joinZNode(watcher.splitLogZNode, task));
if (data == null) continue;
SplitLogTask slt = SplitLogTask.parseFrom(data);
previousRecoveryMode = slt.getMode();
if (previousRecoveryMode == RecoveryMode.UNKNOWN) {
// created by old code base where we don't set recovery mode in splitlogtask
// we can safely set to LOG_SPLITTING because we're in master initialization code
// before SSH is enabled & there are no outstanding recovering regions
previousRecoveryMode = RecoveryMode.LOG_SPLITTING;
}
break;
} catch (DeserializationException e) {
LOG.warn("Failed parse data for znode " + task, e);
}
}
}
}
}
synchronized(this) {
if(this.isDrainingDone) {
return;
}
if (!hasSplitLogTask && !hasRecoveringRegions) {
this.isDrainingDone = true;
this.recoveryMode = recoveryModeInConfig;
return;
} else if (!isForInitialization) {
// splitlogtask hasn't drained yet, keep existing recovery mode
return;
}
if (previousRecoveryMode != RecoveryMode.UNKNOWN) {
this.isDrainingDone = (previousRecoveryMode == recoveryModeInConfig);
this.recoveryMode = previousRecoveryMode;
} else {
this.recoveryMode = recoveryModeInConfig;
}
}
}
public RecoveryMode getRecoveryMode() {
return this.recoveryMode;
}
/**
* Returns whether distributed log replay is turned on
* @param conf
* @return true when distributed log replay is turned on
*/
private boolean isDistributedLogReplay(Configuration conf) {
boolean dlr = conf.getBoolean(HConstants.DISTRIBUTED_LOG_REPLAY_KEY,
HConstants.DEFAULT_DISTRIBUTED_LOG_REPLAY_CONFIG);
int version = conf.getInt(HFile.FORMAT_VERSION_KEY, HFile.MAX_FORMAT_VERSION);
if (LOG.isDebugEnabled()) {
LOG.debug("Distributed log replay=" + dlr + ", " + HFile.FORMAT_VERSION_KEY + "=" + version);
}
// For distributed log replay, hfile version must be 3 at least; we need tag support.
return dlr && (version >= 3);
}
/**
* Keeps track of the batch of tasks submitted together by a caller in splitLogDistributed().
* Client threads use this object to wait for all their tasks to be done.
*
* All access is synchronized.
*/
static class TaskBatch {
int installed = 0;
int done = 0;
int error = 0;
volatile boolean isDead = false;
@Override
public String toString() {
return ("installed = " + installed + " done = " + done + " error = " + error);
}
}
/**
* In-memory state of an active task.
*/
static class Task {
volatile long last_update;
volatile int last_version;
volatile ServerName cur_worker_name;
volatile TaskBatch batch;
volatile TerminationStatus status;
volatile int incarnation;
final AtomicInteger unforcedResubmits = new AtomicInteger();
volatile boolean resubmitThresholdReached;
@Override
public String toString() {
return ("last_update = " + last_update +
" last_version = " + last_version +
" cur_worker_name = " + cur_worker_name +
" status = " + status +
" incarnation = " + incarnation +
" resubmits = " + unforcedResubmits.get() +
" batch = " + batch);
}
Task() {
incarnation = 0;
last_version = -1;
status = IN_PROGRESS;
setUnassigned();
}
public boolean isOrphan() {
return (batch == null || batch.isDead);
}
public boolean isUnassigned() {
return (cur_worker_name == null);
}
public void heartbeatNoDetails(long time) {
last_update = time;
}
public void heartbeat(long time, int version, ServerName worker) {
last_version = version;
last_update = time;
cur_worker_name = worker;
}
public void setUnassigned() {
cur_worker_name = null;
last_update = -1;
}
}
void handleDeadWorker(ServerName workerName) {
// resubmit the tasks on the TimeoutMonitor thread. Makes it easier
// to reason about concurrency. Makes it easier to retry.
synchronized (deadWorkersLock) {
if (deadWorkers == null) {
deadWorkers = new HashSet<ServerName>(100);
}
deadWorkers.add(workerName);
}
LOG.info("dead splitlog worker " + workerName);
}
void handleDeadWorkers(Set<ServerName> serverNames) {
synchronized (deadWorkersLock) {
if (deadWorkers == null) {
deadWorkers = new HashSet<ServerName>(100);
}
deadWorkers.addAll(serverNames);
}
LOG.info("dead splitlog workers " + serverNames);
}
/**
* Periodically checks all active tasks and resubmits the ones that have timed
* out
*/
private class TimeoutMonitor extends Chore {
private long lastLog = 0;
public TimeoutMonitor(final int period, Stoppable stopper) {
super("SplitLogManager Timeout Monitor", period, stopper);
}
@Override
protected void chore() {
int resubmitted = 0;
int unassigned = 0;
int tot = 0;
boolean found_assigned_task = false;
Set<ServerName> localDeadWorkers;
synchronized (deadWorkersLock) {
localDeadWorkers = deadWorkers;
deadWorkers = null;
}
for (Map.Entry<String, Task> e : tasks.entrySet()) {
String path = e.getKey();
Task task = e.getValue();
ServerName cur_worker = task.cur_worker_name;
tot++;
// don't easily resubmit a task which hasn't been picked up yet. It
// might be a long while before a SplitLogWorker is free to pick up a
// task. This is because a SplitLogWorker picks up a task one at a
// time. If we want progress when there are no region servers then we
// will have to run a SplitLogWorker thread in the Master.
if (task.isUnassigned()) {
unassigned++;
continue;
}
found_assigned_task = true;
if (localDeadWorkers != null && localDeadWorkers.contains(cur_worker)) {
SplitLogCounters.tot_mgr_resubmit_dead_server_task.incrementAndGet();
if (resubmit(path, task, FORCE)) {
resubmitted++;
} else {
handleDeadWorker(cur_worker);
LOG.warn("Failed to resubmit task " + path + " owned by dead " +
cur_worker + ", will retry.");
}
} else if (resubmit(path, task, CHECK)) {
resubmitted++;
}
}
if (tot > 0) {
long now = EnvironmentEdgeManager.currentTimeMillis();
if (now > lastLog + 5000) {
lastLog = now;
LOG.info("total tasks = " + tot + " unassigned = " + unassigned + " tasks=" + tasks);
}
}
if (resubmitted > 0) {
LOG.info("resubmitted " + resubmitted + " out of " + tot + " tasks");
}
// If there are pending tasks and all of them have been unassigned for
// some time then put up a RESCAN node to ping the workers.
// DEFAULT_UNASSIGNED_TIMEOUT is of the order of minutes
// because a. it is very unlikely that every worker had a
// transient error when trying to grab the task b. if there are no
// workers then all tasks will stay unassigned indefinitely and the
// manager will be indefinitely creating RESCAN nodes. TODO maybe the
// master should spawn both a manager and a worker thread to guarantee
// that there is always one worker in the system
if (tot > 0 && !found_assigned_task &&
((EnvironmentEdgeManager.currentTimeMillis() - lastTaskCreateTime) >
unassignedTimeout)) {
for (Map.Entry<String, Task> e : tasks.entrySet()) {
String path = e.getKey();
Task task = e.getValue();
// we have to do task.isUnassigned() check again because tasks might
// have been asynchronously assigned. There is no locking required
// for these checks ... it is OK even if tryGetDataSetWatch() is
// called unnecessarily for a task
if (task.isUnassigned() && (task.status != FAILURE)) {
// We just touch the znode to make sure its still there
tryGetDataSetWatch(path);
}
}
createRescanNode(Long.MAX_VALUE);
SplitLogCounters.tot_mgr_resubmit_unassigned.incrementAndGet();
LOG.debug("resubmitting unassigned task(s) after timeout");
}
// Retry previously failed deletes
if (failedDeletions.size() > 0) {
List<String> tmpPaths = new ArrayList<String>(failedDeletions);
for (String tmpPath : tmpPaths) {
// deleteNode is an async call
deleteNode(tmpPath, zkretries);
}
failedDeletions.removeAll(tmpPaths);
}
// Garbage collect left-over /hbase/recovering-regions/... znode
long timeInterval = EnvironmentEdgeManager.currentTimeMillis()
- lastRecoveringNodeCreationTime;
if (!failedRecoveringRegionDeletions.isEmpty()
|| (tot == 0 && tasks.size() == 0 && (timeInterval > checkRecoveringTimeThreshold))) {
// inside the function there are more checks before anything is GCed
if (!failedRecoveringRegionDeletions.isEmpty()) {
List<Pair<Set<ServerName>, Boolean>> previouslyFailedDeletions =
new ArrayList<Pair<Set<ServerName>, Boolean>>(failedRecoveringRegionDeletions);
failedRecoveringRegionDeletions.removeAll(previouslyFailedDeletions);
for (Pair<Set<ServerName>, Boolean> failedDeletion : previouslyFailedDeletions) {
removeRecoveringRegionsFromZK(failedDeletion.getFirst(), failedDeletion.getSecond());
}
} else {
removeRecoveringRegionsFromZK(null, null);
}
}
}
}
/**
* Asynchronous handler for zk create node results.
* Retries on failures.
*/
class CreateAsyncCallback implements AsyncCallback.StringCallback {
private final Log LOG = LogFactory.getLog(CreateAsyncCallback.class);
@Override
public void processResult(int rc, String path, Object ctx, String name) {
SplitLogCounters.tot_mgr_node_create_result.incrementAndGet();
if (rc != 0) {
if (needAbandonRetries(rc, "Create znode " + path)) {
createNodeFailure(path);
return;
}
if (rc == KeeperException.Code.NODEEXISTS.intValue()) {
// What if there is a delete pending against this pre-existing
// znode? Then this soon-to-be-deleted task znode must be in TASK_DONE
// state. Only operations that will be carried out on this node by
// this manager are get-znode-data, task-finisher and delete-znode.
// And all code pieces correctly handle the case of suddenly
// disappearing task-znode.
LOG.debug("found pre-existing znode " + path);
SplitLogCounters.tot_mgr_node_already_exists.incrementAndGet();
} else {
Long retry_count = (Long)ctx;
LOG.warn("create rc =" + KeeperException.Code.get(rc) + " for " +
path + " remaining retries=" + retry_count);
if (retry_count == 0) {
SplitLogCounters.tot_mgr_node_create_err.incrementAndGet();
createNodeFailure(path);
} else {
SplitLogCounters.tot_mgr_node_create_retry.incrementAndGet();
createNode(path, retry_count - 1);
}
return;
}
}
createNodeSuccess(path);
}
}
/**
* Asynchronous handler for zk get-data-set-watch on node results.
* Retries on failures.
*/
class GetDataAsyncCallback implements AsyncCallback.DataCallback {
private final Log LOG = LogFactory.getLog(GetDataAsyncCallback.class);
private boolean completeTaskOnNoNode;
/**
* @param completeTaskOnNoNode Complete the task if the znode cannot be found.
* Since in-memory task creation and znode creation are not atomic, there might be
* a race where there is a task in memory but the znode is not created yet (TimeoutMonitor).
* In this case completeTaskOnNoNode should be set to false. See HBASE-11217.
*/
public GetDataAsyncCallback(boolean completeTaskOnNoNode) {
this.completeTaskOnNoNode = completeTaskOnNoNode;
}
@Override
public void processResult(int rc, String path, Object ctx, byte[] data,
Stat stat) {
SplitLogCounters.tot_mgr_get_data_result.incrementAndGet();
if (rc != 0) {
if (needAbandonRetries(rc, "GetData from znode " + path)) {
return;
}
if (rc == KeeperException.Code.NONODE.intValue()) {
SplitLogCounters.tot_mgr_get_data_nonode.incrementAndGet();
LOG.warn("task znode " + path + " vanished.");
if (completeTaskOnNoNode) {
// The task znode has been deleted. Must be some pending delete
// that deleted the task. Assume success because a task-znode is
// only deleted after TaskFinisher is successful.
try {
getDataSetWatchSuccess(path, null, Integer.MIN_VALUE);
} catch (DeserializationException e) {
LOG.warn("Deserialization problem", e);
}
}
return;
}
Long retry_count = (Long) ctx;
if (retry_count < 0) {
LOG.warn("getdata rc = " + KeeperException.Code.get(rc) + " " +
path + ". Ignoring error. No error handling. No retrying.");
return;
}
LOG.warn("getdata rc = " + KeeperException.Code.get(rc) + " " +
path + " remaining retries=" + retry_count);
if (retry_count == 0) {
SplitLogCounters.tot_mgr_get_data_err.incrementAndGet();
getDataSetWatchFailure(path);
} else {
SplitLogCounters.tot_mgr_get_data_retry.incrementAndGet();
getDataSetWatch(path, retry_count - 1);
}
return;
}
try {
getDataSetWatchSuccess(path, data, stat.getVersion());
} catch (DeserializationException e) {
LOG.warn("Deserialization problem", e);
}
return;
}
}
/**
* Asynchronous handler for zk delete node results.
* Retries on failures.
*/
class DeleteAsyncCallback implements AsyncCallback.VoidCallback {
private final Log LOG = LogFactory.getLog(DeleteAsyncCallback.class);
@Override
public void processResult(int rc, String path, Object ctx) {
SplitLogCounters.tot_mgr_node_delete_result.incrementAndGet();
if (rc != 0) {
if (needAbandonRetries(rc, "Delete znode " + path)) {
failedDeletions.add(path);
return;
}
if (rc != KeeperException.Code.NONODE.intValue()) {
SplitLogCounters.tot_mgr_node_delete_err.incrementAndGet();
Long retry_count = (Long) ctx;
LOG.warn("delete rc=" + KeeperException.Code.get(rc) + " for " +
path + " remaining retries=" + retry_count);
if (retry_count == 0) {
LOG.warn("delete failed " + path);
failedDeletions.add(path);
deleteNodeFailure(path);
} else {
deleteNode(path, retry_count - 1);
}
return;
} else {
LOG.info(path +
" does not exist. Either was created but deleted behind our" +
" back by another pending delete OR was deleted" +
" in earlier retry rounds. zkretries = " + (Long) ctx);
}
} else {
LOG.debug("deleted " + path);
}
deleteNodeSuccess(path);
}
}
/**
* Asynchronous handler for zk create RESCAN-node results.
* Retries on failures.
*
* A RESCAN node is created using the EPHEMERAL_SEQUENTIAL flag. It is a signal
* for all the {@link SplitLogWorker}s to rescan for new tasks.
*/
class CreateRescanAsyncCallback implements AsyncCallback.StringCallback {
private final Log LOG = LogFactory.getLog(CreateRescanAsyncCallback.class);
@Override
public void processResult(int rc, String path, Object ctx, String name) {
if (rc != 0) {
if (needAbandonRetries(rc, "CreateRescan znode " + path)) {
return;
}
Long retry_count = (Long)ctx;
LOG.warn("rc=" + KeeperException.Code.get(rc) + " for "+ path +
" remaining retries=" + retry_count);
if (retry_count == 0) {
createRescanFailure();
} else {
createRescanNode(retry_count - 1);
}
return;
}
// path is the original arg, name is the actual name that was created
createRescanSuccess(name);
}
}
/**
* {@link SplitLogManager} can use objects implementing this interface to
* finish off a task partially done by a {@link SplitLogWorker}. This provides
* a serialization point at the end of the task processing. Must be
* restartable and idempotent.
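*
* <p>A minimal sketch of an implementation (illustrative only; a real finisher, like the
* one supplied by the wrapper constructor above, moves recovered edits to their final
* destination):
* <pre>{@code
* TaskFinisher finisher = new TaskFinisher() {
*   @Override
*   public Status finish(ServerName workerName, String taskname) {
*     // nothing extra to do in this sketch; report success
*     return Status.DONE;
*   }
* };
* }</pre>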
*/
public interface TaskFinisher {
/**
* status that can be returned by finish()
*/
enum Status {
/**
* task completed successfully
*/
DONE(),
/**
* task completed with error
*/
ERR();
}
/**
* Finish the partially done task. workerName provides a clue to where the
* partial results of the partially done task are present. taskname is the
* name of the task that was put up in zookeeper.
*
* @param workerName
* @param taskname
* @return DONE if task completed successfully, ERR otherwise
*/
Status finish(ServerName workerName, String taskname);
}
enum ResubmitDirective {
CHECK(),
FORCE();
}
enum TerminationStatus {
IN_PROGRESS("in_progress"),
SUCCESS("success"),
FAILURE("failure"),
DELETED("deleted");
String statusMsg;
TerminationStatus(String msg) {
statusMsg = msg;
}
@Override
public String toString() {
return statusMsg;
}
}
}