Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price only 1 $
You can buy this project and download/modify it how often you want.
/* This file is part of VoltDB.
* Copyright (C) 2008-2020 VoltDB Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with VoltDB. If not, see .
*/
package org.voltcore.zk;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
import org.apache.zookeeper_voltpatches.AsyncCallback;
import org.apache.zookeeper_voltpatches.CreateMode;
import org.apache.zookeeper_voltpatches.KeeperException;
import org.apache.zookeeper_voltpatches.KeeperException.ConnectionLossException;
import org.apache.zookeeper_voltpatches.KeeperException.NoNodeException;
import org.apache.zookeeper_voltpatches.KeeperException.SessionExpiredException;
import org.apache.zookeeper_voltpatches.WatchedEvent;
import org.apache.zookeeper_voltpatches.Watcher;
import org.apache.zookeeper_voltpatches.ZooDefs.Ids;
import org.apache.zookeeper_voltpatches.ZooKeeper;
import org.apache.zookeeper_voltpatches.data.Stat;
import org.voltcore.logging.VoltLogger;
import org.voltcore.utils.CoreUtils;
import org.voltdb.VoltDB;
import org.voltdb.utils.Mutex;
import com.google_voltpatches.common.base.Throwables;
import com.google_voltpatches.common.collect.ImmutableSet;
import com.google_voltpatches.common.collect.Sets;
import com.google_voltpatches.common.util.concurrent.ListenableFuture;
import com.google_voltpatches.common.util.concurrent.ListeningExecutorService;
import com.google_voltpatches.common.util.concurrent.MoreExecutors;
import com.google_voltpatches.common.util.concurrent.SettableFuture;
/*
* This state machine coexists with other SynchronizedStateManagers by using different branches of the
* ZooKeeper tree. This class provides a service to the StateMachineInstance class that reports changes
* to the membership. The ZooKeeper tree hierarchy is (quoted are reserved node names):
* |- rootNode
* |\
* | |- 'MEMBERS'
* | \
* | |- memberId
* \
* |- instanceName
* |\
* | |- 'LOCK_CONTENDERS'
* |\
* | |- 'BARRIER_PARTICIPANTS'
* \
* |- 'BARRIER_RESULTS'
*
* This hierarchy supports multiple topologies so one SynchronizedStatesManager could be used to track
* cluster membership while another could be used to track partition membership. Each SynchronizedStatesManager
* instance has a memberId that must be unique to the member community it is participating in. As the
* SynchronizedStatesManager is a shared service that can provide membership changes of other
* StateMachineInstances joining or leaving the community, all instances are considered tightly bound to the
* SynchronizedStatesManager. This means that the SynchronizedStatesManager will not join a community until
* all instances using that manager have registered with the SynchronizedStatesManager. Thus when a
* SynchronizedStatesManager informs its instances that a new member has joined the community (the set of
* nodes under the 'MEMBERS' node), the instance can be assured that it's peer instance is also there. This
* implies that StateMachineInstances can only coexist in the same SynchronizedStatesManager if all instances
* can be initialized (registered) before any individual instance needs to access the shared state.
*/
public class SynchronizedStatesManager {
private final static String s_memberNode = "MEMBERS";
private final AtomicReference m_state = new AtomicReference<>(State.RUNNING);
private Set m_groupMembers = new HashSet();
private final StateMachineInstance m_registeredStateMachines[];
private int m_registeredStateMachineInstances = 0;
private static final ListeningExecutorService s_sharedEs = CoreUtils.getListeningExecutorService("SSM Daemon", 1);
// We assume that we are far enough along that the HostMessenger is up and running. Otherwise add to constructor.
private final ZooKeeper m_zk;
private final String m_ssmRootNode;
private final String m_stateMachineRoot;
private final String m_stateMachineMemberPath;
private String m_memberId;
private final String m_canonical_memberId;
private int m_resetCounter;
private int m_resetLimit;
private final int m_resetAllowance; // number of resets allowed within a reset clear threshold duration
private long m_lastResetTimeInMillis = -1L;
private SettableFuture m_initComplete;
// Log for global events that happen outside individual state machines
private static final VoltLogger ssmLog = new VoltLogger("SSM");
private static final long RESET_CLEAR_THRESHOLD = TimeUnit.DAYS.toMillis(1);
private enum REQUEST_TYPE {
INITIALIZING,
LAST_CHANGE_OUTCOME_REQUEST,
STATE_CHANGE_REQUEST,
CORRELATED_COORDINATED_TASK,
UNCORRELATED_COORDINATED_TASK
}
private enum RESULT_CONCENSUS {
AGREE,
DISAGREE,
NO_QUORUM
}
private enum State {
RUNNING, ERROR, SHUTDOWN
}
// Utility methods for handling uncaught exceptions from runnables and callables
private static void addExceptionHandler(ListenableFuture future) {
future.addListener(() -> {
try {
future.get();
} catch (ExecutionException e) {
VoltDB.crashLocalVoltDB("SSM: Unexpected error encountered", true, e.getCause());
} catch (Throwable t) {
VoltDB.crashLocalVoltDB("SSM: Unexpected error encountered", true, t);
}
}, MoreExecutors.directExecutor());
}
static void submitRunnable(Runnable runnable) {
addExceptionHandler(s_sharedEs.submit(runnable));
}
static void submitCallable(Callable callable) {
addExceptionHandler(s_sharedEs.submit(callable));
}
protected boolean addIfMissing(String absolutePath, CreateMode createMode, byte[] data)
throws KeeperException, InterruptedException {
return ZKUtil.addIfMissing(m_zk, absolutePath, createMode, data);
}
protected String getMemberId() {
return m_memberId;
}
protected abstract class ZKAsyncCreateHandler implements AsyncCallback.StringCallback, Runnable {
KeeperException.Code m_resultCode;
String m_resultString;
ZKAsyncCreateHandler(String path, byte[] data, CreateMode createMode) {
m_zk.create(path, data, Ids.OPEN_ACL_UNSAFE, createMode, this, null);
}
@Override
public void processResult(int rc, String path, Object ctx, String name) {
try {
m_resultCode = KeeperException.Code.get(rc);
if (m_resultCode == KeeperException.Code.OK) {
m_resultString = name;
}
if (isRunning()) {
// Will execute the specialized implementation of Run()
submitRunnable(this);
}
} catch (RejectedExecutionException e) {
ssmLog.warn("Async initialization of ZK directory for SSM was rejected by the SSM Thread: " + path);
}
}
@Override
public final void run() {
if (isRunning()) {
runImpl();
}
}
abstract void runImpl();
}
protected abstract class ZKAsyncChildrenHandler implements AsyncCallback.ChildrenCallback, Runnable {
KeeperException.Code m_resultCode; // should be either OK or NONODE
List m_resultChildren;
ZKAsyncChildrenHandler(String path, Watcher childrenWatcher) {
m_zk.getChildren(path, childrenWatcher, this, null);
}
@Override
public void processResult(int rc, String path, Object ctx, List children) {
try {
m_resultCode = KeeperException.Code.get(rc);
if (m_resultCode == KeeperException.Code.OK) {
m_resultChildren = children;
}
else {
assert(m_resultCode == KeeperException.Code.NONODE ||
m_resultCode == KeeperException.Code.CONNECTIONLOSS ||
m_resultCode == KeeperException.Code.SESSIONEXPIRED);
}
if (isRunning()) {
// Will execute the specialized implementation of Run()
submitRunnable(this);
}
} catch (RejectedExecutionException e) {
ssmLog.warn("Async getChildren for SSM was rejected by the SSM Thread: " + path);
}
}
@Override
public final void run() {
if (isRunning()) {
runImpl();
}
}
abstract void runImpl();
}
/*
* Rules for each StateMachineInstance:
* 1. All access to the state machine is serialized by a per state machine local lock.
* 2. State changes may only be initiated by the oldest ephemeral lock node under the
* LOCK_CONTENDERS directory. This represents the distributed lock used to
* synchronize proposals by the members of the state machine.
* 3. When a new member joins an operational state machine, it must grab the lock node
* (2) before it can determine the current agreed state. However since it has to
* participate and benignly generate null results for all proposals, it can glean
* the state from the outcome of the next request it fully participates in. Note
* that an operation in progress during initialization of the new member is not
* used by the new member because the proposal initiator may have accepted the
* results and released the distributed lock before the new member can evaluate the
* results to determine the current state.
* 4. A member holding the distributed lock can propose a state change by clearing out
* the reported result nodes under the BARRIER_RESULTS node, updating the
* BARRIER_RESULTS node with the old and proposed state, and adding an ephemeral
* unique node under BARRIER_PARTICIPANTS to trigger the state machine members.
* Each proposal in the BARRIER_RESULTS node has a version number that is used
* by other members to distinguish a new proposal from a proposal it has already
* processed.
* 5. When a member detects a transition in the BARRIER_PARTICIPANTS node and a new
* version of the BARRIER_RESULTS node, it adds an ephemeral node for itself under
* BARRIER_PARTICIPANTS and copies the old and proposed state from the BARRIER_RESULTS
* node.
* 6. Each member evaluates the proposed state and inserts a SUCCESS or FAILURE indication
* in a persistent unique node under the BARRIER_RESULTS path.
* 7. A member may also request that an action be performed by all fully participating
* members (including itself) by assigning the existing state and a task proposal
* in the BARRIER_RESULTS node with a new version id and then adding a node under the
* BARRIER_PARTICIPANTS directory.
* 8. All members detecting the task request perform the task specified in the task
* proposal and inserting a result node under the BARRIER_RESULTS directory.
* 9. When the set of BARRIER_RESULTS nodes is a superset of all members currently in the
* state machine, all members are considered to have reported in and each member can
* independently determine the outcome. If the proposal was a state change request,
* and all BARRIER_RESULTS nodes report SUCCESS each member applies the new state
* and then removes itself from BARRIER_PARTICIPANTS. If the proposal was a task
* request, then members remove themselves from BARRIER_PARTICIPANTS to acknowledge
* receipt of all task results.
*10. Membership in the state machine can change as members join or fail. Membership is
* monitored by the SynchronizedStatesManager and changes are reported to each state
* machine instance. While members have joined but have not determined the current
* state, they report a NULL result for each state change or task proposal made by
* other members.
*11. If any member reports FAILURE node under the BARRIER_RESULTS directory after a
* state change request proposal, the new state is not applied by any member.
*12. Each member that has processed BARRIER_RESULTS removes its ephemeral node from the
* BARRIER_PARTICIPANTS directory node.
*13. The initiator of a state change or task releases the lock node when 9 is satisfied.
*14. A member is not told they have the lock node until the last member releases the
* lock node AND all ephemeral nodes under the BARRIER_PARTICIPANTS node are gone,
* indicating that all members waiting on the outcome of the last state change or
* task results have processed the previous outcome
*/
public abstract class StateMachineInstance {
protected String m_stateMachineId;
private final String m_stateMachineName;
private Set m_knownMembers;
private volatile boolean m_membershipChangePending = false;
private volatile boolean m_participantsChangePending = false;
private final String m_statePath;
private final String m_lockPath;
private final String m_barrierResultsPath;
private String m_myResultPath;
private final String m_barrierParticipantsPath;
private String m_myParticipantPath;
private boolean m_stateChangeInitiator = false;
private String m_ourDistributedLockName = null;
private String m_lockWaitingOn = null;
private boolean m_holdingDistributedLock = false;
protected final VoltLogger m_log;
private ByteBuffer m_requestedInitialState = ByteBuffer.allocate(0);
private ByteBuffer m_synchronizedState = null;
private ByteBuffer m_pendingProposal = null;
private REQUEST_TYPE m_currentRequestType = REQUEST_TYPE.INITIALIZING;
private int m_currentParticipants = 0;
private Set m_memberResults = null;
private int m_lastProposalVersion = 0;
private boolean m_initializationCompleted = false;
private boolean isInitializationCompleted() {
try (Mutex.Releaser r = m_mutex.acquire()) {
return m_initializationCompleted;
}
}
public int getResetCounter() {
return m_resetCounter;
}
private final Mutex m_mutex = new Mutex();
boolean debugIsLocalStateLocked() {
return m_mutex.isHeldByCurrentThread();
}
private class LockWatcher implements Watcher {
@Override
public void process(final WatchedEvent event) {
try {
if (isRunning()) {
submitCallable(HandlerForDistributedLockEvent);
}
} catch (RejectedExecutionException e) {
}
}
}
private final LockWatcher m_lockWatcher = new LockWatcher();
private class BarrierParticipantsWatcher implements Watcher {
@Override
public void process(final WatchedEvent event) {
try {
if (isRunning()) {
m_participantsChangePending = true;
submitCallable(HandlerForBarrierParticipantsEvent);
}
} catch (RejectedExecutionException e) {
ssmLog.warn("ZK watch of Participant Barrier was rejected by the SSM Thread");
}
}
}
private class StateChangeRequest {
public final REQUEST_TYPE m_requestType;
public final ByteBuffer m_previousState;
public final ByteBuffer m_proposal;
StateChangeRequest(REQUEST_TYPE requestType, ByteBuffer previousState, ByteBuffer proposedState) {
m_requestType = requestType;
m_previousState = previousState;
m_proposal = proposedState;
}
}
private final BarrierParticipantsWatcher m_barrierParticipantsWatcher = new BarrierParticipantsWatcher();
private class BarrierResultsWatcher implements Watcher {
@Override
public void process(final WatchedEvent event) {
try {
if (isRunning()) {
submitCallable(HandlerForBarrierResultsEvent);
}
} catch (RejectedExecutionException e) {
ssmLog.warn("ZK watch of Result Barrier was rejected by the SSM Thread");
}
}
}
private final BarrierResultsWatcher m_barrierResultsWatcher = new BarrierResultsWatcher();
// Used only for Mocking StateMachineInstance
public StateMachineInstance()
{
m_statePath = "MockInstanceStatePath";
m_barrierResultsPath = "MockBarrierResultsPath";
m_myResultPath = "MockMyResultPath";
m_barrierParticipantsPath = "MockParticipantsPath";
m_myParticipantPath = "MockMyParticipantPath";
m_lockPath = "MockLockPath";
m_log = null;
m_stateMachineId = "MockStateMachineId";
m_stateMachineName = "MockStateMachineName";
}
public StateMachineInstance(String instanceName, VoltLogger logger) throws RuntimeException {
if (instanceName.equals(s_memberNode)) {
throw new RuntimeException("State machine name may not be named " + s_memberNode);
}
assert(!instanceName.equals(s_memberNode));
m_stateMachineName = instanceName;
m_statePath = ZKUtil.joinZKPath(m_stateMachineRoot, instanceName);
m_lockPath = ZKUtil.joinZKPath(m_statePath, "LOCK_CONTENDERS");
m_barrierResultsPath = ZKUtil.joinZKPath(m_statePath, "BARRIER_RESULTS");
m_myResultPath = ZKUtil.joinZKPath(m_barrierResultsPath, m_memberId);
m_barrierParticipantsPath = ZKUtil.joinZKPath(m_statePath, "BARRIER_PARTICIPANTS");
m_myParticipantPath = ZKUtil.joinZKPath(m_barrierParticipantsPath, m_memberId);
m_log = logger;
m_stateMachineId = "SMI " + m_ssmRootNode + "/" + m_stateMachineName + "/" + m_memberId;
if (m_log.isDebugEnabled()) {
m_log.debug(m_stateMachineId + " created.");
}
}
private void setRequestedInitialState(ByteBuffer requestedInitialState) {
assert(requestedInitialState != null);
assert(requestedInitialState.remaining() < Short.MAX_VALUE);
m_requestedInitialState = requestedInitialState;
}
/**
* This method is called to register new StateMachine instances with the SSM. The SSM will start the
* full initialization (join the community) when the correct number of instances have registered. If
* this registration is the last instance, the method will wait until the initialization has been
* completed. However, if this is not the last instance it will return immediately. This is generally
* not an issue because callback will be generated to {@link setInitialState} when each StateMachine
* knows the current state from the Quorum. When the registering object wants to directly wait on a
* {@link Future}, the method {@link registerStateMachineWithManagerAsync} should be used instead.
* @param requestedInitialState
* @throws InterruptedException
*/
public void registerStateMachineWithManager(ByteBuffer requestedInitialState) throws InterruptedException {
setRequestedInitialState(requestedInitialState);
ListenableFuture initComplete = registerStateMachine(this, false);
if (initComplete == null) {
return;
}
try {
Boolean initSuccessful = initComplete.get();
assert(initSuccessful);
}
catch (ExecutionException e) {
Throwables.throwIfUnchecked(e);
throw new RuntimeException(e.getCause());
}
}
/**
* This method is called to register new StateMachine instances with the SSM. The SSM will start the
* full initialization (join the community) when the correct number of instances have registered. This
* method will always return immediately with a {@link Future} that can be used to determine when the
* SSM has completed registration. Note that it is not necessary to wait for initialization to complete
* since the SSM will also execute a callback to {@link setInitialState} when each StateMachine knows
* the current state from the Quorum.
* @param requestedInitialState
* @throws InterruptedException
* @return a Future indicating when the initialization is complete and a Boolean indication that the
* initialization was successful
*/
public ListenableFuture registerStateMachineWithManagerAsync(ByteBuffer requestedInitialState) throws InterruptedException {
setRequestedInitialState(requestedInitialState);
return registerStateMachine(this, true);
}
/**
* Notify the derived class that the state machine instance is being reset,
* the derived class should reset its own specific states and provide a reset state
* @param isDirectVictim true if the reset is caused by callback exception in this instance
* @return a ByteBuffer encapsulating the reset state provided by the derived class
*/
protected abstract ByteBuffer notifyOfStateMachineReset(boolean isDirectVictim);
private class AsyncStateMachineInitializer {
final AtomicInteger m_pendingStateMachineInits;
final Set m_startingMemberSet;
private void initializationFailed() {
m_state.set(State.ERROR);
m_initComplete.set(false);
}
private boolean noCommonKeeperExceptions(KeeperException.Code code) {
if (code == KeeperException.Code.SESSIONEXPIRED ||
code == KeeperException.Code.CONNECTIONLOSS) {
// lost the full connection. some test cases do this...
// means zk shutdown without the elector being shutdown.
// ignore.
initializationFailed();
return false;
}
else if (code != KeeperException.Code.NODEEXISTS && code != KeeperException.Code.OK) {
VoltDB.crashLocalVoltDB(
"Unexpected failure (" + code.name() + ") in ZooKeeper while initializeInstances.",
true, null);
}
return true;
}
public AsyncStateMachineInitializer(final AtomicInteger pendingStateMachineInits,
Set knownMembers) {
m_pendingStateMachineInits = pendingStateMachineInits;
m_startingMemberSet = knownMembers;
}
public void startInitialization() {
new ZKAsyncCreateHandler(m_statePath, null, CreateMode.PERSISTENT) {
@Override
public void runImpl() {
if (noCommonKeeperExceptions(m_resultCode)) {
addLockPath();
}
}
};
}
private void addLockPath() {
new ZKAsyncCreateHandler(m_lockPath, null, CreateMode.PERSISTENT) {
@Override
public void runImpl() {
if (noCommonKeeperExceptions(m_resultCode)) {
addBarrierParticipantPath();
}
}
};
}
private void addBarrierParticipantPath() {
new ZKAsyncCreateHandler(m_barrierParticipantsPath, null, CreateMode.PERSISTENT) {
@Override
public void runImpl() {
if (noCommonKeeperExceptions(m_resultCode)) {
try {
initializeStateMachine(m_startingMemberSet);
if (m_pendingStateMachineInits.decrementAndGet() == 0) {
m_initComplete.set(true);
}
} catch (KeeperException.SessionExpiredException | KeeperException.ConnectionLossException
| InterruptedException e) {
// Lost the connection or interrupted for shutdown
if (m_log.isDebugEnabled()) {
m_log.debug(m_stateMachineId + ": Received " + e.getClass().getSimpleName()
+ " in addBarrierParticipantPath");
}
initializationFailed();
} catch (KeeperException e) {
VoltDB.crashLocalVoltDB("Unexpected failure in StateMachine.", true, e);
}
}
}
};
}
}
private AsyncStateMachineInitializer createAsyncInitializer(final AtomicInteger pendingStateMachineInits,
final Set knownMembers) {
return new AsyncStateMachineInitializer(pendingStateMachineInits, knownMembers);
}
private void syncStateMachineInitialize(final Set knownMembers) throws KeeperException, InterruptedException {
addIfMissing(m_statePath, CreateMode.PERSISTENT, null);
addIfMissing(m_lockPath, CreateMode.PERSISTENT, null);
addIfMissing(m_barrierParticipantsPath, CreateMode.PERSISTENT, null);
initializeStateMachine(knownMembers);
}
private void initializeStateMachine(Set knownMembers)
throws KeeperException, InterruptedException {
SmiCallable outOfLockWork = null;
try (Mutex.Releaser r = m_mutex.acquire()) {
// Make sure the child count is correct so that an init does not race with a released lock where the
// results have not been processed yet. If it is non-zero, it means results still need to be collected
// by some nodes even if the distributed lock list is empty.
m_currentParticipants = m_zk.getChildren(m_barrierParticipantsPath, null).size();
boolean ownDistributedLock = requestDistributedLock();
ByteBuffer startStates = buildProposal(REQUEST_TYPE.INITIALIZING,
m_requestedInitialState.asReadOnlyBuffer(), m_requestedInitialState.asReadOnlyBuffer());
addIfMissing(m_barrierResultsPath, CreateMode.PERSISTENT, startStates.array());
boolean stateMachineNodeCreated = false;
if (ownDistributedLock) {
// Only the very first initializer of the state machine will both get the lock and successfully
// allocate "STATE_INITIALIZED". This guarantees that only one node will assign the initial state.
stateMachineNodeCreated = addIfMissing(ZKUtil.joinZKPath(m_statePath, "STATE_INITIALIZED"),
CreateMode.PERSISTENT, null);
}
if (m_membershipChangePending) {
getLatestMembership();
} else if (m_knownMembers == null) {
// Members could be set by the callback which was handled between the async initialization calls
m_knownMembers = knownMembers;
}
// We need to always monitor participants so that if we are initialized we can add ourselves and insert
// our results and if we are not initialized, we can always auto-insert a null result.
if (stateMachineNodeCreated) {
assert (ownDistributedLock);
m_synchronizedState = m_requestedInitialState;
m_requestedInitialState = null;
m_lastProposalVersion = getProposalVersion();
ByteBuffer readOnlyResult = m_synchronizedState.asReadOnlyBuffer();
// Add an acceptable result so the next initializing member recognizes an immediate quorum.
addResultEntry(new byte[] { 1 });
m_lockWaitingOn = "bogus"; // Avoids call to notifyDistributedLockWaiter
if (m_log.isDebugEnabled()) {
m_log.debug(m_stateMachineId + ": Initialized (first member) with State "
+ stateToString(m_synchronizedState.asReadOnlyBuffer()));
}
m_initializationCompleted = true;
cancelDistributedLock();
outOfLockWork = new ChainedCallable(checkForBarrierParticipantsChange()) {
@Override
protected void callImpl() {
// Notify the derived object that we have a stable state
try {
setInitialState(readOnlyResult);
} catch (Exception e) {
if (m_log.isDebugEnabled()) {
m_log.debug("Error in StateMachineInstance callbacks.", e);
}
m_initializationCompleted = false;
submitCallable(new CallbackExceptionHandler(StateMachineInstance.this));
}
}
};
}
else {
// To get a stable result set, we need to get the lock for this state machine. If someone else has
// the
// lock they can clear the stale results out from under us.
if (ownDistributedLock) {
outOfLockWork = initializeFromActiveCommunity();
} else {
// This means we will ignore the current update if there is one in progress.
// Note that if we are not the next waiter for the lock, we will blindly
// accept the next proposal and use the outcome to set our initial state.
addResultEntry(null);
Stat nodeStat = new Stat();
boolean resultNodeFound = false;
do {
try {
m_zk.getData(m_barrierResultsPath, false, nodeStat);
resultNodeFound = true;
} catch (NoNodeException noNode) {
// This is a race between another node who got the Distributed lock but
// Has not set up the result path yet. Keep Retrying.
}
} while (!resultNodeFound);
m_lastProposalVersion = nodeStat.getVersion();
outOfLockWork = checkForBarrierParticipantsChange();
}
}
}
if (outOfLockWork != null) {
outOfLockWork.call();
}
}
/*
* This state machine and all other state machines under this manager are being removed
*/
private void disableMembership() {
try (Mutex.Releaser r = m_mutex.acquire()) {
if (m_log.isTraceEnabled()) {
m_log.trace(m_stateMachineId + ": Disabling member");
}
// put in two separate try-catch blocks so that both actions are attempted
try {
m_zk.delete(m_myParticipantPath, -1);
}
catch (KeeperException | InterruptedException e) {
if (m_log.isDebugEnabled()) {
m_log.debug(m_stateMachineId + ": Received " + e.getClass().getSimpleName()
+ " in disableMembership");
}
}
try {
if (m_ourDistributedLockName != null) {
if (m_log.isDebugEnabled()) {
m_log.debug(m_stateMachineId + ": cancelLockRequest (Shutdown) for "
+ m_ourDistributedLockName);
}
m_zk.delete(m_ourDistributedLockName, -1);
}
}
catch (KeeperException | InterruptedException e) {
if (m_log.isDebugEnabled()) {
m_log.debug(m_stateMachineId + ": Received " + e.getClass().getSimpleName()
+ " in disableMembership");
}
}
m_initializationCompleted = false;
}
}
private void reset(boolean isDirectVictim) {
ByteBuffer resetState = notifyOfStateMachineReset(isDirectVictim);
// if we are the direct victim, use the reset state as requested initial state
if (isDirectVictim) {
m_requestedInitialState = resetState;
}
// else simply use currently requested initial state or synchronized state as requested initial state
else {
if (m_requestedInitialState == null) {
assert(m_synchronizedState != null);
m_requestedInitialState = m_synchronizedState;
}
}
m_synchronizedState = null;
m_membershipChangePending = false;
m_stateChangeInitiator = false;
m_ourDistributedLockName = null;
m_lockWaitingOn = null;
m_knownMembers = null;
m_holdingDistributedLock = false;
m_pendingProposal = null;
m_currentRequestType = REQUEST_TYPE.INITIALIZING;
m_memberResults = null;
m_lastProposalVersion = 0;
m_myResultPath = ZKUtil.joinZKPath(m_barrierResultsPath, m_memberId);
m_myParticipantPath = ZKUtil.joinZKPath(m_barrierParticipantsPath, m_memberId);
m_stateMachineId = "SMI " + m_ssmRootNode + "/" + m_stateMachineName + "/" + m_memberId;
}
private int getProposalVersion() throws KeeperException {
int proposalVersion = -1;
try {
Stat nodeStat = new Stat();
m_zk.getData(m_barrierResultsPath, null, nodeStat);
proposalVersion = nodeStat.getVersion();
} catch (KeeperException.SessionExpiredException | KeeperException.ConnectionLossException
| InterruptedException e) {
if (m_log.isDebugEnabled()) {
m_log.debug(m_stateMachineId + ": Received "+e.getClass().getSimpleName()+" in getProposalVersion");
}
}
return proposalVersion;
}
private ByteBuffer buildProposal(REQUEST_TYPE requestType, ByteBuffer existingState, ByteBuffer proposedState) {
if (m_log.isTraceEnabled()) {
m_log.trace(String.format("%s: Building proposal %s: previous %s action %s", m_stateMachineId,
requestType, stateToString(existingState.slice()),
(requestType == REQUEST_TYPE.CORRELATED_COORDINATED_TASK
|| requestType == REQUEST_TYPE.UNCORRELATED_COORDINATED_TASK)
? taskToString(proposedState.slice())
: stateToString(proposedState.slice())));
}
ByteBuffer states = ByteBuffer.allocate(proposedState.remaining() + existingState.remaining() + 4);
states.putShort((short)requestType.ordinal());
states.putShort((short)existingState.remaining());
states.put(existingState);
states.put(proposedState);
states.flip();
return states;
}
private StateChangeRequest getExistingAndProposedBuffersFromResultsNode(byte oldAndNewState[]) {
// Either the barrier or the state node should have the current state
assert(oldAndNewState != null);
ByteBuffer states = ByteBuffer.wrap(oldAndNewState).asReadOnlyBuffer();
// Maximum state size is 64K
REQUEST_TYPE requestType = REQUEST_TYPE.values()[states.getShort()];
int oldStateSize = states.getShort();
states.position(oldStateSize+4);
ByteBuffer proposedState = states.slice();
states.flip(); // just the old state
states.position(4);
states.limit(oldStateSize+4);
return new StateChangeRequest(requestType, states.slice(), proposedState);
}
private SmiCallable checkForBarrierParticipantsChange() throws KeeperException {
assert(debugIsLocalStateLocked());
try {
m_participantsChangePending = false;
int newParticipantCnt = m_zk.getChildren(m_barrierParticipantsPath, m_barrierParticipantsWatcher).size();
Stat nodeStat = new Stat();
// inspect the m_barrierResultsPath and get the new and old states and version
// At some point this can be optimized to not examine the proposal all the time
byte statePair[] = m_zk.getData(m_barrierResultsPath, false, nodeStat);
int proposalVersion = nodeStat.getVersion();
if (proposalVersion != m_lastProposalVersion) {
m_lastProposalVersion = proposalVersion;
m_currentParticipants = newParticipantCnt;
if (!m_stateChangeInitiator) {
assert(m_pendingProposal == null);
// This is an indication that a new state change is being requested
StateChangeRequest existingAndProposedStates = getExistingAndProposedBuffersFromResultsNode(statePair);
m_currentRequestType = existingAndProposedStates.m_requestType;
if (m_requestedInitialState != null) {
// Since we have not initialized yet, we acknowledge this proposal with an empty result.
addResultEntry(null);
}
else {
// Don't add ourselves as a participant because we don't care about the results
REQUEST_TYPE type = m_currentRequestType;
if (type == REQUEST_TYPE.LAST_CHANGE_OUTCOME_REQUEST) {
// The request is from a new member who found the results in an ambiguous state.
byte result[] = new byte[1];
if (existingAndProposedStates.m_proposal.equals(m_synchronizedState)) {
result[0] = (byte)1;
}
else {
assert(existingAndProposedStates.m_previousState.equals(m_synchronizedState));
result[0] = (byte)0;
}
addResultEntry(result);
}
else {
// We track the number of people waiting on the results so we know when the result is stale and
// the next lock holder can initiate a new state proposal.
m_zk.create(m_myParticipantPath, null, Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL);
// increment the participant count by 1, so that lock notifications can be correctly guarded
m_currentParticipants += 1;
m_pendingProposal = existingAndProposedStates.m_proposal;
ByteBuffer proposedState = m_pendingProposal.asReadOnlyBuffer();
assert (existingAndProposedStates.m_previousState.equals(m_synchronizedState)) : String
.format("%s: %s proposed previous: %s, actual previous: %s", m_stateMachineId,
type, stateToString(existingAndProposedStates.m_previousState),
stateToString(m_synchronizedState));
if (m_log.isDebugEnabled()) {
if (type == REQUEST_TYPE.STATE_CHANGE_REQUEST) {
m_log.debug(m_stateMachineId + ": Received new State proposal " +
stateToString(proposedState.asReadOnlyBuffer()));
}
else {
m_log.debug(m_stateMachineId + ": Received new Task request " +
taskToString(proposedState.asReadOnlyBuffer()));
}
}
return () -> {
if (type == REQUEST_TYPE.STATE_CHANGE_REQUEST) {
try {
stateChangeProposed(proposedState);
} catch (Exception e) {
if (m_log.isDebugEnabled()) {
m_log.debug("Error in StateMachineInstance callbacks.", e);
}
m_initializationCompleted = false;
submitCallable(new CallbackExceptionHandler(this));
}
}
else {
try {
taskRequested(proposedState);
} catch (Exception e) {
if (m_log.isDebugEnabled()) {
m_log.debug("Error in StateMachineInstance callbacks.", e);
}
m_initializationCompleted = false;
submitCallable(new CallbackExceptionHandler(this));
}
}
};
}
}
}
else {
// We initiated a proposal and if it is a TASK we need to call our derived object to kick it off
assert(m_pendingProposal != null);
if (m_currentRequestType == REQUEST_TYPE.CORRELATED_COORDINATED_TASK ||
m_currentRequestType == REQUEST_TYPE.UNCORRELATED_COORDINATED_TASK) {
// This is a Task request made by us so notify the derived state machine to perform the task
// provide a result.
ByteBuffer taskRequest = m_pendingProposal.asReadOnlyBuffer();
return () -> {
try {
taskRequested(taskRequest);
} catch (Exception e) {
if (m_log.isDebugEnabled()) {
m_log.debug("Error in StateMachineInstance callbacks.", e);
}
m_initializationCompleted = false;
submitCallable(new CallbackExceptionHandler(this));
}
};
}
}
}
else {
m_currentParticipants = newParticipantCnt;
if (canObtainDistributedLock()) {
// We can finally notify the lock waiter because everyone is finished evaluating the previous state proposal
return notifyDistributedLockWaiter();
}
}
} catch (KeeperException.SessionExpiredException | KeeperException.ConnectionLossException
| InterruptedException e) {
// Lost the connection or interrupted for shutdown
if (m_log.isDebugEnabled()) {
m_log.debug(m_stateMachineId + ": Received " + e.getClass().getSimpleName()
+ " in checkForBarrierParticipantsChange");
}
}
return null;
}
/**
* Utility method to validate that all of the conditions are met to be able to obtain the distributed lock
*
* @return {@code true} if this host is able to obtain the distributed lock
*/
private boolean canObtainDistributedLock() {
return m_ourDistributedLockName != null && !m_participantsChangePending
&& m_ourDistributedLockName.equals(m_lockWaitingOn) && m_currentParticipants == 0;
}
private void monitorParticipantChanges() throws KeeperException {
// always start checking for participation changes after the result notifications
// or initialization notifications to ensure these notifications happen before lock
// ownership notifications.
SmiCallable outOfLockWork;
try (Mutex.Releaser r = m_mutex.acquire()) {
outOfLockWork = checkForBarrierParticipantsChange();
}
if (outOfLockWork != null) {
outOfLockWork.call();
}
}
private RESULT_CONCENSUS resultsAgreeOnSuccess(Set memberList)
throws KeeperException, InterruptedException {
boolean agree = false;
for (String memberId : memberList) {
byte result[];
try {
result = m_zk.getData(ZKUtil.joinZKPath(m_barrierResultsPath, memberId), false, null);
if (m_log.isDebugEnabled()) {
m_log.debug(String.format("%s: Checking result from member %s: %s", m_stateMachineId, memberId,
Arrays.toString(result)));
}
if (result != null) {
if (result[0] == 0) {
return RESULT_CONCENSUS.DISAGREE;
}
agree = true;
}
} catch (NoNodeException ke) {
// This can happen when a new member joins and other members detect the new member before
// it's initialization code is called and a Null result is supplied to treat this as a null result.
if (m_log.isDebugEnabled()) {
m_log.debug(String.format("%s: Could not find a result from member %s", m_stateMachineId,
memberId));
}
}
}
if (agree) {
return RESULT_CONCENSUS.AGREE;
}
return RESULT_CONCENSUS.NO_QUORUM;
}
private ArrayList getUncorrelatedResults(ByteBuffer taskRequest, Set memberList)
throws KeeperException {
// Treat ZooKeeper failures as empty result
ArrayList results = new ArrayList();
try {
for (String memberId : memberList) {
byte result[];
result = m_zk.getData(ZKUtil.joinZKPath(m_barrierResultsPath, memberId), false, null);
if (result != null) {
ByteBuffer bb = ByteBuffer.wrap(result);
results.add(bb);
if (m_log.isDebugEnabled()) {
m_log.debug(m_stateMachineId + ": " + memberId + " reports Result " +
taskResultToString(taskRequest.asReadOnlyBuffer(), bb.asReadOnlyBuffer()));
}
}
else {
if (m_log.isDebugEnabled()) {
m_log.debug(m_stateMachineId + ": " + memberId + " did not supply a Task Result");
}
}
}
// Remove ourselves from the participants list to unblock the next distributed lock waiter
m_zk.delete(m_myParticipantPath, -1);
} catch (KeeperException.SessionExpiredException | KeeperException.ConnectionLossException
| InterruptedException e) {
results = new ArrayList();
if (m_log.isDebugEnabled()) {
m_log.debug(m_stateMachineId + ": Received " + e.getClass().getSimpleName()
+ " in getUncorrelatedResults");
}
}
return results;
}
private Map getCorrelatedResults(ByteBuffer taskRequest, Set memberList)
throws KeeperException {
// Treat ZooKeeper failures as empty result
Map results = new HashMap();
try {
for (String memberId : memberList) {
byte result[];
result = m_zk.getData(ZKUtil.joinZKPath(m_barrierResultsPath, memberId), false, null);
if (result != null) {
ByteBuffer bb = ByteBuffer.wrap(result);
results.put(memberId, bb);
if (m_log.isDebugEnabled()) {
m_log.debug(m_stateMachineId + ": " + memberId + " reports Result " +
taskResultToString(taskRequest.asReadOnlyBuffer(), bb.asReadOnlyBuffer()));
}
}
else {
if (m_log.isDebugEnabled()) {
m_log.debug(m_stateMachineId + ": " + memberId + " did not supply a Task Result");
}
// Signal the caller that a member didn't answer
results.put(memberId, null);
}
}
// Remove ourselves from the participants list to unblock the next distributed lock waiter
m_zk.delete(m_myParticipantPath, -1);
} catch (KeeperException.SessionExpiredException | KeeperException.ConnectionLossException
| InterruptedException e) {
results = new HashMap();
if (m_log.isDebugEnabled()) {
m_log.debug(m_stateMachineId + ": Received " + e.getClass().getSimpleName()
+ " in getCorrelatedResults");
}
}
return results;
}
// The number of results is a superset of the membership so analyze the results
private SmiCallable processResultQuorum(Set memberList) throws KeeperException {
assert(m_currentRequestType != REQUEST_TYPE.INITIALIZING);
m_memberResults = null;
if (m_requestedInitialState != null) {
// We can now initialize this state machine instance
assert(m_holdingDistributedLock);
try {
assert(m_synchronizedState == null);
Stat versionInfo = new Stat();
byte oldAndProposedState[] = m_zk.getData(m_barrierResultsPath, false, versionInfo);
assert(versionInfo.getVersion() == m_lastProposalVersion);
StateChangeRequest existingAndProposedStates =
getExistingAndProposedBuffersFromResultsNode(oldAndProposedState);
m_currentRequestType = existingAndProposedStates.m_requestType;
// We are waiting to initialize. We are here either because we are piggy-backing
// another member's proposal or because we initiated a LAST_CHANGE_OUTCOME_REQUEST
if (m_currentRequestType == REQUEST_TYPE.LAST_CHANGE_OUTCOME_REQUEST) {
// We don't have a result yet but it is available
RESULT_CONCENSUS result = resultsAgreeOnSuccess(memberList);
if (result == RESULT_CONCENSUS.NO_QUORUM) {
// Bad news. There is no definitive state so if we are the distributed
// lock owner we will assign our initial state as our initial state
if (m_stateChangeInitiator) {
m_synchronizedState = m_requestedInitialState;
ByteBuffer stableState = buildProposal(REQUEST_TYPE.INITIALIZING,
m_synchronizedState.asReadOnlyBuffer(), m_synchronizedState.asReadOnlyBuffer());
Stat newProposalStat = m_zk.setData(m_barrierResultsPath, stableState.array(), -1);
m_lastProposalVersion = newProposalStat.getVersion();
}
}
else {
// There was at least one member that knows the definitive state
if (result == RESULT_CONCENSUS.AGREE) {
m_synchronizedState = existingAndProposedStates.m_proposal;
}
else {
m_synchronizedState = existingAndProposedStates.m_previousState;
}
}
}
else
if (m_currentRequestType == REQUEST_TYPE.STATE_CHANGE_REQUEST) {
// We don't have a result yet but it is available
RESULT_CONCENSUS result = resultsAgreeOnSuccess(memberList);
if (result == RESULT_CONCENSUS.AGREE) {
m_synchronizedState = existingAndProposedStates.m_proposal;
}
else {
assert(result == RESULT_CONCENSUS.DISAGREE);
m_synchronizedState = existingAndProposedStates.m_previousState;
}
}
else {
// If we are processing a TASK or a updated STABLE result, we don't need a quorum
m_synchronizedState = existingAndProposedStates.m_previousState;
}
// Remove ourselves from the participants list to unblock the next distributed lock waiter
m_zk.delete(m_myParticipantPath, -1);
} catch (KeeperException.SessionExpiredException | KeeperException.ConnectionLossException
| InterruptedException e) {
if (m_log.isDebugEnabled()) {
m_log.debug(m_stateMachineId + ": Received " + e.getClass().getSimpleName()
+ " in processResultQuorum");
}
}
if (m_stateChangeInitiator) {
m_stateChangeInitiator = false;
cancelDistributedLock();
}
if (m_synchronizedState != null) {
ByteBuffer readOnlyResult = m_synchronizedState.asReadOnlyBuffer();
// We were not yet participating in the state machine so but now we can
m_requestedInitialState = null;
m_pendingProposal = null;
if (m_log.isDebugEnabled()) {
m_log.debug(m_stateMachineId + ": Initialized (concensus) with State " +
stateToString(m_synchronizedState.asReadOnlyBuffer()));
}
m_initializationCompleted = true;
return () -> {
try {
setInitialState(readOnlyResult);
} catch (Exception e) {
if (m_log.isDebugEnabled()) {
m_log.debug("Error in StateMachineInstance callbacks.", e);
}
m_initializationCompleted = false;
submitCallable(new CallbackExceptionHandler(this));
}
if (m_initializationCompleted) {
// If we are ready to provide an initial state to the derived state machine, add us to
// participants watcher so we can see the next request
monitorParticipantChanges();
}
};
}
}
else {
assert(m_currentRequestType != REQUEST_TYPE.LAST_CHANGE_OUTCOME_REQUEST);
boolean initiator = m_stateChangeInitiator;
boolean success = false;
if (m_currentRequestType == REQUEST_TYPE.STATE_CHANGE_REQUEST) {
// Treat ZooKeeper failures as unsuccessful proposal
ByteBuffer attemptedChange = m_pendingProposal.asReadOnlyBuffer();
try {
// We don't have a result yet but it is available
RESULT_CONCENSUS result = resultsAgreeOnSuccess(memberList);
// Now that we have a result we can remove ourselves from the participants list.
m_zk.delete(m_myParticipantPath, -1);
if (result == RESULT_CONCENSUS.AGREE) {
success = true;
m_synchronizedState = m_pendingProposal;
}
else {
assert(result == RESULT_CONCENSUS.DISAGREE);
}
m_pendingProposal = null;
if (m_stateChangeInitiator) {
// Since we don't care if we are the last to go away, remove ourselves from the participant list
assert(m_holdingDistributedLock);
m_stateChangeInitiator = false;
cancelDistributedLock();
}
} catch (KeeperException.SessionExpiredException | KeeperException.ConnectionLossException
| InterruptedException e) {
success = false;
if (m_log.isDebugEnabled()) {
m_log.debug(m_stateMachineId + ": Received " + e.getClass().getSimpleName()
+ " in processResultQuorum");
}
}
if (m_log.isDebugEnabled()) {
m_log.debug(m_stateMachineId + ": Proposed state " + (success?"succeeded ":"failed ") +
stateToString(attemptedChange.asReadOnlyBuffer()));
}
final boolean finalSuccess = success;
return () -> {
// Notify the derived state machine engine of the current state
try {
proposedStateResolved(initiator, attemptedChange, finalSuccess);
} catch (Exception e) {
if (m_log.isDebugEnabled()) {
m_log.debug("Error in StateMachineInstance callbacks.", e);
}
m_initializationCompleted = false;
submitCallable(new CallbackExceptionHandler(this));
}
if (m_initializationCompleted) {
monitorParticipantChanges();
}
};
}
else {
// Process the results of a TASK request
ByteBuffer taskRequest = m_pendingProposal.asReadOnlyBuffer();
m_pendingProposal = null;
if (m_log.isDebugEnabled()) {
m_log.debug(m_stateMachineId + ": All members completed task " + taskToString(taskRequest.asReadOnlyBuffer()));
}
if (m_currentRequestType == REQUEST_TYPE.CORRELATED_COORDINATED_TASK) {
Map results = getCorrelatedResults(taskRequest, memberList);
if (m_stateChangeInitiator) {
// Since we don't care if we are the last to go away, remove ourselves from the participant
// list
assert (m_holdingDistributedLock);
m_stateChangeInitiator = false;
cancelDistributedLock();
}
return () -> {
try {
correlatedTaskCompleted(initiator, taskRequest, results);
} catch (Exception e) {
if (m_log.isDebugEnabled()) {
m_log.debug("Error in StateMachineInstance callbacks.", e);
}
m_initializationCompleted = false;
submitCallable(new CallbackExceptionHandler(this));
}
if (m_initializationCompleted) {
monitorParticipantChanges();
}
};
}
else {
ArrayList results = getUncorrelatedResults(taskRequest, memberList);
if (m_stateChangeInitiator) {
// Since we don't care if we are the last to go away, remove ourselves from the participant list
assert(m_holdingDistributedLock);
m_stateChangeInitiator = false;
cancelDistributedLock();
}
return () -> {
try {
uncorrelatedTaskCompleted(initiator, taskRequest, results);
} catch (Exception e) {
if (m_log.isDebugEnabled()) {
m_log.debug("Error in StateMachineInstance callbacks.", e);
}
m_initializationCompleted = false;
submitCallable(new CallbackExceptionHandler(this));
}
if (m_initializationCompleted) {
monitorParticipantChanges();
}
};
}
}
}
return null;
}
private SmiCallable checkForBarrierResultsChanges() throws KeeperException {
assert(debugIsLocalStateLocked());
if (m_pendingProposal == null) {
// Don't check for barrier results until we notice the participant list change.
return null;
}
Set membersWithResults;
try {
membersWithResults = ImmutableSet.copyOf(m_zk.getChildren(m_barrierResultsPath,
m_barrierResultsWatcher));
} catch (KeeperException.SessionExpiredException | KeeperException.ConnectionLossException
| InterruptedException e) {
membersWithResults = new TreeSet(Arrays.asList("We died so avoid Quorum path"));
if (m_log.isDebugEnabled()) {
m_log.debug(m_stateMachineId + ": Received " + e.getClass().getSimpleName()
+ " in checkForBarrierResultsChanges");
}
}
if (Sets.difference(m_knownMembers, membersWithResults).isEmpty()) {
return processResultQuorum(membersWithResults);
}
m_memberResults = membersWithResults;
return null;
}
/*
* Assumes this state machine owns the distributed lock and can either interrogate the existing state
* or request the current state from the community (if the existing state is ambiguous)
*/
private SmiCallable initializeFromActiveCommunity() throws KeeperException {
byte oldAndProposedState[];
try {
Stat lastProposal = new Stat();
oldAndProposedState = m_zk.getData(m_barrierResultsPath, false, lastProposal);
StateChangeRequest existingAndProposedStates =
getExistingAndProposedBuffersFromResultsNode(oldAndProposedState);
if (existingAndProposedStates.m_requestType == REQUEST_TYPE.LAST_CHANGE_OUTCOME_REQUEST) {
RESULT_CONCENSUS result = resultsAgreeOnSuccess(m_knownMembers);
if (result == RESULT_CONCENSUS.AGREE) {
// Fall through to set up the initial state and provide notification of initialization
existingAndProposedStates = new StateChangeRequest(REQUEST_TYPE.INITIALIZING,
existingAndProposedStates.m_proposal.asReadOnlyBuffer(),
existingAndProposedStates.m_proposal.asReadOnlyBuffer());
}
else
if (result == RESULT_CONCENSUS.DISAGREE) {
// Fall through to set up the initial state and provide notification of initialization
existingAndProposedStates = new StateChangeRequest(REQUEST_TYPE.INITIALIZING,
existingAndProposedStates.m_previousState.asReadOnlyBuffer(),
existingAndProposedStates.m_previousState.asReadOnlyBuffer());
}
else {
// Another members outcome request was completed but a subsequent proposing member died
// between the time the results of this last request were removed and the new proposal
// was made. Fall through and propose a new LAST_CHANGE_OUTCOME_REQUEST.
existingAndProposedStates = new StateChangeRequest(REQUEST_TYPE.STATE_CHANGE_REQUEST,
existingAndProposedStates.m_previousState.asReadOnlyBuffer(),
existingAndProposedStates.m_proposal.asReadOnlyBuffer());
}
}
if (existingAndProposedStates.m_requestType == REQUEST_TYPE.STATE_CHANGE_REQUEST) {
// The last lock owner died before completing a state change or determining the current state
m_stateChangeInitiator = true;
m_pendingProposal = m_requestedInitialState;
m_currentRequestType = REQUEST_TYPE.LAST_CHANGE_OUTCOME_REQUEST;
ByteBuffer stateChange = buildProposal(REQUEST_TYPE.LAST_CHANGE_OUTCOME_REQUEST,
existingAndProposedStates.m_previousState, existingAndProposedStates.m_proposal);
m_lastProposalVersion = wakeCommunityWithProposal(stateChange.array());
addResultEntry(null);
return checkForBarrierResultsChanges();
}
else {
assert(existingAndProposedStates.m_requestType == REQUEST_TYPE.INITIALIZING ||
existingAndProposedStates.m_requestType == REQUEST_TYPE.CORRELATED_COORDINATED_TASK ||
existingAndProposedStates.m_requestType == REQUEST_TYPE.UNCORRELATED_COORDINATED_TASK);
m_synchronizedState = existingAndProposedStates.m_previousState;
m_requestedInitialState = null;
ByteBuffer readOnlyResult = m_synchronizedState.asReadOnlyBuffer();
m_lastProposalVersion = lastProposal.getVersion();
m_pendingProposal = null;
if (m_log.isDebugEnabled()) {
m_log.debug(m_stateMachineId + ": Initialized (existing) with State " +
stateToString(m_synchronizedState.asReadOnlyBuffer()));
}
ByteBuffer staleTask;
if (existingAndProposedStates.m_requestType != REQUEST_TYPE.INITIALIZING) {
staleTask = existingAndProposedStates.m_proposal.asReadOnlyBuffer();
} else {
staleTask = null;
}
m_initializationCompleted = true;
cancelDistributedLock();
// Add an acceptable result so the next initializing member recognizes an immediate quorum.
m_lockWaitingOn = "bogus"; // Avoids call to notifyDistributedLockWaiter below
return new ChainedCallable(checkForBarrierParticipantsChange()) {
@Override
public void callImpl() {
// Notify the derived object that we have a stable state
try {
setInitialState(readOnlyResult);
} catch (Exception e) {
if (m_log.isDebugEnabled()) {
m_log.debug("Error in StateMachineInstance callbacks.", e);
}
m_initializationCompleted = false;
submitCallable(new CallbackExceptionHandler(StateMachineInstance.this));
}
if (staleTask != null) {
try {
staleTaskRequestNotification(staleTask);
} catch (Exception e) {
if (m_log.isDebugEnabled()) {
m_log.debug("Error in StateMachineInstance callbacks.", e);
}
m_initializationCompleted = false;
submitCallable(new CallbackExceptionHandler(StateMachineInstance.this));
}
}
}
};
}
} catch (KeeperException.SessionExpiredException | KeeperException.ConnectionLossException
| InterruptedException e) {
if (m_log.isDebugEnabled()) {
m_log.debug(m_stateMachineId + ": Received " + e.getClass().getSimpleName()
+ " in initializeFromActiveCommunity");
}
}
return null;
}
private int wakeCommunityWithProposal(byte[] proposal) throws KeeperException {
assert(m_holdingDistributedLock);
assert(m_currentParticipants == 0);
int newProposalVersion = -1;
try {
List results = m_zk.getChildren(m_barrierResultsPath, false);
for (String resultNode : results) {
try {
m_zk.delete(ZKUtil.joinZKPath(m_barrierResultsPath, resultNode), -1);
}
catch(KeeperException.NoNodeException e) {
if (m_log.isDebugEnabled()) {
m_log.debug(m_stateMachineId + ": Skipped externally deleted " +
"barrier child node in wakeCommunityWithProposal");
}
}
}
Stat newProposalStat = m_zk.setData(m_barrierResultsPath, proposal, -1);
m_zk.create(m_myParticipantPath, null, Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL);
newProposalVersion = newProposalStat.getVersion();
// force the participant count to be 1, so that lock notifications can be correctly guarded
m_currentParticipants = 1;
} catch (KeeperException.SessionExpiredException | KeeperException.ConnectionLossException
| InterruptedException e) {
if (m_log.isDebugEnabled()) {
m_log.debug(m_stateMachineId + ": Received " + e.getClass().getSimpleName()
+ " in wakeCommunityWithProposal");
}
}
return newProposalVersion;
}
private final Callable HandlerForBarrierParticipantsEvent = new Callable() {
@Override
public Void call() throws KeeperException {
SmiCallable outOfLockWork;
try (Mutex.Releaser r = m_mutex.acquire()) {
if (!isRunning()) {
return null;
}
outOfLockWork = checkForBarrierParticipantsChange();
}
if (outOfLockWork != null) {
outOfLockWork.call();
}
return null;
}
};
private final Callable HandlerForBarrierResultsEvent = new Callable() {
@Override
public Void call() throws KeeperException {
SmiCallable outOfLockWork;
try (Mutex.Releaser r = m_mutex.acquire()) {
if (!isRunning()) {
return null;
}
outOfLockWork = checkForBarrierResultsChanges();
}
if (outOfLockWork != null) {
outOfLockWork.call();
}
return null;
}
};
private String getNextLockNodeFromList() throws KeeperException, InterruptedException {
List lockRequestors = m_zk.getChildren(m_lockPath, false);
Collections.sort(lockRequestors);
ListIterator iter = lockRequestors.listIterator();
String currRequestor = null;
while (iter.hasNext()) {
currRequestor = ZKUtil.joinZKPath(m_lockPath, iter.next());
if (currRequestor.equals(m_ourDistributedLockName)) {
break;
}
}
assert (currRequestor != null); // We should be able to find ourselves
//Back on currRequestor
iter.previous();
String nextLower = null;
//Until we have previous nodes and we set a watch on previous node.
while (iter.hasPrevious()) {
//Process my lower nodes and put a watch on whats live
String previous = ZKUtil.joinZKPath(m_lockPath, iter.previous());
if (m_zk.exists(previous, m_lockWatcher) != null) {
if (m_log.isDebugEnabled()) {
m_log.debug(m_stateMachineId + ": " + m_ourDistributedLockName + " waiting on " + previous);
}
nextLower = previous;
break;
}
}
//If we could not watch any lower node we are lowest and must own the lock.
if (nextLower == null) {
return m_ourDistributedLockName;
}
return nextLower;
}
private final Callable HandlerForDistributedLockEvent = new Callable() {
@Override
public Void call() throws KeeperException {
SmiCallable outOfLockWork = null;
try (Mutex.Releaser r = m_mutex.acquire()) {
if (!isRunning()) {
return null;
}
if (m_ourDistributedLockName != null) {
try {
m_lockWaitingOn = getNextLockNodeFromList();
} catch (KeeperException.SessionExpiredException | KeeperException.ConnectionLossException
| InterruptedException e) {
if (m_log.isDebugEnabled()) {
m_log.debug(m_stateMachineId + ": Received " + e.getClass().getSimpleName()
+ " in HandlerForDistributedLockEvent");
}
m_lockWaitingOn = "We died so we can't ever get the distributed lock";
}
if (canObtainDistributedLock()) {
// There are no more members still processing the last result
outOfLockWork = notifyDistributedLockWaiter();
}
}
}
if (outOfLockWork != null) {
outOfLockWork.call();
}
return null;
}
};
private void cancelDistributedLock() throws KeeperException {
assert(debugIsLocalStateLocked());
if (m_log.isDebugEnabled()) {
m_log.debug(m_stateMachineId + ": cancelLockRequest for " + m_ourDistributedLockName);
}
assert(m_holdingDistributedLock);
try {
m_zk.delete(m_ourDistributedLockName, -1);
} catch (KeeperException.SessionExpiredException | KeeperException.ConnectionLossException
| InterruptedException e) {
if (m_log.isDebugEnabled()) {
m_log.debug(m_stateMachineId + ": Received " + e.getClass().getSimpleName()
+ " in cancelDistributedLock");
}
}
m_ourDistributedLockName = null;
m_holdingDistributedLock = false;
}
/*
* Warns about membership change notification waiting in the executor thread
*/
private void checkMembership() {
m_membershipChangePending = true;
}
/*
* Callback notification from executor thread of membership change
*/
private void membershipChanged(Set knownHosts, Set addedMembers, Set removedMembers)
throws KeeperException {
boolean notInitializing;
SmiCallable outOfLockWork = null;
try (Mutex.Releaser r = m_mutex.acquire()) {
// Even though we got a direct update, membership could have changed again between the
m_knownMembers = knownHosts;
m_membershipChangePending = false;
notInitializing = m_requestedInitialState == null;
if (m_pendingProposal != null && m_memberResults != null
&& Sets.difference(m_knownMembers, m_memberResults).isEmpty()) {
// We can stop watching for results since we have a quorum.
outOfLockWork = processResultQuorum(m_memberResults);
}
}
if (outOfLockWork != null) {
outOfLockWork.call();
}
if (notInitializing) {
try {
membershipChanged(addedMembers, removedMembers);
} catch (Exception e) {
if (m_log.isDebugEnabled()) {
m_log.debug("Error in StateMachineInstance callbacks.", e);
}
m_initializationCompleted = false;
submitCallable(new CallbackExceptionHandler(this));
}
}
}
/*
* Retrieves member set from ZooKeeper
*/
private void getLatestMembership() throws KeeperException {
try {
m_knownMembers = ImmutableSet.copyOf(m_zk.getChildren(m_stateMachineMemberPath, null));
if (m_log.isDebugEnabled()) {
m_log.debug(String.format("%s: getLatestMembership Updating known members to: %s", m_stateMachineId,
m_knownMembers));
}
} catch (KeeperException.SessionExpiredException | KeeperException.ConnectionLossException
| InterruptedException e) {
if (m_log.isDebugEnabled()) {
m_log.debug(m_stateMachineId + ": Received " + e.getClass().getSimpleName()
+ " in getLatestMembership");
}
}
}
private SmiCallable notifyDistributedLockWaiter() throws KeeperException {
assert(debugIsLocalStateLocked());
assert(m_currentParticipants == 0);
m_holdingDistributedLock = true;
if (m_membershipChangePending) {
getLatestMembership();
}
if (m_requestedInitialState != null) {
// We are still waiting to initialize the state. Now that we have have the state lock we can
// look at the last set of results. We need to set proposedState so results will be checked.
assert(m_pendingProposal == null);
m_pendingProposal = m_requestedInitialState;
return initializeFromActiveCommunity();
}
else {
if (m_log.isDebugEnabled()) {
m_log.debug(m_stateMachineId + ": Granted lockRequest for " + m_ourDistributedLockName);
}
m_lockWaitingOn = null;
// Notify the derived class that the lock is available
return () -> {
try {
lockRequestCompleted();
} catch (Exception e) {
cancelLockRequest();
if (m_log.isDebugEnabled()) {
m_log.debug("Error in StateMachineInstance callbacks.", e);
}
m_initializationCompleted = false;
submitCallable(new CallbackExceptionHandler(this));
}
};
}
}
private boolean requestDistributedLock() throws KeeperException {
try {
if (m_ourDistributedLockName != null) {
m_log.error(m_stateMachineId + ": Requested distributed lock before prior state change or task has been completed");
return false;
}
assert(debugIsLocalStateLocked());
m_ourDistributedLockName = m_zk.create(ZKUtil.joinZKPath(m_lockPath, "lock_"), null,
Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL_SEQUENTIAL);
m_lockWaitingOn = getNextLockNodeFromList();
if (canObtainDistributedLock()) {
// Prevents a second notification.
if (m_log.isDebugEnabled()) {
m_log.debug(m_stateMachineId + ": requestLock successful for " + m_ourDistributedLockName);
}
m_lockWaitingOn = null;
m_holdingDistributedLock = true;
if (m_membershipChangePending) {
getLatestMembership();
}
return true;
}
} catch (KeeperException.SessionExpiredException | KeeperException.ConnectionLossException
| InterruptedException e) {
if (m_log.isDebugEnabled()) {
m_log.debug(m_stateMachineId + ": Received " + e.getClass().getSimpleName()
+ " in requestDistributedLock");
}
}
return false;
}
private void addResultEntry(byte result[]) throws KeeperException {
if (m_log.isDebugEnabled()) {
m_log.debug(String.format("%s: Responding to request %d:%s with result: %s", m_stateMachineId,
m_lastProposalVersion, m_currentRequestType,
(result == null || result.length < 10 || m_log.isTraceEnabled()) ? Arrays.toString(result)
: "suppressed"));
}
try {
m_zk.create(m_myResultPath, result, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
} catch (KeeperException.SessionExpiredException | KeeperException.ConnectionLossException
| InterruptedException e) {
if (m_log.isDebugEnabled()) {
m_log.debug(m_stateMachineId + ": Received " + e.getClass().getSimpleName() + " in addResultEntry");
}
} catch (KeeperException.NodeExistsException e) {
// There is a race during initialization where a null result is assigned once so a current proposal
// is not hung. However, if a new proposal is submitted by another instance between that assignment
// and the call to checkForBarrierParticipantsChange, a second null result is assigned.
if (m_requestedInitialState == null || result != null) {
byte[] previousResult = null;
try {
previousResult = m_zk.getData(m_myResultPath, null, null);
} catch (Exception e1) {
m_log.error("Failed to retrieve existing result", e1);
}
org.voltdb.VoltDB.crashLocalVoltDB(
"Unexpected failure in StateMachine; Two results created for one proposal.: "
+ Arrays.toString(previousResult),
true, e);
}
}
}
private SmiCallable assignStateChangeAgreement(boolean acceptable) throws KeeperException {
assert(debugIsLocalStateLocked());
assert(m_pendingProposal != null);
assert(m_currentRequestType == REQUEST_TYPE.STATE_CHANGE_REQUEST);
byte result[] = { (byte) (acceptable ? 1 : 0) };
addResultEntry(result);
if (acceptable) {
// Since we are only interested in the results when we agree with the proposal
return checkForBarrierResultsChanges();
}
else {
m_pendingProposal = null;
try {
// Since we don't care about the outcome remove ourself from the participant list
m_zk.delete(m_myParticipantPath, -1);
} catch (KeeperException.SessionExpiredException | KeeperException.ConnectionLossException
| InterruptedException e) {
if (m_log.isDebugEnabled()) {
m_log.debug(m_stateMachineId + ": Received " + e.getClass().getSimpleName()
+ " in assignStateChangeAgreement");
}
}
}
return null;
}
/*
* Returns true when this state machine is a full participant of the community and has the
* current state of the community
*/
protected boolean isInitialized() {
try (Mutex.Releaser r = m_mutex.acquire()) {
return m_initializationCompleted && m_requestedInitialState == null;
}
}
/*
* Notifies of full participation and the current agreed state
* warning: The ByteBuffer is not guaranteed to start at position 0 (avoid rewind, flip, ...)
*/
protected abstract void setInitialState(ByteBuffer currentAgreedState);
/*
* Attempts to get the distributed lock. Returns true if the lock was acquired
*/
protected boolean requestLock() {
try (Mutex.Releaser r = m_mutex.acquire()) {
if (m_initializationCompleted) {
return requestDistributedLock();
}
} catch (Exception e) {
VoltDB.crashLocalVoltDB("SSM: Unexpected error encountered", true, e);
}
return false;
}
/*
* Cancels an outstanding lock request. Should only be used if a previously desirable
* proposal is now obsolete (ie. use when lockRequestCompleted is called and a proposal
* is no longer necessary).
*/
protected void cancelLockRequest() {
try (Mutex.Releaser r = m_mutex.acquire()) {
if (m_initializationCompleted) {
assert (m_pendingProposal == null);
cancelDistributedLock();
}
} catch (Exception e) {
VoltDB.crashLocalVoltDB("SSM: Unexpected error encountered", true, e);
}
}
/*
* Notifies of the successful completion of a previous lock request
*/
protected void lockRequestCompleted() {}
/*
* Propose a new state. Only call after successful acquisition of the distributed lock.
*/
protected void proposeStateChange(ByteBuffer proposedState) {
assert m_holdingDistributedLock;
assert(proposedState != null);
assert(proposedState.remaining() < Short.MAX_VALUE);
try {
SmiCallable outOfLockWork;
try (Mutex.Releaser r = m_mutex.acquire()) {
if (!m_initializationCompleted) {
return;
}
// Only the lock owner can initiate a barrier request
assert (m_requestedInitialState == null);
if (proposedState.position() == 0) {
m_pendingProposal = proposedState;
} else {
// Move to a new 0 aligned buffer
m_pendingProposal = ByteBuffer.allocate(proposedState.remaining());
m_pendingProposal.put(proposedState.array(),
proposedState.arrayOffset() + proposedState.position(), proposedState.remaining());
m_pendingProposal.flip();
}
if (m_log.isDebugEnabled()) {
m_log.debug(m_stateMachineId + ": Proposing new state "
+ stateToString(m_pendingProposal.asReadOnlyBuffer()));
}
m_stateChangeInitiator = true;
m_currentRequestType = REQUEST_TYPE.STATE_CHANGE_REQUEST;
ByteBuffer stateChange = buildProposal(REQUEST_TYPE.STATE_CHANGE_REQUEST,
m_synchronizedState.asReadOnlyBuffer(), m_pendingProposal.asReadOnlyBuffer());
m_lastProposalVersion = wakeCommunityWithProposal(stateChange.array());
outOfLockWork = assignStateChangeAgreement(true);
}
if (outOfLockWork != null) {
outOfLockWork.call();
}
} catch (Exception e) {
throw VoltDB.crashLocalVoltDB("SSM: Unexpected error encountered", true, e);
}
}
/*
* Notification of a new proposed state change by another member.
* warning: The ByteBuffer is not guaranteed to start at position 0 (avoid rewind, flip, ...)
*/
protected void stateChangeProposed(ByteBuffer proposedState) {}
/*
* Called to accept or reject a new proposed state change by another member.
*/
protected void requestedStateChangeAcceptable(boolean acceptable) {
try {
SmiCallable outOfLockWork;
try (Mutex.Releaser r = m_mutex.acquire()) {
if (!m_initializationCompleted) {
return;
}
assert (!m_stateChangeInitiator);
if (m_log.isDebugEnabled()) {
m_log.debug(m_stateMachineId
+ (acceptable ? ": Agrees with State proposal" : ": Disagrees with State proposal"));
}
outOfLockWork = assignStateChangeAgreement(acceptable);
}
if (outOfLockWork != null) {
outOfLockWork.call();
}
} catch (Exception e) {
VoltDB.crashLocalVoltDB("SSM: Unexpected error encountered", true, e);
}
}
/*
* Notification of the outcome of a previous state change proposal.
* warning: The ByteBuffer is not guaranteed to start at position 0 (avoid rewind, flip, ...)
*/
protected void proposedStateResolved(boolean ourProposal, ByteBuffer proposedState, boolean success) {}
/*
* Propose a new task. Only call after successful acquisition of the distributed lock.
*/
protected void initiateCoordinatedTask(boolean correlated, ByteBuffer proposedTask) {
assert (proposedTask != null);
assert (proposedTask.remaining() < Short.MAX_VALUE);
try (Mutex.Releaser r = m_mutex.acquire()) {
if (m_initializationCompleted) {
// Only the lock owner can initiate a barrier request
assert (m_requestedInitialState == null);
if (proposedTask.position() == 0) {
m_pendingProposal = proposedTask;
} else {
// Move to a new 0 aligned buffer
m_pendingProposal = ByteBuffer.allocate(proposedTask.remaining());
m_pendingProposal.put(proposedTask.array(),
proposedTask.arrayOffset() + proposedTask.position(), proposedTask.remaining());
m_pendingProposal.flip();
}
if (m_log.isDebugEnabled()) {
if (m_pendingProposal.hasRemaining()) {
m_log.debug(m_stateMachineId + ": Requested new Task "
+ taskToString(m_pendingProposal.asReadOnlyBuffer()));
} else {
m_log.debug(m_stateMachineId + ": Requested unspecified new Task");
}
}
m_stateChangeInitiator = true;
m_currentRequestType = correlated ? REQUEST_TYPE.CORRELATED_COORDINATED_TASK
: REQUEST_TYPE.UNCORRELATED_COORDINATED_TASK;
ByteBuffer taskProposal = buildProposal(m_currentRequestType,
m_synchronizedState.asReadOnlyBuffer(), proposedTask.asReadOnlyBuffer());
m_pendingProposal = proposedTask;
// Since we don't update m_lastProposalVersion, we will wake ourselves up
wakeCommunityWithProposal(taskProposal.array());
}
} catch (Exception e) {
VoltDB.crashLocalVoltDB("SSM: Unexpected error encountered", true, e);
}
}
/*
* Notification of a new task request by this member or another member.
* warning: The ByteBuffer is not guaranteed to start at position 0 (avoid rewind, flip, ...)
*/
protected void taskRequested(ByteBuffer proposedTask) {}
/*
* Notification of a task request for newly joined members.
* warning: The ByteBuffer is not guaranteed to start at position 0 (avoid rewind, flip, ...)
*/
protected void staleTaskRequestNotification(ByteBuffer proposedTask) {}
/*
* Called to accept or reject a new proposed state change by another member.
*/
protected void requestedTaskComplete(ByteBuffer result)
{
SmiCallable outOfLockWork;
try {
try (Mutex.Releaser r = m_mutex.acquire()) {
if (!m_initializationCompleted) {
return;
}
assert (m_pendingProposal != null);
if (m_log.isDebugEnabled()) {
if (result.hasRemaining()) {
m_log.debug(m_stateMachineId + ": Local Task completed with result " + taskResultToString(
m_pendingProposal.asReadOnlyBuffer(), result.asReadOnlyBuffer()));
} else {
m_log.debug(m_stateMachineId + ": Local Task completed with empty result");
}
}
addResultEntry(result.array());
outOfLockWork = checkForBarrierResultsChanges();
}
if (outOfLockWork != null) {
outOfLockWork.call();
}
} catch (Exception e) {
VoltDB.crashLocalVoltDB("SSM: Unexpected error encountered", true, e);
}
}
/*
* Notification of the outcome of the task by all members sorted by memberId.
* warning: The ByteBuffer taskRequest is not guaranteed to start at position 0 (avoid rewind, flip, ...)
*/
protected void correlatedTaskCompleted(boolean ourTask, ByteBuffer taskRequest, Map results) {}
/*
* Notification of the outcome of the task by all members.
* warning: The ByteBuffer taskRequest is not guaranteed to start at position 0 (avoid rewind, flip, ...)
*/
protected void uncorrelatedTaskCompleted(boolean ourTask, ByteBuffer taskRequest, List results) {}
/*
* Returns the current State. This request ignores pending proposals or outcomes. And may well be invalid as soon
* as the result is returned if a new proposed state is pending. To avoid this get the Distributed Lock first.
* warning: The ByteBuffer taskRequest is not guaranteed to start at position 0 (avoid rewind, flip, ...)
*/
protected ByteBuffer getCurrentState() {
try (Mutex.Releaser r = m_mutex.acquire()) {
return m_initializationCompleted ? m_synchronizedState.asReadOnlyBuffer() : ByteBuffer.allocate(0);
}
}
protected void membershipChanged(Set addedMembers, Set removedMembers) {}
protected Set getCurrentMembers() {
try (Mutex.Releaser r = m_mutex.acquire()) {
if (m_membershipChangePending) {
getLatestMembership();
}
return ImmutableSet.copyOf(m_knownMembers);
} catch (Exception e) {
throw VoltDB.crashLocalVoltDB("SSM: Unexpected error encountered", true, e);
}
}
protected String getMemberId() {
return m_memberId;
}
/*
* Provides the version of the last received proposal or task.
* Note that the version number is not the number of state transitions but counts the total number of
* successful state transitions, failed state transitions, task requests, and last change outcome requests
*/
protected int getCurrentStateVersion() {
return m_lastProposalVersion;
}
protected boolean holdingDistributedLock() {
try (Mutex.Releaser r = m_mutex.acquire()) {
return m_holdingDistributedLock;
}
}
protected abstract String stateToString(ByteBuffer state);
protected String taskToString(ByteBuffer task) { return ""; }
protected String taskResultToString(ByteBuffer task, ByteBuffer taskResult) { return ""; }
/** Simple class which chains one callable to another */
private abstract class ChainedCallable implements SmiCallable {
private final SmiCallable m_previous;
ChainedCallable(SmiCallable callable) {
assert (debugIsLocalStateLocked());
m_previous = callable;
}
@Override
public final void call() throws KeeperException {
if (m_previous != null) {
m_previous.call();
}
callImpl();
}
protected abstract void callImpl() throws KeeperException;
}
}
public SynchronizedStatesManager(ZooKeeper zk, String rootPath, String ssmNodeName, String memberId)
throws KeeperException, InterruptedException {
this(zk, rootPath, ssmNodeName, memberId, 1);
}
// Used only for Mocking StateMachineInstance
public SynchronizedStatesManager() {
m_zk = null;
m_registeredStateMachines = null;
m_ssmRootNode = "MockRootForZooKeeper";
m_stateMachineRoot = "MockRootForSSM";
m_stateMachineMemberPath = "MockRootMembershipNode";
m_memberId = "MockMemberId";
m_canonical_memberId = "MockCanonicalMemberId";
m_resetAllowance = 5;
m_resetLimit = m_resetAllowance;
}
public SynchronizedStatesManager(ZooKeeper zk, String rootPath, String ssmNodeName, String memberId, int registeredInstances)
throws KeeperException, InterruptedException {
this(zk, rootPath, ssmNodeName, memberId, registeredInstances, 5); // default resetAllowance is 5
}
public SynchronizedStatesManager(ZooKeeper zk, String rootPath, String ssmNodeName, String memberId, int registeredInstances, int resetAllowance)
throws KeeperException, InterruptedException {
m_zk = zk;
// We will not add ourselves as members in ZooKeeper until all StateMachineInstances have registered
m_registeredStateMachines = new StateMachineInstance[registeredInstances];
m_ssmRootNode = ssmNodeName;
m_stateMachineRoot = ZKUtil.joinZKPath(rootPath, ssmNodeName);
ByteBuffer numberOfInstances = ByteBuffer.allocate(4);
numberOfInstances.putInt(registeredInstances);
addIfMissing(m_stateMachineRoot, CreateMode.PERSISTENT, numberOfInstances.array());
m_stateMachineMemberPath = ZKUtil.joinZKPath(m_stateMachineRoot, s_memberNode);
m_canonical_memberId = memberId;
m_resetCounter = 0;
m_resetAllowance = resetAllowance;
m_resetLimit = m_resetAllowance;
m_memberId = m_canonical_memberId + "_v" + m_resetCounter;
}
public void shutdownSynchronizedStatesManager() throws InterruptedException {
ListenableFuture disableComplete = shutdownSynchronizedStatesManagerAsync();
try {
disableComplete.get();
}
catch (ExecutionException e) {
Throwables.throwIfUnchecked(e.getCause());
throw new RuntimeException(e.getCause());
}
}
public ListenableFuture shutdownSynchronizedStatesManagerAsync() {
return s_sharedEs.submit(disableInstances);
}
boolean isRunning() {
return m_state.get() == State.RUNNING;
}
private final Callable disableInstances = new Callable() {
@Override
public Void call() throws KeeperException {
if (ssmLog.isDebugEnabled()) {
ssmLog.debug(m_stateMachineRoot + ": Shutting down");
}
try {
m_state.set(State.SHUTDOWN);
for (StateMachineInstance stateMachine : m_registeredStateMachines) {
stateMachine.disableMembership();
}
m_zk.delete(ZKUtil.joinZKPath(m_stateMachineMemberPath, m_memberId), -1);
} catch (KeeperException.SessionExpiredException | KeeperException.ConnectionLossException
| InterruptedException e) {
// lost the full connection. some test cases do this...
// means zk shutdown without the elector being shutdown.
// ignore.
} catch (KeeperException.NoNodeException e) {
// FIXME: need to investigate why this happens on multinode shutdown
}
return null;
}
};
private final Callable membershipEventHandler = new Callable() {
@Override
public Void call() throws KeeperException {
if (isRunning()) {
try {
checkForMembershipChanges();
} catch (KeeperException.SessionExpiredException | KeeperException.ConnectionLossException
| InterruptedException e) {
// lost the full connection. some test cases do this...
// means shutdown without the elector being
// shutdown; ignore.
}
}
return null;
}
};
private class MembershipWatcher implements Watcher {
@Override
public void process(WatchedEvent event) {
try {
if (isRunning()) {
for (StateMachineInstance stateMachine : m_registeredStateMachines) {
stateMachine.checkMembership();
}
submitCallable(membershipEventHandler);
}
} catch (RejectedExecutionException e) {
ssmLog.warn("ZK watch of Membership change was rejected by the SSM Thread");
}
}
}
private final MembershipWatcher m_membershipWatcher = new MembershipWatcher();
private class AsyncSSMInitializer implements Callable {
final AtomicInteger m_pendingStateMachineInits;
public AsyncSSMInitializer() {
assert(m_initComplete != null && !m_initComplete.isDone());
m_pendingStateMachineInits = new AtomicInteger(m_registeredStateMachineInstances);
}
private void initializationFailed() {
m_state.set(State.ERROR);
m_initComplete.set(false);
}
private boolean noCommonKeeperExceptions(KeeperException.Code code) {
if (code == KeeperException.Code.SESSIONEXPIRED ||
code == KeeperException.Code.CONNECTIONLOSS) {
// lost the full connection. some test cases do this...
// means zk shutdown without the elector being shutdown.
// ignore.
initializationFailed();
return false;
}
else if (code != KeeperException.Code.NODEEXISTS && code != KeeperException.Code.OK) {
org.voltdb.VoltDB.crashLocalVoltDB(
"Unexpected failure (" + code.name() + ") in ZooKeeper while initializeInstances.",
true, null);
}
return true;
}
@Override
public Void call() throws KeeperException {
try {
assert (m_registeredStateMachineInstances == ByteBuffer
.wrap(m_zk.getData(m_stateMachineRoot, false, null)).getInt());
} catch (InterruptedException | SessionExpiredException | ConnectionLossException e) {
if (ssmLog.isDebugEnabled()) {
ssmLog.debug(m_stateMachineRoot + ": Failed to double check registered state machine instances", e);
}
initializationFailed();
return null;
}
// First become a member of the community
new ZKAsyncCreateHandler(m_stateMachineMemberPath, null, CreateMode.PERSISTENT) {
@Override
public void runImpl() {
if (noCommonKeeperExceptions(m_resultCode)) {
addMemberIdIfMissing();
}
}
};
return null;
}
private void addMemberIdIfMissing() {
// This could fail because of an old ephemeral that has not aged out yet but assume this does not happen
new ZKAsyncCreateHandler(ZKUtil.joinZKPath(m_stateMachineMemberPath, m_memberId),
null, CreateMode.EPHEMERAL) {
@Override
public void runImpl() {
if (noCommonKeeperExceptions(m_resultCode)) {
setInitialGroupMembers();
}
}
};
}
private void setInitialGroupMembers() {
new ZKAsyncChildrenHandler(m_stateMachineMemberPath, m_membershipWatcher) {
@Override
public void runImpl() {
if (noCommonKeeperExceptions(m_resultCode)) {
m_groupMembers = ImmutableSet.copyOf(m_resultChildren);
// Then initialize each instance
for (StateMachineInstance instance : m_registeredStateMachines) {
StateMachineInstance.AsyncStateMachineInitializer init = instance.createAsyncInitializer(
m_pendingStateMachineInits, m_groupMembers);
init.startInitialization();
}
}
}
};
}
}
// During normal initialization the ZK tree can be set up for each statemachine asynchronously.
// However, after a unexpected failure such as a fault inside a callback, the whole the SSM needs to
// be locked down and reinitialized in a single blocking action so we will do all the ZK work
// synchronously on the SSM thread and block all other SSM state machines.
// TODO: See if it is possible to do this work asynchronously so other SSMs like DR are not blocked
// by a reset of a given Export SSM.
void syncSSMInitialize() throws KeeperException {
try {
// First become a member of the community
addIfMissing(m_stateMachineMemberPath, CreateMode.PERSISTENT, null);
// This could fail because of an old ephemeral that has not aged out yet but assume this does not happen
addIfMissing(ZKUtil.joinZKPath(m_stateMachineMemberPath, m_memberId), CreateMode.EPHEMERAL, null);
m_groupMembers = ImmutableSet.copyOf(m_zk.getChildren(m_stateMachineMemberPath, m_membershipWatcher));
// Then initialize each instance
for (StateMachineInstance instance : m_registeredStateMachines) {
instance.syncStateMachineInitialize(m_groupMembers);
}
} catch (KeeperException.SessionExpiredException | KeeperException.ConnectionLossException
| InterruptedException e) {
// lost the full connection. some test cases do this...
// means zk shutdown without the elector being shutdown.
// ignore.
m_state.set(State.SHUTDOWN);
}
}
private synchronized ListenableFuture registerStateMachine(StateMachineInstance machine, boolean asyncRequested) throws InterruptedException {
assert(m_registeredStateMachineInstances < m_registeredStateMachines.length);
m_registeredStateMachines[m_registeredStateMachineInstances] = (machine);
if (m_registeredStateMachineInstances == 0) {
m_initComplete = SettableFuture.create();
}
++m_registeredStateMachineInstances;
if (m_registeredStateMachineInstances == m_registeredStateMachines.length) {
if (machine.m_log.isDebugEnabled()) {
// lets make sure all the state machines are using unique paths
Set instanceNames = new HashSet();
for (StateMachineInstance instance : m_registeredStateMachines) {
if (!instanceNames.add(instance.m_statePath)) {
machine.m_log.error(": Multiple state machine instances with the same instanceName [" +
instance.m_statePath + "]");
}
}
}
// Do all the initialization and notifications on the executor to avoid a race with
// the group membership changes
submitCallable(new AsyncSSMInitializer());
return m_initComplete;
}
return asyncRequested ? m_initComplete : null;
}
private class CallbackExceptionHandler implements Callable {
final StateMachineInstance m_directVictim;
CallbackExceptionHandler(StateMachineInstance directVictim) {
m_directVictim = directVictim;
}
@Override
public Void call() throws Exception {
// if the direct victim has already been reset, ignore the stale callback exception handling task
if (!m_directVictim.isInitializationCompleted()) {
assert (m_registeredStateMachineInstances > 0 && m_registeredStateMachineInstances == m_registeredStateMachines.length);
disableInstances.call();
if (m_lastResetTimeInMillis == -1L) {
m_lastResetTimeInMillis = System.currentTimeMillis();
} else {
long currentTimeInMillis = System.currentTimeMillis();
// if a full reset clear threshold duration has passed after last reset, bump the reset limit
if (currentTimeInMillis - m_lastResetTimeInMillis >= RESET_CLEAR_THRESHOLD) {
m_resetLimit = m_resetCounter + m_resetAllowance;
}
m_lastResetTimeInMillis = currentTimeInMillis;
}
++m_resetCounter;
if (m_resetCounter > m_resetLimit) {
return null;
}
m_memberId = m_canonical_memberId + "_v" + m_resetCounter;
try {
for (StateMachineInstance instance : m_registeredStateMachines) {
instance.reset(instance == m_directVictim);
}
} catch (Exception e) {
return null; // if something wrong happened in reset(), give up as if the reset limit is hit
}
m_state.set(State.RUNNING);
syncSSMInitialize();
}
return null;
}
}
/*
* Track state machine membership. If it changes, notify all state machine instances
*/
private void checkForMembershipChanges() throws KeeperException, InterruptedException {
Set children = ImmutableSet.copyOf(m_zk.getChildren(m_stateMachineMemberPath, m_membershipWatcher));
Set removedMembers;
Set addedMembers;
if (m_registeredStateMachineInstances == m_registeredStateMachines.length && !m_groupMembers.equals(children)) {
removedMembers = Sets.difference(m_groupMembers, children);
addedMembers = Sets.difference(children, m_groupMembers);
m_groupMembers = children;
for (StateMachineInstance stateMachine : m_registeredStateMachines) {
stateMachine.membershipChanged(m_groupMembers, addedMembers, removedMembers);
}
}
}
/** Interface used by {@link StateMachineInstance} for work that needs to be performed outside of the member lock */
private interface SmiCallable {
void call() throws KeeperException;
}
}