All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.voltcore.zk.SynchronizedStatesManager Maven / Gradle / Ivy

There is a newer version: 10.1.1
Show newest version
/* This file is part of VoltDB.
 * Copyright (C) 2008-2018 VoltDB Inc.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with VoltDB.  If not, see <http://www.gnu.org/licenses/>.
 */

package org.voltcore.zk;

import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;

import org.apache.zookeeper_voltpatches.CreateMode;
import org.apache.zookeeper_voltpatches.KeeperException;
import org.apache.zookeeper_voltpatches.KeeperException.NoNodeException;
import org.apache.zookeeper_voltpatches.WatchedEvent;
import org.apache.zookeeper_voltpatches.Watcher;
import org.apache.zookeeper_voltpatches.ZooDefs.Ids;
import org.apache.zookeeper_voltpatches.ZooKeeper;
import org.apache.zookeeper_voltpatches.data.Stat;
import org.voltcore.logging.VoltLogger;
import org.voltcore.utils.CoreUtils;

import com.google_voltpatches.common.base.Throwables;
import com.google_voltpatches.common.collect.ImmutableSet;
import com.google_voltpatches.common.collect.Sets;
import com.google_voltpatches.common.util.concurrent.ListenableFuture;
import com.google_voltpatches.common.util.concurrent.ListeningExecutorService;

/*
 * This state machine coexists with other SynchronizedStateManagers by using different branches of the
 * ZooKeeper tree. This class provides a service to the StateMachineInstance class that reports changes
 * to the membership. The ZooKeeper tree hierarchy is (quoted are reserved node names):
 *     |- rootNode
 *     |\
 *     | |- 'MEMBERS'
 *     |  \
 *     |   |- memberId
 *     \
 *      |- instanceName
 *      |\
 *      | |- 'LOCK_CONTENDERS'
 *      |\
 *      | |- 'BARRIER_PARTICIPANTS'
 *       \
 *        |- 'BARRIER_RESULTS'
 *
 * This hierarchy supports multiple topologies so one SynchronizedStatesManager could be used to track
 * cluster membership while another could be used to track partition membership. Each SynchronizedStatesManager
 * instance has a memberId that must be unique to the member community it is participating in. As the
 * SynchronizedStatesManager is a shared service that can provide membership changes of other
 * StateMachineInstances joining or leaving the community, all instances are considered tightly bound to the
 * SynchronizedStatesManager. This means that the SynchronizedStatesManager will not join a community until
 * all instances using that manager have registered with the SynchronizedStatesManager. Thus when a
 * SynchronizedStatesManager informs its instances that a new member has joined the community (the set of
 * nodes under the 'MEMBERS' node), the instance can be assured that its peer instance is also there. This
 * implies that StateMachineInstances can only coexist in the same SynchronizedStatesManager if all instances
 * can be initialized (registered) before any individual instance needs to access the shared state.
 */
public class SynchronizedStatesManager {
    // Checked by the ZooKeeper watchers of each state machine instance before they submit a handler;
    // once it is true, incoming watch events are dropped.
    private final AtomicBoolean m_done = new AtomicBoolean(false);
    // Reserved node name for the membership branch; a state machine instance may not use it as its name.
    private final static String m_memberNode = "MEMBERS";
    // NOTE(review): declared as a raw Set here; the element type is presumably String (member ids) - confirm.
    private Set m_groupMembers = new HashSet();
    private final StateMachineInstance m_registeredStateMachines [];
    private int m_registeredStateMachineInstances = 0;
    // Static, hence shared by every manager in the process; the instance watchers submit their handlers to it.
    private static final ListeningExecutorService m_shared_es = CoreUtils.getListeningExecutorService("SSM Daemon", 1);
    // We assume that we are far enough along that the HostMessenger is up and running. Otherwise add to constructor.
    private final ZooKeeper m_zk;
    private final String m_ssmRootNode;
    private final String m_stateMachineRoot;
    private final String m_stateMachineMemberPath;
    // Not final: StateMachineInstance.reset() rebuilds its paths and id from the current value.
    private String m_memberId;
    private final String m_canonical_memberId;
    private int m_resetCounter;
    private int m_resetLimit;
    private final int m_resetAllowance; // number of resets allowed within a reset clear threshold duration
    private long m_lastResetTimeInMillis = -1L;

    // One day, expressed in milliseconds.
    private static final long RESET_CLEAR_THRESHOLD = TimeUnit.DAYS.toMillis(1);

    // Kind of request carried in the data of the BARRIER_RESULTS node. buildProposal() writes the
    // ordinal of the constant as the first short of a proposal and
    // getExistingAndProposedBuffersFromResultsNode() uses that short to index values(), so the
    // declaration order is part of the stored format: do not reorder or insert constants in the middle.
    private enum REQUEST_TYPE {
        INITIALIZING,
        LAST_CHANGE_OUTCOME_REQUEST,
        STATE_CHANGE_REQUEST,
        CORRELATED_COORDINATED_TASK,
        UNCORRELATED_COORDINATED_TASK
    };

    // NOTE(review): the users of this enum are outside this part of the file; presumably it is the
    // outcome of evaluating the member results of a proposal - confirm against the callers.
    // The name is a misspelling of "CONSENSUS"; it is kept because renaming would touch every user.
    private enum RESULT_CONCENSUS {
        AGREE,
        DISAGREE,
        NO_QUORUM
    }

    /**
     * Forwards to {@code ZKUtil.addIfMissing} using this manager's ZooKeeper client.
     *
     * @param absolutePath full ZooKeeper path of the node
     * @param createMode   create mode handed through to ZKUtil
     * @param data         node data handed through to ZKUtil (callers in this file also pass null)
     * @return the value reported by {@code ZKUtil.addIfMissing}; callers in this file treat
     *         {@code true} as "this call created the node"
     * @throws KeeperException      propagated from ZKUtil
     * @throws InterruptedException propagated from ZKUtil
     */
    protected boolean addIfMissing(String absolutePath, CreateMode createMode, byte[] data)
            throws KeeperException, InterruptedException {
        final boolean outcome = ZKUtil.addIfMissing(m_zk, absolutePath, createMode, data);
        return outcome;
    }

    /**
     * @return the member id currently held by this manager
     */
    protected String getMemberId() {
        return this.m_memberId;
    }

    /*
     * Rules for each StateMachineInstance:
     * 1. All access to the state machine is serialized by a per state machine local lock.
     * 2. State changes may only be initiated by the oldest ephemeral lock node under the
     *    LOCK_CONTENDERS directory. This represents the distributed lock used to
     *    synchronize proposals by the members of the state machine.
     * 3. When a new member joins an operational state machine, it must grab the lock node
     *    (2) before it can determine the current agreed state. However since it has to
     *    participate and benignly generate null results for all proposals, it can glean
     *    the state from the outcome of the next request it fully participates in. Note
     *    that an operation in progress during initialization of the new member is not
     *    used by the new member because the proposal initiator may have accepted the
     *    results and released the distributed lock before the new member can evaluate the
     *    results to determine the current state.
     * 4. A member holding the distributed lock can propose a state change by clearing out
     *    the reported result nodes under the BARRIER_RESULTS node, updating the
     *    BARRIER_RESULTS node with the old and proposed state, and adding an ephemeral
     *    unique node under BARRIER_PARTICIPANTS to trigger the state machine members.
     *    Each proposal in the BARRIER_RESULTS node has a version number that is used
     *    by other members to distinguish a new proposal from a proposal it has already
     *    processed.
     * 5. When a member detects a transition in the BARRIER_PARTICIPANTS node and a new
     *    version of the BARRIER_RESULTS node, it adds an ephemeral node for itself under
     *    BARRIER_PARTICIPANTS and copies the old and proposed state from the BARRIER_RESULTS
     *    node.
     * 6. Each member evaluates the proposed state and inserts a SUCCESS or FAILURE indication
     *    in a persistent unique node under the BARRIER_RESULTS path.
     * 7. A member may also request that an action be performed by all fully participating
     *    members (including itself) by assigning the existing state and a task proposal
     *    in the BARRIER_RESULTS node with a new version id and then adding a node under the
     *    BARRIER_PARTICIPANTS directory.
     * 8. All members detecting the task request perform the task specified in the task
     *    proposal and insert a result node under the BARRIER_RESULTS directory.
     * 9. When the set of BARRIER_RESULTS nodes is a superset of all members currently in the
     *    state machine, all members are considered to have reported in and each member can
     *    independently determine the outcome. If the proposal was a state change request,
     *    and all BARRIER_RESULTS nodes report SUCCESS each member applies the new state
     *    and then removes itself from BARRIER_PARTICIPANTS. If the proposal was a task
     *    request, then members remove themselves from BARRIER_PARTICIPANTS to acknowledge
     *    receipt of all task results.
     *10. Membership in the state machine can change as members join or fail. Membership is
     *    monitored by the SynchronizedStatesManager and changes are reported to each state
     *    machine instance. While members have joined but have not determined the current
     *    state, they report a NULL result for each state change or task proposal made by
     *    other members.
     *11. If any member reports a FAILURE node under the BARRIER_RESULTS directory after a
     *    state change request proposal, the new state is not applied by any member.
     *12. Each member that has processed BARRIER_RESULTS removes its ephemeral node from the
     *    BARRIER_PARTICIPANTS directory node.
     *13. The initiator of a state change or task releases the lock node when 9 is satisfied.
     *14. A member is not told they have the lock node until the last member releases the
     *    lock node AND all ephemeral nodes under the BARRIER_PARTICIPANTS node are gone,
     *    indicating that all members waiting on the outcome of the last state change or
     *    task results have processed the previous outcome
     */

    public abstract class StateMachineInstance {
        // Identifier used as a prefix in log messages: "SMI <ssmRoot>/<instanceName>/<memberId>".
        protected String m_stateMachineId;
        private final String m_stateMachineName;
        // NOTE(review): raw Set; the element type is presumably String (member ids) - confirm.
        private Set m_knownMembers;
        private boolean m_membershipChangePending = false;
        // ZooKeeper paths. The constructor derives them as <stateMachineRoot>/<instanceName> and, below
        // that, LOCK_CONTENDERS, BARRIER_RESULTS and BARRIER_PARTICIPANTS. The two "my" paths append
        // the member id and are rebuilt by reset().
        private final String m_statePath;
        private final String m_lockPath;
        private final String m_barrierResultsPath;
        private String m_myResultPath;
        private final String m_barrierParticipantsPath;
        private String m_myParticipantPath;
        private boolean m_stateChangeInitiator = false;
        private String m_ourDistributedLockName = null;
        private String m_lockWaitingOn = null;
        private boolean m_holdingDistributedLock = false;
        protected final VoltLogger m_log;
        // While this is non-null the instance has not adopted a synchronized state yet and answers
        // proposals with an empty result (see checkForBarrierParticipantsChange).
        private ByteBuffer m_requestedInitialState = ByteBuffer.allocate(0);
        private ByteBuffer m_synchronizedState = null;
        private ByteBuffer m_pendingProposal = null;
        private REQUEST_TYPE m_currentRequestType = REQUEST_TYPE.INITIALIZING;
        // Number of children last seen under BARRIER_PARTICIPANTS.
        private int m_currentParticipants = 0;
        private Set m_memberResults = null;
        // Version of the BARRIER_RESULTS node that was processed last; a different version marks a new proposal.
        private int m_lastProposalVersion = 0;

        private boolean m_initializationCompleted = false;

        /**
         * Reports whether this instance has completed initialization.
         *
         * The flag is read while the local state lock is held. The earlier form released the lock
         * first and read the flag afterwards, so the read itself was unguarded.
         *
         * @return the value of m_initializationCompleted observed under the local lock
         */
        private boolean isInitializationCompleted() {
            lockLocalState();
            try {
                return m_initializationCompleted;
            } finally {
                unlockLocalState();
            }
        }

        /**
         * @return the reset counter kept by the enclosing SynchronizedStatesManager
         */
        public int getResetCounter() {
            return SynchronizedStatesManager.this.m_resetCounter;
        }

        private final Lock m_mutex = new ReentrantLock();
        private int m_mutexLockedCnt = 0;
        // Per-thread debug flag that records whether the current thread went through lockLocalState().
        // It is written only by debugVerifyLockAcquire()/debugVerifyLockRelease(), which are invoked from
        // assert statements, so it is meaningful only when assertions are enabled.
        // The type argument is required: debugIsLocalStateLocked() returns get() as a boolean, which a
        // raw ThreadLocal (get() returning Object) does not allow.
        private final ThreadLocal<Boolean> m_mutexLocked = new ThreadLocal<Boolean>() {
            @Override
            protected Boolean initialValue()
            {
                // Shared constant instead of the deprecated Boolean constructor.
                return Boolean.FALSE;
            }
        };

        /**
         * Debug bookkeeping for lock acquisition; called from an assert in lockLocalState().
         * Marks the current thread as holding the local lock and bumps the hold counter.
         *
         * @return true when the counter was 0 before this call, i.e. the lock was not already
         *         counted as held
         */
        private boolean debugVerifyLockAcquire() {
            // Boolean.TRUE replaces the deprecated Boolean(boolean) constructor.
            m_mutexLocked.set(Boolean.TRUE);
            return m_mutexLockedCnt++ == 0;
        }

        /**
         * Debug bookkeeping for lock release; called from an assert in unlockLocalState().
         * Clears the current thread's "holding" flag and decrements the hold counter.
         *
         * @return true when the counter was exactly 1 before this call
         */
        private boolean debugVerifyLockRelease() {
            // Boolean.FALSE replaces the deprecated Boolean(boolean) constructor.
            m_mutexLocked.set(Boolean.FALSE);
            return (m_mutexLockedCnt-- == 1);
        }

        /**
         * Debug helper used in assertions.
         *
         * @return true when the current thread is flagged as holding the local state lock; the flag
         *         is only maintained while assertions are enabled
         */
        protected boolean debugIsLocalStateLocked() {
            return Boolean.TRUE.equals(m_mutexLocked.get());
        }

        /**
         * Acquires the local mutex of this state machine instance.
         * With assertions enabled it also records the acquisition through debugVerifyLockAcquire()
         * and fails the assertion when the mutex was already counted as held.
         */
        private void lockLocalState() {
            m_mutex.lock();
            // Bookkeeping happens inside the assert on purpose: it disappears when assertions are off.
            assert(debugVerifyLockAcquire());
        }

        // Same as lockLocalState(). Kept as its own method so that getStackTrace resolves to a
        // distinct frame for the lock runner when debugging locking; do not inline.
        private void lockLocalStateForLockRunner() {
            lockLocalState();
        }

        // Same as lockLocalState(). Kept as its own method so that getStackTrace resolves to a
        // distinct frame for the results runner when debugging locking; do not inline.
        private void lockLocalStateForResultsRunner() {
            lockLocalState();
        }

        // Same as lockLocalState(). Kept as its own method so that getStackTrace resolves to a
        // distinct frame for the participant runner when debugging locking; do not inline.
        private void lockLocalStateForParticipantRunner() {
            lockLocalState();
        }

        /**
         * Releases the local mutex of this state machine instance.
         * With assertions enabled it first updates the debug bookkeeping through
         * debugVerifyLockRelease() and fails the assertion when the hold counter was not exactly 1.
         */
        private void unlockLocalState() {
            // The bookkeeping is updated while the mutex is still held.
            assert(debugVerifyLockRelease());
            m_mutex.unlock();
        }

        /**
         * ZooKeeper watcher that reacts to any event by submitting HandlerForDistributedLockEvent
         * to the shared executor, unless the manager has been marked done.
         */
        private class LockWatcher implements Watcher {

            @Override
            public void process(final WatchedEvent event) {
                if (m_done.get()) {
                    return;
                }
                try {
                    m_shared_es.submit(HandlerForDistributedLockEvent);
                } catch (RejectedExecutionException ignored) {
                    // Deliberately ignored: the event is dropped when the executor refuses the task.
                }
            }
        }

        private final LockWatcher m_lockWatcher = new LockWatcher();

        /**
         * ZooKeeper watcher set on the children of the BARRIER_PARTICIPANTS node. Any event is
         * turned into a HandlerForBarrierParticipantsEvent submission on the shared executor,
         * unless the manager has been marked done.
         */
        private class BarrierParticipantsWatcher implements Watcher {

            @Override
            public void process(final WatchedEvent event) {
                if (m_done.get()) {
                    return;
                }
                try {
                    m_shared_es.submit(HandlerForBarrierParticipantsEvent);
                } catch (RejectedExecutionException ignored) {
                    // Deliberately ignored: the event is dropped when the executor refuses the task.
                }
            }
        }

        /**
         * Holder for a decoded proposal: the request type, the state the proposer considered
         * current, and the proposed state (or task payload). The references are final; the
         * buffers themselves are not copied.
         */
        class StateChangeRequest {
            public final REQUEST_TYPE m_requestType;
            public final ByteBuffer m_previousState;
            public final ByteBuffer m_proposal;

            private StateChangeRequest(REQUEST_TYPE requestType, ByteBuffer previousState, ByteBuffer proposedState) {
                this.m_requestType = requestType;
                this.m_previousState = previousState;
                this.m_proposal = proposedState;
            }
        }

        private final BarrierParticipantsWatcher m_barrierParticipantsWatcher = new BarrierParticipantsWatcher();

        /**
         * ZooKeeper watcher that reacts to any event by submitting HandlerForBarrierResultsEvent
         * to the shared executor, unless the manager has been marked done.
         */
        private class BarrierResultsWatcher implements Watcher {

            @Override
            public void process(final WatchedEvent event) {
                if (m_done.get()) {
                    return;
                }
                try {
                    m_shared_es.submit(HandlerForBarrierResultsEvent);
                } catch (RejectedExecutionException ignored) {
                    // Deliberately ignored: the event is dropped when the executor refuses the task.
                }
            }
        }

        private final BarrierResultsWatcher m_barrierResultsWatcher = new BarrierResultsWatcher();

        /**
         * Constructor used only for mocking StateMachineInstance. Every path and name is set to a
         * fixed placeholder string and no logger is installed.
         */
        public StateMachineInstance()
        {
            // identity
            m_stateMachineName = "MockStateMachineName";
            m_stateMachineId = "MockStateMachineId";
            m_log = null;

            // placeholder ZooKeeper paths
            m_statePath = "MockInstanceStatePath";
            m_lockPath = "MockLockPath";
            m_barrierResultsPath = "MockBarrierResultsPath";
            m_barrierParticipantsPath = "MockParticipantsPath";
            m_myResultPath = "MockMyResultPath";
            m_myParticipantPath = "MockMyParticipantPath";
        }

        public StateMachineInstance(String instanceName, VoltLogger logger) throws RuntimeException {
            if (instanceName.equals(m_memberNode)) {
                throw new RuntimeException("State machine name may not be named " + m_memberNode);
            }
            assert(!instanceName.equals(m_memberNode));
            m_stateMachineName = instanceName;
            m_statePath = ZKUtil.joinZKPath(m_stateMachineRoot, instanceName);
            m_lockPath = ZKUtil.joinZKPath(m_statePath, "LOCK_CONTENDERS");
            m_barrierResultsPath = ZKUtil.joinZKPath(m_statePath, "BARRIER_RESULTS");
            m_myResultPath = ZKUtil.joinZKPath(m_barrierResultsPath, m_memberId);
            m_barrierParticipantsPath = ZKUtil.joinZKPath(m_statePath, "BARRIER_PARTICIPANTS");
            m_myParticipantPath = ZKUtil.joinZKPath(m_barrierParticipantsPath, m_memberId);
            m_log = logger;
            m_stateMachineId = "SMI " + m_ssmRootNode + "/" + m_stateMachineName + "/" + m_memberId;
            if (m_log.isDebugEnabled()) {
                m_log.debug(m_stateMachineId + " created.");
            }
        }


        /**
         * Records the initial state this instance would like to start with and registers the
         * instance with the enclosing manager.
         *
         * @param requestedInitialState requested initial state; asserted to be non-null and to have
         *                              fewer than Short.MAX_VALUE bytes remaining
         * @throws InterruptedException declared for the registration call
         */
        public void registerStateMachineWithManager(ByteBuffer requestedInitialState) throws InterruptedException {
            assert requestedInitialState != null;
            assert requestedInitialState.remaining() < Short.MAX_VALUE;

            this.m_requestedInitialState = requestedInitialState;
            registerStateMachine(this);
        }

        /**
         * Notify the derived class that the state machine instance is being reset,
         * the derived class should reset its own specific states and provide a reset state.
         * reset() adopts the returned buffer as the requested initial state only when
         * isDirectVictim is true; otherwise the returned value is not used by reset().
         * @param isDirectVictim true if the reset is caused by callback exception in this instance
         * @return a ByteBuffer encapsulating the reset state provided by the derived class
         */
        protected abstract ByteBuffer notifyOfStateMachineReset(boolean isDirectVictim);

        /**
         * Creates the ZooKeeper nodes of this state machine if they are missing and brings the
         * instance into the community.
         *
         * Flow:
         * - The state, lock and participants nodes are created as persistent nodes when absent.
         * - The local lock is taken and the distributed lock is requested.
         * - The BARRIER_RESULTS node is created (when absent) with an INITIALIZING proposal built
         *   from the requested initial state.
         * - If this member owns the distributed lock and is the one that creates STATE_INITIALIZED,
         *   it is the first member: it adopts its requested state, publishes an accepting result and
         *   reports the state to the derived class through setInitialState().
         * - Otherwise it either initializes from the active community (lock owned) or publishes an
         *   empty result and records the current proposal version (lock not owned).
         *
         * The local lock taken here is not released in this method; it is expected to be released by
         * the callees (checkForBarrierParticipantsChange / initializeFromActiveCommunity), which is
         * what the assert at the end of the non-first-member path checks.
         *
         * @param knownMembers membership to adopt unless a membership change is pending, in which
         *                     case the latest membership is fetched instead
         * @throws KeeperException      on ZooKeeper failures from the calls made here
         * @throws InterruptedException when a ZooKeeper call is interrupted
         */
        private void initializeStateMachine(Set knownMembers) throws KeeperException, InterruptedException {
            addIfMissing(m_statePath, CreateMode.PERSISTENT, null);
            addIfMissing(m_lockPath, CreateMode.PERSISTENT, null);
            addIfMissing(m_barrierParticipantsPath, CreateMode.PERSISTENT, null);
            lockLocalState();
            boolean ownDistributedLock = requestDistributedLock();
            ByteBuffer startStates = buildProposal(REQUEST_TYPE.INITIALIZING,
                    m_requestedInitialState.asReadOnlyBuffer(), m_requestedInitialState.asReadOnlyBuffer());
            addIfMissing(m_barrierResultsPath, CreateMode.PERSISTENT, startStates.array());
            boolean stateMachineNodeCreated = false;
            if (ownDistributedLock) {
                // Only the very first initializer of the state machine will both get the lock and successfully
                // allocate "STATE_INITIALIZED". This guarantees that only one node will assign the initial state.
                stateMachineNodeCreated = addIfMissing(ZKUtil.joinZKPath(m_statePath, "STATE_INITIALIZED"),
                        CreateMode.PERSISTENT, null);
            }

            if (m_membershipChangePending) {
                getLatestMembership();
            }
            else {
                m_knownMembers = knownMembers;
            }
            // We need to always monitor participants so that if we are initialized we can add ourselves and insert
            // our results and if we are not initialized, we can always auto-insert a null result.
            if (stateMachineNodeCreated) {
                assert(ownDistributedLock);
                m_synchronizedState = m_requestedInitialState;
                m_requestedInitialState = null;
                m_lastProposalVersion = getProposalVersion();
                ByteBuffer readOnlyResult = m_synchronizedState.asReadOnlyBuffer();
                // Add an acceptable result so the next initializing member recognizes an immediate quorum.
                byte result[] = new byte[1];
                result[0] = (byte)(1);
                addResultEntry(result);
                m_lockWaitingOn = "bogus"; // Avoids call to notifyDistributedLockWaiter
                m_log.info(m_stateMachineId + ": Initialized (first member) with State " +
                        stateToString(m_synchronizedState.asReadOnlyBuffer()));
                m_initializationCompleted = true;
                cancelDistributedLock();
                checkForBarrierParticipantsChange();
                // Notify the derived object that we have a stable state
                try {
                    setInitialState(readOnlyResult);
                } catch (Exception e) {
                    if (m_log.isDebugEnabled()) {
                        m_log.debug("Error in StateMachineInstance callbacks.", e);
                    }
                    m_initializationCompleted = false;
                    m_shared_es.submit(new CallbackExceptionHandler(this));
                }
            }
            else {
                // To get a stable result set, we need to get the lock for this state machine. If someone else has the
                // lock they can clear the stale results out from under us.
                if (ownDistributedLock) {
                    initializeFromActiveCommunity();
                }
                else {
                    // This means we will ignore the current update if there is one in progress.
                    // Note that if we are not the next waiter for the lock, we will blindly
                    // accept the next proposal and use the outcome to set our initial state.
                    addResultEntry(null);
                    Stat nodeStat = new Stat();
                    boolean resultNodeFound = false;
                    // The "do" is essential: without it the block below runs once and the trailing
                    // "while (!resultNodeFound);" is an empty loop that never ends after a NoNodeException.
                    do {
                        try {
                            m_zk.getData(m_barrierResultsPath, false, nodeStat);
                            resultNodeFound = true;
                        }
                        catch (NoNodeException noNode) {
                            // This is a race between another node who got the Distributed lock but
                            // Has not set up the result path yet. Keep Retrying.
                        }
                    } while (!resultNodeFound);
                    m_lastProposalVersion = nodeStat.getVersion();
                    checkForBarrierParticipantsChange();
                }
                assert(!debugIsLocalStateLocked());
            }
        }

        /**
         * This state machine and all other state machines under this manager are being removed.
         *
         * Under the local lock, deletes this member's participant node and, if one was requested,
         * its distributed lock node, then marks the instance as not initialized. Both deletions are
         * best effort: a KeeperException from either one is ignored so that the other is still
         * attempted.
         *
         * The local lock is released in a finally block, so an InterruptedException from a delete
         * no longer leaves it held.
         *
         * @throws InterruptedException when a ZooKeeper delete is interrupted
         */
        private void disableMembership() throws InterruptedException {
            lockLocalState();
            try {
                // put in two separate try-catch blocks so that both actions are attempted
                try {
                    m_zk.delete(m_myParticipantPath, -1);
                }
                catch (KeeperException ignored) {
                    // Best effort: the failure is deliberately ignored.
                }
                try {
                    if (m_ourDistributedLockName != null) {
                        m_zk.delete(m_ourDistributedLockName, -1);
                    }
                }
                catch (KeeperException ignored) {
                    // Best effort: the failure is deliberately ignored.
                }
                m_initializationCompleted = false;
            }
            finally {
                unlockLocalState();
            }
        }

        /**
         * Returns this instance to its pre-initialization condition.
         *
         * The derived class is notified first. A direct victim restarts from the state the derived
         * class returned; any other instance keeps its requested initial state, or falls back to
         * its synchronized state when no requested state is left. All proposal, lock and debug
         * bookkeeping is then cleared, and the member specific paths and the id are rebuilt from
         * the manager's current member id.
         *
         * @param isDirectVictim true if the reset was caused by a callback exception in this instance
         */
        private void reset(boolean isDirectVictim) {
            final ByteBuffer stateFromDerivedClass = notifyOfStateMachineReset(isDirectVictim);
            if (isDirectVictim) {
                m_requestedInitialState = stateFromDerivedClass;
            }
            else if (m_requestedInitialState == null) {
                // Nothing requested any more: restart from the state that was last agreed on.
                assert(m_synchronizedState != null);
                m_requestedInitialState = m_synchronizedState;
            }
            m_synchronizedState = null;

            // proposal bookkeeping
            m_currentRequestType = REQUEST_TYPE.INITIALIZING;
            m_pendingProposal = null;
            m_memberResults = null;
            m_lastProposalVersion = 0;
            m_stateChangeInitiator = false;
            m_membershipChangePending = false;

            // distributed lock bookkeeping
            m_holdingDistributedLock = false;
            m_ourDistributedLockName = null;
            m_lockWaitingOn = null;

            // debug counter of the local lock
            m_mutexLockedCnt = 0;

            m_myResultPath = ZKUtil.joinZKPath(m_barrierResultsPath, m_memberId);
            m_myParticipantPath = ZKUtil.joinZKPath(m_barrierParticipantsPath, m_memberId);
            m_stateMachineId = "SMI " + m_ssmRootNode + "/" + m_stateMachineName + "/" + m_memberId;
        }

        /**
         * Reads the version of the BARRIER_RESULTS node.
         *
         * Session expiry, connection loss and interruption are only logged at debug level; any
         * other exception is handed to crashLocalVoltDB.
         *
         * @return the node version, or -1 when it could not be read
         */
        private int getProposalVersion() {
            try {
                final Stat resultsStat = new Stat();
                m_zk.getData(m_barrierResultsPath, null, resultsStat);
                return resultsStat.getVersion();
            } catch (KeeperException.SessionExpiredException e) {
                if (m_log.isDebugEnabled()) {
                    m_log.debug(m_stateMachineId + ": Received SessionExpiredException in getProposalVersion");
                }
            } catch (KeeperException.ConnectionLossException e) {
                if (m_log.isDebugEnabled()) {
                    m_log.debug(m_stateMachineId + ": Received ConnectionLossException in getProposalVersion");
                }
            } catch (InterruptedException e) {
                if (m_log.isDebugEnabled()) {
                    m_log.debug(m_stateMachineId + ": Received InterruptedException in getProposalVersion");
                }
            } catch (Exception e) {
                org.voltdb.VoltDB.crashLocalVoltDB(
                        "Unexpected failure in StateMachine.", true, e);
            }
            // Reached only when the read failed.
            return -1;
        }

        /**
         * Encodes a proposal for the BARRIER_RESULTS node.
         *
         * Layout: request type ordinal (short), length of the existing state (short), the existing
         * state bytes, then the proposed state bytes. Both argument buffers are consumed from their
         * current position to their limit.
         *
         * @param requestType   kind of request; stored as its ordinal
         * @param existingState state the proposer considers current
         * @param proposedState proposed state or task payload
         * @return a buffer positioned at 0 whose limit is the encoded length
         */
        private ByteBuffer buildProposal(REQUEST_TYPE requestType,  ByteBuffer existingState, ByteBuffer proposedState) {
            final int existingLength = existingState.remaining();
            // 4 = two shorts of header (request type, existing state length)
            final ByteBuffer encoded = ByteBuffer.allocate(proposedState.remaining() + existingLength + 4);
            encoded.putShort((short) requestType.ordinal())
                   .putShort((short) existingLength)
                   .put(existingState)
                   .put(proposedState);
            encoded.flip();
            return encoded;
        }

        /**
         * Decodes the data of the BARRIER_RESULTS node as written by buildProposal().
         *
         * The previous state is returned as a window (position 4 up to the end of the old state)
         * on a buffer wrapping the given array; the proposal is a slice of the remaining bytes.
         * Neither is a copy of the array.
         *
         * @param oldAndNewState raw node data; asserted to be non-null
         * @return the request type together with the previous and proposed state buffers
         */
        private StateChangeRequest getExistingAndProposedBuffersFromResultsNode(byte oldAndNewState[]) {
            // Either the barrier or the state node should have the current state
            assert(oldAndNewState != null);
            final ByteBuffer payload = ByteBuffer.wrap(oldAndNewState);
            final REQUEST_TYPE kind = REQUEST_TYPE.values()[payload.getShort()];
            // The old state length is stored as a (signed) short right after the request type.
            final int previousEnd = payload.getShort() + 4;

            // Everything after the old state is the proposal.
            payload.position(previousEnd);
            final ByteBuffer proposal = payload.slice();

            // Narrow the wrapping buffer to just the old state.
            payload.limit(previousEnd);
            payload.position(4);
            return new StateChangeRequest(kind, payload, proposal);
        }

        private void checkForBarrierParticipantsChange() {
            assert(debugIsLocalStateLocked());
            try {
                Set children = ImmutableSet.copyOf(m_zk.getChildren(m_barrierParticipantsPath, m_barrierParticipantsWatcher));
                Stat nodeStat = new Stat();
                // inspect the m_barrierResultsPath and get the new and old states and version
                // At some point this can be optimized to not examine the proposal all the time
                byte statePair[] = m_zk.getData(m_barrierResultsPath, false, nodeStat);
                int proposalVersion = nodeStat.getVersion();
                if (proposalVersion != m_lastProposalVersion) {
                    m_lastProposalVersion = proposalVersion;
                    m_currentParticipants = children.size();
                    if (!m_stateChangeInitiator) {
                        assert(m_pendingProposal == null);

                        // This is an indication that a new state change is being requested
                        StateChangeRequest existingAndProposedStates = getExistingAndProposedBuffersFromResultsNode(statePair);
                        m_currentRequestType = existingAndProposedStates.m_requestType;
                        if (m_requestedInitialState != null) {
                            // Since we have not initialized yet, we acknowledge this proposal with an empty result.
                            addResultEntry(null);
                            unlockLocalState();
                        }
                        else {
                            // Don't add ourselves as a participant because we don't care about the results
                            REQUEST_TYPE type = m_currentRequestType;
                            if (type == REQUEST_TYPE.LAST_CHANGE_OUTCOME_REQUEST) {
                                // The request is from a new member who found the results in an ambiguous state.
                                byte result[] = new byte[1];
                                if (existingAndProposedStates.m_proposal.equals(m_synchronizedState)) {
                                    result[0] = (byte)1;
                                    addResultEntry(result);
                                }
                                else {
                                    assert(existingAndProposedStates.m_previousState.equals(m_synchronizedState));
                                    result[0] = (byte)0;
                                    addResultEntry(result);
                                }
                                unlockLocalState();
                            }
                            else {
                                // We track the number of people waiting on the results so we know when the result is stale and
                                // the next lock holder can initiate a new state proposal.
                                m_zk.create(m_myParticipantPath, null, Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL);

                                m_pendingProposal = existingAndProposedStates.m_proposal;
                                ByteBuffer proposedState = m_pendingProposal.asReadOnlyBuffer();
                                assert(existingAndProposedStates.m_previousState.equals(m_synchronizedState));
                                if (m_log.isDebugEnabled()) {
                                    if (type == REQUEST_TYPE.STATE_CHANGE_REQUEST) {
                                        m_log.debug(m_stateMachineId + ": Received new State proposal " +
                                                stateToString(proposedState.asReadOnlyBuffer()));
                                    }
                                    else {
                                        m_log.debug(m_stateMachineId + ": Received new Task request " +
                                                taskToString(proposedState.asReadOnlyBuffer()));
                                    }
                                }
                                unlockLocalState();
                                if (type == REQUEST_TYPE.STATE_CHANGE_REQUEST) {
                                    try {
                                        stateChangeProposed(proposedState);
                                    } catch (Exception e) {
                                        if (m_log.isDebugEnabled()) {
                                            m_log.debug("Error in StateMachineInstance callbacks.", e);
                                        }
                                        m_initializationCompleted = false;
                                        m_shared_es.submit(new CallbackExceptionHandler(this));
                                    }
                                }
                                else {
                                    try {
                                        taskRequested(proposedState);
                                    } catch (Exception e) {
                                        if (m_log.isDebugEnabled()) {
                                            m_log.debug("Error in StateMachineInstance callbacks.", e);
                                        }
                                        m_initializationCompleted = false;
                                        m_shared_es.submit(new CallbackExceptionHandler(this));
                                    }
                                }
                            }
                        }
                    }
                    else {
                        // We initiated a proposal and if it is a TASK we need to call our derived object to kick it off
                        assert(m_pendingProposal != null);
                        if (m_currentRequestType == REQUEST_TYPE.CORRELATED_COORDINATED_TASK ||
                                m_currentRequestType == REQUEST_TYPE.UNCORRELATED_COORDINATED_TASK) {
                            // This is a Task request made by us so notify the derived state machine to perform the task
                            // provide a result.
                            ByteBuffer taskRequest = m_pendingProposal.asReadOnlyBuffer();
                            unlockLocalState();
                            try {
                                taskRequested(taskRequest);
                            } catch (Exception e) {
                                if (m_log.isDebugEnabled()) {
                                    m_log.debug("Error in StateMachineInstance callbacks.", e);
                                }
                                m_initializationCompleted = false;
                                m_shared_es.submit(new CallbackExceptionHandler(this));
                            }
                        }
                        else {
                            unlockLocalState();
                        }
                    }
                }
                else {
                    m_currentParticipants = children.size();
                    if (m_ourDistributedLockName != null && m_ourDistributedLockName == m_lockWaitingOn && children.size() == 0) {
                        // We can finally notify the lock waiter because everyone is finished evaluating the previous state proposal
                        notifyDistributedLockWaiter();
                    }
                    else {
                        unlockLocalState();
                    }
                }
            } catch (KeeperException.SessionExpiredException e) {
                // lost the full connection. some test cases do this...
                // means zk shutdown without the elector being shutdown.
                // ignore.
                if (m_log.isDebugEnabled()) {
                    m_log.debug(m_stateMachineId + ": Received SessionExpiredException in checkForBarrierParticipantsChange");
                }
                unlockLocalState();
            } catch (KeeperException.ConnectionLossException e) {
                // lost the full connection. some test cases do this...
                // means shutdown without the elector being
                // shutdown; ignore.
                if (m_log.isDebugEnabled()) {
                    m_log.debug(m_stateMachineId + ": Received ConnectionLossException in checkForBarrierParticipantsChange");
                }
                unlockLocalState();
            } catch (InterruptedException e) {
                if (m_log.isDebugEnabled()) {
                    m_log.debug(m_stateMachineId + ": Received InterruptedException in checkForBarrierParticipantsChange");
                }
                unlockLocalState();
            } catch (Exception e) {
                org.voltdb.VoltDB.crashLocalVoltDB(
                        "Unexpected failure in StateMachine.", true, e);
            }
            assert(!debugIsLocalStateLocked());
        }

        /*
         * Re-enters the participant monitoring path: takes the local state lock and then runs
         * checkForBarrierParticipantsChange(), which releases the lock before it returns (it ends by
         * asserting that the local state is no longer locked). Callers must therefore NOT hold the
         * local state lock when invoking this method.
         */
        private void monitorParticipantChanges() {
            // always start checking for participation changes after the result notifications
            // or initialization notifications to ensure these notifications happen before lock
            // ownership notifications.
            lockLocalState();
            checkForBarrierParticipantsChange();
        }

        /*
         * Reads the result node of every listed member and reduces the answers to a single verdict.
         *
         * A payload whose first byte is 0 is a rejection and ends the scan immediately with DISAGREE.
         * Any other non-empty payload counts as an approval. A member with no result node, a null
         * payload or a zero-length payload is treated as not having supplied an opinion.
         *
         * @param memberList ids of the members whose nodes under m_barrierResultsPath are inspected
         * @return DISAGREE if any member rejected, AGREE if nobody rejected and at least one member
         *         approved, NO_QUORUM if no member supplied an opinion
         * @throws Exception any ZooKeeper failure other than a missing result node
         */
        private RESULT_CONCENSUS resultsAgreeOnSuccess(Set<String> memberList) throws Exception {
            boolean agree = false;
            for (String memberId : memberList) {
                try {
                    byte[] result = m_zk.getData(ZKUtil.joinZKPath(m_barrierResultsPath, memberId), false, null);
                    // A zero-length payload has no verdict byte to inspect, so handle it like a null result
                    // instead of failing with an ArrayIndexOutOfBoundsException.
                    if (result != null && result.length > 0) {
                        if (result[0] == 0) {
                            return RESULT_CONCENSUS.DISAGREE;
                        }
                        agree = true;
                    }
                }
                catch (NoNodeException ke) {
                    // This can happen when a new member joins and other members detect the new member before
                    // its initialization code is called and a null result is supplied. Treat this as a null result.
                }
            }
            if (agree) {
                return RESULT_CONCENSUS.AGREE;
            }
            return RESULT_CONCENSUS.NO_QUORUM;
        }

        /*
         * Gathers the task results supplied by the listed members without recording which member
         * produced which result, then deletes this instance's participant node.
         *
         * Members whose result node holds null data contribute nothing to the returned list.
         * A session expiration, connection loss or interrupt discards everything gathered so far and
         * yields an empty list; any other exception is escalated through crashLocalVoltDB.
         *
         * @param taskRequest the task the results belong to; only used to render debug log output
         * @param memberList  ids of the members whose nodes under m_barrierResultsPath are read
         * @return the non-null results wrapped as ByteBuffers, in the iteration order of memberList
         */
        private ArrayList<ByteBuffer> getUncorrelatedResults(ByteBuffer taskRequest, Set<String> memberList) {
            // Treat ZooKeeper failures as empty result
            ArrayList<ByteBuffer> results = new ArrayList<ByteBuffer>();
            try {
                for (String memberId : memberList) {
                    byte[] result = m_zk.getData(ZKUtil.joinZKPath(m_barrierResultsPath, memberId), false, null);
                    if (result != null) {
                        ByteBuffer bb = ByteBuffer.wrap(result);
                        results.add(bb);
                        if (m_log.isDebugEnabled()) {
                            m_log.debug(m_stateMachineId + ":    " + memberId + " reports Result " +
                                    taskResultToString(taskRequest.asReadOnlyBuffer(), bb.asReadOnlyBuffer()));
                        }
                    }
                    else {
                        if (m_log.isDebugEnabled()) {
                            m_log.debug(m_stateMachineId + ":    " + memberId + " did not supply a Task Result");
                        }
                    }
                }
                // Remove ourselves from the participants list to unblock the next distributed lock waiter
                m_zk.delete(m_myParticipantPath, -1);
            } catch (KeeperException.SessionExpiredException e) {
                // Partial results are not handed out; start over with an empty list
                results = new ArrayList<ByteBuffer>();
                if (m_log.isDebugEnabled()) {
                    m_log.debug(m_stateMachineId + ": Received SessionExpiredException in getUncorrelatedResults");
                }
            } catch (KeeperException.ConnectionLossException e) {
                results = new ArrayList<ByteBuffer>();
                if (m_log.isDebugEnabled()) {
                    m_log.debug(m_stateMachineId + ": Received ConnectionLossException in getUncorrelatedResults");
                }
            } catch (InterruptedException e) {
                results = new ArrayList<ByteBuffer>();
                if (m_log.isDebugEnabled()) {
                    m_log.debug(m_stateMachineId + ": Received InterruptedException in getUncorrelatedResults");
                }
            } catch (Exception e) {
                org.voltdb.VoltDB.crashLocalVoltDB(
                        "Unexpected failure in StateMachine.", true, e);
            }
            return results;
        }

        /*
         * Gathers the task results supplied by the listed members, keyed by the id of the member
         * that produced each result, then deletes this instance's participant node.
         *
         * Members whose result node holds null data have no entry in the returned map.
         * A session expiration, connection loss or interrupt discards everything gathered so far and
         * yields an empty map; any other exception is escalated through crashLocalVoltDB.
         *
         * @param taskRequest the task the results belong to; only used to render debug log output
         * @param memberList  ids of the members whose nodes under m_barrierResultsPath are read
         * @return map from member id to that member's non-null result wrapped as a ByteBuffer
         */
        private Map<String, ByteBuffer> getCorrelatedResults(ByteBuffer taskRequest, Set<String> memberList) {
            // Treat ZooKeeper failures as empty result
            Map<String, ByteBuffer> results = new HashMap<String, ByteBuffer>();
            try {
                for (String memberId : memberList) {
                    byte[] result = m_zk.getData(ZKUtil.joinZKPath(m_barrierResultsPath, memberId), false, null);
                    if (result != null) {
                        ByteBuffer bb = ByteBuffer.wrap(result);
                        results.put(memberId, bb);
                        if (m_log.isDebugEnabled()) {
                            m_log.debug(m_stateMachineId + ":    " + memberId + " reports Result " +
                                    taskResultToString(taskRequest.asReadOnlyBuffer(), bb.asReadOnlyBuffer()));
                        }
                    }
                    else {
                        if (m_log.isDebugEnabled()) {
                            m_log.debug(m_stateMachineId + ":    " + memberId + " did not supply a Task Result");
                        }
                    }
                }
                // Remove ourselves from the participants list to unblock the next distributed lock waiter
                m_zk.delete(m_myParticipantPath, -1);
            } catch (KeeperException.SessionExpiredException e) {
                // Partial results are not handed out; start over with an empty map
                results = new HashMap<String, ByteBuffer>();
                if (m_log.isDebugEnabled()) {
                    m_log.debug(m_stateMachineId + ": Received SessionExpiredException in getCorrelatedResults");
                }
            } catch (KeeperException.ConnectionLossException e) {
                results = new HashMap<String, ByteBuffer>();
                if (m_log.isDebugEnabled()) {
                    m_log.debug(m_stateMachineId + ": Received ConnectionLossException in getCorrelatedResults");
                }
            } catch (InterruptedException e) {
                results = new HashMap<String, ByteBuffer>();
                if (m_log.isDebugEnabled()) {
                    m_log.debug(m_stateMachineId + ": Received InterruptedException in getCorrelatedResults");
                }
            } catch (Exception e) {
                org.voltdb.VoltDB.crashLocalVoltDB(
                        "Unexpected failure in StateMachine.", true, e);
            }
            return results;
        }

        /*
         * Handles the point where every known member has supplied a result for the pending request.
         * The set of members with results is a superset of the membership, so analyze the results.
         *
         * Two situations are handled:
         *   - m_requestedInitialState != null: this instance is still waiting to be initialized. The
         *     outcome of the request stored in the results node determines m_synchronizedState and,
         *     if a state could be determined, the derived class is told via setInitialState().
         *   - otherwise: this instance is a full participant and the outcome of the pending state
         *     change or task is reported to the derived class through proposedStateResolved(),
         *     correlatedTaskCompleted() or uncorrelatedTaskCompleted().
         *
         * Locking: the caller (checkForBarrierResultsChanges) holds the local state lock on entry and
         * asserts it has been released on return. Every path below calls unlockLocalState() before
         * invoking a derived-class callback, and monitorParticipantChanges() re-acquires and releases
         * the lock on its own.
         *
         * An exception thrown by a derived-class callback clears m_initializationCompleted and submits
         * a CallbackExceptionHandler; in that case participant monitoring is not restarted here.
         *
         * @param memberList ids of the members that currently have a result node
         */
        private void processResultQuorum(Set memberList) {
            assert(m_currentRequestType != REQUEST_TYPE.INITIALIZING);
            m_memberResults = null;
            if (m_requestedInitialState != null) {
                // We can now initialize this state machine instance
                assert(m_holdingDistributedLock);
                try {
                    assert(m_synchronizedState == null);
                    Stat versionInfo = new Stat();
                    // The results node itself carries the request type, the previous state and the proposal
                    byte oldAndProposedState[] = m_zk.getData(m_barrierResultsPath, false, versionInfo);
                    assert(versionInfo.getVersion() == m_lastProposalVersion);
                    StateChangeRequest existingAndProposedStates =
                            getExistingAndProposedBuffersFromResultsNode(oldAndProposedState);
                    m_currentRequestType = existingAndProposedStates.m_requestType;

                    // We are waiting to initialize. We are here either because we are piggy-backing
                    // another member's proposal or because we initiated a LAST_CHANGE_OUTCOME_REQUEST
                    if (m_currentRequestType == REQUEST_TYPE.LAST_CHANGE_OUTCOME_REQUEST) {
                        // We don't have a result yet but it is available
                        RESULT_CONCENSUS result = resultsAgreeOnSuccess(memberList);
                        if (result == RESULT_CONCENSUS.NO_QUORUM) {
                            // Bad news. There is no definitive state, so if we initiated this request
                            // we adopt our requested initial state as the synchronized state and publish
                            // it in the results node as an INITIALIZING entry.
                            if (m_stateChangeInitiator) {
                                m_synchronizedState = m_requestedInitialState;
                                ByteBuffer stableState = buildProposal(REQUEST_TYPE.INITIALIZING,
                                        m_synchronizedState.asReadOnlyBuffer(), m_synchronizedState.asReadOnlyBuffer());
                                Stat newProposalStat = m_zk.setData(m_barrierResultsPath, stableState.array(), -1);
                                m_lastProposalVersion = newProposalStat.getVersion();
                            }
                            // A non-initiator leaves m_synchronizedState null and stays uninitialized for now
                        }
                        else {
                            // There was at least one member that knows the definitive state
                            if (result == RESULT_CONCENSUS.AGREE) {
                                m_synchronizedState = existingAndProposedStates.m_proposal;
                            }
                            else {
                                m_synchronizedState = existingAndProposedStates.m_previousState;
                            }
                        }
                    }
                    else
                    if (m_currentRequestType == REQUEST_TYPE.STATE_CHANGE_REQUEST) {
                        // We don't have a result yet but it is available
                        RESULT_CONCENSUS result = resultsAgreeOnSuccess(memberList);
                        if (result == RESULT_CONCENSUS.AGREE) {
                            // The proposal was accepted, so it is the state we start from
                            m_synchronizedState = existingAndProposedStates.m_proposal;
                        }
                        else {
                            assert(result == RESULT_CONCENSUS.DISAGREE);
                            m_synchronizedState = existingAndProposedStates.m_previousState;
                        }
                    }
                    else {
                        // If we are processing a TASK or an updated STABLE result, we don't need a quorum
                        m_synchronizedState = existingAndProposedStates.m_previousState;
                    }

                    // Remove ourselves from the participants list to unblock the next distributed lock waiter
                    m_zk.delete(m_myParticipantPath, -1);
                } catch (KeeperException.SessionExpiredException e) {
                    // ZooKeeper session/connection problems are only logged at debug level here
                    if (m_log.isDebugEnabled()) {
                        m_log.debug(m_stateMachineId + ": Received SessionExpiredException in processResultQuorum");
                    }
                } catch (KeeperException.ConnectionLossException e) {
                    if (m_log.isDebugEnabled()) {
                        m_log.debug(m_stateMachineId + ": Received ConnectionLossException in processResultQuorum");
                    }
                } catch (InterruptedException e) {
                    if (m_log.isDebugEnabled()) {
                        m_log.debug(m_stateMachineId + ": Received InterruptedException in processResultQuorum");
                    }
                } catch (Exception e) {
                    org.voltdb.VoltDB.crashLocalVoltDB(
                            "Unexpected failure in StateMachine.", true, e);
                }
                if (m_stateChangeInitiator) {
                    // We initiated the outcome request, so give up the distributed lock taken for it
                    m_stateChangeInitiator = false;
                    cancelDistributedLock();
                }
                if (m_synchronizedState != null) {
                    // Capture a read-only view for the callback before the lock is released
                    ByteBuffer readOnlyResult = m_synchronizedState.asReadOnlyBuffer();
                    // We were not yet participating in the state machine but now we can
                    m_requestedInitialState = null;
                    m_pendingProposal = null;
                    m_log.info(m_stateMachineId + ": Initialized (concensus) with State " +
                            stateToString(m_synchronizedState.asReadOnlyBuffer()));
                    m_initializationCompleted = true;
                    unlockLocalState();
                    try {
                        setInitialState(readOnlyResult);
                    } catch (Exception e) {
                        if (m_log.isDebugEnabled()) {
                            m_log.debug("Error in StateMachineInstance callbacks.", e);
                        }
                        m_initializationCompleted = false;
                        m_shared_es.submit(new CallbackExceptionHandler(this));
                    }

                    if (m_initializationCompleted) {
                        // If we are ready to provide an initial state to the derived state machine, add us to
                        // participants watcher so we can see the next request
                        monitorParticipantChanges();
                    }
                }
                else {
                    // No state could be determined; remain uninitialized and just release the lock
                    unlockLocalState();
                }
            }
            else {
                assert(m_currentRequestType != REQUEST_TYPE.LAST_CHANGE_OUTCOME_REQUEST);
                // Remember whether we initiated the request; m_stateChangeInitiator is reset below
                // before the derived-class callback is invoked.
                boolean initiator = m_stateChangeInitiator;
                boolean success = false;
                if (m_currentRequestType == REQUEST_TYPE.STATE_CHANGE_REQUEST) {
                    // Treat ZooKeeper failures as unsuccessful proposal
                    ByteBuffer attemptedChange = m_pendingProposal.asReadOnlyBuffer();
                    try {
                        // We don't have a result yet but it is available
                        RESULT_CONCENSUS result = resultsAgreeOnSuccess(memberList);
                        // Now that we have a result we can remove ourselves from the participants list.
                        m_zk.delete(m_myParticipantPath, -1);
                        if (result == RESULT_CONCENSUS.AGREE) {
                            success = true;
                            m_synchronizedState = m_pendingProposal;
                        }
                        else {
                            assert(result == RESULT_CONCENSUS.DISAGREE);
                        }
                        m_pendingProposal = null;
                        if (m_stateChangeInitiator) {
                            // Since we don't care if we are the last to go away, remove ourselves from the participant list
                            assert(m_holdingDistributedLock);
                            m_stateChangeInitiator = false;
                            cancelDistributedLock();
                        }
                    } catch (KeeperException.SessionExpiredException e) {
                        success = false;
                        if (m_log.isDebugEnabled()) {
                            m_log.debug(m_stateMachineId + ": Received SessionExpiredException in processResultQuorum");
                        }
                    } catch (KeeperException.ConnectionLossException e) {
                        success = false;
                        if (m_log.isDebugEnabled()) {
                            m_log.debug(m_stateMachineId + ": Received ConnectionLossException in processResultQuorum");
                        }
                    } catch (InterruptedException e) {
                        success = false;
                        if (m_log.isDebugEnabled()) {
                            m_log.debug(m_stateMachineId + ": Received InterruptedException in processResultQuorum");
                        }
                    } catch (Exception e) {
                        org.voltdb.VoltDB.crashLocalVoltDB(
                                "Unexpected failure in StateMachine.", true, e);
                        success = false;
                    }
                    // NOTE(review): when one of the ZooKeeper exceptions above is caught, m_pendingProposal
                    // and m_stateChangeInitiator keep their previous values -- confirm this is intended.
                    m_log.info(m_stateMachineId + ": Proposed state " + (success?"succeeded ":"failed ") +
                            stateToString(attemptedChange.asReadOnlyBuffer()));
                    unlockLocalState();
                    // Notify the derived state machine engine of the current state
                    try {
                        proposedStateResolved(initiator, attemptedChange, success);
                    } catch (Exception e) {
                        if (m_log.isDebugEnabled()) {
                            m_log.debug("Error in StateMachineInstance callbacks.", e);
                        }
                        m_initializationCompleted = false;
                        m_shared_es.submit(new CallbackExceptionHandler(this));
                    }

                    if (m_initializationCompleted) {
                        // Resume watching the participant list so the next request is noticed
                        monitorParticipantChanges();
                    }
                }
                else {
                    // Process the results of a TASK request
                    ByteBuffer taskRequest = m_pendingProposal.asReadOnlyBuffer();
                    m_pendingProposal = null;
                    m_log.info(m_stateMachineId + ": All members completed task " + taskToString(taskRequest.asReadOnlyBuffer()));
                    if (m_currentRequestType == REQUEST_TYPE.CORRELATED_COORDINATED_TASK) {
                        // Results keyed by member id; this call also deletes our participant node
                        Map results = getCorrelatedResults(taskRequest, memberList);
                        if (m_stateChangeInitiator) {
                            // Since we don't care if we are the last to go away, remove ourselves from the participant list
                            assert(m_holdingDistributedLock);
                            m_stateChangeInitiator = false;
                            cancelDistributedLock();
                        }
                        unlockLocalState();
                        try {
                            correlatedTaskCompleted(initiator, taskRequest, results);
                        } catch (Exception e) {
                            if (m_log.isDebugEnabled()) {
                                m_log.debug("Error in StateMachineInstance callbacks.", e);
                            }
                            m_initializationCompleted = false;
                            m_shared_es.submit(new CallbackExceptionHandler(this));
                        }
                    }
                    else {
                        // Results without member attribution; this call also deletes our participant node
                        ArrayList results = getUncorrelatedResults(taskRequest, memberList);
                        if (m_stateChangeInitiator) {
                            // Since we don't care if we are the last to go away, remove ourselves from the participant list
                            assert(m_holdingDistributedLock);
                            m_stateChangeInitiator = false;
                            cancelDistributedLock();
                        }
                        unlockLocalState();
                        try {
                            uncorrelatedTaskCompleted(initiator, taskRequest, results);
                        } catch (Exception e) {
                            if (m_log.isDebugEnabled()) {
                                m_log.debug("Error in StateMachineInstance callbacks.", e);
                            }
                            m_initializationCompleted = false;
                            m_shared_es.submit(new CallbackExceptionHandler(this));
                        }
                    }

                    if (m_initializationCompleted) {
                        // Resume watching the participant list so the next request is noticed
                        monitorParticipantChanges();
                    }
                }
            }
        }

        /*
         * Re-reads the children of the barrier results node (re-arming m_barrierResultsWatcher) and
         * decides whether every known member has supplied a result for the pending proposal.
         *
         * Must be called with the local state lock held; the lock is released on every path, either
         * directly or by processResultQuorum().
         *
         * If there is no pending proposal nothing is read. If all of m_knownMembers have a result
         * node the quorum is processed; otherwise the partial set is remembered in m_memberResults.
         * When ZooKeeper cannot be read because of a session expiration, connection loss or interrupt,
         * a placeholder set holding a single descriptive string is used instead of real member ids.
         */
        private void checkForBarrierResultsChanges() {
            assert(debugIsLocalStateLocked());
            if (m_pendingProposal == null) {
                // Don't check for barrier results until we notice the participant list change.
                unlockLocalState();
                return;
            }
            Set<String> membersWithResults;
            try {
                membersWithResults = ImmutableSet.copyOf(m_zk.getChildren(m_barrierResultsPath,
                        m_barrierResultsWatcher));
            } catch (KeeperException.SessionExpiredException e) {
                membersWithResults = new TreeSet<String>(Arrays.asList("We died so avoid Quorum path"));
                if (m_log.isDebugEnabled()) {
                    m_log.debug(m_stateMachineId + ": Received SessionExpiredException in checkForBarrierResultsChanges");
                }
            } catch (KeeperException.ConnectionLossException e) {
                membersWithResults = new TreeSet<String>(Arrays.asList("We died so avoid Quorum path"));
                if (m_log.isDebugEnabled()) {
                    m_log.debug(m_stateMachineId + ": Received ConnectionLossException in checkForBarrierResultsChanges");
                }
            } catch (InterruptedException e) {
                membersWithResults = new TreeSet<String>(Arrays.asList("We died so avoid Quorum path"));
                if (m_log.isDebugEnabled()) {
                    m_log.debug(m_stateMachineId + ": Received InterruptedException in checkForBarrierResultsChanges");
                }
            } catch (Exception e) {
                org.voltdb.VoltDB.crashLocalVoltDB(
                        "Unexpected failure in StateMachine.", true, e);
                // Only needed to satisfy definite assignment in case the crash call returns
                membersWithResults = new TreeSet<String>();
            }
            // Quorum is reached once no known member is missing from the set of members with results
            if (Sets.difference(m_knownMembers, membersWithResults).isEmpty()) {
                processResultQuorum(membersWithResults);
                assert(!debugIsLocalStateLocked());
            }
            else {
                // Still waiting on somebody; remember who has answered so far
                m_memberResults = membersWithResults;
                unlockLocalState();
            }
        }

        /*
         * Assumes this state machine owns the distributed lock and can either interrogate the existing state
         * or request the current state from the community (if the existing state is ambiguous).
         *
         * The last request recorded in the results node decides what happens:
         *   - LAST_CHANGE_OUTCOME_REQUEST with a definitive answer (AGREE/DISAGREE) is converted into an
         *     INITIALIZING entry carrying the agreed state; with no answer it is treated like an
         *     unresolved STATE_CHANGE_REQUEST.
         *   - STATE_CHANGE_REQUEST: a new LAST_CHANGE_OUTCOME_REQUEST is published so the community can
         *     report the outcome; initialization completes later in the results path.
         *   - INITIALIZING or a TASK request: the recorded previous state is adopted immediately and the
         *     derived class is notified through setInitialState() (and staleTaskRequestNotification()
         *     when the recorded request was a task).
         *
         * NOTE(review): the local state lock appears to be held on entry, since
         * checkForBarrierResultsChanges() (which asserts the lock is held) is called from here. The
         * ZooKeeper-failure catch blocks below do not call unlockLocalState(); confirm the caller
         * releases the lock in that case.
         */
        private void initializeFromActiveCommunity() {
            // Both stay null unless the "adopt existing state" branch below is taken
            ByteBuffer readOnlyResult = null;
            ByteBuffer staleTask = null;
            byte oldAndProposedState[];
            try {
                Stat lastProposal = new Stat();
                oldAndProposedState = m_zk.getData(m_barrierResultsPath, false, lastProposal);
                StateChangeRequest existingAndProposedStates =
                        getExistingAndProposedBuffersFromResultsNode(oldAndProposedState);
                if (existingAndProposedStates.m_requestType == REQUEST_TYPE.LAST_CHANGE_OUTCOME_REQUEST) {
                    RESULT_CONCENSUS result = resultsAgreeOnSuccess(m_knownMembers);
                    if (result == RESULT_CONCENSUS.AGREE) {
                        // Fall through to set up the initial state and provide notification of initialization
                        // (the accepted proposal becomes both previous state and proposal)
                        existingAndProposedStates = new StateChangeRequest(REQUEST_TYPE.INITIALIZING,
                                existingAndProposedStates.m_proposal.asReadOnlyBuffer(),
                                existingAndProposedStates.m_proposal.asReadOnlyBuffer());
                    }
                    else
                    if (result == RESULT_CONCENSUS.DISAGREE) {
                        // Fall through to set up the initial state and provide notification of initialization
                        // (the rejected proposal is dropped; the previous state is kept)
                        existingAndProposedStates = new StateChangeRequest(REQUEST_TYPE.INITIALIZING,
                                existingAndProposedStates.m_previousState.asReadOnlyBuffer(),
                                existingAndProposedStates.m_previousState.asReadOnlyBuffer());
                    }
                    else {
                        // Another member's outcome request was completed but a subsequent proposing member died
                        // between the time the results of this last request were removed and the new proposal
                        // was made. Fall through and propose a new LAST_CHANGE_OUTCOME_REQUEST.
                        existingAndProposedStates = new StateChangeRequest(REQUEST_TYPE.STATE_CHANGE_REQUEST,
                                existingAndProposedStates.m_previousState.asReadOnlyBuffer(),
                                existingAndProposedStates.m_proposal.asReadOnlyBuffer());
                    }
                }

                if (existingAndProposedStates.m_requestType == REQUEST_TYPE.STATE_CHANGE_REQUEST) {
                    // The last lock owner died before completing a state change or determining the current state
                    m_stateChangeInitiator = true;
                    // Non-null pending proposal lets checkForBarrierResultsChanges() evaluate results
                    m_pendingProposal = m_requestedInitialState;
                    m_currentRequestType = REQUEST_TYPE.LAST_CHANGE_OUTCOME_REQUEST;
                    ByteBuffer stateChange = buildProposal(REQUEST_TYPE.LAST_CHANGE_OUTCOME_REQUEST,
                            existingAndProposedStates.m_previousState, existingAndProposedStates.m_proposal);
                    m_lastProposalVersion = wakeCommunityWithProposal(stateChange.array());
                    // We do not know the outcome ourselves, so our own result entry is a null result
                    addResultEntry(null);
                    checkForBarrierResultsChanges();
                }
                else {
                    assert(existingAndProposedStates.m_requestType == REQUEST_TYPE.INITIALIZING ||
                            existingAndProposedStates.m_requestType == REQUEST_TYPE.CORRELATED_COORDINATED_TASK ||
                            existingAndProposedStates.m_requestType == REQUEST_TYPE.UNCORRELATED_COORDINATED_TASK);
                    m_synchronizedState = existingAndProposedStates.m_previousState;
                    m_requestedInitialState = null;
                    readOnlyResult = m_synchronizedState.asReadOnlyBuffer();
                    m_lastProposalVersion = lastProposal.getVersion();
                    m_pendingProposal = null;
                    m_log.info(m_stateMachineId + ": Initialized (existing) with State " +
                            stateToString(m_synchronizedState.asReadOnlyBuffer()));
                    if (existingAndProposedStates.m_requestType != REQUEST_TYPE.INITIALIZING) {
                        // The recorded request was a task; report it to the derived class after initialization
                        staleTask = existingAndProposedStates.m_proposal.asReadOnlyBuffer();
                    }
                    m_initializationCompleted = true;
                    cancelDistributedLock();
                    // Add an acceptable result so the next initializing member recognizes an immediate quorum.
                    // NOTE(review): no result entry is added in the code that follows; the comment above may be stale.
                    // The literal below can never be the same object as m_ourDistributedLockName, so the
                    // identity comparison in checkForBarrierParticipantsChange() fails.
                    m_lockWaitingOn = "bogus"; // Avoids call to notifyDistributedLockWaiter below
                    checkForBarrierParticipantsChange();
                }
            } catch (KeeperException.SessionExpiredException e) {
                if (m_log.isDebugEnabled()) {
                    m_log.debug(m_stateMachineId + ": Received SessionExpiredException in initializeFromActiveCommunity");
                }
            } catch (KeeperException.ConnectionLossException e) {
                if (m_log.isDebugEnabled()) {
                    m_log.debug(m_stateMachineId + ": Received ConnectionLossException in initializeFromActiveCommunity");
                }
            } catch (InterruptedException e) {
                if (m_log.isDebugEnabled()) {
                    m_log.debug(m_stateMachineId + ": Received InterruptedException in initializeFromActiveCommunity");
                }
            } catch (Exception e) {
                org.voltdb.VoltDB.crashLocalVoltDB(
                        "Unexpected failure in StateMachine.", true, e);
            }
            if (readOnlyResult != null) {
                // Notify the derived object that we have a stable state
                try {
                    setInitialState(readOnlyResult);
                } catch (Exception e) {
                    if (m_log.isDebugEnabled()) {
                        m_log.debug("Error in StateMachineInstance callbacks.", e);
                    }
                    m_initializationCompleted = false;
                    m_shared_es.submit(new CallbackExceptionHandler(this));
                }
            }
            if (staleTask != null) {
                // Tell the derived object about the task request that was recorded before we joined
                try {
                    staleTaskRequestNotification(staleTask);
                } catch (Exception e) {
                    if (m_log.isDebugEnabled()) {
                        m_log.debug("Error in StateMachineInstance callbacks.", e);
                    }
                    m_initializationCompleted = false;
                    m_shared_es.submit(new CallbackExceptionHandler(this));
                }
            }
        }

        /*
         * Publishes a new proposal to the community. Requires that this instance holds the distributed
         * lock and that no participants remain from a previous request (both are asserted).
         *
         * Steps, in order: delete every child of the results node, overwrite the results node data
         * with the proposal, create this instance's ephemeral participant node, and set the local
         * participant count to 1.
         *
         * @param proposal serialized request to store in the results node
         * @return the version of the results node after the proposal was written, or -1 if a session
         *         expiration, connection loss or interrupt prevented any of the steps above from
         *         completing (the version is only captured after the participant node was created)
         */
        private int wakeCommunityWithProposal(byte[] proposal) {
            assert(m_holdingDistributedLock);
            assert(m_currentParticipants == 0);
            int newProposalVersion = -1;
            try {
                // Clear out the results left over from the previous request
                List<String> results = m_zk.getChildren(m_barrierResultsPath, false);
                for (String resultNode : results) {
                    m_zk.delete(ZKUtil.joinZKPath(m_barrierResultsPath, resultNode), -1);
                }
                Stat newProposalStat = m_zk.setData(m_barrierResultsPath, proposal, -1);
                m_zk.create(m_myParticipantPath, null, Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL);
                newProposalVersion = newProposalStat.getVersion();
                // force the participant count to be 1, so that lock notifications can be correctly guarded
                m_currentParticipants = 1;
            } catch (KeeperException.SessionExpiredException e) {
                if (m_log.isDebugEnabled()) {
                    m_log.debug(m_stateMachineId + ": Received SessionExpiredException in wakeCommunityWithProposal");
                }
            } catch (KeeperException.ConnectionLossException e) {
                if (m_log.isDebugEnabled()) {
                    m_log.debug(m_stateMachineId + ": Received ConnectionLossException in wakeCommunityWithProposal");
                }
            } catch (InterruptedException e) {
                if (m_log.isDebugEnabled()) {
                    m_log.debug(m_stateMachineId + ": Received InterruptedException in wakeCommunityWithProposal");
                }
            } catch (Exception e) {
                org.voltdb.VoltDB.crashLocalVoltDB(
                        "Unexpected failure in StateMachine.", true, e);
            }
            return newProposalVersion;
        }

        // Runnable that calls lockLocalStateForParticipantRunner() (presumably acquiring the local
        // state lock) and then checkForBarrierParticipantsChange().
        private final Runnable HandlerForBarrierParticipantsEvent = new Runnable() {
            @Override
            public void run() {
                lockLocalStateForParticipantRunner();
                // The check is expected to have released the local state lock by the time it
                // returns; the assert below verifies that.
                checkForBarrierParticipantsChange();
                assert(!debugIsLocalStateLocked());
            }
        };

        // Runnable that calls lockLocalStateForResultsRunner() (presumably acquiring the local
        // state lock) and then checkForBarrierResultsChanges().
        private final Runnable HandlerForBarrierResultsEvent = new Runnable() {
            @Override
            public void run() {
                lockLocalStateForResultsRunner();
                // The check is expected to have released the local state lock by the time it
                // returns; the assert below verifies that.
                checkForBarrierResultsChanges();
                assert(!debugIsLocalStateLocked());
            }
        };

        /**
         * Determines which lock node this instance has to wait for.
         *
         * The children of the lock path are sorted and scanned up to our own lock node. Walking
         * backwards from there, a watch (m_lockWatcher) is placed on the closest lower node that
         * still exists.
         *
         * @return the full path of the lower node now being watched, or our own lock node path when
         *         no lower node exists (i.e. we are next in line for the lock)
         * @throws Exception any failure reported by the ZooKeeper calls; callers decide which ones
         *         are tolerated
         */
        private String getNextLockNodeFromList() throws Exception {
            // Child names are Strings; typing the list keeps the path joins below type-safe.
            List<String> lockRequestors = m_zk.getChildren(m_lockPath, false);
            Collections.sort(lockRequestors);
            ListIterator<String> iter = lockRequestors.listIterator();
            String currRequestor = null;
            while (iter.hasNext()) {
                currRequestor = ZKUtil.joinZKPath(m_lockPath, iter.next());
                if (currRequestor.equals(m_ourDistributedLockName)) {
                    break;
                }
            }
            assert (currRequestor != null); // We should be able to find ourselves
            // Step the iterator back onto currRequestor so the next previous() yields the node below it.
            iter.previous();
            String nextLower = null;
            // Walk down through the lower nodes until a watch is successfully set on a live one.
            while (iter.hasPrevious()) {
                String previous = ZKUtil.joinZKPath(m_lockPath, iter.previous());
                // exists() registers the watch; a null Stat means the node is already gone, so
                // keep looking further down.
                if (m_zk.exists(previous, m_lockWatcher) != null) {
                    if (m_log.isDebugEnabled()) {
                        m_log.debug(m_stateMachineId + ": " + m_ourDistributedLockName + " waiting on " + previous);
                    }
                    nextLower = previous;
                    break;
                }
            }
            // If we could not watch any lower node we are lowest and must own the lock.
            if (nextLower == null) {
                return m_ourDistributedLockName;
            }
            return nextLower;
        }

        // Runnable for distributed lock events: re-evaluates which lock node we are waiting on
        // and, if it is now our own node and no participants remain, notifies the lock waiter.
        private final Runnable HandlerForDistributedLockEvent = new Runnable() {
            @Override
            public void run() {
                lockLocalStateForLockRunner();
                if (m_ourDistributedLockName == null) {
                    // lock was cancelled
                    unlockLocalState();
                }
                else {
                    // Marker stored in m_lockWaitingOn when ZooKeeper is unreachable; it never
                    // equals a real lock node path, so the lock is never considered granted.
                    final String unreachableMarker = "We died so we can't ever get the distributed lock";
                    try {
                        m_lockWaitingOn = getNextLockNodeFromList();
                    } catch (KeeperException.SessionExpiredException e) {
                        if (m_log.isDebugEnabled()) {
                            m_log.debug(m_stateMachineId + ": Received SessionExpiredException in HandlerForDistributedLockEvent");
                        }
                        m_lockWaitingOn = unreachableMarker;
                    } catch (KeeperException.ConnectionLossException e) {
                        if (m_log.isDebugEnabled()) {
                            m_log.debug(m_stateMachineId + ": Received ConnectionLossException in HandlerForDistributedLockEvent");
                        }
                        m_lockWaitingOn = unreachableMarker;
                    } catch (InterruptedException e) {
                        if (m_log.isDebugEnabled()) {
                            m_log.debug(m_stateMachineId + ": Received InterruptedException in HandlerForDistributedLockEvent");
                        }
                        m_lockWaitingOn = unreachableMarker;
                    } catch (Exception e) {
                        org.voltdb.VoltDB.crashLocalVoltDB(
                                "Unexpected failure in StateMachine.", true, e);
                    }
                    final boolean ourTurn = m_lockWaitingOn.equals(m_ourDistributedLockName);
                    if (ourTurn && m_currentParticipants == 0) {
                        // There are no more members still processing the last result
                        notifyDistributedLockWaiter();
                    }
                    else {
                        unlockLocalState();
                    }
                }
                // Every path above is expected to have released the local state lock.
                assert(!debugIsLocalStateLocked());
            }
        };

        // Gives up the distributed lock by deleting our lock node in ZooKeeper. The caller must hold
        // the local state lock and the distributed lock (both asserted).
        private void cancelDistributedLock() {
            assert(debugIsLocalStateLocked());
            if (m_log.isDebugEnabled()) {
                m_log.debug(m_stateMachineId + ": cancelLockRequest for " + m_ourDistributedLockName);
            }
            assert(m_holdingDistributedLock);
            try {
                // Version -1 deletes the node regardless of its current version.
                m_zk.delete(m_ourDistributedLockName, -1);
            } catch (KeeperException.SessionExpiredException e) {
                if (m_log.isDebugEnabled()) {
                    m_log.debug(m_stateMachineId + ": Received SessionExpiredException in cancelDistributedLock");
                }
            } catch (KeeperException.ConnectionLossException e) {
                if (m_log.isDebugEnabled()) {
                    m_log.debug(m_stateMachineId + ": Received ConnectionLossException in cancelDistributedLock");
                }
            } catch (InterruptedException e) {
                if (m_log.isDebugEnabled()) {
                    m_log.debug(m_stateMachineId + ": Received InterruptedException in cancelDistributedLock");
                }
            } catch (Exception e) {
                org.voltdb.VoltDB.crashLocalVoltDB(
                        "Unexpected failure in SynchronizedStatesManager.", true, e);
            }
            // The local bookkeeping is cleared even when the delete above was interrupted or the
            // connection/session was lost.
            m_ourDistributedLockName = null;
            m_holdingDistributedLock = false;
        }

        /*
         * Warns about membership change notification waiting in the executor thread
         */
        private void checkMembership() {
            // Only records that a membership change is pending; the flag is cleared in
            // membershipChanged(Set, Set, Set) and consulted before calling getLatestMembership().
            lockLocalState();
            m_membershipChangePending = true;
            unlockLocalState();
        }

        /*
         * Callback notification from executor thread of membership change
         */
        private void membershipChanged(Set knownHosts, Set addedMembers, Set removedMembers) {
            lockLocalState();
            // Even though we got a direct update, membership could have changed again between the
            // time this update was produced and now.
            // NOTE(review): the original comment was cut off mid-sentence; confirm the intended wording.
            m_knownMembers = knownHosts;
            m_membershipChangePending = false;
            // Captured before the quorum processing below; only instances that are not still
            // waiting for their initial state get the derived-class callback.
            boolean notInitializing = m_requestedInitialState == null;
            if (m_pendingProposal != null && m_memberResults != null &&
                    Sets.difference(m_knownMembers, m_memberResults).isEmpty()) {
                // We can stop watching for results since we have a quorum.
                // processResultQuorum is expected to release the local state lock (see the assert
                // at the end of this method).
                processResultQuorum(m_memberResults);
            }
            else {
                unlockLocalState();
            }
            if (notInitializing) {
                try {
                    // Derived-class callback, invoked without the local state lock held.
                    membershipChanged(addedMembers, removedMembers);
                } catch (Exception e) {
                    if (m_log.isDebugEnabled()) {
                        m_log.debug("Error in StateMachineInstance callbacks.", e);
                    }
                    // A failing callback marks this instance as uninitialized and hands it to the
                    // CallbackExceptionHandler on the shared executor.
                    m_initializationCompleted = false;
                    m_shared_es.submit(new CallbackExceptionHandler(this));
                }
            }
            assert(!debugIsLocalStateLocked());
        }

        /*
         * Retrieves member set from ZooKeeper
         */
        private void getLatestMembership() {
            try {
                // The null watcher argument means no watch is registered by this read.
                m_knownMembers = ImmutableSet.copyOf(m_zk.getChildren(m_stateMachineMemberPath, null));
            } catch (KeeperException.SessionExpiredException e) {
                // For the three tolerated failures below m_knownMembers keeps its previous value.
                if (m_log.isDebugEnabled()) {
                    m_log.debug(m_stateMachineId + ": Received SessionExpiredException in getLatestMembership");
                }
            } catch (KeeperException.ConnectionLossException e) {
                if (m_log.isDebugEnabled()) {
                    m_log.debug(m_stateMachineId + ": Received ConnectionLossException in getLatestMembership");
                }
            } catch (InterruptedException e) {
                if (m_log.isDebugEnabled()) {
                    m_log.debug(m_stateMachineId + ": Received InterruptedException in getLatestMembership");
                }
            } catch (Exception e) {
                org.voltdb.VoltDB.crashLocalVoltDB(
                        "Unexpected failure in SynchronizedStatesManager.", true, e);
            }
            // Note: m_membershipChangePending is not modified here.
        }

        // Called with the local state lock held once our lock node is the lowest and no participants
        // remain. Marks the distributed lock as held and either continues initialization or notifies
        // the derived class through lockRequestCompleted().
        private void notifyDistributedLockWaiter() {
            assert(debugIsLocalStateLocked());
            assert(m_currentParticipants == 0);
            m_holdingDistributedLock = true;
            if (m_membershipChangePending) {
                getLatestMembership();
            }
            if (m_requestedInitialState != null) {
                // We are still waiting to initialize the state. Now that we have the state lock we can
                // look at the last set of results. We need to set proposedState so results will be checked.
                assert(m_pendingProposal == null);
                m_pendingProposal = m_requestedInitialState;
                // initializeFromActiveCommunity is expected to release the local state lock.
                initializeFromActiveCommunity();
                assert(!debugIsLocalStateLocked());
            }
            else {
                if (m_log.isDebugEnabled()) {
                    m_log.debug(m_stateMachineId + ": Granted lockRequest for " + m_ourDistributedLockName);
                }
                m_lockWaitingOn = null;
                // Release the local state lock before calling into the derived class.
                unlockLocalState();
                // Notify the derived class that the lock is available
                try {
                    lockRequestCompleted();
                } catch (Exception e) {
                    // A failing callback gives the distributed lock back and hands this instance to
                    // the CallbackExceptionHandler on the shared executor.
                    cancelLockRequest();
                    if (m_log.isDebugEnabled()) {
                        m_log.debug("Error in StateMachineInstance callbacks.", e);
                    }
                    m_initializationCompleted = false;
                    m_shared_es.submit(new CallbackExceptionHandler(this));
                }
            }
        }

        // Creates our sequential ephemeral lock node and checks whether the lock is immediately ours.
        // Returns true only when our node is the lowest and no participants are outstanding. Returns
        // false when a lock request is already outstanding, when we have to wait for a lower node or
        // for participants, and when a tolerated ZooKeeper failure (session expired, connection loss,
        // interrupt) occurred.
        private boolean requestDistributedLock() {
            try {
                if (m_ourDistributedLockName != null) {
                    m_log.error(m_stateMachineId + ": Requested distributed lock before prior state change or task has been completed");
                    return false;
                }
                assert(debugIsLocalStateLocked());
                m_ourDistributedLockName = m_zk.create(ZKUtil.joinZKPath(m_lockPath, "lock_"), null,
                        Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL_SEQUENTIAL);
                // Sets a watch on the closest lower lock node, or returns our own node if none exists.
                m_lockWaitingOn = getNextLockNodeFromList();
                if (m_lockWaitingOn.equals(m_ourDistributedLockName) && m_currentParticipants == 0) {
                    // Prevents a second notification.
                    if (m_log.isDebugEnabled()) {
                        m_log.debug(m_stateMachineId + ": requestLock successful for " + m_ourDistributedLockName);
                    }
                    m_lockWaitingOn = null;
                    m_holdingDistributedLock = true;
                    if (m_membershipChangePending) {
                        getLatestMembership();
                    }
                    return true;
                }
            } catch (KeeperException.SessionExpiredException e) {
                if (m_log.isDebugEnabled()) {
                    m_log.debug(m_stateMachineId + ": Received SessionExpiredException in requestDistributedLock");
                }
            } catch (KeeperException.ConnectionLossException e) {
                if (m_log.isDebugEnabled()) {
                    m_log.debug(m_stateMachineId + ": Received ConnectionLossException in requestDistributedLock");
                }
            } catch (InterruptedException e) {
                if (m_log.isDebugEnabled()) {
                    m_log.debug(m_stateMachineId + ": Received InterruptedException in requestDistributedLock");
                }
            } catch (Exception e) {
                org.voltdb.VoltDB.crashLocalVoltDB(
                        "Unexpected failure in StateMachine.", true, e);
            }
            return false;
        }

        // Creates this member's result node (m_myResultPath) as a PERSISTENT node holding the given
        // bytes. Session expiration, connection loss and interrupts are only logged at debug level.
        private void addResultEntry(byte result[]) {
            try {
                m_zk.create(m_myResultPath, result, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
            } catch (KeeperException.SessionExpiredException e) {
                if (m_log.isDebugEnabled()) {
                    m_log.debug(m_stateMachineId + ": Received SessionExpiredException in addResultEntry");
                }
            } catch (KeeperException.ConnectionLossException e) {
                if (m_log.isDebugEnabled()) {
                    m_log.debug(m_stateMachineId + ": Received ConnectionLossException in addResultEntry");
                }
            } catch (InterruptedException e) {
                if (m_log.isDebugEnabled()) {
                    m_log.debug(m_stateMachineId + ": Received InterruptedException in addResultEntry");
                }
            }
            catch (KeeperException.NodeExistsException e) {
                // There is a race during initialization where a null result is assigned once so a current proposal
                // is not hung. However, if a new proposal is submitted by another instance between that assignment
                // and the call to checkForBarrierParticipantsChange, a second null result is assigned.
                // An existing node is therefore tolerated only while still initializing AND when the
                // result being written is null; every other duplicate is fatal.
                if (m_requestedInitialState == null || result != null) {
                    org.voltdb.VoltDB.crashLocalVoltDB(
                            "Unexpected failure in StateMachine; Two results created for one proposal.", true, e);
                }
            } catch (Exception e) {
                org.voltdb.VoltDB.crashLocalVoltDB(
                        "Unexpected failure in StateMachine.", true, e);
            }
        }

        // Records this member's vote (1 = accept, 0 = reject) for the pending state change proposal.
        // Must be called with the local state lock held; the lock is released on the reject path
        // here, and the accept path hands off to checkForBarrierResultsChanges().
        private void assignStateChangeAgreement(boolean acceptable) {
            assert(debugIsLocalStateLocked());
            assert(m_pendingProposal != null);
            assert(m_currentRequestType == REQUEST_TYPE.STATE_CHANGE_REQUEST);
            final byte[] vote = { (byte) (acceptable ? 1 : 0) };
            addResultEntry(vote);
            if (acceptable) {
                // Since we are only interested in the results when we agree with the proposal
                checkForBarrierResultsChanges();
                return;
            }
            // We rejected the proposal: forget it locally.
            m_pendingProposal = null;
            try {
                // Since we don't care about the outcome remove ourself from the participant list
                m_zk.delete(m_myParticipantPath, -1);
            } catch (KeeperException.SessionExpiredException e) {
                if (m_log.isDebugEnabled()) {
                    m_log.debug(m_stateMachineId + ": Received SessionExpiredException in assignStateChangeAgreement");
                }
            } catch (KeeperException.ConnectionLossException e) {
                if (m_log.isDebugEnabled()) {
                    m_log.debug(m_stateMachineId + ": Received ConnectionLossException in assignStateChangeAgreement");
                }
            } catch (InterruptedException e) {
                if (m_log.isDebugEnabled()) {
                    m_log.debug(m_stateMachineId + ": Received InterruptedException in assignStateChangeAgreement");
                }
            } catch (Exception e) {
                org.voltdb.VoltDB.crashLocalVoltDB(
                        "Unexpected failure in StateMachine.", true, e);
            }
            unlockLocalState();
        }

        /*
         * Returns true when this state machine is a full participant of the community and has the
         * current state of the community
         */
        protected boolean isInitialized() {
            // Both fields are read under the local state lock so they form a consistent snapshot.
            lockLocalState();
            try {
                return m_initializationCompleted && m_requestedInitialState == null;
            } finally {
                unlockLocalState();
            }
        }

        /*
         * Notifies of full participation and the current agreed state
         * warning: The ByteBuffer is not guaranteed to start at position 0 (avoid rewind, flip, ...)
         */
        // Implemented by the derived class. If this callback throws, the exception is logged at debug
        // level, the instance is marked uninitialized and a CallbackExceptionHandler is submitted.
        protected abstract void setInitialState(ByteBuffer currentAgreedState);

        /*
         * Attempts to get the distributed lock. Returns true if the lock was acquired
         */
        protected boolean requestLock() {
            lockLocalState();
            // Short-circuit: the distributed lock is only requested once initialization has completed.
            final boolean acquired = m_initializationCompleted && requestDistributedLock();
            unlockLocalState();
            return acquired;
        }

        /*
         * Cancels an outstanding lock request. Should only be used if a previously desirable
         * proposal is now obsolete (ie. use when lockRequestCompleted is called and a proposal
         * is no longer necessary).
         */
        protected void cancelLockRequest() {
            lockLocalState();
            if (!m_initializationCompleted) {
                // Nothing to cancel while this instance is not initialized.
                unlockLocalState();
                return;
            }
            assert (m_pendingProposal == null);
            cancelDistributedLock();
            unlockLocalState();
        }

        /*
         * Notifies of the successful completion of a previous lock request
         */
        protected void lockRequestCompleted() {
            // Default implementation does nothing. It is invoked by notifyDistributedLockWaiter()
            // after the local state lock has been released; an exception thrown by an override
            // causes the lock request to be cancelled.
        }

        /*
         * Propose a new state. Only call after successful acquisition of the distributed lock.
         */
        protected void proposeStateChange(ByteBuffer proposedState) {
            assert(proposedState != null);
            assert(proposedState.remaining() < Short.MAX_VALUE);
            lockLocalState();
            if (!m_initializationCompleted) {
                // Silently ignored while this instance is not initialized.
                unlockLocalState();
                return;
            }
            // Only the lock owner can initiate a barrier request
            assert(m_requestedInitialState == null);
            if (proposedState.position() == 0) {
                m_pendingProposal = proposedState;
            }
            else {
                // Move to a new 0 aligned buffer. The bytes are copied through a duplicate so the
                // caller's position is left untouched and so buffers without an accessible backing
                // array (read-only or direct buffers) can be proposed as well.
                m_pendingProposal = ByteBuffer.allocate(proposedState.remaining());
                m_pendingProposal.put(proposedState.duplicate());
                m_pendingProposal.flip();
            }
            if (m_log.isDebugEnabled()) {
                m_log.debug(m_stateMachineId + ": Proposing new state " + stateToString(m_pendingProposal.asReadOnlyBuffer()));
            }
            m_stateChangeInitiator = true;
            m_currentRequestType = REQUEST_TYPE.STATE_CHANGE_REQUEST;
            ByteBuffer stateChange = buildProposal(REQUEST_TYPE.STATE_CHANGE_REQUEST,
                    m_synchronizedState.asReadOnlyBuffer(), m_pendingProposal.asReadOnlyBuffer());
            // Remember the version of our own proposal (-1 if publishing it failed).
            m_lastProposalVersion = wakeCommunityWithProposal(stateChange.array());
            // The initiator always agrees with its own proposal; this call takes over the
            // responsibility for releasing the local state lock.
            assignStateChangeAgreement(true);
        }

        /*
         * Notification of a new proposed state change by another member.
         * warning: The ByteBuffer is not guaranteed to start at position 0 (avoid rewind, flip, ...)
         */
        protected void stateChangeProposed(ByteBuffer proposedState) {
            // Default implementation does nothing.
        }

        /*
         * Called to accept or reject a new proposed state change by another member.
         */
        protected void requestedStateChangeAcceptable(boolean acceptable) {
            lockLocalState();
            if (m_initializationCompleted) {
                // The initiator votes through proposeStateChange, never through this method.
                assert(!m_stateChangeInitiator);
                if (m_log.isDebugEnabled()) {
                    final String verdict = acceptable
                            ? ": Agrees with State proposal"
                            : ": Disagrees with State proposal";
                    m_log.debug(m_stateMachineId + verdict);
                }
                // Takes over the responsibility for releasing the local state lock.
                assignStateChangeAgreement(acceptable);
            }
            else {
                unlockLocalState();
            }
        }

        /*
         * Notification of the outcome of a previous state change proposal.
         * warning: The ByteBuffer is not guaranteed to start at position 0 (avoid rewind, flip, ...)
         */
        protected void proposedStateResolved(boolean ourProposal, ByteBuffer proposedState, boolean success) {
            // Default implementation does nothing.
        }

        /*
         * Propose a new task. Only call after successful acquisition of the distributed lock.
         */
        protected void initiateCoordinatedTask(boolean correlated, ByteBuffer proposedTask) {
            assert(proposedTask != null);
            assert(proposedTask.remaining() < Short.MAX_VALUE);
            lockLocalState();
            if (m_initializationCompleted) {
                // Only the lock owner can initiate a barrier request
                assert (m_requestedInitialState == null);
                if (proposedTask.position() == 0) {
                    m_pendingProposal = proposedTask;
                } else {
                    // Move to a new 0 aligned buffer. The bytes are copied through a duplicate so the
                    // caller's position is left untouched and so buffers without an accessible
                    // backing array (read-only or direct buffers) can be used as well.
                    m_pendingProposal = ByteBuffer.allocate(proposedTask.remaining());
                    m_pendingProposal.put(proposedTask.duplicate());
                    m_pendingProposal.flip();
                }
                if (m_log.isDebugEnabled()) {
                    if (m_pendingProposal.hasRemaining()) {
                        m_log.debug(m_stateMachineId + ": Requested new Task " + taskToString(m_pendingProposal.asReadOnlyBuffer()));
                    } else {
                        m_log.debug(m_stateMachineId + ": Requested unspecified new Task");
                    }
                }
                m_stateChangeInitiator = true;
                m_currentRequestType = correlated ?
                        REQUEST_TYPE.CORRELATED_COORDINATED_TASK :
                        REQUEST_TYPE.UNCORRELATED_COORDINATED_TASK;
                // The 0 aligned copy is what gets published and what stays in m_pendingProposal,
                // matching proposeStateChange. Previously m_pendingProposal was reset to the
                // caller's buffer afterwards, which discarded the realignment done above.
                ByteBuffer taskProposal = buildProposal(m_currentRequestType,
                        m_synchronizedState.asReadOnlyBuffer(), m_pendingProposal.asReadOnlyBuffer());
                // Since we don't update m_lastProposalVersion, we will wake ourselves up
                wakeCommunityWithProposal(taskProposal.array());
            }
            unlockLocalState();
        }

        /*
         * Notification of a new task request by this member or another member.
         * warning: The ByteBuffer is not guaranteed to start at position 0 (avoid rewind, flip, ...)
         */
        protected void taskRequested(ByteBuffer proposedTask) {
            // Default implementation does nothing.
        }

        /*
         * Notification of a task request for newly joined members.
         * warning: The ByteBuffer is not guaranteed to start at position 0 (avoid rewind, flip, ...)
         */
        protected void staleTaskRequestNotification(ByteBuffer proposedTask) {
            // Default implementation does nothing. If an override throws, the exception is logged
            // at debug level, the instance is marked uninitialized and a CallbackExceptionHandler
            // is submitted.
        }

        /*
         * Called to report this member's result once a requested task has completed locally.
         */
        protected void requestedTaskComplete(ByteBuffer result)
        {
            lockLocalState();
            if (!m_initializationCompleted) {
                unlockLocalState();
                return;
            }
            assert(m_pendingProposal != null);
            if (m_log.isDebugEnabled()) {
                final String outcome = result.hasRemaining()
                        ? ": Local Task completed with result " +
                                taskResultToString(m_pendingProposal.asReadOnlyBuffer(), result.asReadOnlyBuffer())
                        : ": Local Task completed with empty result";
                m_log.debug(m_stateMachineId + outcome);
            }
            // NOTE(review): array() publishes the whole backing array regardless of the buffer's
            // position/limit and throws for read-only or direct buffers - confirm callers always
            // pass an exactly-sized, array-backed buffer.
            final byte[] resultBytes = result.array();
            addResultEntry(resultBytes);
            // Takes over the responsibility for releasing the local state lock.
            checkForBarrierResultsChanges();
        }

        /*
         * Notification of the outcome of the task by all members sorted by memberId.
         * warning: The ByteBuffer taskRequest is not guaranteed to start at position 0 (avoid rewind, flip, ...)
         */
        protected void correlatedTaskCompleted(boolean ourTask, ByteBuffer taskRequest, Map results) {
            // Default implementation does nothing.
        }

        /*
         * Notification of the outcome of the task by all members.
         * warning: The ByteBuffer taskRequest is not guaranteed to start at position 0 (avoid rewind, flip, ...)
         */
        protected void uncorrelatedTaskCompleted(boolean ourTask, ByteBuffer taskRequest, List results) {
            // Default implementation does nothing.
        }

        /*
         * Returns the current State. This request ignores pending proposals or outcomes. And may well be invalid as soon
         * as the result is returned if a new proposed state is pending. To avoid this get the Distributed Lock first.
         * warning: The returned ByteBuffer is not guaranteed to start at position 0 (avoid rewind, flip, ...)
         */
        protected ByteBuffer getCurrentState() {
            lockLocalState();
            // Before initialization completes there is no agreed state, so an empty buffer is returned.
            final ByteBuffer snapshot = m_initializationCompleted
                    ? m_synchronizedState.asReadOnlyBuffer()
                    : ByteBuffer.allocate(0);
            unlockLocalState();
            return snapshot;
        }

        protected void membershipChanged(Set addedMembers, Set removedMembers) {
            // Default implementation does nothing. The three-argument membershipChanged invokes
            // this only when the instance is not waiting for its initial state, and without the
            // local state lock held.
        }

        // Returns an immutable snapshot of the known members, refreshing the set from ZooKeeper
        // first when a membership change is pending.
        protected Set getCurrentMembers() {
            lockLocalState();
            try {
                if (m_membershipChangePending) {
                    getLatestMembership();
                }
                // copyOf throws if m_knownMembers is still null; the finally block makes sure the
                // local state lock is not left held in that case.
                return ImmutableSet.copyOf(m_knownMembers);
            } finally {
                unlockLocalState();
            }
        }

        // Returns the enclosing manager's m_memberId, which the manager's constructor builds as the
        // canonical member id followed by "_v" and the reset counter.
        protected String getMemberId() {
            return m_memberId;
        }

        /*
         * Provides the version of the last received proposal or task.
         * Note that the version number is not the number of state transitions but counts the total number of
         * successful state transitions, failed state transitions, task requests, and last change outcome requests
         */
        protected int getCurrentStateVersion() {
            // Read without taking the local state lock. proposeStateChange stores the return value
            // of wakeCommunityWithProposal here, which is -1 when publishing the proposal failed.
            return m_lastProposalVersion;
        }

        // Reports whether this instance currently holds the distributed lock.
        protected boolean holdingDistributedLock() {
            lockLocalState();
            // Read the flag while the local state lock is held (previously it was read after the
            // lock had already been released), consistent with isInitialized().
            final boolean holding = m_holdingDistributedLock;
            unlockLocalState();
            return holding;
        }

        // Implemented by the derived class; used to render a state in debug log messages
        // (e.g. in proposeStateChange, which passes a read-only view of the proposal).
        protected abstract String stateToString(ByteBuffer state);

        // Renders a task for debug log messages (see initiateCoordinatedTask); the default is an
        // empty string.
        protected String taskToString(ByteBuffer task) { return ""; }

        // Renders a task result for debug log messages (see requestedTaskComplete); the default is
        // an empty string.
        protected String taskResultToString(ByteBuffer task, ByteBuffer taskResult) { return ""; }
    }

    // Convenience constructor for a manager with a single registered state machine instance.
    public SynchronizedStatesManager(ZooKeeper zk, String rootPath, String ssmNodeName, String memberId)
            throws KeeperException, InterruptedException {
        this(zk, rootPath, ssmNodeName, memberId, 1);
    }

    // Used only for Mocking StateMachineInstance
    public SynchronizedStatesManager() {
        // No ZooKeeper client and no instance registry: the remaining fields get fixed
        // placeholder values.
        m_zk = null;
        m_registeredStateMachines = null;
        m_ssmRootNode = "MockRootForZooKeeper";
        m_stateMachineRoot = "MockRootForSSM";
        m_stateMachineMemberPath = "MockRootMembershipNode";
        m_memberId = "MockMemberId";
        m_canonical_memberId = "MockCanonicalMemberId";
        // Same reset allowance (5) that the five-argument constructor passes as its default.
        m_resetAllowance = 5;
        m_resetLimit = m_resetAllowance;
    }

    // Convenience constructor that uses the default reset allowance.
    public SynchronizedStatesManager(ZooKeeper zk, String rootPath, String ssmNodeName, String memberId, int registeredInstances)
            throws KeeperException, InterruptedException {
        this(zk, rootPath, ssmNodeName, memberId, registeredInstances, 5); // default resetAllowance is 5
    }

    // Full constructor: sets up the paths for this manager and makes sure the manager's root node
    // exists, storing the expected number of state machine instances in it.
    public SynchronizedStatesManager(ZooKeeper zk, String rootPath, String ssmNodeName, String memberId, int registeredInstances, int resetAllowance)
            throws KeeperException, InterruptedException {
        m_zk = zk;
        // We will not add ourselves as members in ZooKeeper until all StateMachineInstances have registered
        m_registeredStateMachines = new StateMachineInstance[registeredInstances];
        m_ssmRootNode = ssmNodeName;
        m_stateMachineRoot = ZKUtil.joinZKPath(rootPath, ssmNodeName);
        // The instance count is stored in the root node as a 4 byte int.
        final byte[] instanceCount = ByteBuffer.allocate(4).putInt(registeredInstances).array();
        addIfMissing(m_stateMachineRoot, CreateMode.PERSISTENT, instanceCount);
        m_stateMachineMemberPath = ZKUtil.joinZKPath(m_stateMachineRoot, m_memberNode);
        m_canonical_memberId = memberId;
        m_resetCounter = 0;
        m_resetAllowance = resetAllowance;
        m_resetLimit = resetAllowance;
        // The member id carries the reset counter as a version suffix.
        m_memberId = memberId + "_v" + m_resetCounter;
    }

    // Submits disableInstances to the shared executor and blocks until it has finished.
    public void ShutdownSynchronizedStatesManager() throws InterruptedException {
        ListenableFuture disableComplete = m_shared_es.submit(disableInstances);
        try {
            disableComplete.get();
        }
        catch (ExecutionException e) {
            // Surface the task's own failure rather than the ExecutionException wrapper.
            Throwables.propagate(e.getCause());
        }

    }

    // Shutdown task: sets m_done, disables membership on every registered state machine and then
    // deletes this member's node under the member path.
    private final Runnable disableInstances = new Runnable() {
        @Override
        public void run() {
            try {
                // Set first so MembershipWatcher stops reacting to events.
                m_done.set(true);
                for (StateMachineInstance stateMachine : m_registeredStateMachines) {
                    stateMachine.disableMembership();
                }
                m_zk.delete(ZKUtil.joinZKPath(m_stateMachineMemberPath, m_memberId), -1);
            } catch (KeeperException.SessionExpiredException e) {
                // lost the full connection. some test cases do this...
                // means zk shutdown without the elector being shutdown.
                // ignore.
            } catch (KeeperException.ConnectionLossException e) {
                // lost the full connection. some test cases do this...
                // means shutdown without the elector being
                // shutdown; ignore.
            } catch (InterruptedException e) {
                // Ignored; the interrupt status is not restored here.
            } catch (Exception e) {
                org.voltdb.VoltDB.crashLocalVoltDB(
                        "Unexpected failure in SynchronizedStatesManager.", true, e);
            }
        }
    };

    /**
     * Task that re-evaluates group membership by calling
     * {@code checkForMembershipChanges()}.
     * Session expiry, connection loss and interruption are ignored; any other
     * exception is handed to {@code crashLocalVoltDB}.
     */
    private final Runnable membershipEventHandler = new Runnable() {
        @Override
        public void run() {
            try {
                checkForMembershipChanges();
            } catch (KeeperException.SessionExpiredException
                    | KeeperException.ConnectionLossException
                    | InterruptedException ignored) {
                // The ZooKeeper session or connection was lost (some test cases shut
                // things down in this order), or this thread was interrupted.
                // Deliberately ignored.
            } catch (Exception e) {
                org.voltdb.VoltDB.crashLocalVoltDB(
                        "Unexpected failure in SynchronizedStatesManager.", true, e);
            }
        }
    };

    /**
     * Watcher passed to the {@code getChildren} calls on the member path. Unless the
     * manager is already done, each event makes every registered state machine
     * instance run {@code checkMembership()} and then queues
     * {@code membershipEventHandler} on the shared executor.
     */
    private class MembershipWatcher implements Watcher {

        @Override
        public void process(WatchedEvent event) {
            if (m_done.get()) {
                // Manager is done; drop the event.
                return;
            }
            try {
                for (StateMachineInstance registered : m_registeredStateMachines) {
                    registered.checkMembership();
                }
                m_shared_es.submit(membershipEventHandler);
            } catch (RejectedExecutionException ignored) {
                // A rejected task means the event is simply dropped.
            }
        }
    }

    // Single watcher instance reused for every membership watch this manager sets.
    private final MembershipWatcher m_membershipWatcher = new MembershipWatcher();

    /**
     * Task that joins the member group and initializes every registered state
     * machine instance with the current member set.
     * On session expiry, connection loss or interruption the manager is only marked
     * as done; any other exception is handed to {@code crashLocalVoltDB} first.
     */
    private final Runnable initializeInstances = new Runnable() {
        @Override
        public void run() {
            try {
                final byte[] rootData = m_zk.getData(m_stateMachineRoot, false, null);
                // The root node's data must hold the instance count this manager registered.
                // The decode stays inside the assert so it only runs when assertions are on.
                assert(m_registeredStateMachineInstances == ByteBuffer.wrap(rootData).getInt());

                // Join the community: make sure the members parent exists ...
                addIfMissing(m_stateMachineMemberPath, CreateMode.PERSISTENT, null);
                // ... then add our own ephemeral node. This could fail because of an old
                // ephemeral that has not aged out yet, but that is assumed not to happen.
                addIfMissing(ZKUtil.joinZKPath(m_stateMachineMemberPath, m_memberId), CreateMode.EPHEMERAL, null);

                // Snapshot the current members, passing the membership watcher along.
                m_groupMembers = ImmutableSet.copyOf(m_zk.getChildren(m_stateMachineMemberPath, m_membershipWatcher));

                // Finally hand the member snapshot to every instance.
                for (StateMachineInstance registered : m_registeredStateMachines) {
                    registered.initializeStateMachine(m_groupMembers);
                }
            } catch (KeeperException.SessionExpiredException
                    | KeeperException.ConnectionLossException
                    | InterruptedException ignored) {
                // ZooKeeper session or connection lost (some test cases do this), or
                // this thread was interrupted: just mark the manager as done.
                m_done.set(true);
            } catch (Exception e) {
                org.voltdb.VoltDB.crashLocalVoltDB(
                        "Unexpected failure in initializeInstances.", true, e);
                m_done.set(true);
            }
        }
    };

    /**
     * Stores {@code machine} in the next free slot of the registration array. Once
     * the array is full, runs {@code initializeInstances} on the shared executor and
     * blocks until it has finished.
     *
     * @param machine the state machine instance to register
     * @throws InterruptedException if the calling thread is interrupted while
     *         waiting for initialization to complete
     */
    private synchronized void registerStateMachine(StateMachineInstance machine) throws InterruptedException {
        assert(m_registeredStateMachineInstances < m_registeredStateMachines.length);

        m_registeredStateMachines[m_registeredStateMachineInstances] = machine;
        ++m_registeredStateMachineInstances;
        if (m_registeredStateMachineInstances != m_registeredStateMachines.length) {
            // Still waiting for more instances to register.
            return;
        }

        if (machine.m_log.isDebugEnabled()) {
            // Debug-only sanity check: all state machines must use unique paths.
            // NOTE(review): assumes m_statePath is a String; confirm against its declaration.
            Set<String> seenStatePaths = new HashSet<>();
            for (StateMachineInstance instance : m_registeredStateMachines) {
                if (!seenStatePaths.add(instance.m_statePath)) {
                    machine.m_log.error(": Multiple state machine instances with the same instanceName [" +
                            instance.m_statePath + "]");
                }
            }
        }

        // Do all the initialization and notifications on the executor to avoid a race with
        // the group membership changes
        ListenableFuture<?> initComplete = m_shared_es.submit(initializeInstances);
        try {
            initComplete.get();
        }
        catch (ExecutionException e) {
            // Hand the task's underlying cause (not the wrapper) to Throwables.propagate.
            Throwables.propagate(e.getCause());
        }
    }

    /**
     * Task that resets all registered state machine instances on behalf of one
     * instance (the "direct victim"). It disables all instances, bumps the reset
     * counter and, while the reset limit is not exceeded, resets every instance
     * under a new versioned member id and runs {@code initializeInstances} again.
     */
    private class CallbackExceptionHandler implements Runnable {
        final StateMachineInstance m_directVictim;

        CallbackExceptionHandler(StateMachineInstance directVictim) {
            m_directVictim = directVictim;
        }

        @Override
        public void run() {
            // If the direct victim has already been reset, this task is stale: ignore it.
            if (m_directVictim.isInitializationCompleted()) {
                return;
            }
            assert m_registeredStateMachineInstances > 0;
            assert m_registeredStateMachineInstances == m_registeredStateMachines.length;

            disableInstances.run();

            final long now = System.currentTimeMillis();
            // -1 means there was no earlier reset to measure against. Otherwise, once a
            // full reset clear threshold has passed since the last reset, bump the limit.
            if (m_lastResetTimeInMillis != -1L
                    && now - m_lastResetTimeInMillis >= RESET_CLEAR_THRESHOLD) {
                m_resetLimit = m_resetCounter + m_resetAllowance;
            }
            m_lastResetTimeInMillis = now;

            if (++m_resetCounter > m_resetLimit) {
                // Reset limit hit: leave everything disabled.
                return;
            }

            // New member id carrying the bumped reset counter as its version suffix.
            m_memberId = m_canonical_memberId + "_v" + m_resetCounter;
            try {
                for (StateMachineInstance registered : m_registeredStateMachines) {
                    registered.reset(registered == m_directVictim);
                }
            } catch (Exception e) {
                // If something went wrong in reset(), give up as if the reset limit was hit.
                return;
            }

            m_done.set(false);
            initializeInstances.run();
        }
    }

    /**
     * Track state machine membership. If it changes, notify all state machine instances.
     *
     * The children of the member path are always re-read, passing
     * {@code m_membershipWatcher} along with the request. Instances are notified only
     * when every expected state machine has registered and the child set differs from
     * the cached {@code m_groupMembers}.
     *
     * @throws KeeperException if reading the children from ZooKeeper fails
     * @throws InterruptedException if the ZooKeeper call is interrupted
     */
    private void checkForMembershipChanges() throws KeeperException, InterruptedException {
        Set<String> children = ImmutableSet.copyOf(m_zk.getChildren(m_stateMachineMemberPath, m_membershipWatcher));

        boolean allRegistered = m_registeredStateMachineInstances == m_registeredStateMachines.length;
        if (!allRegistered || m_groupMembers.equals(children)) {
            // Registration still in progress, or membership is unchanged.
            return;
        }

        // Both differences are taken against the previous member set before the field
        // is replaced below.
        Set<String> removedMembers = Sets.difference(m_groupMembers, children);
        Set<String> addedMembers = Sets.difference(children, m_groupMembers);
        m_groupMembers = children;
        for (StateMachineInstance stateMachine : m_registeredStateMachines) {
            stateMachine.membershipChanged(m_groupMembers, addedMembers, removedMembers);
        }
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy