org.jgroups.protocols.raft.ELECTION2 Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of jgroups-raft
JGroups RAFT implementation
There is a newer version: 1.0.13.Final
package org.jgroups.protocols.raft;

import org.jgroups.Address;
import org.jgroups.EmptyMessage;
import org.jgroups.Message;
import org.jgroups.View;
import org.jgroups.annotations.MBean;
import org.jgroups.annotations.ManagedAttribute;
import org.jgroups.conf.ClassConfigurator;
import org.jgroups.protocols.raft.election.BaseElection;
import org.jgroups.protocols.raft.election.PreVoteRequest;
import org.jgroups.protocols.raft.election.PreVoteResponse;
import org.jgroups.raft.util.Utils;
import org.jgroups.raft.util.Utils.Majority;
import org.jgroups.util.ResponseCollector;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

import static org.jgroups.Message.Flag.OOB;

/**
 * A leader election protocol.
 * 
 * This implementation extends {@link ELECTION} with a pre-vote mechanism. The pre-vote always runs before starting the
 * voting thread to define the leader. The pre-vote increases the delay in electing the leader but, in turn, covers more
 * edge cases. As a rule of thumb, if deploying in an unstable network with frequent partitions, this protocol should
 * give a more stable mechanism, avoid leader disruptions, and avoid possible liveness issues. Otherwise,
 * {@link ELECTION} is the default choice.
 *
 * 
Pre-Voting phase:
 *
 * This extension includes the pre-voting mechanism proposed in Ongaro's dissertation (§9.6). In the current
 * implementation, a pre-voting phase starts in case the current node is the new view coordinator and:
 *
 * 
 *     The view coordinator changes and the computed update is {@link Majority#no_change};
 *     The computed view update is {@link Majority#reached} or {@link Majority#leader_lost}
 * 
 *
 * The process which executes the pre-voting mechanism sends a {@link PreVoteRequest} to all processes in the view. The
 * recipients reply with a {@link PreVoteResponse} identifying the node they see as leader. Once the initiator receives
 * the reply from all nodes in the view, it can start the voting process, resuming the work the same as {@link ELECTION}.
 *
 * @since 1.0.12
 * @see ELECTION
 * @see Ongaro's dissertation
 * @see Issue #221
 * @author José Bolina
 */
@MBean(description = "Performs leader election with a pre-voting phase.")
public class ELECTION2 extends BaseElection {

    protected static final short ELECTION_ID    = 524;

    static {
        ClassConfigurator.addProtocol(ELECTION_ID, ELECTION2.class);
        ClassConfigurator.add(PRE_VOTE_REQ, PreVoteRequest.class);
        ClassConfigurator.add(PRE_VOTE_RSP, PreVoteResponse.class);
    }

    private final PreVotingMechanism preVotingMechanism = new PreVotingMechanism();

    @ManagedAttribute(description="Whether the pre-voting is running? (Coordinator only)")
    public boolean isPreVoteThreadRunning() {
        return preVotingMechanism.isRunning();
    }

    @Override
    protected void handleView(View v) {
        View previousView = this.view;
        this.view = v;
        Majority result = Utils.computeMajority(previousView, v, raft().majority(), raft.leader());
        log.debug("%s: existing view: %s, new view: %s, result: %s", local_addr, previousView, v, result);

        List joiners = View.newMembers(previousView, v);
        boolean has_new_members = joiners != null && !joiners.isEmpty();

        switch (result) {
            case no_change:
                if (raft.isLeader() && has_new_members) {
                    sendLeaderElectedMessage(raft.leader(), raft.currentTerm());
                    break;
                }
                // If we have no change in terms of majority threshold. If the view coordinator changed, we need to
                // verify if an election is necessary.
                if (Utils.viewCoordinatorChanged(previousView, v) && isViewCoordinator() && view.size() >= raft.majority()) {
                    preVotingMechanism.start();
                }
                break;
            case reached:
            case leader_lost:
                // In case the leader is lost, we stop everything *before* starting again.
                // This avoids cases where the leader is lost before the voting mechanism has stopped.
                // See: https://github.com/jgroups-extras/jgroups-raft/issues/259
                if (isViewCoordinator()) {
                    stopVotingThread();
                    preVotingMechanism.stop();
                    preVotingMechanism.start();
                }
                break;
            case lost:
                preVotingMechanism.stop();
                stopVotingThread();
                raft.setLeaderAndTerm(null);
                break;
        }
    }

    @Override
    protected void handleMessage(Message msg, RaftHeader hdr) {
        if (hdr instanceof PreVoteRequest) {
            handlePreVoteRequest(msg, (PreVoteRequest) hdr);
            return;
        }

        if (hdr instanceof PreVoteResponse) {
            handlePreVoteResponse(msg, (PreVoteResponse) hdr);
            return;
        }

        super.handleMessage(msg, hdr);
    }

    /**
     * Handle the {@link PreVoteRequest} coming from other nodes.
     * 
     * A node sends a {@link PreVoteRequest} to verify if it can become the leader, running the pre-voting phase
     * instead of disrupting the cluster. The node that receives this request must only reply if they would vote for
     * the sender in an election. Although, they can reply to different pre-votes, the node is not bound during this phase.
     * 

     * This version is an altered version of the pre-voting mechanism from the dissertation (§9.6). In this version,
     * the node replies with its current known leader address. This is because the sender is not interested in electing
     * itself. The sender is checking if a cluster-wide election round should be started.
     *
     * @param message: The message received.
     * @param hdr: The message header.
     */
    private void handlePreVoteRequest(Message message, PreVoteRequest hdr) {
        sendPreVoteResponse(message.getSrc());
    }

    private void handlePreVoteResponse(Message msg, PreVoteResponse hdr) {
        // We are not interested in the replies if the voting phase started.
        // Or if we are not the view coordinator.
        if (isVotingThreadRunning() || !isViewCoordinator() || !preVotingMechanism.isRunning()) return;

        preVotingMechanism.includeResponse(msg.getSrc(), hdr);
    }

    private void sendPreVotingRequest() {
        PreVoteRequest hdr = new PreVoteRequest();
        Message msg = new EmptyMessage(null).putHeader(id, hdr).setFlag(OOB);
        log.trace("%s -> all: %s", local_addr, hdr);
        down_prot.down(msg);
    }

    private void sendPreVoteResponse(Address dest) {
        PreVoteResponse hdr = new PreVoteResponse(raft.leader());
        Message msg = new EmptyMessage(dest).putHeader(id, hdr).setFlag(OOB);
        log.trace("%s -> %s: %s", local_addr, dest, hdr);
        down_prot.down(msg);
    }

    private class PreVotingMechanism {
        protected final ResponseCollector preVotingResponses = new ResponseCollector<>();

        public boolean isRunning() {
            return preVotingResponses.size() > 0;
        }

        public void start() {
            int majority = raft.majority();
            if (!isRunning() && isViewCoordinator() && !isVotingThreadRunning() && view.getMembers().size() >= majority) {
                log.trace("%s: starting pre-voting mechanism", local_addr);
                startPreVotingPhase();
            }
        }

        public void stop() {
            log.trace("%s: stopping pre-voting thread", local_addr);
            preVotingResponses.reset();
        }

        /**
         * Include a new response to the running pre-vote phase.
         * 

         * Once all responses are collected and there is still a majority, the responses are parsed to verify if an
         * election phase should start.
         *
         * @param sender: The response sender.
         * @param hdr: The response message.
         */
        public void includeResponse(Address sender, PreVoteResponse hdr) {
            preVotingResponses.add(sender, hdr);;

            int majority = raft.majority();
            if (preVotingResponses.hasAllResponses() && preVotingResponses.numberOfValidResponses() >= majority) {
                Map responses = Map.copyOf(preVotingResponses.getResults());
                stopPreVotingPhase(responses);
                stop();
            } else if (log.isTraceEnabled()) {
                log.trace("%s: collected pre-vote responses %s", local_addr, preVotingResponses.getResults());
            }
        }

        private void startPreVotingPhase() {
            preVotingResponses.reset(view.getMembers());
            sendPreVotingRequest();
        }

        /**
         * Executes after collecting messages from all nodes in the current view.
         * 

         * This procedure parses the responses to identify whether to start the voting thread. It has the strategy of:
         *
         * 

         *     A majority of nodes does not have a leader or see the same "outdated" leader as this node;
         *     
In case a majority sees a different leader:
         *     
         *         The supposed leader is not in the view;
         *         
The supposed leader does not see itself as leader;
         *     
         * 
         * 
         * In case none of that matches, the election algorithm is not started.
         *
         * @param responses The cluster {@link PreVoteResponse} responses. Must have a response from all nodes
         *                  in {@link #view}.
         */
        private void stopPreVotingPhase(Map responses) {
            int acceptStartVoting = 0;
            Address localLeader = raft.leader();

            log.trace("%s: validating pre-vote responses from %s", local_addr, responses);

            Address remoteLeader = null;
            for (PreVoteResponse response : responses.values()) {
                if (response == null) continue;

                Address leader = response.leader();

                // The remote node either does not have a leader (== null) or has the same leader as this node.
                // If we are at this stage, means that this node suspects the leader. For example, we have merged partitions
                // and the remote still sees the old leader, or the node receive the message before the view update event.
                // We count that towards starting the voting thread.
                if (leader == null || leader.equals(localLeader)) {
                    acceptStartVoting++;
                } else {
                    // Is it possible that a node is so far behind came to life with a very old leader?
                    assert remoteLeader == null || remoteLeader.equals(leader) : "Somehow the leader is different!!";
                    remoteLeader = leader;
                }
            }

            // We have a majority already. Most of the nodes see an outdated or does not have a leader.
            if (acceptStartVoting >= raft.majority()) {
                log.debug("%s: pre-voting phase finished and starting the voting phase", local_addr);

                // We can already remove the outdated leader.
                raft.setLeaderAndTerm(null);
                startVotingThread();
                return;
            }

            log.trace("%s: did not met majority, taking slow-path %s", local_addr, responses);

            // We did not meet the majority. We need to inspect the responses more closely. In a more concrete example:
            // Given a cluster V1 = {A, B, C, D, E} with leader A. Suffering a quorum loss with a partial connectivity,
            // having views: A and C have V2 = {A, C} and B, D, E still have V1.
            // Once the connectivity is restored, A and C will merge the views and have V3 = {A, B, C, D, E}. Given that
            // node A is the coordinator and executes the pre-voting, a majority still sees A as leader. This leads to a
            // liveness scenario where a majority still sees the old leader, but A stepped down, and a new voting
            // phase is never started.
            Map votes = new HashMap<>();
            for (PreVoteResponse response : responses.values()) {
                if (response == null || response.leader() == null) continue;
                votes.compute(response.leader(), (k, v) -> v == null ? 1 : v + 1);
            }

            // Compute the leader the cluster still see.
            Address leader = null;
            for (Map.Entry entry : votes.entrySet()) {
                if (leader == null || entry.getValue() > votes.get(leader)) {
                    leader = entry.getKey();
                }
            }

            assert leader != null: "Leader should not be null at this stage: " + responses;
            PreVoteResponse response = responses.get(leader);

            // If the response is null, means the old leader is not present in the current view.
            // The other possibility was detailed above, a cluster see the leader, but the node stepped down, and
            // it does not see itself as a leader.
            if (response == null || !leader.equals(response.leader())) {
                log.debug("%s: pre-voting phase finished and starting the voting phase", local_addr);
                startVotingThread();
                return;
            }

            // Should receive a late message from the leader?
            log.trace("%s: not able to start voting, majority sees %s as leader", local_addr, leader);
        }
    }
}