// org.jgroups.protocols.raft.ELECTION2
package org.jgroups.protocols.raft;
import org.jgroups.Address;
import org.jgroups.EmptyMessage;
import org.jgroups.Message;
import org.jgroups.View;
import org.jgroups.annotations.MBean;
import org.jgroups.annotations.ManagedAttribute;
import org.jgroups.conf.ClassConfigurator;
import org.jgroups.protocols.raft.election.BaseElection;
import org.jgroups.protocols.raft.election.PreVoteRequest;
import org.jgroups.protocols.raft.election.PreVoteResponse;
import org.jgroups.raft.util.Utils;
import org.jgroups.raft.util.Utils.Majority;
import org.jgroups.util.ResponseCollector;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import static org.jgroups.Message.Flag.OOB;
/**
* A leader election protocol.
*
* This implementation extends {@link ELECTION} with a pre-vote mechanism. The pre-vote always runs before starting the
* voting thread to define the leader. The pre-vote increases the delay in electing the leader but, in turn, covers more
* edge cases. As a rule of thumb, if deploying in an unstable network with frequent partitions, this protocol should
* give a more stable mechanism, avoid leader disruptions, and avoid possible liveness issues. Otherwise,
* {@link ELECTION} is the default choice.
*
*
* <h2>Pre-Voting phase:</h2>
*
* <p>
* This extension includes the pre-voting mechanism proposed in Ongaro's dissertation (§9.6). In the current
* implementation, a pre-voting phase starts in case the current node is the new view coordinator and:
*
* <ul>
*   <li>The view coordinator changes and the computed update is {@link Majority#no_change};</li>
*   <li>The computed view update is {@link Majority#reached} or {@link Majority#leader_lost}.</li>
* </ul>
*
* <p>
* The process which executes the pre-voting mechanism sends a {@link PreVoteRequest} to all processes in the view. The
* recipients reply with a {@link PreVoteResponse} identifying the node they see as leader. Once the initiator receives
* the reply from all nodes in the view, it can start the voting process, resuming the work the same as {@link ELECTION}.
*
* @since 1.0.12
* @see ELECTION
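* @see "Diego Ongaro, Consensus: Bridging Theory and Practice (PhD dissertation), section 9.6"
* @see <a href="https://github.com/jgroups-extras/jgroups-raft/issues/221">Issue #221</a>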
* @author José Bolina
*/
@MBean(description = "Performs leader election with a pre-voting phase.")
public class ELECTION2 extends BaseElection {

    protected static final short ELECTION_ID = 524;

    static {
        // Register this protocol and the two pre-vote header classes with the ClassConfigurator.
        ClassConfigurator.addProtocol(ELECTION_ID, ELECTION2.class);
        ClassConfigurator.add(PRE_VOTE_REQ, PreVoteRequest.class);
        ClassConfigurator.add(PRE_VOTE_RSP, PreVoteResponse.class);
    }

    /** State of the pre-voting phase; only driven by the view coordinator. */
    private final PreVotingMechanism preVotingMechanism = new PreVotingMechanism();

    @ManagedAttribute(description="Whether the pre-voting is running? (Coordinator only)")
    public boolean isPreVoteThreadRunning() {
        return preVotingMechanism.isRunning();
    }

    /**
     * Installs the new view and decides, based on the majority transition between the previous and the new
     * view, whether to announce the current leader, start a pre-voting phase, or stop everything.
     *
     * @param v the newly installed view.
     */
    @Override
    protected void handleView(View v) {
        View previousView = this.view;
        this.view = v;
        Majority result = Utils.computeMajority(previousView, v, raft().majority(), raft.leader());
        log.debug("%s: existing view: %s, new view: %s, result: %s", local_addr, previousView, v, result);
        List<Address> joiners = View.newMembers(previousView, v);
        boolean has_new_members = joiners != null && !joiners.isEmpty();
        switch (result) {
            case no_change:
                // The leader is still in place: only let the joiners know who the leader is.
                if (raft.isLeader() && has_new_members) {
                    sendLeaderElectedMessage(raft.leader(), raft.currentTerm());
                    break;
                }
                // The majority threshold did not change, but if the view coordinator changed, the new
                // coordinator needs to verify whether an election is necessary.
                if (Utils.viewCoordinatorChanged(previousView, v) && isViewCoordinator() && view.size() >= raft.majority()) {
                    preVotingMechanism.start();
                }
                break;
            case reached:
            case leader_lost:
                // In case the leader is lost, we stop everything *before* starting again.
                // This avoids cases where the leader is lost before the voting mechanism has stopped.
                // See: https://github.com/jgroups-extras/jgroups-raft/issues/259
                if (isViewCoordinator()) {
                    stopVotingThread();
                    preVotingMechanism.stop();
                    preVotingMechanism.start();
                }
                break;
            case lost:
                // Majority lost: abort any running phase and clear the leader.
                preVotingMechanism.stop();
                stopVotingThread();
                raft.setLeaderAndTerm(null);
                break;
        }
    }

    /**
     * Dispatches pre-vote headers to the local handlers; every other header is delegated to the superclass.
     *
     * @param msg the received message.
     * @param hdr the header attached to the message.
     */
    @Override
    protected void handleMessage(Message msg, RaftHeader hdr) {
        if (hdr instanceof PreVoteRequest) {
            handlePreVoteRequest(msg, (PreVoteRequest) hdr);
            return;
        }
        if (hdr instanceof PreVoteResponse) {
            handlePreVoteResponse(msg, (PreVoteResponse) hdr);
            return;
        }
        super.handleMessage(msg, hdr);
    }

    /**
     * Handle the {@link PreVoteRequest} coming from other nodes.
     *
     * <p>
     * This is an altered version of the pre-voting mechanism from the dissertation (§9.6). The receiver always
     * replies to the sender with its currently known leader address. The sender is not interested in electing
     * itself; it is checking whether a cluster-wide election round should be started.
     *
     * @param message the message received.
     * @param hdr the message header.
     */
    private void handlePreVoteRequest(Message message, PreVoteRequest hdr) {
        sendPreVoteResponse(message.getSrc());
    }

    /**
     * Handle a {@link PreVoteResponse}, feeding it to the running pre-voting phase.
     *
     * @param msg the message received.
     * @param hdr the response header carrying the leader seen by the sender.
     */
    private void handlePreVoteResponse(Message msg, PreVoteResponse hdr) {
        // We are not interested in the replies if the voting phase started,
        // if we are not the view coordinator, or if no pre-voting phase is running.
        if (isVotingThreadRunning() || !isViewCoordinator() || !preVotingMechanism.isRunning()) return;
        preVotingMechanism.includeResponse(msg.getSrc(), hdr);
    }

    /** Sends a {@link PreVoteRequest} with a {@code null} destination (logged as "all"), flagged OOB. */
    private void sendPreVotingRequest() {
        PreVoteRequest hdr = new PreVoteRequest();
        Message msg = new EmptyMessage(null).putHeader(id, hdr).setFlag(OOB);
        log.trace("%s -> all: %s", local_addr, hdr);
        down_prot.down(msg);
    }

    /**
     * Replies to {@code dest} with a {@link PreVoteResponse} carrying the leader this node currently sees.
     *
     * @param dest the address of the node that sent the pre-vote request.
     */
    private void sendPreVoteResponse(Address dest) {
        PreVoteResponse hdr = new PreVoteResponse(raft.leader());
        Message msg = new EmptyMessage(dest).putHeader(id, hdr).setFlag(OOB);
        log.trace("%s -> %s: %s", local_addr, dest, hdr);
        down_prot.down(msg);
    }

    /**
     * Collects the {@link PreVoteResponse}s of the current view members and decides whether the voting thread
     * should be started.
     */
    private class PreVotingMechanism {

        private final ResponseCollector<PreVoteResponse> preVotingResponses = new ResponseCollector<>();

        /**
         * A phase is considered running while the response collector is non-empty:
         * {@link #startPreVotingPhase()} seeds it with the view members and {@link #stop()} resets it.
         */
        public boolean isRunning() {
            return preVotingResponses.size() > 0;
        }

        /**
         * Starts a pre-voting phase if none is running, this node is the view coordinator, the voting thread is
         * not running, and the view holds at least a majority of members.
         */
        public void start() {
            int majority = raft.majority();
            if (!isRunning() && isViewCoordinator() && !isVotingThreadRunning() && view.getMembers().size() >= majority) {
                log.trace("%s: starting pre-voting mechanism", local_addr);
                startPreVotingPhase();
            }
        }

        /** Stops the pre-voting phase by resetting the response collector. */
        public void stop() {
            log.trace("%s: stopping pre-voting thread", local_addr);
            preVotingResponses.reset();
        }

        /**
         * Include a new response to the running pre-vote phase.
         *
         * <p>
         * Once all responses are collected and there is still a majority, the responses are parsed to verify if an
         * election phase should start.
         *
         * @param sender the response sender.
         * @param hdr the response message.
         */
        public void includeResponse(Address sender, PreVoteResponse hdr) {
            preVotingResponses.add(sender, hdr);
            int majority = raft.majority();
            if (preVotingResponses.hasAllResponses() && preVotingResponses.numberOfValidResponses() >= majority) {
                // Evaluate an immutable snapshot, and reset the collector only after the evaluation.
                Map<Address, PreVoteResponse> responses = Map.copyOf(preVotingResponses.getResults());
                stopPreVotingPhase(responses);
                stop();
            } else if (log.isTraceEnabled()) {
                log.trace("%s: collected pre-vote responses %s", local_addr, preVotingResponses.getResults());
            }
        }

        /** Resets the collector to expect one response per member of the current view and sends the request. */
        private void startPreVotingPhase() {
            preVotingResponses.reset(view.getMembers());
            sendPreVotingRequest();
        }

        /**
         * Executes after collecting messages from all nodes in the current view.
         *
         * <p>
         * This procedure parses the responses to identify whether to start the voting thread. The voting thread
         * is started when:
         *
         * <ul>
         *   <li>A majority of nodes does not have a leader or sees the same "outdated" leader as this node;</li>
         *   <li>In case a majority sees a different leader:
         *     <ul>
         *       <li>The supposed leader is not in the view;</li>
         *       <li>The supposed leader does not see itself as leader.</li>
         *     </ul>
         *   </li>
         * </ul>
         *
         * In case none of that matches, the election algorithm is not started.
         *
         * @param responses the cluster {@link PreVoteResponse} responses. Must have a response from all nodes
         *                  in {@link #view}.
         */
        private void stopPreVotingPhase(Map<Address, PreVoteResponse> responses) {
            int acceptStartVoting = 0;
            Address localLeader = raft.leader();
            log.trace("%s: validating pre-vote responses from %s", local_addr, responses);

            Address remoteLeader = null;
            for (PreVoteResponse response : responses.values()) {
                if (response == null) continue;

                Address leader = response.leader();
                // The remote node either does not have a leader (== null) or has the same leader as this node.
                // If we are at this stage, it means that this node suspects the leader. For example, we have
                // merged partitions and the remote still sees the old leader, or the node received the message
                // before the view update event. We count that towards starting the voting thread.
                if (leader == null || leader.equals(localLeader)) {
                    acceptStartVoting++;
                } else {
                    // Is it possible that a node so far behind came to life with a very old leader?
                    assert remoteLeader == null || remoteLeader.equals(leader) : "Somehow the leader is different!!";
                    remoteLeader = leader;
                }
            }

            // We have a majority already. Most of the nodes see an outdated leader or do not have a leader.
            if (acceptStartVoting >= raft.majority()) {
                log.debug("%s: pre-voting phase finished and starting the voting phase", local_addr);
                // We can already remove the outdated leader.
                raft.setLeaderAndTerm(null);
                startVotingThread();
                return;
            }

            log.trace("%s: did not met majority, taking slow-path %s", local_addr, responses);

            // We did not meet the majority. We need to inspect the responses more closely. In a more concrete example:
            // Given a cluster V1 = {A, B, C, D, E} with leader A. Suffering a quorum loss with a partial connectivity,
            // having views: A and C have V2 = {A, C} and B, D, E still have V1.
            // Once the connectivity is restored, A and C will merge the views and have V3 = {A, B, C, D, E}. Given that
            // node A is the coordinator and executes the pre-voting, a majority still sees A as leader. This leads to a
            // liveness scenario where a majority still sees the old leader, but A stepped down, and a new voting
            // phase is never started.
            Map<Address, Integer> votes = new HashMap<>();
            for (PreVoteResponse response : responses.values()) {
                if (response == null || response.leader() == null) continue;
                votes.merge(response.leader(), 1, Integer::sum);
            }

            // Compute the leader most of the cluster still sees (strictly-greater comparison: first maximum wins).
            Address leader = null;
            int leaderVotes = 0;
            for (Map.Entry<Address, Integer> entry : votes.entrySet()) {
                if (leader == null || entry.getValue() > leaderVotes) {
                    leader = entry.getKey();
                    leaderVotes = entry.getValue();
                }
            }

            assert leader != null : "Leader should not be null at this stage: " + responses;
            if (leader == null) {
                // Only reachable with assertions disabled. The snapshot comes from Map.copyOf, which throws a
                // NullPointerException when looking up a null key, so bail out instead of crashing the caller.
                log.trace("%s: no leader reported in pre-vote responses %s, not starting voting", local_addr, responses);
                return;
            }

            PreVoteResponse response = responses.get(leader);

            // If the response is null, it means the old leader is not present in the current view.
            // The other possibility was detailed above: the cluster sees the leader, but the node stepped down
            // and does not see itself as a leader.
            if (response == null || !leader.equals(response.leader())) {
                log.debug("%s: pre-voting phase finished and starting the voting phase", local_addr);
                startVotingThread();
                return;
            }

            // Should receive a late message from the leader?
            log.trace("%s: not able to start voting, majority sees %s as leader", local_addr, leader);
        }
    }
}