org.jsimpledb.kv.raft.LeaderRole Maven / Gradle / Ivy
Show all versions of jsimpledb-kv-raft Show documentation
* Copyright (C) 2015 Archie L. Cobbs. All rights reserved.
package org.jsimpledb.kv.raft;
import com.google.common.base.Preconditions;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.NavigableSet;
import java.util.function.Predicate;
import javax.annotation.concurrent.GuardedBy;
import org.dellroad.stuff.io.ByteBufferInputStream;
import org.jsimpledb.kv.KVTransactionException;
import org.jsimpledb.kv.RetryTransactionException;
import org.jsimpledb.kv.mvcc.Reads;
import org.jsimpledb.kv.mvcc.Writes;
import org.jsimpledb.kv.raft.msg.AppendRequest;
import org.jsimpledb.kv.raft.msg.AppendResponse;
import org.jsimpledb.kv.raft.msg.CommitRequest;
import org.jsimpledb.kv.raft.msg.CommitResponse;
import org.jsimpledb.kv.raft.msg.GrantVote;
import org.jsimpledb.kv.raft.msg.InstallSnapshot;
import org.jsimpledb.kv.raft.msg.Message;
import org.jsimpledb.kv.raft.msg.RequestVote;
* Raft leader role.
public class LeaderRole extends Role {
// Timestamp scrub interval
private static final int TIMESTAMP_SCRUB_INTERVAL = 24 * 60 * 60 * 1000; // once a day
// Our followers
private final HashMap followerMap = new HashMap<>();
// Our leadership "lease" timeout - i.e., the earliest time another leader could possibly be elected
private Timestamp leaseTimeout;
// Service tasks
private final Service updateLeaderCommitIndexService = new Service(this, "update leader commitIndex") {
public void run() {
private final Service updateLeaseTimeoutService = new Service(this, "update lease timeout") {
public void run() {
private final Service updateKnownFollowersService = new Service(this, "update known followers") {
public void run() {
private final Timer checkApplyTimer = new Timer(this.raft, "check apply entries", new Service(this, "check apply entries") {
public void run() {
private final Timer timestampScrubTimer = new Timer(this.raft, "scrub timestamps", new Service(this, "scrub timestamps") {
public void run() {
// Constructors
LeaderRole(RaftKVDatabase raft) {
// Status & Debugging
* Get this leader's known followers.
* The returned list is a copy; changes have no effect on this instance.
* @return this leader's followers
public List getFollowers() {
final ArrayList list;
synchronized (this.raft) {
list = new ArrayList<>(this.followerMap.values());
Collections.sort(list, Follower.SORT_BY_IDENTITY);
return list;
* Get this leader's "lease timeout".
* This is the earliest possible time at which some other, new leader could be elected in a new term.
* Consequently, it is the earliest possible time at which any entry that this leader is unaware of
* could be appended to the Raft log.
* Normally, if followers are responding to {@link AppendRequest}s properly, this should be a value
* in the (near) future. This allows the leader to make the assumption, up until that point in time,
* that its log is fully up-to-date.
* Until it hears from a majority of followers, a leader will not have a lease timeout established yet.
* In that case this method returns null.
* This method may also return null if a previous lease timeout has gotten very stale (e.g., isolated leader).
* @return this leader's lease timeout, or null if none is established yet
public Timestamp getLeaseTimeout() {
synchronized (this.raft) {
return this.leaseTimeout;
* Force this leader to step down.
* @throws IllegalStateException if this role is no longer active or election timer is not running
public void stepDown() {
synchronized (this.raft) {
Preconditions.checkState(this.raft.role == this, "role is no longer active");
this.debug("stepping down as leader due to invocation of stepDown()");
this.raft.changeRole(new FollowerRole(this.raft));
// Lifecycle
void setup() {
assert Thread.holdsLock(this.raft);
if (this.log.isDebugEnabled())
this.debug("entering leader role in term " + this.raft.currentTerm);
// Generate follower list
// Append a "dummy" log entry with my current term. This allows us to advance the commit index when the last
// entry in our log is from a prior term. This is needed to avoid the problem where a transaction could end up
// waiting indefinitely for its log entry with a prior term number to be committed.
final LogEntry logEntry;
try {
logEntry = this.applyNewLogEntry(new NewLogEntry(this.raft, new LogEntry.Data(new Writes(), null)));
} catch (Exception e) {
this.error("error attempting to apply initial log entry", e);
if (this.log.isDebugEnabled())
this.debug("added log entry " + logEntry + " to commit at the beginning of my new term");
// Rebase transactions
// Start check apply timer
if (!this.raft.raftLog.isEmpty())
// Start timestamp scrub timer
void shutdown() {
assert Thread.holdsLock(this.raft);
// Service
void outputQueueEmpty(String address) {
assert Thread.holdsLock(this.raft);
// Find matching follower(s) and update them if needed
.filter(follower -> follower.getAddress().equals(address))
.forEach(follower -> {
if (this.log.isTraceEnabled())
this.trace("updating peer \"" + follower.getIdentity() + "\" after queue empty notification");
this.raft.requestService(new UpdateFollowerService(follower));
void applyCommittedLogEntries() {
assert Thread.holdsLock(this.raft);
// Stop check apply timer if there are none left
if (this.raft.raftLog.isEmpty() && this.checkApplyTimer.isRunning())
boolean roleMayApplyLogEntry(LogEntry logEntry) {
assert Thread.holdsLock(this.raft);
// If any snapshots are in progress, we don't want to apply any log entries with index greater than the snapshot's
// index, because then we'd lose the ability to update the follower with that log entry, and as a result just have
// to send a snapshot again. However, we impose a limit on how long we'll wait for a slow follower.
for (Follower follower : this.followerMap.values()) {
final SnapshotTransmit snapshotTransmit = follower.getSnapshotTransmit();
if (snapshotTransmit == null)
if (snapshotTransmit.getSnapshotIndex() < logEntry.getIndex()
&& snapshotTransmit.getAge() < RaftKVDatabase.MAX_SNAPSHOT_TRANSMIT_AGE) {
if (this.log.isTraceEnabled()) {
this.trace("delaying application of " + logEntry + " because of in-progress snapshot install of "
+ snapshotTransmit.getSnapshotIndex() + "t" + snapshotTransmit.getSnapshotTerm()
+ " to " + follower);
return false;
// If some follower does not yet have the log entry, wait for them to get it (up to some maximum time).
// If the follower appears to be offline, don't bother waiting.
final int maxLogEntryAge = this.raft.maxFollowerAckHeartbeats * this.raft.heartbeatTimeout;
if (logEntry.getAge() < maxLogEntryAge) {
final Timestamp minLeaderTimestamp = new Timestamp().offset(-maxLogEntryAge);
for (Follower follower : this.followerMap.values()) {
// Has this follower acknowledged receipt of the log entry?
// If so, then the follower has already rebased any rebasable transactions.
if (follower.getMatchIndex() >= logEntry.getIndex())
// If we haven't heard from this follower in a while, don't bother waiting for it
final Timestamp leaderTimestamp = follower.getLeaderTimestamp();
if (leaderTimestamp == null || leaderTimestamp.compareTo(minLeaderTimestamp) <= 0)
// Wait for follower to do so before applying to state machine
if (this.log.isTraceEnabled()) {
this.trace("delaying application of " + logEntry + " (age "
+ logEntry.getAge() + " < " + maxLogEntryAge + ") because of slow " + follower);
return false;
// OK
return true;
// We have to periodically check if we can apply log entries, because the condition is time-dependent
private void checkApplyEntries() {
assert Thread.holdsLock(this.raft);
if (!this.raft.raftLog.isEmpty())
* Update my {@code commitIndex} based on followers' {@code matchIndex}'s.
* This should be invoked:
* - After any log entry has been added to the log, if we have zero followers
* - After a log entry that contains a configuration change has been added to the log
* - After a follower's {@linkplain Follower#getMatchIndex match index} has advanced
private void updateLeaderCommitIndex() {
assert Thread.holdsLock(this.raft);
// Find highest index for which a majority of cluster members have ack'd the corresponding log entry from my term
final int totalCount = this.raft.currentConfig.size(); // total possible nodes
final int requiredCount = totalCount / 2 + 1; // require a majority
final int startingCount = this.raft.isClusterMember() ? 1 : 0; // count myself, if member
long maxCommitIndex = this.raft.commitIndex;
int commitCount = -1;
for (long index = this.raft.commitIndex + 1; index <= this.raft.getLastLogIndex(); index++) {
// Count the number of nodes (possibly including myself) that have a copy of the log entry at index
final int count = startingCount + this.countFollowersWithLogEntry(index);
// The log entry term must match my current term (exception: unless every node has it)
final long term = this.raft.getLogTermAtIndex(index);
if (count < totalCount && term != this.raft.currentTerm)
// Do a majority of cluster nodes have this log entry?
if (count < requiredCount) {
if (term >= this.raft.currentTerm) // there's no point in going further
continue; // a later term log entry might work
// We have a winner
maxCommitIndex = index;
commitCount = count;
// Update commit index if it advanced
if (maxCommitIndex > this.raft.commitIndex) {
// Update index
if (this.log.isDebugEnabled()) {
this.debug("advancing commit index from " + this.raft.commitIndex + " -> " + maxCommitIndex + " based on "
+ commitCount + "/" + totalCount + " nodes having received " + this.raft.getLogEntryAtIndex(maxCommitIndex));
this.raft.commitIndex = maxCommitIndex;
// Update commitables
// Perform various service
// Notify all (up-to-date) followers with the updated leaderCommit
// If we are no longer a member of the cluster, step down after the most recent config change is committed
if (!this.raft.isClusterMember() && this.raft.commitIndex >= this.findMostRecentConfigChange()) {
if (this.log.isDebugEnabled())
this.log.debug("stepping down as leader of cluster (no longer a member)");
private int countFollowersWithLogEntry(long index) {
assert index <= this.raft.getLastLogIndex();
// Count the number of followers (who are also cluster members) that have a copy of the log entry at the specified index
int nodesWithLogEntry = 0;
for (Follower follower : this.followerMap.values()) {
if (follower.hasLogEntry(index))
// Done
return nodesWithLogEntry;
* Update my {@code leaseTimeout} based on followers' returned {@code leaderTimeout}'s.
* This should be invoked:
* - After a follower has replied with an {@link AppendResponse} containing a newer
* {@linkplain AppendResponse#getLeaderTimestamp leader timestamp} than before
private void updateLeaseTimeout() {
assert Thread.holdsLock(this.raft);
// Only needed when we have followers
final int numFollowers = this.followerMap.size();
if (numFollowers == 0)
// Get all cluster member leader timestamps, sorted in increasing order
final Timestamp[] leaderTimestamps = new Timestamp[this.raft.currentConfig.size()];
int index = 0;
if (this.raft.isClusterMember())
leaderTimestamps[index++] = new Timestamp(); // this represents my own vote
for (Follower follower : this.followerMap.values()) {
if (this.raft.isClusterMember(follower.getIdentity()))
leaderTimestamps[index++] = follower.getLeaderTimestamp(); // note follower timestamps could be null
Arrays.sort(leaderTimestamps, Timestamp.NULL_FIRST_SORT);
// Calculate highest leaderTimeout shared by a majority of cluster members, based on sorted array:
// # nodes timestamps
// ------- ----------
// 5 [ ][ ][x][x][x] 3/5 x's make a majority at index (5 - 1)/2 = 2
// 6 [ ][ ][x][x][x][x] 4/6 x's make a majority at index (6 - 1)/2 = 2
// The minimum leaderTimeout shared by a majority of nodes is at index (leaderTimestamps.length - 1) / 2.
// We then add the minimum election timeout, then subtract a little for clock drift.
final Timestamp newLeaseTimeout = leaderTimestamps[(leaderTimestamps.length + 1) / 2]
.offset((int)(this.raft.minElectionTimeout * (1.0f - RaftKVDatabase.MAX_CLOCK_DRIFT) - 1));
if (Timestamp.NULL_FIRST_SORT.compare(newLeaseTimeout, this.leaseTimeout) > 0) {
assert newLeaseTimeout != null;
// Update my leader lease timeout
if (this.log.isTraceEnabled())
this.trace("updating my lease timeout from " + this.leaseTimeout + " -> " + newLeaseTimeout);
this.leaseTimeout = newLeaseTimeout;
// Notify any followers who care
for (Follower follower : this.followerMap.values()) {
final NavigableSet timeouts = follower.getCommitLeaseTimeouts().headSet(this.leaseTimeout, true);
if (!timeouts.isEmpty()) {
follower.updateNow(); // notify follower so it can commit waiting transaction(s)
* Scrub timestamps to avoid roll-over.
* This should be invoked periodically, e.g., once a day.
private void scrubTimestamps() {
assert Thread.holdsLock(this.raft);
if (this.log.isTraceEnabled())
this.trace("scrubbing timestamps");
for (Follower follower : this.followerMap.values()) {
final Timestamp leaderTimestamp = follower.getLeaderTimestamp();
if (leaderTimestamp != null && leaderTimestamp.isRolloverDanger()) {
if (this.log.isDebugEnabled())
this.debug("scrubbing " + follower + " leader timestamp " + leaderTimestamp);
final Timestamp snapshotTimestamp = follower.getSnapshotTimestamp();
if (snapshotTimestamp != null && snapshotTimestamp.isRolloverDanger()) {
if (this.log.isDebugEnabled())
this.debug("scrubbing " + follower + " snapshot timestamp " + snapshotTimestamp);
for (Iterator i = follower.getCommitLeaseTimeouts().iterator(); i.hasNext(); ) {
final Timestamp leaseTimestamp = i.next();
if (leaseTimestamp.isRolloverDanger()) {
if (this.log.isDebugEnabled())
this.debug("scrubbing " + follower + " commit lease timestamp " + leaseTimestamp);
if (this.leaseTimeout != null && this.leaseTimeout.isRolloverDanger()) {
if (this.log.isDebugEnabled())
this.debug("scrubbing leader lease timestamp " + this.leaseTimeout);
this.leaseTimeout = null;
* Update our list of followers to match our current configuration.
* This should be invoked:
* - After a log entry that contains a configuration change has been added to the log
* - When the {@linkplain Follower#getNextIndex next index} of a follower not in the current config advances
private void updateKnownFollowers() {
assert Thread.holdsLock(this.raft);
// Compare known followers with the current config and determine who needs to be added or removed
final HashSet adds = new HashSet<>(this.raft.currentConfig.keySet());
final HashSet dels = new HashSet<>(this.followerMap.keySet());
// Keep around a follower after its removal until it receives the config change that removed it
for (Follower follower : this.followerMap.values()) {
// Is this follower scheduled for deletion?
final String peer = follower.getIdentity();
if (!dels.contains(peer))
// Find the most recent log entry containing a config change in which the follower was removed
final String node = follower.getIdentity();
final long index = this.findMostRecentConfigChangeMatching(
configChange -> configChange[0].equals(node) && configChange[1] == null);
// If follower has not received that log entry yet, keep on updating them until they do
if (follower.getMatchIndex() < index)
// Add new followers
for (String peer : adds) {
final String address = this.raft.currentConfig.get(peer);
final Follower follower = new Follower(this.raft, peer, address, this.raft.getLastLogIndex());
if (this.log.isDebugEnabled())
this.debug("adding new follower \"" + peer + "\" at " + address);
follower.setUpdateTimer(new Timer(this.raft, "update timer for \"" + peer + "\"", new UpdateFollowerService(follower)));
this.followerMap.put(peer, follower);
follower.updateNow(); // schedule an immediate update
// Remove old followers
for (String peer : dels) {
final Follower follower = this.followerMap.remove(peer);
if (this.log.isDebugEnabled())
this.debug("removing old follower \"" + peer + "\"");
* Check whether a follower needs an update and send one if so.
* This should be invoked:
* - After a new follower has been added
* - When the output queue for a follower goes from non-empty to empty
* - After the follower's {@linkplain Follower#getUpdateTimer update timer} has expired
* - After a new log entry has been added to the log (all followers)
* - After receiving an {@link AppendResponse} that caused the follower's
* {@linkplain Follower#getNextIndex next index} to change
* - After receiving the first positive {@link AppendResponse} to a probe
* - After our {@code commitIndex} has advanced (all followers)
* - After our {@code leaseTimeout} has advanced past one or more of a follower's
* {@linkplain Follower#getCommitLeaseTimeouts commit lease timeouts} (with update timer reset)
* - After sending a {@link CommitResponse} with a non-null {@linkplain CommitResponse#getCommitLeaderLeaseTimeout
* commit leader lease timeout} (all followers) to probe for updated leader timestamps
* - After starting, aborting, or completing a snapshot install for a follower
private void updateFollower(Follower follower) {
// Sanity check
assert Thread.holdsLock(this.raft);
// If follower has an in-progress snapshot that has become too stale, abort it
final String peer = follower.getIdentity();
SnapshotTransmit snapshotTransmit = follower.getSnapshotTransmit();
if (snapshotTransmit != null && snapshotTransmit.getSnapshotIndex() < this.raft.lastAppliedIndex) {
if (this.log.isDebugEnabled())
this.debug("aborting stale snapshot install for " + follower);
// Is follower's queue empty? If not, hold off until then
if (this.raft.isTransmitting(follower.getAddress())) {
if (this.log.isTraceEnabled())
this.trace("no update for \"" + peer + "\": output queue still not empty");
// Handle any in-progress snapshot install
if ((snapshotTransmit = follower.getSnapshotTransmit()) != null) {
// Send the next chunk in transmission, if any
final long pairIndex = snapshotTransmit.getPairIndex();
final ByteBuffer chunk = snapshotTransmit.getNextChunk();
boolean synced = true;
if (chunk != null) {
// Send next chunk
final InstallSnapshot msg = new InstallSnapshot(this.raft.clusterId, this.raft.identity, peer,
this.raft.currentTerm, snapshotTransmit.getSnapshotTerm(), snapshotTransmit.getSnapshotIndex(), pairIndex,
pairIndex == 0 ? snapshotTransmit.getSnapshotConfig() : null, !snapshotTransmit.hasMoreChunks(), chunk);
if (this.raft.sendMessage(msg)) {
follower.setSnapshotTimestamp(new Timestamp());
if (this.log.isDebugEnabled())
this.debug("canceling snapshot install for " + follower + " due to failure to send " + msg);
// Message failed -> snapshot is fatally wounded, so cancel it
synced = false;
if (synced) {
if (this.log.isDebugEnabled())
this.debug("completed snapshot install for out-of-date " + follower);
// Snapshot transmit is complete (or failed)
// Trigger an immediate regular update
follower.setNextIndex(snapshotTransmit.getSnapshotIndex() + 1);
this.raft.requestService(new UpdateFollowerService(follower));
// Are we still waiting for the update timer to expire?
if (!follower.getUpdateTimer().pollForTimeout()) {
boolean waitForTimerToExpire = true;
// Don't wait for the update timer to expire if:
// (a) The follower is sync'd; AND EITHER
// (b) We have a new log entry that the follower doesn't have; OR
// (c) We have a new leaderCommit that the follower doesn't have
// The effect is that we will pipeline updates to synchronized followers.
if (follower.isSynced()
&& (follower.getLeaderCommit() != this.raft.commitIndex
|| follower.getNextIndex() <= this.raft.getLastLogIndex()))
waitForTimerToExpire = false;
// Wait for timer to expire
if (waitForTimerToExpire) {
if (this.log.isTraceEnabled()) {
this.trace("no update for \"" + follower.getIdentity() + "\": timer not expired yet, and follower is "
+ (follower.isSynced() ? "up to date" : "not synced"));
// Get index of the next log entry to send to follower
final long nextIndex = follower.getNextIndex();
// If follower is too far behind, we must do a snapshot install
if (nextIndex <= this.raft.lastAppliedIndex) {
final MostRecentView view = new MostRecentView(this.raft, this.raft.commitIndex);
follower.setSnapshotTransmit(new SnapshotTransmit(view.getTerm(),
view.getIndex(), view.getConfig(), view.getSnapshot(), view.getView()));
if (this.log.isDebugEnabled())
this.debug("started snapshot install for out-of-date " + follower);
this.raft.requestService(new UpdateFollowerService(follower));
// Restart update timer here (to avoid looping if an error occurs below)
// Send actual data if follower is synced and there is a log entry to send; otherwise, just send a probe
final AppendRequest msg;
if (!follower.isSynced() || nextIndex > this.raft.getLastLogIndex()) {
// Create probe-only message
msg = new AppendRequest(this.raft.clusterId, this.raft.identity, peer, this.raft.currentTerm, new Timestamp(),
this.leaseTimeout, this.raft.commitIndex, this.raft.getLogTermAtIndex(nextIndex - 1), nextIndex - 1);
} else {
// Get log entry to send
final LogEntry logEntry = this.raft.getLogEntryAtIndex(nextIndex);
// If the log entry corresponds to the follower's own transaction, don't send the data because the follower already has it.
// But only do this optimization the first time, in case something goes wrong on the follower's end.
ByteBuffer mutationData = null;
if (!follower.getSkipDataLogEntries().remove(logEntry)) {
try {
mutationData = logEntry.getContent();
} catch (IOException e) {
this.error("error reading log file " + logEntry.getFile(), e);
// Create message
msg = new AppendRequest(this.raft.clusterId, this.raft.identity, peer, this.raft.currentTerm, new Timestamp(),
this.leaseTimeout, this.raft.commitIndex, this.raft.getLogTermAtIndex(nextIndex - 1), nextIndex - 1,
logEntry.getTerm(), mutationData);
// Send update
final boolean sent = this.raft.sendMessage(msg);
// Advance next index if a log entry was sent; we allow pipelining log entries when synchronized
if (sent && !msg.isProbe()) {
assert follower.isSynced();
follower.setNextIndex(Math.min(follower.getNextIndex(), this.raft.getLastLogIndex()) + 1);
// Update the leaderCommit we sent to the follower
if (sent)
private void updateAllSynchronizedFollowersNow() {
assert Thread.holdsLock(this.raft);
private class UpdateFollowerService extends Service {
private final Follower follower;
UpdateFollowerService(Follower follower) {
super(LeaderRole.this, "update follower \"" + follower.getIdentity() + "\"");
this.follower = follower;
public void run() {
public boolean equals(Object obj) {
if (obj == this)
return true;
if (obj == null || obj.getClass() != this.getClass())
return false;
final UpdateFollowerService that = (UpdateFollowerService)obj;
return this.follower.equals(that.follower);
public int hashCode() {
return this.follower.hashCode();
// Transactions
void handleLinearizableReadOnlyChange(RaftKVTransaction tx) {
// Sanity check
// Set commit info based on what we currently know as "up-to-date"
if (!tx.hasCommitInfo()) {
tx.setCommitInfo(this.raft.getLastLogTerm(), this.raft.getLastLogIndex(), this.getCurrentCommitMinLeaseTimeout());
void checkReadyTransactionNeedingCommitInfo(RaftKVTransaction tx) {
// Sanity check
// Handle (effectively) read-only transactions
if (!tx.addsLogEntry()) {
// Does it already have commit information?
if (tx.hasCommitInfo()) {
// Set commit info based on what we currently know as "up-to-date" and proceed
this.raft.getLastLogTerm(), this.raft.getLastLogIndex(), this.getCurrentCommitMinLeaseTimeout());
// Must be a read-write transaction that's fully rebased
assert !tx.isReadOnly();
assert tx.isRebasable() : "fail tx " + tx;
assert !tx.isCommittable();
assert !tx.hasCommitInfo();
assert this.checkRebasableAndCommittableUpToDate(tx);
// If a config change is involved, check whether we can safely apply it
if (tx.getConfigChange() != null && !this.mayApplyNewConfigChange())
// Commit transaction as a new log entry
final LogEntry logEntry;
try {
logEntry = this.applyNewLogEntry(new NewLogEntry(tx));
} catch (IllegalStateException e) {
throw new RetryTransactionException(tx, e.getMessage());
} catch (Exception e) {
throw new KVTransactionException(tx, "error attempting to persist transaction", e);
if (this.log.isDebugEnabled())
this.debug("added log entry " + logEntry + " for local transaction " + tx);
// Update transaction
this.advanceReadyTransactionWithCommitInfo(tx, logEntry.getTerm(), logEntry.getIndex(), null);
// Rebase transactions
// Determine whether it's safe to append a log entry with a configuration change
private boolean mayApplyNewConfigChange() {
assert Thread.holdsLock(this.raft);
// Rule #1: this leader must have committed at least one log entry in this term
assert this.raft.commitIndex >= this.raft.lastAppliedIndex;
if (this.raft.getLogTermAtIndex(this.raft.commitIndex) < this.raft.currentTerm)
return false;
// Rule #2: there must be no previous config change that is still uncommitted
for (int i = (int)(this.raft.commitIndex - this.raft.lastAppliedIndex) + 1; i < this.raft.raftLog.size(); i++) {
if (this.raft.raftLog.get(i).getConfigChange() != null)
return false;
// OK
return true;
Timestamp getLeaderLeaseTimeout() {
return this.leaseTimeout;
* Get the minimum future leader timestamp required before we will know that our last log entry is up-to-date as of now.
* If we already know that it's up-to-date as of now (because our lease currently extends into the future), return null.
private Timestamp getCurrentCommitMinLeaseTimeout() {
return this.isLeaderLeaseActiveNow() ? null : new Timestamp();
// Message
void caseAppendRequest(AppendRequest msg, NewLogEntry newLogEntry) {
assert Thread.holdsLock(this.raft);
void caseAppendResponse(AppendResponse msg) {
assert Thread.holdsLock(this.raft);
// Find follower
final Follower follower = this.findFollower(msg);
if (follower == null)
// Update follower's last rec'd leader timestamp
if (follower.getLeaderTimestamp() == null || msg.getLeaderTimestamp().compareTo(follower.getLeaderTimestamp()) > 0) {
// Ignore if a snapshot install is in progress
if (follower.getSnapshotTransmit() != null) {
if (this.log.isTraceEnabled())
this.trace("rec'd " + msg + " while sending snapshot install; ignoring");
// Ignore a response to a request that was sent prior to the most recent snapshot install
if (follower.getSnapshotTimestamp() != null && msg.getLeaderTimestamp().compareTo(follower.getSnapshotTimestamp()) < 0) {
if (this.log.isTraceEnabled())
this.trace("rec'd " + msg + " sent prior to snapshot install; ignoring");
// Flag indicating we might want to update follower when done
boolean updateFollowerAgain = false;
// Update follower's match index
if (msg.getMatchIndex() > follower.getMatchIndex()) {
if (!this.raft.isClusterMember(follower.getIdentity()))
// Check result and update follower's next index
final boolean wasSynced = follower.isSynced();
final long previousNextIndex = follower.getNextIndex();
if (!msg.isSuccess())
follower.setNextIndex(Math.max(follower.getNextIndex() - 1, 1));
if (follower.isSynced() != wasSynced) {
if (this.log.isDebugEnabled()) {
this.debug("sync status of \"" + follower.getIdentity() + "\" changed -> "
+ (!follower.isSynced() ? "not " : "") + "synced");
updateFollowerAgain = true;
// Use follower's match index as a lower bound on follower's next index.
follower.setNextIndex(Math.max(follower.getNextIndex(), follower.getMatchIndex() + 1));
// Use follower's last log index as an upper bound on follower's next index.
follower.setNextIndex(Math.min(msg.getLastLogIndex() + 1, follower.getNextIndex()));
// Update follower again if next index has changed
updateFollowerAgain |= follower.getNextIndex() != previousNextIndex;
// Debug
if (this.log.isTraceEnabled())
this.trace("updated follower: " + follower + ", update again = " + updateFollowerAgain);
// Immediately update follower again (if appropriate)
if (updateFollowerAgain)
this.raft.requestService(new UpdateFollowerService(follower));
void caseCommitRequest(CommitRequest msg, NewLogEntry newLogEntry) {
assert Thread.holdsLock(this.raft);
// Find follower
final Follower follower = this.findFollower(msg);
if (follower == null)
// Decode reads, if any, and check for conflicts
final ByteBuffer readsData = msg.getReadsData();
if (readsData != null) {
// Decode reads
final Reads reads;
try {
reads = new Reads(new ByteBufferInputStream(msg.getReadsData()));
} catch (Exception e) {
this.error("error decoding reads data in " + msg, e);
this.raft.sendMessage(new CommitResponse(this.raft.clusterId, this.raft.identity, msg.getSenderId(),
this.raft.currentTerm, msg.getTxId(), "error decoding reads data: " + e));
// Check for conflict
final String conflictMsg = this.checkConflicts(msg.getBaseTerm(), msg.getBaseIndex(), reads,
this.raft.dumpConflicts ? msg.getSenderId() + " txId=" + msg.getTxId() : null);
if (conflictMsg != null) {
if (this.log.isDebugEnabled())
this.debug("commit request " + msg + " failed due to conflict: " + conflictMsg);
this.raft.sendMessage(new CommitResponse(this.raft.clusterId, this.raft.identity, msg.getSenderId(),
this.raft.currentTerm, msg.getTxId(), conflictMsg));
// Handle read-only vs. read-write transaction
if (msg.isReadOnly()) {
assert newLogEntry == null;
// Determine our minimum lease timeout before we can know for sure that we are up-to-date, if not already
final Timestamp minimumLeaseTimeout = this.getCurrentCommitMinLeaseTimeout();
// If there is a minimum lease timeout requirement, try to advance our lease timeout
if (minimumLeaseTimeout != null) {
// Remember that this follower is now going to be waiting for this particular leaseTimeout
// Send immediate probes to all (up-to-date) followers in an attempt to increase our leaseTimeout quickly
// Send response with commit term+index set from our last log entry
this.raft.sendMessage(new CommitResponse(this.raft.clusterId, this.raft.identity, msg.getSenderId(),
this.raft.currentTerm, msg.getTxId(), this.raft.getLastLogTerm(), this.raft.getLastLogIndex(),
} else {
assert newLogEntry != null;
// If the client is requesting a config change, we could check for an outstanding config change now and if so
// delay our response until it completes, but that's not worth the trouble. Instead, applyNewLogEntry() will
// throw an exception and the client will just have to retry the transaction. Config changes are rare.
// Commit mutations as a new log entry
final LogEntry logEntry;
try {
logEntry = this.applyNewLogEntry(newLogEntry);
} catch (Exception e) {
if (!(e instanceof IllegalStateException))
this.error("error appending new log entry for " + msg, e);
else if (this.log.isDebugEnabled())
this.debug("error appending new log entry for " + msg + ": " + e);
this.raft.sendMessage(new CommitResponse(this.raft.clusterId, this.raft.identity, msg.getSenderId(),
this.raft.currentTerm, msg.getTxId(), e.getMessage() != null ? e.getMessage() : "" + e));
if (this.log.isDebugEnabled())
this.debug("added log entry " + logEntry + " for rec'd " + msg);
// Rebase transactions
// Follower transaction data optimization
// Send response
this.raft.sendMessage(new CommitResponse(this.raft.clusterId, this.raft.identity, msg.getSenderId(),
this.raft.currentTerm, msg.getTxId(), logEntry.getTerm(), logEntry.getIndex()));
void caseCommitResponse(CommitResponse msg) {
assert Thread.holdsLock(this.raft);
void caseInstallSnapshot(InstallSnapshot msg) {
assert Thread.holdsLock(this.raft);
void caseRequestVote(RequestVote msg) {
assert Thread.holdsLock(this.raft);
// Too late dude, I already won the election
if (this.log.isDebugEnabled())
this.debug("ignoring " + msg + " rec'd while in " + this);
void caseGrantVote(GrantVote msg) {
assert Thread.holdsLock(this.raft);
// Thanks and all, but I already won the election
if (this.log.isDebugEnabled())
this.debug("ignoring " + msg + " rec'd while in " + this);
private void failDuplicateLeader(Message msg) {
assert Thread.holdsLock(this.raft);
// This should never happen - same term but two different leaders
final boolean defer = this.raft.identity.compareTo(msg.getSenderId()) <= 0;
this.error("detected a duplicate leader in " + msg + " - should never happen; possible inconsistent cluster"
+ " configuration on " + msg.getSenderId() + " (mine: " + this.raft.currentConfig + "); "
+ (defer ? "reverting to follower" : "ignoring"));
if (defer)
this.raft.changeRole(new FollowerRole(this.raft, msg.getSenderId(), this.raft.returnAddress));
// Object
public String toString() {
synchronized (this.raft) {
return this.toStringPrefix()
+ ",followerMap=" + this.followerMap
+ "]";
// Debug
// Debug-time consistency check of this role's state; the caller must hold the lock on this.raft.
// Every condition is written as an assert, so nothing is verified unless assertions are enabled.
// Always returns true (presumably so the call itself can be wrapped in an assert - confirm with callers).
// NOTE(review): closing braces are missing in this copy, so it is not visible here where the
// per-follower loop ends - confirm against the upstream source.
boolean checkState() {
assert Thread.holdsLock(this.raft);
// The "check apply entries" timer is expected to run exactly when the Raft log is non-empty
assert this.checkApplyTimer.isRunning() == !this.raft.raftLog.isEmpty();
for (Follower follower : this.followerMap.values()) {
// Neither index may exceed the position just past our last log entry
assert follower.getNextIndex() <= this.raft.getLastLogIndex() + 1;
assert follower.getMatchIndex() <= this.raft.getLastLogIndex() + 1;
// The leaderCommit value recorded for the follower may not exceed our own commit index
assert follower.getLeaderCommit() <= this.raft.commitIndex;
// Each follower must have either a running update timer or a snapshot transmit
assert follower.getUpdateTimer().isRunning() || follower.getSnapshotTransmit() != null;
// The timestamp scrub timer is expected to be running
assert this.timestampScrubTimer.isRunning();
return true;
// Internal methods
* Find the index of the most recent unapplied log entry having an associated config change.
* @return most recent config change log entry, or zero if none found
private long findMostRecentConfigChange() {
return this.findMostRecentConfigChangeMatching(configChange -> true);
* Find the index of the most recent unapplied log entry having an associated config change matching the given predicate.
* @return most recent matching log entry, or zero if none found
private long findMostRecentConfigChangeMatching(Predicate predicate) {
assert Thread.holdsLock(this.raft);
for (long index = this.raft.getLastLogIndex(); index > this.raft.lastAppliedIndex; index--) {
final String[] configChange = this.raft.getLogEntryAtIndex(index).getConfigChange();
if (configChange != null && predicate.test(configChange))
return index;
return 0;
/**
 * Apply a new log entry to the Raft log; if operation fails, {@link NewLogEntry#cancel cancel()} {@code newLogEntry}.
 *
 * @throws IllegalStateException if a config change would not be safe at the current time
 * @throws IllegalArgumentException if the config change attempts to remove the last node
 */
// Appends newLogEntry to the Raft log under the current term and returns the resulting LogEntry.
// When the entry's data carries a config change, two safety checks run first (see below).
// The caller must hold the lock on this.raft.
// NOTE(review): the statements guarded by several unbraced "if"s below, and all closing braces, are
// missing from this copy - restore them from the upstream source before relying on this code.
private LogEntry applyNewLogEntry(NewLogEntry newLogEntry) throws Exception {
assert Thread.holdsLock(this.raft);
// Do a couple of extra checks if a config change is included
final String[] configChange = newLogEntry.getData().getConfigChange();
if (configChange != null) {
// If a config change is involved, check whether we can safely apply it
if (!this.mayApplyNewConfigChange())
throw new IllegalStateException("config change cannot be safely applied at this time");
// Disallow a configuration change that removes the last node in a cluster
// (configChange[0] is compared against the sole node's identity; a null configChange[1]
// presumably denotes a removal - confirm the array layout against getConfigChange())
if (this.raft.currentConfig.size() == 1 && configChange[1] == null) {
final String lastNode = this.raft.currentConfig.keySet().iterator().next();
if (configChange[0].equals(lastNode))
throw new IllegalArgumentException("can't remove the last node in a cluster (\"" + lastNode + "\")");
// Append new log entry to the Raft log
final LogEntry logEntry = this.raft.appendLogEntry(this.raft.currentTerm, newLogEntry);
// Update follower list if configuration changed
// NOTE(review): the statement guarded by this condition is not present in this copy
if (configChange != null)
// Update commit index (this is only needed if config has changed, or in the single node case)
// NOTE(review): the statement guarded by this condition is not present in this copy
if (configChange != null || this.followerMap.isEmpty())
// Immediately update all up-to-date followers
// NOTE(review): the code for this step is not present in this copy
// Start check apply timer if not already running
// NOTE(review): the statement guarded by this condition is not present in this copy
if (!this.checkApplyTimer.isRunning())
// Done
return logEntry;
* Check whether a proposed transaction can commit without any MVCC conflict.
* @param baseTerm the term of the log entry on which the transaction is based
* @param baseIndex the index of the log entry on which the transaction is based
* @param reads reads performed by the transaction
* @return error message on failure, null for success
private String checkConflicts(long baseTerm, long baseIndex, Reads reads, String dumpDescription) {
assert Thread.holdsLock(this.raft);
// Validate the index of the log entry on which the transaction is based
final long minIndex = this.raft.lastAppliedIndex;
final long maxIndex = this.raft.getLastLogIndex();
if (baseIndex < minIndex)
return "transaction is too old: base index " + baseIndex + " < last applied log index " + minIndex;
if (baseIndex > maxIndex)
return "transaction is too new: base index " + baseIndex + " > most recent log index " + maxIndex;
// Validate the term of the log entry on which the transaction is based
final long actualBaseTerm = this.raft.getLogTermAtIndex(baseIndex);
if (baseTerm != actualBaseTerm) {
return "transaction is based on an overwritten log entry with index "
+ baseIndex + " and term " + baseTerm + " != " + actualBaseTerm;
// Check for conflicts from intervening commits
for (long index = baseIndex + 1; index <= maxIndex; index++) {
final LogEntry logEntry = this.raft.getLogEntryAtIndex(index);
if (reads.isConflict(logEntry.getWrites())) {
if (dumpDescription != null)
this.dumpConflicts(reads, logEntry, dumpDescription);
return "writes of committed transaction at index " + index
+ " conflict with transaction reads from transaction base index " + baseIndex;
// No conflict
return null;
private Follower findFollower(Message msg) {
assert Thread.holdsLock(this.raft);
final Follower follower = this.followerMap.get(msg.getSenderId());
if (follower == null)
this.warn("rec'd " + msg + " from unknown follower \"" + msg.getSenderId() + "\", ignoring");
return follower;