All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.jsimpledb.kv.raft.LeaderRole Maven / Gradle / Ivy

Go to download

JSimpleDB distributed, linearizable ACID compliant key/value store implementation based on the Raft consensus algorithm.

There is a newer version: 3.6.1
Show newest version

/*
 * Copyright (C) 2015 Archie L. Cobbs. All rights reserved.
 */

package org.jsimpledb.kv.raft;

import com.google.common.base.Preconditions;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.NavigableSet;
import java.util.function.Predicate;

import javax.annotation.concurrent.GuardedBy;

import org.dellroad.stuff.io.ByteBufferInputStream;
import org.jsimpledb.kv.KVTransactionException;
import org.jsimpledb.kv.RetryTransactionException;
import org.jsimpledb.kv.mvcc.Reads;
import org.jsimpledb.kv.mvcc.Writes;
import org.jsimpledb.kv.raft.msg.AppendRequest;
import org.jsimpledb.kv.raft.msg.AppendResponse;
import org.jsimpledb.kv.raft.msg.CommitRequest;
import org.jsimpledb.kv.raft.msg.CommitResponse;
import org.jsimpledb.kv.raft.msg.GrantVote;
import org.jsimpledb.kv.raft.msg.InstallSnapshot;
import org.jsimpledb.kv.raft.msg.Message;
import org.jsimpledb.kv.raft.msg.RequestVote;

/**
 * Raft leader role.
 */
public class LeaderRole extends Role {

    // Timestamp scrub interval
    private static final int TIMESTAMP_SCRUB_INTERVAL = 24 * 60 * 60 * 1000;            // once a day

    // Our followers
    @GuardedBy("raft")
    private final HashMap followerMap = new HashMap<>();

    // Our leadership "lease" timeout - i.e., the earliest time another leader could possibly be elected
    @GuardedBy("raft")
    private Timestamp leaseTimeout;

    // Service tasks
    private final Service updateLeaderCommitIndexService = new Service(this, "update leader commitIndex") {
        @Override
        public void run() {
            LeaderRole.this.updateLeaderCommitIndex();
        }
    };
    private final Service updateLeaseTimeoutService = new Service(this, "update lease timeout") {
        @Override
        public void run() {
            LeaderRole.this.updateLeaseTimeout();
        }
    };
    private final Service updateKnownFollowersService = new Service(this, "update known followers") {
        @Override
        public void run() {
            LeaderRole.this.updateKnownFollowers();
        }
    };
    private final Timer checkApplyTimer = new Timer(this.raft, "check apply entries", new Service(this, "check apply entries") {
        @Override
        public void run() {
            LeaderRole.this.checkApplyEntries();
        }
    });
    private final Timer timestampScrubTimer = new Timer(this.raft, "scrub timestamps", new Service(this, "scrub timestamps") {
        @Override
        public void run() {
            LeaderRole.this.scrubTimestamps();
        }
    });

// Constructors

    LeaderRole(RaftKVDatabase raft) {
        super(raft);
    }

// Status & Debugging

    /**
     * Get this leader's known followers.
     *
     * 

* The returned list is a copy; changes have no effect on this instance. * * @return this leader's followers */ public List getFollowers() { final ArrayList list; synchronized (this.raft) { list = new ArrayList<>(this.followerMap.values()); } Collections.sort(list, Follower.SORT_BY_IDENTITY); return list; } /** * Get this leader's "lease timeout". * *

* This is the earliest possible time at which some other, new leader could be elected in a new term. * Consequently, it is the earliest possible time at which any entry that this leader is unaware of * could be appended to the Raft log. * *

* Normally, if followers are responding to {@link AppendRequest}s properly, this should be a value * in the (near) future. This allows the leader to make the assumption, up until that point in time, * that its log is fully up-to-date. * *

* Until it hears from a majority of followers, a leader will not have a lease timeout established yet. * In that case this method returns null. * *

* This method may also return null if a previous lease timeout has gotten very stale (e.g., isolated leader). * * @return this leader's lease timeout, or null if none is established yet */ public Timestamp getLeaseTimeout() { synchronized (this.raft) { return this.leaseTimeout; } } /** * Force this leader to step down. * * @throws IllegalStateException if this role is no longer active or election timer is not running */ public void stepDown() { synchronized (this.raft) { Preconditions.checkState(this.raft.role == this, "role is no longer active"); this.debug("stepping down as leader due to invocation of stepDown()"); this.raft.changeRole(new FollowerRole(this.raft)); } } // Lifecycle @Override void setup() { assert Thread.holdsLock(this.raft); super.setup(); if (this.log.isDebugEnabled()) this.debug("entering leader role in term " + this.raft.currentTerm); // Generate follower list this.updateKnownFollowers(); // Append a "dummy" log entry with my current term. This allows us to advance the commit index when the last // entry in our log is from a prior term. This is needed to avoid the problem where a transaction could end up // waiting indefinitely for its log entry with a prior term number to be committed. final LogEntry logEntry; try { logEntry = this.applyNewLogEntry(new NewLogEntry(this.raft, new LogEntry.Data(new Writes(), null))); } catch (Exception e) { this.error("error attempting to apply initial log entry", e); return; } if (this.log.isDebugEnabled()) this.debug("added log entry " + logEntry + " to commit at the beginning of my new term"); // Rebase transactions this.rebaseTransactions(); // Start check apply timer if (!this.raft.raftLog.isEmpty()) this.checkApplyTimer.timeoutAfter(this.raft.heartbeatTimeout); // Start timestamp scrub timer this.timestampScrubTimer.timeoutAfter(TIMESTAMP_SCRUB_INTERVAL); } @Override void shutdown() { assert Thread.holdsLock(this.raft); this.followerMap.values() .forEach(Follower::cleanup); this.checkApplyTimer.cancel(); this.timestampScrubTimer.cancel(); super.shutdown(); } // Service @Override void outputQueueEmpty(String address) { assert Thread.holdsLock(this.raft); // Find matching follower(s) and update them if needed this.followerMap.values().stream() .filter(follower -> follower.getAddress().equals(address)) .forEach(follower -> { if (this.log.isTraceEnabled()) this.trace("updating peer \"" + follower.getIdentity() + "\" after queue empty notification"); this.raft.requestService(new UpdateFollowerService(follower)); }); } @Override void applyCommittedLogEntries() { assert Thread.holdsLock(this.raft); super.applyCommittedLogEntries(); // Stop check apply timer if there are none left if (this.raft.raftLog.isEmpty() && this.checkApplyTimer.isRunning()) this.checkApplyTimer.cancel(); } @Override boolean roleMayApplyLogEntry(LogEntry logEntry) { assert Thread.holdsLock(this.raft); // If any snapshots are in progress, we don't want to apply any log entries with index greater than the snapshot's // index, because then we'd lose the ability to update the follower with that log entry, and as a result just have // to send a snapshot again. However, we impose a limit on how long we'll wait for a slow follower. for (Follower follower : this.followerMap.values()) { final SnapshotTransmit snapshotTransmit = follower.getSnapshotTransmit(); if (snapshotTransmit == null) continue; if (snapshotTransmit.getSnapshotIndex() < logEntry.getIndex() && snapshotTransmit.getAge() < RaftKVDatabase.MAX_SNAPSHOT_TRANSMIT_AGE) { if (this.log.isTraceEnabled()) { this.trace("delaying application of " + logEntry + " because of in-progress snapshot install of " + snapshotTransmit.getSnapshotIndex() + "t" + snapshotTransmit.getSnapshotTerm() + " to " + follower); } return false; } } // If some follower does not yet have the log entry, wait for them to get it (up to some maximum time). // If the follower appears to be offline, don't bother waiting. final int maxLogEntryAge = this.raft.maxFollowerAckHeartbeats * this.raft.heartbeatTimeout; if (logEntry.getAge() < maxLogEntryAge) { final Timestamp minLeaderTimestamp = new Timestamp().offset(-maxLogEntryAge); for (Follower follower : this.followerMap.values()) { // Has this follower acknowledged reciept of the log entry? // If so, then the follower has already rebased any rebasable transactions. if (follower.getMatchIndex() >= logEntry.getIndex()) continue; // If we haven't heard from this follower in a while, don't bother waiting for it final Timestamp leaderTimestamp = follower.getLeaderTimestamp(); if (leaderTimestamp == null || leaderTimestamp.compareTo(minLeaderTimestamp) <= 0) continue; // Wait for follower to do so before applying to state machine if (this.log.isTraceEnabled()) { this.trace("delaying application of " + logEntry + " (age " + logEntry.getAge() + " < " + maxLogEntryAge + ") because of slow " + follower); } return false; } } // OK return true; } // We have to periodically check if we can apply log entries, because the condition is time-dependent private void checkApplyEntries() { assert Thread.holdsLock(this.raft); this.raft.requestService(this.checkWaitingTransactionsService); this.raft.requestService(this.applyCommittedLogEntriesService); if (!this.raft.raftLog.isEmpty()) this.checkApplyTimer.timeoutAfter(this.raft.heartbeatTimeout); } /** * Update my {@code commitIndex} based on followers' {@code matchIndex}'s. * *

* This should be invoked: *

    *
  • After any log entry has been added to the log, if we have zero followers
  • *
  • After a log entry that contains a configuration change has been added to the log
  • *
  • After a follower's {@linkplain Follower#getMatchIndex match index} has advanced
  • *
*/ private void updateLeaderCommitIndex() { assert Thread.holdsLock(this.raft); // Find highest index for which a majority of cluster members have ack'd the corresponding log entry from my term final int totalCount = this.raft.currentConfig.size(); // total possible nodes final int requiredCount = totalCount / 2 + 1; // require a majority final int startingCount = this.raft.isClusterMember() ? 1 : 0; // count myself, if member long maxCommitIndex = this.raft.commitIndex; int commitCount = -1; for (long index = this.raft.commitIndex + 1; index <= this.raft.getLastLogIndex(); index++) { // Count the number of nodes (possibly including myself) that have a copy of the log entry at index final int count = startingCount + this.countFollowersWithLogEntry(index); // The log entry term must match my current term (exception: unless every node has it) final long term = this.raft.getLogTermAtIndex(index); if (count < totalCount && term != this.raft.currentTerm) continue; // Do a majority of cluster nodes have this log entry? if (count < requiredCount) { if (term >= this.raft.currentTerm) // there's no point in going further break; continue; // a later term log entry might work } // We have a winner maxCommitIndex = index; commitCount = count; } // Update commit index if it advanced if (maxCommitIndex > this.raft.commitIndex) { // Update index if (this.log.isDebugEnabled()) { this.debug("advancing commit index from " + this.raft.commitIndex + " -> " + maxCommitIndex + " based on " + commitCount + "/" + totalCount + " nodes having received " + this.raft.getLogEntryAtIndex(maxCommitIndex)); } this.raft.commitIndex = maxCommitIndex; // Update commitables this.checkCommittables(); // Perform various service this.raft.requestService(this.checkReadyTransactionsService); this.raft.requestService(this.checkWaitingTransactionsService); this.raft.requestService(this.triggerKeyWatchesService); this.raft.requestService(this.applyCommittedLogEntriesService); // Notify all (up-to-date) followers with the updated leaderCommit this.updateAllSynchronizedFollowersNow(); // If we are no longer a member of the cluster, step down after the most recent config change is committed if (!this.raft.isClusterMember() && this.raft.commitIndex >= this.findMostRecentConfigChange()) { if (this.log.isDebugEnabled()) this.log.debug("stepping down as leader of cluster (no longer a member)"); this.stepDown(); } } } private int countFollowersWithLogEntry(long index) { assert index <= this.raft.getLastLogIndex(); // Count the number of followers (who are also cluster members) that have a copy of the log entry at the specified index int nodesWithLogEntry = 0; for (Follower follower : this.followerMap.values()) { if (follower.hasLogEntry(index)) nodesWithLogEntry++; } // Done return nodesWithLogEntry; } /** * Update my {@code leaseTimeout} based on followers' returned {@code leaderTimeout}'s. * *

* This should be invoked: *

    *
  • After a follower has replied with an {@link AppendResponse} containing a newer * {@linkplain AppendResponse#getLeaderTimestamp leader timestamp} than before
  • *
*/ private void updateLeaseTimeout() { assert Thread.holdsLock(this.raft); // Only needed when we have followers final int numFollowers = this.followerMap.size(); if (numFollowers == 0) return; // Get all cluster member leader timestamps, sorted in increasing order final Timestamp[] leaderTimestamps = new Timestamp[this.raft.currentConfig.size()]; int index = 0; if (this.raft.isClusterMember()) leaderTimestamps[index++] = new Timestamp(); // this represents my own vote for (Follower follower : this.followerMap.values()) { if (this.raft.isClusterMember(follower.getIdentity())) leaderTimestamps[index++] = follower.getLeaderTimestamp(); // note follower timestamps could be null } Arrays.sort(leaderTimestamps, Timestamp.NULL_FIRST_SORT); // // Calculate highest leaderTimeout shared by a majority of cluster members, based on sorted array: // // # nodes timestamps // ------- ---------- // 5 [ ][ ][x][x][x] 3/5 x's make a majority at index (5 - 1)/2 = 2 // 6 [ ][ ][x][x][x][x] 4/6 x's make a majority at index (6 - 1)/2 = 2 // // The minimum leaderTimeout shared by a majority of nodes is at index (leaderTimestamps.length - 1) / 2. // We then add the minimum election timeout, then subtract a little for clock drift. // final Timestamp newLeaseTimeout = leaderTimestamps[(leaderTimestamps.length + 1) / 2] .offset((int)(this.raft.minElectionTimeout * (1.0f - RaftKVDatabase.MAX_CLOCK_DRIFT) - 1)); if (Timestamp.NULL_FIRST_SORT.compare(newLeaseTimeout, this.leaseTimeout) > 0) { assert newLeaseTimeout != null; // Update my leader lease timeout if (this.log.isTraceEnabled()) this.trace("updating my lease timeout from " + this.leaseTimeout + " -> " + newLeaseTimeout); this.leaseTimeout = newLeaseTimeout; // Notify any followers who care for (Follower follower : this.followerMap.values()) { final NavigableSet timeouts = follower.getCommitLeaseTimeouts().headSet(this.leaseTimeout, true); if (!timeouts.isEmpty()) { follower.updateNow(); // notify follower so it can commit waiting transaction(s) timeouts.clear(); } } } } /** * Scrub timestamps to avoid roll-over. * *

* This should be invoked periodically, e.g., once a day. */ private void scrubTimestamps() { assert Thread.holdsLock(this.raft); if (this.log.isTraceEnabled()) this.trace("scrubbing timestamps"); for (Follower follower : this.followerMap.values()) { final Timestamp leaderTimestamp = follower.getLeaderTimestamp(); if (leaderTimestamp != null && leaderTimestamp.isRolloverDanger()) { if (this.log.isDebugEnabled()) this.debug("scrubbing " + follower + " leader timestamp " + leaderTimestamp); follower.setLeaderTimestamp(null); } final Timestamp snapshotTimestamp = follower.getSnapshotTimestamp(); if (snapshotTimestamp != null && snapshotTimestamp.isRolloverDanger()) { if (this.log.isDebugEnabled()) this.debug("scrubbing " + follower + " snapshot timestamp " + snapshotTimestamp); follower.setSnapshotTimestamp(null); } for (Iterator i = follower.getCommitLeaseTimeouts().iterator(); i.hasNext(); ) { final Timestamp leaseTimestamp = i.next(); if (leaseTimestamp.isRolloverDanger()) { if (this.log.isDebugEnabled()) this.debug("scrubbing " + follower + " commit lease timestamp " + leaseTimestamp); i.remove(); } } } if (this.leaseTimeout != null && this.leaseTimeout.isRolloverDanger()) { if (this.log.isDebugEnabled()) this.debug("scrubbing leader lease timestamp " + this.leaseTimeout); this.leaseTimeout = null; } } /** * Update our list of followers to match our current configuration. * *

* This should be invoked: *

    *
  • After a log entry that contains a configuration change has been added to the log
  • *
  • When the {@linkplain Follower#getNextIndex next index} of a follower not in the current config advances
  • *
*/ private void updateKnownFollowers() { assert Thread.holdsLock(this.raft); // Compare known followers with the current config and determine who needs to be be added or removed final HashSet adds = new HashSet<>(this.raft.currentConfig.keySet()); adds.removeAll(this.followerMap.keySet()); adds.remove(this.raft.identity); final HashSet dels = new HashSet<>(this.followerMap.keySet()); dels.removeAll(this.raft.currentConfig.keySet()); // Keep around a follower after its removal until it receives the config change that removed it for (Follower follower : this.followerMap.values()) { // Is this follower scheduled for deletion? final String peer = follower.getIdentity(); if (!dels.contains(peer)) continue; // Find the most recent log entry containing a config change in which the follower was removed final String node = follower.getIdentity(); final long index = this.findMostRecentConfigChangeMatching( configChange -> configChange[0].equals(node) && configChange[1] == null); // If follower has not received that log entry yet, keep on updating them until they do if (follower.getMatchIndex() < index) dels.remove(peer); } // Add new followers for (String peer : adds) { final String address = this.raft.currentConfig.get(peer); final Follower follower = new Follower(this.raft, peer, address, this.raft.getLastLogIndex()); if (this.log.isDebugEnabled()) this.debug("adding new follower \"" + peer + "\" at " + address); follower.setUpdateTimer(new Timer(this.raft, "update timer for \"" + peer + "\"", new UpdateFollowerService(follower))); this.followerMap.put(peer, follower); follower.updateNow(); // schedule an immediate update } // Remove old followers for (String peer : dels) { final Follower follower = this.followerMap.remove(peer); if (this.log.isDebugEnabled()) this.debug("removing old follower \"" + peer + "\""); follower.cleanup(); } } /** * Check whether a follower needs an update and send one if so. * *

* This should be invoked: *

    *
  • After a new follower has been added
  • *
  • When the output queue for a follower goes from non-empty to empty
  • *
  • After the follower's {@linkplain Follower#getUpdateTimer update timer} has expired
  • *
  • After a new log entry has been added to the log (all followers)
  • *
  • After receiving an {@link AppendResponse} that caused the follower's * {@linkplain Follower#getNextIndex next index} to change
  • *
  • After receiving the first positive {@link AppendResponse} to a probe
  • *
  • After our {@code commitIndex} has advanced (all followers)
  • *
  • After our {@code leaseTimeout} has advanced past one or more of a follower's * {@linkplain Follower#getCommitLeaseTimeouts commit lease timeouts} (with update timer reset)
  • *
  • After sending a {@link CommitResponse} with a non-null {@linkplain CommitResponse#getCommitLeaderLeaseTimeout * commit leader lease timeout} (all followers) to probe for updated leader timestamps
  • *
  • After starting, aborting, or completing a snapshot install for a follower
  • *
*/ private void updateFollower(Follower follower) { // Sanity check assert Thread.holdsLock(this.raft); // If follower has an in-progress snapshot that has become too stale, abort it final String peer = follower.getIdentity(); SnapshotTransmit snapshotTransmit = follower.getSnapshotTransmit(); if (snapshotTransmit != null && snapshotTransmit.getSnapshotIndex() < this.raft.lastAppliedIndex) { if (this.log.isDebugEnabled()) this.debug("aborting stale snapshot install for " + follower); follower.cancelSnapshotTransmit(); follower.updateNow(); } // Is follower's queue empty? If not, hold off until then if (this.raft.isTransmitting(follower.getAddress())) { if (this.log.isTraceEnabled()) this.trace("no update for \"" + peer + "\": output queue still not empty"); return; } // Handle any in-progress snapshot install if ((snapshotTransmit = follower.getSnapshotTransmit()) != null) { // Send the next chunk in transmission, if any final long pairIndex = snapshotTransmit.getPairIndex(); final ByteBuffer chunk = snapshotTransmit.getNextChunk(); boolean synced = true; if (chunk != null) { // Send next chunk final InstallSnapshot msg = new InstallSnapshot(this.raft.clusterId, this.raft.identity, peer, this.raft.currentTerm, snapshotTransmit.getSnapshotTerm(), snapshotTransmit.getSnapshotIndex(), pairIndex, pairIndex == 0 ? snapshotTransmit.getSnapshotConfig() : null, !snapshotTransmit.hasMoreChunks(), chunk); if (this.raft.sendMessage(msg)) { follower.setSnapshotTimestamp(new Timestamp()); return; } if (this.log.isDebugEnabled()) this.debug("canceling snapshot install for " + follower + " due to failure to send " + msg); // Message failed -> snapshot is fatally wounded, so cancel it synced = false; } if (synced) { if (this.log.isDebugEnabled()) this.debug("completed snapshot install for out-of-date " + follower); } // Snapshot transmit is complete (or failed) follower.cancelSnapshotTransmit(); // Trigger an immediate regular update follower.setNextIndex(snapshotTransmit.getSnapshotIndex() + 1); follower.setSynced(synced); follower.updateNow(); this.raft.requestService(new UpdateFollowerService(follower)); return; } // Are we still waiting for the update timer to expire? if (!follower.getUpdateTimer().pollForTimeout()) { boolean waitForTimerToExpire = true; // Don't wait for the update timer to expire if: // (a) The follower is sync'd; AND // (y) We have a new log entry that the follower doesn't have; OR // (y) We have a new leaderCommit that the follower doesn't have // The effect is that we will pipeline updates to synchronized followers. if (follower.isSynced() && (follower.getLeaderCommit() != this.raft.commitIndex || follower.getNextIndex() <= this.raft.getLastLogIndex())) waitForTimerToExpire = false; // Wait for timer to expire if (waitForTimerToExpire) { if (this.log.isTraceEnabled()) { this.trace("no update for \"" + follower.getIdentity() + "\": timer not expired yet, and follower is " + (follower.isSynced() ? "up to date" : "not synced")); } return; } } // Get index of the next log entry to send to follower final long nextIndex = follower.getNextIndex(); // If follower is too far behind, we must do a snapshot install if (nextIndex <= this.raft.lastAppliedIndex) { final MostRecentView view = new MostRecentView(this.raft, this.raft.commitIndex); follower.setSnapshotTransmit(new SnapshotTransmit(view.getTerm(), view.getIndex(), view.getConfig(), view.getSnapshot(), view.getView())); if (this.log.isDebugEnabled()) this.debug("started snapshot install for out-of-date " + follower); this.raft.requestService(new UpdateFollowerService(follower)); return; } // Restart update timer here (to avoid looping if an error occurs below) follower.getUpdateTimer().timeoutAfter(this.raft.heartbeatTimeout); // Send actual data if follower is synced and there is a log entry to send; otherwise, just send a probe final AppendRequest msg; if (!follower.isSynced() || nextIndex > this.raft.getLastLogIndex()) { // Create probe-only message msg = new AppendRequest(this.raft.clusterId, this.raft.identity, peer, this.raft.currentTerm, new Timestamp(), this.leaseTimeout, this.raft.commitIndex, this.raft.getLogTermAtIndex(nextIndex - 1), nextIndex - 1); } else { // Get log entry to send final LogEntry logEntry = this.raft.getLogEntryAtIndex(nextIndex); // If the log entry correspond's to follower's transaction, don't send the data because follower already has it. // But only do this optimization the first time, in case something goes wrong on the follower's end. ByteBuffer mutationData = null; if (!follower.getSkipDataLogEntries().remove(logEntry)) { try { mutationData = logEntry.getContent(); } catch (IOException e) { this.error("error reading log file " + logEntry.getFile(), e); return; } } // Create message msg = new AppendRequest(this.raft.clusterId, this.raft.identity, peer, this.raft.currentTerm, new Timestamp(), this.leaseTimeout, this.raft.commitIndex, this.raft.getLogTermAtIndex(nextIndex - 1), nextIndex - 1, logEntry.getTerm(), mutationData); } // Send update final boolean sent = this.raft.sendMessage(msg); // Advance next index if a log entry was sent; we allow pipelining log entries when synchronized if (sent && !msg.isProbe()) { assert follower.isSynced(); follower.setNextIndex(Math.min(follower.getNextIndex(), this.raft.getLastLogIndex()) + 1); } // Update the leaderCommit we sent to the follower if (sent) follower.setLeaderCommit(msg.getLeaderCommit()); } private void updateAllSynchronizedFollowersNow() { assert Thread.holdsLock(this.raft); this.followerMap.values().stream() .filter(Follower::isSynced) .forEach(Follower::updateNow); } private class UpdateFollowerService extends Service { private final Follower follower; UpdateFollowerService(Follower follower) { super(LeaderRole.this, "update follower \"" + follower.getIdentity() + "\""); this.follower = follower; } @Override public void run() { LeaderRole.this.updateFollower(this.follower); } @Override public boolean equals(Object obj) { if (obj == this) return true; if (obj == null || obj.getClass() != this.getClass()) return false; final UpdateFollowerService that = (UpdateFollowerService)obj; return this.follower.equals(that.follower); } @Override public int hashCode() { return this.follower.hashCode(); } } // Transactions @Override void handleLinearizableReadOnlyChange(RaftKVTransaction tx) { // Sanity check super.handleLinearizableReadOnlyChange(tx); // Set commit info based on what we currently know as "up-to-date" if (!tx.hasCommitInfo()) { tx.setCommitInfo(this.raft.getLastLogTerm(), this.raft.getLastLogIndex(), this.getCurrentCommitMinLeaseTimeout()); this.checkCommittable(tx); } } @Override void checkReadyTransactionNeedingCommitInfo(RaftKVTransaction tx) { // Sanity check super.checkReadyTransactionNeedingCommitInfo(tx); // Handle (effectively) read-only transactions if (!tx.addsLogEntry()) { // Does it already have commit information? if (tx.hasCommitInfo()) { this.advanceReadyTransaction(tx); return; } // Set commit info based on what we currently know as "up-to-date" and proceed this.advanceReadyTransactionWithCommitInfo(tx, this.raft.getLastLogTerm(), this.raft.getLastLogIndex(), this.getCurrentCommitMinLeaseTimeout()); return; } // Must be a read-write transaction that's fully rebased assert !tx.isReadOnly(); assert tx.isRebasable() : "fail tx " + tx; assert !tx.isCommittable(); assert !tx.hasCommitInfo(); assert this.checkRebasableAndCommittableUpToDate(tx); // If a config change is involved, check whether we can safely apply it if (tx.getConfigChange() != null && !this.mayApplyNewConfigChange()) return; // Commit transaction as a new log entry final LogEntry logEntry; try { logEntry = this.applyNewLogEntry(new NewLogEntry(tx)); } catch (IllegalStateException e) { throw new RetryTransactionException(tx, e.getMessage()); } catch (Exception e) { throw new KVTransactionException(tx, "error attempting to persist transaction", e); } if (this.log.isDebugEnabled()) this.debug("added log entry " + logEntry + " for local transaction " + tx); // Update transaction this.advanceReadyTransactionWithCommitInfo(tx, logEntry.getTerm(), logEntry.getIndex(), null); // Rebase transactions this.rebaseTransactions(); } // Determine whether it's safe to append a log entry with a configuration change private boolean mayApplyNewConfigChange() { assert Thread.holdsLock(this.raft); // Rule #1: this leader must have committed at least one log entry in this term assert this.raft.commitIndex >= this.raft.lastAppliedIndex; if (this.raft.getLogTermAtIndex(this.raft.commitIndex) < this.raft.currentTerm) return false; // Rule #2: there must be no previous config change that is still uncommitted for (int i = (int)(this.raft.commitIndex - this.raft.lastAppliedIndex) + 1; i < this.raft.raftLog.size(); i++) { if (this.raft.raftLog.get(i).getConfigChange() != null) return false; } // OK return true; } @Override Timestamp getLeaderLeaseTimeout() { return this.leaseTimeout; } /** * Get the minimum future leader timestamp required before we will know that our last log entry is up-to-date as of now. * If we already know that it's up-to-date as of now (because our lease currently extends into the future), return null. */ private Timestamp getCurrentCommitMinLeaseTimeout() { return this.isLeaderLeaseActiveNow() ? null : new Timestamp(); } // Message @Override void caseAppendRequest(AppendRequest msg, NewLogEntry newLogEntry) { assert Thread.holdsLock(this.raft); this.failDuplicateLeader(msg); } @Override void caseAppendResponse(AppendResponse msg) { assert Thread.holdsLock(this.raft); // Find follower final Follower follower = this.findFollower(msg); if (follower == null) return; // Update follower's last rec'd leader timestamp if (follower.getLeaderTimestamp() == null || msg.getLeaderTimestamp().compareTo(follower.getLeaderTimestamp()) > 0) { follower.setLeaderTimestamp(msg.getLeaderTimestamp()); this.raft.requestService(this.updateLeaseTimeoutService); } // Ignore if a snapshot install is in progress if (follower.getSnapshotTransmit() != null) { if (this.log.isTraceEnabled()) this.trace("rec'd " + msg + " while sending snapshot install; ignoring"); return; } // Ignore a response to a request that was sent prior to the most resent snapshot install if (follower.getSnapshotTimestamp() != null && msg.getLeaderTimestamp().compareTo(follower.getSnapshotTimestamp()) < 0) { if (this.log.isTraceEnabled()) this.trace("rec'd " + msg + " sent prior to snapshot install; ignoring"); return; } // Flag indicating we might want to update follower when done boolean updateFollowerAgain = false; // Update follower's match index if (msg.getMatchIndex() > follower.getMatchIndex()) { follower.setMatchIndex(msg.getMatchIndex()); this.raft.requestService(this.updateLeaderCommitIndexService); if (!this.raft.isClusterMember(follower.getIdentity())) this.raft.requestService(this.updateKnownFollowersService); } // Check result and update follower's next index final boolean wasSynced = follower.isSynced(); final long previousNextIndex = follower.getNextIndex(); if (!msg.isSuccess()) follower.setNextIndex(Math.max(follower.getNextIndex() - 1, 1)); follower.setSynced(msg.isSuccess()); if (follower.isSynced() != wasSynced) { if (this.log.isDebugEnabled()) { this.debug("sync status of \"" + follower.getIdentity() + "\" changed -> " + (!follower.isSynced() ? "not " : "") + "synced"); } updateFollowerAgain = true; } // Use follower's match index as a lower bound on follower's next index. follower.setNextIndex(Math.max(follower.getNextIndex(), follower.getMatchIndex() + 1)); // Use follower's last log index as an upper bound on follower's next index. follower.setNextIndex(Math.min(msg.getLastLogIndex() + 1, follower.getNextIndex())); // Update follower again if next index has changed updateFollowerAgain |= follower.getNextIndex() != previousNextIndex; // Debug if (this.log.isTraceEnabled()) this.trace("updated follower: " + follower + ", update again = " + updateFollowerAgain); // Immediately update follower again (if appropriate) if (updateFollowerAgain) this.raft.requestService(new UpdateFollowerService(follower)); } @Override void caseCommitRequest(CommitRequest msg, NewLogEntry newLogEntry) { assert Thread.holdsLock(this.raft); // Find follower final Follower follower = this.findFollower(msg); if (follower == null) return; // Decode reads, if any, and check for conflicts final ByteBuffer readsData = msg.getReadsData(); if (readsData != null) { // Decode reads final Reads reads; try { reads = new Reads(new ByteBufferInputStream(msg.getReadsData())); } catch (Exception e) { this.error("error decoding reads data in " + msg, e); this.raft.sendMessage(new CommitResponse(this.raft.clusterId, this.raft.identity, msg.getSenderId(), this.raft.currentTerm, msg.getTxId(), "error decoding reads data: " + e)); return; } // Check for conflict final String conflictMsg = this.checkConflicts(msg.getBaseTerm(), msg.getBaseIndex(), reads, this.raft.dumpConflicts ? msg.getSenderId() + " txId=" + msg.getTxId() : null); if (conflictMsg != null) { if (this.log.isDebugEnabled()) this.debug("commit request " + msg + " failed due to conflict: " + conflictMsg); this.raft.sendMessage(new CommitResponse(this.raft.clusterId, this.raft.identity, msg.getSenderId(), this.raft.currentTerm, msg.getTxId(), conflictMsg)); return; } } // Handle read-only vs. read-write transaction if (msg.isReadOnly()) { assert newLogEntry == null; // Determine our minimum lease timeout before we can know for sure that we are up-to-date, if not already final Timestamp minimumLeaseTimeout = this.getCurrentCommitMinLeaseTimeout(); // If there is a minimum lease timeout requirement, try to advance our lease timeout if (minimumLeaseTimeout != null) { // Remember that this follower is now going to be waiting for this particular leaseTimeout follower.getCommitLeaseTimeouts().add(minimumLeaseTimeout); // Send immediate probes to all (up-to-date) followers in an attempt to increase our leaseTimeout quickly this.updateAllSynchronizedFollowersNow(); } // Send response with commit term+index set from our last log entry this.raft.sendMessage(new CommitResponse(this.raft.clusterId, this.raft.identity, msg.getSenderId(), this.raft.currentTerm, msg.getTxId(), this.raft.getLastLogTerm(), this.raft.getLastLogIndex(), minimumLeaseTimeout)); } else { assert newLogEntry != null; // If the client is requesting a config change, we could check for an outstanding config change now and if so // delay our response until it completes, but that's not worth the trouble. Instead, applyNewLogEntry() will // throw an exception and the client will just just have to retry the transaction. Config changes are rare. // Commit mutations as a new log entry final LogEntry logEntry; try { logEntry = this.applyNewLogEntry(newLogEntry); } catch (Exception e) { if (!(e instanceof IllegalStateException)) this.error("error appending new log entry for " + msg, e); else if (this.log.isDebugEnabled()) this.debug("error appending new log entry for " + msg + ": " + e); this.raft.sendMessage(new CommitResponse(this.raft.clusterId, this.raft.identity, msg.getSenderId(), this.raft.currentTerm, msg.getTxId(), e.getMessage() != null ? e.getMessage() : "" + e)); return; } if (this.log.isDebugEnabled()) this.debug("added log entry " + logEntry + " for rec'd " + msg); // Rebase transactions this.rebaseTransactions(); // Follower transaction data optimization follower.getSkipDataLogEntries().add(logEntry); // Send response this.raft.sendMessage(new CommitResponse(this.raft.clusterId, this.raft.identity, msg.getSenderId(), this.raft.currentTerm, msg.getTxId(), logEntry.getTerm(), logEntry.getIndex())); } } @Override void caseCommitResponse(CommitResponse msg) { assert Thread.holdsLock(this.raft); this.failDuplicateLeader(msg); } @Override void caseInstallSnapshot(InstallSnapshot msg) { assert Thread.holdsLock(this.raft); this.failDuplicateLeader(msg); } @Override void caseRequestVote(RequestVote msg) { assert Thread.holdsLock(this.raft); // Too late dude, I already won the election if (this.log.isDebugEnabled()) this.debug("ignoring " + msg + " rec'd while in " + this); } @Override void caseGrantVote(GrantVote msg) { assert Thread.holdsLock(this.raft); // Thanks and all, but I already won the election if (this.log.isDebugEnabled()) this.debug("ignoring " + msg + " rec'd while in " + this); } private void failDuplicateLeader(Message msg) { assert Thread.holdsLock(this.raft); // This should never happen - same term but two different leaders final boolean defer = this.raft.identity.compareTo(msg.getSenderId()) <= 0; this.error("detected a duplicate leader in " + msg + " - should never happen; possible inconsistent cluster" + " configuration on " + msg.getSenderId() + " (mine: " + this.raft.currentConfig + "); " + (defer ? "reverting to follower" : "ignoring")); if (defer) this.raft.changeRole(new FollowerRole(this.raft, msg.getSenderId(), this.raft.returnAddress)); } // Object @Override public String toString() { synchronized (this.raft) { return this.toStringPrefix() + ",followerMap=" + this.followerMap + "]"; } } // Debug @Override boolean checkState() { assert Thread.holdsLock(this.raft); assert this.checkApplyTimer.isRunning() == !this.raft.raftLog.isEmpty(); for (Follower follower : this.followerMap.values()) { assert follower.getNextIndex() <= this.raft.getLastLogIndex() + 1; assert follower.getMatchIndex() <= this.raft.getLastLogIndex() + 1; assert follower.getLeaderCommit() <= this.raft.commitIndex; assert follower.getUpdateTimer().isRunning() || follower.getSnapshotTransmit() != null; } assert this.timestampScrubTimer.isRunning(); return true; } // Internal methods /** * Find the index of the most recent unapplied log entry having an associated config change. * * @return most recent config change log entry, or zero if none found */ private long findMostRecentConfigChange() { return this.findMostRecentConfigChangeMatching(configChange -> true); } /** * Find the index of the most recent unapplied log entry having an associated config change matching the given predicate. * * @return most recent matching log entry, or zero if none found */ private long findMostRecentConfigChangeMatching(Predicate predicate) { assert Thread.holdsLock(this.raft); for (long index = this.raft.getLastLogIndex(); index > this.raft.lastAppliedIndex; index--) { final String[] configChange = this.raft.getLogEntryAtIndex(index).getConfigChange(); if (configChange != null && predicate.test(configChange)) return index; } return 0; } /** * Apply a new log entry to the Raft log; if operation fails, {@link NewLogEntry#cancel cancel()} {@code newLogEntry}. * * @throws IllegalStateException if a config change would not be safe at the current time * @throws IllegalArgumentException if the config change attempts to remove the last node */ private LogEntry applyNewLogEntry(NewLogEntry newLogEntry) throws Exception { assert Thread.holdsLock(this.raft); // Do a couple of extra checks if a config change is included final String[] configChange = newLogEntry.getData().getConfigChange(); if (configChange != null) { // If a config change is involved, check whether we can safely apply it if (!this.mayApplyNewConfigChange()) throw new IllegalStateException("config change cannot be safely applied at this time"); // Disallow a configuration change that removes the last node in a cluster if (this.raft.currentConfig.size() == 1 && configChange[1] == null) { final String lastNode = this.raft.currentConfig.keySet().iterator().next(); if (configChange[0].equals(lastNode)) throw new IllegalArgumentException("can't remove the last node in a cluster (\"" + lastNode + "\")"); } } // Append new log entry to the Raft log final LogEntry logEntry = this.raft.appendLogEntry(this.raft.currentTerm, newLogEntry); // Update follower list if configuration changed if (configChange != null) this.raft.requestService(this.updateKnownFollowersService); // Update commit index (this is only needed if config has changed, or in the single node case) if (configChange != null || this.followerMap.isEmpty()) this.raft.requestService(this.updateLeaderCommitIndexService); // Immediately update all up-to-date followers this.updateAllSynchronizedFollowersNow(); // Start check apply timer if not already running if (!this.checkApplyTimer.isRunning()) this.checkApplyTimer.timeoutAfter(this.raft.heartbeatTimeout); // Done return logEntry; } /** * Check whether a proposed transaction can commit without any MVCC conflict. * * @param baseTerm the term of the log entry on which the transaction is based * @param baseIndex the index of the log entry on which the transaction is based * @param reads reads performed by the transaction * @return error message on failure, null for success */ private String checkConflicts(long baseTerm, long baseIndex, Reads reads, String dumpDescription) { assert Thread.holdsLock(this.raft); // Validate the index of the log entry on which the transaction is based final long minIndex = this.raft.lastAppliedIndex; final long maxIndex = this.raft.getLastLogIndex(); if (baseIndex < minIndex) return "transaction is too old: base index " + baseIndex + " < last applied log index " + minIndex; if (baseIndex > maxIndex) return "transaction is too new: base index " + baseIndex + " > most recent log index " + maxIndex; // Validate the term of the log entry on which the transaction is based final long actualBaseTerm = this.raft.getLogTermAtIndex(baseIndex); if (baseTerm != actualBaseTerm) { return "transaction is based on an overwritten log entry with index " + baseIndex + " and term " + baseTerm + " != " + actualBaseTerm; } // Check for conflicts from intervening commits for (long index = baseIndex + 1; index <= maxIndex; index++) { final LogEntry logEntry = this.raft.getLogEntryAtIndex(index); if (reads.isConflict(logEntry.getWrites())) { if (dumpDescription != null) this.dumpConflicts(reads, logEntry, dumpDescription); return "writes of committed transaction at index " + index + " conflict with transaction reads from transaction base index " + baseIndex; } } // No conflict return null; } private Follower findFollower(Message msg) { assert Thread.holdsLock(this.raft); final Follower follower = this.followerMap.get(msg.getSenderId()); if (follower == null) this.warn("rec'd " + msg + " from unknown follower \"" + msg.getSenderId() + "\", ignoring"); return follower; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy