All Downloads are FREE. Search and download functionalities are using the official Maven repository.

io.permazen.kv.raft.LeaderRole Maven / Gradle / Ivy


/*
 * Copyright (C) 2015 Archie L. Cobbs. All rights reserved.
 */

package io.permazen.kv.raft;

import com.google.common.base.Preconditions;

import io.permazen.kv.KVTransactionException;
import io.permazen.kv.RetryKVTransactionException;
import io.permazen.kv.mvcc.Conflict;
import io.permazen.kv.mvcc.Reads;
import io.permazen.kv.mvcc.Writes;
import io.permazen.kv.raft.msg.AppendRequest;
import io.permazen.kv.raft.msg.AppendResponse;
import io.permazen.kv.raft.msg.CommitRequest;
import io.permazen.kv.raft.msg.CommitResponse;
import io.permazen.kv.raft.msg.GrantVote;
import io.permazen.kv.raft.msg.InstallSnapshot;
import io.permazen.kv.raft.msg.Message;
import io.permazen.kv.raft.msg.RequestVote;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.NavigableSet;
import java.util.function.Predicate;

import javax.annotation.concurrent.GuardedBy;

import org.dellroad.stuff.io.ByteBufferInputStream;

/**
 * Raft leader role.
 */
public class LeaderRole extends Role {

    // Maximum age of an oustanding SnapshotTransmit in milliseconds
    private static final int MAX_SNAPSHOT_AGE = 5 * 60 * 1000;                          // 5 minutes

    // Timestamp scrub interval
    private static final int TIMESTAMP_SCRUB_INTERVAL = 24 * 60 * 60 * 1000;            // once a day

    // Our followers
    @GuardedBy("raft")
    private final HashMap followerMap = new HashMap<>();

    // Our leadership "lease" timeout - i.e., the earliest time another leader could possibly be elected.
    // Note that this value is not used or meaningful when we are the only node in the cluster.
    @GuardedBy("raft")
    private Timestamp leaseTimeout;

    // Service tasks
    private final Service updateLeaderCommitIndexService = new Service(this, "update commitIndex", this::updateLeaderCommitIndex);
    private final Service updateLeaseTimeoutService = new Service(this, "update lease timeout", this::updateLeaseTimeout);
    private final Service updateKnownFollowersService = new Service(this, "update known followers", this::updateKnownFollowers);

    // Timers
    private final Timer timestampScrubTimer = new Timer(this.raft, "scrub timestamps",
      new Service(this, "scrub timestamps", this::scrubTimestamps));

// Constructors

    LeaderRole(RaftKVDatabase raft) {
        super(raft);
    }

// Status & Debugging

    /**
     * Get this leader's known followers.
     *
     * 

* The returned list is a copy; changes have no effect on this instance. * * @return this leader's followers */ public List getFollowers() { final ArrayList list; synchronized (this.raft) { list = new ArrayList<>(this.followerMap.values()); } list.sort(Follower.SORT_BY_IDENTITY); return list; } /** * Get this leader's "lease timeout". * *

* This is the earliest possible time at which some other, new leader could be elected in a new term. * Consequently, it is the earliest possible time at which any entry that this leader is unaware of * could be appended to the Raft log, under the assumption that all nodes are configured with the same * election timeout value. * *

* Normally, if followers are responding to {@link AppendRequest}s properly, this will be a value * in the (near) future. This allows the leader to make the assumption, up until that point in time, * that its log is fully up-to-date. * *

* Until it hears from a majority of followers, a leader will not have a lease timeout established yet. * This value is also not used or meaningful when this node is the only node in the cluster. * In either case, this method returns null. * *

* This method may also return null if a previous lease timeout has gotten very stale and in danger of * rolling over (e.g., isolated leader). * * @return this leader's lease timeout, or null if none is established yet */ public Timestamp getLeaseTimeout() { synchronized (this.raft) { return this.leaseTimeout; } } /** * Force this leader to step down. * * @throws IllegalStateException if this role is no longer active or election timer is not running */ public void stepDown() { this.doStepDown("stepDown() invoked"); } private void doStepDown(String reason) { synchronized (this.raft) { Preconditions.checkState(this.raft.role == this, "role is no longer active"); this.info("stepping down as leader: {}", reason); this.raft.changeRole(new FollowerRole(this.raft)); } } // Lifecycle @Override void setup() { assert Thread.holdsLock(this.raft); super.setup(); if (this.log.isDebugEnabled()) this.debug("entering leader role in term {}", this.raft.currentTerm); // Generate follower list this.updateKnownFollowers(); // Append a "dummy" log entry with my current term. This allows us to advance the commit index when the last // entry in our log is from a prior term. This is needed to avoid the problem where a transaction could end up // waiting indefinitely for its log entry with a prior term number to be committed. final LogEntry logEntry; try { logEntry = this.applyNewLogEntry(new NewLogEntry(this.raft, new LogEntry.Data(new Writes(), null))); } catch (Exception e) { this.error("error attempting to apply initial log entry", e); return; } if (this.log.isDebugEnabled()) this.debug("added log entry {} to commit at the beginning of my new term", logEntry); // Rebase transactions this.rebaseTransactions(false); // Start timestamp scrub timer this.timestampScrubTimer.timeoutAfter(TIMESTAMP_SCRUB_INTERVAL); } @Override void shutdown() { assert Thread.holdsLock(this.raft); this.followerMap.values() .forEach(Follower::cleanup); this.timestampScrubTimer.cancel(); super.shutdown(); } // Service @Override void outputQueueEmpty(String address) { assert Thread.holdsLock(this.raft); // Find matching follower(s) and update them if needed this.followerMap.values() .stream() .filter(follower -> follower.getAddress().equals(address)) .iterator() .forEachRemaining(follower -> { if (this.log.isTraceEnabled()) this.trace("updating peer \"{}\" after queue empty notification", follower.getIdentity()); this.raft.requestService(follower.getUpdateService()); }); } @Override long calculateMaxAppliedDiscardIndex() { assert Thread.holdsLock(this.raft); // Calculate MIN(discardIndex) - 1 over all followers, where discardIndex = follower match index, or snapshot base index // if follower is being sent a snapshot. Applied log entries <= this index can be discarded, because we know the // follower already has them, or, in the case of a snapshot, will very likely soon have them. The "- 1" is because // we need to know the term of the entry just before the next index we send, so we need to keep one extra. long maxAppliedDiscardIndex = super.calculateMaxAppliedDiscardIndex(); for (Follower follower : this.followerMap.values()) { final SnapshotTransmit snapshotTransmit = follower.getSnapshotTransmit(); final long discardIndex = snapshotTransmit != null ? snapshotTransmit.getSnapshotIndex() : follower.getMatchIndex(); maxAppliedDiscardIndex = Math.min(maxAppliedDiscardIndex, discardIndex); } return maxAppliedDiscardIndex - 1; } /** * Update my {@code commitIndex} based on followers' {@code matchIndex}'s. * *

* This should be invoked: *

    *
  • After any log entry has been added to the log, if we have zero followers
  • *
  • After a log entry that contains a configuration change has been added to the log
  • *
  • After a follower's {@linkplain Follower#getMatchIndex match index} has advanced
  • *
*/ private void updateLeaderCommitIndex() { assert Thread.holdsLock(this.raft); // Find highest index for which a majority of cluster members have ack'd the corresponding log entry from my term final int totalCount = this.raft.currentConfig.size(); // total possible nodes final int requiredCount = totalCount / 2 + 1; // require a majority final int startingCount = this.raft.isClusterMember() ? 1 : 0; // count myself, if member long maxCommitIndex = this.raft.commitIndex; int commitCount = -1; for (long index = this.raft.commitIndex + 1; index <= this.raft.log.getLastIndex(); index++) { // Count the number of nodes (possibly including myself) that have a copy of the log entry at index final int count = startingCount + this.countFollowersWithLogEntry(index); // The log entry term must match my current term (exception: unless every node has it) final long term = this.raft.log.getTermAtIndex(index); if (count < totalCount && term != this.raft.currentTerm) continue; // Do a majority of cluster nodes have this log entry? if (count < requiredCount) { if (term >= this.raft.currentTerm) // there's no point in going further break; continue; // a later term log entry might work } // We have a winner maxCommitIndex = index; commitCount = count; } // Update commit index if it advanced if (maxCommitIndex > this.raft.commitIndex) { // Update index if (this.log.isDebugEnabled()) { this.debug("advancing commit index from {} -> {} based on {}/{} nodes having received {}", this.raft.commitIndex, maxCommitIndex, commitCount, totalCount, this.raft.log.getEntryAtIndex(maxCommitIndex)); } this.raft.commitIndex = maxCommitIndex; // Update commitables this.checkCommittables(); // Perform various service this.raft.requestService(this.checkReadyTransactionsService); this.raft.requestService(this.checkWaitingTransactionsService); this.raft.requestService(this.triggerKeyWatchesService); this.raft.requestService(this.applyCommittedLogEntriesService); // Notify all (up-to-date) followers with the updated leaderCommit this.updateAllSynchronizedFollowersNow(); // If we are no longer a member of the cluster, step down after the most recent config change is committed if (!this.raft.isClusterMember() && this.raft.commitIndex >= this.findMostRecentConfigChange()) this.doStepDown("no longer a member of my own cluster"); } } private int countFollowersWithLogEntry(long index) { assert index <= this.raft.log.getLastIndex(); // Count the number of followers (who are also cluster members) that have a copy of the log entry at the specified index int nodesWithLogEntry = 0; for (Follower follower : this.followerMap.values()) { if (follower.hasLogEntry(index)) nodesWithLogEntry++; } // Done return nodesWithLogEntry; } /** * Update my {@code leaseTimeout} based on followers' returned {@code leaderTimeout}'s. * *

* This should be invoked: *

    *
  • After a follower has replied with an {@link AppendResponse} containing a newer * {@linkplain AppendResponse#getLeaderTimestamp leader timestamp} than before
  • *
*/ private void updateLeaseTimeout() { assert Thread.holdsLock(this.raft); // Only needed when we have followers final int numFollowers = this.followerMap.size(); if (numFollowers == 0) return; // Get all cluster member leader timestamps, sorted in increasing order final Timestamp[] leaderTimestamps = new Timestamp[this.raft.currentConfig.size()]; int index = 0; if (this.raft.isClusterMember()) leaderTimestamps[index++] = new Timestamp(); // this represents my own vote for (Follower follower : this.followerMap.values()) { if (this.raft.isClusterMember(follower.getIdentity())) leaderTimestamps[index++] = follower.getLeaderTimestamp(); // note follower timestamps could be null } Arrays.sort(leaderTimestamps, Timestamp.NULL_FIRST_SORT); // // Calculate highest leaderTimeout shared by a majority of cluster members, based on sorted array: // // # nodes timestamps // ------- ---------- // 5 [ ][ ][x][x][x] 3/5 x's make a majority at index (5 - 1)/2 = 2 // 6 [ ][ ][x][x][x][x] 4/6 x's make a majority at index (6 - 1)/2 = 2 // // The minimum leaderTimeout shared by a majority of nodes is at index (leaderTimestamps.length - 1) / 2. // We then add the minimum election timeout, then subtract a little for clock drift. // final Timestamp newLeaseTimeout = leaderTimestamps[(leaderTimestamps.length + 1) / 2] .offset((int)(this.raft.minElectionTimeout * (1.0f - RaftKVDatabase.MAX_CLOCK_DRIFT) - 1)); if (Timestamp.NULL_FIRST_SORT.compare(newLeaseTimeout, this.leaseTimeout) > 0) { assert newLeaseTimeout != null; // Update my leader lease timeout if (this.log.isTraceEnabled()) this.trace("updating my lease timeout from {} -> {}", this.leaseTimeout, newLeaseTimeout); this.leaseTimeout = newLeaseTimeout; // Notify any followers who care for (Follower follower : this.followerMap.values()) { final NavigableSet timeouts = follower.getCommitLeaseTimeouts().headSet(this.leaseTimeout, true); if (!timeouts.isEmpty()) { follower.updateNow(); // notify follower so it can commit waiting transaction(s) timeouts.clear(); } } // Check for any transactions that are waiting on leader lease time this.checkWaitingTransactions(); } } /** * Scrub timestamps to avoid roll-over. * *

* This should be invoked periodically, e.g., once a day. */ private void scrubTimestamps() { assert Thread.holdsLock(this.raft); if (this.log.isTraceEnabled()) this.trace("scrubbing timestamps"); for (Follower follower : this.followerMap.values()) { final Timestamp leaderTimestamp = follower.getLeaderTimestamp(); if (leaderTimestamp != null && leaderTimestamp.isRolloverDanger()) { if (this.log.isDebugEnabled()) this.debug("scrubbing {} timestamp {}", follower, leaderTimestamp); follower.setLeaderTimestamp(null); } final Timestamp snapshotTimestamp = follower.getSnapshotTimestamp(); if (snapshotTimestamp != null && snapshotTimestamp.isRolloverDanger()) { if (this.log.isDebugEnabled()) this.debug("scrubbing {} snapshot timestamp {}", follower, snapshotTimestamp); follower.setSnapshotTimestamp(null); } for (Iterator i = follower.getCommitLeaseTimeouts().iterator(); i.hasNext(); ) { final Timestamp leaseTimestamp = i.next(); if (leaseTimestamp.isRolloverDanger()) { if (this.log.isDebugEnabled()) this.debug("scrubbing {} commit lease timestamp {}", follower, leaseTimestamp); i.remove(); } } } if (this.leaseTimeout != null && this.leaseTimeout.isRolloverDanger()) { if (this.log.isDebugEnabled()) this.debug("scrubbing leader lease timestamp {}", this.leaseTimeout); this.leaseTimeout = null; } // Restart timestamp scrub timer this.timestampScrubTimer.timeoutAfter(TIMESTAMP_SCRUB_INTERVAL); } /** * Update our list of followers to match our current configuration. * *

* This should be invoked: *

    *
  • After a log entry that contains a configuration change has been added to the log
  • *
  • When the {@linkplain Follower#getNextIndex next index} of a follower not in the current config advances
  • *
*/ private void updateKnownFollowers() { assert Thread.holdsLock(this.raft); // Compare known followers with the current config and determine who needs to be be added or removed final HashSet adds = new HashSet<>(this.raft.currentConfig.keySet()); adds.removeAll(this.followerMap.keySet()); adds.remove(this.raft.identity); final HashSet dels = new HashSet<>(this.followerMap.keySet()); dels.removeAll(this.raft.currentConfig.keySet()); // Keep around a follower after its removal until it receives the config change that removed it for (Follower follower : this.followerMap.values()) { // Is this follower scheduled for deletion? final String peer = follower.getIdentity(); if (!dels.contains(peer)) continue; // Find the most recent log entry containing a config change in which the follower was removed final String node = follower.getIdentity(); final long index = this.findMostRecentConfigChangeMatching( configChange -> configChange[0].equals(node) && configChange[1] == null); // If follower has not received that log entry yet, keep on updating them until they do if (follower.getMatchIndex() < index) dels.remove(peer); } // Add new followers for (String peer : adds) { final String address = this.raft.currentConfig.get(peer); final Follower follower = new Follower(this, peer, address, this.raft.log.getLastIndex()); if (this.log.isDebugEnabled()) this.debug("adding new follower \"{}\" at {}", peer, address); this.followerMap.put(peer, follower); follower.updateNow(); // schedule an immediate update } // Remove old followers for (String peer : dels) { final Follower follower = this.followerMap.remove(peer); if (this.log.isDebugEnabled()) this.debug("removing old follower \"{}\"", peer); follower.cleanup(); } } /** * Check whether a follower needs an update and send one if so. * *

* This should be invoked: *

    *
  • After a new follower has been added
  • *
  • When the output queue for a follower goes from non-empty to empty
  • *
  • After the follower's {@linkplain Follower#getUpdateTimer update timer} has expired
  • *
  • After a new log entry has been added to the log (all followers)
  • *
  • After receiving an {@link AppendResponse} that caused the follower's * {@linkplain Follower#getNextIndex next index} to change
  • *
  • After receiving the first positive {@link AppendResponse} to a probe
  • *
  • After our {@code commitIndex} has advanced (all followers)
  • *
  • After our {@code leaseTimeout} has advanced past one or more of a follower's * {@linkplain Follower#getCommitLeaseTimeouts commit lease timeouts} (with update timer reset)
  • *
  • After sending a {@link CommitResponse} with a non-null {@linkplain CommitResponse#getCommitLeaderLeaseTimeout * commit leader lease timeout} (all followers) to probe for updated leader timestamps
  • *
  • After starting, aborting, or completing a snapshot install for a follower
  • *
*/ void updateFollower(Follower follower) { // Sanity check assert Thread.holdsLock(this.raft); // If follower has an in-progress snapshot that has become too stale, abort it final String peer = follower.getIdentity(); SnapshotTransmit snapshotTransmit = follower.getSnapshotTransmit(); if (snapshotTransmit != null && snapshotTransmit.getAge() > MAX_SNAPSHOT_AGE) { if (this.raft.isPerfLogEnabled()) this.perfLog("aborting stale snapshot install for {} (age {}ms)", follower, snapshotTransmit.getAge()); follower.cancelSnapshotTransmit(); follower.updateNow(); } // Is follower's queue empty? If not, hold off until then if (this.raft.isTransmitting(follower.getAddress())) { if (this.log.isTraceEnabled()) this.trace("no update for \"{}\": output queue still not empty", peer); return; } // Handle any in-progress snapshot install if ((snapshotTransmit = follower.getSnapshotTransmit()) != null) { // Send the next chunk in transmission, if any final long pairIndex = snapshotTransmit.getPairIndex(); final ByteBuffer chunk = snapshotTransmit.getNextChunk(); boolean synced = true; if (chunk != null) { // Send next chunk final InstallSnapshot msg = new InstallSnapshot(this.raft.clusterId, this.raft.identity, peer, this.raft.currentTerm, snapshotTransmit.getSnapshotTerm(), snapshotTransmit.getSnapshotIndex(), pairIndex, pairIndex == 0 ? snapshotTransmit.getSnapshotConfig() : null, !snapshotTransmit.hasMoreChunks(), chunk); if (this.raft.sendMessage(msg)) { follower.setSnapshotTimestamp(new Timestamp()); return; } if (this.raft.isPerfLogEnabled()) this.perfLog("canceling snapshot install for {} due to failure to send {}", follower, msg); // Message failed -> snapshot is fatally wounded, so cancel it synced = false; } if (synced) { if (this.raft.isPerfLogEnabled()) this.perfLog("completed snapshot install for out-of-date {}", follower); } // Snapshot transmit is complete (or failed) follower.cancelSnapshotTransmit(); // Trigger an immediate regular update follower.setNextIndex(snapshotTransmit.getSnapshotIndex() + 1); follower.setSynced(synced); follower.updateNow(); this.raft.requestService(follower.getUpdateService()); return; } // Are we still waiting for the update timer to expire? if (!follower.getUpdateTimer().pollForTimeout()) { boolean waitForTimerToExpire = true; // Don't wait for the update timer to expire if: // (a) The follower is sync'd; AND // (y) We have a new log entry that the follower doesn't have; OR // (y) We have a new leaderCommit that the follower doesn't have // The effect is that we will pipeline updates to synchronized followers. if (follower.isSynced() && (follower.getLeaderCommit() != this.raft.commitIndex || follower.getNextIndex() <= this.raft.log.getLastIndex())) waitForTimerToExpire = false; // Wait for timer to expire if (waitForTimerToExpire) { if (this.log.isTraceEnabled()) { this.trace("no update for \"{}\": timer not expired yet, and follower is {}", follower.getIdentity(), follower.isSynced() ? "up to date" : "not synced"); } return; } } // Get index of the next log entry to send to follower final long nextIndex = follower.getNextIndex(); assert nextIndex >= 1 && nextIndex <= this.raft.log.getLastIndex() + 1; // Get the log entry to send, if we have it LogEntry logEntry = this.raft.log.getEntryAtIndexIfKnown(nextIndex); // In order to send the log entry (or a probe), we need to know the previous log entry's term final long previousIndex = nextIndex - 1; final long previousTerm = this.raft.log.getTermAtIndexIfKnown(previousIndex); // If the follower is so far behind that we no longer have the information it needs, we must do a snapshot install if ((logEntry == null || previousTerm == 0) && nextIndex <= this.raft.log.getLastAppliedIndex()) { final MostRecentView view = new MostRecentView(this.raft, this.raft.commitIndex); follower.setSnapshotTransmit(new SnapshotTransmit(view.getTerm(), view.getIndex(), view.getConfig(), view.getSnapshot(), view.getView())); if (this.raft.isPerfLogEnabled()) { this.perfLog("started snapshot install for out-of-date {} with nextIndex {} <= {}", follower, nextIndex, this.raft.log.getLastAppliedIndex()); } follower.getSkipDataLogEntries().clear(); // avoid memory leak if snapshot leapfrogs follower log entries this.raft.requestService(follower.getUpdateService()); return; } // It must be the case that previousTerm is known now, because lastAppliedIndex <= previousIndex <= lastIndex assert previousTerm > 0; assert previousIndex > 0; // Restart update timer here (to avoid looping if an error occurs below) follower.getUpdateTimer().timeoutAfter(this.raft.heartbeatTimeout); // Send actual data if follower is synced and there is a log entry to send; otherwise, just send a probe final AppendRequest msg; if (!follower.isSynced() || logEntry == null) { // Create probe msg = new AppendRequest(this.raft.clusterId, this.raft.identity, peer, this.raft.currentTerm, new Timestamp(), this.leaseTimeout, this.raft.commitIndex, previousTerm, previousIndex); } else { // If the log entry correspond's to follower's transaction, don't send the data because follower already has it. // But only do this optimization the first time, in case something goes wrong on the follower's end. ByteBuffer mutationData = null; if (!follower.getSkipDataLogEntries().remove(logEntry)) { try { mutationData = logEntry.getContent(); } catch (IOException e) { this.error("error reading log file {}", logEntry.getFile(), e); return; } } // Create message msg = new AppendRequest(this.raft.clusterId, this.raft.identity, peer, this.raft.currentTerm, new Timestamp(), this.leaseTimeout, this.raft.commitIndex, previousTerm, previousIndex, logEntry.getTerm(), mutationData); } // Send update final boolean sent = this.raft.sendMessage(msg); // Advance next index if a log entry was sent; we allow pipelining log entries when synchronized if (sent && !msg.isProbe()) { assert follower.isSynced(); follower.setNextIndex(Math.min(follower.getNextIndex(), this.raft.log.getLastIndex()) + 1); } // Update the leaderCommit we sent to the follower if (sent) follower.setLeaderCommit(msg.getLeaderCommit()); } private void updateAllSynchronizedFollowersNow() { assert Thread.holdsLock(this.raft); this.followerMap.values() .stream() .filter(Follower::isSynced) .iterator() .forEachRemaining(Follower::updateNow); } // Transactions @Override void handleLinearizableReadOnlyChange(RaftKVTransaction tx) { // Sanity check super.handleLinearizableReadOnlyChange(tx); // Set commit info based on what we currently know as "up-to-date" if (!tx.hasCommitInfo()) { tx.setCommitInfo(this.raft.log.getLastTerm(), this.raft.log.getLastIndex(), this.getCurrentCommitMinLeaseTimeout()); this.checkCommittable(tx); } } @Override void checkReadyTransactionNeedingCommitInfo(RaftKVTransaction tx) { // Sanity check super.checkReadyTransactionNeedingCommitInfo(tx); // Handle (effectively) read-only transactions if (!tx.addsLogEntry()) { // Does it already have commit information? if (tx.hasCommitInfo()) { this.advanceReadyTransaction(tx); return; } // Set commit info based on what we currently know as "up-to-date" and proceed this.advanceReadyTransactionWithCommitInfo(tx, this.raft.log.getLastTerm(), this.raft.log.getLastIndex(), this.getCurrentCommitMinLeaseTimeout()); return; } // Must be a read-write transaction that's fully rebased assert !tx.isReadOnly(); assert tx.isRebasable() : "fail tx " + tx; assert !tx.isCommittable(); assert !tx.hasCommitInfo(); assert this.checkRebasableAndCommittableUpToDate(tx); // If a config change is involved, check whether we can safely apply it if (tx.getConfigChange() != null && !this.mayApplyNewConfigChange()) return; // We must check for a high priority TX conflict, and rebase the high priority TX, atomically, so setup locking final boolean needHighPriorityCheck = this.raft.highPrioTx != null && this.raft.highPrioTx != tx; synchronized (needHighPriorityCheck ? this.raft.highPrioTx.view : this.raft) { // Check for the existence of a conflicting high priority TX if (needHighPriorityCheck) { final String conflictMsg = this.checkHighPriorityConflict(tx.view.getWrites(), this.raft.dumpConflicts ? "local tx " + tx : null); if (conflictMsg != null) throw new RetryKVTransactionException(tx, conflictMsg); } // Commit transaction as a new log entry final LogEntry logEntry; try { logEntry = this.applyNewLogEntry(new NewLogEntry(tx)); } catch (IllegalStateException e) { throw new RetryKVTransactionException(tx, e.getMessage()); } catch (Exception e) { throw new KVTransactionException(tx, "error attempting to persist transaction", e); } if (this.log.isDebugEnabled()) this.debug("added log entry {} for local transaction {}", logEntry, tx); // Update transaction this.advanceReadyTransactionWithCommitInfo(tx, logEntry.getTerm(), logEntry.getIndex(), null); // Rebase transactions this.rebaseTransactions(needHighPriorityCheck); } } // Determine whether it's safe to append a log entry with a configuration change private boolean mayApplyNewConfigChange() { assert Thread.holdsLock(this.raft); // Rule #1: this leader must have committed at least one log entry in this term assert this.raft.commitIndex >= this.raft.log.getLastAppliedIndex(); if (this.raft.log.getTermAtIndex(this.raft.commitIndex) < this.raft.currentTerm) return false; // Rule #2: there must be no previous config change that is still uncommitted for (long index = this.raft.commitIndex + 1; index <= this.raft.log.getLastIndex(); index++) { if (this.raft.log.getEntryAtIndex(index).getConfigChange() != null) return false; } // OK return true; } @Override Timestamp getLeaderLeaseTimeout() { return this.leaseTimeout; } /** * Get the minimum future leader timestamp required before we will know that our last log entry is up-to-date as of now. * If we already know that it's up-to-date as of now (because our lease currently extends into the future), return null. */ private Timestamp getCurrentCommitMinLeaseTimeout() { return this.followerMap.isEmpty() || this.isLeaderLeaseActiveNow() ? null : new Timestamp(); } /** * Given a possible new transaction to commit, check for the existence of a high priority with which it conflicts. */ private String checkHighPriorityConflict(Writes writes, String dumpDescription) { // Sanity check assert Thread.holdsLock(this.raft); assert this.raft.highPrioTx != null; assert Thread.holdsLock(this.raft.highPrioTx.view); // Check for conflict final Reads reads = this.raft.highPrioTx.view.getReads(); final Conflict conflict = reads.findConflict(writes); if (conflict == null) return null; // Report conflicts if (dumpDescription != null) { this.dumpConflicts(reads, writes, dumpDescription + " fails due to conflicts with high priority transaction " + this.raft.highPrioTx); } // Fail return "transaction conflicts with a high priority transaction: " + conflict; } // Message @Override void caseAppendRequest(AppendRequest msg, NewLogEntry newLogEntry) { assert Thread.holdsLock(this.raft); this.failDuplicateLeader(msg); } @Override void caseAppendResponse(AppendResponse msg) { assert Thread.holdsLock(this.raft); // Find follower final Follower follower = this.findFollower(msg); if (follower == null) return; // Update follower's last rec'd leader timestamp if (follower.getLeaderTimestamp() == null || msg.getLeaderTimestamp().compareTo(follower.getLeaderTimestamp()) > 0) { follower.setLeaderTimestamp(msg.getLeaderTimestamp()); this.raft.requestService(this.updateLeaseTimeoutService); } // Ignore if a snapshot install is in progress if (follower.getSnapshotTransmit() != null) { if (this.log.isTraceEnabled()) this.trace("rec'd {} while sending snapshot install; ignoring", msg); return; } // Ignore a response to a request that was sent prior to the most resent snapshot install if (follower.getSnapshotTimestamp() != null && msg.getLeaderTimestamp().compareTo(follower.getSnapshotTimestamp()) < 0) { if (this.log.isTraceEnabled()) this.trace("rec'd {} sent prior to snapshot install; ignoring", msg); return; } // Flag indicating we might want to update follower when done boolean updateFollowerAgain = false; // Update follower's match index if (msg.getMatchIndex() > follower.getMatchIndex()) { follower.setMatchIndex(msg.getMatchIndex()); this.raft.requestService(this.updateLeaderCommitIndexService); if (!this.raft.isClusterMember(follower.getIdentity())) this.raft.requestService(this.updateKnownFollowersService); } // Check result and update follower's next index final boolean wasSynced = follower.isSynced(); final long previousNextIndex = follower.getNextIndex(); if (!msg.isSuccess()) follower.setNextIndex(Math.max(follower.getNextIndex() - 1, 1)); follower.setSynced(msg.isSuccess()); if (follower.isSynced() != wasSynced) updateFollowerAgain = true; // Use follower's match index as a lower bound on follower's next index. follower.setNextIndex(Math.max(follower.getNextIndex(), follower.getMatchIndex() + 1)); // Use follower's last log index as an upper bound on follower's next index. follower.setNextIndex(Math.min(msg.getLastLogIndex() + 1, follower.getNextIndex())); // Update follower again if next index has changed updateFollowerAgain |= follower.getNextIndex() != previousNextIndex; // Debug if (this.log.isTraceEnabled()) this.trace("updated follower: {}, update again = {}", follower, updateFollowerAgain); // Immediately update follower again (if appropriate) if (updateFollowerAgain) this.raft.requestService(follower.getUpdateService()); } @Override void caseCommitRequest(CommitRequest msg, NewLogEntry newLogEntry) { assert Thread.holdsLock(this.raft); // Find follower final Follower follower = this.findFollower(msg); if (follower == null) return; // Decode reads, if any, and check for conflicts final ByteBuffer readsData = msg.getReadsData(); if (readsData != null) { // Decode reads final Reads reads; try { reads = new Reads(new ByteBufferInputStream(msg.getReadsData())); } catch (Exception e) { this.error("error decoding reads data in {}", msg, e); this.raft.sendMessage(new CommitResponse(this.raft.clusterId, this.raft.identity, msg.getSenderId(), this.raft.currentTerm, msg.getTxId(), "error decoding reads data: " + e)); return; } // Check for conflict final String conflictMsg = this.checkConflicts(msg.getBaseTerm(), msg.getBaseIndex(), reads, this.raft.dumpConflicts ? msg.getSenderId() + " txId=" + msg.getTxId() : null); if (conflictMsg != null) { if (this.log.isDebugEnabled()) this.debug("commit request {} failed due to conflict: {}", msg, conflictMsg); this.raft.sendMessage(new CommitResponse(this.raft.clusterId, this.raft.identity, msg.getSenderId(), this.raft.currentTerm, msg.getTxId(), conflictMsg)); return; } } // Handle read-only vs. read-write transaction if (msg.isReadOnly()) { assert newLogEntry == null; // Determine our minimum lease timeout before we can know for sure that we are up-to-date, if not already final Timestamp minimumLeaseTimeout = this.getCurrentCommitMinLeaseTimeout(); // If there is a minimum lease timeout requirement, try to advance our lease timeout if (minimumLeaseTimeout != null) { // Remember that this follower is now going to be waiting for this particular leaseTimeout follower.getCommitLeaseTimeouts().add(minimumLeaseTimeout); // Send immediate probes to all (up-to-date) followers in an attempt to increase our leaseTimeout quickly this.updateAllSynchronizedFollowersNow(); } // Send response with commit term+index set from our last log entry this.raft.sendMessage(new CommitResponse(this.raft.clusterId, this.raft.identity, msg.getSenderId(), this.raft.currentTerm, msg.getTxId(), this.raft.log.getLastTerm(), this.raft.log.getLastIndex(), minimumLeaseTimeout)); } else { assert newLogEntry != null; // If the client is requesting a config change, we could check for an outstanding config change now and if so // delay our response until it completes, but that's not worth the trouble. Instead, applyNewLogEntry() will // throw an exception and the client will just just have to retry the transaction. Config changes are rare. // We must check for a high priority TX conflict, and rebase the high priority TX, atomically, so setup locking final LogEntry logEntry; final boolean needHighPriorityCheck = this.raft.highPrioTx != null; synchronized (needHighPriorityCheck ? this.raft.highPrioTx.view : this.raft) { // Check for the existence of a conflicting high priority TX if (needHighPriorityCheck) { final String conflictMsg = this.checkHighPriorityConflict(newLogEntry.getData().getWrites(), this.raft.dumpConflicts ? "commit request " + msg : null); if (conflictMsg != null) { this.raft.sendMessage(new CommitResponse(this.raft.clusterId, this.raft.identity, msg.getSenderId(), this.raft.currentTerm, msg.getTxId(), conflictMsg)); return; } } // Commit mutations as a new log entry try { logEntry = this.applyNewLogEntry(newLogEntry); } catch (Exception e) { if (!(e instanceof IllegalStateException)) this.error("error appending new log entry for {}", msg, e); else if (this.log.isDebugEnabled()) this.debug("error appending new log entry for {}: {}", msg, e.toString()); this.raft.sendMessage(new CommitResponse(this.raft.clusterId, this.raft.identity, msg.getSenderId(), this.raft.currentTerm, msg.getTxId(), e.getMessage() != null ? e.getMessage() : "" + e)); return; } if (this.log.isDebugEnabled()) this.debug("added log entry {} for rec'd {}", logEntry, msg); // Rebase transactions this.rebaseTransactions(needHighPriorityCheck); } // Follower transaction data optimization follower.getSkipDataLogEntries().add(logEntry); // Send response this.raft.sendMessage(new CommitResponse(this.raft.clusterId, this.raft.identity, msg.getSenderId(), this.raft.currentTerm, msg.getTxId(), logEntry.getTerm(), logEntry.getIndex())); } } @Override void caseCommitResponse(CommitResponse msg) { assert Thread.holdsLock(this.raft); this.failDuplicateLeader(msg); } @Override void caseInstallSnapshot(InstallSnapshot msg) { assert Thread.holdsLock(this.raft); this.failDuplicateLeader(msg); } @Override void caseRequestVote(RequestVote msg) { assert Thread.holdsLock(this.raft); // Too late dude, I already won the election if (this.log.isDebugEnabled()) this.debug("ignoring {} rec'd while in {}", msg, this); } @Override void caseGrantVote(GrantVote msg) { assert Thread.holdsLock(this.raft); // Thanks and all, but I already won the election if (this.log.isDebugEnabled()) this.debug("ignoring {} rec'd while in {}", msg, this); } private void failDuplicateLeader(Message msg) { assert Thread.holdsLock(this.raft); // This should never happen - same term but two different leaders final boolean defer = this.raft.identity.compareTo(msg.getSenderId()) <= 0; this.error("detected a duplicate leader in {} - should never happen;" + " possible inconsistent cluster configuration on {} (mine: {}); {}", msg, msg.getSenderId(), this.raft.currentConfig, defer ? "reverting to follower" : "ignoring"); if (defer) this.raft.changeRole(new FollowerRole(this.raft, msg.getSenderId(), this.raft.returnAddress)); } // Object @Override public String toString() { synchronized (this.raft) { return this.toStringPrefix() + ",followerMap=" + this.followerMap + "]"; } } // Debug @Override boolean checkState() { assert Thread.holdsLock(this.raft); for (Follower follower : this.followerMap.values()) { assert follower.getNextIndex() >= 1; assert follower.getNextIndex() <= this.raft.log.getLastIndex() + 1; assert follower.getMatchIndex() <= this.raft.log.getLastIndex() + 1; assert follower.getLeaderCommit() <= this.raft.commitIndex; assert follower.getUpdateTimer().isRunning() || follower.getSnapshotTransmit() != null; } assert this.timestampScrubTimer.isRunning(); return true; } // Internal methods /** * Find the index of the most recent unapplied log entry having an associated config change. * * @return most recent config change log entry, or zero if none found */ private long findMostRecentConfigChange() { return this.findMostRecentConfigChangeMatching(configChange -> true); } /** * Find the index of the most recent unapplied log entry having an associated config change matching the given predicate. * * @return most recent matching log entry, or zero if none found */ private long findMostRecentConfigChangeMatching(Predicate predicate) { assert Thread.holdsLock(this.raft); for (long index = this.raft.log.getLastIndex(); index > this.raft.log.getLastAppliedIndex(); index--) { final String[] configChange = this.raft.log.getEntryAtIndex(index).getConfigChange(); if (configChange != null && predicate.test(configChange)) return index; } return 0; } /** * Apply a new log entry to the Raft log. * * @throws IllegalStateException if a config change would not be safe at the current time * @throws IllegalArgumentException if the config change attempts to remove the last node * @throws IOException if there was a disk error whilst persisting the new log entry */ private LogEntry applyNewLogEntry(NewLogEntry newLogEntry) throws IOException { assert Thread.holdsLock(this.raft); // Do a couple of extra checks if a config change is included final String[] configChange = newLogEntry.getData().getConfigChange(); if (configChange != null) { // If a config change is involved, check whether we can safely apply it if (!this.mayApplyNewConfigChange()) throw new IllegalStateException("config change cannot be safely applied at this time"); // Disallow a configuration change that removes the last node in a cluster if (this.raft.currentConfig.size() == 1 && configChange[1] == null) { final String lastNode = this.raft.currentConfig.keySet().iterator().next(); if (configChange[0].equals(lastNode)) { throw new IllegalArgumentException(String.format( "can't remove the last node in a cluster (\"%s\")", lastNode)); } } } // Append new log entry to the Raft log final LogEntry logEntry = this.raft.appendLogEntry(this.raft.currentTerm, newLogEntry); // Update follower list if configuration changed if (configChange != null) this.raft.requestService(this.updateKnownFollowersService); // Update commit index (this is only needed if config has changed, or in the single node case) if (configChange != null || this.followerMap.isEmpty()) this.raft.requestService(this.updateLeaderCommitIndexService); // Immediately update all up-to-date followers this.updateAllSynchronizedFollowersNow(); // Done return logEntry; } /** * Check whether a proposed transaction can commit without any MVCC conflict. * * @param baseTerm the term of the log entry on which the transaction is based * @param baseIndex the index of the log entry on which the transaction is based * @param reads reads performed by the transaction * @param dumpDesc description used in conflict dump, or null for none * @return error message on failure, null for success */ private String checkConflicts(long baseTerm, long baseIndex, Reads reads, String dumpDesc) { assert Thread.holdsLock(this.raft); // Check if the base index is too high final long maxIndex = this.raft.log.getLastIndex(); if (baseIndex > maxIndex) return "transaction base index " + baseIndex + " > most recent log index " + maxIndex; // Validate the term of the log entry on which the transaction is based final long baseIndexActualTerm = this.raft.log.getTermAtIndexIfKnown(baseIndex); if (baseIndexActualTerm == 0) { return "transaction base index " + baseIndex + " < first index " + this.raft.log.getFirstIndex() + " for which the term is known"; } if (baseTerm != baseIndexActualTerm) { return "transaction is based on an overwritten log entry with index " + baseIndex + " and term " + baseTerm + " != " + baseIndexActualTerm; } // Check for conflicts from intervening commits for (long index = baseIndex + 1; index <= maxIndex; index++) { final LogEntry logEntry = this.raft.log.getEntryAtIndexIfKnown(index); assert logEntry != null; try { final Conflict conflict = reads.findConflict(logEntry.getMutations()); if (conflict != null) { if (dumpDesc != null) this.dumpConflicts(reads, logEntry.getMutations(), dumpDesc + " fails due to conflicts with " + logEntry); return "writes of committed transaction at index " + index + " conflict with transaction reads from transaction base index " + baseIndex + ": " + conflict; } } catch (IOException e) { this.error("error during conflict check", e); return "error during conflict check: " + e; } } // No conflict return null; } private Follower findFollower(Message msg) { assert Thread.holdsLock(this.raft); final Follower follower = this.followerMap.get(msg.getSenderId()); if (follower == null) this.warn("rec'd {} from unknown follower \"{}\", ignoring", msg, msg.getSenderId()); return follower; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy