All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.jsimpledb.kv.raft.FollowerRole Maven / Gradle / Ivy

Go to download

JSimpleDB distributed, linearizable ACID compliant key/value store implementation based on the Raft consensus algorithm.

There is a newer version: 3.6.1
Show newest version

/*
 * Copyright (C) 2015 Archie L. Cobbs. All rights reserved.
 */

package org.jsimpledb.kv.raft;

import java.io.File;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import javax.annotation.concurrent.GuardedBy;

import org.dellroad.stuff.io.ByteBufferOutputStream;
import org.dellroad.stuff.util.LongMap;
import org.jsimpledb.kv.KVTransactionException;
import org.jsimpledb.kv.RetryTransactionException;
import org.jsimpledb.kv.mvcc.Reads;
import org.jsimpledb.kv.mvcc.Writes;
import org.jsimpledb.kv.raft.msg.AppendRequest;
import org.jsimpledb.kv.raft.msg.AppendResponse;
import org.jsimpledb.kv.raft.msg.CommitRequest;
import org.jsimpledb.kv.raft.msg.CommitResponse;
import org.jsimpledb.kv.raft.msg.GrantVote;
import org.jsimpledb.kv.raft.msg.InstallSnapshot;
import org.jsimpledb.kv.raft.msg.Message;
import org.jsimpledb.kv.raft.msg.PingRequest;
import org.jsimpledb.kv.raft.msg.PingResponse;
import org.jsimpledb.kv.raft.msg.RequestVote;

/**
 * Raft follower role.
 */
public class FollowerRole extends NonLeaderRole {

    // Note: all fields below are guarded by the RaftKVDatabase monitor ("raft")

    /** Identity of our leader, or null if not known. */
    @GuardedBy("raft")
    private String leader;

    /** Network address of our leader; the constructor asserts this is non-null whenever {@link #leader} is non-null. */
    @GuardedBy("raft")
    private String leaderAddress;

    /** The candidate we voted for in this term, or null if none. */
    @GuardedBy("raft")
    private String votedFor;

    /** In-progress snapshot install, or null if there is none. */
    @GuardedBy("raft")
    private SnapshotReceive snapshotReceive;

    /** Transactions for which a CommitRequest was sent and we are waiting for the CommitResponse from the leader. */
    @GuardedBy("raft")
    private final HashSet<RaftKVTransaction> commitRequests = new HashSet<>();

    /** Pending commit writes, keyed by transaction ID; these wait for an AppendRequest with null data. */
    @GuardedBy("raft")
    private final LongMap<PendingWrite> pendingWrites = new LongMap<>();

    /** Time of the most recently received AppendRequest, or null if none received yet. */
    @GuardedBy("raft")
    private Timestamp lastLeaderMessageTime;

    /** Latest leader lease timeout received, or null if none. */
    @GuardedBy("raft")
    private Timestamp leaderLeaseTimeout;

    /** Per-peer ping timestamps, keyed by peer identity; non-null only while probing for a majority. */
    @GuardedBy("raft")
    private HashMap<String, Timestamp> probeTimestamps;

// Constructors

    /**
     * Create a follower with no known leader that has not voted for anyone.
     *
     * <p>
     * Delegates to the four-argument constructor with null leader, leader address, and vote.
     *
     * @param raft the associated database
     */
    FollowerRole(RaftKVDatabase raft) {
        this(raft, null, null, null);
    }

    /**
     * Create a follower with the given leader, recording that same leader as the node voted for.
     *
     * <p>
     * Delegates to the four-argument constructor, passing {@code leader} as {@code votedFor}.
     *
     * @param raft the associated database
     * @param leader leader identity
     * @param leaderAddress leader network address
     */
    FollowerRole(RaftKVDatabase raft, String leader, String leaderAddress) {
        this(raft, leader, leaderAddress, leader);
    }

    /**
     * Primary constructor: initializes the leader identity, leader address, and recorded vote.
     *
     * <p>
     * The result of {@code raft.isClusterMember()} is handed to the superclass constructor.
     *
     * @param raft the associated database
     * @param leader leader identity, or null if not known
     * @param leaderAddress leader network address; asserted non-null whenever {@code leader} is non-null
     * @param votedFor identity of the node voted for this term, or null if none
     */
    FollowerRole(RaftKVDatabase raft, String leader, String leaderAddress, String votedFor) {
        super(raft, raft.isClusterMember());

        // Record initial state
        this.votedFor = votedFor;
        this.leaderAddress = leaderAddress;
        this.leader = leader;

        // A known leader must come with its address
        assert this.leader == null || this.leaderAddress != null;
    }

// Status

    /**
     * Return the identity of the node this follower currently considers its leader.
     *
     * @return the leader's identity, or null if no leader is known
     */
    public String getLeaderIdentity() {
        final String currentLeader;
        synchronized (this.raft) {
            currentLeader = this.leader;
        }
        return currentLeader;
    }

    /**
     * Return the network address of the node this follower currently considers its leader.
     *
     * @return the leader's address, or null if no leader is known
     */
    public String getLeaderAddress() {
        final String currentAddress;
        synchronized (this.raft) {
            currentAddress = this.leaderAddress;
        }
        return currentAddress;
    }

    /**
     * Return the identity of the node that received this node's vote in the current term.
     *
     * @return identity of the node voted for, or null if this node has not voted
     */
    public String getVotedFor() {
        final String vote;
        synchronized (this.raft) {
            vote = this.votedFor;
        }
        return vote;
    }

    /**
     * Report whether a whole database snapshot download is currently being received by this node.
     *
     * @return true if a snapshot install is in progress, otherwise false
     */
    public boolean isInstallingSnapshot() {
        final boolean installing;
        synchronized (this.raft) {
            installing = this.snapshotReceive != null;
        }
        return installing;
    }

    /**
     * Determine how many nodes this node has successfully probed while probing for a majority of
     * nodes with {@link PingRequest}s prior to reverting to a candidate.
     *
     * <p>
     * The count includes this node itself when it is a cluster member.
     *
     * @return the number of nodes successfully probed (including this node, if a cluster member),
     *  or -1 if this node is not currently probing
     */
    public int getNodesProbed() {
        synchronized (this.raft) {
            if (this.probeTimestamps == null)
                return -1;
            return this.calculateProbedNodes();
        }
    }

// Probing mode

    /**
     * Count the nodes that have been successfully probed recently, discarding stale probe results.
     *
     * <p>
     * This node counts itself if it is a cluster member. Any recorded probe timestamp whose offset
     * from now is at least {@code raft.maxElectionTimeout} is removed from {@link #probeTimestamps}
     * and not counted.
     *
     * <p>
     * The caller must hold the {@code raft} lock and probing must be active.
     *
     * @return number of nodes currently counted as probed
     */
    private int calculateProbedNodes() {
        assert Thread.holdsLock(this.raft);
        assert this.probeTimestamps != null;
        int numProbed = this.raft.isClusterMember() ? 1 : 0;
        final Timestamp now = new Timestamp();

        // Use an explicit, typed iterator so that stale entries can be removed while iterating
        for (Iterator<Timestamp> i = this.probeTimestamps.values().iterator(); i.hasNext(); ) {
            final Timestamp timestamp = i.next();
            if (now.offsetFrom(timestamp) >= this.raft.maxElectionTimeout) {                // timestamp is too old, discard
                i.remove();
                continue;
            }
            numProbed++;
        }
        return numProbed;
    }

// Lifecycle

    /**
     * Perform superclass setup, then log (at debug level) the term, leader, and vote with which
     * this follower role is being entered.
     *
     * <p>
     * The caller must hold the {@code raft} lock.
     */
    @Override
    void setup() {
        assert Thread.holdsLock(this.raft);
        super.setup();

        // Nothing more to do unless debug logging is on
        if (!this.log.isDebugEnabled())
            return;

        // Describe our starting state
        final StringBuilder buf = new StringBuilder();
        buf.append("entering follower role in term ").append(this.raft.currentTerm);
        if (this.leader != null)
            buf.append("; with leader \"").append(this.leader).append("\" at ").append(this.leaderAddress);
        if (this.votedFor != null)
            buf.append("; having voted for \"").append(this.votedFor).append("\"");
        this.debug(buf.toString());
    }

    /**
     * Release follower-specific state when leaving this role.
     *
     * <p>
     * In order: aborts any in-progress snapshot install; fails every open transaction that adds a log
     * entry and is still awaiting a CommitResponse; discards all outstanding commit requests and
     * pending writes; and finally invokes the superclass shutdown.
     *
     * <p>
     * The caller must hold the {@code raft} lock.
     */
    @Override
    void shutdown() {
        assert Thread.holdsLock(this.raft);

        // Abandon a partially received snapshot, if any
        final boolean snapshotInProgress = this.snapshotReceive != null;
        if (snapshotInProgress) {
            if (this.log.isDebugEnabled())
                this.debug("aborting snapshot install due to leaving follower role");
            this.raft.discardFlipFloppedStateMachine();
            this.snapshotReceive = null;
        }

        // Read/write transactions still waiting on the leader's CommitResponse cannot be salvaged: their reads
        // have already been discarded, so a new leader would have nothing to conflict-check them against.
        // We iterate over a copy of the open transactions (presumably because failing one alters the original).
        for (RaftKVTransaction tx : new ArrayList<>(this.raft.openTransactions.values())) {
            if (!this.commitRequests.contains(tx) || !tx.addsLogEntry())
                continue;
            assert !tx.isRebasable();
            assert tx.getState().equals(TxState.COMMIT_READY);
            final RetryTransactionException failure
              = new RetryTransactionException(tx, "leader was deposed before commit response received");
            this.raft.fail(tx, failure);
        }

        // Forget outstanding commit requests
        this.commitRequests.clear();

        // Clean up and forget pending commit writes
        this.pendingWrites.values().forEach(PendingWrite::cleanup);
        this.pendingWrites.clear();

        // Let the superclass finish
        super.shutdown();
    }

// Service

    /**
     * React to the output queue for the given address becoming empty: if the address is our leader's,
     * request a run of the service that checks ready transactions.
     *
     * <p>
     * The caller must hold the {@code raft} lock.
     *
     * @param address the network address whose output queue is now empty
     */
    @Override
    void outputQueueEmpty(String address) {
        assert Thread.holdsLock(this.raft);

        // Only the leader's queue is of interest
        if (!address.equals(this.leaderAddress))
            return;
        this.raft.requestService(this.checkReadyTransactionsService);       // TODO: track specific transactions
    }

    /**
     * Handle expiration of the election timer.
     *
     * <p>
     * The current leader is forgotten. If follower probing is disabled, this node converts immediately
     * into a candidate. Otherwise this node starts (or continues) probing: it sends a {@link PingRequest}
     * to every other node in the current config, restarts the election timer (which now serves as a
     * ping timer), and then checks whether a majority has already been probed.
     *
     * <p>
     * The caller must hold the {@code raft} lock.
     */
    @Override
    void handleElectionTimeout() {
        assert Thread.holdsLock(this.raft);

        // Invalidate current leader
        this.leader = null;
        this.leaderAddress = null;

        // Is probing enabled? If not convert immediately into a candidate
        if (!this.raft.followerProbingEnabled) {
            if (this.log.isDebugEnabled())
                this.debug("follower election timeout: probing is disabled, so converting immediately to candidate");
            this.raft.changeRole(new CandidateRole(this.raft));
            return;
        }

        // If not probing, start probing; if already probing, then we never heard from a majority, so keep on trying
        if (this.probeTimestamps == null) {
            if (this.log.isDebugEnabled())
                this.debug("follower election timeout: attempting to probe a majority before becoming candidate");

            // Size the map for the other nodes; clamp at zero because HashMap rejects a negative
            // initial capacity, which "size() - 1" would produce if the config were ever empty here
            this.probeTimestamps = new HashMap<>(Math.max(0, this.raft.currentConfig.size() - 1));
        }

        // Send out a(nother) round of probes to all other nodes
        final Timestamp now = new Timestamp();
        for (String peer : this.raft.currentConfig.keySet()) {
            if (peer.equals(this.raft.identity))
                continue;
            this.raft.sendMessage(new PingRequest(this.raft.clusterId, this.raft.identity, peer, this.raft.currentTerm, now));
        }

        // Restart election timeout (now it's really a ping timer)
        this.restartElectionTimer();

        // Handle the case where I am the only node
        this.checkProbeResult();
    }

    /**
     * Check whether the election timer should be running, and make it so.
     *
     * <p>
     * This should be invoked:
     * <ul>
     *  <li>After a log entry that contains a configuration change has been added to the log</li>
     *  <li>When a snapshot install starts</li>
     *  <li>When a snapshot install completes</li>
     * </ul>
*/ private void updateElectionTimer() { assert Thread.holdsLock(this.raft); final boolean isClusterMember = this.raft.isClusterMember(); final boolean electionTimerRunning = this.electionTimer.isRunning(); if (isClusterMember && !electionTimerRunning) { if (this.log.isTraceEnabled()) this.trace("starting up election timer because I'm now part of the current config"); this.restartElectionTimer(); } else if (!isClusterMember && electionTimerRunning) { if (this.log.isTraceEnabled()) this.trace("stopping election timer because I'm no longer part of the current config"); this.electionTimer.cancel(); } } // Transactions @Override void handleLinearizableReadOnlyChange(RaftKVTransaction tx) { // Sanity check super.handleLinearizableReadOnlyChange(tx); assert !this.commitRequests.contains(tx); // Send an immediate CommitRequest this.checkSendCommitRequest(tx, false); } @Override void checkReadyTransactionNeedingCommitInfo(RaftKVTransaction tx) { // Sanity check super.checkReadyTransactionNeedingCommitInfo(tx); // Send CommitRequest if not already sent this.checkSendCommitRequest(tx, true); } private void checkSendCommitRequest(RaftKVTransaction tx, boolean allowConfigure) { // Sanity check final boolean addsLogEntry = tx.addsLogEntry(); assert Thread.holdsLock(this.raft); assert (tx.getState().equals(TxState.EXECUTING) && !addsLogEntry) || tx.getState().equals(TxState.COMMIT_READY); // Did we already send a CommitRequest for this transaction? 
if (this.commitRequests.contains(tx)) { if (this.log.isTraceEnabled()) this.trace("not sending CommitRequest for tx " + tx + " because request already sent"); return; } // If we are installing a snapshot, we must wait if (this.snapshotReceive != null) { if (this.log.isTraceEnabled()) this.trace("not sending CommitRequest for tx " + tx + " because a snapshot install is in progress"); return; } // Handle situation where we are unconfigured and not part of any cluster yet if (allowConfigure && !this.raft.isConfigured()) { // Get transaction mutations final String[] configChange = tx.getConfigChange(); // Allow an empty read-only transaction when unconfigured if (!addsLogEntry) { this.raft.succeed(tx); return; } // Otherwise, we can only handle an initial config change that is adding the local node if (configChange == null || !configChange[0].equals(this.raft.identity) || configChange[1] == null) { throw new RetryTransactionException(tx, "unconfigured system: an initial configuration change adding" + " the local node (\"" + this.raft.identity + "\") as the first member of a new cluster is required"); } // Create a new cluster if needed if (this.raft.clusterId == 0) { // Pick a new, random cluster ID int newClusterId; do newClusterId = this.raft.random.nextInt(); while (newClusterId == 0); // Join new cluster this.info("creating new cluster with ID " + String.format("0x%08x", newClusterId)); if (!this.raft.joinCluster(newClusterId)) throw new KVTransactionException(tx, "error persisting new cluster ID"); } // Advance term assert this.raft.currentTerm == 0; if (!this.raft.advanceTerm(this.raft.currentTerm + 1)) throw new KVTransactionException(tx, "error advancing term"); // Append the first entry to the Raft log final LogEntry logEntry; try { logEntry = this.raft.appendLogEntry(this.raft.currentTerm, new NewLogEntry(tx)); } catch (Exception e) { throw new KVTransactionException(tx, "error attempting to persist transaction", e); } if (this.log.isDebugEnabled()) 
this.debug("added log entry " + logEntry + " for local transaction " + tx); assert logEntry.getTerm() == 1; assert logEntry.getIndex() == 1; // Advance transaction this.advanceReadyTransactionWithCommitInfo(tx, 1, 1, null); // Rebase any other transactions this.rebaseTransactions(); // Update our commit term and index from new log entry this.raft.commitIndex = logEntry.getIndex(); this.checkCommittables(); // Commit transaction new CheckWaitingTransactionService(this, tx).run(); // Trigger key watches this.raft.requestService(this.triggerKeyWatchesService); // Immediately become the leader of our new single-node cluster assert this.raft.isConfigured(); if (this.log.isDebugEnabled()) this.debug("appointing myself leader in newly created cluster"); this.raft.changeRole(new LeaderRole(this.raft)); return; } // If we don't have a leader yet, or leader's queue is full, we must wait if (this.leader == null || this.raft.isTransmitting(this.leaderAddress)) { if (this.log.isTraceEnabled()) { this.trace("leaving alone tx " + tx + " because leader " + (this.leader == null ? 
"is not known yet" : "\"" + this.leader + "\" is not writable yet")); } return; } // For read-write transactions, send the reads & writes to the leader so leader can check for conflicts ByteBuffer readsData = null; ByteBuffer mutationData = null; if (addsLogEntry) { // Serialize reads into buffer assert tx.getConsistency().isGuaranteesUpToDateReads(); final Reads reads = tx.view.getReads(); final long readsDataSize = reads.serializedLength(); if (readsDataSize != (int)readsDataSize) throw new KVTransactionException(tx, "transaction read information exceeds maximum length"); readsData = Util.allocateByteBuffer((int)readsDataSize); try (ByteBufferOutputStream output = new ByteBufferOutputStream(readsData)) { reads.serialize(output); } catch (IOException e) { throw new RuntimeException("unexpected exception", e); } assert !readsData.hasRemaining(); readsData.flip(); // Serialize mutations into a temporary file (but do not close or durably persist yet) final Writes writes = tx.view.getWrites(); // synchronization not req'd here because tx is COMMIT_READY final File file = new File(this.raft.logDir, String.format("%s%019d%s", RaftKVDatabase.TX_FILE_PREFIX, tx.txId, RaftKVDatabase.TEMP_FILE_SUFFIX)); final FileWriter fileWriter; try { fileWriter = new FileWriter(file, this.raft.disableSync); } catch (IOException e) { throw new KVTransactionException(tx, "error saving transaction mutations to temporary file", e); } try { LogEntry.writeData(fileWriter, new LogEntry.Data(writes, tx.getConfigChange())); fileWriter.flush(); } catch (IOException e) { Util.closeIfPossible(fileWriter); this.raft.deleteFile(fileWriter.getFile(), "pending write temp file"); throw new KVTransactionException(tx, "error saving transaction mutations to temporary file", e); } // Load serialized writes from file final long writeLength = fileWriter.getLength(); try { mutationData = Util.readFile(fileWriter.getFile(), writeLength); } catch (IOException e) { Util.closeIfPossible(fileWriter); 
this.raft.deleteFile(fileWriter.getFile(), "pending write temp file"); throw new KVTransactionException(tx, "error reading transaction mutations from temporary file", e); } // Record pending commit write with temporary file final PendingWrite pendingWrite = new PendingWrite(tx, fileWriter); this.pendingWrites.put(tx.txId, pendingWrite); } // Send commit request to leader final CommitRequest msg = new CommitRequest(this.raft.clusterId, this.raft.identity, this.leader, this.raft.currentTerm, tx.txId, tx.getBaseTerm(), tx.getBaseIndex(), readsData, mutationData); if (this.log.isTraceEnabled()) this.trace("sending " + msg + " to \"" + this.leader + "\" for " + tx); if (!this.raft.sendMessage(msg)) throw new RetryTransactionException(tx, "error sending commit request to leader"); // Record pending request assert !this.commitRequests.contains(tx); this.commitRequests.add(tx); // Mark transaction no longer rebasable if leader will be checking conflicts for us if (addsLogEntry) tx.setNoLongerRebasable(); } @Override void cleanupForTransaction(RaftKVTransaction tx) { assert Thread.holdsLock(this.raft); this.commitRequests.remove(tx); final PendingWrite pendingWrite = this.pendingWrites.remove(tx.txId); if (pendingWrite != null) pendingWrite.cleanup(); super.cleanupForTransaction(tx); } // Messages @Override boolean mayAdvanceCurrentTerm(Message msg) { assert Thread.holdsLock(this.raft); // Deny vote if we have heard from our leader within the minimum election timeout (dissertation, section 4.2.3) if (msg instanceof RequestVote && this.lastLeaderMessageTime != null && this.lastLeaderMessageTime.offsetFromNow() > -this.raft.minElectionTimeout) return false; // OK return true; } @Override void caseAppendRequest(AppendRequest msg, NewLogEntry newLogEntry) { assert Thread.holdsLock(this.raft); // Cancel probing if (this.probeTimestamps != null) { if (this.log.isDebugEnabled()) this.debug("heard from leader before we probed a majority, reverting back to normal follower"); 
this.probeTimestamps = null; } // Record new cluster ID if we haven't done so already if (this.raft.clusterId == 0) this.raft.joinCluster(msg.getClusterId()); // Record leader if (!msg.getSenderId().equals(this.leader)) { if (this.leader != null && !this.leader.equals(msg.getSenderId())) { this.error("detected a conflicting leader in " + msg + " (previous leader was \"" + this.leader + "\") - should never happen; possible inconsistent cluster configuration (mine: " + this.raft.currentConfig + ")"); } this.leader = msg.getSenderId(); this.leaderAddress = this.raft.returnAddress; this.leaderLeaseTimeout = null; if (this.log.isDebugEnabled()) this.debug("updated leader to \"" + this.leader + "\" at " + this.leaderAddress); this.raft.requestService(this.checkReadyTransactionsService); // allows COMMIT_READY transactions to be sent } // Get message info final long leaderCommitIndex = msg.getLeaderCommit(); final long leaderPrevTerm = msg.getPrevLogTerm(); final long leaderPrevIndex = msg.getPrevLogIndex(); final long logTerm = msg.getLogEntryTerm(); final long logIndex = leaderPrevIndex + 1; // Update timestamp last heard from leader this.lastLeaderMessageTime = new Timestamp(); // Update leader's lease timeout if (msg.getLeaderLeaseTimeout() != null && (this.leaderLeaseTimeout == null || msg.getLeaderLeaseTimeout().compareTo(this.leaderLeaseTimeout) > 0)) { if (this.log.isTraceEnabled()) this.trace("advancing leader lease timeout " + this.leaderLeaseTimeout + " -> " + msg.getLeaderLeaseTimeout()); this.leaderLeaseTimeout = msg.getLeaderLeaseTimeout(); this.raft.requestService(this.checkWaitingTransactionsService); } // If a snapshot install is in progress, cancel it if (this.snapshotReceive != null) { if (this.log.isDebugEnabled()) this.debug("rec'd " + msg + " during in-progress " + this.snapshotReceive + "; aborting snapshot install"); this.raft.discardFlipFloppedStateMachine(); this.snapshotReceive = null; this.updateElectionTimer(); } // Restart election timeout 
(if running) if (this.electionTimer.isRunning()) this.restartElectionTimer(); // Get my last log entry's index long lastLogIndex = this.raft.getLastLogIndex(); // Check whether our previous log entry term matches that of leader; if not, or it doesn't exist, request fails // Note: if log entry index is prior to my last applied log entry index, Raft guarantees that term must match if (leaderPrevIndex >= this.raft.lastAppliedIndex && (leaderPrevIndex > lastLogIndex || leaderPrevTerm != this.raft.getLogTermAtIndex(leaderPrevIndex))) { if (this.log.isDebugEnabled()) this.debug("rejecting " + msg + " because previous log entry doesn't match"); this.raft.sendMessage(new AppendResponse(this.raft.clusterId, this.raft.identity, msg.getSenderId(), this.raft.currentTerm, msg.getLeaderTimestamp(), false, this.raft.lastAppliedIndex, this.raft.getLastLogIndex())); return; } // Check whether the message actually contains a log entry we can append; if so, append it boolean success = true; if (leaderPrevIndex >= this.raft.lastAppliedIndex && !msg.isProbe()) { // Check for a conflicting (i.e., never committed, then overwritten) log entry that we need to clear away first if (logIndex <= lastLogIndex && logTerm != this.raft.getLogTermAtIndex(logIndex)) { // Delete conflicting log entry, and all entries that follow it, from the log final int startListIndex = (int)(logIndex - this.raft.lastAppliedIndex - 1); final List conflictList = this.raft.raftLog.subList(startListIndex, this.raft.raftLog.size()); for (LogEntry logEntry : conflictList) { if (this.log.isDebugEnabled()) this.debug("deleting log entry " + logEntry + " overrwritten by " + msg); this.raft.deleteFile(logEntry.getFile(), "overwritten log file"); } try { this.raft.logDirChannel.force(true); } catch (IOException e) { this.warn("errory fsync()'ing log directory " + this.raft.logDir, e); } conflictList.clear(); // Rebuild current config this.raft.currentConfig = this.raft.buildCurrentConfig(); // Update last log entry index 
lastLogIndex = this.raft.getLastLogIndex(); // Fail any transactions that are based on any of the discarded log entries for (RaftKVTransaction tx : new ArrayList<>(this.raft.openTransactions.values())) { if (tx.getBaseIndex() >= logIndex && !tx.getConsistency().equals(Consistency.UNCOMMITTED)) { this.raft.fail(tx, new RetryTransactionException(tx, "base log entry " + tx.getBaseIndex() + "t" + tx.getBaseTerm() + " overwritten by new leader")); } } } // Append the new log entry - if we don't already have it if (logIndex > lastLogIndex) { assert logIndex == lastLogIndex + 1; LogEntry logEntry = null; do { // If message contains no data, we expect to get the data from the corresponding transaction if (newLogEntry == null) { // Find the matching pending commit write, if any final PendingWrite pendingWrite = this.pendingWrites.values().stream().filter(pw -> { final RaftKVTransaction tx = pw.getTx(); return tx.getState().equals(TxState.COMMIT_WAITING) && tx.getCommitTerm() == logTerm && tx.getCommitIndex() == logIndex; }).findAny().orElse(null); if (pendingWrite == null) { if (this.log.isDebugEnabled()) { this.debug("rec'd " + msg + " but no read-write transaction matching commit " + logIndex + "t" + logTerm + " found; rejecting"); } break; } // Commit's writes are no longer pending final RaftKVTransaction tx = pendingWrite.getTx(); this.pendingWrites.remove(tx.txId); // Close and durably persist the associated temporary file try { pendingWrite.getFileWriter().close(); } catch (IOException e) { this.error("error closing temporary transaction file for " + tx, e); pendingWrite.cleanup(); break; } // Append a new log entry using temporary file try { logEntry = this.raft.appendLogEntry(logTerm, new NewLogEntry(tx, pendingWrite.getFileWriter().getFile())); } catch (Exception e) { this.error("error appending new log entry for " + tx, e); pendingWrite.cleanup(); break; } // Debug if (this.log.isDebugEnabled()) { this.debug("now waiting for commit of " + tx.getCommitIndex() + "t" 
+ tx.getCommitTerm() + " to commit " + tx); } } else { // Append new log entry normally using the data from the request try { logEntry = this.raft.appendLogEntry(logTerm, newLogEntry); } catch (Exception e) { this.error("error appending new log entry", e); break; } } } while (false); // Start/stop election timer as needed if (logEntry != null && logEntry.getConfigChange() != null) this.updateElectionTimer(); // Success? success = logEntry != null; // Rebase transactions if (success) this.rebaseTransactions(); // Update last log entry index lastLogIndex = this.raft.getLastLogIndex(); } } // Update my commit index final long newCommitIndex = Math.min(Math.max(leaderCommitIndex, this.raft.commitIndex), lastLogIndex); if (newCommitIndex > this.raft.commitIndex) { if (this.log.isDebugEnabled()) this.debug("updating leader commit index from " + this.raft.commitIndex + " -> " + newCommitIndex); this.raft.commitIndex = newCommitIndex; this.checkCommittables(); this.raft.requestService(this.checkWaitingTransactionsService); this.raft.requestService(this.triggerKeyWatchesService); this.raft.requestService(this.applyCommittedLogEntriesService); } // Debug if (this.log.isTraceEnabled()) { this.trace("my updated follower state: " + "term=" + this.raft.currentTerm + " commitIndex=" + this.raft.commitIndex + " leaderLeaseTimeout=" + this.leaderLeaseTimeout + " lastApplied=" + this.raft.lastAppliedIndex + "t" + this.raft.lastAppliedTerm + " log=" + this.raft.raftLog); } // Send reply if (success) { this.raft.sendMessage(new AppendResponse(this.raft.clusterId, this.raft.identity, msg.getSenderId(), this.raft.currentTerm, msg.getLeaderTimestamp(), true, msg.isProbe() ? 
logIndex - 1 : logIndex, this.raft.getLastLogIndex())); } else { this.raft.sendMessage(new AppendResponse(this.raft.clusterId, this.raft.identity, msg.getSenderId(), this.raft.currentTerm, msg.getLeaderTimestamp(), false, this.raft.lastAppliedIndex, this.raft.getLastLogIndex())); } } @Override void caseCommitResponse(CommitResponse msg) { assert Thread.holdsLock(this.raft); // Find transaction final RaftKVTransaction tx = this.raft.openTransactions.get(msg.getTxId()); if (tx == null) // must have been rolled back locally return; assert tx.getConsistency().equals(Consistency.LINEARIZABLE); assert msg.getCommitLeaderLeaseTimeout() == null || !tx.addsLogEntry(); // Sanity check whether we're expecting this response if (!this.commitRequests.remove(tx)) { if (this.log.isDebugEnabled()) this.debug("rec'd " + msg + " for " + tx + " not expecting a response; ignoring"); return; } // Check result if (this.log.isTraceEnabled()) this.trace("rec'd " + msg + " for " + tx); // Do we already have a commit index & term? This would be unusual and can only happen with some leader change if (tx.hasCommitInfo()) { if (this.log.isTraceEnabled()) { this.trace("ignoring " + msg + " for " + tx + "; already have commit " + tx.getCommitIndex() + "t" + tx.getCommitTerm()); } return; } // Did the request fail? if (!msg.isSuccess()) { this.raft.fail(tx, new RetryTransactionException(tx, msg.getErrorMessage())); return; } // If messages can get out of order, then it's possible we've already rebased this tx past its commit index long commitIndex = msg.getCommitIndex(); long commitTerm = msg.getCommitTerm(); if (tx.getBaseIndex() > commitIndex) { if (this.log.isTraceEnabled()) { final long actualCommitTerm = this.raft.getLogTermAtIndexIfKnown(commitIndex); this.trace(tx + " was rebased past its commit index " + commitIndex + "t" + commitTerm + " to " + tx.getBaseIndex() + "t" + tx.getBaseTerm() + "; actual term for index " + commitIndex + " is " + (actualCommitTerm != 0 ? 
"" + actualCommitTerm : "unknown")); } this.raft.fail(tx, new RetryTransactionException(tx, "transaction was rebased past its commit index")); return; } // Update transaction switch (tx.getState()) { case EXECUTING: assert tx.isReadOnly(); assert !tx.hasCommitInfo(); tx.setCommitInfo(commitTerm, commitIndex, msg.getCommitLeaderLeaseTimeout()); this.checkCommittable(tx); break; case COMMIT_READY: assert !tx.hasCommitInfo(); this.advanceReadyTransactionWithCommitInfo(tx, commitTerm, commitIndex, msg.getCommitLeaderLeaseTimeout()); break; default: if (this.log.isDebugEnabled()) this.debug("rec'd " + msg + " for " + tx + " in state " + tx.getState() + "; ignoring"); return; } } @Override void caseInstallSnapshot(InstallSnapshot msg) { assert Thread.holdsLock(this.raft); // Restart election timer (if running) if (this.electionTimer.isRunning()) this.restartElectionTimer(); // Sanity check that our log is not going backwards if (msg.getSnapshotIndex() < this.raft.commitIndex) { this.warn("rec'd " + msg + " with retrograde index " + msg.getSnapshotIndex() + " < my commit index " + this.raft.commitIndex + ", ignoring"); return; } // Do we have an existing install? boolean startNewInstall = false; if (this.snapshotReceive != null) { // Does the message not match? 
if (!this.snapshotReceive.matches(msg)) { // If the message is NOT the first one in a new install, ignore it if (msg.getPairIndex() != 0) { if (this.log.isDebugEnabled()) this.debug("rec'd " + msg + " which doesn't match in-progress " + this.snapshotReceive + "; ignoring"); return; } // The message is the first one in a new install, so discard the existing install if (this.log.isDebugEnabled()) { this.debug("rec'd initial " + msg + " with in-progress " + this.snapshotReceive + "; aborting previous install"); } startNewInstall = true; } } else { // If the message is NOT the first one in a new install, ignore it if (msg.getPairIndex() != 0) { if (this.log.isDebugEnabled()) this.debug("rec'd non-initial " + msg + " with no in-progress snapshot install; ignoring"); return; } } // Get snapshot term and index final long term = msg.getSnapshotTerm(); final long index = msg.getSnapshotIndex(); // Set up new install if necessary if (this.snapshotReceive == null || startNewInstall) { assert msg.getPairIndex() == 0; if (this.raft.discardFlipFloppedStateMachine()) this.warn("detected left-over content in flip-flopped state machine; discarding"); this.updateElectionTimer(); this.snapshotReceive = new SnapshotReceive(this.raft.kv, this.raft.getFlipFloppedStateMachinePrefix(), term, index, msg.getSnapshotConfig()); if (this.log.isDebugEnabled()) { this.debug("starting new snapshot install from \"" + msg.getSenderId() + "\" of " + index + "t" + term + " with config " + msg.getSnapshotConfig()); } } assert this.snapshotReceive.matches(msg); // Apply next chunk of key/value pairs if (this.log.isDebugEnabled()) this.debug("applying " + msg + " to " + this.snapshotReceive); try { this.snapshotReceive.applyNextChunk(msg.getData()); } catch (Exception e) { this.error("error applying snapshot to key/value store; aborting snapshot install", e); this.snapshotReceive = null; this.raft.discardFlipFloppedStateMachine(); this.updateElectionTimer(); return; } // If that was the last chunk, 
finalize persistent state if (msg.isLastChunk()) { // Flip-flop state machine final Map snapshotConfig = this.snapshotReceive.getSnapshotConfig(); if (this.log.isDebugEnabled()) { this.debug("snapshot install from \"" + msg.getSenderId() + "\" of " + index + "t" + term + " with config " + snapshotConfig + " complete"); } this.snapshotReceive = null; this.raft.flipFlopStateMachine(term, index, snapshotConfig); this.updateElectionTimer(); // Fail transactions we can no longer deal with for (RaftKVTransaction tx : new ArrayList<>(this.raft.openTransactions.values())) { // Fail if base index is past our applied index if (tx.getBaseIndex() > index) { this.raft.fail(tx, new RetryTransactionException(tx, "rec'd snapshot install from leader and base index " + tx.getBaseIndex() + " > " + index)); } // Fail if rebasable and the base index doesn't exactly match if (tx.isRebasable() && (tx.getBaseTerm() != term || tx.getBaseIndex() != index)) { this.raft.fail(tx, new RetryTransactionException(tx, "snapshot install of " + index + "t" + term + " invalidated base " + tx.getBaseIndex() + "t" + tx.getBaseTerm())); } } // Check for newly committable transactions this.checkCommittables(); } } @Override void caseRequestVote(RequestVote msg) { assert Thread.holdsLock(this.raft); // Record new cluster ID if we haven't done so already if (this.raft.clusterId == 0) this.raft.joinCluster(msg.getClusterId()); // Did we already vote for somebody else? 
final String peer = msg.getSenderId(); if (this.votedFor != null && !this.votedFor.equals(peer)) { if (this.log.isDebugEnabled()) this.debug("rec'd " + msg + "; rejected because we already voted for \"" + this.votedFor + "\""); return; } // Verify that we are allowed to vote for this peer if (msg.getLastLogTerm() < this.raft.getLastLogTerm() || (msg.getLastLogTerm() == this.raft.getLastLogTerm() && msg.getLastLogIndex() < this.raft.getLastLogIndex())) { if (this.log.isDebugEnabled()) { this.debug("rec'd " + msg + "; rejected because their log " + msg.getLastLogIndex() + "t" + msg.getLastLogTerm() + " loses to ours " + this.raft.getLastLogIndex() + "t" + this.raft.getLastLogTerm()); } return; } // Persist our vote for this peer (if not already persisted) if (this.votedFor == null) { if (this.log.isDebugEnabled()) this.debug("granting vote to \"" + peer + "\" in term " + this.raft.currentTerm); if (!this.updateVotedFor(peer)) return; } else { if (this.log.isDebugEnabled()) this.debug("confirming existing vote for \"" + peer + "\" in term " + this.raft.currentTerm); } // Send reply this.raft.sendMessage(new GrantVote(this.raft.clusterId, this.raft.identity, peer, this.raft.currentTerm)); } @Override void caseGrantVote(GrantVote msg) { assert Thread.holdsLock(this.raft); // Ignore - we already lost the election to the real leader if (this.log.isDebugEnabled()) this.debug("ignoring " + msg + " rec'd while in " + this); } @Override void casePingResponse(PingResponse msg) { assert Thread.holdsLock(this.raft); // Are we probing? 
if (this.probeTimestamps == null) { if (this.log.isTraceEnabled()) this.trace("ignoring " + msg + " rec'd while not probing in " + this); return; } // Update peer's ping timestamp this.probeTimestamps.put(msg.getSenderId(), msg.getTimestamp()); // Check new status this.checkProbeResult(); } private void checkProbeResult() { assert Thread.holdsLock(this.raft); assert this.probeTimestamps != null; // Get the number of nodes successfully probed so far (including ourselves), and the minimum number required (a majority) final int numProbed = this.calculateProbedNodes(); final int numRequired = this.raft.currentConfig.size() / 2 + 1; if (this.log.isTraceEnabled()) this.trace("now we have probed " + numProbed + "/" + numRequired + " required nodes"); // If we have successfully probed a majority, then we can finally become a candidate if (numProbed >= numRequired) { if (this.log.isDebugEnabled()) this.debug("successfully probed " + numProbed + " nodes, now converting to candidate"); this.raft.changeRole(new CandidateRole(this.raft)); } } // Helper methods /** * Record the peer voted for in the current term. */ private boolean updateVotedFor(String recipient) { // Sanity check assert Thread.holdsLock(this.raft); assert recipient != null; // Update persistent store final Writes writes = new Writes(); writes.getPuts().put(RaftKVDatabase.VOTED_FOR_KEY, this.raft.encodeString(recipient)); try { this.raft.kv.mutate(writes, true); } catch (Exception e) { this.error("error persisting vote for \"" + recipient + "\"", e); return false; } // Done this.votedFor = recipient; return true; } // Role @Override Timestamp getLeaderLeaseTimeout() { return this.leaderLeaseTimeout; } // Object @Override public String toString() { synchronized (this.raft) { final List pendingRequestIds = commitRequests.stream() .map(tx -> tx.txId) .collect(Collectors.toList()); return this.toStringPrefix() + (this.leader != null ? ",leader=\"" + this.leader + "\"" : "") + (this.votedFor != null ? 
",votedFor=\"" + this.votedFor + "\"" : "") + (!pendingRequestIds.isEmpty() ? ",commitRequests=" + pendingRequestIds : "") + (!this.pendingWrites.isEmpty() ? ",pendingWrites=" + this.pendingWrites.keySet() : "") + "]"; } } // Debug @Override boolean checkState() { assert Thread.holdsLock(this.raft); assert this.leaderAddress != null || this.leader == null; assert this.electionTimer.isRunning() == this.raft.isClusterMember(); for (RaftKVTransaction tx : this.commitRequests) { switch (tx.getState()) { case EXECUTING: assert tx.isReadOnly(); break; case COMMIT_READY: break; default: assert false; break; } assert !tx.hasCommitInfo(); } for (Map.Entry entry : this.pendingWrites.entrySet()) { final long txId = entry.getKey(); final PendingWrite pendingWrite = entry.getValue(); final RaftKVTransaction tx = pendingWrite.getTx(); assert txId == tx.txId; assert tx.getState().equals(TxState.COMMIT_READY) || tx.getState().equals(TxState.COMMIT_WAITING); assert pendingWrite.getFileWriter().getFile().exists(); } return true; } @Override void checkTransaction(RaftKVTransaction tx) { super.checkTransaction(tx); switch (tx.getState()) { case EXECUTING: assert !this.pendingWrites.containsKey(tx.txId); break; case COMMIT_READY: break; case COMMIT_WAITING: assert !this.commitRequests.contains(tx); break; default: assert !this.pendingWrites.containsKey(tx.txId); assert !this.commitRequests.contains(tx); break; } } // PendingWrite // Represents a read-write transaction in COMMIT_READY or COMMIT_WAITING for which the server's AppendRequest // will have null mutationData, because we will already have the data on hand waiting in a temporary file. This // is a simple optimization to avoid sending the same data from leader -> follower just sent from follower -> leader. 
/**
 * Holder tying a transaction to the {@link FileWriter} for its pending write.
 *
 * <p>
 * Instances are immutable: both references are fixed at construction time. Call {@link #cleanup}
 * once the pending write is no longer needed, to close the writer and remove its file.
 */
private static class PendingWrite {

    private final RaftKVTransaction tx;
    private final FileWriter fileWriter;

    /**
     * Create a new instance.
     *
     * @param transaction the transaction this pending write belongs to
     * @param writer the writer associated with the pending write's file
     */
    PendingWrite(RaftKVTransaction transaction, FileWriter writer) {
        this.tx = transaction;
        this.fileWriter = writer;
    }

    /**
     * Get the file writer given at construction.
     *
     * @return the associated file writer
     */
    public FileWriter getFileWriter() {
        return this.fileWriter;
    }

    /**
     * Get the transaction given at construction.
     *
     * @return the associated transaction
     */
    public RaftKVTransaction getTx() {
        return this.tx;
    }

    /**
     * Release the resources held by this pending write.
     *
     * <p>
     * The writer is closed first (if possible); only afterward is its file handed to the
     * transaction's database for deletion.
     */
    public void cleanup() {
        final FileWriter writer = this.fileWriter;
        Util.closeIfPossible(writer);
        this.tx.raft.deleteFile(writer.getFile(), "pending write temp file");
    }
}

}   // end of enclosing FollowerRole class




© 2015 - 2025 Weber Informatics LLC | Privacy Policy