All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.jsimpledb.kv.raft.Role Maven / Gradle / Ivy


/*
 * Copyright (C) 2015 Archie L. Cobbs. All rights reserved.
 */

package org.jsimpledb.kv.raft;

import com.google.common.collect.Iterables;
import com.google.common.primitives.Bytes;

import java.io.IOException;
import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;

import org.jsimpledb.kv.KVTransactionException;
import org.jsimpledb.kv.KeyRange;
import org.jsimpledb.kv.RetryTransactionException;
import org.jsimpledb.kv.mvcc.Mutations;
import org.jsimpledb.kv.mvcc.Reads;
import org.jsimpledb.kv.mvcc.Writes;
import org.jsimpledb.kv.raft.msg.AppendRequest;
import org.jsimpledb.kv.raft.msg.AppendResponse;
import org.jsimpledb.kv.raft.msg.CommitRequest;
import org.jsimpledb.kv.raft.msg.CommitResponse;
import org.jsimpledb.kv.raft.msg.GrantVote;
import org.jsimpledb.kv.raft.msg.InstallSnapshot;
import org.jsimpledb.kv.raft.msg.Message;
import org.jsimpledb.kv.raft.msg.PingRequest;
import org.jsimpledb.kv.raft.msg.PingResponse;
import org.jsimpledb.kv.raft.msg.RequestVote;
import org.jsimpledb.util.LongEncoder;
import org.slf4j.Logger;

/**
 * Common superclass for the three roles played by a Raft node:
 * {@linkplain LeaderRole leader}, {@linkplain FollowerRole follower}, and {@linkplain CandidateRole candidate}.
 */
public abstract class Role {

    final Logger log;
    final RaftKVDatabase raft;
    final Service checkReadyTransactionsService = new Service(this, "check ready transactions") {
        @Override
        public void run() {
            Role.this.checkReadyTransactions();
        }
    };
    final Service checkWaitingTransactionsService = new Service(this, "check waiting transactions") {
        @Override
        public void run() {
            Role.this.checkWaitingTransactions();
        }
    };

    // NOTE: use of this service requires that 'checkWaitingTransactionsService' be scheduled first!
    final Service applyCommittedLogEntriesService = new Service(this, "apply committed logs") {
        @Override
        public void run() {
            Role.this.applyCommittedLogEntries();
        }
    };
    final Service triggerKeyWatchesService = new Service(this, "trigger key watches") {
        @Override
        public void run() {
            Role.this.triggerKeyWatches();
        }
    };

// Constructors

    Role(RaftKVDatabase raft) {
        this.raft = raft;
        this.log = this.raft.log;
        assert Thread.holdsLock(this.raft);
    }

// Status

    /**
     * Get the {@link RaftKVDatabase} with which this instance is associated.
     *
     * @return associated database
     */
    public RaftKVDatabase getKVDatabase() {
        return this.raft;
    }

// Lifecycle

    void setup() {
        assert Thread.holdsLock(this.raft);
        this.raft.requestService(this.checkReadyTransactionsService);
        this.raft.requestService(this.checkWaitingTransactionsService);
        this.raft.requestService(this.applyCommittedLogEntriesService);
    }

    void shutdown() {
        assert Thread.holdsLock(this.raft);
        this.raft.openTransactions.values()
          .forEach(this::cleanupForTransaction);
    }

// Service

    abstract void outputQueueEmpty(String address);

    /**
     * Check transactions in the {@link TxState#COMMIT_READY} state to see if we can advance them.
     */
    void checkReadyTransactions() {
        assert Thread.holdsLock(this.raft);
        for (RaftKVTransaction tx : new ArrayList<>(this.raft.openTransactions.values()))
            new CheckReadyTransactionService(this, tx).run();
    }

    /**
     * Check transactions in the {@link TxState#COMMIT_WAITING} state to see if they are committed yet.
     * We invoke this service method whenever our {@code commitIndex} advances.
     */
    void checkWaitingTransactions() {
        assert Thread.holdsLock(this.raft);
        for (RaftKVTransaction tx : new ArrayList<>(this.raft.openTransactions.values()))
            new CheckWaitingTransactionService(this, tx).run();
    }

    /**
     * Apply committed but unapplied log entries to the state machine.
     * We invoke this service method whenever log entries are added or our {@code commitIndex} advances.
     *
     * 

* Note: checkWaitingTransactions() must have been invoked already when this method is invoked. */ void applyCommittedLogEntries() { assert Thread.holdsLock(this.raft); // Determine how many committed log entries we can apply to the state machine at this time int numEntriesToApply = 0; while (this.raft.lastAppliedIndex + numEntriesToApply < this.raft.commitIndex && this.mayApplyLogEntry(this.raft.raftLog.get(numEntriesToApply))) numEntriesToApply++; final long maxAppliedIndex = this.raft.lastAppliedIndex + numEntriesToApply; assert maxAppliedIndex <= this.raft.commitIndex; // Sanity check that all committable transactions have been committed before we compact their commit log entries boolean assertionsEnabled = false; assert assertionsEnabled = true; if (assertionsEnabled) { for (RaftKVTransaction tx : this.raft.openTransactions.values()) assert !tx.getState().equals(TxState.COMMIT_WAITING) || tx.getCommitIndex() > this.raft.commitIndex; } // Apply committed log entries to the state machine while (this.raft.lastAppliedIndex < maxAppliedIndex) { // Grab the first unwritten log entry final LogEntry logEntry = this.raft.raftLog.get(0); assert logEntry.getIndex() == this.raft.lastAppliedIndex + 1; // Get the current config as of the log entry we're about to apply final HashMap logEntryConfig = new HashMap<>(this.raft.lastAppliedConfig); logEntry.applyConfigChange(logEntryConfig); // Prepare combined Mutations containing prefixed log entry changes plus my own final Writes logWrites = logEntry.getWrites(); final Writes myWrites = new Writes(); myWrites.getPuts().put(RaftKVDatabase.LAST_APPLIED_TERM_KEY, LongEncoder.encode(logEntry.getTerm())); myWrites.getPuts().put(RaftKVDatabase.LAST_APPLIED_INDEX_KEY, LongEncoder.encode(logEntry.getIndex())); myWrites.getPuts().put(RaftKVDatabase.LAST_APPLIED_CONFIG_KEY, this.raft.encodeConfig(logEntryConfig)); final byte[] stateMachinePrefix = this.raft.getStateMachinePrefix(); final Mutations mutations = new Mutations() { @Override public Iterable getRemoveRanges() { return Iterables.transform(logWrites.getRemoveRanges(), range -> range.prefixedBy(stateMachinePrefix)); } @Override public Iterable> getPutPairs() { return Iterables.concat( Iterables.transform(logWrites.getPutPairs(), entry -> new AbstractMap.SimpleEntry<>(Bytes.concat(stateMachinePrefix, entry.getKey()), entry.getValue())), myWrites.getPutPairs()); } @Override public Iterable> getAdjustPairs() { return Iterables.transform(logWrites.getAdjustPairs(), entry -> new AbstractMap.SimpleEntry<>(Bytes.concat(stateMachinePrefix, entry.getKey()), entry.getValue())); } }; // Apply updates to the key/value store; when applying the last one, durably persist if (this.log.isDebugEnabled()) this.debug("applying committed log entry " + logEntry + " to key/value store"); try { this.raft.kv.mutate(mutations, !this.raft.disableSync && this.raft.lastAppliedIndex == maxAppliedIndex); } catch (Exception e) { if (e instanceof RuntimeException && e.getCause() instanceof IOException) e = (IOException)e.getCause(); this.error("error applying log entry " + logEntry + " to key/value store", e); break; } // Update in-memory state assert logEntry.getIndex() == this.raft.lastAppliedIndex + 1; this.raft.incrementLastAppliedIndex(logEntry.getTerm()); logEntry.applyConfigChange(this.raft.lastAppliedConfig); assert this.raft.currentConfig.equals(this.raft.buildCurrentConfig()); // Delete the log entry this.raft.raftLog.remove(0); this.raft.deleteFile(logEntry.getFile(), "applied log file"); } } /** * Determine whether the given log entry may be applied to the state machine. * This method can assume that the log entry is already committed. * * @param logEntry log entry to apply */ final boolean mayApplyLogEntry(LogEntry logEntry) { assert Thread.holdsLock(this.raft); // Are we running out of memory, or keeping around too many log entries? If so, go ahead no matter what the subclass says. final long logEntryMemoryUsage = this.raft.getUnappliedLogMemoryUsage(); if (logEntryMemoryUsage > this.raft.maxUnappliedLogMemory || this.raft.raftLog.size() > this.raft.maxUnappliedLogEntries) { if (this.log.isTraceEnabled()) { this.trace("allowing log entry " + logEntry + " to be applied because memory usage " + logEntryMemoryUsage + " > " + this.raft.maxUnappliedLogMemory + " and/or log length " + this.raft.raftLog.size() + " > " + this.raft.maxUnappliedLogEntries); } return true; } // Check with subclass return this.roleMayApplyLogEntry(logEntry); } /** * Role-specific hook to determine whether the given log entry should be applied to the state machine. * This method can assume that the log entry is already committed. * * @param logEntry log entry to apply */ boolean roleMayApplyLogEntry(LogEntry logEntry) { return true; } /** * Trigger any key watches for changes in log entries committed since the last time we checked. * *

* This should be invoked: *

    *
  • After advancing the commitIndex
  • *
  • After resetting the state machine
  • *
  • After installing a snapshot
  • *
*/ void triggerKeyWatches() { // Sanity check assert Thread.holdsLock(this.raft); assert this.raft.commitIndex >= this.raft.lastAppliedIndex; assert this.raft.commitIndex <= this.raft.lastAppliedIndex + this.raft.raftLog.size(); assert this.raft.keyWatchIndex <= this.raft.commitIndex; // If nobody is watching, don't bother if (this.raft.keyWatchTracker == null) return; // If we have recevied a snapshot install, we may not be able to tell which keys have changed since last notification; // in that case, trigger all key watches; otherwise, trigger the keys affected by newly committed log entries if (this.raft.keyWatchIndex < this.raft.lastAppliedIndex) { this.raft.keyWatchTracker.triggerAll(); this.raft.keyWatchIndex = this.raft.commitIndex; } else { while (this.raft.keyWatchIndex < this.raft.commitIndex) this.raft.keyWatchTracker.trigger(this.raft.getLogEntryAtIndex(++this.raft.keyWatchIndex).getWrites()); } } // Transactions /** * Check a transaction that is ready to be committed (in the {@link TxState#COMMIT_READY} state). * *

* This should be invoked: *

    *
  • After changing roles
  • *
  • After a transaction has entered the {@link TxState#COMMIT_READY} state
  • *
  • After the leader is newly known (in {@link FollowerRole})
  • *
  • After the leader's output queue goes from non-empty to empty (in {@link FollowerRole})
  • *
  • After the leader's {@code commitIndex} has advanced, in case a config change transaction * is waiting on a previous config change transaction (in {@link LeaderRole})
  • *
* * @param tx the transaction * @throws KVTransactionException if an error occurs */ void checkReadyTransaction(RaftKVTransaction tx) { // Sanity check assert Thread.holdsLock(this.raft); assert tx.getState().equals(TxState.COMMIT_READY); // Determine whether transaction is truly read-only, i.e., it does not imply adding a new Raft log entry final boolean readOnly = !tx.addsLogEntry(); // Check whether we can commit the transaction immediately if (readOnly && !tx.getConsistency().isWaitsForLogEntryToBeCommitted()) { // i.e., UNCOMMITTED, EVENTUAL_COMMITTED if (this.log.isTraceEnabled()) this.trace("trivial commit for read-only, " + tx.getConsistency() + " " + tx); this.raft.succeed(tx); return; } // Check whether we don't need to bother talking to the leader if (readOnly && !tx.getConsistency().isGuaranteesUpToDateReads()) { // i.e., EVENTUAL this.advanceReadyTransaction(tx, tx.baseTerm, tx.baseIndex); return; } // Requires leader communication - let subclass handle it this.checkReadyLinearizableTransaction(tx, readOnly); // i.e., LINEARIZABLE } /** * Check a transaction that is ready to be committed (in the {@link TxState#COMMIT_READY} state) * and requires communication with the leader. * *

* This will not be invoked unless the transaction is read/write or the consistency level provides up-to-date reads. * * @param tx the transaction * @param readOnly if transaction commit does not imply adding a Raft log entry * @throws KVTransactionException if an error occurs */ abstract void checkReadyLinearizableTransaction(RaftKVTransaction tx, boolean readOnly); /** * Advance a transaction from the {@link TxState#COMMIT_READY} state to the {@link TxState#COMMIT_WAITING} state. * * @param tx the transaction * @param commitTerm term of log entry that must be committed before the transaction may succeed * @param commitIndex index of log entry that must be committed before the transaction may succeed */ void advanceReadyTransaction(RaftKVTransaction tx, long commitTerm, long commitIndex) { // Sanity check assert Thread.holdsLock(this.raft); assert tx.getState().equals(TxState.COMMIT_READY); // Set commit term & index and update state if (this.log.isDebugEnabled()) this.debug("advancing " + tx + " to " + TxState.COMMIT_WAITING + " with commit " + commitIndex + "t" + commitTerm); tx.setCommitTerm(commitTerm); tx.setCommitIndex(commitIndex); tx.setState(TxState.COMMIT_WAITING); // Discard information we no longer need tx.view.disableReadTracking(); // Check this transaction to see if it can be committed this.raft.requestService(this.checkWaitingTransactionsService); } /** * Check a transaction waiting for its log entry to be committed (in the {@link TxState#COMMIT_WAITING} state). * *

* This should be invoked: *

    *
  • After changing roles
  • *
  • After a transaction has entered the {@link TxState#COMMIT_WAITING} state
  • *
  • After advancing my {@code commitIndex} (as leader or follower)
  • *
  • After receiving an updated {@linkplain AppendResponse#getLeaderLeaseTimeout leader lease timeout} * (in {@link FollowerRole})
  • *
* * @param tx the transaction * @throws KVTransactionException if an error occurs */ void checkWaitingTransaction(RaftKVTransaction tx) { // Sanity check assert Thread.holdsLock(this.raft); assert tx.getConsistency().isGuaranteesUpToDateReads(); // Check whether the transaction's commit index and term corresponds to a committed log entry final long commitIndex = tx.getCommitIndex(); final long commitTerm = tx.getCommitTerm(); if (!tx.isCommitIndexCommitted()) { // Has the transaction's commit log entry been committed yet? If not, the transaction is not committed yet. if (commitIndex > this.raft.commitIndex) return; // Determine the actual term of the log entry corresponding to commitIndex final long commitIndexActualTerm; if (commitIndex >= this.raft.lastAppliedIndex) commitIndexActualTerm = this.raft.getLogTermAtIndex(commitIndex); else { // The commit log entry has already been applied to the state machine; we may or may not know what its term was if ((commitIndexActualTerm = this.raft.getAppliedLogEntryTerm(commitIndex)) == 0) { // This can happen if we lose contact and by the time we're back the log entry has // already been applied to the state machine on some leader and that leader sent // us an InstallSnapshot message. We don't know whether it actually got committed // or not, so the transaction must be retried. throw new RetryTransactionException(tx, "commit index " + commitIndex + " < last applied log index " + this.raft.lastAppliedIndex); } } // Verify the term of the committed log entry; if not what we expect, the log entry was overwritten by a new leader if (commitTerm != commitIndexActualTerm) { throw new RetryTransactionException(tx, "leader was deposed during commit and transaction's log entry " + commitIndex + "t" + commitTerm + " overwritten by " + commitIndex + "t" + commitIndexActualTerm); } // The transaction's commit log entry is committed tx.setCommitIndexCommitted(); } // Check with subclass if (!this.mayCommit(tx)) return; // Transaction is officially committed now if (this.log.isTraceEnabled()) this.trace("commit successful for " + tx + " (commit index " + this.raft.commitIndex + " >= " + commitIndex + ")"); this.raft.succeed(tx); } boolean mayCommit(RaftKVTransaction tx) { return true; } /** * Rebase all rebasable transactions. * *

* This should be invoked after appending a new Raft log entry. * * @param tx the transaction * @throws KVTransactionException if an error occurs */ void rebaseTransactions() { // Sanity check assert Thread.holdsLock(this.raft); // Rebase all rebasable transactions new ArrayList<>(this.raft.openTransactions.values()).stream() .filter(this::shouldRebase) .forEach(tx -> { try { this.rebaseTransaction(tx); } catch (KVTransactionException e) { this.raft.fail(tx, e); } catch (Exception | Error e) { this.raft.error("error rebasing transaction " + tx, e); this.raft.fail(tx, new KVTransactionException(tx, e)); } }); } /** * Rebase the given transaction so that its base log entry the last log entry. * *

* This should be invoked for any {@linkplain #shouldRebase rebasable} transaction * after appending a new Raft log entry. * *

* This method assumes that the given transaction is {@linkplain #shouldRebase rebasable}. * * @param tx the transaction * @throws KVTransactionException if an error occurs */ private void rebaseTransaction(RaftKVTransaction tx) { // Sanity check assert Thread.holdsLock(this.raft); assert this.shouldRebase(tx); assert tx.failure == null; assert tx.baseIndex >= this.raft.lastAppliedIndex; // Lock the mutable view so the rebase appears to happen atomically to any threads viewing the transaction synchronized (tx.view) { // Check for conflicts between transaction reads and newly committed log entries long baseIndex = tx.baseIndex; final long lastIndex = this.raft.getLastLogIndex(); while (baseIndex < lastIndex) { // Get log entry final LogEntry logEntry = this.raft.getLogEntryAtIndex(++baseIndex); // Check for conflict if (tx.view.getReads().isConflict(logEntry.getWrites())) { if (this.log.isDebugEnabled()) this.debug("cannot rebase " + tx + " past " + logEntry + " due to conflicts, failing"); if (this.raft.dumpConflicts) this.dumpConflicts(tx.view.getReads(), logEntry, "local txId=" + tx.txId); throw new RetryTransactionException(tx, "writes of committed transaction at index " + baseIndex + " conflict with transaction reads from transaction base index " + tx.baseIndex); } } assert baseIndex == lastIndex; final long baseTerm = this.raft.getLogTermAtIndex(baseIndex); // Rebase transaction if (this.log.isDebugEnabled()) this.debug("rebasing " + tx + " from " + tx.baseIndex + "t" + tx.baseTerm + " -> " + baseIndex + "t" + baseTerm); switch (tx.getState()) { case EXECUTING: final MostRecentView view = new MostRecentView(this.raft, false); assert view.getIndex() == baseIndex; assert view.getTerm() == baseTerm; tx.rebase(baseTerm, baseIndex, view.getView().getKVStore(), view.getSnapshot()); break; case COMMIT_READY: tx.rebase(baseTerm, baseIndex); break; default: throw new RuntimeException("internal error"); } } } void dumpConflicts(Reads reads, LogEntry logEntry, String description) { final StringBuilder buf = new StringBuilder(); buf.append(description + " failing due to conflicts with " + logEntry + ":"); for (String conflict : reads.getConflicts(logEntry.getWrites())) buf.append("\n ").append(conflict); this.info(buf.toString()); } /** * Determine if the given transaction should be kept rebased on the last log entry. */ boolean shouldRebase(RaftKVTransaction tx) { assert Thread.holdsLock(this.raft); return (tx.getState().equals(TxState.EXECUTING) || tx.getState().equals(TxState.COMMIT_READY)) && (tx.getConsistency().isGuaranteesUpToDateReads() || tx.addsLogEntry()); } /** * Perform any role-specific transaction cleanups. * *

* Invoked either when transaction is completed or this role is being shutdown. * *

* The implementation in {@link Role} does nothing; subclasses should override if appropriate. * * @param tx the transaction */ void cleanupForTransaction(RaftKVTransaction tx) { assert Thread.holdsLock(this.raft); } // Messages // This is a package access version of "implements MessageSwitch" abstract void caseAppendRequest(AppendRequest msg, NewLogEntry newLogEntry); abstract void caseAppendResponse(AppendResponse msg); abstract void caseCommitRequest(CommitRequest msg, NewLogEntry newLogEntry); abstract void caseCommitResponse(CommitResponse msg); abstract void caseGrantVote(GrantVote msg); abstract void caseInstallSnapshot(InstallSnapshot msg); abstract void caseRequestVote(RequestVote msg); void casePingRequest(PingRequest msg) { assert Thread.holdsLock(this.raft); final int responseClusterId = this.raft.clusterId != 0 ? this.raft.clusterId : msg.getClusterId(); this.raft.sendMessage(new PingResponse(responseClusterId, this.raft.identity, msg.getSenderId(), this.raft.currentTerm, msg.getTimestamp())); } void casePingResponse(PingResponse msg) { assert Thread.holdsLock(this.raft); // ignore by default } boolean mayAdvanceCurrentTerm(Message msg) { return true; } void failUnexpectedMessage(Message msg) { this.warn("rec'd unexpected message " + msg + " while in role " + this + "; ignoring"); } // Debug abstract boolean checkState(); void checkTransaction(RaftKVTransaction tx) { if (this.shouldRebase(tx)) assert tx.baseIndex == this.raft.getLastLogIndex() && tx.baseTerm == this.raft.getLastLogTerm(); } // Logging void trace(String msg, Throwable t) { this.raft.trace(msg, t); } void trace(String msg) { this.raft.trace(msg); } void debug(String msg, Throwable t) { this.raft.debug(msg, t); } void debug(String msg) { this.raft.debug(msg); } void info(String msg, Throwable t) { this.raft.info(msg, t); } void info(String msg) { this.raft.info(msg); } void warn(String msg, Throwable t) { this.raft.warn(msg, t); } void warn(String msg) { this.raft.warn(msg); } void error(String msg, Throwable t) { this.raft.error(msg, t); } void error(String msg) { this.raft.error(msg); } // Object @Override public abstract String toString(); String toStringPrefix() { assert Thread.holdsLock(this.raft); return this.getClass().getSimpleName() + "[term=" + this.raft.currentTerm + ",applied=" + this.raft.lastAppliedIndex + "t" + this.raft.lastAppliedTerm + ",commit=" + this.raft.commitIndex + ",log=" + this.raft.raftLog + "]"; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy