// org.jsimpledb.kv.raft.RaftKVDatabase (Maven / Gradle / Ivy artifact: jsimpledb-kv-raft)
/*
* Copyright (C) 2015 Archie L. Cobbs. All rights reserved.
*/
package org.jsimpledb.kv.raft;
import com.google.common.base.Preconditions;
import com.google.common.base.Throwables;
import com.google.common.util.concurrent.ListenableFuture;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.DirectoryStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.security.SecureRandom;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.TreeMap;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Executors;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;
import javax.annotation.PostConstruct;
import javax.annotation.PreDestroy;
import org.dellroad.stuff.io.ByteBufferInputStream;
import org.dellroad.stuff.java.TimedWait;
import org.dellroad.stuff.net.Network;
import org.dellroad.stuff.net.TCPNetwork;
import org.dellroad.stuff.util.LongMap;
import org.jsimpledb.kv.KVDatabase;
import org.jsimpledb.kv.KVPair;
import org.jsimpledb.kv.KVTransactionException;
import org.jsimpledb.kv.KeyRange;
import org.jsimpledb.kv.RetryTransactionException;
import org.jsimpledb.kv.mvcc.AtomicKVStore;
import org.jsimpledb.kv.mvcc.Writes;
import org.jsimpledb.kv.raft.msg.AppendRequest;
import org.jsimpledb.kv.raft.msg.AppendResponse;
import org.jsimpledb.kv.raft.msg.CommitRequest;
import org.jsimpledb.kv.raft.msg.CommitResponse;
import org.jsimpledb.kv.raft.msg.GrantVote;
import org.jsimpledb.kv.raft.msg.InstallSnapshot;
import org.jsimpledb.kv.raft.msg.Message;
import org.jsimpledb.kv.raft.msg.MessageSwitch;
import org.jsimpledb.kv.raft.msg.PingRequest;
import org.jsimpledb.kv.raft.msg.PingResponse;
import org.jsimpledb.kv.raft.msg.RequestVote;
import org.jsimpledb.kv.util.KeyWatchTracker;
import org.jsimpledb.util.ByteUtil;
import org.jsimpledb.util.CloseableIterator;
import org.jsimpledb.util.LongEncoder;
import org.jsimpledb.util.ThrowableUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* A distributed {@link org.jsimpledb.kv.KVDatabase} based on the Raft consensus algorithm.
*
*
* Raft defines a distributed consensus algorithm for maintaining a shared state machine.
* Each Raft node maintains a complete copy of the state machine. Cluster nodes elect a
* leader who collects and distributes updates and provides for consistent reads.
* As long as a node is part of a majority, the state machine is fully operational.
*
*
* {@link RaftKVDatabase} turns this into a transactional, highly available clustered key/value database with linearizable
* ACID semantics. A {@link RaftKVDatabase} appears to each node in the cluster as a shared, fully consistent key/value
* database. As long as a node can communicate with a majority of other nodes (i.e., at least half of the cluster), then the
* database is fully available. Conflict detection allows all nodes to perform write transactions simultaneously such that
* transactions always guarantee strict linearizable semantics, even in the face of arbitrary network drops, delays, and
* reorderings. When two transactions conflict, the loser receives a {@link RetryTransactionException}.
*
*
* Because each node maintains a complete copy of the database, persistence is guaranteed even if up to half of the cluster
* is lost. Each node stores its private persistent state in an {@link AtomicKVStore} (see {@link #setKVStore setKVStore()}).
*
*
* Optional support for falling back to a "standalone mode" based on the most recent copy of the database when a majority of
* nodes can't be reached is provided by {@link org.jsimpledb.kv.raft.fallback.FallbackKVDatabase}.
*
*
* Raft Implementation Details
*
*
* - The Raft state machine is the key/value store data.
* - Unapplied log entries are stored on disk as serialized mutations, and also cached in memory.
* - Concurrent transactions are supported through a simple optimistic locking MVCC scheme (similar to that used by
* {@link org.jsimpledb.kv.mvcc.SnapshotKVDatabase}):
*
* - Transactions execute locally until commit time, using a {@link org.jsimpledb.kv.mvcc.MutableView} to collect mutations.
* The {@link org.jsimpledb.kv.mvcc.MutableView} is based on the local node's last unapplied log entry,
* if any (whether committed or not), or else directly on the underlying key/value store; this defines
* the base term and index for the transaction.
* - Since the transaction's view incorporates all unapplied log entries down to the underlying
* compacted key/value store, transaction performance degrades as the number of unapplied log
* entries grows. Log entries are applied as soon as possible, subject to certain conditions, but
* in any case limits on the {@linkplain #setMaxUnappliedLogEntries the number of unapplied log entries}
* as well as their {@linkplain #setMaxUnappliedLogMemory total memory usage} are enforced.
* - On commit, the transaction's {@link org.jsimpledb.kv.mvcc.Reads}, {@link org.jsimpledb.kv.mvcc.Writes},
* base index and term, and any config change are {@linkplain CommitRequest sent} to the leader.
* - The leader confirms that the log entry corresponding to the transaction's base index is either not yet applied to its
* own state machine, or was its most recently applied log entry. If this is not the case, then the transaction's base
* log entry is too old (e.g., it was applied and discarded early due to memory pressure), and so the transaction is
* rejected with a {@link RetryTransactionException}.
*
* - The leader verifies that the transaction's base term matches its log; if not, the transaction's base log entry has
* been overwritten, and the transaction is rejected with a {@link RetryTransactionException}.
*
* - The leader confirms that the {@link Writes} associated with log entries (if any) after the transaction's base log entry
* do not create {@linkplain org.jsimpledb.kv.mvcc.Reads#isConflict conflicts} when compared against the transaction's
* {@link org.jsimpledb.kv.mvcc.Reads}. If so, the transaction is rejected with a {@link RetryTransactionException}.
* - The leader adds a new log entry consisting of the transaction's {@link Writes} (and any config change) to its log.
* The associated term and index become the transaction's commit term and index; the leader then
* {@linkplain CommitResponse replies} to the follower with this information.
* - If/when the follower sees a committed (in the Raft sense) log entry appear in its log matching the
* transaction's commit term and index, then the transaction is complete.
* - As an optimization, when the leader sends a log entry to the same follower who committed the corresponding
* transaction in the first place, only the transaction ID is sent, because the follower already has the data.
* - After adding a new log entry, both followers and leaders "rebase" any open {@link Consistency#LINEARIZABLE}
* transactions by checking for conflicts in the manner described above.
*
*
* - For transactions occurring on a leader, the logic is similar except of course no network communication occurs.
* - On leaders, committed log entries are not applied to the state machine immediately; instead they are kept around until all
* followers have confirmed receipt. The point of waiting is to avoid follower transactions being rejected because
* their base log entry has already been compacted. Assuming message reordering is unlikely or impossible and each node is
* rebasing committed entries as described above, once a follower has confirmed receipt of a committed log entry, there
* should be no further commit requests from that follower for transactions based on earlier log entries.
* - For read-only transactions, the leader does not create a new log entry; instead, the transaction's commit
* term and index are set to the base term and index, and the leader also calculates its current "leader lease timeout",
* which is the earliest time at which it is possible for another leader to be elected.
* This is calculated as the time in the past at which the leader sent {@link AppendRequest}'s to a majority of followers
* who have since responded, plus the {@linkplain #setMinElectionTimeout minimum election timeout}, minus a small adjustment
* for possible clock drift (this assumes all nodes have the same minimum election timeout configured). If the current
* time is prior to the leader lease timeout (expected normally), the transaction may be committed as soon as the log entry
* corresponding to the commit term and index is committed (it may already be); otherwise, the current time is returned
* to the follower as the minimum required leader lease timeout before the transaction may be committed.
* - For read-only transactions, followers {@linkplain CommitRequest send} the base term and index to the leader as soon
* as the transaction is set read-only, without any conflict information. This allows the leader to capture and return
* the lowest possible commit index to the follower while the transaction is still open, and lets followers stop
* rebasing the transaction (at the returned commit index) as soon as possible, minimizing conflicts.
*
* - Every {@link AppendRequest} includes the leader's current timestamp and leader lease timeout, so followers can commit
* any waiting read-only transactions. Leaders keep track of which followers are waiting on which leader lease
* timeout values, and when the leader lease timeout advances to allow a follower to commit a transaction, the follower
* is immediately notified.
* - Optional weaker consistency guarantees are available on a per-transaction basis; see {@link #OPTION_CONSISTENCY}.
* Setting the consistency to any level other than {@link Consistency#LINEARIZABLE} implicitly sets the transaction
* to read-only.
*
*
* Limitations
*
*
* - A transaction's mutations must fit in memory.
* - All nodes must be configured with the same {@linkplain #setMinElectionTimeout minimum election timeout}.
* This guarantees that the leader's lease timeout calculation is valid.
* - Due to the optimistic locking approach used, this implementation will perform poorly when there is a high
* rate of conflicting transactions; the result will be many transaction retries.
* - Performance will suffer when the amount of data associated with a typical transaction cannot be delivered
* quickly and reliably over the network.
*
*
*
* In general, the algorithm should function correctly under all non-Byzantine conditions. The level of difficulty
* the system is experiencing, due to contention, network errors, etc., can be measured in terms of:
*
* - The average amount of time it takes to commit a transaction
* - The frequency of {@link RetryTransactionException}'s
*
*
* Cluster Configuration
*
*
* Instances support dynamic cluster configuration changes at runtime.
*
*
* Initially, all nodes are in an unconfigured state, where nothing has been added to the Raft log yet and no
* cluster is defined. Unconfigured nodes are passive: they stay in follower mode (i.e., they will not start elections),
* and they disallow local transactions that make any changes other than as described below to create a new cluster.
*
*
* An unconfigured node becomes configured when either:
*
* - {@link RaftKVTransaction#configChange RaftKVTransaction.configChange()} is invoked and committed within
* a local transaction, which creates a new single node cluster and commits its first log entry; or
* - An {@link AppendRequest} is received from a leader of some existing cluster, in which case the node
* records the cluster ID thereby joining the cluster (see below), and applies the received cluster configuration.
*
*
*
* A node is configured if and only if it has recorded one or more log entries. The very first log entry
* always contains the initial cluster configuration (containing only the node that created it, whether local or remote),
* so any node that has a non-empty log is configured.
*
*
* Newly created clusters are assigned a random 32-bit cluster ID (option #1 above). This ID is included in all messages sent
* over the network, and adopted by unconfigured nodes that join the cluster (via option #2 above). Configured nodes discard
* incoming messages containing a cluster ID different from the one they have joined. This prevents data corruption that can
* occur if nodes from two different clusters are inadvertently "mixed" together on the same network.
*
*
* Once a node joins a cluster with a specific cluster ID, it cannot be reassigned to a different cluster without first
* returning it to the unconfigured state; to do that, it must be shut down and its persistent state deleted.
*
*
* Configuration Changes
*
*
* Once a node is configured, a separate issue is whether the node is included in its own configuration, i.e., whether
* the node is a member of its cluster according to the current cluster configuration. A node that is not a member of its
* cluster does not count its own vote to determine committed log entries (if a leader), and does not start elections
* (if a follower). However, it will accept and respond to incoming {@link AppendRequest}s and {@link RequestVote}s.
*
*
* In addition, leaders follow these rules with respect to configuration changes:
*
* - If a leader is removed from a cluster, it remains the leader until the corresponding configuration change
* is committed (not counting its own vote), and then steps down (i.e., reverts to follower).
* - If a follower is added to a cluster, the leader immediately starts sending that follower {@link AppendRequest}s.
* - If a follower is removed from a cluster, the leader continues to send that follower {@link AppendRequest}s
* until the follower acknowledges receipt of the log entry containing the configuration change.
* - Leaders defer configuration changes until they have committed at least one log entry in the current term
* (see this discussion).
* - Configuration changes that remove the last node in a cluster are disallowed.
* - Only one configuration change may take place at a time.
*
*
* Follower Probes
*
*
* This implementation includes a modification to the Raft state machine to avoid unnecessary, disruptive elections
* when a node or nodes is disconnected from, and then reconnected to, the majority.
*
*
* When a follower's election timeout fires, before converting into a candidate, the follower is required to verify
* communication with a majority of the cluster using {@link PingRequest} messages. Only when the follower has
* successfully done so may it become a candidate. While in this intermediate "probing" mode, the follower responds
* normally to incoming messages. In particular, if the follower receives a valid {@link AppendRequest} from the leader, it
* reverts back to normal operation.
*
*
* This behavior is optional, but enabled by default; see {@link #setFollowerProbingEnabled setFollowerProbingEnabled()}.
*
*
* Key Watches
*
*
* {@linkplain RaftKVTransaction#watchKey Key watches} are supported.
*
*
* Mutable Snapshots
*
*
* {@linkplain RaftKVTransaction#mutableSnapshot Mutable snapshots} are supported.
*
*
* Spring Isolation Levels
*
*
* In Spring applications, the transaction {@link Consistency} level may be configured through the Spring
* {@link org.jsimpledb.spring.JSimpleDBTransactionManager} by (ab)using the transaction isolation level setting,
* for example, via the {@link org.springframework.transaction.annotation.Transactional @Transactional} annotation's
* {@link org.springframework.transaction.annotation.Transactional#isolation isolation()} property.
* All Raft consistency levels are made available this way, though the mapping from Spring's isolation levels to
* {@link RaftKVDatabase}'s consistency levels is only semantically approximate:
*
*
*
*
* Spring isolation level
* {@link RaftKVDatabase} consistency level
*
*
* {@link org.springframework.transaction.annotation.Isolation#DEFAULT DEFAULT}
* {@link Consistency#LINEARIZABLE}
*
*
* {@link org.springframework.transaction.annotation.Isolation#SERIALIZABLE SERIALIZABLE}
* {@link Consistency#LINEARIZABLE}
*
*
* {@link org.springframework.transaction.annotation.Isolation#REPEATABLE_READ REPEATABLE_READ}
* {@link Consistency#EVENTUAL}
*
*
* {@link org.springframework.transaction.annotation.Isolation#READ_COMMITTED READ_COMMITTED}
* {@link Consistency#EVENTUAL_COMMITTED}
*
*
* {@link org.springframework.transaction.annotation.Isolation#READ_UNCOMMITTED READ_UNCOMMITTED}
* {@link Consistency#UNCOMMITTED}
*
*
*
*
* @see <a href="https://raft.github.io/">The Raft Consensus Algorithm</a>
*/
public class RaftKVDatabase implements KVDatabase {
/**
* Default minimum election timeout ({@value #DEFAULT_MIN_ELECTION_TIMEOUT}ms).
*
* @see #setMinElectionTimeout
*/
public static final int DEFAULT_MIN_ELECTION_TIMEOUT = 750;
/**
* Default maximum election timeout ({@value #DEFAULT_MAX_ELECTION_TIMEOUT}ms).
*
* @see #setMaxElectionTimeout
*/
public static final int DEFAULT_MAX_ELECTION_TIMEOUT = 1000;
/**
* Default heartbeat timeout ({@value #DEFAULT_HEARTBEAT_TIMEOUT}ms).
*
* @see #setHeartbeatTimeout
*/
public static final int DEFAULT_HEARTBEAT_TIMEOUT = 200;
/**
* Default maximum supported outstanding transaction duration ({@value #DEFAULT_MAX_TRANSACTION_DURATION}ms).
*
* @see #setMaxTransactionDuration
*/
public static final int DEFAULT_MAX_TRANSACTION_DURATION = 5 * 1000;
/**
* Default maximum supported unapplied log entry memory usage ({@value #DEFAULT_MAX_UNAPPLIED_LOG_MEMORY} bytes).
*
* @see #setMaxUnappliedLogMemory
*/
public static final long DEFAULT_MAX_UNAPPLIED_LOG_MEMORY = 100 * 1024 * 1024; // 100MB
/**
* Default maximum number of unapplied log entries ({@value #DEFAULT_MAX_UNAPPLIED_LOG_ENTRIES}).
*
* @see #setMaxUnappliedLogEntries
*/
public static final int DEFAULT_MAX_UNAPPLIED_LOG_ENTRIES = 64;
/**
* Default maximum number of heartbeat intervals a leader will wait for a follower to acknowledge receipt of a log entry
* ({@value #DEFAULT_MAX_FOLLOWER_ACK_HEARTBEATS}).
*
* @see #setMaxFollowerAckHeartbeats
*/
public static final int DEFAULT_MAX_FOLLOWER_ACK_HEARTBEATS = 5;
/**
* Default transaction commit timeout ({@value #DEFAULT_COMMIT_TIMEOUT}ms).
*
* @see #setCommitTimeout
* @see RaftKVTransaction#setTimeout
*/
public static final int DEFAULT_COMMIT_TIMEOUT = 5000; // 5 seconds
/**
* Default TCP port ({@value #DEFAULT_TCP_PORT}) used to communicate with peers.
*/
public static final int DEFAULT_TCP_PORT = 9660;
/**
* Option key for {@link #createTransaction(Map)}. Value should be a {@link Consistency} instance,
* or the {@link Consistency#name name()} thereof.
*/
public static final String OPTION_CONSISTENCY = "consistency";
// Internal constants (package-private; not part of the public API)
static final int MAX_SNAPSHOT_TRANSMIT_AGE = (int)TimeUnit.SECONDS.toMillis(90); // 90 seconds, expressed in milliseconds
static final int FOLLOWER_LINGER_HEARTBEATS = 3; // how long to keep updating removed followers
static final float MAX_CLOCK_DRIFT = 0.01f; // max clock drift per heartbeat as a percentage ratio
static final int MAX_APPLIED_TERMS = 128; // how many already-applied log entry terms to remember
// File prefixes and suffixes
static final String TX_FILE_PREFIX = "tx-";
static final String TEMP_FILE_PREFIX = "temp-";
static final String TEMP_FILE_SUFFIX = ".tmp";
// Matches any name ending in TEMP_FILE_SUFFIX; the suffix is quoted so its '.' is matched literally
static final Pattern TEMP_FILE_PATTERN = Pattern.compile(".*" + Pattern.quote(TEMP_FILE_SUFFIX));
// Keys for persistent Raft state (each key is built by ByteUtil.parse() from a distinct string)
static final byte[] CLUSTER_ID_KEY = ByteUtil.parse("0001");
static final byte[] CURRENT_TERM_KEY = ByteUtil.parse("0002");
static final byte[] LAST_APPLIED_TERM_KEY = ByteUtil.parse("0003");
static final byte[] LAST_APPLIED_INDEX_KEY = ByteUtil.parse("0004");
static final byte[] LAST_APPLIED_CONFIG_KEY = ByteUtil.parse("0005");
static final byte[] VOTED_FOR_KEY = ByteUtil.parse("0006");
static final byte[] FLIP_FLOP_KEY = ByteUtil.parse("0007");
// Prefix for all state machine key/value keys (we alternate between these to handle snapshot installs)
private static final byte[] STATE_MACHINE_PREFIXES = new byte[] { (byte)0x80, (byte)0x81 };
// Logging
final Logger log = LoggerFactory.getLogger(this.getClass());
// Configuration state (initialized to the DEFAULT_* constants where one exists)
Network network = new TCPNetwork(DEFAULT_TCP_PORT);
String identity;
int minElectionTimeout = DEFAULT_MIN_ELECTION_TIMEOUT;
int maxElectionTimeout = DEFAULT_MAX_ELECTION_TIMEOUT;
int heartbeatTimeout = DEFAULT_HEARTBEAT_TIMEOUT;
int maxTransactionDuration = DEFAULT_MAX_TRANSACTION_DURATION;
int commitTimeout = DEFAULT_COMMIT_TIMEOUT;
long maxUnappliedLogMemory = DEFAULT_MAX_UNAPPLIED_LOG_MEMORY;
int maxUnappliedLogEntries = DEFAULT_MAX_UNAPPLIED_LOG_ENTRIES;
int maxFollowerAckHeartbeats = DEFAULT_MAX_FOLLOWER_ACK_HEARTBEATS;
// NOTE(review): the Javadoc states that follower probing is enabled by default, but this field has no
// initializer and therefore starts out false - confirm whether it is set elsewhere (e.g., at startup)
boolean followerProbingEnabled;
boolean disableSync;
boolean dumpConflicts;
File logDir;
// Raft runtime state
Role role; // Raft state: LEADER, FOLLOWER, or CANDIDATE (the setters treat null as "not started")
SecureRandom random; // used to randomize election timeout, etc.
boolean flipflop; // determines which state machine prefix we are using
int clusterId; // cluster ID (zero if unconfigured - usually)
long currentTerm; // current Raft term (zero if unconfigured)
long currentTermStartTime; // timestamp of the start of the current Raft term
long commitIndex; // current Raft commit index (zero if unconfigured)
long keyWatchIndex; // index of last log entry that triggered key watches
long lastAppliedTerm; // key/value store last applied term (zero if unconfigured)
long lastAppliedIndex; // key/value store last applied index (zero if unconfigured)
final long[] appliedTerms = new long[MAX_APPLIED_TERMS]; // terms of log entries already applied to state machine
// NOTE(review): the collection fields below are declared with raw types; their type arguments
// appear to be missing - TODO confirm and restore the intended element/key/value types
final ArrayList raftLog = new ArrayList<>(); // unapplied log entries (empty if unconfigured)
Map lastAppliedConfig; // key/value store last applied config (empty if none)
Map currentConfig; // most recent cluster config (empty if unconfigured)
Map protocolVersionMap = new HashMap<>(); // peer message encoding protocol versions
// Non-Raft runtime state
AtomicKVStore kv;
FileChannel logDirChannel; // null on Windows - no support for sync'ing directories
String returnAddress; // return address for message currently being processed
IOThread ioThread; // performs background I/O tasks
ScheduledExecutorService serviceExecutor; // does stuff for us asynchronously
final HashSet transmitting = new HashSet<>(); // network addresses whose output queues are not empty
final LongMap openTransactions = new LongMap<>(); // transactions open on this instance
final LinkedHashSet pendingService = new LinkedHashSet<>(); // pending work for serviceExecutor
KeyWatchTracker keyWatchTracker; // instantiated on demand
boolean performingService; // true when serviceExecutor does not need to be woken up
boolean shuttingDown; // prevents new transactions from being created
Throwable lastInternalError; // most recent exception in service executor
// Configuration
/**
* Configure the {@link AtomicKVStore} in which local persistent state is stored.
*
*
* Required property.
*
* @param kvstore local persistent data store
* @throws IllegalStateException if this instance is already started
*/
public synchronized void setKVStore(AtomicKVStore kvstore) {
    // The store may only be swapped while no Raft role has been assigned, i.e., before startup
    if (this.role != null) {
        throw new IllegalStateException("already started");
    }
    this.kv = kvstore;
}
/**
* Configure the directory in which uncommitted log entries are stored.
*
*
* Required property.
*
* @param directory log directory
* @throws IllegalStateException if this instance is already started
*/
public synchronized void setLogDirectory(File directory) {
    // Reject the change once a Raft role has been assigned (instance already started)
    if (this.role != null) {
        throw new IllegalStateException("already started");
    }
    this.logDir = directory;
}
/**
* Get the directory in which uncommitted log entries are stored.
*
* @return configured log directory
*/
public synchronized File getLogDirectory() {
    // Read while holding this instance's monitor, like every other configuration accessor here
    final File directory = this.logDir;
    return directory;
}
/**
* Configure the {@link Network} to use for inter-node communication.
*
*
* By default, a {@link TCPNetwork} instance communicating on {@link #DEFAULT_TCP_PORT} is used.
*
* @param network network implementation; must not be {@linkplain Network#start started}
* @throws IllegalStateException if this instance is already started
*/
public synchronized void setNetwork(Network network) {
    // The network implementation is fixed once a Raft role has been assigned
    if (this.role != null) {
        throw new IllegalStateException("already started");
    }
    this.network = network;
}
/**
* Configure the Raft identity.
*
*
* Required property.
*
* @param identity unique Raft identity of this node in its cluster
* @throws IllegalStateException if this instance is already started
*/
public synchronized void setIdentity(String identity) {
    // The identity cannot change once a Raft role has been assigned
    if (this.role != null) {
        throw new IllegalStateException("already started");
    }
    this.identity = identity;
}
/**
* Get this node's Raft identity.
*
* @return the unique identity of this node in its cluster
*/
public synchronized String getIdentity() {
    // Snapshot the configured identity under the instance monitor
    final String id = this.identity;
    return id;
}
/**
* Configure the minimum election timeout.
*
*
* This must be set to a value greater than the {@linkplain #setHeartbeatTimeout heartbeat timeout}.
*
*
* Default is {@link #DEFAULT_MIN_ELECTION_TIMEOUT}.
*
*
* Warning: currently all nodes must have the same configured minimum election timeout,
* otherwise read-only transactions are not guaranteed to be completely up-to-date.
*
* @param timeout minimum election timeout in milliseconds
* @throws IllegalStateException if this instance is already started
* @throws IllegalArgumentException if {@code timeout <= 0}
*/
public synchronized void setMinElectionTimeout(int timeout) {
    // The argument is validated first, so a bad value is reported even on a started instance
    if (timeout <= 0) {
        throw new IllegalArgumentException("timeout <= 0");
    }
    // Only configurable before a Raft role has been assigned
    if (this.role != null) {
        throw new IllegalStateException("already started");
    }
    this.minElectionTimeout = timeout;
}
/**
* Get the configured minimum election timeout.
*
* @return minimum election timeout in milliseconds
*/
public synchronized int getMinElectionTimeout() {
    // Value is in milliseconds, exactly as stored by the setter
    final int millis = this.minElectionTimeout;
    return millis;
}
/**
* Configure the maximum election timeout.
*
*
* Default is {@link #DEFAULT_MAX_ELECTION_TIMEOUT}.
*
* @param timeout maximum election timeout in milliseconds
* @throws IllegalStateException if this instance is already started
* @throws IllegalArgumentException if {@code timeout <= 0}
*/
public synchronized void setMaxElectionTimeout(int timeout) {
    // Argument validation precedes the started check
    if (timeout <= 0) {
        throw new IllegalArgumentException("timeout <= 0");
    }
    // Only configurable before a Raft role has been assigned
    if (this.role != null) {
        throw new IllegalStateException("already started");
    }
    this.maxElectionTimeout = timeout;
}
/**
* Get the configured maximum election timeout.
*
* @return maximum election timeout in milliseconds
*/
public synchronized int getMaxElectionTimeout() {
    // Value is in milliseconds, exactly as stored by the setter
    final int millis = this.maxElectionTimeout;
    return millis;
}
/**
* Configure the heartbeat timeout.
*
*
* This must be set to a value less than the {@linkplain #setMinElectionTimeout minimum election timeout}.
*
*
* Default is {@link #DEFAULT_HEARTBEAT_TIMEOUT}.
*
* @param timeout heartbeat timeout in milliseconds
* @throws IllegalStateException if this instance is already started
* @throws IllegalArgumentException if {@code timeout <= 0}
*/
public synchronized void setHeartbeatTimeout(int timeout) {
    // Argument validation precedes the started check
    if (timeout <= 0) {
        throw new IllegalArgumentException("timeout <= 0");
    }
    // Only configurable before a Raft role has been assigned
    if (this.role != null) {
        throw new IllegalStateException("already started");
    }
    this.heartbeatTimeout = timeout;
}
/**
* Get the configured heartbeat timeout.
*
* @return heartbeat timeout in milliseconds
*/
public synchronized int getHeartbeatTimeout() {
    // Value is in milliseconds, exactly as stored by the setter
    final int millis = this.heartbeatTimeout;
    return millis;
}
/**
* Configure the maximum supported duration for outstanding transactions.
*
*
* This value may be changed while this instance is already running.
*
*
* Default is {@link #DEFAULT_MAX_TRANSACTION_DURATION}.
*
* @param duration maximum supported duration for outstanding transactions in milliseconds
* @throws IllegalArgumentException if {@code duration <= 0}
* @see #setMaxUnappliedLogMemory
*/
public synchronized void setMaxTransactionDuration(int duration) {
    // No started check here: only the value itself is validated
    if (duration <= 0) {
        throw new IllegalArgumentException("duration <= 0");
    }
    this.maxTransactionDuration = duration;
}
/**
* Get the configured maximum supported duration for outstanding transactions.
*
* @return maximum supported duration for outstanding transactions in milliseconds
*/
public synchronized int getMaxTransactionDuration() {
    // Value is in milliseconds, exactly as stored by the setter
    final int millis = this.maxTransactionDuration;
    return millis;
}
/**
* Configure the maximum allowed memory used for unapplied log entries.
*
*
* A higher value means higher transaction concurrency and that transactions may stay open longer without causing a
* {@link RetryTransactionException}, but at the cost of possibly slower data access.
*
*
* This memory measurement value is approximate.
*
*
* This value may be changed while this instance is already running.
*
*
* Default is {@link #DEFAULT_MAX_UNAPPLIED_LOG_MEMORY}.
*
* @param maxUnappliedLogMemory maximum allowed memory usage for unapplied log entries
* @throws IllegalArgumentException if {@code maxUnappliedLogMemory <= 0}
*/
public synchronized void setMaxUnappliedLogMemory(long maxUnappliedLogMemory) {
    // No started check here: only the value itself is validated
    if (maxUnappliedLogMemory <= 0) {
        throw new IllegalArgumentException("maxUnappliedLogMemory <= 0");
    }
    this.maxUnappliedLogMemory = maxUnappliedLogMemory;
}
/**
* Get the configured maximum allowed memory used for unapplied log entries.
*
* @return maximum allowed memory usage for unapplied log entries
*/
public synchronized long getMaxUnappliedLogMemory() {
    // Returns the limit exactly as stored by the setter
    final long limit = this.maxUnappliedLogMemory;
    return limit;
}
/**
* Configure the maximum number of unapplied log entries.
*
*
* A higher value means higher transaction concurrency and that transactions may stay open longer without causing a
* {@link RetryTransactionException}, but at the cost of possibly slower data access.
*
*
* This value may be changed while this instance is already running.
*
*
* Default is {@link #DEFAULT_MAX_UNAPPLIED_LOG_ENTRIES}.
*
* @param maxUnappliedLogEntries maximum number of unapplied log entries
* @throws IllegalArgumentException if {@code maxUnappliedLogEntries <= 0}
*/
public synchronized void setMaxUnappliedLogEntries(int maxUnappliedLogEntries) {
    // No started check here: only the value itself is validated
    if (maxUnappliedLogEntries <= 0) {
        throw new IllegalArgumentException("maxUnappliedLogEntries <= 0");
    }
    this.maxUnappliedLogEntries = maxUnappliedLogEntries;
}
/**
* Get the configured maximum number of unapplied log entries.
*
* @return maximum number of unapplied log entries
*/
public synchronized long getMaxUnappliedLogEntries() {
    // The limit is stored as an int; it is widened to long to match the declared return type
    final long limit = this.maxUnappliedLogEntries;
    return limit;
}
/**
* Configure the maximum number of heartbeat intervals a leader will wait for any follower to acknowledge
* receipt of a log entry before compacting it.
*
*
* Higher values may be needed when the network is lossy.
*
*
* This value may be changed while this instance is already running.
*
*
* Default is {@link #DEFAULT_MAX_FOLLOWER_ACK_HEARTBEATS}.
*
* @param maxFollowerAckHeartbeats maximum number of heartbeats for a leader to wait on a follower before compacting a log entry
* @throws IllegalArgumentException if {@code maxFollowerAckHeartbeats <= 0}
*/
public synchronized void setMaxFollowerAckHeartbeats(int maxFollowerAckHeartbeats) {
    // No started check here: only the value itself is validated
    if (maxFollowerAckHeartbeats <= 0) {
        throw new IllegalArgumentException("maxFollowerAckHeartbeats <= 0");
    }
    this.maxFollowerAckHeartbeats = maxFollowerAckHeartbeats;
}
/**
* Get the maximum number of heartbeat intervals a leader will wait for a follower to acknowledge
* receipt of a log entry before compacting it.
*
* @return maximum number of heartbeats for a leader to wait on a follower before compacting a log entry
*/
public synchronized long getMaxFollowerAckHeartbeats() {
    // Read the configured heartbeat count while holding this instance's monitor
    final long configuredHeartbeats = this.maxFollowerAckHeartbeats;
    return configuredHeartbeats;
}
/**
* Configure the default transaction commit timeout.
*
*
* This value determines how long transactions will wait once {@link RaftKVTransaction#commit commit()}
* is invoked for the commit to succeed before failing with a {@link RetryTransactionException}.
* This can be overridden on a per-transaction basis via {@link RaftKVTransaction#setTimeout}.
*
*
* This value may be changed while this instance is already running.
*
*
* Default is {@link #DEFAULT_COMMIT_TIMEOUT}.
*
* @param timeout transaction commit timeout in milliseconds, or zero for unlimited
* @throws IllegalArgumentException if {@code timeout} is negative
* @see RaftKVTransaction#setTimeout
*/
public synchronized void setCommitTimeout(int timeout) {
    // Zero is accepted; only negative values are rejected
    final boolean nonNegative = !(timeout < 0);
    Preconditions.checkArgument(nonNegative, "timeout < 0");
    this.commitTimeout = timeout;
}
/**
* Get the configured default transaction commit timeout.
*
* @return transaction commit timeout in milliseconds, or zero for unlimited
*/
public synchronized int getCommitTimeout() {
    // Read the default commit timeout while holding this instance's monitor
    final int configuredTimeout = this.commitTimeout;
    return configuredTimeout;
}
/**
* Configure whether followers should be required to probe for network connectivity with a majority of the
* cluster after an election timeout prior to becoming a candidate.
*
*
* This value may be changed at any time.
*
*
* The default is enabled.
*
* @param followerProbingEnabled true to enable, false to disable
*/
public synchronized void setFollowerProbingEnabled(boolean followerProbingEnabled) {
    // No validation needed; simply record the new flag value
    final boolean enabled = followerProbingEnabled;
    this.followerProbingEnabled = enabled;
}
/**
* Determine whether follower probing prior to becoming a candidate is enabled.
*
* @return true if follower probing is enabled, otherwise false
*/
public synchronized boolean isFollowerProbingEnabled() {
    // Read the flag while holding this instance's monitor
    final boolean enabled = this.followerProbingEnabled;
    return enabled;
}
/**
* Disable filesystem data sync.
*
*
* This gives higher performance in exchange for losing the guarantee of durability if the system crashes.
* Note: this feature is experimental and may violate consistency and/or durability guarantees.
*
*
* Default is false.
*
* @param disableSync true to disable data sync
*/
public synchronized void setDisableSync(boolean disableSync) {
    // No validation needed; simply record the new flag value
    final boolean syncDisabled = disableSync;
    this.disableSync = syncDisabled;
}
/**
* Determine whether filesystem sync is disabled.
*
* @return true if filesystem sync is disabled, otherwise false
*/
public synchronized boolean isDisableSync() {
    // Read the flag while holding this instance's monitor
    final boolean syncDisabled = this.disableSync;
    return syncDisabled;
}
/**
* Enable explicit logging of transaction conflicts.
*
*
* If enabled, when a transaction fails due to conflicts, the conflicting key ranges are logged.
*
*
* Default is false.
*
* @param dumpConflicts true to enable explicit logging of transaction conflicts
*/
public synchronized void setDumpConflicts(boolean dumpConflicts) {
    // No validation needed; simply record the new flag value
    final boolean enabled = dumpConflicts;
    this.dumpConflicts = enabled;
}
/**
* Determine whether explicit logging of transaction conflicts is enabled.
*
* @return true if explicit logging of transaction conflicts is enabled, otherwise false
*/
public synchronized boolean isDumpConflicts() {
    // Read the flag while holding this instance's monitor
    final boolean enabled = this.dumpConflicts;
    return enabled;
}
// Status
/**
* Retrieve the unique 32-bit ID for this node's cluster.
*
*
* A value of zero indicates an unconfigured system. Usually the reverse is true, though an unconfigured system
* can have a non-zero cluster ID in the rare case where an error occurred persisting the initial log entry.
*
* @return the unique ID of this node's cluster, or zero if this node is unconfigured
*/
public synchronized int getClusterId() {
    // Read the cluster ID while holding this instance's monitor
    final int id = this.clusterId;
    return id;
}
/**
* Retrieve the current cluster configuration as understood by this node.
*
*
* Configuration changes are performed and committed in the context of a normal transaction; see
* {@link RaftKVTransaction#configChange RaftKVTransaction.configChange()}.
*
*
* If this system is unconfigured, an empty map is returned (and vice-versa).
*
*
* The returned map is a copy; changes have no effect on this instance.
*
* @return current configuration mapping from node identity to network address,
* or empty if this node is not started or unconfigured
*/
public synchronized Map getCurrentConfig() {
    // No configuration currently held: hand back a fresh, empty map
    if (this.currentConfig == null)
        return new TreeMap<>();

    // Otherwise return a sorted copy so callers cannot affect our state
    return new TreeMap<>(this.currentConfig);
}
/**
* Determine whether this instance is configured.
*
*
* A node is configured if and only if it has at least one log entry. The first log entry always
* includes a configuration change that adds the node that created it to the (previously empty) cluster.
*
* @return true if this instance is started and configured, otherwise false
*/
public synchronized boolean isConfigured() {
    // Something has already been applied to the state machine
    if (this.lastAppliedIndex > 0)
        return true;

    // Otherwise we are configured only if there is at least one unapplied log entry
    return !this.raftLog.isEmpty();
}
/**
* Determine whether this node thinks that it is part of its cluster, as determined by its
* {@linkplain #getCurrentConfig current configuration}.
*
* @return true if this instance is started and part of the cluster, otherwise false
*/
public synchronized boolean isClusterMember() {
    // Delegate to the by-identity variant using our own identity
    final String self = this.identity;
    return this.isClusterMember(self);
}
/**
* Determine whether this node thinks that the specified node is part of the cluster, as determined by its
* {@linkplain #getCurrentConfig current configuration}.
*
* @param node node identity
* @return true if this instance is started and the specified node is part of the cluster, otherwise false
*/
public synchronized boolean isClusterMember(String node) {
    // With no current configuration held, nobody is considered a member
    return this.currentConfig != null && this.currentConfig.containsKey(node);
}
/**
* Get this instance's current role: leader, follower, or candidate.
*
* @return current {@link Role}, or null if not running
*/
public synchronized Role getCurrentRole() {
    // Read the role reference while holding this instance's monitor
    final Role current = this.role;
    return current;
}
/**
* Get this instance's current term.
*
* @return current term, or zero if not running
*/
public synchronized long getCurrentTerm() {
    // Read the term while holding this instance's monitor
    final long term = this.currentTerm;
    return term;
}
/**
* Get the time at which this instance's current term advanced to its current value.
*
* @return current term's start time in milliseconds since the epoch, or zero if unknown
*/
public synchronized long getCurrentTermStartTime() {
    // Read the timestamp while holding this instance's monitor
    final long startTime = this.currentTermStartTime;
    return startTime;
}
/**
* Get this instance's current commit index.
*
* @return current commit index, or zero if not running
*/
public synchronized long getCommitIndex() {
    // Read the commit index while holding this instance's monitor
    final long index = this.commitIndex;
    return index;
}
/**
* Get this instance's last applied log entry term.
*
* @return last applied term, or zero if not running
*/
public synchronized long getLastAppliedTerm() {
    // Read the last applied term while holding this instance's monitor
    final long term = this.lastAppliedTerm;
    return term;
}
/**
* Get this instance's last applied log entry index.
*
* @return last applied index, or zero if not running
*/
public synchronized long getLastAppliedIndex() {
    // Read the last applied index while holding this instance's monitor
    final long index = this.lastAppliedIndex;
    return index;
}
/**
* Get the unapplied {@link LogEntry}s in this instance's Raft log.
*
*
* The returned list is a copy; changes have no effect on this instance.
*
* @return unapplied log entries; or null if this instance is not running
*/
public synchronized List getUnappliedLog() {
    // No log list at all: report that with null rather than an empty list
    if (this.raftLog == null)
        return null;

    // Copy so that callers cannot modify our log
    return new ArrayList<>(this.raftLog);
}
/**
* Get the estimated total memory used by unapplied log entries.
*
* @return unapplied log entry memory usage, or zero if this instance is not running
*/
public synchronized long getUnappliedLogMemoryUsage() {
    // Sum the file size reported by each unapplied log entry
    return this.raftLog.stream()
      .mapToLong(LogEntry::getFileSize)
      .sum();
}
/**
* Get the open transactions associated with this database, as a list.
*
*
* The returned list is a copy; changes have no effect on this instance.
*
* @return all open transactions
*/
public List getOpenTransactions() {

    // Snapshot the open transactions while holding the lock. The method itself is intentionally not
    // synchronized: the original held the monitor twice (method modifier plus this block), which made
    // the inner block redundant and forced the sort below to run under the lock as well.
    final ArrayList<RaftKVTransaction> snapshot;
    synchronized (this) {
        snapshot = new ArrayList<>(this.openTransactions.values());
    }

    // Sort the private copy outside the lock
    Collections.sort(snapshot, RaftKVTransaction.SORT_BY_ID);
    return snapshot;
}
// Lifecycle
// Start this instance: verify configuration, start the k/v store, I/O thread, service executor and network,
// reload and validate the persisted Raft state, then assume the follower role.
//
// Does nothing if a role is already set. Throws IllegalStateException (via checkState) if a shutdown is in
// progress or required configuration is missing/inconsistent, IllegalArgumentException (via checkArgument) if
// the recovered state fails validation, and RuntimeException wrapping any IOException. If startup does not
// complete successfully, cleanup() is invoked before the exception propagates.
@Override
@PostConstruct
public synchronized void start() {
// Sanity check
assert this.checkState();
// A non-null role means we are already running
if (this.role != null)
return;
Preconditions.checkState(!this.shuttingDown, "shutdown in progress");
Preconditions.checkState(this.logDir != null, "no Raft log directory configured");
Preconditions.checkState(this.kv != null, "no Raft local persistence key/value store configured");
Preconditions.checkState(this.network != null, "no Raft network configured");
Preconditions.checkState(this.minElectionTimeout <= this.maxElectionTimeout, "minElectionTimeout > maxElectionTimeout");
Preconditions.checkState(this.heartbeatTimeout < this.minElectionTimeout, "heartbeatTimeout >= minElectionTimeout");
Preconditions.checkState(this.identity != null, "no Raft identity configured");
// Log
if (this.log.isDebugEnabled())
this.debug("starting " + this.getClass().getName() + " in directory " + this.logDir);
// Start up local database
// The "success" flag drives the finally block below: anything short of full startup triggers cleanup()
boolean success = false;
try {
// Create/verify log directory
if (!this.logDir.exists())
Files.createDirectories(this.logDir.toPath());
if (!this.logDir.isDirectory())
throw new IOException("file `" + this.logDir + "' is not a directory");
// Start k/v store
this.kv.start();
// Open directory containing log entry files so we have a way to fsync() it
assert this.logDirChannel == null;
try {
this.logDirChannel = FileChannel.open(this.logDir.toPath());
} catch (IOException e) {
// Failure to open the directory is tolerated only when isWindows() reports true; logDirChannel then stays null
if (!this.isWindows())
throw e;
}
// Create randomizer
assert this.random == null;
this.random = new SecureRandom();
// Start background I/O thread
assert this.ioThread == null;
final String ioThreadName = "Raft I/O [" + this.identity + "]";
this.ioThread = new IOThread(this.logDir, ioThreadName);
this.ioThread.start();
// Start up service executor thread
assert this.serviceExecutor == null;
final String serviceThreadName = "Raft Service [" + this.identity + "]";
this.serviceExecutor = Executors.newSingleThreadScheduledExecutor(action -> {
final Thread thread = new Thread(action);
thread.setName(serviceThreadName);
return thread;
});
// Start network
// Incoming messages and output-queue-empty notifications are forwarded to this instance's own handlers
this.network.start(new Network.Handler() {
@Override
public void handle(String sender, ByteBuffer buf) {
RaftKVDatabase.this.handle(sender, buf);
}
@Override
public void outputQueueEmpty(String address) {
RaftKVDatabase.this.outputQueueEmpty(address);
}
});
// Reload persistent raft info
this.clusterId = (int)this.decodeLong(CLUSTER_ID_KEY, 0);
this.currentTerm = this.decodeLong(CURRENT_TERM_KEY, 0);
// The term's start time is not reloaded; it is reset to "now" on every startup
this.currentTermStartTime = System.currentTimeMillis();
final String votedFor = this.decodeString(VOTED_FOR_KEY, null);
this.lastAppliedTerm = this.decodeLong(LAST_APPLIED_TERM_KEY, 0);
this.lastAppliedIndex = this.decodeLong(LAST_APPLIED_INDEX_KEY, 0);
Arrays.fill(this.appliedTerms, 0);
this.lastAppliedConfig = this.decodeConfig(LAST_APPLIED_CONFIG_KEY);
this.flipflop = this.decodeBoolean(FLIP_FLOP_KEY);
this.currentConfig = this.buildCurrentConfig();
// Reset protocol version info
this.protocolVersionMap.clear();
// If we crashed part way through a snapshot install, recover by discarding partial install
if (this.discardFlipFloppedStateMachine() && this.log.isDebugEnabled())
this.debug("detected partially applied snapshot install, discarding");
// Initialize commit index and key watch index
this.commitIndex = this.lastAppliedIndex;
this.keyWatchIndex = this.commitIndex;
// Reload outstanding log entries from disk
this.loadLog();
// Show recovered state
if (this.log.isDebugEnabled()) {
this.debug("recovered Raft state:"
+ "\n clusterId=" + (this.clusterId != 0 ? String.format("0x%08x", this.clusterId) : "none")
+ "\n currentTerm=" + this.currentTerm
+ "\n lastApplied=" + this.lastAppliedIndex + "t" + this.lastAppliedTerm
+ "\n lastAppliedConfig=" + this.lastAppliedConfig
+ "\n currentConfig=" + this.currentConfig
+ "\n votedFor=" + (votedFor != null ? "\"" + votedFor + "\"" : "nobody")
+ "\n log=" + this.raftLog);
}
// Validate recovered state
// Note: these checks throw IllegalArgumentException, which the finally block below answers with cleanup()
if (this.isConfigured()) {
Preconditions.checkArgument(this.clusterId != 0);
Preconditions.checkArgument(this.currentTerm > 0);
Preconditions.checkArgument(this.getLastLogTerm() > 0);
Preconditions.checkArgument(this.getLastLogIndex() > 0);
Preconditions.checkArgument(!this.currentConfig.isEmpty());
} else {
Preconditions.checkArgument(this.lastAppliedTerm == 0);
Preconditions.checkArgument(this.lastAppliedIndex == 0);
Preconditions.checkArgument(this.getLastLogTerm() == 0);
Preconditions.checkArgument(this.getLastLogIndex() == 0);
Preconditions.checkArgument(this.currentConfig.isEmpty());
Preconditions.checkArgument(this.raftLog.isEmpty());
}
// Start as follower (with unknown leader)
this.changeRole(new FollowerRole(this, null, null, votedFor));
// Done
this.info("successfully started " + this + " in directory " + this.logDir);
success = true;
} catch (IOException e) {
throw new RuntimeException("error starting up database", e);
} finally {
// Undo any partial startup
if (!success)
this.cleanup();
}
// Sanity check
assert this.checkState();
}
// Stop this instance: fail any open transactions, wait (up to 5 seconds) for them to be cleaned up, shut down
// the service executor and I/O thread, then reset state via cleanup().
//
// Does nothing if no role is set or a shutdown is already in progress. This method is not itself synchronized;
// the monitor is held only for the first and last phases, and the executor and I/O thread are shut down
// in between without it.
@Override
@PreDestroy
public void stop() {
// Set flag to prevent new transactions
synchronized (this) {
// Sanity check
assert this.checkState();
if (this.role == null || this.shuttingDown)
return;
// Set shutting down flag
this.info("starting shutdown of " + this);
this.shuttingDown = true;
// Fail all remaining open transactions
// Iterate over a copy: fail() can reach cleanupTransaction(), which removes entries from openTransactions
for (RaftKVTransaction tx : new ArrayList<>(this.openTransactions.values())) {
switch (tx.getState()) {
case EXECUTING:
case COMMIT_READY:
case COMMIT_WAITING:
this.fail(tx, new KVTransactionException(tx, "database shutdown"));
break;
case COMPLETED:
break;
default:
assert false;
break;
}
}
// Sleep while we wait for transactions to clean themselves up
// cleanupTransaction() calls notify() on this instance while shuttingDown is set
try {
if (!TimedWait.wait(this, 5000, this.openTransactions::isEmpty))
this.warn("open transactions not cleaned up during shutdown");
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
}
// Shut down the service executor and wait for pending tasks to finish
this.serviceExecutor.shutdownNow();
try {
this.serviceExecutor.awaitTermination(1000, TimeUnit.MILLISECONDS);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
// Shutdown I/O thread
this.ioThread.shutdown();
try {
this.ioThread.join();
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
// Final cleanup
synchronized (this) {
// Null these out first so that cleanup() skips its own executor/thread shutdown steps
this.serviceExecutor = null;
this.ioThread = null;
this.cleanup();
}
// Done
this.info("completed shutdown of " + this);
}
/**
* Get the exception most recently thrown by the internal service thread, if any.
* This is used mainly during testing.
*
* @return most recent service exception, or null if none
*/
public synchronized Throwable getLastInternalError() {
    // Read the recorded error (possibly null) while holding this instance's monitor
    final Throwable mostRecent = this.lastInternalError;
    return mostRecent;
}
// Release all runtime resources and reset in-memory state to "not running" values.
// Invoked from stop() and from start() when startup fails. The caller must hold this instance's monitor,
// and (per the assertion) no transactions may remain open.
private void cleanup() {
assert Thread.holdsLock(this);
assert this.openTransactions.isEmpty();
// Shut down the current role, if any; a null role is what marks this instance as not running
if (this.role != null) {
this.role.shutdown();
this.role = null;
}
// Shut down the service executor if still present, waiting up to one second for it to terminate
if (this.serviceExecutor != null) {
this.serviceExecutor.shutdownNow();
try {
this.serviceExecutor.awaitTermination(1000, TimeUnit.MILLISECONDS);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
this.serviceExecutor = null;
}
// Shut down the I/O thread if still present and wait for it to exit
if (this.ioThread != null) {
this.ioThread.shutdown();
try {
this.ioThread.join();
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
this.ioThread = null;
}
// Stop the k/v store, close the log directory channel, and stop the network
this.kv.stop();
Util.closeIfPossible(this.logDirChannel);
this.logDirChannel = null;
this.raftLog.clear();
this.random = null;
this.network.stop();
// Reset in-memory Raft state
this.currentTerm = 0;
this.currentTermStartTime = 0;
this.commitIndex = 0;
this.keyWatchIndex = 0;
this.clusterId = 0;
this.lastAppliedTerm = 0;
this.lastAppliedIndex = 0;
Arrays.fill(this.appliedTerms, 0);
this.lastAppliedConfig = null;
this.currentConfig = null;
this.protocolVersionMap.clear();
// Close the key watch tracker, if one was ever created
if (this.keyWatchTracker != null) {
this.keyWatchTracker.close();
this.keyWatchTracker = null;
}
this.transmitting.clear();
this.pendingService.clear();
// Clearing this flag allows start() to be invoked again
this.shuttingDown = false;
}
/**
* Initialize our in-memory state from the persistent state reloaded from disk.
* This is invoked on initial startup.
*/
private void loadLog() throws IOException {

    // Sanity check
    assert Thread.holdsLock(this);
    assert this.raftLog.isEmpty();

    // Scan for log entry files
    this.raftLog.clear();
    try (DirectoryStream<Path> files = Files.newDirectoryStream(this.logDir.toPath())) {
        for (Path path : files) {
            final File file = path.toFile();

            // Ignore sub-directories (typically owned by the underlying k/v store)
            if (file.isDirectory())
                continue;

            // Is this a log entry file?
            if (LogEntry.LOG_FILE_PATTERN.matcher(file.getName()).matches()) {
                if (this.log.isDebugEnabled())
                    this.debug("recovering log file " + file.getName());
                final LogEntry logEntry = LogEntry.fromFile(file);
                this.raftLog.add(logEntry);
                continue;
            }

            // Is this a leftover temporary file?
            if (TEMP_FILE_PATTERN.matcher(file.getName()).matches()) {
                if (this.log.isDebugEnabled())
                    this.debug("deleting leftover temporary file " + file.getName());
                this.deleteFile(file, "leftover temporary file");
                continue;
            }

            // Unknown
            this.warn("ignoring unrecognized file " + file.getName() + " in my log directory");
        }
    }

    // Verify we have a contiguous range of log entries starting from the snapshot index; discard bogus log files
    Collections.sort(this.raftLog, LogEntry.SORT_BY_INDEX);
    long lastTermSeen = this.lastAppliedTerm;
    long expectedIndex = this.lastAppliedIndex + 1;
    for (Iterator<LogEntry> i = this.raftLog.iterator(); i.hasNext(); ) {
        final LogEntry logEntry = i.next();

        // An entry is bogus if its term goes backwards or its index is not the next one expected
        String error = null;
        if (logEntry.getTerm() < lastTermSeen)
            error = "term " + logEntry.getTerm() + " < last applied term " + lastTermSeen;
        else if (logEntry.getIndex() < this.lastAppliedIndex)
            error = "index " + logEntry.getIndex() + " < last applied index " + this.lastAppliedIndex;
        else if (logEntry.getIndex() != expectedIndex)
            error = "index " + logEntry.getIndex() + " != expected index " + expectedIndex;

        // Discard bogus entries (file and in-memory); otherwise advance what we expect to see next
        if (error != null) {
            this.warn("deleting bogus log file " + logEntry.getFile().getName() + ": " + error);
            this.deleteFile(logEntry.getFile(), "bogus log file");
            i.remove();
        } else {
            expectedIndex++;
            lastTermSeen = logEntry.getTerm();
        }
    }
    if (this.log.isDebugEnabled()) {
        this.debug("recovered " + this.raftLog.size() + " log entries: " + this.raftLog
          + " (" + this.getUnappliedLogMemoryUsage() + " total bytes)");
    }

    // Rebuild current configuration
    this.currentConfig = this.buildCurrentConfig();
}
/**
* Reconstruct the current config by starting with the last applied config and applying
* configuration deltas from unapplied log entries.
*/
Map buildCurrentConfig() {

    // Seed a private copy from the last applied configuration
    final HashMap result = new HashMap<>(this.lastAppliedConfig);

    // Layer on the configuration change (if any) carried by each entry still in the Raft log, in log order
    final Iterator<LogEntry> entries = this.raftLog.iterator();
    while (entries.hasNext())
        entries.next().applyConfigChange(result);

    return result;
}
// Key Watches
synchronized ListenableFuture watchKey(RaftKVTransaction tx, byte[] key) {

    // Must be running, and the transaction must pass its own "still executing" check
    Preconditions.checkState(this.role != null, "not started");
    tx.verifyExecuting();

    // The tracker is created lazily, on the first watch request
    if (this.keyWatchTracker == null) {
        this.keyWatchTracker = new KeyWatchTracker();
    }
    return this.keyWatchTracker.register(key);
}
// Transactions
/**
* Create a new transaction.
*
*
* Equivalent to: {@link #createTransaction(Consistency) createTransaction}{@code (}{@link Consistency#LINEARIZABLE}{@code )}.
*
* @throws IllegalStateException if this instance is not {@linkplain #start started} or in the process of shutting down
*/
@Override
public RaftKVTransaction createTransaction() {
    // With no consistency specified, use LINEARIZABLE
    final Consistency defaultConsistency = Consistency.LINEARIZABLE;
    return this.createTransaction(defaultConsistency);
}
@Override
public RaftKVTransaction createTransaction(Map options) {
// Any options?
if (options == null)
return this.createTransaction(Consistency.LINEARIZABLE);
// Look for options from the JSimpleDBTransactionManager
Consistency consistency = null;
Object isolation = options.get("org.springframework.transaction.annotation.Isolation");
if (isolation instanceof Enum)
isolation = ((Enum>)isolation).name();
if (isolation != null) {
switch (isolation.toString()) {
case "READ_UNCOMMITTED":
consistency = Consistency.UNCOMMITTED;
break;
case "READ_COMMITTED":
consistency = Consistency.EVENTUAL_COMMITTED;
break;
case "REPEATABLE_READ":
consistency = Consistency.EVENTUAL;
break;
case "SERIALIZABLE":
consistency = Consistency.LINEARIZABLE;
break;
default:
break;
}
}
// Look for OPTION_CONSISTENCY option
try {
final Object value = options.get(OPTION_CONSISTENCY);
if (value instanceof Consistency)
consistency = (Consistency)value;
else if (value instanceof String)
consistency = Consistency.valueOf((String)value);
} catch (Exception e) {
// ignore
}
// Configure consistency level
return this.createTransaction(consistency != null ? consistency : Consistency.LINEARIZABLE);
}
/**
* Create a new transaction with the specified consistency.
*
*
* Transactions that wish to use {@link Consistency#EVENTUAL_COMMITTED} must be created using this method,
* because the log entry on which the transaction is based is determined at creation time.
*
* @param consistency consistency level
* @return newly created transaction
* @throws IllegalArgumentException if {@code consistency} is null
* @throws IllegalStateException if this instance is not {@linkplain #start started} or in the process of shutting down
*/
public synchronized RaftKVTransaction createTransaction(Consistency consistency) {

    // Sanity check
    assert this.checkState();

    // A null consistency is a bad argument (IllegalArgumentException, as documented), not a bad state
    Preconditions.checkArgument(consistency != null, "null consistency");
    Preconditions.checkState(this.role != null, "not started");
    Preconditions.checkState(!this.shuttingDown, "shutting down");

    // Base transaction on the current commit index if the consistency level is based on a committed log entry,
    // otherwise on the most recent log entry (requested by passing -1). The latter is itself a form of optimistic
    // locking: we assume that the most recent log entry has a high probability of being committed (in the Raft
    // sense), which is of course required in order to commit any transaction based on it.
    final MostRecentView view = new MostRecentView(this, consistency.isBasedOnCommittedLogEntry() ? this.commitIndex : -1);
    final long baseTerm = view.getTerm();
    final long baseIndex = view.getIndex();

    // Create transaction
    final RaftKVTransaction tx = new RaftKVTransaction(this,
      consistency, baseTerm, baseIndex, view.getSnapshot(), view.getView());
    tx.setTimeout(this.commitTimeout);
    this.openTransactions.put(tx.txId, tx);

    // Set commit term+index if already known
    switch (consistency) {
    case UNCOMMITTED:
        tx.setCommittable();
        break;
    case EVENTUAL_COMMITTED:
        tx.setCommitInfo(baseTerm, baseIndex, null);
        tx.setCommittable();
        break;
    case EVENTUAL:
        tx.setCommitInfo(baseTerm, baseIndex, null);
        this.role.checkCommittable(tx);
        break;
    case LINEARIZABLE:
        break;
    default:
        assert false;
        break;
    }

    // Done
    if (this.log.isDebugEnabled())
        this.debug("created new transaction " + tx);
    return tx;
}
/**
* Commit a transaction.
*/
void commit(final RaftKVTransaction tx) {
    try {

        // Mark transaction as "commit ready" - service thread will do the rest
        synchronized (this) {

            // Sanity check
            assert this.checkState();
            assert this.role != null;

            // Check tx state
            switch (tx.getState()) {
            case EXECUTING:

                // Transition to COMMIT_READY state
                if (this.log.isDebugEnabled())
                    this.debug("committing transaction " + tx);
                tx.setState(TxState.COMMIT_READY);
                this.requestService(new CheckReadyTransactionService(this.role, tx));

                // From this point on, throw a StaleTransactionException if accessed, instead of retry exception or whatever
                tx.setFailure(null);

                // Setup commit timer (a timeout of zero means no timer is created)
                if (tx.getTimeout() != 0) {
                    final Timer commitTimer = new Timer(this, "commit timer for " + tx,
                      new Service("commit timeout for tx#" + tx.txId) {
                        @Override
                        public void run() {
                            // Only fail the transaction if it is still waiting to be committed
                            switch (tx.getState()) {
                            case COMMIT_READY:
                            case COMMIT_WAITING:
                                RaftKVDatabase.this.fail(tx, new RetryTransactionException(tx,
                                  "transaction failed to complete within " + tx.getTimeout()
                                  + "ms (in state " + tx.getState() + ")"));
                                break;
                            default:
                                break;
                            }
                        }
                    });
                    commitTimer.timeoutAfter(tx.getTimeout());
                    tx.setCommitTimer(commitTimer);
                }
                break;
            case CLOSED:                                        // this transaction has already been committed or rolled back
                try {
                    tx.verifyExecuting();                       // always throws some kind of exception
                } finally {
                    tx.setFailure(null);                        // from now on, throw StaleTransactionException if accessed
                }
                assert false;
                return;
            default:                                            // another thread is already doing the commit
                this.warn("simultaneous commit()'s requested for " + tx + " by two different threads");
                break;
            }
        }

        // Wait for completion (outside the lock, so the service thread can make progress)
        try {
            tx.getCommitFuture().get();
        } catch (InterruptedException e) {
            // Restore the interrupt status before converting the exception, so the interruption is not
            // lost to code further up the stack (matching the other interrupt handlers in this class)
            Thread.currentThread().interrupt();
            throw new RetryTransactionException(tx, "thread interrupted while waiting for commit", e);
        } catch (ExecutionException e) {
            final Throwable cause = e.getCause();
            ThrowableUtil.prependCurrentStackTrace(cause);
            Throwables.throwIfUnchecked(cause);
            throw new KVTransactionException(tx, "commit failed", cause);      // should never get here
        }
    } finally {
        // Whatever the outcome, the transaction ends up cleaned up and CLOSED
        this.cleanupTransaction(tx);
    }
}
/**
* Rollback a transaction.
*/
synchronized void rollback(RaftKVTransaction tx) {

    // Sanity check
    assert this.checkState();
    assert this.role != null;

    // Clear any recorded failure so later access reports a stale transaction rather than a retry or other error
    tx.setFailure(null);

    // What happens next depends on where the transaction is in its lifecycle
    switch (tx.getState()) {
    case CLOSED:

        // Already committed or rolled back; nothing to do
        break;
    case EXECUTING:

        // Normal case: discard the transaction
        if (this.log.isDebugEnabled()) {
            this.debug("rolling back transaction " + tx);
        }
        this.cleanupTransaction(tx);
        break;
    default:

        // Some other thread is in the middle of committing this transaction
        this.warn("simultaneous commit() and rollback() requested for " + tx + " by two different threads");
        break;
    }
}
// Clean up transaction and transition to state CLOSED
synchronized void cleanupTransaction(RaftKVTransaction tx) {

    // Trace
    if (this.log.isTraceEnabled()) {
        this.trace("cleaning up transaction " + tx);
    }

    // Let the current role (if any) release whatever it holds for this transaction
    if (this.role != null) {
        this.role.cleanupForTransaction(tx);
    }

    // Stop the commit timer, if one was set up
    final Timer commitTimer = tx.getCommitTimer();
    if (commitTimer != null) {
        commitTimer.cancel();
    }

    // Forget the transaction and move it to its final state
    this.openTransactions.remove(tx.txId);
    tx.setState(TxState.CLOSED);
    tx.setNoLongerRebasable();

    // During shutdown, wake up a thread waiting on this instance
    if (this.shuttingDown) {
        this.notify();
    }
}
// Mark a transaction as having succeeded; it must be in COMMIT_READY or COMMIT_WAITING
void succeed(RaftKVTransaction tx) {

    // Caller must hold our lock, we must be running, and the transaction must be awaiting commit
    assert Thread.holdsLock(this);
    assert this.role != null;
    assert tx.getState().equals(TxState.COMMIT_READY) || tx.getState().equals(TxState.COMMIT_WAITING);

    // Log it
    if (this.log.isDebugEnabled()) {
        this.debug("successfully committed " + tx);
    }

    // Complete the commit future normally, then mark the transaction completed
    tx.getCommitFuture().set(null);
    tx.setState(TxState.COMPLETED);
    tx.setNoLongerRebasable();

    // Let the current role release whatever it holds for this transaction
    this.role.cleanupForTransaction(tx);
}
// Mark a transaction as having failed
void fail(RaftKVTransaction tx, KVTransactionException e) {

    // Caller must hold our lock, we must be running, and there must be an actual failure
    assert Thread.holdsLock(this);
    assert this.role != null;
    assert e != null;

    // Log it
    if (this.log.isDebugEnabled()) {
        this.debug("failing transaction " + tx + ": " + e);
    }

    // How the failure is delivered depends on the transaction's state
    switch (tx.getState()) {
    case COMMIT_READY:
    case COMMIT_WAITING:

        // Deliver the failure through the commit future
        tx.getCommitFuture().setException(e);
        tx.setState(TxState.COMPLETED);
        tx.setNoLongerRebasable();
        this.role.cleanupForTransaction(tx);
        return;
    case EXECUTING:

        // Record the failure on the transaction and close it
        assert tx.getFailure() == null;
        tx.setFailure(e);
        this.cleanupTransaction(tx);
        return;
    default:

        // Any other state: too late, nobody cares
        return;
    }
}
// Service
/**
* Request service to be invoked after the current service (if any) completes.
*
*
* If {@code service} has an associated {@link Role}, and that {@link Role} is no longer active
* when the service is handled, nothing will be done.
*
* @param service the service to perform
*/
void requestService(Service service) {
    assert Thread.holdsLock(this);
    assert service != null;

    // Queue the request; if it was already queued, there is nothing more to do
    final boolean newlyQueued = this.pendingService.add(service);
    if (!newlyQueued)
        return;

    // If the pending-service loop is currently running, it will pick up the new request itself
    if (this.performingService)
        return;

    // Otherwise hand a task to the service executor to process what is pending
    try {
        this.serviceExecutor.submit(() -> {
            try {
                this.handlePendingService();
            } catch (Throwable failure) {
                RaftKVDatabase.this.error("exception in handlePendingService()", failure);
                this.lastInternalError = failure;
            }
        });
    } catch (RejectedExecutionException rejection) {

        // Rejection is reported only when we are not shutting down
        if (!this.shuttingDown) {
            this.warn("service executor task rejected, skipping", rejection);
            this.lastInternalError = rejection;
        }
    }
}
// Performs pending service requests (do not invoke directly)
private synchronized void handlePendingService() {

    // Sanity check
    assert this.checkState();

    // Nothing to do if we are not running
    if (this.role == null)
        return;

    // While there is work to do, do it. The flag tells requestService() not to schedule another task:
    // requests added while we are running are simply picked up by this loop.
    this.performingService = true;
    try {
        while (!this.pendingService.isEmpty()) {

            // Take (and remove) the next pending service; a fresh iterator is obtained on every pass
            final Iterator<Service> i = this.pendingService.iterator();
            final Service service = i.next();
            i.remove();
            assert service != null;
            assert service.getRole() == null || service.getRole() == this.role;
            if (this.log.isTraceEnabled())
                this.trace("SERVICE [" + service + "] in " + this.role);

            // Run it; a failing service is logged and recorded but does not stop the loop
            try {
                service.run();
            } catch (Throwable t) {
                RaftKVDatabase.this.error("exception in " + service, t);
                this.lastInternalError = t;
            }
        }
    } finally {
        this.performingService = false;
    }
}
// Raft state
/**
* Discard all key/value pairs in the "flip-flopped" state machine, i.e., the one that we are not currently using.
*
* @return true if there was anything to remove, otherwise false
*/
boolean discardFlipFloppedStateMachine() {

    // Key prefix of the state machine we are NOT currently using
    final byte[] unusedPrefix = this.getFlipFloppedStateMachinePrefix();

    // Probe for at least one key/value pair under that prefix
    final boolean hasLeftovers;
    try (final CloseableIterator i = this.kv.getRange(KeyRange.forPrefix(unusedPrefix))) {
        hasLeftovers = i.hasNext();
    }

    // Nothing there: nothing to remove
    if (!hasLeftovers)
        return false;

    // Remove everything under the prefix
    this.kv.removeRange(unusedPrefix, ByteUtil.getKeyAfterPrefix(unusedPrefix));
    return true;
}
/**
* Perform a state machine flip-flop operation. Normally this would happen after a successful snapshot install.
*/
boolean flipFlopStateMachine(long term, long index, Map config) {

    // Sanity check
    assert Thread.holdsLock(this);
    assert term >= 0;
    assert index >= 0;
    if (this.log.isDebugEnabled())
        this.debug("performing state machine flip-flop to " + index + "t" + term + " with config " + config);

    // A null config is treated as an empty one
    if (config == null)
        config = new HashMap<>(0);

    // Prepare updates
    final Writes writes = new Writes();
    writes.getPuts().put(LAST_APPLIED_TERM_KEY, LongEncoder.encode(term));
    writes.getPuts().put(LAST_APPLIED_INDEX_KEY, LongEncoder.encode(index));
    writes.getPuts().put(LAST_APPLIED_CONFIG_KEY, this.encodeConfig(config));
    writes.getPuts().put(FLIP_FLOP_KEY, this.encodeBoolean(!this.flipflop));

    // Update persistent store; on failure report it and leave all in-memory state untouched
    try {
        this.kv.mutate(writes, true);
    } catch (Exception e) {
        this.error("flip-flop error updating key/value store term/index to " + index + "t" + term, e);
        return false;
    }

    // Delete all unapplied log files (no longer applicable); failure to list the directory is logged and ignored
    this.raftLog.clear();
    try (DirectoryStream<Path> files = Files.newDirectoryStream(this.logDir.toPath())) {
        for (Path path : files) {
            final File file = path.toFile();
            if (LogEntry.LOG_FILE_PATTERN.matcher(file.getName()).matches())
                this.deleteFile(file, "unapplied log file");
        }
    } catch (IOException e) {
        this.error("error deleting unapplied log files in " + this.logDir + " (ignoring)", e);
    }

    // Update in-memory copy of persistent state
    this.flipflop = !this.flipflop;
    this.lastAppliedTerm = term;
    this.lastAppliedIndex = index;
    Arrays.fill(this.appliedTerms, 0);
    this.lastAppliedConfig = config;
    this.commitIndex = this.lastAppliedIndex;

    // Rebuild the current config, remembering the previous one so a change can be reported
    final TreeMap previousConfig = new TreeMap<>(this.currentConfig);
    this.currentConfig = this.buildCurrentConfig();
    if (!this.currentConfig.equals(previousConfig))
        this.info("apply new cluster configuration after snapshot install: " + this.currentConfig);

    // Discard the flip-flopped state machine
    this.discardFlipFloppedStateMachine();

    // Trigger key watches
    this.requestService(this.role.triggerKeyWatchesService);

    // Done
    return true;
}
/**
* Update and persist a new current term.
*/
boolean advanceTerm(long newTerm) {

    // Caller must hold our lock, and the term may only move forward
    assert Thread.holdsLock(this);
    assert newTerm > this.currentTerm;
    if (this.log.isDebugEnabled()) {
        this.debug("advancing current term from " + this.currentTerm + " -> " + newTerm);
    }

    // Persist the new term and remove the stored vote, in a single mutation
    final Writes termUpdate = new Writes();
    termUpdate.getPuts().put(CURRENT_TERM_KEY, LongEncoder.encode(newTerm));
    termUpdate.getRemoves().add(new KeyRange(VOTED_FOR_KEY));
    try {
        this.kv.mutate(termUpdate, true);
    } catch (Exception failure) {
        this.error("error persisting new term " + newTerm, failure);
        return false;
    }

    // Only after persisting successfully do we update the in-memory copy
    this.currentTerm = newTerm;
    this.currentTermStartTime = System.currentTimeMillis();
    return true;
}
/**
* Join the specified cluster and persist the specified cluster ID.
*
* @param newClusterId cluster ID; must not be zero
* @return true if successful, false if an error occurred
* @throws IllegalStateException if this node is already part of some cluster
* @throws IllegalArgumentException if {@code newClusterId} is zero
*/
boolean joinCluster(int newClusterId) {

    // Caller must hold our lock; the new ID must be non-zero and we must not already have one
    assert Thread.holdsLock(this);
    Preconditions.checkArgument(newClusterId != 0);
    Preconditions.checkState(this.clusterId == 0);

    // Announce it
    this.info("joining cluster with ID " + String.format("0x%08x", newClusterId));

    // Persist the new cluster ID
    final Writes clusterIdUpdate = new Writes();
    clusterIdUpdate.getPuts().put(CLUSTER_ID_KEY, LongEncoder.encode(newClusterId));
    try {
        this.kv.mutate(clusterIdUpdate, true);
    } catch (Exception failure) {
        this.error("error updating key/value store with new cluster ID", failure);
        return false;
    }

    // Only after persisting successfully do we update the in-memory copy
    this.clusterId = newClusterId;
    return true;
}
/**
 * Get the key prefix of the state machine currently in use (i.e., the one that is not flip-flopped).
 *
 * @return newly allocated single-byte prefix
 */
byte[] getStateMachinePrefix() {
    final boolean flipFlopped = false;
    return this.getStateMachinePrefix(flipFlopped);
}
/**
 * Get the key prefix of the flip-flopped state machine, i.e., the one other than the one currently in use.
 *
 * @return newly allocated single-byte prefix
 */
byte[] getFlipFloppedStateMachinePrefix() {
    final boolean flipFlopped = true;
    return this.getStateMachinePrefix(flipFlopped);
}
// Select one of the two entries in STATE_MACHINE_PREFIXES: index 1 when exactly one of the
// requested "flip-flopped" flag and the current flipflop state is set, otherwise index 0
private byte[] getStateMachinePrefix(boolean flipFlopped) {
    final int which = flipFlopped != this.flipflop ? 1 : 0;
    final byte[] prefix = new byte[1];
    prefix[0] = STATE_MACHINE_PREFIXES[which];
    return prefix;
}
/**
 * Set the Raft role.
 *
 * <p>
 * Any previous role is shut down first, and every pending service that reports a non-null
 * {@linkplain Service#getRole role} is discarded before the new role is set up.
 *
 * @param role new role; asserted to be non-null
 */
void changeRole(Role role) {

    // Sanity check
    assert Thread.holdsLock(this);
    assert role != null;

    // Shutdown previous role (if any) and drop the pending services that have a role
    if (this.role != null) {
        this.role.shutdown();
        for (Iterator<Service> i = this.pendingService.iterator(); i.hasNext(); ) {
            final Service service = i.next();
            if (service.getRole() != null)
                i.remove();
        }
    }

    // Setup new role
    this.role = role;
    this.role.setup();
    if (this.log.isDebugEnabled())
        this.debug("changing role to " + role);

    // Check state
    assert this.checkState();
}
/**
 * Append a new entry to the end of the Raft log.
 *
 * <p>
 * The entry's temporary file is atomically renamed to the new log entry's file, after which the log
 * directory is sync'd (when a directory channel exists and syncing is not disabled). Only then is the
 * entry added to the in-memory log and any configuration change it carries applied to the current config.
 *
 * @param term new log entry term
 * @param newLogEntry entry to add; the {@linkplain NewLogEntry#getTempFile temporary file} must be already
 *  durably persisted, and will be renamed
 * @return new {@link LogEntry}
 * @throws Exception if an error occurs
 */
LogEntry appendLogEntry(long term, NewLogEntry newLogEntry) throws Exception {

    // Preconditions
    assert Thread.holdsLock(this);
    assert this.role != null;
    assert newLogEntry != null;

    // Gather what we need from the incoming entry
    final LogEntry.Data entryData = newLogEntry.getData();
    final File pendingFile = newLogEntry.getTempFile();
    final long length = Util.getLength(pendingFile);

    // The new entry goes immediately after the current last entry
    final long newIndex = this.getLastLogIndex() + 1;
    final LogEntry appended = new LogEntry(term, newIndex, this.logDir, entryData, length);
    if (this.log.isDebugEnabled())
        this.debug("adding new log entry " + appended + " using " + pendingFile.getName());

    // Move the file into place atomically, then sync the directory so the rename itself is durable
    Files.move(pendingFile.toPath(), appended.getFile().toPath(), StandardCopyOption.ATOMIC_MOVE);
    if (this.logDirChannel != null && !this.disableSync)
        this.logDirChannel.force(true);

    // The temporary file is gone now, so nobody should try to delete it later
    newLogEntry.resetTempFile();

    // Publish the entry in the in-memory log
    this.raftLog.add(appended);

    // Apply any configuration change carried by the entry
    if (appended.applyConfigChange(this.currentConfig))
        this.info("applying new cluster configuration from log entry " + appended + ": " + this.currentConfig);

    return appended;
}
// Get the index of the last log entry: the last applied index plus the number of unapplied entries
long getLastLogIndex() {
    assert Thread.holdsLock(this);
    final int numUnapplied = this.raftLog.size();
    return this.lastAppliedIndex + numUnapplied;
}
// Get the term of the last log entry (or the last applied term, if there are no unapplied entries)
long getLastLogTerm() {
    assert Thread.holdsLock(this);
    final long lastIndex = this.getLastLogIndex();
    return this.getLogTermAtIndex(lastIndex);
}
// Get the term at the specified index, which must be either the last applied index
// or the index of some not-yet-applied log entry
long getLogTermAtIndex(long index) {
    assert Thread.holdsLock(this);
    assert index >= this.lastAppliedIndex;
    assert index <= this.getLastLogIndex();
    if (index == this.lastAppliedIndex)
        return this.lastAppliedTerm;
    return this.getLogEntryAtIndex(index).getTerm();
}
// Look up a not-yet-applied log entry by its log index; the in-memory log's
// first element corresponds to log index lastAppliedIndex + 1
LogEntry getLogEntryAtIndex(long index) {
    assert Thread.holdsLock(this);
    assert index > this.lastAppliedIndex;
    assert index <= this.getLastLogIndex();
    final int offset = (int)(index - this.lastAppliedIndex - 1);
    return this.raftLog.get(offset);
}
// Advance the last applied index by one, recording the given term as the new last applied term.
// The outgoing (index, term) pair is first saved in the appliedTerms array at slot (index mod size).
void incrementLastAppliedIndex(long term) {
    assert Thread.holdsLock(this);
    final int slot = (int)(this.lastAppliedIndex % MAX_APPLIED_TERMS);
    this.appliedTerms[slot] = this.lastAppliedTerm;
    this.lastAppliedIndex += 1;
    this.lastAppliedTerm = term;
}
// Get the term of an already-applied log entry, if known, otherwise zero.
// Terms are remembered only for the most recent MAX_APPLIED_TERMS applied indexes; anything older
// is reported as unknown. A negative index can never be known, and must not reach the array lookup
// because Java's % operator would then produce a negative (out of bounds) slot.
long getAppliedLogEntryTerm(long index) {
    assert index < this.lastAppliedIndex;
    if (index < 0 || index < this.lastAppliedIndex - MAX_APPLIED_TERMS)
        return 0;
    return this.appliedTerms[(int)(index % MAX_APPLIED_TERMS)];
}
// Get the term at the given log index, whether already applied or not, if known, otherwise zero.
// Indexes before the last applied index are answered from the record of applied terms.
long getLogTermAtIndexIfKnown(long index) {
    if (index < this.lastAppliedIndex)
        return this.getAppliedLogEntryTerm(index);
    return this.getLogTermAtIndex(index);
}
// Object
// Summarize identity, log directory, term, commit index, last applied index/term, log, and role
@Override
public synchronized String toString() {
    final String quotedIdentity = this.identity != null ? "\"" + this.identity + "\"" : null;
    final StringBuilder buf = new StringBuilder();
    buf.append(this.getClass().getSimpleName());
    buf.append("[identity=").append(quotedIdentity);
    buf.append(",logDir=").append(this.logDir);
    buf.append(",term=").append(this.currentTerm);
    buf.append(",commitIndex=").append(this.commitIndex);
    buf.append(",lastApplied=").append(this.lastAppliedIndex).append("t").append(this.lastAppliedTerm);
    buf.append(",raftLog=").append(this.raftLog);
    buf.append(",role=").append(this.role);
    if (this.shuttingDown)
        buf.append(",shuttingDown");
    buf.append("]");
    return buf.toString();
}
// Network.Handler and Messaging
// Handle a raw message received from the network: decode it, write any mutation data it carries
// into a temporary file (done here, before receiveMessage() takes the lock, because it is slow),
// then dispatch via receiveMessage(). Undecodable or unpersistable messages are logged and dropped.
private void handle(String sender, ByteBuffer buf) {

    // Decode protocol version and message
    final int protocolVersion;
    final Message msg;
    try {
        protocolVersion = Message.decodeProtocolVersion(buf);
        msg = Message.decode(buf, protocolVersion);
    } catch (IllegalArgumentException e) {
        this.error("rec'd bogus message from " + sender + ", ignoring", e);
        return;
    }

    // Only append requests and commit requests can carry serialized mutation data
    final ByteBuffer mutationData;
    if (msg instanceof AppendRequest)
        mutationData = ((AppendRequest)msg).getMutationData();
    else if (msg instanceof CommitRequest)
        mutationData = ((CommitRequest)msg).getMutationData();
    else
        mutationData = null;

    // Persist mutation data (if any) into a temporary file and build the corresponding new log entry
    NewLogEntry pendingEntry = null;
    if (mutationData != null) {
        File tempFile = null;                           // non-null while we are responsible for deleting it
        try {

            // Copy the serialized mutations into the file, using a read-only view so the
            // original buffer's position is left alone for the deserialization step below
            tempFile = this.getTempFile();
            try (FileWriter output = new FileWriter(tempFile, this.disableSync)) {
                final FileChannel channel = output.getFileOutputStream().getChannel();
                final ByteBuffer writeBuf = mutationData.asReadOnlyBuffer();
                while (writeBuf.hasRemaining())
                    channel.write(writeBuf);
            }

            // Deserialize the mutations and pair them with the file
            try (ByteBufferInputStream input = new ByteBufferInputStream(mutationData)) {
                pendingEntry = new NewLogEntry(LogEntry.readData(input), tempFile);
            }

            // From here on the new log entry owns the file
            tempFile = null;
        } catch (IOException e) {
            this.error("error persisting mutations from " + msg + ", ignoring", e);
            return;
        } finally {
            if (tempFile != null)
                this.deleteFile(tempFile, "new log entry temp file");
        }
    }
    final NewLogEntry newLogEntry = pendingEntry;

    // Process the message, always giving the new log entry (if any) its chance to clean up afterward
    try {
        this.receiveMessage(sender, msg, protocolVersion, newLogEntry);
    } finally {
        if (newLogEntry != null)
            newLogEntry.cleanup(this);
    }
}
// Handle notification that the output queue for the given address is now empty: clear the
// address's "transmitting" status and, if it was set, let the current role (if any) know
private synchronized void outputQueueEmpty(String address) {

    // Sanity check
    assert this.checkState();

    // Nothing to do unless we thought we were transmitting to this address
    final boolean wasTransmitting = this.transmitting.remove(address);
    if (!wasTransmitting)
        return;
    if (this.log.isTraceEnabled())
        this.trace("QUEUE_EMPTY address " + address + " in " + this.role);

    // Pass the news along to the current role, if any
    if (this.role != null)
        this.role.outputQueueEmpty(address);
}
// Determine whether the given address is currently recorded as having output in progress
boolean isTransmitting(String address) {
    final boolean inProgress = this.transmitting.contains(address);
    return inProgress;
}
// Messages
// Encode and send a message to its recipient. Returns true if the message was handed to the network,
// false if the recipient's address is unknown, the message can't be encoded, or the local send failed.
synchronized boolean sendMessage(Message msg) {

    // Sanity check
    assert Thread.holdsLock(this);

    // Resolve the recipient's address, falling back to the return address
    // of the message currently being processed (if any)
    final String peer = msg.getRecipientId();
    final String configuredAddress = this.currentConfig.get(peer);
    final String address = configuredAddress != null ? configuredAddress : this.returnAddress;
    if (address == null) {
        this.warn("can't send " + msg + " to unknown peer \"" + peer + "\"");
        return false;
    }

    // Use the protocol version recorded for this peer, or else our current version
    final int protocolVersion = this.protocolVersionMap.getOrDefault(peer, Message.getCurrentProtocolVersion());

    // Encode message
    if (this.log.isTraceEnabled())
        this.trace("XMIT " + msg + " to " + address + " (version " + protocolVersion + ")");
    final ByteBuffer encodedMessage;
    try {
        encodedMessage = msg.encode(protocolVersion);
    } catch (IllegalArgumentException e) {              // can happen if peer running older code
        this.warn("can't send " + msg + " to peer \"" + peer + "\": " + e, e);
        return false;
    }

    // Hand the message to the network
    if (!this.network.send(address, encodedMessage)) {
        this.warn("transmit of " + msg + " to \"" + peer + "\" failed locally");
        return false;
    }

    // Remember that this address now has output in progress
    this.transmitting.add(address);
    return true;
}
/**
 * Process a decoded message received from a peer.
 *
 * <p>
 * The message is dropped (with a log message) if this instance has no current role, if its cluster ID is zero
 * or differs from our non-zero cluster ID, if it claims to be from ourselves, or if it is addressed to someone else.
 * Otherwise the sender's protocol version is recorded, our term is advanced (reverting to follower) if the
 * message's term is higher and the current role permits it, stale-term messages other than ping requests are
 * dropped, and finally the message is dispatched to the current role's handler for its type.
 *
 * @param address network address the message was received from; used as the return address while handling
 * @param msg the received message
 * @param protocolVersion protocol version the message was encoded with; -1 leaves the peer's recorded version unchanged
 * @param newLogEntry new log entry built from the message's mutation data, or null;
 *  asserted to be non-null only for {@link AppendRequest} and {@link CommitRequest} messages
 */
synchronized void receiveMessage(String address, Message msg, int protocolVersion, final NewLogEntry newLogEntry) {
// Sanity check newLogEntry
assert newLogEntry == null || (msg instanceof AppendRequest || msg instanceof CommitRequest);
// Sanity check
assert Thread.holdsLock(this);
assert this.checkState();
// A null role means we are not running; drop the message
// NOTE(review): the log message below contains "rec'd" twice - looks like a typo, confirm before changing
if (this.role == null) {
if (this.log.isDebugEnabled())
this.debug("rec'd " + msg + " rec'd in shutdown state; ignoring");
return;
}
// Sanity check cluster ID: must be non-zero, and must match ours if we have one
if (msg.getClusterId() == 0) {
this.warn("rec'd " + msg + " with zero cluster ID from " + address + "; ignoring");
return;
}
if (this.clusterId != 0 && msg.getClusterId() != this.clusterId) {
this.warn("rec'd " + msg + " with foreign cluster ID "
+ String.format("0x%08x", msg.getClusterId()) + " != " + String.format("0x%08x", this.clusterId) + "; ignoring");
return;
}
// Sanity check sender: we should never receive a message from ourselves
final String peer = msg.getSenderId();
if (peer.equals(this.identity)) {
this.warn("rec'd " + msg + " from myself (\"" + peer + "\", address " + address + "); ignoring");
return;
}
// Sanity check recipient: the message must be addressed to us
final String dest = msg.getRecipientId();
if (!dest.equals(this.identity)) {
this.warn("rec'd misdirected " + msg + " intended for \"" + dest + "\" from " + address + "; ignoring");
return;
}
// Update sender's protocol version (only logged when it actually changes)
if (protocolVersion != -1) {
final Integer previousVersion = this.protocolVersionMap.put(peer, protocolVersion);
if (!((Integer)protocolVersion).equals(previousVersion) && this.log.isDebugEnabled())
this.debug("set protocol encoding version for peer \"" + peer + "\" to " + protocolVersion);
}
// Is my term too low? If so update and revert to follower
if (msg.getTerm() > this.currentTerm) {
// First check with current role; in some special cases we ignore this
if (!this.role.mayAdvanceCurrentTerm(msg)) {
if (this.log.isTraceEnabled()) {
this.trace("rec'd " + msg + " with term " + msg.getTerm() + " > " + this.currentTerm + " from \""
+ peer + "\" but current role says to ignore it");
}
return;
}
// Revert to follower
if (this.log.isDebugEnabled()) {
this.debug("rec'd " + msg.getClass().getSimpleName() + " with term " + msg.getTerm() + " > "
+ this.currentTerm + " from \"" + peer + "\", updating term and "
+ (this.role instanceof FollowerRole ? "remaining a" : "reverting to") + " follower");
}
// If the new term can't be persisted, drop the message without changing role
if (!this.advanceTerm(msg.getTerm()))
return;
// A message from a leader tells the new follower role who (and where) the leader is
this.changeRole(msg.isLeaderMessage() ? new FollowerRole(this, peer, address) : new FollowerRole(this));
}
// Is sender's term too low? Ignore it (except ping request)
if (msg.getTerm() < this.currentTerm && !(msg instanceof PingRequest)) {
if (this.log.isDebugEnabled()) {
this.debug("rec'd " + msg + " with term " + msg.getTerm() + " < " + this.currentTerm
+ " from \"" + peer + "\" at " + address + ", ignoring");
}
return;
}
// Debug
if (this.log.isTraceEnabled())
this.trace("RECV " + msg + " in " + this.role + " from " + address + " (protocol version " + protocolVersion + ")");
// Handle message: dispatch to the current role, with the return address set for the duration
// so that sendMessage() can reply to a peer whose address is not in the current configuration
this.returnAddress = address;
try {
msg.visit(new MessageSwitch() {
@Override
public void caseAppendRequest(AppendRequest msg) {
RaftKVDatabase.this.role.caseAppendRequest(msg, newLogEntry);
}
@Override
public void caseAppendResponse(AppendResponse msg) {
RaftKVDatabase.this.role.caseAppendResponse(msg);
}
@Override
public void caseCommitRequest(CommitRequest msg) {
RaftKVDatabase.this.role.caseCommitRequest(msg, newLogEntry);
}
@Override
public void caseCommitResponse(CommitResponse msg) {
RaftKVDatabase.this.role.caseCommitResponse(msg);
}
@Override
public void caseGrantVote(GrantVote msg) {
RaftKVDatabase.this.role.caseGrantVote(msg);
}
@Override
public void caseInstallSnapshot(InstallSnapshot msg) {
RaftKVDatabase.this.role.caseInstallSnapshot(msg);
}
@Override
public void casePingRequest(PingRequest msg) {
RaftKVDatabase.this.role.casePingRequest(msg);
}
@Override
public void casePingResponse(PingResponse msg) {
RaftKVDatabase.this.role.casePingResponse(msg);
}
@Override
public void caseRequestVote(RequestVote msg) {
RaftKVDatabase.this.role.caseRequestVote(msg);
}
});
} finally {
// Always clear the return address, even if the handler threw
this.returnAddress = null;
}
}
// I/O Thread
// Delete the given file via the I/O thread; if there is no I/O thread
// (should never happen), fall back to deleting it directly
synchronized void deleteFile(File file, String description) {
    if (this.ioThread != null)
        this.ioThread.deleteFile(file, description);
    else
        Util.delete(file, description);
}
// Obtain a temporary file from the I/O thread; fails with IOException if there is no I/O thread
synchronized File getTempFile() throws IOException {
    final IOThread thread = this.ioThread;
    if (thread == null)
        throw new IOException("instance is shutdown");
    return thread.getTempFile();
}
/**
 * Background thread that performs slow file operations on behalf of other threads: it deletes files
 * handed to {@link #deleteFile deleteFile()} and keeps a small pool of pre-created temporary files
 * ready for {@link #getTempFile getTempFile()}.
 *
 * <p>
 * Both queues are bounded. If the deletion queue is full, or this thread has been shut down, files are
 * deleted directly by the calling thread; if the temporary file pool is empty, the calling thread
 * creates the temporary file itself.
 */
private static final class IOThread extends Thread {

    // Maximum time to pause before retrying after a failed attempt to create a temporary file
    private static final long MAX_WAIT_SECONDS = 1;
    private static final int MAX_TEMP_FILES = 10;
    private static final int MAX_DELETE_FILES = 1000;

    private final Logger log = LoggerFactory.getLogger(this.getClass());
    private final File tempDir;
    private final ArrayBlockingQueue<FileInfo> availableTempFiles = new ArrayBlockingQueue<>(MAX_TEMP_FILES);
    private final ArrayBlockingQueue<FileInfo> filesToDelete = new ArrayBlockingQueue<>(MAX_DELETE_FILES);

    private boolean shutdown;
    private boolean didWarnDelete;
    private boolean didWarnTempFile;

    private IOThread(File tempDir, String threadName) {
        super(threadName);
        Preconditions.checkArgument(tempDir != null);
        this.tempDir = tempDir;
    }

    /**
     * Ask this thread to stop. The thread deletes all queued and pooled files on its way out.
     */
    public synchronized void shutdown() {
        this.shutdown = true;
        this.notifyAll();
    }

    /**
     * Schedule a file for deletion by this thread.
     *
     * <p>
     * If this thread has been shut down, or the deletion queue is full, the file is deleted
     * immediately by the calling thread instead.
     *
     * @param file file to delete
     * @param description description of the file, passed along to {@code Util.delete()}
     */
    public synchronized void deleteFile(File file, String description) {
        assert file != null;
        assert description != null;

        // Once shut down, nothing drains the queue any more, so a queued file would never be deleted
        if (this.shutdown) {
            Util.delete(file, description);
            return;
        }

        // Queue the file, or delete it ourselves if the queue is full
        try {
            this.filesToDelete.add(new FileInfo(file, description));
        } catch (IllegalStateException e) {
            if (!this.didWarnDelete) {
                this.log.error("file deletion queue is full (suppressing further warnings)", e);
                this.didWarnDelete = true;
            }
            Util.delete(file, description);
            return;
        }
        this.notifyAll();
    }

    /**
     * Get a temporary file, taking one from the pool if available, otherwise creating it immediately.
     *
     * @return temporary file
     * @throws IOException if the pool is empty and creating a new temporary file fails
     */
    public synchronized File getTempFile() throws IOException {
        final FileInfo fileInfo;
        try {
            fileInfo = this.availableTempFiles.remove();
        } catch (NoSuchElementException e) {
            return File.createTempFile(TEMP_FILE_PREFIX, TEMP_FILE_SUFFIX, this.tempDir);
        }
        this.notifyAll();                               // wake up thread so it can replenish the pool
        return fileInfo.getFile();
    }

    @Override
    public void run() {
        try {
            while (true) {

                // Sleep until there's something to do
                synchronized (this) {

                    // Wait for a file to delete, room in the temporary file pool, or shutdown
                    while (!this.shutdown
                      && this.filesToDelete.isEmpty() && this.availableTempFiles.remainingCapacity() == 0) {
                        try {
                            this.wait();
                        } catch (InterruptedException e) {
                            this.log.warn(this + " interrupted, ignoring", e);
                        }
                    }

                    // Shutdown, if needed
                    if (this.shutdown)
                        break;
                }

                // Delete deletable files, if any
                if (!this.filesToDelete.isEmpty())
                    this.deleteFiles(this.filesToDelete, true);

                // Create a new temporary file, if needed
                if (this.availableTempFiles.remainingCapacity() > 0) {
                    try {
                        this.availableTempFiles.add(new FileInfo(
                          File.createTempFile(TEMP_FILE_PREFIX, TEMP_FILE_SUFFIX, this.tempDir), "ready temporary file"));
                    } catch (IOException e) {
                        if (!this.didWarnTempFile) {
                            this.log.error("error creating temporary file in "
                              + this.tempDir + " (suppressing further warnings)", e);
                            this.didWarnTempFile = true;
                        }

                        // The pool is still not full, so the wait loop above would not block;
                        // pause here so a persistent failure doesn't turn into a busy loop
                        this.pauseAfterFailure();
                    }
                }
            }
        } catch (ThreadDeath t) {
            throw t;
        } catch (Throwable t) {
            this.log.error("error in " + this + ", bailing out", t);
        } finally {
            this.cleanup();
        }
    }

    // Pause for up to MAX_WAIT_SECONDS, unless shut down or there are files waiting to be deleted;
    // shutdown(), deleteFile() and getTempFile() all notify, which ends the pause early
    private synchronized void pauseAfterFailure() {
        if (this.shutdown || !this.filesToDelete.isEmpty())
            return;
        try {
            this.wait(TimeUnit.SECONDS.toMillis(MAX_WAIT_SECONDS));
        } catch (InterruptedException e) {
            this.log.warn(this + " interrupted, ignoring", e);
        }
    }

    // Delete all pooled temporary files and all files still queued for deletion
    private void cleanup() {
        this.deleteFiles(this.availableTempFiles, false);
        this.deleteFiles(this.filesToDelete, true);
    }

    // Drain the queue, deleting each file; the file's description is only passed along if "warn" is true
    private void deleteFiles(ArrayBlockingQueue<FileInfo> queue, boolean warn) {
        while (true) {

            // Get next file
            final FileInfo fileInfo;
            try {
                fileInfo = queue.remove();
            } catch (NoSuchElementException e) {
                break;
            }

            // Delete file
            Util.delete(fileInfo.getFile(), warn ? fileInfo.getDescription() : null);
        }
    }
}
// Immutable pairing of a file with an optional human-readable description of it
private static class FileInfo {

    private final File file;
    private final String description;

    // Create an instance with no description
    FileInfo(File file) {
        this(file, null);
    }

    // Create an instance; the file is required, the description may be null
    FileInfo(File file, String description) {
        final boolean haveFile = file != null;
        Preconditions.checkArgument(haveFile);
        this.description = description;
        this.file = file;
    }

    // Get the file
    public File getFile() {
        return this.file;
    }

    // Get the description, or null if there is none
    public String getDescription() {
        return this.description;
    }
}
// Utility methods
// Encode a boolean as a single byte: 1 for true, 0 for false
byte[] encodeBoolean(boolean value) {
    final byte[] encoded = new byte[1];
    if (value)
        encoded[0] = (byte)1;
    return encoded;
}
// Read a boolean from the key/value store: true only if a non-empty value
// exists under the key and its first byte is non-zero
boolean decodeBoolean(byte[] key) throws IOException {
    final byte[] stored = this.kv.get(key);
    if (stored == null || stored.length == 0)
        return false;
    return stored[0] != 0;
}
// Read a LongEncoder-encoded long from the key/value store, returning the default if the key is absent;
// an undecodable value is reported as an IOException
long decodeLong(byte[] key, long defaultValue) throws IOException {
    final byte[] stored = this.kv.get(key);
    if (stored != null) {
        try {
            return LongEncoder.decode(stored);
        } catch (IllegalArgumentException e) {
            throw new IOException("can't interpret encoded long value "
              + ByteUtil.toString(stored) + " under key " + ByteUtil.toString(key), e);
        }
    }
    return defaultValue;
}
// Read a string (in DataOutputStream.writeUTF() format) from the key/value store, returning the
// default if the key is absent; an undecodable value is reported as an IOException
String decodeString(byte[] key, String defaultValue) throws IOException {
    final byte[] stored = this.kv.get(key);
    if (stored == null)
        return defaultValue;
    try {
        return new DataInputStream(new ByteArrayInputStream(stored)).readUTF();
    } catch (IOException e) {
        throw new IOException("can't interpret encoded string value "
          + ByteUtil.toString(stored) + " under key " + ByteUtil.toString(key), e);
    }
}
// Encode a string in DataOutputStream.writeUTF() format
byte[] encodeString(String value) {
    final ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    try {
        final DataOutputStream utfWriter = new DataOutputStream(bytes);
        utfWriter.writeUTF(value);
        utfWriter.flush();
    } catch (IOException e) {
        // Not expected when writing into memory
        throw new RuntimeException("unexpected error", e);
    }
    return bytes.toByteArray();
}
// Read a cluster configuration from the key/value store. The stored value is a sequence of string
// pairs, each string in DataOutputStream.writeUTF() format (see encodeConfig()); each pair becomes
// one map entry. Returns an empty map if the key is absent; an undecodable value is reported as
// an IOException.
Map<String, String> decodeConfig(byte[] key) throws IOException {
    final Map<String, String> config = new HashMap<>();
    final byte[] value = this.kv.get(key);
    if (value == null)
        return config;
    try {
        final DataInputStream data = new DataInputStream(new ByteArrayInputStream(value));
        while (true) {

            // Peek one byte ahead to detect the end of the data
            data.mark(1);
            if (data.read() == -1)
                break;
            data.reset();

            // Read the next pair
            config.put(data.readUTF(), data.readUTF());
        }
    } catch (IOException e) {
        throw new IOException("can't interpret encoded config "
          + ByteUtil.toString(value) + " under key " + ByteUtil.toString(key), e);
    }
    return config;
}
// Encode a cluster configuration as a sequence of (key, value) string pairs, in the map's
// iteration order, with each string written in DataOutputStream.writeUTF() format
byte[] encodeConfig(Map<String, String> config) {
    final ByteArrayOutputStream buf = new ByteArrayOutputStream();
    final DataOutputStream data = new DataOutputStream(buf);
    try {
        for (Map.Entry<String, String> entry : config.entrySet()) {
            data.writeUTF(entry.getKey());
            data.writeUTF(entry.getValue());
        }
        data.flush();
    } catch (IOException e) {
        // Not expected when writing into memory
        throw new RuntimeException("unexpected error", e);
    }
    return buf.toByteArray();
}
// Logging
// Log at TRACE level, prefixing a fresh timestamp and this node's identity; includes the given throwable
void trace(String msg, Throwable t) {
    final String line = String.format("%s %s: %s", new Timestamp(), this.identity, msg);
    this.log.trace(line, t);
}

// Log at TRACE level, prefixing a fresh timestamp and this node's identity
void trace(String msg) {
    final String line = String.format("%s %s: %s", new Timestamp(), this.identity, msg);
    this.log.trace(line);
}
// Log at DEBUG level, prefixing a fresh timestamp and this node's identity; includes the given throwable
void debug(String msg, Throwable t) {
    final String line = String.format("%s %s: %s", new Timestamp(), this.identity, msg);
    this.log.debug(line, t);
}

// Log at DEBUG level, prefixing a fresh timestamp and this node's identity
void debug(String msg) {
    final String line = String.format("%s %s: %s", new Timestamp(), this.identity, msg);
    this.log.debug(line);
}
// Log at INFO level, prefixing a fresh timestamp and this node's identity; includes the given throwable
void info(String msg, Throwable t) {
    final String line = String.format("%s %s: %s", new Timestamp(), this.identity, msg);
    this.log.info(line, t);
}

// Log at INFO level, prefixing a fresh timestamp and this node's identity
void info(String msg) {
    final String line = String.format("%s %s: %s", new Timestamp(), this.identity, msg);
    this.log.info(line);
}
// Log at WARN level, prefixing a fresh timestamp and this node's identity; includes the given throwable
void warn(String msg, Throwable t) {
    final String line = String.format("%s %s: %s", new Timestamp(), this.identity, msg);
    this.log.warn(line, t);
}

// Log at WARN level, prefixing a fresh timestamp and this node's identity
void warn(String msg) {
    final String line = String.format("%s %s: %s", new Timestamp(), this.identity, msg);
    this.log.warn(line);
}
// Log at ERROR level, prefixing a fresh timestamp and this node's identity; includes the given throwable
void error(String msg, Throwable t) {
    final String line = String.format("%s %s: %s", new Timestamp(), this.identity, msg);
    this.log.error(line, t);
}

// Log at ERROR level, prefixing a fresh timestamp and this node's identity
void error(String msg) {
    final String line = String.format("%s %s: %s", new Timestamp(), this.identity, msg);
    this.log.error(line);
}
// Debug/Sanity Checking
// Run all internal consistency checks. Always returns true, so this can be used as
// "assert this.checkState()"; a failed check is rethrown as an AssertionError that
// also describes this instance.
private boolean checkState() {
    try {
        this.doCheckState();
        return true;
    } catch (AssertionError failure) {
        throw new AssertionError("checkState() failure for " + this, failure);
    }
}
// Verify this instance's internal invariants using assert statements (so this does nothing unless
// assertions are enabled). A null role is treated as the stopped state, in which everything is expected
// to be reset; otherwise the running-state invariants, the role, and all open transactions are checked.
private void doCheckState() {
assert Thread.holdsLock(this);
// Handle stopped state: all runtime state should have been reset
if (this.role == null) {
assert this.random == null;
assert this.currentTerm == 0;
assert this.currentTermStartTime == 0;
assert this.commitIndex == 0;
assert this.lastAppliedTerm == 0;
assert this.lastAppliedIndex == 0;
assert this.lastAppliedConfig == null;
assert this.currentConfig == null;
assert this.clusterId == 0;
assert this.raftLog.isEmpty();
assert this.logDirChannel == null;
assert this.serviceExecutor == null;
assert this.keyWatchTracker == null;
assert this.transmitting.isEmpty();
assert this.openTransactions.isEmpty();
assert this.pendingService.isEmpty();
assert !this.shuttingDown;
return;
}
// Handle running state
assert this.kv != null;
assert this.random != null;
assert this.serviceExecutor != null;
// A missing log directory channel is tolerated only on Windows
assert this.logDirChannel != null || this.isWindows();
// The service executor may only be shut down while we are shutting down
assert !this.serviceExecutor.isShutdown() || this.shuttingDown;
assert this.currentTerm >= 0;
assert this.commitIndex >= 0;
assert this.lastAppliedTerm >= 0;
assert this.lastAppliedIndex >= 0;
assert this.lastAppliedConfig != null;
assert this.currentConfig != null;
assert this.currentTerm >= this.lastAppliedTerm;
// The commit index lies between the last applied index and the last log index, inclusive
assert this.commitIndex >= this.lastAppliedIndex;
assert this.commitIndex <= this.lastAppliedIndex + this.raftLog.size();
assert this.keyWatchIndex <= this.commitIndex;
// Unapplied log entries must have consecutive indexes, starting just after the last applied index,
// and terms that never decrease
long index = this.lastAppliedIndex;
long term = this.lastAppliedTerm;
for (LogEntry logEntry : this.raftLog) {
assert logEntry.getIndex() == index + 1;
assert logEntry.getTerm() >= term;
index = logEntry.getIndex();
term = logEntry.getTerm();
}
// Check configured vs. unconfigured
if (this.isConfigured()) {
assert this.clusterId != 0;
assert this.currentTerm > 0;
assert this.lastAppliedTerm >= 0;
assert this.lastAppliedIndex >= 0;
assert !this.currentConfig.isEmpty();
// The cached current config must match what would be rebuilt from scratch
assert this.currentConfig.equals(this.buildCurrentConfig());
assert this.getLastLogTerm() > 0;
assert this.getLastLogIndex() > 0;
} else {
assert this.lastAppliedTerm == 0;
assert this.lastAppliedIndex == 0;
assert this.lastAppliedConfig.isEmpty();
assert this.currentConfig.isEmpty();
assert this.raftLog.isEmpty();
}
// Check role
assert this.role.checkState();
// Check transactions; a failure is rethrown with the offending transaction in the message
for (RaftKVTransaction tx : this.openTransactions.values()) {
try {
assert !tx.getState().equals(TxState.CLOSED);
tx.checkStateOpen(this.currentTerm, this.getLastLogIndex(), this.commitIndex);
this.role.checkTransaction(tx);
} catch (AssertionError e) {
throw new AssertionError("checkState() failure for " + tx, e);
}
}
}
// Determine whether we are running on Windows, based on the "os.name" system property.
// A prefix match is used rather than a substring match, because a plain search for "win"
// would also match "Darwin".
private boolean isWindows() {
    return System.getProperty("os.name", "generic").toLowerCase(Locale.ENGLISH).startsWith("windows");
}
}