/*
* Copyright 2014 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package net.kuujo.copycat.raft;
import net.kuujo.copycat.CopycatException;
import net.kuujo.copycat.raft.protocol.*;
import net.kuujo.copycat.util.function.TriFunction;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.*;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
import com.google.common.collect.Ordering;
/**
* Leader state.
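 * <p>
 * On election the leader applies outstanding log entries, periodically heartbeats followers,
 * replicates and commits new log entries, and services client commit and query requests.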
*
* @author Jordan Halterman
*/
class LeaderState extends ActiveState {
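  // Maximum number of entry bytes sent in a single AppendRequest batch (512 KB).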
private static final int MAX_BATCH_SIZE = 1024 * 512;
  private ScheduledFuture<?> currentTimer;
private final Replicator replicator = new Replicator();
public LeaderState(RaftContext context) {
super(context);
}
@Override
public Type type() {
return Type.LEADER;
}
@Override
  public synchronized CompletableFuture<Void> open() {
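    // On taking leadership: apply any remaining unapplied log entries, trigger an initial
    // replication round, record this node as the cluster leader, and start the periodic
    // heartbeat timer.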
return super.open()
.thenRun(this::applyEntries)
.thenRun(replicator::commit)
.thenRun(this::takeLeadership)
.thenRun(this::startHeartbeatTimer);
}
/**
* Sets the current node as the cluster leader.
*/
private void takeLeadership() {
context.setLeader(context.getLocalMember());
}
/**
* Applies all unapplied entries to the log.
*/
private void applyEntries() {
Long lastIndex = context.log().lastIndex();
if (lastIndex != null) {
int count = 0;
for (long commitIndex = context.getCommitIndex() != null ? Long.valueOf(context.getCommitIndex() + 1) : context.log().firstIndex(); commitIndex <= lastIndex; commitIndex++) {
context.setCommitIndex(commitIndex);
applyEntry(commitIndex);
count++;
}
LOGGER.debug("{} - Applied {} entries to log", context.getLocalMember(), count);
}
}
/**
* Starts heartbeating all cluster members.
*/
private void startHeartbeatTimer() {
// Set a timer that will be used to periodically synchronize with other nodes
// in the cluster. This timer acts as a heartbeat to ensure this node remains
// the leader.
LOGGER.debug("{} - Setting heartbeat timer", context.getLocalMember());
currentTimer = context.executor().scheduleAtFixedRate(this::heartbeatMembers, 0, context.getHeartbeatInterval(), TimeUnit.MILLISECONDS);
}
/**
* Sends a heartbeat to all members of the cluster.
*/
private void heartbeatMembers() {
context.checkThread();
if (isOpen()) {
replicator.commit();
}
}
@Override
  public CompletableFuture<PollResponse> poll(final PollRequest request) {
return CompletableFuture.completedFuture(logResponse(PollResponse.builder()
.withUri(context.getLocalMember())
.withTerm(context.getTerm())
.withAccepted(false)
.build()));
}
@Override
  public CompletableFuture<VoteResponse> vote(final VoteRequest request) {
if (request.term() > context.getTerm()) {
LOGGER.debug("{} - Received greater term", context.getLocalMember());
transition(Type.FOLLOWER);
return super.vote(request);
} else {
return CompletableFuture.completedFuture(logResponse(VoteResponse.builder()
.withUri(context.getLocalMember())
.withTerm(context.getTerm())
.withVoted(false)
.build()));
}
}
@Override
  public CompletableFuture<AppendResponse> append(final AppendRequest request) {
context.checkThread();
if (request.term() > context.getTerm()) {
return super.append(request);
} else if (request.term() < context.getTerm()) {
return CompletableFuture.completedFuture(logResponse(AppendResponse.builder()
.withUri(context.getLocalMember())
.withTerm(context.getTerm())
.withSucceeded(false)
.withLogIndex(context.log().lastIndex())
.build()));
} else {
transition(Type.FOLLOWER);
return super.append(request);
}
}
@Override
  public CompletableFuture<QueryResponse> query(QueryRequest request) {
context.checkThread();
logRequest(request);
    CompletableFuture<QueryResponse> future = new CompletableFuture<>();
    TriFunction<Long, Long, ByteBuffer, ByteBuffer> consumer = context.consumer();
switch (request.consistency()) {
      // Consistency mode WEAK is evaluated immediately on this node and returned without
      // consulting other members.
case WEAK:
future.complete(logResponse(QueryResponse.builder()
.withUri(context.getLocalMember())
.withResult(consumer.apply(context.getTerm(), null, request.entry()))
.build()));
break;
      // For the DEFAULT consistency mode, Copycat ensures that no other member could have become
      // the leader by checking that the time since the last successful quorum commit is less than
      // the election timeout.
      case DEFAULT:
        // The election timeout is in milliseconds; convert to nanoseconds for comparison with nanoTime().
        if (replicator.replicas.isEmpty() || System.nanoTime() - replicator.commitTime() < context.getElectionTimeout() * 1000000L) {
future.complete(logResponse(QueryResponse.builder()
.withUri(context.getLocalMember())
.withResult(consumer.apply(context.getTerm(), null, request.entry()))
.build()));
break;
} // else fall through and handle as if Consistency mode is STRONG.
      // Consistency mode STRONG requires a synchronous consistency check prior to applying the query.
case STRONG:
LOGGER.debug("{} - Synchronizing logs to index {} for read", context.getLocalMember(), context.log().lastIndex());
long term = context.getTerm();
replicator.commit().whenComplete((index, error) -> {
context.checkThread();
if (isOpen()) {
if (error == null) {
try {
future.complete(logResponse(QueryResponse.builder()
.withUri(context.getLocalMember())
.withResult(consumer.apply(term, null, request.entry()))
.build()));
} catch (Exception e) {
future.complete(logResponse(QueryResponse.builder()
.withUri(context.getLocalMember())
.withStatus(Response.Status.ERROR)
.withError(e)
.build()));
}
} else {
future.complete(logResponse(QueryResponse.builder()
.withUri(context.getLocalMember())
.withStatus(Response.Status.ERROR)
.withError(error)
.build()));
}
}
});
break;
}
return future;
}
@Override
  public CompletableFuture<CommitResponse> commit(final CommitRequest request) {
context.checkThread();
logRequest(request);
    CompletableFuture<CommitResponse> future = new CompletableFuture<>();
    ByteBuffer entry = request.entry();
    TriFunction<Long, Long, ByteBuffer, ByteBuffer> consumer = context.consumer();
// Create a log entry containing the current term and entry.
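    // The stored layout is [ 8-byte term | client entry bytes ]; the term is read back via
    // prevEntry.getLong() when the replicator builds AppendRequests.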
ByteBuffer logEntry = ByteBuffer.allocate(entry.capacity() + 8);
long term = context.getTerm();
logEntry.putLong(term);
logEntry.put(entry);
entry.flip();
// Try to append the entry to the log. If appending the entry fails then just reply with an exception immediately.
final long index;
try {
index = context.log().appendEntry(logEntry);
context.log().flush();
} catch (IOException e) {
future.completeExceptionally(new CopycatException(e));
return future;
}
LOGGER.debug("{} - Appended entry to log at index {}", context.getLocalMember(), index);
LOGGER.debug("{} - Replicating logs up to index {} for write", context.getLocalMember(), index);
// Attempt to replicate the entry to a quorum of the cluster.
replicator.commit(index).whenComplete((resultIndex, error) -> {
context.checkThread();
if (isOpen()) {
if (error == null) {
try {
future.complete(logResponse(CommitResponse.builder()
.withUri(context.getLocalMember())
.withResult(consumer.apply(term, index, entry))
.build()));
} catch (Exception e) {
future.complete(logResponse(CommitResponse.builder()
.withUri(context.getLocalMember())
.withStatus(Response.Status.ERROR)
.withError(e)
.build()));
} finally {
context.setLastApplied(index);
}
} else {
future.complete(logResponse(CommitResponse.builder()
.withUri(context.getLocalMember())
.withStatus(Response.Status.ERROR)
.withError(error)
.build()));
}
}
});
return future;
}
/**
* Cancels the ping timer.
*/
private void cancelPingTimer() {
if (currentTimer != null) {
LOGGER.debug("{} - Cancelling ping timer", context.getLocalMember());
currentTimer.cancel(false);
}
}
@Override
  public synchronized CompletableFuture<Void> close() {
return super.close().thenRun(this::cancelPingTimer);
}
/**
* Log replicator.
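   * <p>
   * Tracks one {@link Replica} per remote member, pipelines AppendRequests to each, and
   * advances the commit index once a quorum of match indexes has caught up.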
*/
private class Replicator {
    private final List<Replica> replicas;
    private final int quorum;
    private final int quorumIndex;
    private final List<Long> commitTimes;
    private long commitTime;
    private CompletableFuture<Long> commitFuture;
    private CompletableFuture<Long> nextCommitFuture;
    private long commitFailures;
    private final TreeMap<Long, CompletableFuture<Long>> commitFutures = new TreeMap<>();
@SuppressWarnings("all")
private Replicator() {
replicas = new ArrayList<>(context.getActiveMembers().size() - 1);
commitTimes = new ArrayList<>(context.getActiveMembers().size() - 1);
int i = 0;
for (String member : context.getActiveMembers()) {
if (!member.equals(context.getLocalMember())) {
replicas.add(new Replica(i++, member));
commitTimes.add(System.nanoTime());
}
}
      // Quorum is floor(activeMembers.size / 2) remote replicas: this node is implicitly
      // counted toward the majority, so once that many remote members have acknowledged an
      // entry, a majority of the cluster has it.
      this.quorum = (int) Math.floor(context.getActiveMembers().size() / 2.0);
      this.quorumIndex = quorum - 1;
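      // For example, with 5 active members this node tracks 4 remote replicas:
      // quorum = floor(5 / 2) = 2 remote acknowledgements (a 3-of-5 majority including this
      // node) and quorumIndex = 1, i.e. the second entry of a list sorted in descending order.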
}
/**
* Triggers a commit.
*
* @return A completable future to be completed the next time entries are committed to a majority of the cluster.
*/
    private CompletableFuture<Long> commit() {
if (replicas.isEmpty()) {
return CompletableFuture.completedFuture(null);
}
if (commitFuture == null) {
commitFuture = new CompletableFuture<>();
commitTime = System.nanoTime();
replicas.forEach(Replica::commit);
return commitFuture;
} else if (nextCommitFuture == null) {
nextCommitFuture = new CompletableFuture<>();
return nextCommitFuture;
} else {
return nextCommitFuture;
}
}
/**
* Registers a commit handler for the given commit index.
*
* @param index The index for which to register the handler.
* @return A completable future to be completed once the given log index has been committed.
*/
    private CompletableFuture<Long> commit(long index) {
if (replicas.isEmpty()) {
return CompletableFuture.completedFuture(index);
}
return commitFutures.computeIfAbsent(index, i -> {
replicas.forEach(Replica::commit);
return new CompletableFuture<>();
});
}
/**
* Returns the last time a majority of the cluster was contacted.
*/
private long commitTime() {
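      // Sort commit times in descending order; the entry at quorumIndex is the most recent
      // time at which this node, together with a quorum of remote replicas, had been contacted.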
      List<Long> sortedCommitTimes = Ordering.natural().reverse().sortedCopy(commitTimes);
return sortedCommitTimes.get(quorumIndex);
}
/**
     * Records a commit success or failure for the given replica and completes the current
     * commit future once a quorum has responded.
*/
private void commitTime(int id, Throwable error) {
if (commitFuture == null) {
return;
}
boolean completed = false;
synchronized (this) {
if (error != null) {
if (replicas.get(id).commitStartTime == commitTime && quorum > replicas.size() - ++commitFailures) {
commitFuture.completeExceptionally(new CopycatException("Failed to reach quorum"));
completed = true;
}
} else {
commitTimes.set(id, System.nanoTime());
// Sort the list of commit times. Use the quorum index to get the last time the majority of the cluster
// was contacted. If the current commitFuture's time is less than the commit time then trigger the
// commit future and reset it to the next commit future.
if (commitFuture != null && this.commitTime < commitTime()) {
commitFuture.complete(null);
completed = true;
}
if (completed) {
commitFuture = nextCommitFuture;
nextCommitFuture = null;
commitFailures = 0;
}
}
if (completed && this.commitFuture != null) {
this.commitTime = System.nanoTime();
replicas.forEach(Replica::commit);
}
}
}
/**
* Checks whether any futures can be completed.
*/
private void commitEntries() {
context.checkThread();
// Sort the list of replicas, order by the last index that was replicated
// to the replica. This will allow us to determine the median index
// for all known replicated entries across all cluster members.
Collections.sort(replicas, (o1, o2) -> Long.compare(o2.matchIndex != null ? o2.matchIndex : 0L, o1.matchIndex != null ? o1.matchIndex : 0L));
      // Set the commit index as the median replicated index. Since the replicas list is sorted
      // in descending order of matchIndex, the replica at quorumIndex holds the highest index
      // known to be stored on a majority of the cluster (this node included). Once the commit
      // index has been set, all commit futures up to that index can be completed.
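      // For example, with 4 remote replicas whose match indexes sort to [10, 9, 7, 3] and
      // quorumIndex = 1, the commit index advances to 9: this node plus two replicas (3 of 5
      // members) have stored entry 9.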
Long commitIndex = replicas.get(quorumIndex).matchIndex;
if (commitIndex != null) {
context.setCommitIndex(commitIndex);
        SortedMap<Long, CompletableFuture<Long>> futures = commitFutures.headMap(commitIndex, true);
        for (Map.Entry<Long, CompletableFuture<Long>> entry : futures.entrySet()) {
entry.getValue().complete(entry.getKey());
}
futures.clear();
}
}
/**
* Remote replica.
*/
private class Replica {
      private final List<ByteBuffer> EMPTY_LIST = new ArrayList<>(0);
private final int id;
private final String member;
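      // nextIndex: index of the next log entry to send to this replica (null until known).
      // matchIndex: highest log index known to be replicated on this replica (null if unknown).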
private Long nextIndex;
private Long matchIndex;
private boolean committing;
private long commitStartTime;
private Replica(int id, String member) {
this.id = id;
this.member = member;
}
/**
* Triggers a commit for the replica.
*/
private void commit() {
if (!committing && isOpen()) {
commitStartTime = commitTime;
// If the log is empty then send an empty commit.
// If the next index hasn't yet been set then we send an empty commit first.
// If the next index is greater than the last index then send an empty commit.
if (context.log().isEmpty() || nextIndex == null || nextIndex > context.log().lastIndex()) {
emptyCommit();
} else {
entriesCommit();
}
}
}
/**
* Gets the previous index.
*/
private Long getPrevIndex() {
if (nextIndex == null) {
return context.log().isEmpty() ? null : context.log().lastIndex();
}
return nextIndex - 1 > 0 ? nextIndex - 1 : null;
}
/**
* Gets the previous entry.
*/
private ByteBuffer getPrevEntry(Long prevIndex) {
if (prevIndex != null && context.log().containsIndex(prevIndex)) {
return context.log().getEntry(prevIndex);
}
return null;
}
/**
* Gets a list of entries to send.
*/
      private List<ByteBuffer> getEntries(Long prevIndex) {
long index;
if (context.log().isEmpty()) {
return EMPTY_LIST;
} else if (prevIndex != null) {
index = prevIndex + 1;
} else {
index = context.log().firstIndex();
}
        List<ByteBuffer> entries = new ArrayList<>(1024);
int size = 0;
while (size < MAX_BATCH_SIZE && index <= context.log().lastIndex()) {
ByteBuffer entry = context.log().getEntry(index);
size += entry.limit();
entries.add(entry);
index++;
}
return entries;
}
/**
       * Performs an empty commit, i.e. a heartbeat AppendRequest carrying no entries.
*/
private void emptyCommit() {
Long prevIndex = getPrevIndex();
ByteBuffer prevEntry = getPrevEntry(prevIndex);
commit(prevIndex, prevEntry, EMPTY_LIST);
}
/**
* Performs a commit with entries.
*/
private void entriesCommit() {
Long prevIndex = getPrevIndex();
ByteBuffer prevEntry = getPrevEntry(prevIndex);
List entries = getEntries(prevIndex);
commit(prevIndex, prevEntry, entries);
}
/**
* Sends a commit message.
*/
      private void commit(Long prevIndex, ByteBuffer prevEntry, List<ByteBuffer> entries) {
AppendRequest request = AppendRequest.builder()
.withUri(member)
.withTerm(context.getTerm())
.withLeader(context.getLocalMember())
.withLogIndex(prevIndex)
.withLogTerm(prevEntry != null ? prevEntry.getLong() : null)
.withEntries(entries)
.withFirstIndex(prevIndex == null || context.log().firstIndex() == prevIndex + 1)
.withCommitIndex(context.getCommitIndex())
.build();
committing = true;
LOGGER.debug("{} - Sent {} to {}", context.getLocalMember(), request, member);
appendHandler.apply(request).whenCompleteAsync((response, error) -> {
committing = false;
context.checkThread();
if (isOpen()) {
if (error == null) {
LOGGER.debug("{} - Received {} from {}", context.getLocalMember(), response, member);
if (response.status() == Response.Status.OK) {
// Update the commit time for the replica. This will cause heartbeat futures to be triggered.
commitTime(id, null);
// If replication succeeded then trigger commit futures.
if (response.succeeded()) {
updateMatchIndex(response);
updateNextIndex();
// If entries were committed to the replica then check commit indexes.
if (!entries.isEmpty()) {
commitEntries();
}
// If there are more entries to send then attempt to send another commit.
if (hasMoreEntries()) {
commit();
}
} else if (response.term() > context.getTerm()) {
transition(Type.FOLLOWER);
} else {
resetMatchIndex(response);
resetNextIndex();
// If there are more entries to send then attempt to send another commit.
if (hasMoreEntries()) {
commit();
}
}
} else if (response.term() > context.getTerm()) {
LOGGER.debug("{} - Received higher term from {}", context.getLocalMember(), member);
transition(Type.FOLLOWER);
} else {
LOGGER.warn("{} - {}", context.getLocalMember(), response.error() != null ? response.error().getMessage() : "");
}
} else {
LOGGER.debug("{} - {}", context.getLocalMember(), error.getMessage());
commitTime(id, error);
}
}
}, context.executor());
}
/**
* Returns a boolean value indicating whether there are more entries to send.
*/
private boolean hasMoreEntries() {
return nextIndex != null && !context.log().isEmpty() && nextIndex < context.log().lastIndex();
}
/**
* Updates the match index when a response is received.
*/
private void updateMatchIndex(AppendResponse response) {
// If the replica returned a valid match index then update the existing match index. Because the
// replicator pipelines replication, we perform a MAX(matchIndex, logIndex) to get the true match index.
if (response.logIndex() != null) {
if (matchIndex != null) {
matchIndex = Math.max(matchIndex, response.logIndex());
} else {
matchIndex = response.logIndex();
}
}
}
/**
* Updates the next index when the match index is updated.
*/
private void updateNextIndex() {
// If the match index was set, update the next index to be greater than the match index if necessary.
// Note that because of pipelining append requests, the next index can potentially be much larger than
// the match index. We rely on the algorithm to reject invalid append requests.
if (matchIndex != null) {
if (nextIndex != null) {
nextIndex = Math.max(nextIndex, matchIndex + 1);
} else {
nextIndex = matchIndex + 1;
}
}
if (nextIndex != null && nextIndex < context.log().firstIndex()) {
LOGGER.info("Log does not contain nextIndex {} due to rollover. "
+ "Setting nextIndex for {} to log's firstIndex: {}", nextIndex, member, context.log().firstIndex());
nextIndex = context.log().firstIndex();
}
}
/**
* Resets the match index when a response fails.
*/
private void resetMatchIndex(AppendResponse response) {
if (matchIndex == null) {
matchIndex = response.logIndex();
} else if (response.logIndex() != null) {
matchIndex = Math.min(matchIndex - 1L, response.logIndex());
if (matchIndex == 0L) {
matchIndex = null;
}
} else if (response.logIndex() == null) {
matchIndex = null;
}
LOGGER.debug("{} - Reset match index for {} to {}", context.getLocalMember(), member, matchIndex);
}
/**
* Resets the next index when a response fails.
*/
private void resetNextIndex() {
if (matchIndex != null) {
nextIndex = matchIndex + 1;
} else {
nextIndex = context.log().firstIndex();
}
LOGGER.debug("{} - Reset next index for {} to {}", context.getLocalMember(), member, nextIndex);
}
}
}
}