/*
 * Copyright 2015 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.atomix.copycat.server.state;

import io.atomix.catalyst.transport.Address;
import io.atomix.catalyst.transport.Connection;
import io.atomix.catalyst.util.concurrent.ComposableFuture;
import io.atomix.catalyst.util.concurrent.Scheduled;
import io.atomix.copycat.client.Command;
import io.atomix.copycat.client.Query;
import io.atomix.copycat.client.error.InternalException;
import io.atomix.copycat.client.error.RaftError;
import io.atomix.copycat.client.error.RaftException;
import io.atomix.copycat.client.request.*;
import io.atomix.copycat.client.response.*;
import io.atomix.copycat.server.CopycatServer;
import io.atomix.copycat.server.request.*;
import io.atomix.copycat.server.response.*;
import io.atomix.copycat.server.storage.entry.*;

import java.time.Duration;
import java.util.*;
import java.util.concurrent.CompletableFuture;

/**
 * Leader state.
 *
 * @author Jordan Halterman
 */
final class LeaderState extends ActiveState {
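  // Maximum number of bytes of serialized entries packed into a single AppendRequest batch (28 KiB).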
  private static final int MAX_BATCH_SIZE = 1024 * 28;
  private Scheduled currentTimer;
  private final Replicator replicator = new Replicator();
  private long leaderIndex;
  private long configuring;

  public LeaderState(ServerState context) {
    super(context);
  }

  @Override
  public CopycatServer.State type() {
    return CopycatServer.State.LEADER;
  }

  @Override
  public synchronized CompletableFuture<AbstractState> open() {
    // Schedule the initial entries commit to occur after the state is opened. Attempting any communication
    // within the open() method will result in a deadlock since RaftProtocol calls this method synchronously.
    // What is critical about this logic is that the heartbeat timer not be started until a no-op entry has been committed.
    context.getThreadContext().execute(this::commitEntries).whenComplete((result, error) -> {
      if (isOpen() && error == null) {
        startHeartbeatTimer();
      }
    });

    return super.open()
      .thenRun(this::takeLeadership)
      .thenApply(v -> this);
  }

  /**
   * Sets the current node as the cluster leader.
   */
  private void takeLeadership() {
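    // Members are identified cluster-wide by the hash of their address; setLeader(0) elsewhere
    // in this class clears the known leader.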
    context.setLeader(context.getAddress().hashCode());
  }

  /**
   * Commits a no-op entry to the log, ensuring any entries from a previous term are committed.
   */
  private CompletableFuture<Void> commitEntries() {
    final long term = context.getTerm();
    final long index;
    try (NoOpEntry entry = context.getLog().create(NoOpEntry.class)) {
      entry.setTerm(term)
        .setTimestamp(System.currentTimeMillis());
      index = context.getLog().append(entry);
    }

    // Store the index at which the leader took command.
    leaderIndex = index;

    CompletableFuture<Void> future = new CompletableFuture<>();
    replicator.commit(index).whenComplete((resultIndex, error) -> {
      context.checkThread();
      if (isOpen()) {
        if (error == null) {
          applyEntries(resultIndex);
          future.complete(null);
        } else {
          transition(CopycatServer.State.FOLLOWER);
        }
      }
    });
    return future;
  }

  /**
   * Applies all unapplied entries to the log.
   */
  private void applyEntries(long index) {
    if (!context.getLog().isEmpty()) {
      int count = 0;
      for (long lastApplied = Math.max(context.getLastApplied(), context.getLog().firstIndex()); lastApplied <= index; lastApplied++) {
        Entry entry = context.getLog().get(lastApplied);
        if (entry != null) {
          context.getStateMachine().apply(entry).whenComplete((result, error) -> {
            if (isOpen() && error != null) {
              LOGGER.info("{} - An application error occurred: {}", context.getAddress(), error.getMessage());
            }
            entry.release();
          });
        }
        count++;
      }

      LOGGER.debug("{} - Applied {} entries to log", context.getAddress(), count);
    }
  }

  /**
   * Starts heartbeating all cluster members.
   */
  private void startHeartbeatTimer() {
    // Set a timer that will be used to periodically synchronize with other nodes
    // in the cluster. This timer acts as a heartbeat to ensure this node remains
    // the leader.
    LOGGER.debug("{} - Starting heartbeat timer", context.getAddress());
    currentTimer = context.getThreadContext().schedule(Duration.ZERO, context.getHeartbeatInterval(), this::heartbeatMembers);
  }

  /**
   * Sends a heartbeat to all members of the cluster.
   */
  private void heartbeatMembers() {
    context.checkThread();
    if (isOpen()) {
      replicator.commit();
    }
  }

  /**
   * Checks for expired sessions.
   */
  private void checkSessions() {
    long term = context.getTerm();
    for (ServerSession session : context.getStateMachine().executor().context().sessions().sessions.values()) {
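      // A session is marked suspect when its keep-alives have stopped arriving; expire it by
      // replicating an UnregisterEntry on the client's behalf.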
      if (!session.isUnregistering() && session.isSuspect()) {
        LOGGER.debug("{} - Detected expired session: {}", context.getAddress(), session.id());

        final long index;
        try (UnregisterEntry entry = context.getLog().create(UnregisterEntry.class)) {
          entry.setTerm(term)
            .setSession(session.id())
            .setTimestamp(System.currentTimeMillis());
          index = context.getLog().append(entry);
          LOGGER.debug("{} - Appended {} to log at index {}", context.getAddress(), entry, index);
        }

        replicator.commit(index).whenComplete((result, error) -> {
          if (isOpen()) {
            UnregisterEntry entry = context.getLog().get(index);
            applyEntry(entry);
          }
        });
        session.unregister();
      }
    }
  }

  // Only one configuration change may be in flight at a time: `configuring` holds the log
  // index of the pending ConfigurationEntry and gates join/leave requests until it commits.
  @Override
  public CompletableFuture<JoinResponse> join(final JoinRequest request) {
    context.checkThread();
    logRequest(request);

    // If another configuration change is already under way, reject the configuration.
    if (configuring > 0) {
      return CompletableFuture.completedFuture(logResponse(JoinResponse.builder()
        .withStatus(Response.Status.ERROR)
        .build()));
    }

    // If the leader index is 0 or is greater than the commitIndex, reject the join request.
    // Configuration changes should not be allowed until the leader has committed a no-op entry.
    // See https://groups.google.com/forum/#!topic/raft-dev/t4xj6dJTP6E
    if (leaderIndex == 0 || context.getCommitIndex() < leaderIndex) {
      return CompletableFuture.completedFuture(logResponse(JoinResponse.builder()
        .withStatus(Response.Status.ERROR)
        .build()));
    }

    // If the member is already a known member of the cluster, complete the join successfully.
    if (context.getCluster().getMember(request.member().hashCode()) != null) {
      return CompletableFuture.completedFuture(logResponse(JoinResponse.builder()
        .withStatus(Response.Status.OK)
        .withVersion(context.getCluster().getVersion())
        .withActiveMembers(context.getCluster().buildActiveMembers())
        .withPassiveMembers(context.getCluster().buildPassiveMembers())
        .build()));
    }

    final long term = context.getTerm();
    final long index;

    Collection<Address> activeMembers = context.getCluster().buildActiveMembers();

    Collection<Address> passiveMembers = context.getCluster().buildPassiveMembers();
    passiveMembers.add(request.member());

    try (ConfigurationEntry entry = context.getLog().create(ConfigurationEntry.class)) {
      entry.setTerm(term)
        .setActive(activeMembers)
        .setPassive(passiveMembers);
      index = context.getLog().append(entry);
      LOGGER.debug("{} - Appended {} to log at index {}", context.getAddress(), entry, index);

      configuring = index;
      context.getCluster().configure(entry.getIndex(), entry.getActive(), entry.getPassive());
    }

    CompletableFuture<JoinResponse> future = new CompletableFuture<>();
    replicator.commit(index).whenComplete((commitIndex, commitError) -> {
      context.checkThread();
      if (isOpen()) {
        configuring = 0;
        if (commitError == null) {
          future.complete(logResponse(JoinResponse.builder()
            .withStatus(Response.Status.OK)
            .withVersion(index)
            .withActiveMembers(activeMembers)
            .withPassiveMembers(passiveMembers)
            .build()));
        } else {
          future.complete(logResponse(JoinResponse.builder()
            .withStatus(Response.Status.ERROR)
            .withError(RaftError.Type.INTERNAL_ERROR)
            .build()));
        }
      }
    });
    return future;
  }

  @Override
  public CompletableFuture<LeaveResponse> leave(final LeaveRequest request) {
    context.checkThread();
    logRequest(request);

    // If another configuration change is already under way, reject the configuration.
    if (configuring > 0) {
      return CompletableFuture.completedFuture(logResponse(LeaveResponse.builder()
        .withStatus(Response.Status.ERROR)
        .build()));
    }

    // If the leader index is 0 or is greater than the commitIndex, reject the leave request.
    // Configuration changes should not be allowed until the leader has committed a no-op entry.
    // See https://groups.google.com/forum/#!topic/raft-dev/t4xj6dJTP6E
    if (leaderIndex == 0 || context.getCommitIndex() < leaderIndex) {
      return CompletableFuture.completedFuture(logResponse(LeaveResponse.builder()
        .withStatus(Response.Status.ERROR)
        .build()));
    }

    // If the leaving member is not a known member of the cluster, complete the leave successfully.
    if (context.getMember(request.member().hashCode()) == null) {
      return CompletableFuture.completedFuture(logResponse(LeaveResponse.builder()
        .withStatus(Response.Status.OK)
        .withActiveMembers(context.getCluster().buildActiveMembers())
        .withPassiveMembers(context.getCluster().buildPassiveMembers())
        .build()));
    }

    final long term = context.getTerm();
    final long index;

    Collection<Address> activeMembers = context.getCluster().buildActiveMembers();
    activeMembers.remove(request.member());

    Collection<Address> passiveMembers = context.getCluster().buildPassiveMembers();
    passiveMembers.remove(request.member());

    try (ConfigurationEntry entry = context.getLog().create(ConfigurationEntry.class)) {
      entry.setTerm(term)
        .setActive(activeMembers)
        .setPassive(passiveMembers);
      index = context.getLog().append(entry);
      LOGGER.debug("{} - Appended {} to log at index {}", context.getAddress(), entry, index);

      configuring = index;
      context.getCluster().configure(entry.getIndex(), entry.getActive(), entry.getPassive());
    }

    CompletableFuture<LeaveResponse> future = new CompletableFuture<>();
    replicator.commit(index).whenComplete((commitIndex, commitError) -> {
      context.checkThread();
      if (isOpen()) {
        configuring = 0;
        if (commitError == null) {
          future.complete(logResponse(LeaveResponse.builder()
            .withStatus(Response.Status.OK)
            .withVersion(index)
            .withActiveMembers(activeMembers)
            .withPassiveMembers(passiveMembers)
            .build()));
        } else {
          future.complete(logResponse(LeaveResponse.builder()
            .withStatus(Response.Status.ERROR)
            .withError(RaftError.Type.INTERNAL_ERROR)
            .build()));
        }
      }
    });
    return future;
  }
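
  // While this node is leader, poll requests are rejected outright, a vote is granted only
  // after observing a higher term (which first forces a step-down), and an append for the
  // current term causes an immediate transition back to FOLLOWER, since two leaders cannot
  // share a term.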
  @Override
  public CompletableFuture<PollResponse> poll(final PollRequest request) {
    return CompletableFuture.completedFuture(logResponse(PollResponse.builder()
      .withStatus(Response.Status.OK)
      .withTerm(context.getTerm())
      .withAccepted(false)
      .build()));
  }

  @Override
  public CompletableFuture<VoteResponse> vote(final VoteRequest request) {
    if (request.term() > context.getTerm()) {
      LOGGER.debug("{} - Received greater term", context.getAddress());
      context.setLeader(0);
      transition(CopycatServer.State.FOLLOWER);
      return super.vote(request);
    } else {
      return CompletableFuture.completedFuture(logResponse(VoteResponse.builder()
        .withStatus(Response.Status.OK)
        .withTerm(context.getTerm())
        .withVoted(false)
        .build()));
    }
  }

  @Override
  public CompletableFuture<AppendResponse> append(final AppendRequest request) {
    context.checkThread();
    if (request.term() > context.getTerm()) {
      return super.append(request);
    } else if (request.term() < context.getTerm()) {
      return CompletableFuture.completedFuture(logResponse(AppendResponse.builder()
        .withStatus(Response.Status.OK)
        .withTerm(context.getTerm())
        .withSucceeded(false)
        .withLogIndex(context.getLog().lastIndex())
        .build()));
    } else {
      context.setLeader(request.leader());
      transition(CopycatServer.State.FOLLOWER);
      return super.append(request);
    }
  }
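
  // Commands are appended to the log and applied to the state machine only after the
  // replicator reports them committed on a quorum; the entry's log index is returned to the
  // client as the response version.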
  @Override
  protected CompletableFuture<CommandResponse> command(final CommandRequest request) {
    context.checkThread();
    logRequest(request);

    // Get the client's server session. If the session doesn't exist, return an unknown session error.
    ServerSession session = context.getStateMachine().executor().context().sessions().getSession(request.session());
    if (session == null) {
      return CompletableFuture.completedFuture(logResponse(CommandResponse.builder()
        .withStatus(Response.Status.ERROR)
        .withError(RaftError.Type.UNKNOWN_SESSION_ERROR)
        .build()));
    }

    ComposableFuture<CommandResponse> future = new ComposableFuture<>();

    Command command = request.command();

    // If the command is LINEARIZABLE and the session's current sequence number is less than one prior to the request
    // sequence number, queue this request for handling later. We want to handle command requests in the order in which
    // they were sent by the client. Note that it's possible for the session sequence number to be greater than the request
    // sequence number. In that case, it's likely that the command was submitted more than once to the
    // cluster, and the command will be deduplicated once applied to the state machine.
    if (request.sequence() > session.nextRequest()) {
      session.registerRequest(request.sequence(), () -> command(request).whenComplete(future));
      return future;
    }

    final long term = context.getTerm();
    final long timestamp = System.currentTimeMillis();
    final long index;

    // Create a CommandEntry and append it to the log.
    try (CommandEntry entry = context.getLog().create(CommandEntry.class)) {
      entry.setTerm(term)
        .setSession(request.session())
        .setTimestamp(timestamp)
        .setSequence(request.sequence())
        .setCommand(command);
      index = context.getLog().append(entry);
      LOGGER.debug("{} - Appended entry to log at index {}", context.getAddress(), index);
    }

    replicator.commit(index).whenComplete((commitIndex, commitError) -> {
      context.checkThread();
      if (isOpen()) {
        if (commitError == null) {
          CommandEntry entry = context.getLog().get(index);

          LOGGER.debug("{} - Applying {}", context.getAddress(), entry);
          context.getStateMachine().apply(entry, true).whenComplete((result, error) -> {
            if (isOpen()) {
              if (error == null) {
                future.complete(logResponse(CommandResponse.builder()
                  .withStatus(Response.Status.OK)
                  .withVersion(entry.getIndex())
                  .withResult(result)
                  .build()));
              } else if (error instanceof RaftException) {
                future.complete(logResponse(CommandResponse.builder()
                  .withStatus(Response.Status.ERROR)
                  .withVersion(entry.getIndex())
                  .withError(((RaftException) error).getType())
                  .build()));
              } else {
                future.complete(logResponse(CommandResponse.builder()
                  .withStatus(Response.Status.ERROR)
                  .withVersion(entry.getIndex())
                  .withError(RaftError.Type.INTERNAL_ERROR)
                  .build()));
              }
              checkSessions();
            }
            entry.release();
          });
        } else {
          future.complete(logResponse(CommandResponse.builder()
            .withStatus(Response.Status.ERROR)
            .withError(RaftError.Type.INTERNAL_ERROR)
            .build()));
        }
      }
    });

    // Set the last processed request for the session. This will cause sequential command callbacks to be executed.
    session.setRequest(request.sequence());

    return future;
  }

  @Override
  protected CompletableFuture<QueryResponse> query(final QueryRequest request) {
    Query query = request.query();

    final long timestamp = System.currentTimeMillis();
    final long index = context.getCommitIndex();

    context.checkThread();
    logRequest(request);

    QueryEntry entry = context.getLog().create(QueryEntry.class)
      .setIndex(index)
      .setTerm(context.getTerm())
      .setTimestamp(timestamp)
      .setSession(request.session())
      .setSequence(request.sequence())
      .setVersion(request.version())
      .setQuery(query);

    Query.ConsistencyLevel consistency = query.consistency();
    if (consistency == null)
      return submitQueryLinearizable(entry);

    switch (consistency) {
      case CAUSAL:
      case SEQUENTIAL:
        return submitQueryLocal(entry);
      case BOUNDED_LINEARIZABLE:
        return submitQueryBoundedLinearizable(entry);
      case LINEARIZABLE:
        return submitQueryLinearizable(entry);
      default:
        throw new IllegalStateException("unknown consistency level");
    }
  }

  /**
   * Submits a query with serializable consistency.
   */
  private CompletableFuture<QueryResponse> submitQueryLocal(QueryEntry entry) {
    return applyQuery(entry, new CompletableFuture<>());
  }
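
  // Lease-based read optimization: if a quorum was contacted within the last election timeout,
  // no other member could have been elected leader in the interim, so a BOUNDED_LINEARIZABLE
  // query may be evaluated locally; otherwise it falls back to the fully linearizable path.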
  /**
   * Submits a query with lease bounded linearizable consistency.
   */
  private CompletableFuture<QueryResponse> submitQueryBoundedLinearizable(QueryEntry entry) {
    long commitTime = replicator.commitTime();
    if (System.currentTimeMillis() - commitTime < context.getElectionTimeout().toMillis()) {
      return submitQueryLocal(entry);
    } else {
      return submitQueryLinearizable(entry);
    }
  }

  /**
   * Submits a query with strict linearizable consistency.
   */
  private CompletableFuture<QueryResponse> submitQueryLinearizable(QueryEntry entry) {
    CompletableFuture<QueryResponse> future = new CompletableFuture<>();
    replicator.commit().whenComplete((commitIndex, commitError) -> {
      context.checkThread();
      if (isOpen()) {
        if (commitError == null) {
          entry.acquire();
          applyQuery(entry, future);
        } else {
          future.complete(logResponse(QueryResponse.builder()
            .withStatus(Response.Status.ERROR)
            .withError(RaftError.Type.COMMAND_ERROR)
            .build()));
        }
      }
      entry.release();
    });
    return future;
  }

  /**
   * Applies a query to the state machine.
   */
  private CompletableFuture<QueryResponse> applyQuery(QueryEntry entry, CompletableFuture<QueryResponse> future) {
    // In the case of the leader, the state machine is always up to date, so no queries will be queued and all query
    // versions will be the last applied index.
    final long version = context.getStateMachine().getLastApplied();
    applyEntry(entry).whenComplete((result, error) -> {
      if (isOpen()) {
        if (error == null) {
          future.complete(logResponse(QueryResponse.builder()
            .withStatus(Response.Status.OK)
            .withVersion(version)
            .withResult(result)
            .build()));
        } else if (error instanceof RaftException) {
          future.complete(logResponse(QueryResponse.builder()
            .withStatus(Response.Status.ERROR)
            .withError(((RaftException) error).getType())
            .build()));
        } else {
          future.complete(logResponse(QueryResponse.builder()
            .withStatus(Response.Status.ERROR)
            .withError(RaftError.Type.INTERNAL_ERROR)
            .build()));
        }
        checkSessions();
      }
      entry.release();
    });
    return future;
  }
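
  // Session lifecycle operations (register, keep-alive, unregister) are themselves replicated
  // as log entries before being applied, so every server derives identical session state from
  // the same sequence of entries.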
  @Override
  protected CompletableFuture<RegisterResponse> register(RegisterRequest request) {
    final long timestamp = System.currentTimeMillis();
    final long index;
    final long timeout = context.getSessionTimeout().toMillis();

    context.checkThread();
    logRequest(request);

    try (RegisterEntry entry = context.getLog().create(RegisterEntry.class)) {
      entry.setTerm(context.getTerm())
        .setTimestamp(timestamp)
        .setClient(request.client())
        .setTimeout(timeout);
      index = context.getLog().append(entry);
      LOGGER.debug("{} - Appended {}", context.getAddress(), entry);
    }

    CompletableFuture<RegisterResponse> future = new CompletableFuture<>();
    replicator.commit(index).whenComplete((commitIndex, commitError) -> {
      context.checkThread();
      if (isOpen()) {
        if (commitError == null) {
          RegisterEntry entry = context.getLog().get(index);
          applyEntry(entry).whenComplete((sessionId, sessionError) -> {
            if (isOpen()) {
              if (sessionError == null) {
                future.complete(logResponse(RegisterResponse.builder()
                  .withStatus(Response.Status.OK)
                  .withSession((Long) sessionId)
                  .withTimeout(timeout)
                  .withMembers(context.getCluster().buildActiveMembers())
                  .build()));
              } else if (sessionError instanceof RaftException) {
                future.complete(logResponse(RegisterResponse.builder()
                  .withStatus(Response.Status.ERROR)
                  .withError(((RaftException) sessionError).getType())
                  .build()));
              } else {
                future.complete(logResponse(RegisterResponse.builder()
                  .withStatus(Response.Status.ERROR)
                  .withError(RaftError.Type.INTERNAL_ERROR)
                  .build()));
              }
              checkSessions();
            }
            entry.release();
          });
        } else {
          future.complete(logResponse(RegisterResponse.builder()
            .withStatus(Response.Status.ERROR)
            .withError(RaftError.Type.INTERNAL_ERROR)
            .build()));
        }
      }
    });
    return future;
  }

  @Override
  protected CompletableFuture<ConnectResponse> connect(ConnectRequest request, Connection connection) {
    context.checkThread();
    logRequest(request);

    context.getStateMachine().executor().context().sessions().registerConnection(request.session(), connection);

    AcceptRequest acceptRequest = AcceptRequest.builder()
      .withSession(request.session())
      .withAddress(context.getAddress())
      .build();
    return accept(acceptRequest)
      .thenApply(acceptResponse -> ConnectResponse.builder().withStatus(Response.Status.OK).build())
      .thenApply(this::logResponse);
  }
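
  // accept() records, via a replicated ConnectEntry, which server address a session is
  // connected to, and registers that address with the local session manager.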
  @Override
  protected CompletableFuture<AcceptResponse> accept(AcceptRequest request) {
    final long timestamp = System.currentTimeMillis();
    final long index;

    context.checkThread();
    logRequest(request);

    try (ConnectEntry entry = context.getLog().create(ConnectEntry.class)) {
      entry.setTerm(context.getTerm())
        .setSession(request.session())
        .setTimestamp(timestamp)
        .setAddress(request.address());
      index = context.getLog().append(entry);
      LOGGER.debug("{} - Appended {}", context.getAddress(), entry);
    }

    context.getStateMachine().executor().context().sessions().registerAddress(request.session(), request.address());

    CompletableFuture<AcceptResponse> future = new CompletableFuture<>();
    replicator.commit(index).whenComplete((commitIndex, commitError) -> {
      context.checkThread();
      if (isOpen()) {
        if (commitError == null) {
          ConnectEntry entry = context.getLog().get(index);
          applyEntry(entry).whenComplete((connectResult, connectError) -> {
            if (isOpen()) {
              if (connectError == null) {
                future.complete(logResponse(AcceptResponse.builder()
                  .withStatus(Response.Status.OK)
                  .build()));
              } else if (connectError instanceof RaftException) {
                future.complete(logResponse(AcceptResponse.builder()
                  .withStatus(Response.Status.ERROR)
                  .withError(((RaftException) connectError).getType())
                  .build()));
              } else {
                future.complete(logResponse(AcceptResponse.builder()
                  .withStatus(Response.Status.ERROR)
                  .withError(RaftError.Type.INTERNAL_ERROR)
                  .build()));
              }
              checkSessions();
            }
            entry.release();
          });
        } else {
          future.complete(logResponse(AcceptResponse.builder()
            .withStatus(Response.Status.ERROR)
            .withError(RaftError.Type.INTERNAL_ERROR)
            .build()));
        }
      }
    });
    return future;
  }

  @Override
  protected CompletableFuture<KeepAliveResponse> keepAlive(KeepAliveRequest request) {
    final long timestamp = System.currentTimeMillis();
    final long index;

    context.checkThread();
    logRequest(request);

    try (KeepAliveEntry entry = context.getLog().create(KeepAliveEntry.class)) {
      entry.setTerm(context.getTerm())
        .setSession(request.session())
        .setCommandSequence(request.commandSequence())
        .setEventVersion(request.eventVersion())
        .setTimestamp(timestamp);
      index = context.getLog().append(entry);
      LOGGER.debug("{} - Appended {}", context.getAddress(), entry);
    }

    CompletableFuture<KeepAliveResponse> future = new CompletableFuture<>();
    replicator.commit(index).whenComplete((commitIndex, commitError) -> {
      context.checkThread();
      if (isOpen()) {
        if (commitError == null) {
          KeepAliveEntry entry = context.getLog().get(index);
          applyEntry(entry).whenCompleteAsync((sessionResult, sessionError) -> {
            if (isOpen()) {
              if (sessionError == null) {
                future.complete(logResponse(KeepAliveResponse.builder()
                  .withStatus(Response.Status.OK)
                  .withMembers(context.getCluster().buildActiveMembers())
                  .build()));
              } else if (sessionError instanceof RaftException) {
                future.complete(logResponse(KeepAliveResponse.builder()
                  .withStatus(Response.Status.ERROR)
                  .withError(((RaftException) sessionError).getType())
                  .build()));
              } else {
                future.complete(logResponse(KeepAliveResponse.builder()
                  .withStatus(Response.Status.ERROR)
                  .withError(RaftError.Type.INTERNAL_ERROR)
                  .build()));
              }
              checkSessions();
            }
            entry.release();
          }, context.getThreadContext().executor());
        } else {
          future.complete(logResponse(KeepAliveResponse.builder()
            .withStatus(Response.Status.ERROR)
            .withError(RaftError.Type.INTERNAL_ERROR)
            .build()));
        }
      }
    });
    return future;
  }

  @Override
  protected CompletableFuture<UnregisterResponse> unregister(UnregisterRequest request) {
    final long timestamp = System.currentTimeMillis();
    final long index;

    context.checkThread();
    logRequest(request);

    try (UnregisterEntry entry = context.getLog().create(UnregisterEntry.class)) {
      entry.setTerm(context.getTerm())
        .setSession(request.session())
        .setTimestamp(timestamp);
      index = context.getLog().append(entry);
      LOGGER.debug("{} - Appended {}", context.getAddress(), entry);
    }

    CompletableFuture<UnregisterResponse> future = new CompletableFuture<>();
    replicator.commit(index).whenComplete((commitIndex, commitError) -> {
      context.checkThread();
      if (isOpen()) {
        if (commitError == null) {
          UnregisterEntry entry = context.getLog().get(index);
          applyEntry(entry).whenCompleteAsync((sessionResult, sessionError) -> {
            if (isOpen()) {
              if (sessionError == null) {
                future.complete(logResponse(UnregisterResponse.builder()
                  .withStatus(Response.Status.OK)
                  .build()));
              } else if (sessionError instanceof RaftException) {
                future.complete(logResponse(UnregisterResponse.builder()
                  .withStatus(Response.Status.ERROR)
                  .withError(((RaftException) sessionError).getType())
                  .build()));
              } else {
                future.complete(logResponse(UnregisterResponse.builder()
                  .withStatus(Response.Status.ERROR)
                  .withError(RaftError.Type.INTERNAL_ERROR)
                  .build()));
              }
              checkSessions();
            }
            entry.release();
          }, context.getThreadContext().executor());
        } else {
          future.complete(logResponse(UnregisterResponse.builder()
            .withStatus(Response.Status.ERROR)
            .withError(RaftError.Type.INTERNAL_ERROR)
            .build()));
        }
      }
    });
    return future;
  }

  /**
   * Cancels the ping timer.
   */
  private void cancelPingTimer() {
    if (currentTimer != null) {
      LOGGER.debug("{} - Cancelling heartbeat timer", context.getAddress());
      currentTimer.cancel();
    }
  }

  @Override
  public synchronized CompletableFuture<Void> close() {
    return super.close().thenRun(this::cancelPingTimer);
  }

  /**
   * Log replicator.
   */
  private class Replicator {
    private final Set<MemberState> committing = new HashSet<>();
    private long commitTime;
    private int commitFailures;
    private CompletableFuture<Long> commitFuture;
    private CompletableFuture<Long> nextCommitFuture;
    private final TreeMap<Long, CompletableFuture<Long>> commitFutures = new TreeMap<>();
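
    // Illustrative arithmetic: in a five-member cluster the quorum is 3. The sorted members
    // list excludes the local leader, so index getQuorum() - 2 = 1 selects the second-best of
    // the four remaining members; that member, one better-replicated member, and the leader
    // itself together form a majority of three.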
    /**
     * Returns the current quorum index.
     *
     * @return The current quorum index.
     */
    private int quorumIndex() {
      return context.getCluster().getQuorum() - 2;
    }

    /**
     * Triggers a commit.
     *
     * @return A completable future to be completed the next time entries are committed to a majority of the cluster.
     */
    private CompletableFuture<Long> commit() {
      if (context.getCluster().getMembers().size() == 0)
        return CompletableFuture.completedFuture(null);

      if (commitFuture == null) {
        commitFuture = new CompletableFuture<>();
        commitTime = System.currentTimeMillis();
        for (MemberState member : context.getCluster().getMembers()) {
          commit(member);
        }
        return commitFuture;
      } else if (nextCommitFuture == null) {
        nextCommitFuture = new CompletableFuture<>();
        return nextCommitFuture;
      } else {
        return nextCommitFuture;
      }
    }

    /**
     * Registers a commit handler for the given commit index.
     *
     * @param index The index for which to register the handler.
     * @return A completable future to be completed once the given log index has been committed.
     */
    private CompletableFuture<Long> commit(long index) {
      if (index == 0)
        return commit();

      if (context.getCluster().getActiveMembers().isEmpty()) {
        context.setCommitIndex(index);
        return CompletableFuture.completedFuture(index);
      }

      return commitFutures.computeIfAbsent(index, i -> {
        for (MemberState member : context.getCluster().getMembers()) {
          commit(member);
        }
        return new CompletableFuture<>();
      });
    }

    /**
     * Returns the last time a majority of the cluster was contacted.
     */
    private long commitTime() {
      int quorumIndex = quorumIndex();
      if (quorumIndex >= 0) {
        long commitTime = context.getCluster().getActiveMembers((m1, m2) -> (int) (m2.getCommitTime() - m1.getCommitTime())).get(quorumIndex).getCommitTime();
        if (commitTime > 0)
          return commitTime;
      }
      return System.currentTimeMillis();
    }

    /**
     * Sets a commit time or fails the commit if a quorum of successful responses cannot be achieved.
     */
    private void commitTime(MemberState member, Throwable error) {
      if (commitFuture == null) {
        return;
      }

      boolean completed = false;
      if (error != null && member.getCommitStartTime() == this.commitTime) {
        int activeMemberSize = context.getCluster().getActiveMembers().size() + (context.getCluster().isActive() ? 1 : 0);
        int quorumSize = context.getCluster().getQuorum();
        // If a quorum of successful responses cannot be achieved, fail this commit.
        if (activeMemberSize - quorumSize + 1 <= ++commitFailures) {
          commitFuture.completeExceptionally(new InternalException("Failed to reach consensus"));
          completed = true;
        }
      } else {
        member.setCommitTime(System.currentTimeMillis());

        // Sort the list of commit times. Use the quorum index to get the last time the majority of the cluster
        // was contacted. If the current commitFuture's time is less than the commit time then trigger the
        // commit future and reset it to the next commit future.
        if (this.commitTime <= commitTime()) {
          commitFuture.complete(null);
          completed = true;
        }
      }

      if (completed) {
        commitFailures = 0;
        commitFuture = nextCommitFuture;
        nextCommitFuture = null;
        if (commitFuture != null) {
          this.commitTime = System.currentTimeMillis();
          for (MemberState replica : context.getCluster().getMembers()) {
            commit(replica);
          }
        }
      }
    }
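
    // Illustrative example: with follower match indexes sorted to [9, 7, 5, 3] and a quorum of
    // 3, quorumIndex() = 1 selects match index 7; the leader plus the followers at 9 and 7
    // hold every entry up to 7, so 7 becomes the commit index, while the trailing match index
    // (3) bounds the global index used to compact the log.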
    /**
     * Checks whether any futures can be completed.
     */
    private void commitEntries() {
      context.checkThread();

      // Sort the list of replicas, order by the last index that was replicated
      // to the replica. This will allow us to determine the median index
      // for all known replicated entries across all cluster members.
      List<MemberState> members = context.getCluster().getActiveMembers((m1, m2) ->
        Long.compare(m2.getMatchIndex() != 0 ? m2.getMatchIndex() : 0L, m1.getMatchIndex() != 0 ? m1.getMatchIndex() : 0L));

      // Set the current commit index as the median replicated index.
      // Since replicas is a list with zero based indexes, use the negation of
      // the required quorum count to get the index of the replica with the least
      // possible quorum replication. That replica's match index is the commit index.
      // Set the commit index. Once the commit index has been set we can run
      // all tasks up to the given commit.
      long commitIndex = members.get(quorumIndex()).getMatchIndex();
      long globalIndex = members.get(members.size() - 1).getMatchIndex();
      if (commitIndex > 0 && (leaderIndex > 0 && commitIndex >= leaderIndex)) {
        context.setCommitIndex(commitIndex);
        context.setGlobalIndex(globalIndex);
        context.getLog().commit(globalIndex);
        SortedMap<Long, CompletableFuture<Long>> futures = commitFutures.headMap(commitIndex, true);
        for (Map.Entry<Long, CompletableFuture<Long>> entry : futures.entrySet()) {
          entry.getValue().complete(entry.getKey());
        }
        futures.clear();
      }
    }

    /**
     * Triggers a commit for the replica.
     */
    private void commit(MemberState member) {
      if (!committing.contains(member) && isOpen()) {
        // If the log is empty then send an empty commit.
        // If the next index hasn't yet been set then we send an empty commit first.
        // If the next index is greater than the last index then send an empty commit.
        if (context.getLog().isEmpty() || member.getNextIndex() > context.getLog().lastIndex()) {
          emptyCommit(member);
        } else {
          entriesCommit(member);
        }
      }
    }

    /**
     * Gets the previous index.
     */
    private long getPrevIndex(MemberState member) {
      return member.getNextIndex() - 1;
    }

    /**
     * Gets the previous entry.
     */
    private Entry getPrevEntry(MemberState member, long prevIndex) {
      if (prevIndex > 0) {
        return context.getLog().get(prevIndex);
      }
      return null;
    }

    /**
     * Performs an empty commit.
     */
    private void emptyCommit(MemberState member) {
      long prevIndex = getPrevIndex(member);
      Entry prevEntry = getPrevEntry(member, prevIndex);

      AppendRequest.Builder builder = AppendRequest.builder()
        .withTerm(context.getTerm())
        .withLeader(context.getAddress().hashCode())
        .withLogIndex(prevIndex)
        .withLogTerm(prevEntry != null ? prevEntry.getTerm() : 0)
        .withCommitIndex(context.getCommitIndex())
        .withGlobalIndex(context.getGlobalIndex());

      commit(member, builder.build(), false);
    }

    /**
     * Performs a commit with entries.
     */
    private void entriesCommit(MemberState member) {
      long prevIndex = getPrevIndex(member);
      Entry prevEntry = getPrevEntry(member, prevIndex);

      AppendRequest.Builder builder = AppendRequest.builder()
        .withTerm(context.getTerm())
        .withLeader(context.getAddress().hashCode())
        .withLogIndex(prevIndex)
        .withLogTerm(prevEntry != null ? prevEntry.getTerm() : 0)
        .withCommitIndex(context.getCommitIndex())
        .withGlobalIndex(context.getGlobalIndex());

      if (!context.getLog().isEmpty()) {
        long index = prevIndex != 0 ? prevIndex + 1 : context.getLog().firstIndex();

        int size = 0;
        while (index <= context.getLog().lastIndex()) {
          Entry entry = context.getLog().get(index);
          if (entry != null) {
            if (size + entry.size() > MAX_BATCH_SIZE) {
              break;
            }
            size += entry.size();
            builder.addEntry(entry);
          }
          index++;
        }
      }

      if (prevEntry != null) {
        prevEntry.release();
      }

      commit(member, builder.build(), true);
    }
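
    // The committing set allows at most one in-flight AppendRequest per member: a member is
    // skipped while its previous request is outstanding, and each request batches entries up
    // to MAX_BATCH_SIZE bytes.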
    /**
     * Connects to the member and sends a commit message.
     */
    private void commit(MemberState member, AppendRequest request, boolean recursive) {
      committing.add(member);
      member.setCommitStartTime(commitTime);

      LOGGER.debug("{} - Sent {} to {}", context.getAddress(), request, member.getAddress());
      context.getConnections().getConnection(member.getAddress()).whenComplete((connection, error) -> {
        context.checkThread();

        if (isOpen()) {
          if (error == null) {
            commit(connection, member, request, recursive);
          } else {
            committing.remove(member);
            commitTime(member, error);
            failAttempt(member, error);
          }
        }
      });
    }

    /**
     * Sends a commit message.
     */
    private void commit(Connection connection, MemberState member, AppendRequest request, boolean recursive) {
      connection.<AppendRequest, AppendResponse>send(request).whenComplete((response, error) -> {
        committing.remove(member);
        context.checkThread();

        if (isOpen()) {
          if (error == null) {
            LOGGER.debug("{} - Received {} from {}", context.getAddress(), response, member.getAddress());
            if (response.status() == Response.Status.OK) {
              // Reset the member failure count.
              member.resetFailureCount();

              // Update the commit time for the replica. This will cause heartbeat futures to be triggered.
              commitTime(member, null);

              // If replication succeeded then trigger commit futures.
              if (response.succeeded()) {
                updateMatchIndex(member, response);
                updateNextIndex(member);
                updateConfiguration(member);

                // If entries were committed to the replica then check commit indexes.
                if (recursive) {
                  commitEntries();
                }

                // If there are more entries to send then attempt to send another commit.
                if (hasMoreEntries(member)) {
                  commit();
                }
              } else if (response.term() > context.getTerm()) {
                context.setLeader(0);
                transition(CopycatServer.State.FOLLOWER);
              } else {
                resetMatchIndex(member, response);
                resetNextIndex(member);

                // If there are more entries to send then attempt to send another commit.
                if (hasMoreEntries(member)) {
                  commit();
                }
              }
            } else if (response.term() > context.getTerm()) {
              LOGGER.debug("{} - Received higher term from {}", context.getAddress(), member.getAddress());
              context.setLeader(0);
              transition(CopycatServer.State.FOLLOWER);
            } else {
              int failures = member.incrementFailureCount();
              if (failures <= 3 || failures % 100 == 0) {
                LOGGER.warn("{} - {}", context.getAddress(), response.error() != null ? response.error() : "");
              }
            }
          } else {
            commitTime(member, error);
            failAttempt(member, error);
          }
        }
      });
    }

    /**
     * Fails an attempt to contact a member.
     */
    private void failAttempt(MemberState member, Throwable error) {
      int failures = member.incrementFailureCount();
      if (failures <= 3 || failures % 100 == 0) {
        LOGGER.warn("{} - {}", context.getAddress(), error.getMessage());
      }

      // Verify that the leader has contacted a majority of the cluster within the last two election timeouts.
      // If the leader is not able to contact a majority of the cluster within two election timeouts, assume
      // that a partition occurred and transition back to the FOLLOWER state.
      if (System.currentTimeMillis() - commitTime() > context.getElectionTimeout().toMillis() * 2) {
        LOGGER.warn("{} - Suspected network partition. Stepping down", context.getAddress());
        context.setLeader(0);
        transition(CopycatServer.State.FOLLOWER);
      }
    }

    /**
     * Returns a boolean value indicating whether there are more entries to send.
     */
    private boolean hasMoreEntries(MemberState member) {
      return member.getNextIndex() < context.getLog().lastIndex();
    }

    /**
     * Updates the match index when a response is received.
     */
    private void updateMatchIndex(MemberState member, AppendResponse response) {
      // If the replica returned a valid match index then update the existing match index. Because the
      // replicator pipelines replication, we perform a MAX(matchIndex, logIndex) to get the true match index.
      member.setMatchIndex(Math.max(member.getMatchIndex(), response.logIndex()));
    }

    /**
     * Updates the next index when the match index is updated.
     */
    private void updateNextIndex(MemberState member) {
      // If the match index was set, update the next index to be greater than the match index if necessary.
      // Note that because of pipelining append requests, the next index can potentially be much larger than
      // the match index. We rely on the algorithm to reject invalid append requests.
      member.setNextIndex(Math.max(member.getNextIndex(), Math.max(member.getMatchIndex() + 1, 1)));
    }
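
    // Passive members replicate the log but do not vote; once a passive member's match index
    // has caught up to the commit index it is promoted into the active membership by logging
    // and committing a new ConfigurationEntry.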
    /**
     * Updates the cluster configuration for the given member.
     */
    private void updateConfiguration(MemberState member) {
      if (context.getCluster().isPassiveMember(member) && member.getMatchIndex() >= context.getCommitIndex()) {
        if (configuring > 0) {
          commit(configuring).whenComplete((result, error) -> {
            promoteConfiguration(member);
          });
        } else {
          promoteConfiguration(member);
        }
      }
    }

    /**
     * Promotes the given member.
     */
    private void promoteConfiguration(MemberState member) {
      LOGGER.info("{} - Promoting {}", context.getAddress(), member);

      Collection<Address> activeMembers = context.getCluster().buildActiveMembers();
      activeMembers.add(member.getAddress());

      Collection<Address> passiveMembers = context.getCluster().buildPassiveMembers();
      passiveMembers.remove(member.getAddress());

      try (ConfigurationEntry entry = context.getLog().create(ConfigurationEntry.class)) {
        entry.setTerm(context.getTerm())
          .setActive(activeMembers)
          .setPassive(passiveMembers);
        long index = context.getLog().append(entry);
        LOGGER.debug("{} - Appended {} to log at index {}", context.getAddress(), entry, index);

        // Immediately apply the configuration upon appending the configuration entry.
        configuring = index;
        context.getCluster().configure(entry.getIndex(), entry.getActive(), entry.getPassive());
      }

      commit(configuring).whenComplete((result, error) -> {
        context.checkThread();
        configuring = 0;
      });
    }

    /**
     * Resets the match index when a response fails.
     */
    private void resetMatchIndex(MemberState member, AppendResponse response) {
      member.setMatchIndex(response.logIndex());
      LOGGER.debug("{} - Reset match index for {} to {}", context.getAddress(), member, member.getMatchIndex());
    }

    /**
     * Resets the next index when a response fails.
     */
    private void resetNextIndex(MemberState member) {
      if (member.getMatchIndex() != 0) {
        member.setNextIndex(member.getMatchIndex() + 1);
      } else {
        member.setNextIndex(context.getLog().firstIndex());
      }
      LOGGER.debug("{} - Reset next index for {} to {}", context.getAddress(), member, member.getNextIndex());
    }
  }

}