All Downloads are FREE. Search and download functionalities are using the official Maven repository.

io.journalkeeper.core.server.Voter Maven / Gradle / Ivy

There is a newer version: 0.1.11
Show newest version
/**
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
* Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package io.journalkeeper.core.server; import io.journalkeeper.core.api.JournalEntry; import io.journalkeeper.core.api.JournalEntryParser; import io.journalkeeper.core.api.ServerStatus; import io.journalkeeper.core.api.SnapshotEntry; import io.journalkeeper.core.api.SnapshotsEntry; import io.journalkeeper.core.api.StateFactory; import io.journalkeeper.core.api.UpdateRequest; import io.journalkeeper.core.api.VoterState; import io.journalkeeper.core.api.transaction.UUIDTransactionId; import io.journalkeeper.core.entry.internal.InternalEntriesSerializeSupport; import io.journalkeeper.core.entry.internal.InternalEntryType; import io.journalkeeper.core.entry.internal.UpdateVotersS1Entry; import io.journalkeeper.exceptions.UpdateConfigurationException; import io.journalkeeper.core.journal.Journal; import io.journalkeeper.core.state.ConfigState; import io.journalkeeper.exceptions.NotLeaderException; import io.journalkeeper.persistence.ServerMetadata; import io.journalkeeper.rpc.client.CheckLeadershipResponse; import io.journalkeeper.rpc.client.CompleteTransactionRequest; import io.journalkeeper.rpc.client.CompleteTransactionResponse; import io.journalkeeper.rpc.client.CreateTransactionRequest; import io.journalkeeper.rpc.client.CreateTransactionResponse; import io.journalkeeper.rpc.client.GetOpeningTransactionsResponse; import io.journalkeeper.rpc.client.GetServerStatusResponse; import io.journalkeeper.rpc.client.GetSnapshotsResponse; import io.journalkeeper.rpc.client.LastAppliedResponse; import io.journalkeeper.rpc.client.QueryStateRequest; import io.journalkeeper.rpc.client.QueryStateResponse; import io.journalkeeper.rpc.client.UpdateClusterStateRequest; 
import io.journalkeeper.rpc.client.UpdateClusterStateResponse; import io.journalkeeper.rpc.client.UpdateVotersRequest; import io.journalkeeper.rpc.client.UpdateVotersResponse; import io.journalkeeper.rpc.server.AsyncAppendEntriesRequest; import io.journalkeeper.rpc.server.AsyncAppendEntriesResponse; import io.journalkeeper.rpc.server.DisableLeaderWriteRequest; import io.journalkeeper.rpc.server.DisableLeaderWriteResponse; import io.journalkeeper.rpc.server.InstallSnapshotRequest; import io.journalkeeper.rpc.server.InstallSnapshotResponse; import io.journalkeeper.rpc.server.RequestVoteRequest; import io.journalkeeper.rpc.server.RequestVoteResponse; import io.journalkeeper.rpc.server.ServerRpcAccessPoint; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.net.URI; import java.nio.file.Paths; import java.util.List; import java.util.Properties; import java.util.concurrent.CompletableFuture; import java.util.concurrent.CompletionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.ScheduledFuture; import java.util.concurrent.ThreadLocalRandom; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Collectors; import static io.journalkeeper.core.api.RaftJournal.INTERNAL_PARTITION; /** * @author LiYue * Date: 2019-03-18 */ class Voter extends AbstractServer implements CheckTermInterceptor { private static final Logger logger = LoggerFactory.getLogger(Voter.class); /** * 触发切换指定Leader的门限值 */ private static final long PREFERRED_LEADER_IN_SYNC_THRESHOLD = 128L; /** * Voter最后知道的任期号(从 0 开始递增) */ private final AtomicInteger currentTerm = new AtomicInteger(0); // LEADER ONLY /** * 串行处理所有RequestVoterRPC request/response */ private final Object voteRequestMutex = new Object(); private final Config config; /** * 选民状态,在LEADER、FOLLOWER和CANDIDATE之间转换。初始值为FOLLOWER。 */ private 
final VoterStateMachine voterState = new VoterStateMachine(); /** * 在当前任期内收到选票的候选人地址(如果没有就为 null) */ private URI votedFor = null; /** * 选举(心跳)超时 */ private long electionTimeoutMs; /** * 上次从LEADER收到心跳(asyncAppendEntries)的时间戳 */ private long lastHeartbeat = 0L; /** * 检查选举超时定时任务 */ private ScheduledFuture checkElectionTimeoutFuture; private ScheduledFuture printStateFuture; private Leader leader; private Follower follower; // 下次发起选举的时间 private long nextElectionTime = 0L; Voter(StateFactory stateFactory, JournalEntryParser journalEntryParser, ScheduledExecutorService scheduledExecutor, ExecutorService asyncExecutor, ServerRpcAccessPoint serverRpcAccessPoint, Properties properties) { super(stateFactory, journalEntryParser, scheduledExecutor, asyncExecutor, serverRpcAccessPoint, properties); this.config = toConfig(properties); state.addInterceptor(InternalEntryType.TYPE_UPDATE_VOTERS_S1, this::applyUpdateVotersInternalEntry); electionTimeoutMs = config.getElectionTimeoutMs() + randomInterval(config.getElectionTimeoutMs()); } private void applyUpdateVotersInternalEntry(InternalEntryType type, byte[] internalEntry) { voterConfigManager.applyReservedEntry(type, internalEntry, voterState(), state.getConfigState(), this); } @Override protected void onJournalFlushed() { if (null != leader) { leader.onJournalFlushed(); } } private Config toConfig(Properties properties) { Config config = new Config(); config.setElectionTimeoutMs(Long.parseLong( properties.getProperty( Config.ELECTION_TIMEOUT_KEY, String.valueOf(Config.DEFAULT_ELECTION_TIMEOUT_MS)))); config.setSnapshotIntervalSec(Integer.parseInt( properties.getProperty( Config.SNAPSHOT_INTERVAL_SEC_KEY, String.valueOf(AbstractServer.Config.DEFAULT_SNAPSHOT_INTERVAL_SEC)))); config.setJournalRetentionMin(Integer.parseInt( properties.getProperty( Config.JOURNAL_RETENTION_MIN_KEY, String.valueOf(AbstractServer.Config.DEFAULT_JOURNAL_RETENTION_MIN)))); config.setHeartbeatIntervalMs(Long.parseLong( properties.getProperty( 
Config.HEARTBEAT_INTERVAL_KEY, String.valueOf(Config.DEFAULT_HEARTBEAT_INTERVAL_MS)))); config.setTransactionTimeoutMs(Long.parseLong( properties.getProperty( Config.TRANSACTION_TIMEOUT_MS_KEY, String.valueOf(Config.DEFAULT_TRANSACTION_TIMEOUT_MS)))); config.setReplicationBatchSize(Integer.parseInt( properties.getProperty( Config.REPLICATION_BATCH_SIZE_KEY, String.valueOf(Config.DEFAULT_REPLICATION_BATCH_SIZE)))); config.setCacheRequests(Integer.parseInt( properties.getProperty( Config.CACHE_REQUESTS_KEY, String.valueOf(Config.DEFAULT_CACHE_REQUESTS)))); config.setEnablePreVote(Boolean.parseBoolean( properties.getProperty( Config.ENABLE_PRE_VOTE_KEY, String.valueOf(Config.DEFAULT_ENABLE_EVENTS)))); config.setRpcTimeoutMs(Long.parseLong( properties.getProperty( AbstractServer.Config.RPC_TIMEOUT_MS_KEY, String.valueOf(AbstractServer.Config.DEFAULT_RPC_TIMEOUT_MS)))); config.setFlushIntervalMs(Long.parseLong( properties.getProperty( AbstractServer.Config.FLUSH_INTERVAL_MS_KEY, String.valueOf(AbstractServer.Config.DEFAULT_FLUSH_INTERVAL_MS)))); config.setPrintStateIntervalSec(Integer.parseInt( properties.getProperty( Config.PRINT_STATE_INTERVAL_SEC_KEY, String.valueOf(Config.DEFAULT_PRINT_STATE_INTERVAL_SEC)))); config.setWorkingDir(Paths.get( properties.getProperty(AbstractServer.Config.WORKING_DIR_KEY, config.getWorkingDir().normalize().toString()))); config.setGetStateBatchSize(Integer.parseInt( properties.getProperty( AbstractServer.Config.GET_STATE_BATCH_SIZE_KEY, String.valueOf(AbstractServer.Config.DEFAULT_GET_STATE_BATCH_SIZE)))); config.setEnableMetric(Boolean.parseBoolean( properties.getProperty( AbstractServer.Config.ENABLE_METRIC_KEY, String.valueOf(AbstractServer.Config.DEFAULT_ENABLE_METRIC)))); config.setPrintMetricIntervalSec(Integer.parseInt( properties.getProperty( AbstractServer.Config.PRINT_METRIC_INTERVAL_SEC_KEY, String.valueOf(AbstractServer.Config.DEFAULT_PRINT_METRIC_INTERVAL_SEC)))); config.setEnableEvents(Boolean.parseBoolean( 
properties.getProperty( AbstractServer.Config.ENABLE_EVENTS_KEY, String.valueOf(AbstractServer.Config.DEFAULT_ENABLE_EVENTS)))); return config; } private void checkElectionTimeout() { try { if (voterState() == VoterState.FOLLOWER && System.currentTimeMillis() - lastHeartbeat > electionTimeoutMs) { convertToPreVoting(); // 如果不开启PreVote,直接转换成候选人 if(!config.isEnablePreVote()) { convertToCandidate(); } nextElectionTime = System.currentTimeMillis() + electionTimeoutMs; } if ((voterState() == VoterState.PRE_VOTING || voterState() == VoterState.CANDIDATE) && System.currentTimeMillis() > nextElectionTime) { startElection(false); } if (checkPreferredLeader()) { convertToPreVoting(); convertToCandidate(); startElection(true); } } catch (Throwable t) { logger.warn("CheckElectionTimeout Exception, {}: ", voterInfo(), t); } } private boolean isSingleNodeCluster() { return !state.getConfigState().isJointConsensus() && state.getConfigState().voters().size() == 1 && state.getConfigState().voters().contains(uri); } /** * 发起选举。 * 0. 角色转变为候选人 * 1. 自增当前任期号:term = term + 1; * 2. 给自己投票; * 3. 重置选举计时器:lastHeartBeat = now,生成一个随机的新的选举超时时间(RAFT的推荐值为150~300ms)。 * 4. 向其他Voter发送RequestVote请求 * 4.1. 如果收到了来自大多数服务器的投票:成为LEADER * 4.2. 如果收到了来自新领导人的asyncAppendEntries请求(heartbeat):转换状态为FOLLOWER * 4.3. 
如果选举超时:开始新一轮的选举 */ private void startElection(boolean fromPreferredLeader) { nextElectionTime = Long.MAX_VALUE; votedFor = uri; boolean isPreVote = voterState.getState() == VoterState.PRE_VOTING; int term ; if (!isPreVote) { term = currentTerm.incrementAndGet(); logger.info("Start election, {}", voterInfo()); } else { term = currentTerm.get() + 1; logger.info("Start pre vote, {}", voterInfo()); } long lastLogIndex = journal.maxIndex() - 1; int lastLogTerm = journal.getTerm(lastLogIndex); RequestVoteRequest request = new RequestVoteRequest(term, uri, lastLogIndex, lastLogTerm, fromPreferredLeader, isPreVote); List destinations = state.getConfigState().voters().stream() .filter(uri -> !uri.equals(this.uri)).collect(Collectors.toList()); final AtomicBoolean isWinTheElection = new AtomicBoolean(false); final AtomicInteger votesGrantedInNewConfig = new AtomicInteger(0); final AtomicInteger votesGrantedInOldConfig = new AtomicInteger(0); updateVotes(isWinTheElection, votesGrantedInNewConfig, votesGrantedInOldConfig, this.uri); if (!isWinTheElection.get()) { final AtomicInteger pendingRequests = new AtomicInteger(destinations.size()); for (URI destination : destinations) { getServerRpc(destination) .thenComposeAsync(serverRpc -> { if (null != serverRpc) { logger.info("Request vote, dest uri: {}, {}...", serverRpc.serverUri(), voterInfo()); return serverRpc.requestVote(request) .thenApply(response -> { response.setUri(serverRpc.serverUri()); return response; }); } else { return CompletableFuture.completedFuture(null); } }, asyncExecutor) .thenAccept(response -> { if (null != response) { logger.info("Request vote result {}, dest uri: {}, {}...", response.isVoteGranted(), response.getUri(), voterInfo()); if (response.isVoteGranted()) { updateVotes(isWinTheElection, votesGrantedInNewConfig, votesGrantedInOldConfig, response.getUri()); } } }) .exceptionally(e -> { logger.warn("Request vote exception: {}!", e.getMessage()); return null; }).thenRun(() -> { if 
(pendingRequests.decrementAndGet() == 0 && !isWinTheElection.get()) { electionTimeoutMs = config.getElectionTimeoutMs() + randomInterval(config.getElectionTimeoutMs()); nextElectionTime = System.currentTimeMillis() + electionTimeoutMs; } }); } } } private void updateVotes(AtomicBoolean isWinTheElection, AtomicInteger votesGrantedInNewConfig, AtomicInteger votesGrantedInOldConfig, URI destination) { ConfigState configState = state.getConfigState(); if (configState.getConfigNew().contains(destination)) { votesGrantedInNewConfig.incrementAndGet(); } if (configState.getConfigOld().contains(destination)) { votesGrantedInOldConfig.incrementAndGet(); } boolean win; if (configState.isJointConsensus()) { win = votesGrantedInNewConfig.get() >= configState.getConfigNew().size() / 2 + 1 && votesGrantedInOldConfig.get() >= configState.getConfigOld().size() / 2 + 1; } else { win = votesGrantedInNewConfig.get() >= configState.getConfigNew().size() / 2 + 1; } if (win && isWinTheElection.compareAndSet(false, true)) { if (voterState.getState() == VoterState.PRE_VOTING) { convertToCandidate(); startElection(false); } else if (voterState.getState() == VoterState.CANDIDATE) { convertToLeader(); } } } private void convertToCandidate() { synchronized (voterState) { VoterState oldState = voterState.getState(); voterState.convertToCandidate(); logger.info("Convert voter state from {} to CANDIDATE, electionTimeout: {}, {}.", oldState, electionTimeoutMs, voterInfo()); } } private void convertToPreVoting() { synchronized (voterState) { VoterState oldState = voterState.getState(); if (oldState == VoterState.FOLLOWER && null != follower) { follower.stop(); follower = null; } voterState.convertToPreVoting(); logger.info("Convert voter state from {} to PRE_VOTING, electionTimeout: {}, {}.", oldState, electionTimeoutMs, voterInfo()); } } /** * 将状态转换为Leader */ private void convertToLeader() { synchronized (voterState) { VoterState oldState = voterState.getState(); voterState.convertToLeader(); 
this.leader = new Leader(journal, state, snapshots, currentTerm.get(), uri, config.getCacheRequests(), config.getHeartbeatIntervalMs(), config.getRpcTimeoutMs(), config.getReplicationBatchSize(), config.getSnapshotIntervalSec(), threads, this, this, scheduledExecutor, voterConfigManager, this, this.journalEntryParser, config.getTransactionTimeoutMs(), snapshots); leader.start(); this.leaderUri = this.uri; logger.info("Convert voter state from {} to LEADER, {}.", oldState, voterInfo()); } } @Override protected void onJournalRecovered(Journal journal) { super.onJournalRecovered(journal); maybeUpdateTermOnRecovery(journal); } private void maybeUpdateTermOnRecovery(Journal journal) { if (journal.minIndex() < journal.maxIndex()) { JournalEntry lastEntry = journal.read(journal.maxIndex() - 1); if (lastEntry.getTerm() > currentTerm.get()) { currentTerm.set(lastEntry.getTerm()); logger.info("Set current term to {}, this is the term of the last entry in the journal.", currentTerm.get()); } } } private void convertToFollower() { synchronized (voterState) { VoterState oldState = voterState.getState(); if (oldState == VoterState.LEADER && null != leader) { leader.stop(); leader = null; } voterState.convertToFollower(); if (oldState == VoterState.FOLLOWER && null != follower) { follower.stop(); follower = null; } follower = new Follower(journal, state, uri, currentTerm.get(), voterConfigManager, threads, snapshots, config.getCacheRequests()); follower.start(); this.electionTimeoutMs = config.getElectionTimeoutMs() + randomInterval(config.getElectionTimeoutMs()); logger.info("Convert voter state from {} to FOLLOWER, electionTimeout: {}, {}.", oldState, electionTimeoutMs, voterInfo()); } } @Override public Roll roll() { return Roll.VOTER; } /** * 将请求放到待处理队列中。 */ @Override public CompletableFuture asyncAppendEntries(AsyncAppendEntriesRequest request) { checkTerm(request.getTerm()); if (request.getTerm() < currentTerm.get()) { // 如果收到的请求term小于当前term,拒绝请求 return 
CompletableFuture.supplyAsync(() -> new AsyncAppendEntriesResponse(false, request.getPrevLogIndex() + 1, currentTerm.get(), request.getEntries().size())); } if (voterState() != VoterState.FOLLOWER) { convertToFollower(); } if (logger.isDebugEnabled() && request.getEntries() != null && !request.getEntries().isEmpty()) { logger.debug("Received appendEntriesRequest, term: {}, leader: {}, prevLogIndex: {}, prevLogTerm: {}, " + "entries: {}, leaderCommit: {}, {}.", request.getTerm(), request.getLeader(), request.getPrevLogIndex(), request.getPrevLogTerm(), request.getEntries().size(), request.getLeaderCommit(), voterInfo()); } // reset heartbeat lastHeartbeat = System.currentTimeMillis(); if (logger.isDebugEnabled()) { logger.debug("Update lastHeartbeat, {}.", voterInfo()); } if (null != request.getLeader() && !request.getLeader().equals(leaderUri)) { leaderUri = request.getLeader(); } return CompletableFuture.completedFuture(follower.handleAppendEntriesRequest(request)); } /** * 接收者收到requestVote方法后的实现流程如下: *

* 如果请求中的任期号 < 节点当前任期号,返回false; * 如果votedFor为空或者与candidateId相同,并且候选人的日志至少和自己的日志一样新,则给该候选人投票; */ @Override public CompletableFuture requestVote(RequestVoteRequest request) { return CompletableFuture.supplyAsync(() -> { synchronized (voteRequestMutex) { logger.debug("RequestVoteRpc received: term: {}, candidate: {}, " + "lastLogIndex: {}, lastLogTerm: {}, fromPreferredLeader: {}, isPreVote: {}, {}.", request.getTerm(), request.getCandidate(), request.getLastLogIndex(), request.getLastLogTerm(), request.isFromPreferredLeader(), request.isPreVote(), voterInfo()); String rejectMsg; int currentTerm = this.currentTerm.get(); // 来自推荐Leader的投票请求例外 if (!request.isFromPreferredLeader()) { // 如果当前是LEADER那直接拒绝投票 if (voterState() == VoterState.LEADER) { rejectMsg = "I'm the leader"; return rejectAndResponse(currentTerm, request.getCandidate(), rejectMsg); } // 如何上次收到心跳的至今小于最小选举超时,拒绝投票 if (System.currentTimeMillis() - lastHeartbeat < config.getElectionTimeoutMs()) { rejectMsg = "An election timeout not passed since last heartbeat received"; return rejectAndResponse(currentTerm, request.getCandidate(), rejectMsg); } } // 如果候选人的term 比我的term小,拒绝投票 if (request.getTerm() < currentTerm) { rejectMsg = String.format("The candidate's term %d less than my term %d.", request.getTerm(), currentTerm); return rejectAndResponse(currentTerm, request.getCandidate(), rejectMsg); } if (!request.isPreVote()) { checkTerm(request.getTerm()); currentTerm = this.currentTerm.get(); } // 如果已经投票给其它候选人,拒绝投票 if (votedFor != null && currentTerm == request.getTerm() && !votedFor.equals(request.getCandidate())) { rejectMsg = "Already vote to " + votedFor.toString(); return rejectAndResponse(currentTerm, request.getCandidate(), rejectMsg); } // 如果term相同,候选人的日志比我的短,拒绝投票 final long finalMaxJournalIndex = journal.maxIndex(); final int lastLogTerm = journal.getTerm(finalMaxJournalIndex - 1); if ((request.getLastLogTerm() <= lastLogTerm && (request.getLastLogTerm() != lastLogTerm || request.getLastLogIndex() < 
finalMaxJournalIndex - 1))) { rejectMsg = "Candidate’s log is at least as up-to-date as my log"; return rejectAndResponse(currentTerm, request.getCandidate(), rejectMsg); } if(request.isPreVote()) { logger.debug("Grant pre vote to candidate {}, {}.", request.getCandidate(), voterInfo()); } else { logger.debug("Grant vote to candidate {}, {}.", request.getCandidate(), voterInfo()); this.votedFor = request.getCandidate(); } return new RequestVoteResponse(currentTerm, true); } }, asyncExecutor); } private RequestVoteResponse rejectAndResponse(int term, URI candidate, String rejectMessage) { logger.debug("Reject vote request from candidate {}, cause: [{}], {}.", candidate, rejectMessage, voterInfo()); return new RequestVoteResponse(term, false); } @Override public CompletableFuture disableLeaderWrite(DisableLeaderWriteRequest request) { return CompletableFuture.supplyAsync(() -> { if (voterState() == VoterState.LEADER && null != leader) { leader.disableWrite(request.getTimeoutMs(), request.getTerm()); } else { throw new NotLeaderException(leaderUri); } return new DisableLeaderWriteResponse(currentTerm.get()); }, asyncExecutor).exceptionally(DisableLeaderWriteResponse::new); } //Receiver implementation: //1. Reply immediately if term < currentTerm //2. Create new snapshot file if first chunk (offset is 0) //3. Write data into snapshot file at given offset //4. Reply and wait for more data chunks if done is false //5. Save snapshot file, discard any existing or partial snapshot //with a smaller index //6. If existing log entry has same index and term as snapshot’s //last included entry, retain log entries following it and reply //7. Discard the entire log //8. 
Reset state machine using snapshot contents (and load //snapshot’s cluster configuration) @Override public CompletableFuture installSnapshot(InstallSnapshotRequest request) { if (checkTerm(request.getTerm())) { return CompletableFuture.completedFuture(new InstallSnapshotResponse(currentTerm.get())); } lastHeartbeat = System.currentTimeMillis(); return installSnapshotAsync(request); } private CompletableFuture installSnapshotAsync(InstallSnapshotRequest request) { return CompletableFuture.supplyAsync(() -> { InstallSnapshotResponse response; try { installSnapshot(request.getOffset(), request.getLastIncludedIndex(), request.getLastIncludedTerm(), request.getData(), request.isDone()); response = new InstallSnapshotResponse(currentTerm.get()); } catch (Throwable t) { logger.warn("Install snapshot exception!", t); response = new InstallSnapshotResponse(t); } return response; }, asyncExecutor); } @Override public CompletableFuture updateClusterState(UpdateClusterStateRequest request) { Leader finalLeader = leader; if (isLeaderAvailable(finalLeader)) { return finalLeader.updateClusterState(request) .exceptionally(UpdateClusterStateResponse::new); } else { return CompletableFuture.completedFuture(new UpdateClusterStateResponse(new NotLeaderException(leaderUri))); } } @Override public CompletableFuture queryClusterState(QueryStateRequest request) { return waitLeadership() .thenApplyAsync(aVoid -> state.query(request.getQuery(), journal).getResult(), asyncExecutor) .thenApply(QueryStateResponse::new) .exceptionally(exception -> { try { throw exception instanceof CompletionException ? 
exception.getCause() : exception; } catch (Throwable t) { return new QueryStateResponse(t); } }); } @Override public boolean checkTerm(int term) { boolean isTermChanged; synchronized (currentTerm) { if (term > currentTerm.get()) { logger.info("Set current term from {} to {}, {}.", currentTerm.get(), term, voterInfo()); currentTerm.set(term); this.votedFor = null; isTermChanged = true; } else { isTermChanged = false; } } if (isTermChanged) { convertToFollower(); } return isTermChanged; } // 改为同步方法,提升性能 @Override public CompletableFuture lastApplied() { return waitLeadership() .thenApplyAsync(aVoid -> new LastAppliedResponse(state.lastApplied()), asyncExecutor) .exceptionally(exception -> { try { throw exception instanceof CompletionException ? exception.getCause() : exception; } catch (Throwable t) { return new LastAppliedResponse(t); } }); } private CompletableFuture waitLeadership() { Leader finalLeader = leader; if (isLeaderAvailable(finalLeader)) { return finalLeader.waitLeadership(); } else { CompletableFuture future = new CompletableFuture<>(); future.completeExceptionally(new NotLeaderException(this.leaderUri)); return future; } } @Override public CompletableFuture getServerStatus() { return CompletableFuture.supplyAsync(() -> new ServerStatus( Roll.VOTER, journal.minIndex(), journal.maxIndex(), journal.commitIndex(), state.lastApplied(), voterState()), asyncExecutor) .thenApply(GetServerStatusResponse::new); } /** * * 在处理变更集群配置时,JournalKeeper采用RAFT协议中推荐的,二阶段变更的方式来避免在配置变更过程中可能出现的集群分裂。 * 包括LEADER在内的,变更前后包含的所有节点,都通过二个阶段来安全的完成配置变更。配置变更期间,集群依然可以对外提供服务。 * * * **共同一致阶段:** 每个节点在写入配置变更日志$C_{old, new}$后,不用等到这条日志被提交,立即变更配置,进入共同一致阶段。 * 在这个阶段,使用新旧配置的两个集群(这个时候这两个集群可能共享大部分节点,并且拥有相同的LEADER节点)同时在线,每一条日志都需要, * 在使用新旧配置的二个集群中达成大多数一致。或者说,日志需要在新旧二个集群中,分别复制到超过半数以上的节点上,才能被提交。 * * * **新配置阶段:** 每个节点在写入配置变更日志$C_{new}$后,不用等到这条日志被提交,立即变更配置,进入新配置阶段,完成配置变更。 * * 当客户端调用updateVoters方法时: * * 1. LEADER先在本地写入配置变更日志$C_{old, new}$,然后立刻变更自身的配置为$C_{old, new}$,进入共同一致阶段。 * * 2. 
在共同一致阶段,LEADER把包括配置变更日志$C_{old, new}$和其它在这一阶段的其它日志,按照顺序一并复制到新旧两个集群的所有节点, * 每一条日志都需要在新旧二个集群都达到半数以上,才会被提交。**$C_{old, new}$被提交后, * 无论后续发生什么情况,这次配置变更最终都会执行成功。** * * 3. 处于新旧配置中的每个FOLLOWER节点,在收到并写入配置变更日志$C_{old, new}$后,无需等待日志提交, * 立刻将配置变更为$C_{old, new}$,并进入共同一致阶段。 * * 4. LEADER在配置变更日志$C_{old, new}$提交后,写入新的配置变更日志$C_{new}$,然后立即变更自身的配置为$C_{new}$,进入新配置阶段。 * * 5. 处于共同一致阶段中的每个FOLLOWER节点,在收到并写入配置变更日志$C_{new}$后,无需等待日志提交,立刻将配置变更为$C_{new}$, * 并进新配置阶段。此时节点需要检查一下自身是否还是是集群中的一员,如果不是,说明当前节点已经被从集群中移除,需要停止当前节点服务。 * * 6. LEADER在$C_{new}$被提交后,也需要检查一下自身是否还是是集群中的一员,如果不是,说明当前节点已经被从集群中移除, * 需要停止当前节点服务。新集群会自动选举出新的LEADER。 * * 如果变更过程中,节点发生了故障。为了确保节点能从故障中正确的恢复,需要保证: * **节点当前的配置总是和节点当前日志中最后一条配置变更日志(注意,这条日志可能已经提交也可能未被提交)保持一致。** * * 由于每个节点都遵循“写入配置变更日志-更新节点配置-提交配置变更日志”这样一个时序,所以,如果最后一条配置变更日志经被提交, * 那节点的配置和日志一定是一致的。但是,对于未提交配置变更日志,节点的配置有可能还没来得及更新就,节点宕机了。 * 这种情况下,节点的配置是落后于日志的,因此,需要: * * * 在节点启动时进行检查,如果存在一条未提交的配置变更日志,如果节点配置和日志不一致,需要按照日志更新节点配置。 * * 当节点删除未提交的日志时,如果被删除的日志中包含配置变更,需要将当前节点的配置也一并回滚;¡ * * 在这个方法中,只是构造第一阶段的配置变更日志$C_{old, new}$,调用{@link #updateClusterState(UpdateClusterStateRequest)}方法, * 正常写入$C_{old, new}$,$C_{old, new}$被提交之后,会返回 {@link UpdateClusterStateResponse},只要响应成功, * 虽然这时集群的配置并没有完成变更,但无论后续发生什么情况,集群最终都会完成此次变更。因此,直接返回客户端变更成功。 * * @param request See {@link UpdateVotersRequest} * @return See {@link UpdateVotersResponse} */ @Override public CompletableFuture updateVoters(UpdateVotersRequest request) { return CompletableFuture.supplyAsync( () -> new UpdateVotersS1Entry(request.getOldConfig(), request.getNewConfig()), asyncExecutor) .thenApply(InternalEntriesSerializeSupport::serialize) .thenApply(entry -> new UpdateClusterStateRequest(new UpdateRequest(entry, INTERNAL_PARTITION, 1))) .thenCompose(this::updateClusterState) .thenAccept(response -> { if (!response.success()) { throw new CompletionException(new UpdateConfigurationException("Failed to update voters configuration in step 1. 
" + response.errorString())); } }) .thenApply(aVoid -> new UpdateVotersResponse()) .exceptionally(UpdateVotersResponse::new); } @Override public CompletableFuture createTransaction(CreateTransactionRequest request) { if (voterState.getState() == VoterState.LEADER && leader != null) { return leader.createTransaction(request.getContext()) .thenApply(context -> new CreateTransactionResponse((UUIDTransactionId) context.transactionId(), context.timestamp())); } else { return CompletableFuture.completedFuture(new CreateTransactionResponse(new NotLeaderException(leaderUri))); } } @Override public CompletableFuture completeTransaction(CompleteTransactionRequest request) { if (voterState.getState() == VoterState.LEADER && leader != null) { return leader.completeTransaction(request.getTransactionId(), request.isCommitOrAbort()) .thenApply(aVoid -> new CompleteTransactionResponse()); } else { return CompletableFuture.completedFuture(new CompleteTransactionResponse(new NotLeaderException(leaderUri))); } } @Override public CompletableFuture getOpeningTransactions() { if (voterState.getState() == VoterState.LEADER && leader != null) { return CompletableFuture.completedFuture(leader.getOpeningTransactions()) .thenApply(GetOpeningTransactionsResponse::new); } else { return CompletableFuture.completedFuture(new GetOpeningTransactionsResponse(new NotLeaderException(leaderUri))); } } @Override public CompletableFuture checkLeadership() { return waitLeadership() .thenApply(aVoid -> new CheckLeadershipResponse()) .exceptionally(exception -> { try { throw exception instanceof CompletionException ? 
exception.getCause() : exception; } catch (Throwable t) { return new CheckLeadershipResponse(t); } }); } private boolean isLeaderAvailable(Leader finalLeader) { return voterState() == VoterState.LEADER && finalLeader != null; } @Override public CompletableFuture getSnapshots() { if (voterState.getState() == VoterState.LEADER && leader != null) { return CompletableFuture.completedFuture( snapshots.values() .stream() .map((snapshot) -> new SnapshotEntry(snapshot.lastApplied(), snapshot.timestamp())).collect(Collectors.toList())) .thenApply(SnapshotsEntry::new) .thenApply(GetSnapshotsResponse::new); } else { return CompletableFuture.completedFuture(new GetSnapshotsResponse(new NotLeaderException(leaderUri))); } } private void ensureLeadership(Leader finalLeader) { if (voterState() != VoterState.LEADER || finalLeader == null) { throw new NotLeaderException(leaderUri); } } private VoterState voterState() { return voterState.getState(); } @Override public void doStart() { if (isSingleNodeCluster()) { convertToPreVoting(); convertToCandidate(); convertToLeader(); } else { convertToFollower(); } this.checkElectionTimeoutFuture = scheduledExecutor.scheduleAtFixedRate(this::checkElectionTimeout, ThreadLocalRandom.current().nextLong(500L, 1000L), config.getHeartbeatIntervalMs(), TimeUnit.MILLISECONDS); if (config.getPrintStateIntervalSec() > 0) { this.printStateFuture = scheduledExecutor.scheduleAtFixedRate(this::printState, ThreadLocalRandom.current().nextLong(0, config.getPrintStateIntervalSec()), config.getPrintStateIntervalSec(), TimeUnit.SECONDS); } } private void printState() { logger.info(voterInfo()); } @Override public void doStop() { try { stopAndWaitScheduledFeature(checkElectionTimeoutFuture, 1000L); stopAndWaitScheduledFeature(printStateFuture, 1000L); if (null != leader) { leader.stop(); } if (null != follower) { follower.stop(); } } catch (Throwable t) { t.printStackTrace(); logger.warn("Exception, {}: ", voterInfo(), t); } } @Override protected void 
afterStateChanged(byte[] updateResult) { super.afterStateChanged(updateResult); if (null != leader) { try { leader.callback(state.lastApplied(), updateResult); } catch (Throwable e) { logger.warn("Callback exception! {}", voterInfo(), e); } } } @Override protected ServerMetadata createServerMetadata() { ServerMetadata serverMetadata = super.createServerMetadata(); serverMetadata.setCurrentTerm(currentTerm.get()); serverMetadata.setVotedFor(votedFor); return serverMetadata; } @Override protected void onMetadataRecovered(ServerMetadata metadata) { super.onMetadataRecovered(metadata); this.currentTerm.set(metadata.getCurrentTerm()); this.votedFor = metadata.getVotedFor(); } private String voterInfo() { String ret = String.format("VoterState: %s, currentTerm: %d, minIndex: %d, " + "maxIndex: %d, commitIndex: %d, lastApplied: %d, %s, leader: %s, uri: %s, ", voterState.getState(), currentTerm.get(), journal.minIndex(), journal.maxIndex(), journal.commitIndex(), state.lastApplied(), state.getConfigState().toString(), leaderUri, uri.toString()); if (leader != null) { ret += leader.getFollowers().stream().map(Leader.ReplicationDestination::toString).collect(Collectors.joining(", ")); } return ret; } private boolean checkPreferredLeader() { if (voterState().equals(VoterState.FOLLOWER) && serverUri().equals(state.getPreferredLeader()) && null != follower && follower.getLeaderMaxIndex() - journal.maxIndex() < PREFERRED_LEADER_IN_SYNC_THRESHOLD && follower.getLeaderMaxIndex() > 0) { // 给当前LEADER发RPC,停服。 logger.info("Send DisableLeaderWriteRequest to {}, {}", leaderUri, voterInfo()); getServerRpc(leaderUri) .thenComposeAsync(serverRpc -> serverRpc.disableLeaderWrite(new DisableLeaderWriteRequest(10 * config.getElectionTimeoutMs(), currentTerm.get())), asyncExecutor) .thenAccept(response -> { if (response.success() && response.getTerm() == currentTerm.get() && voterState() == VoterState.FOLLOWER && follower != null) { logger.info("Received DisableLeaderWriteResponse code: 
SUCCESS, {}", voterInfo());
                    follower.setReadyForStartPreferredLeaderElection(true);
                } else {
                    logger.info("Ignore DisableLeaderWriteResponse code: {}, term: {}, errString: {}, {}",
                            response.getStatusCode(), response.getTerm(), response.errorString(), voterInfo());
                }
            });
        }

        // Wait until the data is fully synchronized.
        // Start an election, wait to win enough votes, and become the new LEADER.
        // The result is true only when all of the following hold: this voter is a
        // FOLLOWER, its own URI equals the preferred leader, the follower object exists
        // and is flagged ready, and the follower's recorded leader max index equals
        // the local journal's max index.
        return (voterState().equals(VoterState.FOLLOWER)
                && serverUri().equals(state.getPreferredLeader())
                && null != follower
                && follower.isReadyForStartPreferredLeaderElection()
                && follower.getLeaderMaxIndex() == journal.maxIndex());
    }

    // ---------------------------------------------------------------------
    // Accessors below are for monitoring only.
    // ---------------------------------------------------------------------

    /**
     * Reads the current state from the voter state machine while holding the
     * {@code voterState} monitor.
     *
     * @return the state currently stored in the state machine
     */
    VoterState getVoterState() {
        synchronized (voterState) {
            return voterState.getState();
        }
    }

    /**
     * @return the current value of {@code votedFor}
     */
    URI getLastVote() {
        return votedFor;
    }

    /**
     * @return the current value of {@code electionTimeoutMs}
     */
    long getElectionTimeoutMs() {
        return electionTimeoutMs;
    }

    /**
     * @return the current value of {@code lastHeartbeat}
     */
    long getLastHeartbeat() {
        return lastHeartbeat;
    }

    /**
     * @return the preferred leader as reported by {@code state}
     */
    URI getPreferredLeader() {
        return state.getPreferredLeader();
    }

    /**
     * @return the current {@code leader} reference
     */
    Leader getLeader() {
        return leader;
    }

    /**
     * @return the current {@code follower} reference
     */
    Follower getFollower() {
        return follower;
    }

    /**
     * @return the current value of {@code nextElectionTime}
     */
    long getNextElectionTime() {
        return nextElectionTime;
    }

    /**
     * @return the value held by {@code currentTerm}
     */
    int getTerm() {
        return currentTerm.get();
    }

    /**
     * Holds the voter's role and enforces which role changes are permitted.
     *
     * <p>Permitted transitions, as implemented below:
     * <ul>
     *   <li>{@code CANDIDATE -> LEADER}</li>
     *   <li>{@code PRE_VOTING -> CANDIDATE}</li>
     *   <li>{@code FOLLOWER | PRE_VOTING -> PRE_VOTING}</li>
     *   <li>any state {@code -> FOLLOWER}</li>
     * </ul>
     * Any other requested change throws {@link IllegalStateException}. The initial
     * state is {@code FOLLOWER}. This class performs no locking of its own.
     */
    private static class VoterStateMachine {
        private VoterState state = VoterState.FOLLOWER;

        /**
         * Moves from {@code CANDIDATE} to {@code LEADER}.
         *
         * @throws IllegalStateException if the current state is not {@code CANDIDATE}
         */
        private void convertToLeader() {
            if (state != VoterState.CANDIDATE) {
                throw illegalTransition(VoterState.LEADER);
            }
            state = VoterState.LEADER;
        }

        /**
         * Moves to {@code FOLLOWER} unconditionally.
         */
        private void convertToFollower() {
            state = VoterState.FOLLOWER;
        }

        /**
         * Moves from {@code PRE_VOTING} to {@code CANDIDATE}.
         *
         * @throws IllegalStateException if the current state is not {@code PRE_VOTING}
         */
        private void convertToCandidate() {
            if (state != VoterState.PRE_VOTING) {
                // The message names CANDIDATE, the state that was actually requested.
                throw illegalTransition(VoterState.CANDIDATE);
            }
            state = VoterState.CANDIDATE;
        }

        /**
         * Moves to {@code PRE_VOTING}; allowed from {@code FOLLOWER} or when already
         * {@code PRE_VOTING}.
         *
         * @throws IllegalStateException if the current state is neither
         *                               {@code PRE_VOTING} nor {@code FOLLOWER}
         */
        private void convertToPreVoting() {
            if (state != VoterState.PRE_VOTING && state != VoterState.FOLLOWER) {
                // The message names PRE_VOTING, the state that was actually requested.
                throw illegalTransition(VoterState.PRE_VOTING);
            }
            state = VoterState.PRE_VOTING;
        }

        /**
         * @return the current state
         */
        public VoterState getState() {
            return state;
        }

        /**
         * Builds the exception reported for a rejected transition.
         *
         * @param target the state the caller attempted to switch to
         * @return an exception whose message names the current and the target state
         */
        private IllegalStateException illegalTransition(VoterState target) {
            return new IllegalStateException(
                    String.format("Change voter state from %s to %s is not allowed!", state, target));
        }
    }

    /**
     * Mutable configuration holder for a voter, extending {@code AbstractServer.Config}.
     *
     * <p>Each setting has a {@code DEFAULT_*} constant used as the field's initial value
     * and a {@code *_KEY} string constant. NOTE(review): the keys are presumably the
     * property names used when loading configuration; confirm against the code that
     * reads them. Setters perform no validation.
     */
    public static class Config extends AbstractServer.Config {
        public static final long DEFAULT_HEARTBEAT_INTERVAL_MS = 100L;
        public static final long DEFAULT_ELECTION_TIMEOUT_MS = 300L;
        public static final int DEFAULT_REPLICATION_BATCH_SIZE = 128;
        public static final int DEFAULT_CACHE_REQUESTS = 1024;
        // 10 minutes expressed in milliseconds.
        public static final long DEFAULT_TRANSACTION_TIMEOUT_MS = 10L * 60 * 1000;
        public static final int DEFAULT_PRINT_STATE_INTERVAL_SEC = 0;
        public static final boolean DEFAULT_ENABLE_PRE_VOTE = true;

        public static final String HEARTBEAT_INTERVAL_KEY = "heartbeat_interval_ms";
        public static final String ELECTION_TIMEOUT_KEY = "election_timeout_ms";
        public static final String REPLICATION_BATCH_SIZE_KEY = "replication_batch_size";
        public static final String CACHE_REQUESTS_KEY = "cache_requests";
        public static final String TRANSACTION_TIMEOUT_MS_KEY = "transaction_timeout_ms";
        public static final String PRINT_STATE_INTERVAL_SEC_KEY = "print_state_interval_sec";
        public static final String ENABLE_PRE_VOTE_KEY = "enable_pre_vote";

        private long heartbeatIntervalMs = DEFAULT_HEARTBEAT_INTERVAL_MS;
        private long electionTimeoutMs = DEFAULT_ELECTION_TIMEOUT_MS; // Minimum election timeout
        private int replicationBatchSize = DEFAULT_REPLICATION_BATCH_SIZE;
        private int cacheRequests = DEFAULT_CACHE_REQUESTS;
        private long transactionTimeoutMs = DEFAULT_TRANSACTION_TIMEOUT_MS;
        private int printStateIntervalSec = DEFAULT_PRINT_STATE_INTERVAL_SEC;
        private boolean enablePreVote = DEFAULT_ENABLE_PRE_VOTE;

        /** @return the configured replication batch size */
        public int getReplicationBatchSize() {
            return replicationBatchSize;
        }

        /** @param replicationBatchSize the new replication batch size */
        public void setReplicationBatchSize(int replicationBatchSize) {
            this.replicationBatchSize = replicationBatchSize;
        }

        /** @return the configured heartbeat interval */
        public long getHeartbeatIntervalMs() {
            return heartbeatIntervalMs;
        }

        /** @param heartbeatIntervalMs the new heartbeat interval */
        public void setHeartbeatIntervalMs(long heartbeatIntervalMs) {
            this.heartbeatIntervalMs = heartbeatIntervalMs;
        }

        /** @return the configured (minimum) election timeout */
        public long getElectionTimeoutMs() {
            return electionTimeoutMs;
        }

        /** @param electionTimeoutMs the new (minimum) election timeout */
        public void setElectionTimeoutMs(long electionTimeoutMs) {
            this.electionTimeoutMs = electionTimeoutMs;
        }

        /** @return the configured cache-requests value */
        public int getCacheRequests() {
            return cacheRequests;
        }

        /** @param cacheRequests the new cache-requests value */
        public void setCacheRequests(int cacheRequests) {
            this.cacheRequests = cacheRequests;
        }

        /** @return the configured transaction timeout */
        public long getTransactionTimeoutMs() {
            return transactionTimeoutMs;
        }

        /** @param transactionTimeoutMs the new transaction timeout */
        public void setTransactionTimeoutMs(long transactionTimeoutMs) {
            this.transactionTimeoutMs = transactionTimeoutMs;
        }

        /** @return the configured print-state interval */
        public int getPrintStateIntervalSec() {
            return printStateIntervalSec;
        }

        /** @param printStateIntervalSec the new print-state interval */
        public void setPrintStateIntervalSec(int printStateIntervalSec) {
            this.printStateIntervalSec = printStateIntervalSec;
        }

        /** @return whether pre-vote is enabled */
        public boolean isEnablePreVote() {
            return enablePreVote;
        }

        /** @param enablePreVote whether pre-vote should be enabled */
        public void setEnablePreVote(boolean enablePreVote) {
            this.enablePreVote = enablePreVote;
        }
    }
}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy