org.elasticsearch.cluster.coordination.Coordinator Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of elasticsearch Show documentation
Elasticsearch subproject :server
There is a newer version: 8.13.4
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0 and the Server Side Public License, v 1; you may not use this file except
 * in compliance with, at your election, the Elastic License 2.0 or the Server
 * Side Public License, v 1.
 */
package org.elasticsearch.cluster.coordination;

import org.apache.logging.log4j.Level;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.util.SetOnce;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.ActionListenerResponseHandler;
import org.elasticsearch.action.support.ChannelActionListener;
import org.elasticsearch.action.support.ListenableActionFuture;
import org.elasticsearch.client.internal.Client;
import org.elasticsearch.cluster.ClusterName;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.ClusterStatePublicationEvent;
import org.elasticsearch.cluster.ClusterStateTaskConfig;
import org.elasticsearch.cluster.ClusterStateUpdateTask;
import org.elasticsearch.cluster.LocalMasterServiceTask;
import org.elasticsearch.cluster.block.ClusterBlocks;
import org.elasticsearch.cluster.coordination.ClusterFormationFailureHelper.ClusterFormationState;
import org.elasticsearch.cluster.coordination.CoordinationMetadata.VotingConfigExclusion;
import org.elasticsearch.cluster.coordination.CoordinationMetadata.VotingConfiguration;
import org.elasticsearch.cluster.coordination.CoordinationState.VoteCollection;
import org.elasticsearch.cluster.coordination.FollowersChecker.FollowerCheckRequest;
import org.elasticsearch.cluster.coordination.JoinHelper.InitialJoinAccumulator;
import org.elasticsearch.cluster.metadata.Metadata;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.node.DiscoveryNodes;
import org.elasticsearch.cluster.routing.RerouteService;
import org.elasticsearch.cluster.routing.allocation.AllocationService;
import org.elasticsearch.cluster.service.ClusterApplier;
import org.elasticsearch.cluster.service.ClusterApplierService;
import org.elasticsearch.cluster.service.MasterService;
import org.elasticsearch.common.Priority;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.component.AbstractLifecycleComponent;
import org.elasticsearch.common.io.stream.NamedWriteableRegistry;
import org.elasticsearch.common.settings.ClusterSettings;
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.TransportAddress;
import org.elasticsearch.common.util.concurrent.EsExecutors;
import org.elasticsearch.common.util.concurrent.ListenableFuture;
import org.elasticsearch.common.xcontent.XContentHelper;
import org.elasticsearch.core.Nullable;
import org.elasticsearch.core.Releasable;
import org.elasticsearch.core.Releasables;
import org.elasticsearch.core.SuppressForbidden;
import org.elasticsearch.core.TimeValue;
import org.elasticsearch.discovery.ConfiguredHostsResolver;
import org.elasticsearch.discovery.DiscoveryModule;
import org.elasticsearch.discovery.DiscoveryStats;
import org.elasticsearch.discovery.HandshakingTransportAddressConnector;
import org.elasticsearch.discovery.PeerFinder;
import org.elasticsearch.discovery.SeedHostsProvider;
import org.elasticsearch.discovery.SeedHostsResolver;
import org.elasticsearch.discovery.TransportAddressConnector;
import org.elasticsearch.indices.breaker.CircuitBreakerService;
import org.elasticsearch.monitor.NodeHealthService;
import org.elasticsearch.monitor.StatusInfo;
import org.elasticsearch.threadpool.Scheduler;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.threadpool.ThreadPool.Names;
import org.elasticsearch.transport.NodeDisconnectedException;
import org.elasticsearch.transport.TransportRequest;
import org.elasticsearch.transport.TransportRequestOptions;
import org.elasticsearch.transport.TransportResponse.Empty;
import org.elasticsearch.transport.TransportService;
import org.elasticsearch.transport.Transports;
import org.elasticsearch.xcontent.json.JsonXContent;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Optional;
import java.util.Random;
import java.util.Set;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.function.BiConsumer;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;

import static org.elasticsearch.cluster.coordination.NoMasterBlockService.NO_MASTER_BLOCK_ID;
import static org.elasticsearch.core.Strings.format;
import static org.elasticsearch.discovery.SettingsBasedSeedHostsProvider.DISCOVERY_SEED_HOSTS_SETTING;
import static org.elasticsearch.gateway.ClusterStateUpdaters.hideStateIfNotRecovered;
import static org.elasticsearch.gateway.GatewayService.STATE_NOT_RECOVERED_BLOCK;
import static org.elasticsearch.monitor.StatusInfo.Status.UNHEALTHY;

public class Coordinator extends AbstractLifecycleComponent implements ClusterStatePublisher {

    private static final Logger logger = LogManager.getLogger(Coordinator.class);

    // the timeout before emitting an info log about a slow-running publication
    public static final Setting PUBLISH_INFO_TIMEOUT_SETTING = Setting.timeSetting(
        "cluster.publish.info_timeout",
        TimeValue.timeValueMillis(10000),
        TimeValue.timeValueMillis(1),
        Setting.Property.NodeScope
    );

    // the timeout for the publication of each value
    public static final Setting PUBLISH_TIMEOUT_SETTING = Setting.timeSetting(
        "cluster.publish.timeout",
        TimeValue.timeValueMillis(30000),
        TimeValue.timeValueMillis(1),
        Setting.Property.NodeScope
    );

    public static final Setting SINGLE_NODE_CLUSTER_SEED_HOSTS_CHECK_INTERVAL_SETTING = Setting.timeSetting(
        "cluster.discovery_configuration_check.interval",
        TimeValue.timeValueMillis(30000),
        TimeValue.timeValueMillis(1),
        Setting.Property.NodeScope
    );

    public static final String COMMIT_STATE_ACTION_NAME = "internal:cluster/coordination/commit_state";

    private final Settings settings;
    private final boolean singleNodeDiscovery;
    private final ElectionStrategy electionStrategy;
    private final TransportService transportService;
    private final MasterService masterService;
    private final AllocationService allocationService;
    private final JoinHelper joinHelper;
    private final JoinValidationService joinValidationService;
    private final NodeRemovalClusterStateTaskExecutor nodeRemovalExecutor;
    private final Supplier persistedStateSupplier;
    private final NoMasterBlockService noMasterBlockService;
    final Object mutex = new Object(); // package-private to allow tests to call methods that assert that the mutex is held
    private final SetOnce coordinationState = new SetOnce<>(); // initialized on start-up (see doStart)
    private volatile ClusterState applierState; // the state that should be exposed to the cluster state applier

    private final PeerFinder peerFinder;
    private final PreVoteCollector preVoteCollector;
    private final Random random;
    private final ElectionSchedulerFactory electionSchedulerFactory;
    private final SeedHostsResolver configuredHostsResolver;
    private final TimeValue publishTimeout;
    private final TimeValue publishInfoTimeout;
    private final TimeValue singleNodeClusterSeedHostsCheckInterval;
    @Nullable
    private Scheduler.Cancellable singleNodeClusterChecker = null;
    private final PublicationTransportHandler publicationHandler;
    private final LeaderChecker leaderChecker;
    private final FollowersChecker followersChecker;
    private final ClusterApplier clusterApplier;
    private final Collection> onJoinValidators;
    @Nullable
    private Releasable electionScheduler;
    @Nullable
    private Releasable prevotingRound;
    private long maxTermSeen;
    private final Reconfigurator reconfigurator;
    private final ClusterBootstrapService clusterBootstrapService;
    private final LagDetector lagDetector;
    private final ClusterFormationFailureHelper clusterFormationFailureHelper;
    private final JoinReasonService joinReasonService;

    private Mode mode;
    private Optional lastKnownLeader;
    private Optional lastJoin;
    private JoinHelper.JoinAccumulator joinAccumulator;
    private Optional currentPublication = Optional.empty();
    private final NodeHealthService nodeHealthService;
    private final List peerFinderListeners;

    /**
     * @param nodeName The name of the node, used to name the {@link java.util.concurrent.ExecutorService} of the {@link SeedHostsResolver}.
     * @param onJoinValidators A collection of join validators to restrict which nodes may join the cluster.
     */
    public Coordinator(
        String nodeName,
        Settings settings,
        ClusterSettings clusterSettings,
        TransportService transportService,
        Client client,
        NamedWriteableRegistry namedWriteableRegistry,
        AllocationService allocationService,
        MasterService masterService,
        Supplier persistedStateSupplier,
        SeedHostsProvider seedHostsProvider,
        ClusterApplier clusterApplier,
        Collection> onJoinValidators,
        Random random,
        RerouteService rerouteService,
        ElectionStrategy electionStrategy,
        NodeHealthService nodeHealthService,
        CircuitBreakerService circuitBreakerService
    ) {
        this.settings = settings;
        this.transportService = transportService;
        this.masterService = masterService;
        this.allocationService = allocationService;
        this.onJoinValidators = JoinTaskExecutor.addBuiltInJoinValidators(onJoinValidators);
        this.singleNodeDiscovery = DiscoveryModule.isSingleNodeDiscovery(settings);
        this.electionStrategy = electionStrategy;
        this.joinReasonService = new JoinReasonService(transportService.getThreadPool()::relativeTimeInMillis);
        this.joinHelper = new JoinHelper(
            allocationService,
            masterService,
            clusterApplier,
            transportService,
            this::getCurrentTerm,
            this::handleJoinRequest,
            this::joinLeaderInTerm,
            rerouteService,
            nodeHealthService,
            joinReasonService,
            circuitBreakerService
        );
        this.joinValidationService = new JoinValidationService(
            settings,
            transportService,
            this::getStateForMasterService,
            this.onJoinValidators
        );
        this.persistedStateSupplier = persistedStateSupplier;
        this.noMasterBlockService = new NoMasterBlockService(settings, clusterSettings);
        this.lastKnownLeader = Optional.empty();
        this.lastJoin = Optional.empty();
        this.joinAccumulator = new InitialJoinAccumulator();
        this.publishTimeout = PUBLISH_TIMEOUT_SETTING.get(settings);
        this.publishInfoTimeout = PUBLISH_INFO_TIMEOUT_SETTING.get(settings);
        this.singleNodeClusterSeedHostsCheckInterval = SINGLE_NODE_CLUSTER_SEED_HOSTS_CHECK_INTERVAL_SETTING.get(settings);
        this.random = random;
        this.electionSchedulerFactory = new ElectionSchedulerFactory(settings, random, transportService.getThreadPool());
        this.preVoteCollector = new PreVoteCollector(
            transportService,
            this::startElection,
            this::updateMaxTermSeen,
            electionStrategy,
            nodeHealthService
        );
        configuredHostsResolver = new SeedHostsResolver(nodeName, settings, transportService, seedHostsProvider);
        this.peerFinder = new CoordinatorPeerFinder(
            settings,
            transportService,
            new HandshakingTransportAddressConnector(settings, transportService),
            configuredHostsResolver
        );
        transportService.registerRequestHandler(
            COMMIT_STATE_ACTION_NAME,
            Names.CLUSTER_COORDINATION,
            false,
            false,
            ApplyCommitRequest::new,
            (request, channel, task) -> handleApplyCommit(
                request,
                new ChannelActionListener<>(channel, COMMIT_STATE_ACTION_NAME, request).map(r -> Empty.INSTANCE)
            )
        );
        this.publicationHandler = new PublicationTransportHandler(transportService, namedWriteableRegistry, this::handlePublishRequest);
        this.leaderChecker = new LeaderChecker(settings, transportService, this::onLeaderFailure, nodeHealthService);
        this.followersChecker = new FollowersChecker(
            settings,
            transportService,
            this::onFollowerCheckRequest,
            this::removeNode,
            nodeHealthService
        );
        this.nodeRemovalExecutor = new NodeRemovalClusterStateTaskExecutor(allocationService);
        this.clusterApplier = clusterApplier;
        masterService.setClusterStateSupplier(this::getStateForMasterService);
        this.reconfigurator = new Reconfigurator(settings, clusterSettings);
        this.clusterBootstrapService = new ClusterBootstrapService(
            settings,
            transportService,
            this::getFoundPeers,
            this::isInitialConfigurationSet,
            this::setInitialConfiguration
        );
        this.lagDetector = new LagDetector(
            settings,
            transportService.getThreadPool(),
            new LagDetector.HotThreadsLoggingLagListener(
                transportService,
                client,
                (node, appliedVersion, expectedVersion) -> removeNode(node, "lagging")
            ),
            transportService::getLocalNode
        );
        this.clusterFormationFailureHelper = new ClusterFormationFailureHelper(
            settings,
            this::getClusterFormationState,
            transportService.getThreadPool(),
            joinHelper::logLastFailedJoinAttempt
        );
        this.nodeHealthService = nodeHealthService;
        this.peerFinderListeners = new CopyOnWriteArrayList<>();
        this.peerFinderListeners.add(clusterBootstrapService);
    }

    /**
     * This method returns an object containing information about why cluster formation failed, which can be useful in troubleshooting.
     * @return Information about why cluster formation failed
     */
    public ClusterFormationState getClusterFormationState() {
        return new ClusterFormationState(
            settings,
            getStateForMasterService(),
            peerFinder.getLastResolvedAddresses(),
            Stream.concat(Stream.of(getLocalNode()), StreamSupport.stream(peerFinder.getFoundPeers().spliterator(), false)).toList(),
            getCurrentTerm(),
            electionStrategy,
            nodeHealthService.getHealth(),
            joinHelper.getInFlightJoinStatuses()
        );
    }

    private void onLeaderFailure(Supplier message, Exception e) {
        synchronized (mutex) {
            if (mode != Mode.CANDIDATE) {
                assert lastKnownLeader.isPresent();
                if (logger.isDebugEnabled()) {
                    // TODO this is a workaround for log4j's Supplier. We should remove this, once using ES logging api
                    logger.info(() -> message.get(), e);
                } else {
                    logger.info(() -> message.get());
                }
            }
            becomeCandidate("onLeaderFailure");
        }
    }

    private void removeNode(DiscoveryNode discoveryNode, String reason) {
        synchronized (mutex) {
            if (mode == Mode.LEADER) {
                var task = new NodeRemovalClusterStateTaskExecutor.Task(
                    discoveryNode,
                    reason,
                    () -> joinReasonService.onNodeRemoved(discoveryNode, reason)
                );
                masterService.submitStateUpdateTask(
                    "node-left",
                    task,
                    ClusterStateTaskConfig.build(Priority.IMMEDIATE),
                    nodeRemovalExecutor
                );
            }
        }
    }

    void onFollowerCheckRequest(FollowerCheckRequest followerCheckRequest) {
        synchronized (mutex) {
            ensureTermAtLeast(followerCheckRequest.getSender(), followerCheckRequest.getTerm());

            if (getCurrentTerm() != followerCheckRequest.getTerm()) {
                logger.trace("onFollowerCheckRequest: current term is [{}], rejecting {}", getCurrentTerm(), followerCheckRequest);
                throw new CoordinationStateRejectedException(
                    "onFollowerCheckRequest: current term is [" + getCurrentTerm() + "], rejecting " + followerCheckRequest
                );
            }

            // check if node has accepted a state in this term already. If not, this node has never committed a cluster state in this
            // term and therefore never removed the NO_MASTER_BLOCK for this term. This logic ensures that we quickly turn a node
            // into follower, even before receiving the first cluster state update, but also don't have to deal with the situation
            // where we would possibly have to remove the NO_MASTER_BLOCK from the applierState when turning a candidate back to follower.
            if (getLastAcceptedState().term() < getCurrentTerm()) {
                becomeFollower("onFollowerCheckRequest", followerCheckRequest.getSender());
            } else if (mode == Mode.FOLLOWER) {
                logger.trace("onFollowerCheckRequest: responding successfully to {}", followerCheckRequest);
            } else if (joinHelper.isJoinPending()) {
                logger.trace("onFollowerCheckRequest: rejoining master, responding successfully to {}", followerCheckRequest);
            } else {
                logger.trace("onFollowerCheckRequest: received check from faulty master, rejecting {}", followerCheckRequest);
                throw new CoordinationStateRejectedException(
                    "onFollowerCheckRequest: received check from faulty master, rejecting " + followerCheckRequest
                );
            }
        }
    }

    private void handleApplyCommit(ApplyCommitRequest applyCommitRequest, ActionListener applyListener) {
        synchronized (mutex) {
            logger.trace("handleApplyCommit: applying commit {}", applyCommitRequest);

            coordinationState.get().handleCommit(applyCommitRequest);
            final ClusterState committedState = hideStateIfNotRecovered(coordinationState.get().getLastAcceptedState());
            applierState = mode == Mode.CANDIDATE ? clusterStateWithNoMasterBlock(committedState) : committedState;
            updateSingleNodeClusterChecker(); // in case nodes increase/decrease, possibly update the single-node checker
            if (applyCommitRequest.getSourceNode().equals(getLocalNode())) {
                // master node applies the committed state at the end of the publication process, not here.
                applyListener.onResponse(null);
            } else {
                clusterApplier.onNewClusterState(applyCommitRequest.toString(), () -> applierState, applyListener.map(r -> {
                    onClusterStateApplied();
                    return r;
                }));
            }
        }
    }

    private void onClusterStateApplied() {
        assert ThreadPool.assertCurrentThreadPool(ClusterApplierService.CLUSTER_UPDATE_THREAD_NAME);
        if (getMode() != Mode.CANDIDATE) {
            joinHelper.onClusterStateApplied();
        }
        if (getLocalNode().isMasterNode()) {
            joinReasonService.onClusterStateApplied(applierState.nodes());
        }
    }

    PublishWithJoinResponse handlePublishRequest(PublishRequest publishRequest) {
        assert publishRequest.getAcceptedState().nodes().getLocalNode().equals(getLocalNode())
            : publishRequest.getAcceptedState().nodes().getLocalNode() + " != " + getLocalNode();

        final ClusterState newClusterState = publishRequest.getAcceptedState();
        if (newClusterState.nodes().isLocalNodeElectedMaster() == false) {
            // background initialization on the current master has been started by the master service already
            newClusterState.initializeAsync(transportService.getThreadPool().generic());
        }

        synchronized (mutex) {
            final DiscoveryNode sourceNode = newClusterState.nodes().getMasterNode();
            logger.trace("handlePublishRequest: handling [{}] from [{}]", publishRequest, sourceNode);

            if (sourceNode.equals(getLocalNode()) && mode != Mode.LEADER) {
                // Rare case in which we stood down as leader between starting this publication and receiving it ourselves. The publication
                // is already failed so there is no point in proceeding.
                throw new CoordinationStateRejectedException("no longer leading this publication's term: " + publishRequest);
            }

            final ClusterState localState = coordinationState.get().getLastAcceptedState();

            if (localState.metadata().clusterUUIDCommitted()
                && localState.metadata().clusterUUID().equals(newClusterState.metadata().clusterUUID()) == false) {
                logger.warn(
                    "received cluster state from {} with a different cluster uuid {} than local cluster uuid {}, rejecting",
                    sourceNode,
                    newClusterState.metadata().clusterUUID(),
                    localState.metadata().clusterUUID()
                );
                throw new CoordinationStateRejectedException(
                    "received cluster state from "
                        + sourceNode
                        + " with a different cluster uuid "
                        + newClusterState.metadata().clusterUUID()
                        + " than local cluster uuid "
                        + localState.metadata().clusterUUID()
                        + ", rejecting"
                );
            }

            if (newClusterState.term() > localState.term()) {
                // only do join validation if we have not accepted state from this master yet
                onJoinValidators.forEach(a -> a.accept(getLocalNode(), newClusterState));
            }

            ensureTermAtLeast(sourceNode, newClusterState.term());
            final PublishResponse publishResponse = coordinationState.get().handlePublishRequest(publishRequest);

            if (sourceNode.equals(getLocalNode())) {
                preVoteCollector.update(getPreVoteResponse(), getLocalNode());
            } else {
                becomeFollower("handlePublishRequest", sourceNode); // also updates preVoteCollector
            }

            return new PublishWithJoinResponse(publishResponse, joinWithDestination(lastJoin, sourceNode, newClusterState.term()));
        }
    }

    private static Optional joinWithDestination(Optional lastJoin, DiscoveryNode leader, long term) {
        if (lastJoin.isPresent() && lastJoin.get().targetMatches(leader) && lastJoin.get().getTerm() == term) {
            return lastJoin;
        }

        return Optional.empty();
    }

    private void closePrevotingAndElectionScheduler() {
        if (prevotingRound != null) {
            prevotingRound.close();
            prevotingRound = null;
        }

        if (electionScheduler != null) {
            electionScheduler.close();
            electionScheduler = null;
        }
    }

    private void updateMaxTermSeen(final long term) {
        synchronized (mutex) {
            maxTermSeen = Math.max(maxTermSeen, term);
            final long currentTerm = getCurrentTerm();
            if (mode == Mode.LEADER && maxTermSeen > currentTerm) {
                // Bump our term. However if there is a publication in flight then doing so would cancel the publication, so don't do that
                // since we check whether a term bump is needed at the end of the publication too.
                if (publicationInProgress()) {
                    logger.debug("updateMaxTermSeen: maxTermSeen = {} > currentTerm = {}, enqueueing term bump", maxTermSeen, currentTerm);
                } else {
                    try {
                        logger.debug("updateMaxTermSeen: maxTermSeen = {} > currentTerm = {}, bumping term", maxTermSeen, currentTerm);
                        ensureTermAtLeast(getLocalNode(), maxTermSeen);
                        startElection();
                    } catch (Exception e) {
                        logger.warn(() -> format("failed to bump term to %s", maxTermSeen), e);
                        becomeCandidate("updateMaxTermSeen");
                    }
                }
            }
        }
    }

    private void startElection() {
        synchronized (mutex) {
            // The preVoteCollector is only active while we are candidate, but it does not call this method with synchronisation, so we have
            // to check our mode again here.
            if (mode == Mode.CANDIDATE) {
                if (localNodeMayWinElection(getLastAcceptedState()) == false) {
                    logger.trace("skip election as local node may not win it: {}", getLastAcceptedState().coordinationMetadata());
                    return;
                }

                final StartJoinRequest startJoinRequest = new StartJoinRequest(getLocalNode(), Math.max(getCurrentTerm(), maxTermSeen) + 1);
                logger.debug("starting election with {}", startJoinRequest);
                getDiscoveredNodes().forEach(node -> joinHelper.sendStartJoinRequest(startJoinRequest, node));
            }
        }
    }

    private void abdicateTo(DiscoveryNode newMaster) {
        assert Thread.holdsLock(mutex);
        assert mode == Mode.LEADER : "expected to be leader on abdication but was " + mode;
        assert newMaster.isMasterNode() : "should only abdicate to master-eligible node but was " + newMaster;
        final StartJoinRequest startJoinRequest = new StartJoinRequest(newMaster, Math.max(getCurrentTerm(), maxTermSeen) + 1);
        logger.info("abdicating to {} with term {}", newMaster, startJoinRequest.getTerm());
        getLastAcceptedState().nodes().mastersFirstStream().forEach(node -> joinHelper.sendStartJoinRequest(startJoinRequest, node));
        // handling of start join messages on the local node will be dispatched to the coordination thread-pool
        assert mode == Mode.LEADER : "should still be leader after sending abdication messages " + mode;
        // explicitly move node to candidate state so that the next cluster state update task yields an onNoLongerMaster event
        becomeCandidate("after abdicating to " + newMaster);
    }

    private static boolean localNodeMayWinElection(ClusterState lastAcceptedState) {
        final DiscoveryNode localNode = lastAcceptedState.nodes().getLocalNode();
        assert localNode != null;
        return nodeMayWinElection(lastAcceptedState, localNode);
    }

    private static boolean nodeMayWinElection(ClusterState lastAcceptedState, DiscoveryNode node) {
        final String nodeId = node.getId();
        return lastAcceptedState.getLastCommittedConfiguration().getNodeIds().contains(nodeId)
            || lastAcceptedState.getLastAcceptedConfiguration().getNodeIds().contains(nodeId)
            || lastAcceptedState.getVotingConfigExclusions().stream().noneMatch(vce -> vce.getNodeId().equals(nodeId));
    }

    private Optional ensureTermAtLeast(DiscoveryNode sourceNode, long targetTerm) {
        assert Thread.holdsLock(mutex) : "Coordinator mutex not held";
        if (getCurrentTerm() < targetTerm) {
            return Optional.of(joinLeaderInTerm(new StartJoinRequest(sourceNode, targetTerm)));
        }
        return Optional.empty();
    }

    private Join joinLeaderInTerm(StartJoinRequest startJoinRequest) {
        synchronized (mutex) {
            logger.debug("joinLeaderInTerm: for [{}] with term {}", startJoinRequest.getSourceNode(), startJoinRequest.getTerm());
            final Join join = coordinationState.get().handleStartJoin(startJoinRequest);
            lastJoin = Optional.of(join);
            peerFinder.setCurrentTerm(getCurrentTerm());
            if (mode != Mode.CANDIDATE) {
                becomeCandidate("joinLeaderInTerm"); // updates followersChecker and preVoteCollector
            } else {
                followersChecker.updateFastResponseState(getCurrentTerm(), mode);
                preVoteCollector.update(getPreVoteResponse(), null);
            }
            return join;
        }
    }

    private void handleJoinRequest(JoinRequest joinRequest, ActionListener joinListener) {
        assert Thread.holdsLock(mutex) == false;
        assert getLocalNode().isMasterNode() : getLocalNode() + " received a join but is not master-eligible";
        logger.trace("handleJoinRequest: as {}, handling {}", mode, joinRequest);

        if (singleNodeDiscovery && joinRequest.getSourceNode().equals(getLocalNode()) == false) {
            joinListener.onFailure(
                new IllegalStateException(
                    "cannot join node with ["
                        + DiscoveryModule.DISCOVERY_TYPE_SETTING.getKey()
                        + "] set to ["
                        + DiscoveryModule.SINGLE_NODE_DISCOVERY_TYPE
                        + "] discovery"
                )
            );
            return;
        }

        transportService.connectToNode(joinRequest.getSourceNode(), new ActionListener<>() {
            @Override
            public void onResponse(Releasable response) {
                boolean retainConnection = false;
                try {
                    validateJoinRequest(
                        joinRequest,
                        ActionListener.runBefore(joinListener, () -> Releasables.close(response))
                            .delegateFailure((l, ignored) -> processJoinRequest(joinRequest, l))
                    );
                    retainConnection = true;
                } catch (Exception e) {
                    joinListener.onFailure(e);
                } finally {
                    if (retainConnection == false) {
                        Releasables.close(response);
                    }
                }
            }

            @Override
            public void onFailure(Exception e) {
                logger.warn(
                    () -> format(
                        "received join request from [%s] but could not connect back to the joining node",
                        joinRequest.getSourceNode()
                    ),
                    e
                );

                joinListener.onFailure(
                    // NodeDisconnectedException mainly to suppress uninteresting stack trace
                    new NodeDisconnectedException(
                        joinRequest.getSourceNode(),
                        String.format(
                            Locale.ROOT,
                            "failure when opening connection back from [%s] to [%s]",
                            getLocalNode().descriptionWithoutAttributes(),
                            joinRequest.getSourceNode().descriptionWithoutAttributes()
                        ),
                        JoinHelper.JOIN_ACTION_NAME,
                        e
                    )
                );
            }
        });
    }

    private void validateJoinRequest(JoinRequest joinRequest, ActionListener validateListener) {

        // Before letting the node join the cluster, ensure:
        // - it's a new enough version to pass the version barrier
        // - we have a healthy STATE channel to the node
        // - if we're already master that it can make sense of the current cluster state.
        // - we have a healthy PING channel to the node

        final ListenableActionFuture validateStateListener = new ListenableActionFuture<>();
        final ClusterState stateForJoinValidation = getStateForMasterService();
        if (stateForJoinValidation.nodes().isLocalNodeElectedMaster()) {
            onJoinValidators.forEach(a -> a.accept(joinRequest.getSourceNode(), stateForJoinValidation));
            if (stateForJoinValidation.getBlocks().hasGlobalBlock(STATE_NOT_RECOVERED_BLOCK) == false) {
                // We do this in a couple of places including the cluster update thread. This one here is really just best effort to ensure
                // we fail as fast as possible.
                JoinTaskExecutor.ensureVersionBarrier(
                    joinRequest.getSourceNode().getVersion(),
                    stateForJoinValidation.getNodes().getMinNodeVersion()
                );
            }
            sendJoinValidate(joinRequest.getSourceNode(), validateStateListener);
        } else {
            sendJoinPing(joinRequest.getSourceNode(), TransportRequestOptions.Type.STATE, validateStateListener);
        }

        sendJoinPing(joinRequest.getSourceNode(), TransportRequestOptions.Type.PING, new ActionListener<>() {
            @Override
            public void onResponse(Empty empty) {
                validateStateListener.addListener(validateListener.map(ignored -> null));
            }

            @Override
            public void onFailure(Exception e) {
                // The join will be rejected, but we wait for the state validation to complete as well since the node will retry and we
                // don't want lots of cluster states in flight.
                validateStateListener.addListener(new ActionListener<>() {
                    @Override
                    public void onResponse(Empty empty) {
                        validateListener.onFailure(e);
                    }

                    @Override
                    public void onFailure(Exception e2) {
                        e2.addSuppressed(e);
                        validateListener.onFailure(e2);
                    }
                });
            }
        });
    }

    private void sendJoinValidate(DiscoveryNode discoveryNode, ActionListener listener) {
        joinValidationService.validateJoin(discoveryNode, listener.delegateResponse((delegate, e) -> {
            logger.warn(() -> "failed to validate incoming join request from node [" + discoveryNode + "]", e);
            delegate.onFailure(
                new IllegalStateException(
                    String.format(
                        Locale.ROOT,
                        "failure when sending a join validation request from [%s] to [%s]",
                        getLocalNode().descriptionWithoutAttributes(),
                        discoveryNode.descriptionWithoutAttributes()
                    ),
                    e
                )
            );
        }));
    }

    private void sendJoinPing(DiscoveryNode discoveryNode, TransportRequestOptions.Type channelType, ActionListener listener) {
        transportService.sendRequest(
            discoveryNode,
            JoinHelper.JOIN_PING_ACTION_NAME,
            TransportRequest.Empty.INSTANCE,
            TransportRequestOptions.of(null, channelType),
            new ActionListenerResponseHandler<>(listener.delegateResponse((l, e) -> {
                logger.warn(() -> format("failed to ping joining node [%s] on channel type [%s]", discoveryNode, channelType), e);
                listener.onFailure(
                    new IllegalStateException(
                        String.format(
                            Locale.ROOT,
                            "failure when sending a join ping request from [%s] to [%s]",
                            getLocalNode().descriptionWithoutAttributes(),
                            discoveryNode.descriptionWithoutAttributes()
                        ),
                        e
                    )
                );
            }), i -> Empty.INSTANCE, Names.CLUSTER_COORDINATION)
        );
    }

    private void processJoinRequest(JoinRequest joinRequest, ActionListener joinListener) {
        assert Transports.assertNotTransportThread("blocking on coordinator mutex and maybe doing IO to increase term");
        final Optional optionalJoin = joinRequest.getOptionalJoin();
        try {
            synchronized (mutex) {
                updateMaxTermSeen(joinRequest.getTerm());

                final CoordinationState coordState = coordinationState.get();
                final boolean prevElectionWon = coordState.electionWon();

                optionalJoin.ifPresent(this::handleJoin);
                joinAccumulator.handleJoinRequest(joinRequest.getSourceNode(), joinListener);

                if (prevElectionWon == false && coordState.electionWon()) {
                    becomeLeader();
                }
            }
        } catch (Exception e) {
            joinListener.onFailure(e);
        }
    }

    private void updateSingleNodeClusterChecker() {
        assert Thread.holdsLock(mutex) : "Coordinator mutex not held";

        if (mode == Mode.LEADER && applierState.nodes().size() == 1) {
            if (singleNodeClusterChecker == null) {
                // Make a single-node checker if none exists
                singleNodeClusterChecker = transportService.getThreadPool().scheduleWithFixedDelay(new Runnable() {
                    @Override
                    public void run() {
                        Coordinator.this.checkSingleNodeCluster();
                    }

                    @Override
                    public String toString() {
                        return "single-node cluster checker";
                    }
                }, this.singleNodeClusterSeedHostsCheckInterval, Names.SAME);
            }
            return;
        }

        // In case of a multi-node cluster, there is no need for the single-node checker so cancel it
        if (singleNodeClusterChecker != null) {
            singleNodeClusterChecker.cancel();
            singleNodeClusterChecker = null;
        }
    }

    private void checkSingleNodeCluster() {
        if (mode != Mode.LEADER || applierState.nodes().size() > 1) {
            return;
        }

        if (DISCOVERY_SEED_HOSTS_SETTING.exists(settings)) {
            if (DISCOVERY_SEED_HOSTS_SETTING.get(settings).isEmpty()) {
                // For a single-node cluster, the only acceptable setting is an empty list.
                return;
            } else {
                logger.warn(
                    """
                        This node is a fully-formed single-node cluster with cluster UUID [{}], but it is configured as if to \
                        discover other nodes and form a multi-node cluster via the [{}] setting. Fully-formed clusters do not \
                        attempt to discover other nodes, and nodes with different cluster UUIDs cannot belong to the same cluster. \
                        The cluster UUID persists across restarts and can only be changed by deleting the contents of the node's \
                        data path(s). Remove the discovery configuration to suppress this message.""",
                    applierState.metadata().clusterUUID(),
                    DISCOVERY_SEED_HOSTS_SETTING.getKey() + "=" + DISCOVERY_SEED_HOSTS_SETTING.get(settings)
                );
            }
        }
    }

    void becomeCandidate(String method) {
        assert Thread.holdsLock(mutex) : "Coordinator mutex not held";
        logger.debug(
            "{}: coordinator becoming CANDIDATE in term {} (was {}, lastKnownLeader was [{}])",
            method,
            getCurrentTerm(),
            mode,
            lastKnownLeader
        );

        if (mode != Mode.CANDIDATE) {
            final Mode prevMode = mode;
            mode = Mode.CANDIDATE;
            cancelActivePublication("become candidate: " + method);
            joinAccumulator.close(mode);
            joinAccumulator = joinHelper.new CandidateJoinAccumulator();

            peerFinder.activate(coordinationState.get().getLastAcceptedState().nodes());
            clusterFormationFailureHelper.start();

            leaderChecker.setCurrentNodes(DiscoveryNodes.EMPTY_NODES);
            leaderChecker.updateLeader(null);

            followersChecker.clearCurrentNodes();
            followersChecker.updateFastResponseState(getCurrentTerm(), mode);
            lagDetector.clearTrackedNodes();

            if (prevMode == Mode.LEADER) {
                cleanMasterService();
            }

            if (applierState.nodes().getMasterNodeId() != null) {
                applierState = clusterStateWithNoMasterBlock(applierState);
                clusterApplier.onNewClusterState("becoming candidate: " + method, () -> applierState, ActionListener.noop());
            }
        }

        updateSingleNodeClusterChecker();
        preVoteCollector.update(getPreVoteResponse(), null);
    }

    private void becomeLeader() {
        assert Thread.holdsLock(mutex) : "Coordinator mutex not held";
        assert mode == Mode.CANDIDATE : "expected candidate but was " + mode;
        assert getLocalNode().isMasterNode() : getLocalNode() + " became a leader but is not master-eligible";

        logger.debug(
            "handleJoinRequest: coordinator becoming LEADER in term {} (was {}, lastKnownLeader was [{}])",
            getCurrentTerm(),
            mode,
            lastKnownLeader
        );

        mode = Mode.LEADER;
        joinAccumulator.close(mode);
        joinAccumulator = joinHelper.new LeaderJoinAccumulator();

        lastKnownLeader = Optional.of(getLocalNode());
        peerFinder.deactivate(getLocalNode());
        clusterFormationFailureHelper.stop();
        closePrevotingAndElectionScheduler();
        preVoteCollector.update(getPreVoteResponse(), getLocalNode());

        assert leaderChecker.leader() == null : leaderChecker.leader();
        followersChecker.updateFastResponseState(getCurrentTerm(), mode);

        updateSingleNodeClusterChecker();
    }

    void becomeFollower(String method, DiscoveryNode leaderNode) {
        assert Thread.holdsLock(mutex) : "Coordinator mutex not held";
        assert leaderNode.isMasterNode() : leaderNode + " became a leader but is not master-eligible";
        assert mode != Mode.LEADER : "do not switch to follower from leader (should be candidate first)";

        if (mode == Mode.FOLLOWER && Optional.of(leaderNode).equals(lastKnownLeader)) {
            logger.trace("{}: coordinator remaining FOLLOWER of [{}] in term {}", method, leaderNode, getCurrentTerm());
        } else {
            logger.debug(
                "{}: coordinator becoming FOLLOWER of [{}] in term {} (was {}, lastKnownLeader was [{}])",
                method,
                leaderNode,
                getCurrentTerm(),
                mode,
                lastKnownLeader
            );
        }

        final boolean restartLeaderChecker = (mode == Mode.FOLLOWER && Optional.of(leaderNode).equals(lastKnownLeader)) == false;

        if (mode != Mode.FOLLOWER) {
            mode = Mode.FOLLOWER;
            joinAccumulator.close(mode);
            joinAccumulator = new JoinHelper.FollowerJoinAccumulator();
            leaderChecker.setCurrentNodes(DiscoveryNodes.EMPTY_NODES);
        }

        updateSingleNodeClusterChecker();
        lastKnownLeader = Optional.of(leaderNode);
        peerFinder.deactivate(leaderNode);
        clusterFormationFailureHelper.stop();
        closePrevotingAndElectionScheduler();
        cancelActivePublication("become follower: " + method);
        preVoteCollector.update(getPreVoteResponse(), leaderNode);

        if (restartLeaderChecker) {
            leaderChecker.updateLeader(leaderNode);
        }

        followersChecker.clearCurrentNodes();
        followersChecker.updateFastResponseState(getCurrentTerm(), mode);
        lagDetector.clearTrackedNodes();
    }

    private void cleanMasterService() {
        new LocalMasterServiceTask(Priority.NORMAL) {
            @Override
            public void onFailure(Exception e) {
                // ignore
                logger.trace("failed to clean-up after stepping down as master", e);
            }

            @Override
            public void execute(ClusterState currentState) {
                if (currentState.nodes().isLocalNodeElectedMaster() == false) {
                    allocationService.cleanCaches();
                }
            }
        }.submit(masterService, "clean-up after stepping down as master");
    }

    private PreVoteResponse getPreVoteResponse() {
        return new PreVoteResponse(
            getCurrentTerm(),
            coordinationState.get().getLastAcceptedTerm(),
            coordinationState.get().getLastAcceptedState().version()
        );
    }

    // package-visible for testing
    long getCurrentTerm() {
        synchronized (mutex) {
            return coordinationState.get().getCurrentTerm();
        }
    }

    // package-visible for testing
    Mode getMode() {
        synchronized (mutex) {
            return mode;
        }
    }

    // visible for testing
    DiscoveryNode getLocalNode() {
        return transportService.getLocalNode();
    }

    // package-visible for testing
    boolean publicationInProgress() {
        synchronized (mutex) {
            return currentPublication.isPresent();
        }
    }

    @Override
    protected void doStart() {
        synchronized (mutex) {
            CoordinationState.PersistedState persistedState = persistedStateSupplier.get();
            coordinationState.set(new CoordinationState(getLocalNode(), persistedState, electionStrategy));
            peerFinder.setCurrentTerm(getCurrentTerm());
            configuredHostsResolver.start();
            final ClusterState lastAcceptedState = coordinationState.get().getLastAcceptedState();
            clusterBootstrapService.logBootstrapState(lastAcceptedState.metadata());
            final VotingConfiguration votingConfiguration = lastAcceptedState.getLastCommittedConfiguration();
            if (singleNodeDiscovery
                && votingConfiguration.isEmpty() == false
                && votingConfiguration.hasQuorum(Collections.singleton(getLocalNode().getId())) == false) {
                throw new IllegalStateException(
                    "cannot start with ["
                        + DiscoveryModule.DISCOVERY_TYPE_SETTING.getKey()
                        + "] set to ["
                        + DiscoveryModule.SINGLE_NODE_DISCOVERY_TYPE
                        + "] when local node "
                        + getLocalNode()
                        + " does not have quorum in voting configuration "
                        + votingConfiguration
                );
            }
            ClusterState initialState = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING.get(settings))
                .blocks(
                    ClusterBlocks.builder()
                        .addGlobalBlock(STATE_NOT_RECOVERED_BLOCK)
                        .addGlobalBlock(noMasterBlockService.getNoMasterBlock())
                )
                .nodes(DiscoveryNodes.builder().add(getLocalNode()).localNodeId(getLocalNode().getId()))
                .build();
            applierState = initialState;
            clusterApplier.setInitialState(initialState);
        }
    }

    public DiscoveryStats stats() {
        return new DiscoveryStats(
            new PendingClusterStateStats(0, 0, 0),
            publicationHandler.stats(),
            getLocalNode().isMasterNode() ? masterService.getClusterStateUpdateStats() : null,
            clusterApplier.getStats()
        );
    }

    public void startInitialJoin() {
        synchronized (mutex) {
            becomeCandidate("startInitialJoin");
        }
        clusterBootstrapService.scheduleUnconfiguredBootstrap();
    }

    @Override
    protected void doStop() {
        configuredHostsResolver.stop();
        joinValidationService.stop();
    }

    @Override
    protected void doClose() throws IOException {
        final CoordinationState coordinationState = this.coordinationState.get();
        if (coordinationState != null) {
            // This looks like a race that might leak an unclosed CoordinationState if it's created while execution is here, but this method
            // is synchronized on AbstractLifecycleComponent#lifestyle, as is the doStart() method that creates the CoordinationState, so
            // it's all ok.
            synchronized (mutex) {
                coordinationState.close();
            }
        }
    }

    public void invariant() {
        synchronized (mutex) {
            final Optional peerFinderLeader = peerFinder.getLeader();
            assert peerFinder.getCurrentTerm() == getCurrentTerm();
            assert followersChecker.getFastResponseState().term() == getCurrentTerm() : followersChecker.getFastResponseState();
            assert followersChecker.getFastResponseState().mode() == getMode() : followersChecker.getFastResponseState();
            assert (applierState.nodes().getMasterNodeId() == null) == applierState.blocks().hasGlobalBlockWithId(NO_MASTER_BLOCK_ID);
            assert preVoteCollector.getPreVoteResponse().equals(getPreVoteResponse()) : preVoteCollector + " vs " + getPreVoteResponse();

            assert lagDetector.getTrackedNodes().contains(getLocalNode()) == false : lagDetector.getTrackedNodes();
            assert followersChecker.getKnownFollowers().equals(lagDetector.getTrackedNodes())
                : followersChecker.getKnownFollowers() + " vs " + lagDetector.getTrackedNodes();
            assert singleNodeClusterChecker == null || (mode == Mode.LEADER && applierState.nodes().size() == 1)
                : "Single node checker must exist iff there is a single-node cluster";

            if (mode == Mode.LEADER) {
                final boolean becomingMaster = getStateForMasterService().term() != getCurrentTerm();

                assert coordinationState.get().electionWon();
                assert lastKnownLeader.isPresent() && lastKnownLeader.get().equals(getLocalNode());
                assert joinAccumulator instanceof JoinHelper.LeaderJoinAccumulator;
                assert peerFinderLeader.equals(lastKnownLeader) : peerFinderLeader;
                assert electionScheduler == null : electionScheduler;
                assert prevotingRound == null : prevotingRound;
                assert becomingMaster || getStateForMasterService().nodes().getMasterNodeId() != null : getStateForMasterService();
                assert leaderChecker.leader() == null : leaderChecker.leader();
                assert getLocalNode().equals(applierState.nodes().getMasterNode())
                    || (applierState.nodes().getMasterNodeId() == null && applierState.term() < getCurrentTerm());
                assert preVoteCollector.getLeader() == getLocalNode() : preVoteCollector;
                assert clusterFormationFailureHelper.isRunning() == false;

                final boolean activePublication = currentPublication.map(CoordinatorPublication::isActiveForCurrentLeader).orElse(false);
                if (becomingMaster && activePublication == false) {
                    // cluster state update task to become master is submitted to MasterService, but publication has not started yet
                    assert followersChecker.getKnownFollowers().isEmpty() : followersChecker.getKnownFollowers();
                } else {
                    final ClusterState lastPublishedState;
                    if (activePublication) {
                        // active publication in progress: followersChecker is up-to-date with nodes that we're actively publishing to
                        lastPublishedState = currentPublication.get().publishedState();
                    } else {
                        // no active publication: followersChecker is up-to-date with the nodes of the latest publication
                        lastPublishedState = coordinationState.get().getLastAcceptedState();
                    }
                    final Set lastPublishedNodes = new HashSet<>();
                    lastPublishedState.nodes().forEach(lastPublishedNodes::add);
                    assert lastPublishedNodes.remove(getLocalNode()); // followersChecker excludes local node
                    assert lastPublishedNodes.equals(followersChecker.getKnownFollowers())
                        : lastPublishedNodes + " != " + followersChecker.getKnownFollowers();
                }

                assert becomingMaster
                    || activePublication
                    || coordinationState.get()
                        .getLastAcceptedConfiguration()
                        .equals(coordinationState.get().getLastCommittedConfiguration())
                    : coordinationState.get().getLastAcceptedConfiguration()
                        + " != "
                        + coordinationState.get().getLastCommittedConfiguration();
            } else if (mode == Mode.FOLLOWER) {
                assert coordinationState.get().electionWon() == false : getLocalNode() + " is FOLLOWER so electionWon() should be false";
                assert lastKnownLeader.isPresent() && (lastKnownLeader.get().equals(getLocalNode()) == false);
                assert joinAccumulator instanceof JoinHelper.FollowerJoinAccumulator;
                assert peerFinderLeader.equals(lastKnownLeader) : peerFinderLeader;
                assert electionScheduler == null : electionScheduler;
                assert prevotingRound == null : prevotingRound;
                assert getStateForMasterService().nodes().getMasterNodeId() == null : getStateForMasterService();
                assert leaderChecker.currentNodeIsMaster() == false;
                assert lastKnownLeader.equals(Optional.of(leaderChecker.leader()));
                assert followersChecker.getKnownFollowers().isEmpty();
                assert lastKnownLeader.get().equals(applierState.nodes().getMasterNode())
                    || (applierState.nodes().getMasterNodeId() == null
                        && (applierState.term() < getCurrentTerm() || applierState.version() < getLastAcceptedState().version()));
                assert currentPublication.map(Publication::isCommitted).orElse(true);
                assert preVoteCollector.getLeader().equals(lastKnownLeader.get()) : preVoteCollector;
                assert clusterFormationFailureHelper.isRunning() == false;
            } else {
                assert mode == Mode.CANDIDATE;
                assert joinAccumulator instanceof JoinHelper.CandidateJoinAccumulator;
                assert peerFinderLeader.isPresent() == false : peerFinderLeader;
                assert prevotingRound == null || electionScheduler != null;
                assert getStateForMasterService().nodes().getMasterNodeId() == null : getStateForMasterService();
                assert leaderChecker.currentNodeIsMaster() == false;
                assert leaderChecker.leader() == null : leaderChecker.leader();
                assert followersChecker.getKnownFollowers().isEmpty();
                assert applierState.nodes().getMasterNodeId() == null;
                assert currentPublication.map(Publication::isCommitted).orElse(true);
                assert preVoteCollector.getLeader() == null : preVoteCollector;
                assert clusterFormationFailureHelper.isRunning();
            }
        }
    }

    public boolean isInitialConfigurationSet() {
        return getStateForMasterService().getLastAcceptedConfiguration().isEmpty() == false;
    }

    /**
     * Sets the initial configuration to the given {@link VotingConfiguration}. This method is safe to call
     * more than once, as long as the argument to each call is the same.
     *
     * @param votingConfiguration The nodes that should form the initial configuration.
     * @return whether this call successfully set the initial configuration - if false, the cluster has already been bootstrapped.
     */
    public boolean setInitialConfiguration(final VotingConfiguration votingConfiguration) {
        synchronized (mutex) {
            final ClusterState currentState = getStateForMasterService();

            if (isInitialConfigurationSet()) {
                logger.debug("initial configuration already set, ignoring {}", votingConfiguration);
                return false;
            }

            if (getLocalNode().isMasterNode() == false) {
                logger.debug("skip setting initial configuration as local node is not a master-eligible node");
                throw new CoordinationStateRejectedException(
                    "this node is not master-eligible, but cluster bootstrapping can only happen on a master-eligible node"
                );
            }

            if (votingConfiguration.getNodeIds().contains(getLocalNode().getId()) == false) {
                logger.debug("skip setting initial configuration as local node is not part of initial configuration");
                throw new CoordinationStateRejectedException("local node is not part of initial configuration");
            }

            final List knownNodes = new ArrayList<>();
            knownNodes.add(getLocalNode());
            peerFinder.getFoundPeers().forEach(knownNodes::add);

            if (votingConfiguration.hasQuorum(knownNodes.stream().map(DiscoveryNode::getId).toList()) == false) {
                logger.debug(
                    "skip setting initial configuration as not enough nodes discovered to form a quorum in the "
                        + "initial configuration [knownNodes={}, {}]",
                    knownNodes,
                    votingConfiguration
                );
                throw new CoordinationStateRejectedException(
                    "not enough nodes discovered to form a quorum in the initial configuration "
                        + "[knownNodes="
                        + knownNodes
                        + ", "
                        + votingConfiguration
                        + "]"
                );
            }

            logger.info("setting initial configuration to {}", votingConfiguration);
            final CoordinationMetadata coordinationMetadata = CoordinationMetadata.builder(currentState.coordinationMetadata())
                .lastAcceptedConfiguration(votingConfiguration)
                .lastCommittedConfiguration(votingConfiguration)
                .build();

            Metadata.Builder metadataBuilder = Metadata.builder(currentState.metadata());
            // automatically generate a UID for the metadata if we need to
            metadataBuilder.generateClusterUuidIfNeeded();
            metadataBuilder.coordinationMetadata(coordinationMetadata);

            coordinationState.get().setInitialState(ClusterState.builder(currentState).metadata(metadataBuilder).build());
            assert localNodeMayWinElection(getLastAcceptedState())
                : "initial state does not allow local node to win election: " + getLastAcceptedState().coordinationMetadata();
            preVoteCollector.update(getPreVoteResponse(), null); // pick up the change to last-accepted version
            startElectionScheduler();
            return true;
        }
    }

    // Package-private for testing
    ClusterState improveConfiguration(ClusterState clusterState) {
        assert Thread.holdsLock(mutex) : "Coordinator mutex not held";
        assert validVotingConfigExclusionState(clusterState) : clusterState;

        // exclude any nodes whose ID is in the voting config exclusions list ...
        final Stream excludedNodeIds = clusterState.getVotingConfigExclusions().stream().map(VotingConfigExclusion::getNodeId);
        // ... and also automatically exclude the node IDs of master-ineligible nodes that were previously master-eligible and are still in
        // the voting config. We could exclude all the master-ineligible nodes here, but there could be quite a few of them and that makes
        // the logging much harder to follow.
        final Stream masterIneligibleNodeIdsInVotingConfig = clusterState.nodes()
            .stream()
            .filter(
                n -> n.isMasterNode() == false
                    && (clusterState.getLastAcceptedConfiguration().getNodeIds().contains(n.getId())
                        || clusterState.getLastCommittedConfiguration().getNodeIds().contains(n.getId()))
            )
            .map(DiscoveryNode::getId);

        final Set liveNodes = clusterState.nodes()
            .stream()
            .filter(DiscoveryNode::isMasterNode)
            .filter(coordinationState.get()::containsJoinVoteFor)
            .collect(Collectors.toSet());
        final VotingConfiguration newConfig = reconfigurator.reconfigure(
            liveNodes,
            Stream.concat(masterIneligibleNodeIdsInVotingConfig, excludedNodeIds).collect(Collectors.toSet()),
            getLocalNode(),
            clusterState.getLastAcceptedConfiguration()
        );

        if (newConfig.equals(clusterState.getLastAcceptedConfiguration()) == false) {
            assert coordinationState.get().joinVotesHaveQuorumFor(newConfig);
            return ClusterState.builder(clusterState)
                .metadata(
                    Metadata.builder(clusterState.metadata())
                        .coordinationMetadata(
                            CoordinationMetadata.builder(clusterState.coordinationMetadata()).lastAcceptedConfiguration(newConfig).build()
                        )
                )
                .build();
        }
        return clusterState;
    }

    /*
    * Valid Voting Configuration Exclusion state criteria:
    * 1. Every voting config exclusion with an ID of _absent_ should not match any nodes currently in the cluster by name
    * 2. Every voting config exclusion with a name of _absent_ should not match any nodes currently in the cluster by ID
     */
    static boolean validVotingConfigExclusionState(ClusterState clusterState) {
        Set votingConfigExclusions = clusterState.getVotingConfigExclusions();
        Set nodeNamesWithAbsentId = votingConfigExclusions.stream()
            .filter(e -> e.getNodeId().equals(VotingConfigExclusion.MISSING_VALUE_MARKER))
            .map(VotingConfigExclusion::getNodeName)
            .collect(Collectors.toSet());
        Set nodeIdsWithAbsentName = votingConfigExclusions.stream()
            .filter(e -> e.getNodeName().equals(VotingConfigExclusion.MISSING_VALUE_MARKER))
            .map(VotingConfigExclusion::getNodeId)
            .collect(Collectors.toSet());
        for (DiscoveryNode node : clusterState.getNodes()) {
            if (node.isMasterNode() && (nodeIdsWithAbsentName.contains(node.getId()) || nodeNamesWithAbsentId.contains(node.getName()))) {
                return false;
            }
        }

        return true;
    }

    private final AtomicBoolean reconfigurationTaskScheduled = new AtomicBoolean();

    private void scheduleReconfigurationIfNeeded() {
        assert Thread.holdsLock(mutex) : "Coordinator mutex not held";
        assert mode == Mode.LEADER : mode;
        assert currentPublication.isPresent() == false : "Expected no publication in progress";

        final ClusterState state = getLastAcceptedState();
        if (improveConfiguration(state) != state && reconfigurationTaskScheduled.compareAndSet(false, true)) {
            logger.trace("scheduling reconfiguration");
            submitUnbatchedTask("reconfigure", new ClusterStateUpdateTask(Priority.URGENT) {
                @Override
                public ClusterState execute(ClusterState currentState) {
                    reconfigurationTaskScheduled.set(false);
                    synchronized (mutex) {
                        return improveConfiguration(currentState);
                    }
                }

                @Override
                public void onFailure(Exception e) {
                    reconfigurationTaskScheduled.set(false);
                    logger.debug("reconfiguration failed", e);
                }
            });
        }
    }

    @SuppressForbidden(reason = "legacy usage of unbatched task") // TODO add support for batching here
    private void submitUnbatchedTask(String source, ClusterStateUpdateTask task) {
        masterService.submitUnbatchedStateUpdateTask(source, task);
    }

    // exposed for tests
    boolean missingJoinVoteFrom(DiscoveryNode node) {
        return node.isMasterNode() && coordinationState.get().containsJoinVoteFor(node) == false;
    }

    private void handleJoin(Join join) {
        synchronized (mutex) {
            ensureTermAtLeast(getLocalNode(), join.getTerm()).ifPresent(this::handleJoin);

            if (coordinationState.get().electionWon()) {
                // If we have already won the election then the actual join does not matter for election purposes, so swallow any exception
                final boolean isNewJoinFromMasterEligibleNode = handleJoinIgnoringExceptions(join);

                // If we haven't completely finished becoming master then there's already a publication scheduled which will, in turn,
                // schedule a reconfiguration if needed. It's benign to schedule a reconfiguration anyway, but it might fail if it wins the
                // race against the election-winning publication and log a big error message, which we can prevent by checking this here:
                final boolean establishedAsMaster = mode == Mode.LEADER && getLastAcceptedState().term() == getCurrentTerm();
                if (isNewJoinFromMasterEligibleNode && establishedAsMaster && publicationInProgress() == false) {
                    scheduleReconfigurationIfNeeded();
                }
            } else {
                coordinationState.get().handleJoin(join); // this might fail and bubble up the exception
            }
        }
    }

    /**
     * @return true iff the join was from a new node and was successfully added
     */
    private boolean handleJoinIgnoringExceptions(Join join) {
        try {
            return coordinationState.get().handleJoin(join);
        } catch (CoordinationStateRejectedException e) {
            logger.debug(() -> "failed to add " + join + " - ignoring", e);
            return false;
        }
    }

    public ClusterState getLastAcceptedState() {
        synchronized (mutex) {
            return coordinationState.get().getLastAcceptedState();
        }
    }

    @Nullable
    public ClusterState getApplierState() {
        return applierState;
    }

    private List getDiscoveredNodes() {
        final List nodes = new ArrayList<>();
        nodes.add(getLocalNode());
        peerFinder.getFoundPeers().forEach(nodes::add);
        return nodes;
    }

    ClusterState getStateForMasterService() {
        synchronized (mutex) {
            // expose last accepted cluster state as base state upon which the master service
            // speculatively calculates the next cluster state update
            final ClusterState clusterState = coordinationState.get().getLastAcceptedState();
            assert clusterState.nodes().getLocalNode() != null;
            if (mode != Mode.LEADER || clusterState.term() != getCurrentTerm()) {
                // the master service checks if the local node is the master node in order to fail execution of the state update early
                return clusterStateWithNoMasterBlock(clusterState);
            }
            return clusterState;
        }
    }

    private ClusterState clusterStateWithNoMasterBlock(ClusterState clusterState) {
        if (clusterState.nodes().getMasterNodeId() != null) {
            // remove block if it already exists before adding new one
            assert clusterState.blocks().hasGlobalBlockWithId(NO_MASTER_BLOCK_ID) == false
                : "NO_MASTER_BLOCK should only be added by Coordinator";
            final ClusterBlocks clusterBlocks = ClusterBlocks.builder()
                .blocks(clusterState.blocks())
                .addGlobalBlock(noMasterBlockService.getNoMasterBlock())
                .build();
            final DiscoveryNodes discoveryNodes = new DiscoveryNodes.Builder(clusterState.nodes()).masterNodeId(null).build();
            return ClusterState.builder(clusterState).blocks(clusterBlocks).nodes(discoveryNodes).build();
        } else {
            return clusterState;
        }
    }

    @Override
    public void publish(
        ClusterStatePublicationEvent clusterStatePublicationEvent,
        ActionListener publishListener,
        AckListener ackListener
    ) {
        try {
            synchronized (mutex) {
                if (mode != Mode.LEADER || getCurrentTerm() != clusterStatePublicationEvent.getNewState().term()) {
                    logger.debug(
                        () -> format(
                            "[%s] failed publication as node is no longer master for term %s",
                            clusterStatePublicationEvent.getSummary(),
                            clusterStatePublicationEvent.getNewState().term()
                        )
                    );
                    publishListener.onFailure(
                        new FailedToCommitClusterStateException(
                            "node is no longer master for term "
                                + clusterStatePublicationEvent.getNewState().term()
                                + " while handling publication"
                        )
                    );
                    return;
                }

                if (currentPublication.isPresent()) {
                    assert false : "[" + currentPublication.get() + "] in progress, cannot start new publication";
                    logger.warn(
                        () -> format(
                            "[%s] failed publication as already publication in progress",
                            clusterStatePublicationEvent.getSummary()
                        )
                    );
                    publishListener.onFailure(
                        new FailedToCommitClusterStateException("publication " + currentPublication.get() + " already in progress")
                    );
                    return;
                }

                assert assertPreviousStateConsistency(clusterStatePublicationEvent);

                final ClusterState clusterState = clusterStatePublicationEvent.getNewState();

                assert getLocalNode().equals(clusterState.getNodes().get(getLocalNode().getId()))
                    : getLocalNode() + " should be in published " + clusterState;

                final long publicationContextConstructionStartMillis = transportService.getThreadPool().rawRelativeTimeInMillis();
                final PublicationTransportHandler.PublicationContext publicationContext = publicationHandler.newPublicationContext(
                    clusterStatePublicationEvent
                );
                try {
                    clusterStatePublicationEvent.setPublicationContextConstructionElapsedMillis(
                        transportService.getThreadPool().rawRelativeTimeInMillis() - publicationContextConstructionStartMillis
                    );

                    final PublishRequest publishRequest = coordinationState.get().handleClientValue(clusterState);
                    final CoordinatorPublication publication = new CoordinatorPublication(
                        clusterStatePublicationEvent,
                        publishRequest,
                        publicationContext,
                        new ListenableFuture<>(),
                        ackListener,
                        publishListener
                    );
                    currentPublication = Optional.of(publication);

                    final DiscoveryNodes publishNodes = publishRequest.getAcceptedState().nodes();
                    leaderChecker.setCurrentNodes(publishNodes);
                    followersChecker.setCurrentNodes(publishNodes);
                    lagDetector.setTrackedNodes(publishNodes);
                    publication.start(followersChecker.getFaultyNodes());
                } finally {
                    publicationContext.decRef();
                }
            }
        } catch (Exception e) {
            logger.debug(() -> "[" + clusterStatePublicationEvent.getSummary() + "] publishing failed", e);
            publishListener.onFailure(new FailedToCommitClusterStateException("publishing failed", e));
        }
    }

    // there is no equals on cluster state, so we just serialize it to XContent and compare Maps
    // deserialized from the resulting JSON
    private boolean assertPreviousStateConsistency(ClusterStatePublicationEvent clusterStatePublicationEvent) {
        assert clusterStatePublicationEvent.getOldState() == coordinationState.get().getLastAcceptedState()
            || XContentHelper.convertToMap(JsonXContent.jsonXContent, Strings.toString(clusterStatePublicationEvent.getOldState()), false)
                .equals(
                    XContentHelper.convertToMap(
                        JsonXContent.jsonXContent,
                        Strings.toString(clusterStateWithNoMasterBlock(coordinationState.get().getLastAcceptedState())),
                        false
                    )
                )
            : Strings.toString(clusterStatePublicationEvent.getOldState())
                + " vs "
                + Strings.toString(clusterStateWithNoMasterBlock(coordinationState.get().getLastAcceptedState()));
        return true;
    }

    private  ActionListener wrapWithMutex(ActionListener listener) {
        return new ActionListener() {
            @Override
            public void onResponse(T t) {
                synchronized (mutex) {
                    listener.onResponse(t);
                }
            }

            @Override
            public void onFailure(Exception e) {
                synchronized (mutex) {
                    listener.onFailure(e);
                }
            }
        };
    }

    private void cancelActivePublication(String reason) {
        assert Thread.holdsLock(mutex) : "Coordinator mutex not held";
        if (currentPublication.isPresent()) {
            currentPublication.get().cancel(reason);
        }
    }

    public Collection> getOnJoinValidators() {
        return onJoinValidators;
    }

    // for tests
    boolean hasIdleJoinValidationService() {
        return joinValidationService.isIdle();
    }

    public void addPeerFinderListener(PeerFinderListener peerFinderListener) {
        this.peerFinderListeners.add(peerFinderListener);
    }

    public enum Mode {
        CANDIDATE,
        LEADER,
        FOLLOWER
    }

    private class CoordinatorPeerFinder extends PeerFinder {

        CoordinatorPeerFinder(
            Settings settings,
            TransportService transportService,
            TransportAddressConnector transportAddressConnector,
            ConfiguredHostsResolver configuredHostsResolver
        ) {
            super(
                settings,
                transportService,
                transportAddressConnector,
                singleNodeDiscovery ? hostsResolver -> {} : configuredHostsResolver
            );
        }

        @Override
        protected void onActiveMasterFound(DiscoveryNode masterNode, long term) {
            synchronized (mutex) {
                ensureTermAtLeast(masterNode, term);
                joinHelper.sendJoinRequest(masterNode, getCurrentTerm(), joinWithDestination(lastJoin, masterNode, term));
            }
        }

        @Override
        protected void startProbe(TransportAddress transportAddress) {
            if (singleNodeDiscovery == false) {
                super.startProbe(transportAddress);
            }
        }

        @Override
        protected void onFoundPeersUpdated() {
            synchronized (mutex) {
                if (mode == Mode.CANDIDATE) {
                    final VoteCollection expectedVotes = new VoteCollection();
                    getFoundPeers().forEach(expectedVotes::addVote);
                    expectedVotes.addVote(Coordinator.this.getLocalNode());
                    final boolean foundQuorum = coordinationState.get().isElectionQuorum(expectedVotes);

                    if (foundQuorum) {
                        if (electionScheduler == null) {
                            startElectionScheduler();
                        }
                    } else {
                        closePrevotingAndElectionScheduler();
                    }
                }
            }
            peerFinderListeners.forEach(PeerFinderListener::onFoundPeersUpdated);
        }
    }

    private void startElectionScheduler() {
        assert electionScheduler == null : electionScheduler;

        if (getLocalNode().isMasterNode() == false) {
            return;
        }

        final TimeValue gracePeriod = TimeValue.ZERO;
        electionScheduler = electionSchedulerFactory.startElectionScheduler(gracePeriod, new Runnable() {
            @Override
            public void run() {
                synchronized (mutex) {
                    if (mode == Mode.CANDIDATE) {
                        final ClusterState lastAcceptedState = coordinationState.get().getLastAcceptedState();

                        if (localNodeMayWinElection(lastAcceptedState) == false) {
                            logger.trace("skip prevoting as local node may not win election: {}", lastAcceptedState.coordinationMetadata());
                            return;
                        }

                        final StatusInfo statusInfo = nodeHealthService.getHealth();
                        if (statusInfo.getStatus() == UNHEALTHY) {
                            logger.debug("skip prevoting as local node is unhealthy: [{}]", statusInfo.getInfo());
                            return;
                        }

                        if (prevotingRound != null) {
                            prevotingRound.close();
                        }
                        prevotingRound = preVoteCollector.start(lastAcceptedState, getDiscoveredNodes());
                    }
                }
            }

            @Override
            public String toString() {
                return "scheduling of new prevoting round";
            }
        });
    }

    public Iterable getFoundPeers() {
        return peerFinder.getFoundPeers();
    }

    public PeerFinder getPeerFinder() {
        return this.peerFinder;
    }

    /**
     * If there is any current committed publication, this method cancels it.
     * This method is used exclusively by tests.
     * @return true if publication was cancelled, false if there is no current committed publication.
     */
    boolean cancelCommittedPublication() {
        synchronized (mutex) {
            if (currentPublication.isPresent()) {
                final CoordinatorPublication publication = currentPublication.get();
                if (publication.isCommitted()) {
                    publication.cancel("cancelCommittedPublication");
                    logger.debug("Cancelled publication of [{}].", publication);
                    return true;
                }
            }
            return false;
        }
    }

    class CoordinatorPublication extends Publication {

        private final ClusterStatePublicationEvent clusterStatePublicationEvent;
        private final PublishRequest publishRequest;
        private final ListenableFuture localNodeAckEvent;
        private final AckListener ackListener;
        private final ActionListener publishListener;
        private final PublicationTransportHandler.PublicationContext publicationContext;

        @Nullable // if using single-node discovery
        private final Scheduler.ScheduledCancellable timeoutHandler;
        private final Scheduler.Cancellable infoTimeoutHandler;

        // We may not have accepted our own state before receiving a join from another node, causing its join to be rejected (we cannot
        // safely accept a join whose last-accepted term/version is ahead of ours), so store them up and process them at the end.
        private final List receivedJoins = new ArrayList<>();
        private boolean receivedJoinsProcessed;

        CoordinatorPublication(
            ClusterStatePublicationEvent clusterStatePublicationEvent,
            PublishRequest publishRequest,
            PublicationTransportHandler.PublicationContext publicationContext,
            ListenableFuture localNodeAckEvent,
            AckListener ackListener,
            ActionListener publishListener
        ) {
            super(publishRequest, new AckListener() {
                @Override
                public void onCommit(TimeValue commitTime) {
                    clusterStatePublicationEvent.setPublicationCommitElapsedMillis(commitTime.millis());
                    ackListener.onCommit(commitTime);
                }

                @Override
                public void onNodeAck(DiscoveryNode node, Exception e) {
                    // acking and cluster state application for local node is handled specially
                    if (node.equals(getLocalNode())) {
                        synchronized (mutex) {
                            if (e == null) {
                                localNodeAckEvent.onResponse(null);
                            } else {
                                localNodeAckEvent.onFailure(e);
                            }
                        }
                    } else {
                        ackListener.onNodeAck(node, e);
                        if (e == null) {
                            lagDetector.setAppliedVersion(node, publishRequest.getAcceptedState().version());
                        }
                    }
                }
            }, transportService.getThreadPool()::rawRelativeTimeInMillis);
            this.clusterStatePublicationEvent = clusterStatePublicationEvent;
            this.publishRequest = publishRequest;
            this.publicationContext = publicationContext;
            this.localNodeAckEvent = localNodeAckEvent;
            this.ackListener = ackListener;
            this.publishListener = publishListener;

            this.timeoutHandler = singleNodeDiscovery ? null : transportService.getThreadPool().schedule(new Runnable() {
                @Override
                public void run() {
                    synchronized (mutex) {
                        cancel("timed out after " + publishTimeout);
                    }
                }

                @Override
                public String toString() {
                    return "scheduled timeout for " + CoordinatorPublication.this;
                }
            }, publishTimeout, Names.CLUSTER_COORDINATION);

            this.infoTimeoutHandler = transportService.getThreadPool().schedule(new Runnable() {
                @Override
                public void run() {
                    synchronized (mutex) {
                        logIncompleteNodes(Level.INFO);
                    }
                }

                @Override
                public String toString() {
                    return "scheduled timeout for reporting on " + CoordinatorPublication.this;
                }
            }, publishInfoTimeout, Names.CLUSTER_COORDINATION);
        }

        private void removePublicationAndPossiblyBecomeCandidate(String reason) {
            assert Thread.holdsLock(mutex) : "Coordinator mutex not held";

            assert currentPublication.get() == this;
            currentPublication = Optional.empty();
            logger.debug("publication ended unsuccessfully: {}", this);

            // check if node has not already switched modes (by bumping term)
            if (isActiveForCurrentLeader()) {
                becomeCandidate(reason);
            }
        }

        boolean isActiveForCurrentLeader() {
            // checks if this publication can still influence the mode of the current publication
            return mode == Mode.LEADER && publishRequest.getAcceptedState().term() == getCurrentTerm();
        }

        @Override
        protected void onCompletion(boolean committed) {
            assert Thread.holdsLock(mutex) : "Coordinator mutex not held";
            final long completionTimeMillis = transportService.getThreadPool().rawRelativeTimeInMillis();
            clusterStatePublicationEvent.setPublicationCompletionElapsedMillis(completionTimeMillis - getStartTime());

            localNodeAckEvent.addListener(new ActionListener<>() {
                @Override
                public void onResponse(Void ignore) {
                    assert Thread.holdsLock(mutex) : "Coordinator mutex not held";
                    assert committed;

                    receivedJoins.forEach(CoordinatorPublication.this::handleAssociatedJoin);
                    assert receivedJoinsProcessed == false;
                    receivedJoinsProcessed = true;

                    clusterApplier.onNewClusterState(
                        CoordinatorPublication.this.toString(),
                        () -> applierState,
                        new ActionListener() {
                            @Override
                            public void onFailure(Exception e) {
                                synchronized (mutex) {
                                    removePublicationAndPossiblyBecomeCandidate("clusterApplier#onNewClusterState");
                                }
                                cancelTimeoutHandlers();
                                ackListener.onNodeAck(getLocalNode(), e);
                                publishListener.onFailure(e);
                            }

                            @Override
                            public void onResponse(Void ignored) {
                                onClusterStateApplied();
                                clusterStatePublicationEvent.setMasterApplyElapsedMillis(
                                    transportService.getThreadPool().rawRelativeTimeInMillis() - completionTimeMillis
                                );
                                synchronized (mutex) {
                                    assert currentPublication.get() == CoordinatorPublication.this;
                                    currentPublication = Optional.empty();
                                    logger.debug("publication ended successfully: {}", CoordinatorPublication.this);
                                    // trigger term bump if new term was found during publication
                                    updateMaxTermSeen(getCurrentTerm());

                                    if (mode == Mode.LEADER) {
                                        // if necessary, abdicate to another node or improve the voting configuration
                                        boolean attemptReconfiguration = true;
                                        final ClusterState state = getLastAcceptedState(); // committed state
                                        if (localNodeMayWinElection(state) == false) {
                                            final List masterCandidates = completedNodes().stream()
                                                .filter(DiscoveryNode::isMasterNode)
                                                .filter(node -> nodeMayWinElection(state, node))
                                                .filter(node -> {
                                                    // check if master candidate would be able to get an election quorum if we were to
                                                    // abdicate to it. Assume that every node that completed the publication can provide
                                                    // a vote in that next election and has the latest state.
                                                    final long futureElectionTerm = state.term() + 1;
                                                    final VoteCollection futureVoteCollection = new VoteCollection();
                                                    completedNodes().forEach(
                                                        completedNode -> futureVoteCollection.addJoinVote(
                                                            new Join(completedNode, node, futureElectionTerm, state.term(), state.version())
                                                        )
                                                    );
                                                    return electionStrategy.isElectionQuorum(
                                                        node,
                                                        futureElectionTerm,
                                                        state.term(),
                                                        state.version(),
                                                        state.getLastCommittedConfiguration(),
                                                        state.getLastAcceptedConfiguration(),
                                                        futureVoteCollection
                                                    );
                                                })
                                                .toList();
                                            if (masterCandidates.isEmpty() == false) {
                                                abdicateTo(masterCandidates.get(random.nextInt(masterCandidates.size())));
                                                attemptReconfiguration = false;
                                            }
                                        }
                                        if (attemptReconfiguration) {
                                            scheduleReconfigurationIfNeeded();
                                        }
                                    }
                                    lagDetector.startLagDetector(publishRequest.getAcceptedState().version());
                                    logIncompleteNodes(Level.WARN);
                                }
                                cancelTimeoutHandlers();
                                ackListener.onNodeAck(getLocalNode(), null);
                                publishListener.onResponse(null);
                            }
                        }
                    );
                }

                @Override
                public void onFailure(Exception e) {
                    assert Thread.holdsLock(mutex) : "Coordinator mutex not held";
                    removePublicationAndPossiblyBecomeCandidate("Publication.onCompletion(false)");
                    cancelTimeoutHandlers();

                    final FailedToCommitClusterStateException exception = new FailedToCommitClusterStateException("publication failed", e);
                    ackListener.onNodeAck(getLocalNode(), exception); // other nodes have acked, but not the master.
                    publishListener.onFailure(exception);
                }
            }, EsExecutors.DIRECT_EXECUTOR_SERVICE, transportService.getThreadPool().getThreadContext());
        }

        private void cancelTimeoutHandlers() {
            if (timeoutHandler != null) {
                timeoutHandler.cancel();
            }
            infoTimeoutHandler.cancel();
        }

        private void handleAssociatedJoin(Join join) {
            if (join.getTerm() == getCurrentTerm() && missingJoinVoteFrom(join.getSourceNode())) {
                logger.trace("handling {}", join);
                handleJoin(join);
            }
        }

        @Override
        protected boolean isPublishQuorum(VoteCollection votes) {
            assert Thread.holdsLock(mutex) : "Coordinator mutex not held";
            return coordinationState.get().isPublishQuorum(votes);
        }

        @Override
        protected Optional handlePublishResponse(DiscoveryNode sourceNode, PublishResponse publishResponse) {
            assert Thread.holdsLock(mutex) : "Coordinator mutex not held";
            assert getCurrentTerm() >= publishResponse.getTerm();
            return coordinationState.get().handlePublishResponse(sourceNode, publishResponse);
        }

        @Override
        protected void onJoin(Join join) {
            assert Thread.holdsLock(mutex) : "Coordinator mutex not held";
            if (receivedJoinsProcessed) {
                // a late response may arrive after the state has been locally applied, meaning that receivedJoins has already been
                // processed, so we have to handle this late response here.
                handleAssociatedJoin(join);
            } else {
                receivedJoins.add(join);
            }
        }

        @Override
        protected void onMissingJoin(DiscoveryNode discoveryNode) {
            assert Thread.holdsLock(mutex) : "Coordinator mutex not held";
            // The remote node did not include a join vote in its publish response. We do not persist joins, so it could be that the remote
            // node voted for us and then rebooted, or it could be that it voted for a different node in this term. If we don't have a copy
            // of a join from this node then we assume the latter and bump our term to obtain a vote from this node.
            if (missingJoinVoteFrom(discoveryNode)) {
                final long term = publishRequest.getAcceptedState().term();
                logger.debug("onMissingJoin: no join vote from {}, bumping term to exceed {}", discoveryNode, term);
                updateMaxTermSeen(term + 1);
            }
        }

        @Override
        protected void sendPublishRequest(
            DiscoveryNode destination,
            PublishRequest publishRequest,
            ActionListener responseActionListener
        ) {
            publicationContext.sendPublishRequest(destination, publishRequest, wrapWithMutex(responseActionListener));
        }

        private static final TransportRequestOptions COMMIT_STATE_REQUEST_OPTIONS = TransportRequestOptions.of(
            null,
            TransportRequestOptions.Type.STATE
        );

        @Override
        protected void sendApplyCommit(
            DiscoveryNode destination,
            ApplyCommitRequest applyCommit,
            ActionListener responseActionListener
        ) {
            assert transportService.getThreadPool().getThreadContext().isSystemContext();
            transportService.sendRequest(
                destination,
                COMMIT_STATE_ACTION_NAME,
                applyCommit,
                COMMIT_STATE_REQUEST_OPTIONS,
                new ActionListenerResponseHandler<>(wrapWithMutex(responseActionListener), in -> Empty.INSTANCE, Names.CLUSTER_COORDINATION)
            );
        }
    }

    public interface PeerFinderListener {
        void onFoundPeersUpdated();
    }
}