org.elasticsearch.cluster.coordination.LeaderChecker
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.cluster.coordination;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.message.ParameterizedMessage;
import org.elasticsearch.cluster.ClusterName;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.node.DiscoveryNodes;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.lease.Releasable;
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.discovery.zen.MasterFaultDetection;
import org.elasticsearch.threadpool.ThreadPool.Names;
import org.elasticsearch.transport.ConnectTransportException;
import org.elasticsearch.transport.TransportConnectionListener;
import org.elasticsearch.transport.TransportException;
import org.elasticsearch.transport.TransportRequest;
import org.elasticsearch.transport.TransportRequestOptions;
import org.elasticsearch.transport.TransportRequestOptions.Type;
import org.elasticsearch.transport.TransportResponse;
import org.elasticsearch.transport.TransportResponse.Empty;
import org.elasticsearch.transport.TransportResponseHandler;
import org.elasticsearch.transport.TransportService;

import java.io.IOException;
import java.util.Objects;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;

/**
 * The LeaderChecker is responsible for allowing followers to check that the currently elected leader is still connected and healthy. We are
 * fairly lenient, possibly allowing multiple checks to fail before considering the leader to be faulty, to allow for the leader to
 * temporarily stand down on occasion, e.g. if it needs to move to a higher term. On deciding that the leader has failed a follower will
 * become a candidate and attempt to become a leader itself.
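 * <p>
 * A rough usage sketch (illustrative only; the surrounding coordination layer is responsible for this wiring, and the callback and
 * local variables below are hypothetical, not part of this class):
 * <pre>{@code
 * LeaderChecker leaderChecker = new LeaderChecker(settings, transportService,
 *     () -> becomeCandidateAgain("leader check failed"));   // hypothetical action taken when the leader is deemed faulty
 * leaderChecker.setCurrentNodes(clusterState.nodes());      // keep the known nodes (publication targets) up to date
 * leaderChecker.updateLeader(electedLeader);                // start periodic checks of the elected leader
 * // ... later, on becoming a candidate or the leader itself:
 * leaderChecker.updateLeader(null);                         // stop checking
 * }</pre>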
 */
public class LeaderChecker {

    private static final Logger logger = LogManager.getLogger(LeaderChecker.class);

    public static final String LEADER_CHECK_ACTION_NAME = "internal:coordination/fault_detection/leader_check";

    // the time between checks sent to the leader
    public static final Setting<TimeValue> LEADER_CHECK_INTERVAL_SETTING =
        Setting.timeSetting("cluster.fault_detection.leader_check.interval",
            TimeValue.timeValueMillis(1000), TimeValue.timeValueMillis(100), Setting.Property.NodeScope);

    // the timeout for each check sent to the leader
    public static final Setting<TimeValue> LEADER_CHECK_TIMEOUT_SETTING =
        Setting.timeSetting("cluster.fault_detection.leader_check.timeout",
            TimeValue.timeValueMillis(10000), TimeValue.timeValueMillis(1), Setting.Property.NodeScope);

    // the number of failed checks that must happen before the leader is considered to have failed.
    public static final Setting<Integer> LEADER_CHECK_RETRY_COUNT_SETTING =
        Setting.intSetting("cluster.fault_detection.leader_check.retry_count", 3, 1, Setting.Property.NodeScope);
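    // For illustration only: these node-scope settings can be overridden at startup, e.g. in elasticsearch.yml (values below are
    // examples, not recommendations):
    //   cluster.fault_detection.leader_check.interval: 2s
    //   cluster.fault_detection.leader_check.timeout: 30s
    //   cluster.fault_detection.leader_check.retry_count: 5
    // With the defaults above (1s interval, 10s timeout, 3 retries), a leader that accepts connections but never responds is deemed
    // faulty after roughly 3 timed-out checks separated by the interval, i.e. about 3 * 10s + 3 * 1s = 33s in the worst case; a
    // dropped connection short-circuits this and fails the leader immediately.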

    private final Settings settings;

    private final TimeValue leaderCheckInterval;
    private final TimeValue leaderCheckTimeout;
    private final int leaderCheckRetryCount;
    private final TransportService transportService;
    private final Runnable onLeaderFailure;

    private AtomicReference<CheckScheduler> currentChecker = new AtomicReference<>();

    private volatile DiscoveryNodes discoveryNodes;

    public LeaderChecker(final Settings settings, final TransportService transportService, final Runnable onLeaderFailure) {
        this.settings = settings;
        leaderCheckInterval = LEADER_CHECK_INTERVAL_SETTING.get(settings);
        leaderCheckTimeout = LEADER_CHECK_TIMEOUT_SETTING.get(settings);
        leaderCheckRetryCount = LEADER_CHECK_RETRY_COUNT_SETTING.get(settings);
        this.transportService = transportService;
        this.onLeaderFailure = onLeaderFailure;

        transportService.registerRequestHandler(LEADER_CHECK_ACTION_NAME, Names.SAME, false, false, LeaderCheckRequest::new,
            (request, channel, task) -> {
                handleLeaderCheck(request);
                channel.sendResponse(Empty.INSTANCE);
            });

        transportService.registerRequestHandler(MasterFaultDetection.MASTER_PING_ACTION_NAME, MasterFaultDetection.MasterPingRequest::new,
            Names.SAME, false, false, (request, channel, task) -> {
                try {
                    handleLeaderCheck(new LeaderCheckRequest(request.sourceNode));
                } catch (CoordinationStateRejectedException e) {
                    throw new MasterFaultDetection.ThisIsNotTheMasterYouAreLookingForException(e.getMessage());
                }
                channel.sendResponse(new MasterFaultDetection.MasterPingResponseResponse());
            });

        transportService.addConnectionListener(new TransportConnectionListener() {
            @Override
            public void onNodeDisconnected(DiscoveryNode node) {
                handleDisconnectedNode(node);
            }
        });
    }

    public DiscoveryNode leader() {
        CheckScheduler checkScheduler = currentChecker.get();
        return checkScheduler == null ? null : checkScheduler.leader;
    }

    /**
     * Starts and / or stops a leader checker for the given leader. Should only be called after successfully joining this leader.
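     * <p>
     * For example (illustrative): call {@code updateLeader(newLeader)} after successfully joining {@code newLeader}, and
     * {@code updateLeader(null)} on becoming a candidate or the leader itself, which closes any checker that is currently running.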
     *
     * @param leader the node to be checked as leader, or null if checks should be disabled
     */
    public void updateLeader(@Nullable final DiscoveryNode leader) {
        assert transportService.getLocalNode().equals(leader) == false;
        final CheckScheduler checkScheduler;
        if (leader != null) {
            checkScheduler = new CheckScheduler(leader);
        } else {
            checkScheduler = null;
        }
        CheckScheduler previousChecker = currentChecker.getAndSet(checkScheduler);
        if (previousChecker != null) {
            previousChecker.close();
        }
        if (checkScheduler != null) {
            checkScheduler.handleWakeUp();
        }
    }

    /**
     * Update the "known" discovery nodes. Should be called on the leader before a new cluster state is published to reflect the new
     * publication targets, and also called if a leader becomes a non-leader.
     * TODO if heartbeats can make nodes become followers then this needs to be called before a heartbeat is sent to a new node too.
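     * <p>
     * For illustration (the wiring lives outside this class): a leader would typically call {@code setCurrentNodes(stateToPublish.nodes())}
     * just before publishing a cluster state, and a node that stops being leader would call it again with a {@link DiscoveryNodes} in which
     * the local node is no longer the elected master.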
     * <p>
     * isLocalNodeElectedMaster() should reflect whether this node is a leader, and nodeExists()
     * should indicate whether nodes are known publication targets or not.
     */
    public void setCurrentNodes(DiscoveryNodes discoveryNodes) {
        logger.trace("setCurrentNodes: {}", discoveryNodes);
        this.discoveryNodes = discoveryNodes;
    }

    // For assertions
    boolean currentNodeIsMaster() {
        return discoveryNodes.isLocalNodeElectedMaster();
    }

    private void handleLeaderCheck(LeaderCheckRequest request) {
        final DiscoveryNodes discoveryNodes = this.discoveryNodes;
        assert discoveryNodes != null;

        if (discoveryNodes.isLocalNodeElectedMaster() == false) {
            logger.debug("non-master handling {}", request);
            throw new CoordinationStateRejectedException("non-leader rejecting leader check");
        } else if (discoveryNodes.nodeExists(request.getSender()) == false) {
            logger.debug("leader check from unknown node: {}", request);
            throw new CoordinationStateRejectedException("leader check from unknown node");
        } else {
            logger.trace("handling {}", request);
        }
    }

    private void handleDisconnectedNode(DiscoveryNode discoveryNode) {
        CheckScheduler checkScheduler = currentChecker.get();
        if (checkScheduler != null) {
            checkScheduler.handleDisconnectedNode(discoveryNode);
        } else {
            logger.trace("disconnect event ignored for {}, no check scheduler", discoveryNode);
        }
    }

    private class CheckScheduler implements Releasable {

        private final AtomicBoolean isClosed = new AtomicBoolean();
        private final AtomicLong failureCountSinceLastSuccess = new AtomicLong();
        private final DiscoveryNode leader;

        CheckScheduler(final DiscoveryNode leader) {
            this.leader = leader;
        }

        @Override
        public void close() {
            if (isClosed.compareAndSet(false, true) == false) {
                logger.trace("already closed, doing nothing");
            } else {
                logger.debug("closed");
            }
        }

        void handleWakeUp() {
            if (isClosed.get()) {
                logger.trace("closed check scheduler woken up, doing nothing");
                return;
            }

            logger.trace("checking {} with [{}] = {}", leader, LEADER_CHECK_TIMEOUT_SETTING.getKey(), leaderCheckTimeout);

            final String actionName;
            final TransportRequest transportRequest;
            if (Coordinator.isZen1Node(leader)) {
                actionName = MasterFaultDetection.MASTER_PING_ACTION_NAME;
                transportRequest = new MasterFaultDetection.MasterPingRequest(
                    transportService.getLocalNode(), leader, ClusterName.CLUSTER_NAME_SETTING.get(settings));
            } else {
                actionName = LEADER_CHECK_ACTION_NAME;
                transportRequest = new LeaderCheckRequest(transportService.getLocalNode());
            }
            // TODO lag detection:
            // In the PoC, the leader sent its current version to the follower in the response to a LeaderCheck, so the follower
            // could detect if it was lagging. We'd prefer this to be implemented on the leader, so the response is just
            // TransportResponse.Empty here.
            transportService.sendRequest(leader, actionName, transportRequest,
                TransportRequestOptions.builder().withTimeout(leaderCheckTimeout).withType(Type.PING).build(),

                new TransportResponseHandler<TransportResponse.Empty>() {
                    @Override
                    public Empty read(StreamInput in) {
                        return Empty.INSTANCE;
                    }

                    @Override
                    public void handleResponse(Empty response) {
                        if (isClosed.get()) {
                            logger.debug("closed check scheduler received a response, doing nothing");
                            return;
                        }

                        failureCountSinceLastSuccess.set(0);
                        scheduleNextWakeUp(); // logs trace message indicating success
                    }

                    @Override
                    public void handleException(TransportException exp) {
                        if (isClosed.get()) {
                            logger.debug("closed check scheduler received a response, doing nothing");
                            return;
                        }

                        if (exp instanceof ConnectTransportException || exp.getCause() instanceof ConnectTransportException) {
                            logger.debug(new ParameterizedMessage("leader [{}] disconnected, failing immediately", leader), exp);
                            leaderFailed();
                            return;
                        }

                        long failureCount = failureCountSinceLastSuccess.incrementAndGet();
                        if (failureCount >= leaderCheckRetryCount) {
                            logger.debug(new ParameterizedMessage("{} consecutive failures (limit [{}] is {}) so leader [{}] has failed",
                                failureCount, LEADER_CHECK_RETRY_COUNT_SETTING.getKey(), leaderCheckRetryCount, leader), exp);
                            leaderFailed();
                            return;
                        }

                        logger.debug(new ParameterizedMessage("{} consecutive failures (limit [{}] is {}) with leader [{}]",
                            failureCount, LEADER_CHECK_RETRY_COUNT_SETTING.getKey(), leaderCheckRetryCount, leader), exp);
                        scheduleNextWakeUp();
                    }

                    @Override
                    public String executor() {
                        return Names.SAME;
                    }
                });
        }

        void leaderFailed() {
            if (isClosed.compareAndSet(false, true)) {
                transportService.getThreadPool().generic().execute(onLeaderFailure);
            } else {
                logger.trace("already closed, not failing leader");
            }
        }

        void handleDisconnectedNode(DiscoveryNode discoveryNode) {
            if (discoveryNode.equals(leader)) {
                leaderFailed();
            }
        }

        private void scheduleNextWakeUp() {
            logger.trace("scheduling next check of {} for [{}] = {}", leader, LEADER_CHECK_INTERVAL_SETTING.getKey(), leaderCheckInterval);

            transportService.getThreadPool().schedule(new Runnable() {
                @Override
                public void run() {
                    handleWakeUp();
                }

                @Override
                public String toString() {
                    return "scheduled check of leader " + leader;
                }
            }, leaderCheckInterval, Names.SAME);
        }
    }

    public static class LeaderCheckRequest extends TransportRequest {

        private final DiscoveryNode sender;

        public LeaderCheckRequest(final DiscoveryNode sender) {
            this.sender = sender;
        }

        public LeaderCheckRequest(final StreamInput in) throws IOException {
            super(in);
            sender = new DiscoveryNode(in);
        }

        @Override
        public void writeTo(final StreamOutput out) throws IOException {
            super.writeTo(out);
            sender.writeTo(out);
        }

        public DiscoveryNode getSender() {
            return sender;
        }

        @Override
        public boolean equals(final Object o) {
            if (this == o) return true;
            if (o == null || getClass() != o.getClass()) return false;
            final LeaderCheckRequest that = (LeaderCheckRequest) o;
            return Objects.equals(sender, that.sender);
        }

        @Override
        public int hashCode() {
            return Objects.hash(sender);
        }

        @Override
        public String toString() {
            return "LeaderCheckRequest{" +
                "sender=" + sender +
                '}';
        }
    }
}




