All downloads are free. Search and download functionality uses the official Maven repository.

io.hekate.cluster.internal.DefaultClusterService Maven / Gradle / Ivy

/*
 * Copyright 2022 The Hekate Project
 *
 * The Hekate Project licenses this file to you under the Apache License,
 * version 2.0 (the "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 */

package io.hekate.cluster.internal;

import io.hekate.cluster.ClusterAcceptor;
import io.hekate.cluster.ClusterAddress;
import io.hekate.cluster.ClusterFilter;
import io.hekate.cluster.ClusterNode;
import io.hekate.cluster.ClusterNodeId;
import io.hekate.cluster.ClusterRejectedJoinException;
import io.hekate.cluster.ClusterService;
import io.hekate.cluster.ClusterServiceFactory;
import io.hekate.cluster.ClusterServiceJmx;
import io.hekate.cluster.ClusterTopology;
import io.hekate.cluster.ClusterView;
import io.hekate.cluster.event.ClusterEvent;
import io.hekate.cluster.event.ClusterEventListener;
import io.hekate.cluster.event.ClusterEventType;
import io.hekate.cluster.health.FailureDetector;
import io.hekate.cluster.internal.gossip.GossipCommListener;
import io.hekate.cluster.internal.gossip.GossipCommManager;
import io.hekate.cluster.internal.gossip.GossipListener;
import io.hekate.cluster.internal.gossip.GossipManager;
import io.hekate.cluster.internal.gossip.GossipNodeStatus;
import io.hekate.cluster.internal.gossip.GossipPolicy;
import io.hekate.cluster.internal.gossip.GossipProtocol;
import io.hekate.cluster.internal.gossip.GossipProtocol.GossipMessage;
import io.hekate.cluster.internal.gossip.GossipProtocol.HeartbeatReply;
import io.hekate.cluster.internal.gossip.GossipProtocol.HeartbeatRequest;
import io.hekate.cluster.internal.gossip.GossipProtocol.JoinAccept;
import io.hekate.cluster.internal.gossip.GossipProtocol.JoinReject;
import io.hekate.cluster.internal.gossip.GossipProtocol.JoinReply;
import io.hekate.cluster.internal.gossip.GossipProtocol.JoinRequest;
import io.hekate.cluster.internal.gossip.GossipProtocol.UpdateBase;
import io.hekate.cluster.internal.gossip.GossipProtocolCodec;
import io.hekate.cluster.seed.SeedNodeProvider;
import io.hekate.cluster.seed.multicast.MulticastSeedNodeProvider;
import io.hekate.cluster.split.SplitBrainDetector;
import io.hekate.core.Hekate;
import io.hekate.core.HekateBootstrap;
import io.hekate.core.HekateConfigurationException;
import io.hekate.core.HekateException;
import io.hekate.core.internal.util.ArgAssert;
import io.hekate.core.internal.util.ConfigCheck;
import io.hekate.core.internal.util.HekateThreadFactory;
import io.hekate.core.jmx.JmxService;
import io.hekate.core.jmx.JmxSupport;
import io.hekate.core.report.ConfigReporter;
import io.hekate.core.service.ClusterServiceManager;
import io.hekate.core.service.ConfigurationContext;
import io.hekate.core.service.CoreService;
import io.hekate.core.service.DependencyContext;
import io.hekate.core.service.InitializationContext;
import io.hekate.network.NetworkConfigProvider;
import io.hekate.network.NetworkConnector;
import io.hekate.network.NetworkConnectorConfig;
import io.hekate.network.NetworkEndpoint;
import io.hekate.network.NetworkMessage;
import io.hekate.network.NetworkServerHandler;
import io.hekate.network.NetworkService;
import io.hekate.util.StateGuard;
import io.hekate.util.async.Waiting;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.CancellationException;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Function;
import java.util.function.Predicate;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static io.hekate.core.internal.util.StreamUtils.nullSafe;
import static io.hekate.util.async.AsyncUtils.shutdown;
import static java.util.Collections.emptySet;
import static java.util.Collections.singleton;
import static java.util.Collections.synchronizedList;
import static java.util.Collections.unmodifiableList;
import static java.util.Collections.unmodifiableSet;
import static java.util.stream.Collectors.toCollection;
import static java.util.stream.Collectors.toSet;

public class DefaultClusterService implements ClusterService, ClusterServiceManager, CoreService, NetworkConfigProvider,
    JmxSupport {
    private static final Logger log = LoggerFactory.getLogger(DefaultClusterService.class);

    // Cached once so that hot paths can cheaply skip building debug messages.
    private static final boolean DEBUG = log.isDebugEnabled();

    // Protocol identifier of the cluster service's network connector.
    private static final String PROTOCOL_ID = "hekate.cluster";

    // Cluster namespace (see ClusterServiceFactory#getNamespace()).
    private final String namespace;

    // Interval (ms) between gossip rounds; also reused as the retry delay for join attempts.
    private final long gossipInterval;

    // Cluster size threshold for gossip speed-up (see ClusterServiceFactory#getSpeedUpGossipSize()).
    private final int speedUpGossipSize;

    private final SeedNodeProvider seedNodeProvider;

    // If true then seed node connection failures are treated as fail-fast (forwarded to SeedNodeManager/GossipManager).
    private final boolean seedNodeFailFast;

    private final FailureDetector failureDetector;

    private final SplitBrainManager splitBrain;

    // Join acceptors: pre-configured ones plus those discovered via ConfigurationContext (see configure(...)).
    private final List acceptors;

    // Gossip listener hook supplied via the constructor (nullable; not validated here).
    private final GossipListener gossipSpy;

    // Guards the initialize/terminate lifecycle of this service.
    private final StateGuard guard;

    // Local node id; shared with GossipProtocolCodec instances (cleared on terminate).
    private final AtomicReference localNodeIdRef = new AtomicReference<>();

    // Pre-configured (unmodifiable) cluster event listeners; always includes a ClusterEventLogger.
    private final List listeners;

    // Listeners registered before initialization; re-registered with the cluster context during initialize(...).
    private final List deferredListeners = synchronizedList(new ArrayList<>());

    // Per-topology cache backing topologyContext(...).
    private final TopologyContextCache ctxCache = new TopologyContextCache();

    // ---- Mutable state: created in initialize(...), cleared in terminate(). ----

    private ClusterAcceptManager acceptMgr;

    private SeedNodeManager seedNodeMgr;

    private GossipManager gossipMgr;

    private NetworkService net;

    private ClusterMetricsSink metrics;

    // Optional JMX service (null when JMX support is not available).
    private JmxService jmx;

    private ScheduledExecutorService serviceThread;

    private ScheduledExecutorService gossipThread;

    private ScheduledFuture heartbeatTask;

    private ScheduledFuture gossipTask;

    private ScheduledFuture joinTask;

    // Volatile: read from network callbacks without holding 'guard'.
    private volatile InitializationContext ctx;

    private volatile GossipCommManager commMgr;

    private volatile ClusterNode localNode;

    /**
     * Constructs a new cluster service from the specified factory.
     *
     * @param factory Configuration source (must not be {@code null}; validated via {@link ConfigCheck}).
     * @param guard Lifecycle guard to use (if {@code null} then a new {@link StateGuard} is created).
     * @param gossipSpy Gossip listener hook (nullable; consumed outside of this constructor —
     * presumably by createGossipListener(), which is defined elsewhere in this class).
     *
     * @throws HekateConfigurationException If no seed node provider is configured and multicast is not supported.
     */
    public DefaultClusterService(ClusterServiceFactory factory, StateGuard guard, GossipListener gossipSpy) {
        ArgAssert.notNull(factory, "Factory");

        ConfigCheck check = ConfigCheck.get(ClusterServiceFactory.class);

        // Note: 'factory' was already null-checked above; this repeats the check for a consistent ConfigCheck message.
        check.notNull(factory, "configuration");
        check.notEmpty(factory.getNamespace(), "cluster name");
        check.validSysName(factory.getNamespace(), "cluster name");
        check.positive(factory.getGossipInterval(), "gossip interval");
        check.notNull(factory.getFailureDetector(), "failure detector");

        // Basic properties.
        this.namespace = factory.getNamespace();
        this.gossipInterval = factory.getGossipInterval();
        this.speedUpGossipSize = factory.getSpeedUpGossipSize();
        this.failureDetector = factory.getFailureDetector();
        this.gossipSpy = gossipSpy;

        // State guard.
        if (guard == null) {
            this.guard = new StateGuard(ClusterService.class);
        } else {
            this.guard = guard;
        }

        // Seed node provider (defaults to multicast-based discovery when none is configured).
        if (factory.getSeedNodeProvider() == null) {
            try {
                seedNodeProvider = new MulticastSeedNodeProvider();
            } catch (UnknownHostException e) {
                throw new HekateConfigurationException(HekateBootstrap.class.getSimpleName() + ": multicasting is not supported. "
                    + "Consider using other seed node provider implementation.", e);
            }
        } else {
            seedNodeProvider = factory.getSeedNodeProvider();
        }

        this.seedNodeFailFast = factory.isSeedNodeFailFast();

        // Split-brain manager.
        splitBrain = new SplitBrainManager(
            factory.getSplitBrainCheckInterval(),
            factory.getSplitBrainDetector()
        );

        // Pre-configured (unmodifiable) event listeners; a ClusterEventLogger is always included.
        this.listeners = unmodifiableList(nullSafe(factory.getClusterListeners()).collect(toCollection(() -> {
            List listeners = new ArrayList<>();

            listeners.add(new ClusterEventLogger());

            return listeners;
        })));

        // Join acceptors (mutable: configure(...) may add more).
        this.acceptors = nullSafe(factory.getAcceptors()).collect(toCollection(ArrayList::new));
    }

    @Override
    public void resolve(DependencyContext ctx) {
        // Mandatory dependency: the transport used for gossip messaging.
        this.net = ctx.require(NetworkService.class);

        // Optional dependency: present only when JMX support is enabled.
        this.jmx = ctx.optional(JmxService.class);
    }

    @Override
    public void configure(ConfigurationContext ctx) {
        // Merge acceptors registered as components with the pre-configured ones.
        acceptors.addAll(ctx.findComponents(ClusterAcceptor.class));
    }

    @Override
    public Collection> configureNetwork() {
        NetworkConnectorConfig netCfg = new NetworkConnectorConfig<>();

        netCfg.setProtocol(PROTOCOL_ID);
        netCfg.setMessageCodec(() -> new GossipProtocolCodec(localNodeIdRef));
        netCfg.setLogCategory(GossipProtocol.class.getName());

        // Use a dedicated NIO thread for cluster communications.
        netCfg.setNioThreads(1);

        netCfg.setServerHandler(new NetworkServerHandler() {
            @Override
            public void onConnect(GossipProtocol login, NetworkEndpoint client) {
                // Volatile read.
                GossipCommManager localCommMgr = commMgr;

                if (localCommMgr != null) {
                    localCommMgr.onConnect(login, client);
                }
            }

            @Override
            public void onMessage(NetworkMessage msg, NetworkEndpoint from) throws IOException {
                // Volatile read.
                GossipCommManager localCommMgr = commMgr;

                if (localCommMgr != null) {
                    localCommMgr.onMessage(msg, from);
                }
            }

            @Override
            public void onDisconnect(NetworkEndpoint client) {
                // Volatile read.
                GossipCommManager localCommMgr = commMgr;

                if (localCommMgr != null) {
                    localCommMgr.onDisconnect(client);
                }
            }
        });

        return singleton(netCfg);
    }

    /**
     * Initializes this service: wires event listeners, creates worker threads and the
     * seed-node/accept/gossip/communication managers, initializes the split-brain manager
     * and registers JMX beans (when JMX is available).
     *
     * @param initCtx Initialization context.
     *
     * @throws HekateException If initialization fails.
     */
    @Override
    public void initialize(InitializationContext initCtx) throws HekateException {
        if (DEBUG) {
            log.debug("Initializing...");
        }

        guard.becomeInitialized(() -> {
            ctx = initCtx;
            localNode = initCtx.localNode();
            localNodeIdRef.set(initCtx.localNode().id());

            // Register cluster listeners.
            listeners.forEach(listener ->
                ctx.cluster().addListener(listener)
            );

            // Flush listeners that were registered before this service got initialized.
            deferredListeners.forEach(deferred ->
                ctx.cluster().addListener(deferred.listener(), deferred.eventTypes())
            );

            // Prepare seed node manager.
            seedNodeMgr = new SeedNodeManager(namespace, seedNodeProvider, seedNodeFailFast);

            // Prepare workers (single-threaded executors: one for gossiping, one for general service tasks).
            gossipThread = Executors.newSingleThreadScheduledExecutor(new HekateThreadFactory("ClusterGossip"));
            serviceThread = Executors.newSingleThreadScheduledExecutor(new HekateThreadFactory("Cluster"));

            // Prepare accept manager.
            acceptMgr = new ClusterAcceptManager(acceptors, serviceThread);

            // Prepare gossip listener.
            GossipListener gossipListener = createGossipListener();

            // Prepare gossip manager.
            gossipMgr = new GossipManager(
                namespace,
                localNode,
                speedUpGossipSize,
                seedNodeFailFast,
                failureDetector,
                gossipListener
            );

            // Prepare gossip communication manager.
            NetworkConnector connector = net.connector(PROTOCOL_ID);

            commMgr = new GossipCommManager(connector, localNode.address(), new GossipCommListener() {
                @Override
                public void onReceive(GossipProtocol msg) {
                    process(msg);
                }

                @Override
                public void onSendFailure(GossipProtocol msg, Throwable error) {
                    processSendFailure(msg, error);
                }

                @Override
                public void onConnectFailure(ClusterAddress node) {
                    processConnectFailure(node);
                }

                @Override
                public Optional onBeforeSend(GossipProtocol msg) {
                    return gossipListener.onBeforeSend(msg);
                }
            });

            // Prepare split-brain manager (fatal split-brain resolution errors terminate the node).
            splitBrain.initialize(
                localNode,
                serviceThread,
                this::onFatalError
            );

            // Prepare metrics sink.
            metrics = new ClusterMetricsSink(ctx.metrics());

            // Register JMX beans (optional).
            if (jmx != null) {
                jmx.register(this);

                jmx.register(failureDetector);
                jmx.register(seedNodeProvider);

                if (splitBrain.detector() != null) {
                    jmx.register(splitBrain.detector());
                }
            }
        });

        if (DEBUG) {
            log.debug("Initialized.");
        }
    }

    @Override
    public void report(ConfigReporter report) {
        // Dump the effective cluster configuration under a dedicated section.
        report.section("cluster", section -> {
            section.value("namespace", namespace);
            section.value("gossip-interval", gossipInterval);
            section.value("speed-up-gossip-size", speedUpGossipSize);
            section.value("failure-detector", failureDetector);
            section.value("split-brain", splitBrain);
            section.value("seed-node-fail-fast", seedNodeFailFast);
            section.value("seed-node-provider", seedNodeProvider);
        });
    }

    /**
     * Asynchronously starts the cluster join sequence on the service thread.
     *
     * <p>The join task re-schedules itself (every {@code gossipInterval} ms) while a split-brain
     * condition is detected or seed node discovery fails, and silently aborts if the service is
     * concurrently left/terminated.</p>
     */
    @Override
    public void joinAsync() {
        guard.withReadLockAndStateCheck(() ->
            runOnServiceThread(() -> {
                if (guard.isInitialized()) {
                    if (log.isInfoEnabled()) {
                        log.info("Joining cluster [namespace={}, local-node={}]", namespace, ctx.localNode());
                    }

                    // Prepare a repeatable join task to re-run in case of a recoverable failure during the join process.
                    Runnable repeatableJoinTask = new Runnable() {
                        @Override
                        public void run() {
                            ClusterAddress address;
                            SeedNodeManager localSeedNodeMgr;

                            guard.lockRead();

                            try {
                                // Check that there were no concurrent leave/terminate events.
                                if (guard.isInitialized()) {
                                    address = localNode.address();
                                    localSeedNodeMgr = seedNodeMgr;
                                } else {
                                    // Stop since there was a concurrent leave/terminate event.
                                    return;
                                }
                            } finally {
                                guard.unlockRead();
                            }

                            try {
                                // Check if node is not in a split-brain mode before trying to join.
                                if (!splitBrain.check()) {
                                    // Try to schedule a new join attempt.
                                    guard.withReadLockIfInitialized(() -> {
                                        log.warn("Split-brain detected ...will wait for {} ms before making another attempt "
                                            + "[split-brain-detector={}]", gossipInterval, splitBrain.detector());

                                        serviceThread.schedule(this, gossipInterval, TimeUnit.MILLISECONDS);
                                    });

                                    return;
                                }

                                // Start discovery of seed nodes.
                                try {
                                    localSeedNodeMgr.startDiscovery(address.socket());
                                } catch (Exception e) {
                                    // Try to schedule a new join attempt.
                                    boolean scheduled = guard.withReadLockIfInitialized(() -> {
                                        log.error("Failed to start seed nodes discovery "
                                            + "...will wait for {}ms before making another attempt.", gossipInterval, e);

                                        serviceThread.schedule(this, gossipInterval, TimeUnit.MILLISECONDS);
                                    });

                                    if (!scheduled) {
                                        // Make sure that seed nodes discovery is stopped in case of concurrent service termination.
                                        localSeedNodeMgr.stopDiscovery(address.socket());
                                    }

                                    return;
                                }

                                // Initialize failure detector.
                                if (DEBUG) {
                                    log.debug("Initializing failure detector [address={}]", address);
                                }

                                failureDetector.initialize(() -> address);

                                if (DEBUG) {
                                    log.debug("Initialized failure detector [address={}]", address);
                                }

                                // Schedule asynchronous join task.
                                if (!scheduleAsyncJoin()) {
                                    if (DEBUG) {
                                        log.debug("Stopped initialization sequence due to a concurrent leave/terminate event.");
                                    }

                                    // Make sure that seed nodes discovery is stopped.
                                    localSeedNodeMgr.stopDiscovery(address.socket());

                                    // Make sure that failure detector is terminated.
                                    try {
                                        failureDetector.terminate();
                                    } catch (Throwable e) {
                                        log.error("Got an unexpected runtime error during the failure detector termination.", e);
                                    }
                                }
                            } catch (Throwable e) {
                                // Any unexpected error aborts the node's initialization entirely.
                                ctx.terminate(e);
                            }
                        }
                    };

                    repeatableJoinTask.run();
                }
            })
        );
    }

    @Override
    public void leaveAsync() {
        // Schedule the leave sequence on the gossip thread (no-op if the service is not initialized).
        guard.withReadLockIfInitialized(() ->
            runOnGossipThread(() -> doLeave())
        );
    }

    /**
     * Terminates this service: stops all managers, worker threads, the communication manager and
     * the failure detector, clears all mutable state, and blocks until the shutdown completes.
     *
     * @throws HekateException If termination fails.
     */
    @Override
    public void terminate() throws HekateException {
        if (DEBUG) {
            log.debug("Terminating.");
        }

        Waiting done = guard.becomeTerminated(() -> {
            acceptMgr.terminate();
            splitBrain.terminate();

            // Collects asynchronous shutdown handles to await on after the state switch.
            List waiting = new ArrayList<>();

            if (seedNodeMgr != null) {
                waiting.add(seedNodeMgr.stopCleaning());

                // Capture locals: fields are nulled out below, but the lambda runs later.
                InetSocketAddress localAddress = localNode.socket();
                SeedNodeManager localSeedNodeMgr = seedNodeMgr;

                waiting.add(() ->
                    localSeedNodeMgr.stopDiscovery(localAddress)
                );
            }

            waiting.add(shutdown(gossipThread));
            waiting.add(shutdown(serviceThread));

            if (commMgr != null) {
                GossipCommManager localCommMgr = commMgr;

                waiting.add(localCommMgr::stop);
            }

            if (failureDetector != null) {
                waiting.add(failureDetector::terminate);
            }

            localNodeIdRef.set(null);

            // Clear all mutable state that was created in initialize(...).
            localNode = null;
            commMgr = null;
            gossipMgr = null;
            acceptMgr = null;
            serviceThread = null;
            gossipThread = null;
            seedNodeMgr = null;
            metrics = null;

            return waiting;
        });

        // Block until all of the collected shutdown actions complete.
        done.awaitUninterruptedly();

        if (DEBUG) {
            log.debug("Terminated.");
        }
    }

    @Override
    public ClusterTopology topology() {
        // Delegates to the cluster context; requireContext() (defined elsewhere in this class)
        // presumably fails when the service is not initialized — see localNode() for the analogous check.
        return requireContext().cluster().topology();
    }

    @Override
    public ClusterView filterAll(ClusterFilter filter) {
        ArgAssert.notNull(filter, "Filter");

        // Wrap this service with a view that applies the filter on topology access.
        FilteredClusterView filtered = new FilteredClusterView(this, filter);

        return filtered;
    }

    @Override
    public void addListener(ClusterEventListener listener) {
        // Explicitly-typed null array disambiguates the varargs overload.
        ClusterEventType[] noTypes = null;

        addListener(listener, noTypes);
    }

    @Override
    public void addListener(ClusterEventListener listener, ClusterEventType... eventTypes) {
        ArgAssert.notNull(listener, "Listener");

        guard.withReadLock(() -> {
            if (!guard.isInitialized()) {
                // Not initialized yet: remember the listener so that initialize(...) can register it.
                deferredListeners.add(new DeferredClusterListener(listener, eventTypes));
            } else {
                // Already initialized: register directly with the live cluster context.
                requireContext().cluster().addListener(listener, eventTypes);
            }
        });
    }

    @Override
    public void removeListener(ClusterEventListener listener) {
        ArgAssert.notNull(listener, "Listener");

        guard.withReadLock(() -> {
            // When initialized, unregister from the live cluster context.
            if (guard.isInitialized()) {
                requireContext().cluster().removeListener(listener);
            }

            // Always drop a matching deferred registration. Equality presumably ignores event types
            // (null is passed here) — TODO confirm against DeferredClusterListener.equals().
            deferredListeners.remove(new DeferredClusterListener(listener, null));
        });
    }

    /**
     * Returns a value that is cached on a per-topology basis.
     *
     * <p>NOTE(review): the {@code <T>} type parameter and the {@code Function} type arguments are
     * restored here — they were stripped during source extraction (the original text
     * {@code public  T topologyContext(Function supplier)} is not valid Java).</p>
     *
     * @param supplier Factory for the cached value (applied to the current topology; presumably only
     * on a cache miss — TODO confirm against TopologyContextCache).
     * @param <T> Cached value type.
     *
     * @return Value that is bound to the current topology.
     */
    @Override
    public <T> T topologyContext(Function<ClusterTopology, T> supplier) {
        return ctxCache.get(topology(), supplier);
    }

    /**
     * Returns a future that completes with the first topology matching the specified predicate.
     *
     * <p>The future cancels itself on a {@link ClusterEventType#LEAVE} event and unregisters its
     * listener once completed. If the service is not initialized yet, the listener registration is
     * deferred until initialization.</p>
     *
     * @param predicate Topology predicate.
     *
     * @return Future of the first matching topology.
     */
    @Override
    public CompletableFuture futureOf(Predicate predicate) {
        ArgAssert.notNull(predicate, "Predicate");

        return guard.withReadLock(() -> {
            // Completable future that completes upon a cluster event with the matching topology.
            class PredicateFuture extends CompletableFuture implements ClusterEventListener {
                public PredicateFuture() {
                    // Unregister listener when this future gets completed.
                    whenComplete((topology, err) ->
                        removeListener(this)
                    );
                }

                @Override
                public void onEvent(ClusterEvent event) throws HekateException {
                    if (!isDone()) {
                        if (predicate.test(event.topology())) {
                            complete(event.topology());
                        } else if (event.type() == ClusterEventType.LEAVE) {
                            // Local node left the cluster: the predicate can never match anymore.
                            cancel(false);
                        }
                    }
                }
            }

            PredicateFuture future = new PredicateFuture();

            if (guard.isInitialized()) {
                requireContext().cluster().addListenerAsync(future);
            } else {
                // Defer listener registration until the service gets initialized.
                deferredListeners.add(new DeferredClusterListener(future, null));
            }

            return future;
        });
    }

    @Override
    public boolean awaitFor(Predicate predicate) {
        // Effectively unbounded wait.
        long unbounded = Long.MAX_VALUE;

        return awaitFor(predicate, unbounded, TimeUnit.NANOSECONDS);
    }

    /**
     * Blocks until the cluster topology matches the specified predicate (or the timeout elapses).
     *
     * <p>Fix: the original multi-catch swallowed {@link InterruptedException} without restoring the
     * thread's interrupt status; the interrupt flag is now re-set so that callers can detect it.</p>
     *
     * @param predicate Topology predicate.
     * @param timeout Maximum time to wait.
     * @param timeUnit Unit of the {@code timeout} argument.
     *
     * @return {@code true} if a matching topology was observed; {@code false} on timeout, interruption,
     * cancellation, error, or if the service is not initialized.
     */
    @Override
    public boolean awaitFor(Predicate<ClusterTopology> predicate, long timeout, TimeUnit timeUnit) {
        ArgAssert.notNull(predicate, "Predicate");

        // Fast try against the current topology.
        ClusterTopology immediateTopology = tryTopology(predicate);

        if (immediateTopology != null) {
            // Complete immediately.
            return true;
        } else {
            // Await via future object.
            Future<ClusterTopology> future = guard.withReadLock(() ->
                guard.isInitialized() ? futureOf(predicate) : null
            );

            if (future == null) {
                // Service is not initialized.
                return false;
            } else {
                try {
                    future.get(timeout, timeUnit);

                    return true;
                } catch (InterruptedException e) {
                    // Restore the interrupt status so that callers can detect the interruption.
                    Thread.currentThread().interrupt();

                    // Notify that this future is not needed anymore.
                    future.cancel(false);

                    return false;
                } catch (TimeoutException e) {
                    // Notify that this future is not needed anymore.
                    future.cancel(false);

                    return false;
                } catch (CancellationException | ExecutionException e) {
                    return false;
                }
            }
        }
    }

    @Override
    public String namespace() {
        // Cluster namespace as configured via ClusterServiceFactory#getNamespace().
        return namespace;
    }

    @Override
    public ClusterNode localNode() {
        // Single volatile read (the field is cleared on terminate).
        ClusterNode current = this.localNode;

        if (current != null) {
            return current;
        }

        throw new IllegalStateException(ClusterService.class.getSimpleName() + " is not initialized.");
    }

    /** Seed node provider used for cluster discovery. */
    public SeedNodeProvider seedNodeProvider() {
        return seedNodeProvider;
    }

    /** Failure detector of this service. */
    public FailureDetector failureDetector() {
        return failureDetector;
    }

    /** Split-brain detector ({@code null} if none is configured — see the null check in initialize(...)). */
    public SplitBrainDetector splitBrainDetector() {
        return splitBrain.detector();
    }

    /** Unmodifiable view of the registered join acceptors. */
    public List acceptors() {
        return unmodifiableList(acceptors);
    }

    @Override
    public ClusterServiceJmx jmx() {
        // New JMX facade on every call (no caching).
        return new DefaultClusterServiceJmx(this);
    }

    /**
     * Schedules the periodic gossip, heartbeat and join tasks.
     *
     * @return {@code true} if the tasks were scheduled; {@code false} if the service is no longer
     * initialized (consistent with how withReadLockIfInitialized's boolean result is used in joinAsync()).
     */
    private boolean scheduleAsyncJoin() {
        return guard.withWriteLockIfInitialized(() -> {
            if (DEBUG) {
                log.debug("Scheduling a periodic gossip task [interval={}]", gossipInterval);
            }

            // Schedule gossip task.
            gossipTask = scheduleOn(gossipThread, DefaultClusterService.this::gossip, gossipInterval);

            // Schedule heartbeat task (skipped when the failure detector doesn't use heartbeats).
            long hbInterval = failureDetector.heartbeatInterval();

            if (hbInterval > 0) {
                if (DEBUG) {
                    log.debug("Scheduling a periodic heartbeat task [interval={}]", hbInterval);
                }

                heartbeatTask = scheduleOn(gossipThread, DefaultClusterService.this::heartbeat, hbInterval);
            }

            if (DEBUG) {
                log.debug("Scheduling an asynchronous join task [interval={}]", gossipInterval);
            }

            // Schedule the asynchronous join task.
            // NOTE(review): this scheduleOn overload takes an extra leading argument (presumably an
            // initial delay of 0) — confirm against the scheduleOn definition (outside this view).
            joinTask = scheduleOn(serviceThread, 0, gossipInterval, DefaultClusterService.this::doJoin);
        });
    }

    /** Resolves the current seed nodes and hands the actual join attempt over to the gossip thread. */
    private void doJoin() {
        guard.withReadLockIfInitialized(() -> {
            try {
                // Resolve the current set of seed node addresses (may fail, e.g. on discovery errors).
                List nodes = seedNodeMgr.getSeedNodes();

                // Perform the join attempt on the gossip thread.
                runOnGossipThread(() ->
                    guard.withReadLockIfInitialized(() -> {
                        JoinRequest request = gossipMgr.join(nodes);

                        // A null request means that no join message should be sent right now.
                        if (request != null) {
                            sendAndDisconnect(request);
                        }
                    })
                );
            } catch (Exception e) {
                log.error("Failed to find seed nodes ...will wait for {} ms before making another attempt.", gossipInterval, e);
            }
        });
    }

    /** Starts the cluster leave sequence (must run on the gossip thread). */
    private void doLeave() {
        guard.withReadLockIfInitialized(() -> {
            UpdateBase leaveMsg = gossipMgr.leave();

            if (leaveMsg != null) {
                if (log.isInfoEnabled()) {
                    log.info("Leaving cluster...");
                }

                send(leaveMsg);
            } else {
                // Gossip manager decided that the full leave protocol is not required.
                ctx.cluster().onLeave();
            }
        });
    }

    /** Pushes gossip updates to a batch of nodes chosen by the RANDOM_PREFER_UNSEEN policy. */
    private void gossip() {
        guard.withReadLockIfInitialized(() -> {
            gossipMgr.batchGossip(GossipPolicy.RANDOM_PREFER_UNSEEN).forEach(this::send);
        });
    }

    /**
     * Periodic heartbeat tick: checks nodes' aliveness, sends heartbeat requests to the targets
     * selected by the failure detector, and gossips immediately if new failures were detected.
     *
     * <p>NOTE(review): the element type of {@code targets} is restored to {@link ClusterAddress}
     * (it was stripped during source extraction, which left a raw-typed stream that cannot compile —
     * the lambda parameter would be {@code Object}).</p>
     */
    private void heartbeat() {
        guard.withReadLockIfInitialized(() -> {
            // Check nodes' aliveness.
            boolean failureDetected = gossipMgr.checkAliveness();

            // Send heartbeats first (even if new failures were detected).
            Collection<ClusterAddress> targets = failureDetector.heartbeatTick();

            if (targets != null) {
                for (ClusterAddress to : targets) {
                    send(new HeartbeatRequest(localNode.address(), to));
                }
            }

            // Send gossip messages if new failures were detected.
            if (failureDetected) {
                gossip();
            }
        });
    }

    /**
     * Entry point for inbound gossip protocol messages (invoked from the network callback).
     *
     * <p>Heartbeat requests/replies are handled inline; all other message types are re-dispatched
     * to the gossip thread via {@link #doProcess(GossipProtocol)}. Gossip messages that are not
     * addressed to the local node are ignored.</p>
     *
     * @param msg Inbound message.
     */
    private void process(GossipProtocol msg) {
        guard.withReadLockIfInitialized(() -> {
            metrics.onGossipMessage(msg.type());

            // Ignore gossip messages that are addressed to some other (possibly former) node.
            if (msg instanceof GossipMessage) {
                GossipMessage gossipMsg = (GossipMessage)msg;

                if (!localNode.address().equals(gossipMsg.to())) {
                    if (DEBUG) {
                        log.debug("Ignored message since it is not addressed to the local node [message={}, node={}]", msg, localNode);
                    }

                    return;
                }
            }

            GossipProtocol.Type type = msg.type();

            if (type == GossipProtocol.Type.HEARTBEAT_REQUEST) {
                // The failure detector decides whether a reply should be sent.
                boolean reply = failureDetector.onHeartbeatRequest(msg.from());

                if (reply) {
                    send(new HeartbeatReply(localNode.address(), msg.from()));
                }
            } else if (type == GossipProtocol.Type.HEARTBEAT_REPLY) {
                failureDetector.onHeartbeatReply(msg.from());
            } else {
                // All other message types are processed on the gossip thread.
                runOnGossipThread(() ->
                    doProcess(msg)
                );
            }
        });
    }

    /**
     * Processes a gossip protocol message on the gossip thread.
     *
     * <p>Join requests are validated asynchronously via the accept manager so that custom acceptors
     * can't block the gossip thread; all other supported message types are handled synchronously.
     * Heartbeat/connect messages must never reach this method (they are filtered out by
     * {@link #process(GossipProtocol)}).</p>
     *
     * @param msg Message to process.
     */
    private void doProcess(GossipProtocol msg) {
        guard.withReadLockIfInitialized(() -> {
            switch (msg.type()) {
                case GOSSIP_UPDATE:
                case GOSSIP_UPDATE_DIGEST: {
                    UpdateBase update = (UpdateBase)msg;

                    GossipMessage reply = gossipMgr.processUpdate(update);

                    send(reply);

                    break;
                }
                case JOIN_REQUEST: {
                    JoinRequest request = (JoinRequest)msg;

                    // Check that the join request can be accepted.
                    JoinReject reject = gossipMgr.acceptJoinRequest(request);

                    if (reject == null) {
                        // Asynchronously validate and process the join request so that it won't block the gossiping thread.
                        acceptMgr.check(request.fromNode(), ctx.hekate()).thenAcceptAsync(rejectReason -> {
                            try {
                                guard.withReadLockIfInitialized(() -> {
                                    JoinReply reply;

                                    if (rejectReason.isPresent()) {
                                        reply = gossipMgr.reject(request, rejectReason.get());

                                    } else {
                                        reply = gossipMgr.processJoinRequest(request);
                                    }

                                    send(reply);
                                });
                            } catch (Throwable e) {
                                // Errors in the acceptance callback are fatal for the local node.
                                onFatalError(e);
                            }
                        }, gossipThread);
                    } else {
                        // Immediate reject.
                        send(reject);
                    }

                    break;
                }
                case JOIN_ACCEPT: {
                    JoinAccept accept = (JoinAccept)msg;

                    GossipMessage reply = gossipMgr.processJoinAccept(accept);

                    send(reply);

                    break;
                }
                case JOIN_REJECT: {
                    JoinReject reject = (JoinReject)msg;

                    // Try to select another node to join.
                    JoinRequest newRequest = gossipMgr.processJoinReject(reject);

                    sendAndDisconnect(newRequest);

                    break;
                }
                case HEARTBEAT_REQUEST:
                case HEARTBEAT_REPLY:
                case LONG_TERM_CONNECT:
                default: {
                    throw new IllegalArgumentException("Unexpected message type: " + msg);
                }
            }
        });
    }

    /**
     * Handles a failure to send a gossip message.
     *
     * <p>Join requests get special treatment: the failure is re-processed on the gossiping thread so that another
     * seed node can be tried (see {@code processJoinSendFailure(...)}). All other message types are merely logged,
     * since regular gossip rounds are expected to recover on their own.</p>
     *
     * @param msg Message that couldn't be sent.
     * @param error Cause of the send failure.
     */
    private void processSendFailure(GossipProtocol msg, Throwable error) {
        guard.withReadLockIfInitialized(() -> {
            if (msg.type() == GossipProtocol.Type.JOIN_REQUEST) {
                JoinRequest request = (JoinRequest)msg;

                runOnGossipThread(() ->
                    processJoinSendFailure(request, error)
                );
            } else {
                if (DEBUG) {
                    // 'error.toString()' (not the full stack trace) keeps the debug output compact.
                    // Fixed typo in the log message: "sent" -> "send".
                    log.debug("Failed to send gossip message [cause={}, message={}]", error.toString(), msg);
                }
            }
        });
    }

    /**
     * Notifies the failure detector about a failed outbound connection attempt.
     *
     * <p>No-op if the service is not initialized (guarded by the read lock).</p>
     *
     * @param address Address that couldn't be connected to.
     */
    private void processConnectFailure(ClusterAddress address) {
        guard.withReadLockIfInitialized(() -> failureDetector.onConnectFailure(address));
    }

    /**
     * Processes a failure to send a join request by asking the gossip manager for the next seed node to try.
     *
     * @param msg Join request that couldn't be sent.
     * @param cause Cause of the send failure.
     */
    private void processJoinSendFailure(JoinRequest msg, Throwable cause) {
        guard.withReadLockIfInitialized(() -> {
            // Defensive type check (a JoinRequest presumably always reports JOIN_REQUEST — preserved from
            // the original implementation).
            if (msg.type() != GossipProtocol.Type.JOIN_REQUEST) {
                return;
            }

            if (DEBUG) {
                log.debug("Processing join message send failure notification [message={}]", msg);
            }

            // Null when there are no more seed nodes to try (sendAndDisconnect(..) ignores null).
            JoinRequest retry = gossipMgr.processJoinFailure(msg, cause);

            sendAndDisconnect(retry);
        });
    }

    /**
     * Creates the {@link GossipListener} that reacts to gossip-layer events: join rejection, local node status
     * changes, topology changes, node failure (un)suspicion and gossip inconsistencies.
     *
     * <p>Callbacks re-check initialization via the service guard's read lock before touching service state, since
     * several of them are invoked asynchronously (on the gossip or service thread).</p>
     *
     * @return New listener instance.
     */
    private GossipListener createGossipListener() {
        return new GossipListener() {
            // Volatile since can be accessed by a different thread in seed node cleaner.
            private volatile Set knownAddresses = emptySet();

            @Override
            public void onJoinReject(ClusterAddress rejectedBy, String reason) {
                // Terminate only while still joining; a late reject after joining completed is ignored.
                if (ctx.state() == Hekate.State.JOINING) {
                    ctx.terminate(new ClusterRejectedJoinException(reason, rejectedBy));
                }
            }

            @Override
            public void onStatusChange(GossipNodeStatus oldStatus, GossipNodeStatus newStatus, int order, Set newTopology) {
                if (DEBUG) {
                    log.debug("Processing gossip manager status change [old={}, new={}, order={}, topology={}]",
                        oldStatus, newStatus, order, newTopology);
                }

                // NOTE(review): gossipSpy looks like an optional test/diagnostic hook — confirm.
                if (gossipSpy != null) {
                    gossipSpy.onStatusChange(oldStatus, newStatus, order, newTopology);
                }

                switch (newStatus) {
                    case JOINING: {
                        // We contacted a seed node, so periodic join retries and seed node
                        // discovery are no longer needed.
                        if (DEBUG) {
                            log.debug("Cancelling a periodic join task.");
                        }

                        joinTask.cancel(false);

                        runOnServiceThread(() ->
                            guard.withReadLockIfInitialized(() ->
                                seedNodeMgr.suspendDiscovery()
                            )
                        );

                        break;
                    }
                    case UP: {
                        // Local node joined the cluster: fire the join event and, once delivered,
                        // start the coordinator-only seed node cleaner and split-brain checks.
                        ctx.cluster().onJoin(order, newTopology).thenAcceptAsync(event -> {
                            // Event can be null (presumably when the join got superseded — TODO confirm).
                            if (event != null) {
                                try {
                                    guard.withReadLockIfInitialized(() -> {
                                        ClusterTopology topology = event.topology();

                                        metrics.onTopologyChange(topology);

                                        // Seed node cleaning is a coordinator-only duty.
                                        if (isCoordinator(topology)) {
                                            startSeedNodeCleaner();
                                        }

                                        startPeriodicSplitBrainChecks();
                                    });
                                } catch (Throwable e) {
                                    onFatalError(e);
                                }
                            }
                        }, gossipThread);

                        break;
                    }
                    case LEAVING: {
                        // No-op.
                        break;
                    }
                    case FAILED:
                    case DOWN: {
                        // Finalize graceful leave: stop periodic tasks, flush the final gossip
                        // updates and only then notify the context that leaving is complete.
                        if (ctx.state() == Hekate.State.LEAVING) {
                            if (DEBUG) {
                                log.debug("Stopping periodic gossiping.");
                            }

                            gossipTask.cancel(false);

                            if (DEBUG) {
                                log.debug("Stopping periodic heartbeats.");
                            }

                            // Heartbeat task may never have been scheduled.
                            if (heartbeatTask != null) {
                                heartbeatTask.cancel(false);
                            }

                            Collection msgs = gossipMgr.batchGossip(GossipPolicy.ON_DOWN);

                            if (msgs.isEmpty()) {
                                ctx.cluster().onLeave();
                            } else {
                                // Send final gossip updates and notify context on leave once sending is done.
                                AtomicInteger enqueued = new AtomicInteger(msgs.size());

                                msgs.forEach(msg ->
                                    send(msg, () -> {
                                        // Last completed send triggers the leave notification.
                                        if (enqueued.decrementAndGet() == 0) {
                                            runOnServiceThread(() ->
                                                ctx.cluster().onLeave()
                                            );
                                        }
                                    })
                                );
                            }
                        }

                        break;
                    }
                    default: {
                        throw new IllegalArgumentException("Unexpected status: " + newStatus);
                    }
                }
            }

            @Override
            public void onTopologyChange(Set oldTopology, Set newTopology, Set failed) {
                if (gossipSpy != null) {
                    gossipSpy.onTopologyChange(oldTopology, newTopology, failed);
                }

                // Fire the topology-change event; housekeeping runs on the gossip thread after delivery.
                ctx.cluster().onTopologyChange(newTopology, failed).thenAcceptAsync(event -> {
                    if (event != null) {
                        guard.withReadLockIfInitialized(() -> {
                            try {
                                ClusterTopology topology = event.topology();

                                metrics.onTopologyChange(topology);

                                // Seed node cleaning follows the coordinator role as it moves between nodes.
                                if (isCoordinator(topology)) {
                                    startSeedNodeCleaner();
                                } else {
                                    seedNodeMgr.stopCleaning();
                                }

                                // Node removals can indicate a network partition — re-check for split-brain.
                                if (!event.removed().isEmpty()) {
                                    splitBrain.checkAsync();
                                }
                            } catch (Throwable e) {
                                onFatalError(e);
                            }
                        });
                    }
                }, gossipThread);
            }

            @Override
            public void onKnownAddressesChange(Set oldAddresses, Set newAddresses) {
                // Cache the socket addresses for the seed node cleaner (read from another thread,
                // hence the volatile field and the unmodifiable snapshot).
                Set addresses = newAddresses.stream().map(ClusterAddress::socket).collect(toSet());

                knownAddresses = unmodifiableSet(addresses);
            }

            @Override
            public void onNodeFailureSuspected(ClusterNode failed, GossipNodeStatus status) {
                if (log.isWarnEnabled()) {
                    log.warn("Node failure suspected [address={}, status={}]", failed, status);
                }

                if (gossipSpy != null) {
                    gossipSpy.onNodeFailureSuspected(failed, status);
                }
            }

            @Override
            public void onNodeFailureUnsuspected(ClusterNode node, GossipNodeStatus status) {
                if (log.isWarnEnabled()) {
                    log.warn("Failure suspicion removed from node [address={}, status={}]", node, status);
                }

                if (gossipSpy != null) {
                    gossipSpy.onNodeFailureUnsuspected(node, status);
                }
            }

            @Override
            public void onNodeFailure(ClusterNode failed, GossipNodeStatus status) {
                if (log.isWarnEnabled()) {
                    log.warn("Removing failed node from cluster [address={}, status={}]", failed, status);
                }

                if (gossipSpy != null) {
                    gossipSpy.onNodeFailure(failed, status);
                }
            }

            @Override
            public void onNodeInconsistency(GossipNodeStatus gossipStatus) {
                if (gossipSpy != null) {
                    gossipSpy.onNodeInconsistency(gossipStatus);
                }

                // Reaction depends on the local node's lifecycle state: active states trigger
                // split-brain handling, LEAVING terminates, terminal states ignore.
                Hekate.State state = ctx.state();

                switch (state) {
                    case INITIALIZING:
                    case INITIALIZED:
                    case JOINING:
                    case SYNCHRONIZING:
                    case UP: {
                        splitBrain.notifyOnSplitBrain();

                        break;
                    }
                    case LEAVING: {
                        // Safe to terminate since we are leaving anyway.
                        ctx.terminate();

                        break;
                    }
                    case DOWN:
                    case TERMINATING: {
                        // No-op.
                        break;
                    }
                    default: {
                        throw new IllegalStateException("Unexpected cluster node state: " + state);
                    }
                }
            }

            @Override
            public Optional onBeforeSend(GossipProtocol msg) {
                // Gives the spy (if any) a chance to intercept/replace outgoing messages.
                if (gossipSpy != null) {
                    return gossipSpy.onBeforeSend(msg);
                }

                return Optional.empty();
            }

            // Starts cleaning of stale seed node entries, using the cached known addresses as the live set.
            private void startSeedNodeCleaner() {
                seedNodeMgr.startCleaning(net, () -> knownAddresses);
            }

            // Schedules periodic split-brain checks on the service thread (only if a detector is configured
            // and a positive check interval is set).
            private void startPeriodicSplitBrainChecks() {
                if (splitBrain.hasDetector() && splitBrain.checkInterval() > 0) {
                    scheduleOn(serviceThread, 0, splitBrain.checkInterval(), () ->
                        guard.withReadLockIfInitialized(splitBrain::checkAsync)
                    );
                }
            }
        };
    }

    /**
     * Sends a gossip message without a completion callback.
     *
     * @param msg Message to send (ignored if {@code null}).
     */
    private void send(GossipMessage msg) {
        this.send(msg, null);
    }

    /**
     * Sends a gossip message via the communication manager.
     *
     * @param msg Message to send (ignored if {@code null} — some gossip operations legitimately produce no reply).
     * @param onComplete Optional callback to run once the message has been sent.
     */
    private void send(GossipMessage msg, Runnable onComplete) {
        if (msg == null) {
            return;
        }

        commMgr.send(msg, onComplete);
    }

    /**
     * Sends a join request and disconnects afterwards, notifying the gossip manager when sending completes.
     *
     * @param join Join request to send (ignored if {@code null}, i.e. when there is no seed node left to try).
     */
    private void sendAndDisconnect(JoinRequest join) {
        if (join == null) {
            return;
        }

        if (log.isInfoEnabled()) {
            log.info("Sending cluster join request [seed-node={}].", join.toAddress());
        }

        // Completion is propagated to the gossip manager on the gossip thread,
        // but only if the service is still initialized.
        commMgr.sendAndDisconnect(join, () ->
            guard.withReadLockIfInitialized(() ->
                runOnGossipThread(() -> onSendComplete(join))
            )
        );
    }

    /**
     * Notifies the gossip manager that a join request has been sent.
     *
     * @param msg Join request that was sent.
     * @return Result of {@code gossipMgr.onSendComplete(..)} (or the guard's default if not initialized).
     */
    private boolean onSendComplete(JoinRequest msg) {
        return guard.withReadLockIfInitialized(() -> gossipMgr.onSendComplete(msg));
    }

    /**
     * Checks whether the local node is the cluster coordinator (i.e. the first node of the given topology).
     *
     * @param topology Topology to check.
     * @return {@code true} if the local node is the coordinator.
     */
    private boolean isCoordinator(ClusterTopology topology) {
        ClusterNode first = topology.first();

        return first.equals(localNode);
    }

    /**
     * Returns the initialization context, failing fast if the service hasn't been initialized.
     *
     * @return Initialization context (never {@code null}).
     *
     * @throws IllegalStateException If the service is not initialized.
     */
    private InitializationContext requireContext() {
        // Read the field once so that the null check and the return value use the same snapshot.
        InitializationContext snapshot = this.ctx;

        if (snapshot == null) {
            throw new IllegalStateException("Cluster service is not initialized.");
        }

        return snapshot;
    }

    /**
     * Returns the current topology if the service is initialized and the topology matches the given predicate.
     *
     * @param predicate Predicate to test the topology with.
     * @return Matching topology or {@code null}.
     */
    private ClusterTopology tryTopology(Predicate predicate) {
        // Snapshot the topology under the read lock; evaluate the predicate outside of it.
        ClusterTopology current = guard.withReadLock(() ->
            guard.isInitialized() ? topology() : null
        );

        return current != null && predicate.test(current) ? current : null;
    }

    /**
     * Asynchronously executes a task on the gossiping thread; any failure is treated as fatal for the local node.
     *
     * @param task Task to execute.
     */
    private void runOnGossipThread(Runnable task) {
        gossipThread.execute(() -> {
            try {
                task.run();
            } catch (Throwable err) {
                onFatalError(err);
            }
        });
    }

    /**
     * Asynchronously executes a task on the service thread; any failure is treated as fatal for the local node.
     *
     * @param task Task to execute.
     */
    private void runOnServiceThread(Runnable task) {
        serviceThread.execute(() -> {
            try {
                task.run();
            } catch (Throwable err) {
                onFatalError(err);
            }
        });
    }

    /**
     * Schedules a periodic task whose initial delay equals its repeat interval.
     *
     * @param executor Executor to schedule on.
     * @param task Task to run.
     * @param intervalMs Delay and interval in milliseconds.
     * @return Future of the scheduled task.
     */
    private ScheduledFuture scheduleOn(ScheduledExecutorService executor, Runnable task, long intervalMs) {
        return scheduleOn(executor, intervalMs, intervalMs, task);
    }

    /**
     * Schedules a periodic task with a fixed delay between executions; any task failure is treated as fatal
     * for the local node.
     *
     * @param executor Executor to schedule on.
     * @param delay Initial delay in milliseconds.
     * @param intervalMs Delay between executions in milliseconds.
     * @param task Task to run.
     * @return Future of the scheduled task.
     */
    private ScheduledFuture scheduleOn(ScheduledExecutorService executor, long delay, long intervalMs, Runnable task) {
        Runnable safeTask = () -> {
            try {
                task.run();
            } catch (Throwable err) {
                onFatalError(err);
            }
        };

        return executor.scheduleWithFixedDelay(safeTask, delay, intervalMs, TimeUnit.MILLISECONDS);
    }

    /**
     * Propagates a fatal error to the Hekate instance (no-op if the service is not initialized).
     *
     * @param e Error to propagate.
     */
    private void onFatalError(Throwable e) {
        // Read the field once to avoid a check-then-use race with termination.
        InitializationContext snapshot = this.ctx;

        if (snapshot != null) {
            snapshot.hekate().fatalError(e);
        }
    }

    /**
     * Returns the simple name of the {@link ClusterService} interface.
     */
    @Override
    public String toString() {
        return ClusterService.class.getSimpleName();
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy