All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.cassandra.net.StartupClusterConnectivityChecker Maven / Gradle / Ivy

Go to download

The Apache Cassandra Project develops a highly scalable second-generation distributed database, bringing together Dynamo's fully distributed design and Bigtable's ColumnFamily-based data model.

There is a newer version: 5.0.2
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cassandra.net;

import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.stream.Collectors;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.SetMultimap;
import com.google.common.util.concurrent.Uninterruptibles;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.cassandra.gms.EndpointState;
import org.apache.cassandra.gms.Gossiper;
import org.apache.cassandra.gms.IEndpointStateChangeSubscriber;
import org.apache.cassandra.locator.InetAddressAndPort;
import org.apache.cassandra.utils.CassandraVersion;
import org.apache.cassandra.utils.FBUtilities;

import static org.apache.cassandra.net.Verb.PING_REQ;
import static org.apache.cassandra.net.ConnectionType.LARGE_MESSAGES;
import static org.apache.cassandra.net.ConnectionType.SMALL_MESSAGES;

public class StartupClusterConnectivityChecker
{
    private static final Logger logger = LoggerFactory.getLogger(StartupClusterConnectivityChecker.class);

    private final boolean blockForRemoteDcs;
    private final long timeoutNanos;

    public static StartupClusterConnectivityChecker create(long timeoutSecs, boolean blockForRemoteDcs)
    {
        if (timeoutSecs > 100)
            logger.warn("setting the block-for-peers timeout (in seconds) to {} might be a bit excessive, but using it nonetheless", timeoutSecs);
        long timeoutNanos = TimeUnit.SECONDS.toNanos(timeoutSecs);

        return new StartupClusterConnectivityChecker(timeoutNanos, blockForRemoteDcs);
    }

    @VisibleForTesting
    StartupClusterConnectivityChecker(long timeoutNanos, boolean blockForRemoteDcs)
    {
        this.blockForRemoteDcs = blockForRemoteDcs;
        this.timeoutNanos = timeoutNanos;
    }

    /**
     * @param peers The currently known peers in the cluster; argument is not modified.
     * @param getDatacenterSource A function for mapping peers to their datacenter.
     * @return true if the requested percentage of peers are marked ALIVE in gossip and have their connections opened;
     * else false.
     */
    public boolean execute(Set peers, Function getDatacenterSource,
                           Predicate isUpgradingFromLowerVersionThan)
    {
        if (peers == null || this.timeoutNanos < 0)
            return true;

        // Check if there are any nodes which we know are running a version prior to 4.0.
        // We use this intead of Gossiper::hasMajorVersion3Nodes because in the absence of version information for a peer
        // we still prefer to run the startup connectivity check.
        if (isUpgradingFromLowerVersionThan.test(CassandraVersion.CASSANDRA_4_0))
        {
            logger.debug("Skipping startup connectivity check as some nodes may be running Cassandra version 3 or older " +
                         "which does not support connectivity checking.");
            return true;
        }

        // make a copy of the set, to avoid mucking with the input (in case it's a sensitive collection)
        peers = new HashSet<>(peers);
        InetAddressAndPort localAddress = FBUtilities.getBroadcastAddressAndPort();
        String localDc = getDatacenterSource.apply(localAddress);

        peers.remove(localAddress);
        if (peers.isEmpty())
            return true;

        // make a copy of the datacenter mapping (in case gossip updates happen during this method or some such)
        Map peerToDatacenter = new HashMap<>();
        SetMultimap datacenterToPeers = HashMultimap.create();

        for (InetAddressAndPort peer : peers)
        {
            String datacenter = getDatacenterSource.apply(peer);
            peerToDatacenter.put(peer, datacenter);
            datacenterToPeers.put(datacenter, peer);
        }

        // In the case where we do not want to block startup on remote datacenters (e.g. because clients only use
        // LOCAL_X consistency levels), we remove all other datacenter hosts from the mapping and we only wait
        // on the remaining local datacenter.
        if (!blockForRemoteDcs)
        {
            datacenterToPeers.keySet().retainAll(Collections.singleton(localDc));
            logger.info("Blocking coordination until only a single peer is DOWN in the local datacenter, timeout={}s",
                        TimeUnit.NANOSECONDS.toSeconds(timeoutNanos));
        }
        else
        {
            logger.info("Blocking coordination until only a single peer is DOWN in each datacenter, timeout={}s",
                        TimeUnit.NANOSECONDS.toSeconds(timeoutNanos));
        }

        AckMap acks = new AckMap(3);
        Map dcToRemainingPeers = new HashMap<>(datacenterToPeers.size());
        for (String datacenter: datacenterToPeers.keys())
        {
            dcToRemainingPeers.put(datacenter,
                                   new CountDownLatch(Math.max(datacenterToPeers.get(datacenter).size() - 1, 0)));
        }

        long startNanos = System.nanoTime();

        // set up a listener to react to new nodes becoming alive (in gossip), and account for all the nodes that are already alive
        Set alivePeers = Collections.newSetFromMap(new ConcurrentHashMap<>());
        AliveListener listener = new AliveListener(alivePeers, dcToRemainingPeers, acks, peerToDatacenter::get);
        Gossiper.instance.register(listener);

        // send out a ping message to open up the non-gossip connections to all peers. Note that this sends the
        // ping messages to _all_ peers, not just the ones we block for in dcToRemainingPeers.
        sendPingMessages(peers, dcToRemainingPeers, acks, peerToDatacenter::get);

        for (InetAddressAndPort peer : peers)
        {
            if (Gossiper.instance.isAlive(peer) && alivePeers.add(peer) && acks.incrementAndCheck(peer))
            {
                String datacenter = peerToDatacenter.get(peer);
                // We have to check because we might only have the local DC in the map
                if (dcToRemainingPeers.containsKey(datacenter))
                    dcToRemainingPeers.get(datacenter).countDown();
            }
        }

        boolean succeeded = true;
        for (CountDownLatch countDownLatch : dcToRemainingPeers.values())
        {
            long remainingNanos = Math.max(1, timeoutNanos - (System.nanoTime() - startNanos));
            //noinspection UnstableApiUsage
            succeeded &= Uninterruptibles.awaitUninterruptibly(countDownLatch, remainingNanos, TimeUnit.NANOSECONDS);
        }

        Gossiper.instance.unregister(listener);

        Map numDown = dcToRemainingPeers.entrySet().stream()
                                                      .collect(Collectors.toMap(Map.Entry::getKey,
                                                                                e -> e.getValue().getCount()));

        if (succeeded)
        {
            logger.info("Ensured sufficient healthy connections with {} after {} milliseconds",
                        numDown.keySet(), TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNanos));
        }
        else
        {
            logger.warn("Timed out after {} milliseconds, was waiting for remaining peers to connect: {}",
                        TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNanos), numDown);
        }

        return succeeded;
    }

    /**
     * Sends a "connection warmup" message to each peer in the collection, on every {@link ConnectionType}
     * used for internode messaging (that is not gossip).
     */
    private void sendPingMessages(Set peers, Map dcToRemainingPeers,
                                  AckMap acks, Function getDatacenter)
    {
        RequestCallback responseHandler = msg -> {
            if (acks.incrementAndCheck(msg.from()))
            {
                String datacenter = getDatacenter.apply(msg.from());
                // We have to check because we might only have the local DC in the map
                if (dcToRemainingPeers.containsKey(datacenter))
                    dcToRemainingPeers.get(datacenter).countDown();
            }
        };

        Message small = Message.out(PING_REQ, PingRequest.forSmall);
        Message large = Message.out(PING_REQ, PingRequest.forLarge);
        for (InetAddressAndPort peer : peers)
        {
            MessagingService.instance().sendWithCallback(small, peer, responseHandler, SMALL_MESSAGES);
            MessagingService.instance().sendWithCallback(large, peer, responseHandler, LARGE_MESSAGES);
        }
    }

    /**
     * A trivial implementation of {@link IEndpointStateChangeSubscriber} that really only cares about
     * {@link #onAlive(InetAddressAndPort, EndpointState)} invocations.
     */
    private static final class AliveListener implements IEndpointStateChangeSubscriber
    {
        private final Map dcToRemainingPeers;
        private final Set livePeers;
        private final Function getDatacenter;
        private final AckMap acks;

        AliveListener(Set livePeers, Map dcToRemainingPeers,
                      AckMap acks, Function getDatacenter)
        {
            this.livePeers = livePeers;
            this.dcToRemainingPeers = dcToRemainingPeers;
            this.acks = acks;
            this.getDatacenter = getDatacenter;
        }

        public void onAlive(InetAddressAndPort endpoint, EndpointState state)
        {
            if (livePeers.add(endpoint) && acks.incrementAndCheck(endpoint))
            {
                String datacenter = getDatacenter.apply(endpoint);
                if (dcToRemainingPeers.containsKey(datacenter))
                    dcToRemainingPeers.get(datacenter).countDown();
            }
        }
    }

    private static final class AckMap
    {
        private final int threshold;
        private final Map acks;

        AckMap(int threshold)
        {
            this.threshold = threshold;
            acks = new ConcurrentHashMap<>();
        }

        boolean incrementAndCheck(InetAddressAndPort address)
        {
            return acks.computeIfAbsent(address, addr -> new AtomicInteger(0)).incrementAndGet() == threshold;
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy