com.palantir.atlasdb.keyvalue.cassandra.CassandraTopologyValidator
/*
 * (c) Copyright 2022 Palantir Technologies Inc. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.palantir.atlasdb.keyvalue.cassandra;

import com.github.rholder.retry.RetryException;
import com.github.rholder.retry.Retryer;
import com.github.rholder.retry.RetryerBuilder;
import com.github.rholder.retry.StopStrategies;
import com.github.rholder.retry.WaitStrategies;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Stopwatch;
import com.google.common.collect.Sets;
import com.palantir.atlasdb.CassandraTopologyValidationMetrics;
import com.palantir.atlasdb.cassandra.CassandraKeyValueServiceRuntimeConfig;
import com.palantir.atlasdb.cassandra.CassandraServersConfigs.ThriftHostsExtractingVisitor;
import com.palantir.atlasdb.keyvalue.cassandra.pool.CassandraServer;
import com.palantir.common.streams.KeyedStream;
import com.palantir.logsafe.Preconditions;
import com.palantir.logsafe.SafeArg;
import com.palantir.logsafe.exceptions.SafeIllegalStateException;
import com.palantir.logsafe.logger.SafeLogger;
import com.palantir.logsafe.logger.SafeLoggerFactory;
import com.palantir.refreshable.Refreshable;
import java.net.InetSocketAddress;
import java.time.Duration;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import one.util.streamex.EntryStream;
import org.apache.thrift.TApplicationException;
import org.immutables.value.Value;

public final class CassandraTopologyValidator {
    private static final SafeLogger log = SafeLoggerFactory.get(CassandraTopologyValidator.class);
    private final CassandraTopologyValidationMetrics metrics;
    private final AtomicReference<ConsistentClusterTopologies> pastConsistentTopologies;
    private final Supplier<Set<String>> configuredServers;

    private CassandraTopologyValidator(
            CassandraTopologyValidationMetrics metrics, Supplier<Set<String>> configuredServers) {
        this.metrics = metrics;
        this.pastConsistentTopologies = new AtomicReference<>();
        this.configuredServers = configuredServers;
    }

    public static CassandraTopologyValidator create(
            CassandraTopologyValidationMetrics metrics,
            Refreshable<CassandraKeyValueServiceRuntimeConfig> runtimeConfigRefreshable) {
        return new CassandraTopologyValidator(
                metrics,
                runtimeConfigRefreshable.map(
                        config -> config.servers().accept(ThriftHostsExtractingVisitor.INSTANCE).stream()
                                .map(InetSocketAddress::getHostString)
                                .collect(Collectors.toSet())));
    }

    @VisibleForTesting
    static CassandraTopologyValidator createForTests(
            CassandraTopologyValidationMetrics metrics, Supplier<Set<String>> configuredServers) {
        return new CassandraTopologyValidator(metrics, configuredServers);
    }

    /**
     * Checks a set of new Cassandra servers against the current Cassandra servers
     * to ensure their topologies are matching. This is done to prevent user-led split-brain,
     * which can occur if a user accidentally provided hostnames for two different Cassandra clusters.
     *
     * <p>This is done by coming to a consensus on the topology of the pre-existing hosts,
     * and then subsequently returning any new hosts which do not match the present topology.
     *
     * <p>Of course, there is the base case where all hosts are new. In this case, we simply check that all
     * new hosts are in consensus.
     *
     * <p>Servers that do not have support for the get_host_ids endpoint are always considered consistent,
     * even if we cannot come to a consensus on the hosts that do support the endpoint.
     *
     * <p>Consensus may be demonstrated independently by a set of nodes. In this case, we require that:
     * (1) A quorum of nodes (excluding those without `get_host_ids` support) are reachable.
     * (2) All reachable nodes have the same set of hostIds.
     * (3) All Cassandra nodes without get_host_ids support are considered to be matching.
     *
     * <p>The above should be sufficient to prevent user-led split-brain as:
     * (1) The initial list of servers validates that it has at least a quorum for consensus of topology.
     * (2) All new hosts added must then match the topology of the pre-existing hosts.
     *
     * <p>Consensus may also be demonstrated, and new hosts added, without a quorum of nodes being reachable, if:
     * (4) New hosts support get_host_ids, and have the same set of hostIds as the most recent previous consensus
     * satisfied through conditions (1) - (3).
     *
     * <p>In this case, we know that a previous set of servers had quorum for a consensus, which we are also agreeing
     * to. Since we aren't agreeing on any new values, the values that were agreed upon must have passed conditions
     * (1) - (3) at the time of their inception, and that required a quorum of nodes to agree.
     *
     * <p>There does exist an edge case of two sets of Cassandra clusters being added (3 and 6 nodes respectively).
     * On initialization, the Cassandra cluster with 6 nodes will be used as the base case if the other 3 nodes
     * are down, as this will satisfy quorum requirements. However, the cluster of 6 could be the wrong
     * cluster, which means we're reading/writing from the wrong cluster! Avoiding this would require checking
     * all nodes, which in turn means we could not handle Cassandra restarts, thus this is the best we can do.
     *
     * @param newlyAddedHosts Set of new Cassandra servers you wish to validate.
     * @param allHosts All Cassandra servers, which must include newlyAddedHosts.
     * @return Set of Cassandra servers which do not match the pre-existing hosts' topology. Servers without
     * the get_host_ids endpoint will never be returned here.
     */
    public Set<CassandraServer> getNewHostsWithInconsistentTopologiesAndRetry(
            Map<CassandraServer, CassandraServerOrigin> newlyAddedHosts,
            Map<CassandraServer, CassandraClientPoolingContainer> allHosts,
            Duration waitTimeBetweenCalls,
            Duration maxWaitTime) {
        Stopwatch stopwatch = Stopwatch.createStarted();
        Retryer<Set<CassandraServer>> retryer = RetryerBuilder.<Set<CassandraServer>>newBuilder()
                .retryIfResult(servers -> servers.size() == allHosts.size())
                .retryIfException()
                .withWaitStrategy(WaitStrategies.fixedWait(waitTimeBetweenCalls.toMillis(), TimeUnit.MILLISECONDS))
                .withStopStrategy(StopStrategies.stopAfterDelay(maxWaitTime.toMillis(), TimeUnit.MILLISECONDS))
                .build();
        Supplier<Set<CassandraServer>> inconsistentNewHosts =
                () -> getNewHostsWithInconsistentTopologies(newlyAddedHosts, allHosts);
        try {
            return retryer.call(inconsistentNewHosts::get);
        } catch (RetryException | ExecutionException e) {
            metrics.validationFailures().inc();
            log.error(
                    "Failed to obtain consistent view of hosts from cluster.",
                    SafeArg.of("newlyAddedCassandraHosts", newlyAddedHosts),
                    SafeArg.of("allCassandraHosts", allHosts.keySet()),
                    e);
            return inconsistentNewHosts.get();
        } finally {
            metrics.validationLatency().update(stopwatch.elapsed(TimeUnit.MILLISECONDS));
        }
    }

    @VisibleForTesting
    Set<CassandraServer> getNewHostsWithInconsistentTopologies(
            Map<CassandraServer, CassandraServerOrigin> newlyAddedHosts,
            Map<CassandraServer, CassandraClientPoolingContainer> allHosts) {
        Set<CassandraServer> newlyAddedHostsWithoutOrigin = newlyAddedHosts.keySet();
        if (newlyAddedHosts.isEmpty()) {
            return newlyAddedHostsWithoutOrigin;
        }
        Preconditions.checkArgument(
                allHosts.keySet().containsAll(newlyAddedHostsWithoutOrigin),
                "Newly added hosts must be a subset of all hosts, as otherwise we have no way to query them.",
                SafeArg.of("newlyAddedHosts", CassandraLogHelper.collectionOfHosts(newlyAddedHostsWithoutOrigin)),
                SafeArg.of("allHosts", CassandraLogHelper.collectionOfHosts(allHosts.keySet())));

        Map<CassandraServer, NonSoftFailureHostIdResult> hostIdsByServerWithoutSoftFailures =
                fetchHostIdsIgnoringSoftFailures(allHosts);

        Map<CassandraServer, NonSoftFailureHostIdResult> currentServersWithoutSoftFailures = EntryStream.of(
                        hostIdsByServerWithoutSoftFailures)
                .removeKeys(newlyAddedHosts::containsKey)
                .toMap();
        Map<CassandraServer, NonSoftFailureHostIdResult> newServersWithoutSoftFailures = EntryStream.of(
                        hostIdsByServerWithoutSoftFailures)
                .filterKeys(newlyAddedHosts::containsKey)
                .toMap();

        // This means that currently we have no pre-existing servers, or no pre-existing server with the
        // get_host_ids endpoint. Therefore, we need to come to a consensus on the new servers.
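        // Illustrative example with assumed numbers (not from the original source): if all three configured hosts
        // are brand new, currentServersWithoutSoftFailures is empty, so consensus is sought among the new hosts
        // themselves and needs (3 / 2) + 1 = 2 of them to be reachable and to present matching host IDs.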
        if (currentServersWithoutSoftFailures.isEmpty()) {
            ClusterTopologyResult topologyResultFromNewServers =
                    maybeGetConsistentClusterTopology(newServersWithoutSoftFailures);
            Set<String> configuredServersSnapshot = configuredServers.get();
            Map<CassandraServer, NonSoftFailureHostIdResult> newServersFromConfig = EntryStream.of(
                            newServersWithoutSoftFailures)
                    .filterKeys(server -> configuredServersSnapshot.contains(server.cassandraHostName()))
                    .toMap();
            return getNewHostsWithInconsistentTopologiesFromTopologyResult(
                    topologyResultFromNewServers,
                    newServersWithoutSoftFailures,
                    newServersFromConfig,
                    newlyAddedHostsWithoutOrigin,
                    allHosts.keySet());
        }

        // If a consensus can be reached from the current servers, filter all new servers which have the same set of
        // host ids. Accept dissent as such, but permit new servers if they are in quorum _and_ match the previously
        // accepted set of host IDs.
        ClusterTopologyResult topologyFromCurrentServers =
                maybeGetConsistentClusterTopology(currentServersWithoutSoftFailures);
        return getNewHostsWithInconsistentTopologiesFromTopologyResult(
                topologyFromCurrentServers,
                newServersWithoutSoftFailures,
                newServersWithoutSoftFailures,
                newlyAddedHosts.keySet(),
                allHosts.keySet());
    }

    private Set<CassandraServer> getNewHostsWithInconsistentTopologiesFromTopologyResult(
            ClusterTopologyResult topologyResult,
            Map<CassandraServer, NonSoftFailureHostIdResult> newServersWithoutSoftFailures,
            Map<CassandraServer, NonSoftFailureHostIdResult> serversToConsiderWhenNoQuorumPresent,
            Set<CassandraServer> newlyAddedHosts,
            Set<CassandraServer> allHosts) {
        switch (topologyResult.type()) {
            case CONSENSUS:
                Preconditions.checkState(
                        topologyResult.agreedTopologies().isPresent(),
                        "Expected to have one or more consistent topologies for a CONSENSUS result, but did not.");
                ConsistentClusterTopologies topologies =
                        topologyResult.agreedTopologies().get();
                pastConsistentTopologies.set(topologies);
                return EntryStream.of(newServersWithoutSoftFailures)
                        .removeValues(result -> result.type() == HostIdResult.Type.SUCCESS
                                && topologies.sharesAtLeastOneHostId(result.hostIds()))
                        .keys()
                        .toSet();
            case DISSENT:
                // In the event of *active* dissent, we want to hard fail.
                return newServersWithoutSoftFailures.keySet();
            case NO_QUORUM:
                // In the event of no quorum, we trust the new servers iff they agree with our historical knowledge
                // of what the old servers were thinking, since in containerised deployments all nodes can change
                // between refreshes for legitimate reasons (but they should still refer to the same underlying
                // cluster).
                ConsistentClusterTopologies pastTopologies = pastConsistentTopologies.get();
                if (pastTopologies == null) {
                    // We don't have a record of what worked in the past, and since this state means we're validating
                    // the initial config servers, we don't have another source of truth here.
                    return newServersWithoutSoftFailures.keySet();
                }
                Map<CassandraServer, NonSoftFailureHostIdResult> matchingServers = EntryStream.of(
                                serversToConsiderWhenNoQuorumPresent)
                        .filterValues(result -> result.type() == HostIdResult.Type.SUCCESS
                                && pastTopologies.sharesAtLeastOneHostId(result.hostIds()))
                        .toMap();
                if (matchingServers.isEmpty()) {
                    log.info(
                            "No quorum was detected in the original set of servers, and the filtered set of servers"
                                    + " did not include any servers which presented a plausible evolution of the"
                                    + " last agreed topology. Not adding new servers in this case.",
                            SafeArg.of("pastConsistentTopologies", pastTopologies),
                            SafeArg.of("newServers", CassandraLogHelper.collectionOfHosts(newlyAddedHosts)),
                            SafeArg.of("allServers", CassandraLogHelper.collectionOfHosts(allHosts)),
                            SafeArg.of("hostIdResults", serversToConsiderWhenNoQuorumPresent));
                    return newServersWithoutSoftFailures.keySet();
                }
                log.info(
                        "No quorum was detected among the original set of servers. Some servers in a filtered set of"
                                + " servers presented host IDs that were a plausible evolution of the last agreed"
                                + " value among the old servers. Adding new servers that were in consensus.",
                        SafeArg.of("pastConsistentTopologies", pastTopologies),
                        SafeArg.of("hostIdResults", serversToConsiderWhenNoQuorumPresent),
                        SafeArg.of("serversMatchingPastTopology", matchingServers),
                        SafeArg.of("newServers", CassandraLogHelper.collectionOfHosts(newlyAddedHosts)),
                        SafeArg.of("allServers", CassandraLogHelper.collectionOfHosts(allHosts)));
                ConsistentClusterTopologies mergedTopologies = pastTopologies.merge(matchingServers);
                pastConsistentTopologies.set(mergedTopologies);
                return Sets.difference(newServersWithoutSoftFailures.keySet(), matchingServers.keySet());
            default:
                throw new SafeIllegalStateException(
                        "Unexpected cluster topology result type", SafeArg.of("type", topologyResult.type()));
        }
    }

    /**
     * Obtains a consistent view of the cluster topology for the provided hosts.
     *
     * <p>This is achieved by comparing the hostIds (the list of UUIDs for each C* node) across all Cassandra nodes.
     * A quorum of C* nodes must be reachable, and all reachable nodes must have the same topology (hostIds), for
     * this to return a valid result. Nodes that are reachable but do not have support for our get_host_ids endpoint
     * are simply ignored and will not be filtered out.
     *
     * @param hostIdsByServerWithoutSoftFailures Cassandra hosts to obtain a consistent topology view from
     * @return If consensus could be reached, the topology and the list of valid hosts, otherwise empty.
     */
    private ClusterTopologyResult maybeGetConsistentClusterTopology(
            Map<CassandraServer, NonSoftFailureHostIdResult> hostIdsByServerWithoutSoftFailures) {
        // If all our queries fail due to soft failures, then our consensus is an empty set of host ids.
        if (hostIdsByServerWithoutSoftFailures.isEmpty()) {
            NodesAndSharedTopology emptySetOfHostIds = NodesAndSharedTopology.builder()
                    .hostIds(Set.of())
                    .serversInConsensus(hostIdsByServerWithoutSoftFailures.keySet())
                    .build();
            return ClusterTopologyResult.consensus(ConsistentClusterTopologies.builder()
                    .addNodesAndSharedTopologies(emptySetOfHostIds)
                    .build());
        }

        Map<CassandraServer, Set<String>> hostIdsWithoutFailures = EntryStream.of(hostIdsByServerWithoutSoftFailures)
                .filterValues(result -> result.type() == HostIdResult.Type.SUCCESS)
                .mapValues(NonSoftFailureHostIdResult::hostIds)
                .toMap();

        // Only consider hosts that have the endpoint for quorum calculations.
        // Otherwise, we will never add hosts when we're in a mixed state.
        int quorum = (hostIdsByServerWithoutSoftFailures.size() / 2) + 1;

        // If too many hosts are unreachable, then we cannot come to a consensus.
        if (hostIdsWithoutFailures.size() < quorum) {
            return ClusterTopologyResult.noQuorum();
        }

        Set<Set<String>> uniqueSetsOfHostIds =
                EntryStream.of(hostIdsWithoutFailures).values().toImmutableSet();
        if (HostIdEvolution.existsPlausibleEvolutionOfHostIdSets(uniqueSetsOfHostIds)) {
            Map<Set<String>, List<CassandraServer>> idsToServers =
                    EntryStream.of(hostIdsWithoutFailures).invert().grouping();
            return ClusterTopologyResult.consensus(ConsistentClusterTopologies.builder()
                    .nodesAndSharedTopologies(EntryStream.of(idsToServers)
                            .mapKeyValue((hostIds, servers) -> NodesAndSharedTopology.builder()
                                    .hostIds(hostIds)
                                    .serversInConsensus(servers)
                                    .build()))
                    .build());
        }
        return ClusterTopologyResult.dissent();
    }

    private Map<CassandraServer, NonSoftFailureHostIdResult> fetchHostIdsIgnoringSoftFailures(
            Map<CassandraServer, CassandraClientPoolingContainer> servers) {
        Map<CassandraServer, HostIdResult> results =
                EntryStream.of(servers).mapValues(this::fetchHostIds).toMap();

        if (KeyedStream.stream(results)
                .values()
                .anyMatch(result -> result.type() == HostIdResult.Type.SOFT_FAILURE
                        || result.type() == HostIdResult.Type.HARD_FAILURE)) {
            log.warn(
                    "While fetching host ids from hosts, some reported soft or hard failures.",
                    SafeArg.of("results", results));
        }

        return EntryStream.of(results)
                .removeValues(result -> result.type() == HostIdResult.Type.SOFT_FAILURE)
                .mapValues(NonSoftFailureHostIdResult::wrap)
                .toMap();
    }

    @VisibleForTesting
    HostIdResult fetchHostIds(CassandraClientPoolingContainer container) {
        try {
            return container.runWithPooledResource(
                    client -> HostIdResult.success(client.get_host_ids()));
        } catch (Exception e) {
            // If the get_host_ids API endpoint does not exist, then return a soft failure.
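            // Thrift signals a call to a method the remote server does not implement as a TApplicationException of
            // type UNKNOWN_METHOD, which is how a node lacking the get_host_ids endpoint surfaces here.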
            if (e instanceof TApplicationException) {
                TApplicationException applicationException = (TApplicationException) e;
                if (applicationException.getType() == TApplicationException.UNKNOWN_METHOD) {
                    return HostIdResult.softFailure();
                }
            }
            return HostIdResult.hardFailure();
        }
    }

    @Value.Immutable
    public interface ConsistentClusterTopologies {
        Set<NodesAndSharedTopology> nodesAndSharedTopologies();

        @Value.Derived
        default Set<String> hostIds() {
            return nodesAndSharedTopologies().stream()
                    .flatMap(topology -> topology.hostIds().stream())
                    .collect(Collectors.toSet());
        }

        @Value.Derived
        default Set<CassandraServer> serversInConsensus() {
            return nodesAndSharedTopologies().stream()
                    .flatMap(topology -> topology.serversInConsensus().stream())
                    .collect(Collectors.toSet());
        }

        default boolean sharesAtLeastOneHostId(Set<String> otherHostIds) {
            return !Sets.intersection(hostIds(), otherHostIds).isEmpty();
        }

        default ConsistentClusterTopologies merge(Map<CassandraServer, NonSoftFailureHostIdResult> additionalNodes) {
            Set<NodesAndSharedTopology> topologies = EntryStream.of(additionalNodes)
                    .mapKeyValue((server, result) -> NodesAndSharedTopology.builder()
                            .hostIds(result.hostIds())
                            .serversInConsensus(Set.of(server))
                            .build())
                    .collect(Collectors.toSet());
            Preconditions.checkArgument(
                    HostIdEvolution.existsPlausibleEvolutionOfHostIdSets(
                            Stream.of(nodesAndSharedTopologies(), topologies)
                                    .flatMap(Set::stream)
                                    .map(NodesAndSharedTopology::hostIds)
                                    .collect(Collectors.toSet())),
                    "Should not merge topologies that do not share at least one host id.");
            return ConsistentClusterTopologies.builder()
                    .nodesAndSharedTopologies(Sets.union(nodesAndSharedTopologies(), topologies))
                    .build();
        }

        static ImmutableConsistentClusterTopologies.Builder builder() {
            return ImmutableConsistentClusterTopologies.builder();
        }
    }

    @Value.Immutable
    public interface NodesAndSharedTopology {
        Set<String> hostIds();

        Set<CassandraServer> serversInConsensus();

        static ImmutableNodesAndSharedTopology.Builder builder() {
            return ImmutableNodesAndSharedTopology.builder();
        }
    }

    enum ClusterTopologyResultType {
        CONSENSUS,
        DISSENT,
        NO_QUORUM
    }

    @Value.Immutable
    public interface ClusterTopologyResult {
        ClusterTopologyResultType type();

        Optional<ConsistentClusterTopologies> agreedTopologies();

        static ClusterTopologyResult consensus(ConsistentClusterTopologies consistentClusterTopologies) {
            return ImmutableClusterTopologyResult.builder()
                    .type(ClusterTopologyResultType.CONSENSUS)
                    .agreedTopologies(consistentClusterTopologies)
                    .build();
        }

        static ClusterTopologyResult dissent() {
            return ImmutableClusterTopologyResult.builder()
                    .type(ClusterTopologyResultType.DISSENT)
                    .build();
        }

        static ClusterTopologyResult noQuorum() {
            return ImmutableClusterTopologyResult.builder()
                    .type(ClusterTopologyResultType.NO_QUORUM)
                    .build();
        }
    }
}
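A minimal caller-side sketch follows; it is not part of the published source. It assumes you already hold a CassandraTopologyValidationMetrics instance, a Refreshable carrying the Cassandra runtime config, and maps describing the candidate and existing hosts; metrics, runtimeConfig, newHosts and allHosts are placeholder names.

// Hypothetical usage sketch; metrics, runtimeConfig, newHosts and allHosts are assumed to exist elsewhere.
CassandraTopologyValidator validator = CassandraTopologyValidator.create(metrics, runtimeConfig);

// Ask which newly discovered hosts disagree with the topology agreed upon by the existing hosts,
// retrying every 5 seconds for up to 2 minutes while the cluster settles.
Set<CassandraServer> hostsToReject = validator.getNewHostsWithInconsistentTopologiesAndRetry(
        newHosts,  // map keyed by CassandraServer describing the hosts being added
        allHosts,  // Map<CassandraServer, CassandraClientPoolingContainer>, must contain every key of newHosts
        Duration.ofSeconds(5),
        Duration.ofMinutes(2));

// Hosts in hostsToReject should not be added to the connection pool.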




