/*
* (c) Copyright 2022 Palantir Technologies Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.palantir.atlasdb.keyvalue.cassandra;
import com.github.rholder.retry.RetryException;
import com.github.rholder.retry.Retryer;
import com.github.rholder.retry.RetryerBuilder;
import com.github.rholder.retry.StopStrategies;
import com.github.rholder.retry.WaitStrategies;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Stopwatch;
import com.google.common.collect.Sets;
import com.palantir.atlasdb.CassandraTopologyValidationMetrics;
import com.palantir.atlasdb.cassandra.CassandraKeyValueServiceRuntimeConfig;
import com.palantir.atlasdb.cassandra.CassandraServersConfigs.ThriftHostsExtractingVisitor;
import com.palantir.atlasdb.keyvalue.cassandra.pool.CassandraServer;
import com.palantir.common.streams.KeyedStream;
import com.palantir.logsafe.Preconditions;
import com.palantir.logsafe.SafeArg;
import com.palantir.logsafe.exceptions.SafeIllegalStateException;
import com.palantir.logsafe.logger.SafeLogger;
import com.palantir.logsafe.logger.SafeLoggerFactory;
import com.palantir.refreshable.Refreshable;
import java.net.InetSocketAddress;
import java.time.Duration;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import one.util.streamex.EntryStream;
import org.apache.thrift.TApplicationException;
import org.immutables.value.Value;
public final class CassandraTopologyValidator {
private static final SafeLogger log = SafeLoggerFactory.get(CassandraTopologyValidator.class);
private final CassandraTopologyValidationMetrics metrics;
private final AtomicReference<ConsistentClusterTopologies> pastConsistentTopologies;
private final Supplier<Set<String>> configuredServers;
private CassandraTopologyValidator(
CassandraTopologyValidationMetrics metrics, Supplier<Set<String>> configuredServers) {
this.metrics = metrics;
this.pastConsistentTopologies = new AtomicReference<>();
this.configuredServers = configuredServers;
}
public static CassandraTopologyValidator create(
CassandraTopologyValidationMetrics metrics,
Refreshable<CassandraKeyValueServiceRuntimeConfig> runtimeConfigRefreshable) {
return new CassandraTopologyValidator(
metrics,
runtimeConfigRefreshable.map(
config -> config.servers().accept(ThriftHostsExtractingVisitor.INSTANCE).stream()
.map(InetSocketAddress::getHostString)
.collect(Collectors.toSet())));
}
@VisibleForTesting
static CassandraTopologyValidator createForTests(
CassandraTopologyValidationMetrics metrics, Supplier<Set<String>> configuredServers) {
return new CassandraTopologyValidator(metrics, configuredServers);
}
/**
* Checks a set of new Cassandra servers against the current Cassandra servers
* to ensure their topologies match. This is done to prevent user-led split-brain,
* which can occur if a user accidentally provides hostnames for two different Cassandra clusters.
*
* This is done by coming to a consensus on the topology of the pre-existing hosts,
* and then subsequently returning any new hosts which do not match the present topology.
*
* Of course, there is the base case where all hosts are new. In this case, we simply check that all
* new hosts are in consensus.
*
* Servers that do not have support for the get_host_ids endpoint are always considered consistent,
* even if we cannot come to a consensus on the hosts that do support the endpoint.
*
* Consensus may be demonstrated independently by a set of nodes. In this case, we require that:
* (1) A quorum of nodes (excluding those without `get_host_ids` support) are reachable.
* (2) All reachable nodes have the same set of hostIds.
* (3) All Cassandra nodes without get_host_ids support are considered to be matching.
*
* The above should be sufficient to prevent user-led split-brain as:
* (1) The initial list of servers is validated to have at least a quorum agreeing on the topology.
* (2) All hosts added afterwards must match the topology of those pre-existing hosts.
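*
* For example, suppose five pre-existing nodes are configured and three of them are reachable, each
* reporting host IDs {a, b, c}. Conditions (1) - (3) are satisfied, so a new node that also reports
* {a, b, c} is accepted, while a new node reporting a disjoint set such as {x, y, z} is returned as
* inconsistent.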
*
* Consensus may also be demonstrated and new hosts added without a quorum of nodes being reachable, if:
* (4) New hosts support get_host_ids, and have the same set of hostIds as the most recent previous consensus
* satisfied through conditions (1) - (3).
*
* In this case, we know that a previous set of servers had quorum for a consensus, which we are also agreeing to.
* Since we aren't agreeing on any new values, values that were agreed upon must have passed conditions (1) - (3)
* at the time of their inception, and that required a quorum of nodes to agree.
*
* There does exist an edge case where two Cassandra clusters are added at once (of 3 and 6 nodes,
* respectively). On initialization, the cluster of 6 will be used as the base case if the 3 other nodes
* are down, as this satisfies the quorum requirement. However, the cluster of 6 could be the wrong
* cluster, which means we would be reading from and writing to the wrong cluster! Guarding against this
* would require checking all nodes, which would mean we could not tolerate Cassandra restarts, so this is
* the best we can do.
*
* @param newlyAddedHosts The new Cassandra servers (and their client pooling containers) to validate.
* @param allHosts All Cassandra servers, which must include newlyAddedHosts.
* @return Set of Cassandra servers which do not match the pre-existing hosts topology. Servers without
* the get_host_ids endpoint will never be returned here.
*/
public Set<CassandraServer> getNewHostsWithInconsistentTopologiesAndRetry(
Map<CassandraServer, CassandraClientPoolingContainer> newlyAddedHosts,
Map<CassandraServer, CassandraClientPoolingContainer> allHosts,
Duration waitTimeBetweenCalls,
Duration maxWaitTime) {
Stopwatch stopwatch = Stopwatch.createStarted();
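// Retry while every known host is reported as inconsistent (i.e. nothing could be validated) or while
// the check throws, waiting waitTimeBetweenCalls between attempts and giving up after maxWaitTime.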
Retryer<Set<CassandraServer>> retryer = RetryerBuilder.<Set<CassandraServer>>newBuilder()
.retryIfResult(servers -> servers.size() == allHosts.size())
.retryIfException()
.withWaitStrategy(WaitStrategies.fixedWait(waitTimeBetweenCalls.toMillis(), TimeUnit.MILLISECONDS))
.withStopStrategy(StopStrategies.stopAfterDelay(maxWaitTime.toMillis(), TimeUnit.MILLISECONDS))
.build();
Supplier<Set<CassandraServer>> inconsistentNewHosts =
() -> getNewHostsWithInconsistentTopologies(newlyAddedHosts, allHosts);
try {
return retryer.call(inconsistentNewHosts::get);
} catch (RetryException | ExecutionException e) {
metrics.validationFailures().inc();
log.error(
"Failed to obtain consistent view of hosts from cluster.",
SafeArg.of("newlyAddedCassandraHosts", newlyAddedHosts),
SafeArg.of("allCassandraHosts", allHosts.keySet()),
e);
return inconsistentNewHosts.get();
} finally {
metrics.validationLatency().update(stopwatch.elapsed(TimeUnit.MILLISECONDS));
}
}
@VisibleForTesting
Set<CassandraServer> getNewHostsWithInconsistentTopologies(
Map<CassandraServer, CassandraClientPoolingContainer> newlyAddedHosts,
Map<CassandraServer, CassandraClientPoolingContainer> allHosts) {
Set<CassandraServer> newlyAddedHostsWithoutOrigin = newlyAddedHosts.keySet();
if (newlyAddedHosts.isEmpty()) {
return newlyAddedHostsWithoutOrigin;
}
Preconditions.checkArgument(
allHosts.keySet().containsAll(newlyAddedHostsWithoutOrigin),
"Newly added hosts must be a subset of all hosts, as otherwise we have no way to query them.",
SafeArg.of("newlyAddedHosts", CassandraLogHelper.collectionOfHosts(newlyAddedHostsWithoutOrigin)),
SafeArg.of("allHosts", CassandraLogHelper.collectionOfHosts(allHosts.keySet())));
Map<CassandraServer, NonSoftFailureHostIdResult> hostIdsByServerWithoutSoftFailures =
fetchHostIdsIgnoringSoftFailures(allHosts);
Map<CassandraServer, NonSoftFailureHostIdResult> currentServersWithoutSoftFailures = EntryStream.of(
hostIdsByServerWithoutSoftFailures)
.removeKeys(newlyAddedHosts::containsKey)
.toMap();
Map<CassandraServer, NonSoftFailureHostIdResult> newServersWithoutSoftFailures = EntryStream.of(
hostIdsByServerWithoutSoftFailures)
.filterKeys(newlyAddedHosts::containsKey)
.toMap();
// This means we currently have no pre-existing servers, or none of the pre-existing servers support
// the get_host_ids endpoint. Therefore, we need to come to a consensus on the new servers.
if (currentServersWithoutSoftFailures.isEmpty()) {
ClusterTopologyResult topologyResultFromNewServers =
maybeGetConsistentClusterTopology(newServersWithoutSoftFailures);
Set<String> configuredServersSnapshot = configuredServers.get();
Map<CassandraServer, NonSoftFailureHostIdResult> newServersFromConfig = EntryStream.of(
newServersWithoutSoftFailures)
.filterKeys(server -> configuredServersSnapshot.contains(server.cassandraHostName()))
.toMap();
return getNewHostsWithInconsistentTopologiesFromTopologyResult(
topologyResultFromNewServers,
newServersWithoutSoftFailures,
newServersFromConfig,
newlyAddedHostsWithoutOrigin,
allHosts.keySet());
}
// If a consensus can be reached from the current servers, filter all new servers which have the same set of
// host ids. Accept dissent as such, but permit new servers if they are in quorum _and_ match the previously
// accepted set of host IDs
ClusterTopologyResult topologyFromCurrentServers =
maybeGetConsistentClusterTopology(currentServersWithoutSoftFailures);
return getNewHostsWithInconsistentTopologiesFromTopologyResult(
topologyFromCurrentServers,
newServersWithoutSoftFailures,
newServersWithoutSoftFailures,
newlyAddedHosts.keySet(),
allHosts.keySet());
}
private Set<CassandraServer> getNewHostsWithInconsistentTopologiesFromTopologyResult(
ClusterTopologyResult topologyResult,
Map<CassandraServer, NonSoftFailureHostIdResult> newServersWithoutSoftFailures,
Map<CassandraServer, NonSoftFailureHostIdResult> serversToConsiderWhenNoQuorumPresent,
Set<CassandraServer> newlyAddedHosts,
Set<CassandraServer> allHosts) {
switch (topologyResult.type()) {
case CONSENSUS:
Preconditions.checkState(
topologyResult.agreedTopologies().isPresent(),
"Expected to have one or more consistent topologies for a CONSENSUS result, but did not.");
ConsistentClusterTopologies topologies =
topologyResult.agreedTopologies().get();
pastConsistentTopologies.set(topologies);
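// Report as inconsistent any new server that either failed to return host IDs or does not share a
// single host ID with the agreed topologies.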
return EntryStream.of(newServersWithoutSoftFailures)
.removeValues(result -> result.type() == HostIdResult.Type.SUCCESS
&& topologies.sharesAtLeastOneHostId(result.hostIds()))
.keys()
.toSet();
case DISSENT:
// In the event of *active* dissent, we want to hard fail.
return newServersWithoutSoftFailures.keySet();
case NO_QUORUM:
// In the event of no quorum, we trust the new servers iff they agree with our historical knowledge
// of what the old servers were thinking, since in containerised deployments all nodes can change
// between refreshes for legitimate reasons (but they should still refer to the same underlying
// cluster).
ConsistentClusterTopologies pastTopologies = pastConsistentTopologies.get();
if (pastTopologies == null) {
// We don't have a record of what worked in the past, and since this state means we're validating
// the initial config servers, we don't have another source of truth here.
return newServersWithoutSoftFailures.keySet();
}
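// Without quorum, only accept candidate servers whose host IDs overlap with the last agreed topologies.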
Map<CassandraServer, NonSoftFailureHostIdResult> matchingServers = EntryStream.of(
serversToConsiderWhenNoQuorumPresent)
.filterValues(result -> result.type() == HostIdResult.Type.SUCCESS
&& pastTopologies.sharesAtLeastOneHostId(result.hostIds()))
.toMap();
if (matchingServers.isEmpty()) {
log.info(
"No quorum was detected in original set of servers, and the filtered set of servers did"
+ " not include any servers which presented a plausible evolution of the last agreed"
+ " topology. Not adding new servers in this case.",
SafeArg.of("pastConsistentTopologies", pastTopologies),
SafeArg.of("newServers", CassandraLogHelper.collectionOfHosts(newlyAddedHosts)),
SafeArg.of("allServers", CassandraLogHelper.collectionOfHosts(allHosts)),
SafeArg.of("hostIdResults", serversToConsiderWhenNoQuorumPresent));
return newServersWithoutSoftFailures.keySet();
}
log.info(
"No quorum was detected among the original set of servers. Some servers in a filtered set of"
+ " servers presented host IDs that were a plausible evolution of the last agreed value"
+ " among the old servers. Adding new servers that were in consensus.",
SafeArg.of("pastConsistentTopologies", pastTopologies),
SafeArg.of("hostIdResults", serversToConsiderWhenNoQuorumPresent),
SafeArg.of("serversMatchingPastTopology", matchingServers),
SafeArg.of("newServers", CassandraLogHelper.collectionOfHosts(newlyAddedHosts)),
SafeArg.of("allServers", CassandraLogHelper.collectionOfHosts(allHosts)));
ConsistentClusterTopologies mergedTopologies = pastTopologies.merge(matchingServers);
pastConsistentTopologies.set(mergedTopologies);
return Sets.difference(newServersWithoutSoftFailures.keySet(), matchingServers.keySet());
default:
throw new SafeIllegalStateException(
"Unexpected cluster topology result type", SafeArg.of("type", topologyResult.type()));
}
}
/**
* Obtains a consistent view of the cluster topology for the provided hosts.
*
* This is achieved by comparing the hostIds (list of UUIDs for each C* node) for all Cassandra nodes.
* A quorum of C* nodes are required to be reachable and all reachable nodes must have the same
* topology (hostIds) for this to return a valid result. Nodes that are reachable but do not have
* support for our get_host_ids endpoint are simply ignored and will not be filtered out.
*
* @param hostIdsByServerWithoutSoftFailures Cassandra hosts to obtain a consistent topology view from
* @return a ClusterTopologyResult indicating consensus (with the agreed topologies), dissent, or no quorum.
*/
private ClusterTopologyResult maybeGetConsistentClusterTopology(
Map<CassandraServer, NonSoftFailureHostIdResult> hostIdsByServerWithoutSoftFailures) {
// If all our queries fail due to soft failures, then our consensus is an empty set of host ids
if (hostIdsByServerWithoutSoftFailures.isEmpty()) {
NodesAndSharedTopology emptySetOfHostIds = NodesAndSharedTopology.builder()
.hostIds(Set.of())
.serversInConsensus(hostIdsByServerWithoutSoftFailures.keySet())
.build();
return ClusterTopologyResult.consensus(ConsistentClusterTopologies.builder()
.addNodesAndSharedTopologies(emptySetOfHostIds)
.build());
}
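// Keep only the hosts that successfully returned host IDs; hard failures are excluded here but still
// count towards the quorum denominator below.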
Map<CassandraServer, Set<String>> hostIdsWithoutFailures = EntryStream.of(hostIdsByServerWithoutSoftFailures)
.filterValues(result -> result.type() == HostIdResult.Type.SUCCESS)
.mapValues(NonSoftFailureHostIdResult::hostIds)
.toMap();
// Only consider hosts that have the endpoint for quorum calculations.
// Otherwise, we will never add hosts when we're in a mixed state
int quorum = (hostIdsByServerWithoutSoftFailures.size() / 2) + 1;
// If too many hosts are unreachable, then we cannot come to a consensus
if (hostIdsWithoutFailures.size() < quorum) {
return ClusterTopologyResult.noQuorum();
}
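// The reachable hosts may report different host ID sets; consensus is still declared if those sets
// form a plausible evolution of one another, otherwise this is treated as active dissent.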
Set<Set<String>> uniqueSetsOfHostIds =
EntryStream.of(hostIdsWithoutFailures).values().toImmutableSet();
if (HostIdEvolution.existsPlausibleEvolutionOfHostIdSets(uniqueSetsOfHostIds)) {
Map<Set<String>, List<CassandraServer>> idsToServers =
EntryStream.of(hostIdsWithoutFailures).invert().grouping();
return ClusterTopologyResult.consensus(ConsistentClusterTopologies.builder()
.nodesAndSharedTopologies(EntryStream.of(idsToServers)
.mapKeyValue((hostIds, servers) -> NodesAndSharedTopology.builder()
.hostIds(hostIds)
.serversInConsensus(servers)
.build()))
.build());
}
return ClusterTopologyResult.dissent();
}
private Map<CassandraServer, NonSoftFailureHostIdResult> fetchHostIdsIgnoringSoftFailures(
Map<CassandraServer, CassandraClientPoolingContainer> servers) {
Map<CassandraServer, HostIdResult> results =
EntryStream.of(servers).mapValues(this::fetchHostIds).toMap();
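// Log any soft or hard failures so that operators can see which hosts could not report host IDs.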
if (KeyedStream.stream(results)
.values()
.anyMatch(result -> result.type() == HostIdResult.Type.SOFT_FAILURE
|| result.type() == HostIdResult.Type.HARD_FAILURE)) {
log.warn(
"While fetching host id from hosts, some reported soft and hard failures.",
SafeArg.of("results", results));
}
return EntryStream.of(results)
.removeValues(result -> result.type() == HostIdResult.Type.SOFT_FAILURE)
.mapValues(NonSoftFailureHostIdResult::wrap)
.toMap();
}
@VisibleForTesting
HostIdResult fetchHostIds(CassandraClientPoolingContainer container) {
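// Success if the node returns its host IDs; soft failure if the node does not expose the get_host_ids
// method at all; hard failure for any other error.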
try {
return container.runWithPooledResource(
client -> HostIdResult.success(client.get_host_ids()));
} catch (Exception e) {
// If the get_host_ids API endpoint does not exist, then return a soft failure.
if (e instanceof TApplicationException) {
TApplicationException applicationException = (TApplicationException) e;
if (applicationException.getType() == TApplicationException.UNKNOWN_METHOD) {
return HostIdResult.softFailure();
}
}
return HostIdResult.hardFailure();
}
}
@Value.Immutable
public interface ConsistentClusterTopologies {
Set<NodesAndSharedTopology> nodesAndSharedTopologies();
@Value.Derived
default Set<String> hostIds() {
return nodesAndSharedTopologies().stream()
.flatMap(topology -> topology.hostIds().stream())
.collect(Collectors.toSet());
}
@Value.Derived
default Set<CassandraServer> serversInConsensus() {
return nodesAndSharedTopologies().stream()
.flatMap(topology -> topology.serversInConsensus().stream())
.collect(Collectors.toSet());
}
default boolean sharesAtLeastOneHostId(Set<String> otherHostIds) {
return !Sets.intersection(hostIds(), otherHostIds).isEmpty();
}
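// Merges additional nodes into the agreed topologies; the host ID sets of the additional nodes must
// form a plausible evolution of the existing ones, which is checked below.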
default ConsistentClusterTopologies merge(Map<CassandraServer, NonSoftFailureHostIdResult> additionalNodes) {
Set<NodesAndSharedTopology> topologies = EntryStream.of(additionalNodes)
.mapKeyValue((server, result) -> NodesAndSharedTopology.builder()
.hostIds(result.hostIds())
.serversInConsensus(Set.of(server))
.build())
.collect(Collectors.toSet());
Preconditions.checkArgument(
HostIdEvolution.existsPlausibleEvolutionOfHostIdSets(
Stream.of(nodesAndSharedTopologies(), topologies)
.flatMap(Set::stream)
.map(NodesAndSharedTopology::hostIds)
.collect(Collectors.toSet())),
"Should not merge topologies that do not share at least one host id.");
return ConsistentClusterTopologies.builder()
.nodesAndSharedTopologies(Sets.union(nodesAndSharedTopologies(), topologies))
.build();
}
static ImmutableConsistentClusterTopologies.Builder builder() {
return ImmutableConsistentClusterTopologies.builder();
}
}
@Value.Immutable
public interface NodesAndSharedTopology {
Set<String> hostIds();
Set<CassandraServer> serversInConsensus();
static ImmutableNodesAndSharedTopology.Builder builder() {
return ImmutableNodesAndSharedTopology.builder();
}
}
enum ClusterTopologyResultType {
CONSENSUS,
DISSENT,
NO_QUORUM
}
@Value.Immutable
public interface ClusterTopologyResult {
ClusterTopologyResultType type();
Optional<ConsistentClusterTopologies> agreedTopologies();
static ClusterTopologyResult consensus(ConsistentClusterTopologies consistentClusterTopologies) {
return ImmutableClusterTopologyResult.builder()
.type(ClusterTopologyResultType.CONSENSUS)
.agreedTopologies(consistentClusterTopologies)
.build();
}
static ClusterTopologyResult dissent() {
return ImmutableClusterTopologyResult.builder()
.type(ClusterTopologyResultType.DISSENT)
.build();
}
static ClusterTopologyResult noQuorum() {
return ImmutableClusterTopologyResult.builder()
.type(ClusterTopologyResultType.NO_QUORUM)
.build();
}
}
}