
// org.elasticsearch.cluster.coordination.DiscoveryUpgradeService Maven / Gradle / Ivy
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/
package org.elasticsearch.cluster.coordination;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.message.ParameterizedMessage;
import org.elasticsearch.Version;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.cluster.ClusterName;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.coordination.CoordinationMetadata.VotingConfiguration;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.util.concurrent.CountDown;
import org.elasticsearch.discovery.zen.ElectMasterService;
import org.elasticsearch.discovery.zen.ElectMasterService.MasterCandidate;
import org.elasticsearch.discovery.zen.UnicastZenPing;
import org.elasticsearch.discovery.zen.UnicastZenPing.UnicastPingRequest;
import org.elasticsearch.discovery.zen.UnicastZenPing.UnicastPingResponse;
import org.elasticsearch.discovery.zen.ZenPing.PingResponse;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.threadpool.ThreadPool.Names;
import org.elasticsearch.transport.TransportException;
import org.elasticsearch.transport.TransportRequestOptions;
import org.elasticsearch.transport.TransportResponseHandler;
import org.elasticsearch.transport.TransportService;
import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Optional;
import java.util.Set;
import java.util.function.BooleanSupplier;
import java.util.function.Consumer;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import static java.lang.Math.max;
import static org.elasticsearch.cluster.ClusterName.CLUSTER_NAME_SETTING;
import static org.elasticsearch.cluster.ClusterState.UNKNOWN_VERSION;
import static org.elasticsearch.common.util.concurrent.ConcurrentCollections.newConcurrentSet;
import static org.elasticsearch.discovery.zen.ElectMasterService.DISCOVERY_ZEN_MINIMUM_MASTER_NODES_SETTING;
import static org.elasticsearch.discovery.zen.ZenDiscovery.PING_TIMEOUT_SETTING;
/**
* Deals with rolling upgrades of the cluster coordination layer. In mixed clusters we prefer to elect the older nodes, but
* when the last old node shuts down then as long as there are enough new nodes we can assume that they form the whole cluster and
* define them as the initial configuration.
*/
public class DiscoveryUpgradeService {
private static Logger logger = LogManager.getLogger(DiscoveryUpgradeService.class);
// how long to wait after activation before attempting to join a master or perform a bootstrap upgrade
public static final Setting BWC_PING_TIMEOUT_SETTING =
Setting.timeSetting("discovery.zen.bwc_ping_timeout",
PING_TIMEOUT_SETTING, TimeValue.timeValueMillis(1), Setting.Property.NodeScope, Setting.Property.Deprecated);
// whether to try and bootstrap all the discovered Zen2 nodes when the last Zen1 node leaves the cluster.
public static final Setting ENABLE_UNSAFE_BOOTSTRAPPING_ON_UPGRADE_SETTING =
Setting.boolSetting("discovery.zen.unsafe_rolling_upgrades_enabled", true, Setting.Property.NodeScope, Setting.Property.Deprecated);
/**
* Dummy {@link ElectMasterService} that is only used to choose the best 6.x master from the discovered nodes, ignoring the
* `minimum_master_nodes` setting.
*/
private static final ElectMasterService electMasterService = new ElectMasterService(Settings.EMPTY);
private final TransportService transportService;
private final BooleanSupplier isBootstrappedSupplier;
private final JoinHelper joinHelper;
private final Supplier> peersSupplier;
private final Consumer initialConfigurationConsumer;
private final TimeValue bwcPingTimeout;
private final boolean enableUnsafeBootstrappingOnUpgrade;
private final ClusterName clusterName;
@Nullable // null if no active joining round
private volatile JoiningRound joiningRound;
public DiscoveryUpgradeService(Settings settings, TransportService transportService,
BooleanSupplier isBootstrappedSupplier, JoinHelper joinHelper,
Supplier> peersSupplier,
Consumer initialConfigurationConsumer) {
assert Version.CURRENT.major == Version.V_6_6_0.major + 1 : "remove this service once unsafe upgrades are no longer needed";
this.transportService = transportService;
this.isBootstrappedSupplier = isBootstrappedSupplier;
this.joinHelper = joinHelper;
this.peersSupplier = peersSupplier;
this.initialConfigurationConsumer = initialConfigurationConsumer;
this.bwcPingTimeout = BWC_PING_TIMEOUT_SETTING.get(settings);
this.enableUnsafeBootstrappingOnUpgrade = ENABLE_UNSAFE_BOOTSTRAPPING_ON_UPGRADE_SETTING.get(settings);
this.clusterName = CLUSTER_NAME_SETTING.get(settings);
}
public void activate(Optional lastKnownLeader, ClusterState lastAcceptedClusterState) {
// called under coordinator mutex
if (isBootstrappedSupplier.getAsBoolean()) {
return;
}
assert lastKnownLeader.isPresent() == false || Coordinator.isZen1Node(lastKnownLeader.get()) : lastKnownLeader;
// if there was a leader and it's not a old node then we must have been bootstrapped
final Settings dynamicSettings = lastAcceptedClusterState.metadata().settings();
final int minimumMasterNodes = DISCOVERY_ZEN_MINIMUM_MASTER_NODES_SETTING.exists(dynamicSettings)
? DISCOVERY_ZEN_MINIMUM_MASTER_NODES_SETTING.get(dynamicSettings)
: lastAcceptedClusterState.getMinimumMasterNodesOnPublishingMaster();
assert joiningRound == null : joiningRound;
final Set knownMasterNodeIds = new HashSet<>();
lastAcceptedClusterState.nodes().getMasterNodes().forEach(c -> knownMasterNodeIds.add(c.key));
joiningRound
= new JoiningRound(enableUnsafeBootstrappingOnUpgrade && lastKnownLeader.isPresent(), minimumMasterNodes, knownMasterNodeIds);
joiningRound.scheduleNextAttempt();
}
public void deactivate() {
// called under coordinator mutex
joiningRound = null;
}
/**
* Waits for some number of calls to {@link ListenableCountDown#countDown()} and then notifies a listener. The listener
* is only ever notified once, whether successful or not.
*/
private static class ListenableCountDown {
private final CountDown countDown;
private final ActionListener listener;
ListenableCountDown(int count, ActionListener listener) {
this.countDown = new CountDown(count);
this.listener = listener;
}
void onFailure(Exception e) {
if (countDown.fastForward()) {
listener.onFailure(e);
}
}
void countDown() {
if (countDown.countDown()) {
listener.onResponse(null);
}
}
}
private class JoiningRound {
private final boolean upgrading;
private final int minimumMasterNodes;
private final Set knownMasterNodeIds;
JoiningRound(boolean upgrading, int minimumMasterNodes, Set knownMasterNodeIds) {
this.upgrading = upgrading;
this.minimumMasterNodes = minimumMasterNodes;
this.knownMasterNodeIds = knownMasterNodeIds;
}
private boolean isRunning() {
return joiningRound == this && isBootstrappedSupplier.getAsBoolean() == false;
}
private boolean canBootstrap(Set discoveryNodes) {
return upgrading && minimumMasterNodes <= discoveryNodes.stream().filter(DiscoveryNode::isMasterNode).count();
}
void scheduleNextAttempt() {
if (isRunning() == false) {
return;
}
final ThreadPool threadPool = transportService.getThreadPool();
threadPool.scheduleUnlessShuttingDown(bwcPingTimeout, Names.SAME, new Runnable() {
@Override
public void run() {
if (isRunning() == false) {
return;
}
final Set discoveryNodes = Stream.concat(StreamSupport.stream(peersSupplier.get().spliterator(), false),
Stream.of(transportService.getLocalNode())).filter(DiscoveryNode::isMasterNode).collect(Collectors.toSet());
// this set of nodes is reasonably fresh - the PeerFinder cleans up nodes to which the transport service is not
// connected each time it wakes up (every second by default)
logger.debug("upgrading={}, minimumMasterNodes={}, nodes={}", upgrading, minimumMasterNodes, discoveryNodes);
if (discoveryNodes.stream().anyMatch(Coordinator::isZen1Node)) {
electBestOldMaster(discoveryNodes);
} else if (canBootstrap(discoveryNodes)) {
// no Zen1 nodes found, but the last-known master was a Zen1 node, so this is a rolling upgrade
transportService.getThreadPool().generic().execute(() -> {
try {
Set nodeIds = new HashSet<>();
discoveryNodes.forEach(n -> nodeIds.add(n.getId()));
final Iterator knownNodeIdIterator = knownMasterNodeIds.iterator();
while (nodeIds.size() < 2 * minimumMasterNodes - 1 && knownNodeIdIterator.hasNext()) {
nodeIds.add(knownNodeIdIterator.next());
}
final VotingConfiguration votingConfiguration = new VotingConfiguration(nodeIds);
assert votingConfiguration.hasQuorum(
discoveryNodes.stream().map(DiscoveryNode::getId).collect(Collectors.toList()));
assert 2 * minimumMasterNodes - 2 <= nodeIds.size() : nodeIds + " too small for " + minimumMasterNodes;
initialConfigurationConsumer.accept(votingConfiguration);
} catch (Exception e) {
logger.debug("exception during bootstrapping upgrade, retrying", e);
} finally {
scheduleNextAttempt();
}
});
} else {
scheduleNextAttempt();
}
}
/**
* Ping all the old discovered masters one more time to obtain their cluster state versions, and then vote for the best one.
* @param discoveryNodes The master nodes (old and new).
*/
private void electBestOldMaster(Set discoveryNodes) {
final Set masterCandidates = newConcurrentSet();
final ListenableCountDown listenableCountDown
= new ListenableCountDown(discoveryNodes.size(), new ActionListener() {
@Override
public void onResponse(Void value) {
assert masterCandidates.size() == discoveryNodes.size()
: masterCandidates + " does not match " + discoveryNodes;
// TODO we shouldn't elect a master with a version that's older than ours
// If the only Zen1 nodes left are stale, and we can bootstrap, maybe we should bootstrap?
// Do we ever need to elect a freshly-started Zen1 node?
if (isRunning()) {
final MasterCandidate electedMaster = electMasterService.electMaster(masterCandidates);
logger.debug("elected {}, sending join", electedMaster);
joinHelper.sendJoinRequest(electedMaster.getNode(), 0L, Optional.empty(),
JoiningRound.this::scheduleNextAttempt);
}
}
@Override
public void onFailure(Exception e) {
scheduleNextAttempt();
}
});
boolean foundOldMaster = false;
for (final DiscoveryNode discoveryNode : discoveryNodes) {
assert discoveryNode.isMasterNode() : discoveryNode;
if (Coordinator.isZen1Node(discoveryNode)) {
foundOldMaster = true;
transportService.sendRequest(discoveryNode, UnicastZenPing.ACTION_NAME,
new UnicastPingRequest(0, TimeValue.ZERO,
new PingResponse(createDiscoveryNodeWithImpossiblyHighId(transportService.getLocalNode()),
null, clusterName, UNKNOWN_VERSION)),
TransportRequestOptions.timeout(bwcPingTimeout),
new TransportResponseHandler() {
@Override
public void handleResponse(UnicastPingResponse response) {
long clusterStateVersion = UNKNOWN_VERSION;
for (PingResponse pingResponse : response.pingResponses) {
if (discoveryNode.equals(pingResponse.node())) {
clusterStateVersion
= max(clusterStateVersion, pingResponse.getClusterStateVersion());
}
}
masterCandidates.add(new MasterCandidate(discoveryNode, clusterStateVersion));
listenableCountDown.countDown();
}
@Override
public void handleException(TransportException exp) {
logger.debug(
new ParameterizedMessage("unexpected exception when pinging {}", discoveryNode), exp);
listenableCountDown.onFailure(exp);
}
@Override
public String executor() {
return Names.SAME;
}
@Override
public UnicastPingResponse read(StreamInput in) throws IOException {
return new UnicastPingResponse(in);
}
});
} else {
masterCandidates.add(
new MasterCandidate(createDiscoveryNodeWithImpossiblyHighId(discoveryNode), UNKNOWN_VERSION));
listenableCountDown.countDown();
}
}
assert foundOldMaster;
}
@Override
public String toString() {
return "discovery upgrade service retry";
}
});
}
}
/**
* Pre-7.0 nodes select the best master by comparing their IDs (as strings) and selecting the lowest one amongst those nodes with
* the best cluster state version. We want 7.0+ nodes to participate in these elections in a mixed cluster but never to win one, so
* we lie and claim to have an impossible ID that compares above all genuine IDs.
*/
public static DiscoveryNode createDiscoveryNodeWithImpossiblyHighId(DiscoveryNode node) {
// IDs are base-64-encoded UUIDs, which means they use the character set [0-9A-Za-z_-]. The highest character in this set is 'z',
// and 'z' < '{', so by starting the ID with '{' we can be sure it's greater. This is terrible.
return new DiscoveryNode(node.getName(), "{zen2}" + node.getId(), node.getEphemeralId(), node.getHostName(),
node.getHostAddress(), node.getAddress(), node.getAttributes(), node.getRoles(), node.getVersion());
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy