org.elasticsearch.cluster.coordination.DiscoveryUpgradeService Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of elasticsearch Show documentation
Show all versions of elasticsearch Show documentation
Elasticsearch subproject :server
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.cluster.coordination;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.message.ParameterizedMessage;
import org.elasticsearch.Version;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.cluster.ClusterName;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.coordination.CoordinationMetaData.VotingConfiguration;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.util.concurrent.CountDown;
import org.elasticsearch.discovery.zen.ElectMasterService;
import org.elasticsearch.discovery.zen.ElectMasterService.MasterCandidate;
import org.elasticsearch.discovery.zen.UnicastZenPing;
import org.elasticsearch.discovery.zen.UnicastZenPing.UnicastPingRequest;
import org.elasticsearch.discovery.zen.UnicastZenPing.UnicastPingResponse;
import org.elasticsearch.discovery.zen.ZenPing.PingResponse;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.threadpool.ThreadPool.Names;
import org.elasticsearch.transport.TransportException;
import org.elasticsearch.transport.TransportRequestOptions;
import org.elasticsearch.transport.TransportResponseHandler;
import org.elasticsearch.transport.TransportService;
import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Optional;
import java.util.Set;
import java.util.function.BooleanSupplier;
import java.util.function.Consumer;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import static java.lang.Math.max;
import static org.elasticsearch.cluster.ClusterName.CLUSTER_NAME_SETTING;
import static org.elasticsearch.cluster.ClusterState.UNKNOWN_VERSION;
import static org.elasticsearch.common.util.concurrent.ConcurrentCollections.newConcurrentSet;
import static org.elasticsearch.discovery.zen.ElectMasterService.DISCOVERY_ZEN_MINIMUM_MASTER_NODES_SETTING;
import static org.elasticsearch.discovery.zen.ZenDiscovery.PING_TIMEOUT_SETTING;
/**
* Deals with rolling upgrades of the cluster coordination layer. In mixed clusters we prefer to elect the older nodes, but
* when the last old node shuts down then as long as there are enough new nodes we can assume that they form the whole cluster and
* define them as the initial configuration.
*/
public class DiscoveryUpgradeService {
private static Logger logger = LogManager.getLogger(DiscoveryUpgradeService.class);
// how long to wait after activation before attempting to join a master or perform a bootstrap upgrade
public static final Setting BWC_PING_TIMEOUT_SETTING =
Setting.timeSetting("discovery.zen.bwc_ping_timeout",
PING_TIMEOUT_SETTING, TimeValue.timeValueMillis(1), Setting.Property.NodeScope, Setting.Property.Deprecated);
// whether to try and bootstrap all the discovered Zen2 nodes when the last Zen1 node leaves the cluster.
public static final Setting ENABLE_UNSAFE_BOOTSTRAPPING_ON_UPGRADE_SETTING =
Setting.boolSetting("discovery.zen.unsafe_rolling_upgrades_enabled", true, Setting.Property.NodeScope, Setting.Property.Deprecated);
/**
* Dummy {@link ElectMasterService} that is only used to choose the best 6.x master from the discovered nodes, ignoring the
* `minimum_master_nodes` setting.
*/
private static final ElectMasterService electMasterService = new ElectMasterService(Settings.EMPTY);
private final TransportService transportService;
private final BooleanSupplier isBootstrappedSupplier;
private final JoinHelper joinHelper;
private final Supplier> peersSupplier;
private final Consumer initialConfigurationConsumer;
private final TimeValue bwcPingTimeout;
private final boolean enableUnsafeBootstrappingOnUpgrade;
private final ClusterName clusterName;
@Nullable // null if no active joining round
private volatile JoiningRound joiningRound;
/**
 * Creates the service. No joining round exists until {@code activate} is called.
 *
 * @param settings                     node settings, from which the BWC ping timeout, the unsafe-bootstrapping flag and the
 *                                     cluster name are read
 * @param transportService             used to obtain the thread pool and the local node, and to ping Zen1 nodes
 * @param isBootstrappedSupplier       reports whether this node is already bootstrapped
 * @param joinHelper                   used to send the join request to the elected Zen1 master
 * @param peersSupplier                supplies the currently-discovered peers
 * @param initialConfigurationConsumer receives the voting configuration computed when bootstrapping on upgrade
 */
public DiscoveryUpgradeService(Settings settings, TransportService transportService,
                               BooleanSupplier isBootstrappedSupplier, JoinHelper joinHelper,
                               Supplier<Iterable<DiscoveryNode>> peersSupplier,
                               Consumer<VotingConfiguration> initialConfigurationConsumer) {
    // this whole service only exists to support upgrades from the previous major version
    assert Version.CURRENT.major == Version.V_6_6_0.major + 1 : "remove this service once unsafe upgrades are no longer needed";
    this.transportService = transportService;
    this.isBootstrappedSupplier = isBootstrappedSupplier;
    this.joinHelper = joinHelper;
    this.peersSupplier = peersSupplier;
    this.initialConfigurationConsumer = initialConfigurationConsumer;
    this.bwcPingTimeout = BWC_PING_TIMEOUT_SETTING.get(settings);
    this.enableUnsafeBootstrappingOnUpgrade = ENABLE_UNSAFE_BOOTSTRAPPING_ON_UPGRADE_SETTING.get(settings);
    this.clusterName = CLUSTER_NAME_SETTING.get(settings);
}
/**
 * Starts a new joining round, unless this node is already bootstrapped in which case nothing happens.
 * Must be called under the coordinator mutex, and only when no joining round is active.
 *
 * @param lastKnownLeader          the last-known leader, if any; when present it is asserted to be a Zen1 node
 * @param lastAcceptedClusterState the last-accepted cluster state, used to determine {@code minimum_master_nodes} and
 *                                 the IDs of the known master-eligible nodes
 */
public void activate(Optional<DiscoveryNode> lastKnownLeader, ClusterState lastAcceptedClusterState) {
    // called under coordinator mutex

    if (isBootstrappedSupplier.getAsBoolean()) {
        return;
    }

    // if there was a leader and it's not an old node then we must have been bootstrapped
    assert lastKnownLeader.isPresent() == false || Coordinator.isZen1Node(lastKnownLeader.get()) : lastKnownLeader;

    // prefer the value in the cluster-state settings; otherwise use the value recorded on the cluster state itself
    final Settings dynamicSettings = lastAcceptedClusterState.metaData().settings();
    final int minimumMasterNodes = DISCOVERY_ZEN_MINIMUM_MASTER_NODES_SETTING.exists(dynamicSettings)
        ? DISCOVERY_ZEN_MINIMUM_MASTER_NODES_SETTING.get(dynamicSettings)
        : lastAcceptedClusterState.getMinimumMasterNodesOnPublishingMaster();

    assert joiningRound == null : joiningRound;
    final Set<String> knownMasterNodeIds = new HashSet<>();
    lastAcceptedClusterState.nodes().getMasterNodes().forEach(c -> knownMasterNodeIds.add(c.key));

    // the round may only bootstrap ("upgrading") if unsafe bootstrapping is enabled and there was a last-known leader
    joiningRound
        = new JoiningRound(enableUnsafeBootstrappingOnUpgrade && lastKnownLeader.isPresent(), minimumMasterNodes, knownMasterNodeIds);
    joiningRound.scheduleNextAttempt();
}
/**
 * Ends the active joining round, if there is one, by clearing the reference to it. A round only considers itself
 * running while it is the one referenced here, so any attempt it has already scheduled returns without doing anything.
 * Must be called under the coordinator mutex.
 */
public void deactivate() {
    // called under coordinator mutex
    this.joiningRound = null;
}
/**
 * Waits for some number of calls to {@link ListenableCountDown#countDown()} and then notifies a listener. The listener
 * is only ever notified once, whether successful or not.
 */
private static class ListenableCountDown {
    private final CountDown countDown;
    private final ActionListener<Void> listener;

    /**
     * @param count    the number of {@link #countDown()} calls to wait for
     * @param listener the listener to notify on completion or on failure
     */
    ListenableCountDown(int count, ActionListener<Void> listener) {
        this.countDown = new CountDown(count);
        this.listener = listener;
    }

    /**
     * Fails the listener with the given exception if {@link CountDown#fastForward()} returns {@code true}.
     * NOTE(review): presumably that is only the case while the countdown has not yet completed - confirm against CountDown.
     */
    void onFailure(Exception e) {
        if (countDown.fastForward()) {
            listener.onFailure(e);
        }
    }

    /**
     * Records one completed item and completes the listener (with a {@code null} response) if
     * {@link CountDown#countDown()} returns {@code true}.
     */
    void countDown() {
        if (countDown.countDown()) {
            listener.onResponse(null);
        }
    }
}
/**
 * One round of repeated attempts either to join a Zen1 master or, if no Zen1 nodes remain and bootstrapping is
 * permitted, to bootstrap the cluster from the discovered nodes. A round keeps rescheduling itself until it is no longer
 * the enclosing service's active round or the node reports that it is bootstrapped.
 */
private class JoiningRound {
    // whether this round is allowed to bootstrap once no Zen1 nodes are discovered
    private final boolean upgrading;
    private final int minimumMasterNodes;
    // IDs of the master-eligible nodes in the last-accepted cluster state, used to pad the voting configuration
    private final Set<String> knownMasterNodeIds;

    JoiningRound(boolean upgrading, int minimumMasterNodes, Set<String> knownMasterNodeIds) {
        this.upgrading = upgrading;
        this.minimumMasterNodes = minimumMasterNodes;
        this.knownMasterNodeIds = knownMasterNodeIds;
    }

    /** Returns true iff this is still the service's active round and the node is not yet bootstrapped. */
    private boolean isRunning() {
        return joiningRound == this && isBootstrappedSupplier.getAsBoolean() == false;
    }

    /** Returns true iff this round may bootstrap and at least {@code minimumMasterNodes} master-eligible nodes were found. */
    private boolean canBootstrap(Set<DiscoveryNode> discoveryNodes) {
        return upgrading && minimumMasterNodes <= discoveryNodes.stream().filter(DiscoveryNode::isMasterNode).count();
    }

    /**
     * Schedules the next attempt to run after {@code bwcPingTimeout}, unless this round is no longer running. Each
     * attempt either elects and joins a Zen1 master, tries to bootstrap, or simply reschedules itself.
     */
    void scheduleNextAttempt() {
        if (isRunning() == false) {
            return;
        }

        final ThreadPool threadPool = transportService.getThreadPool();
        threadPool.scheduleUnlessShuttingDown(bwcPingTimeout, Names.SAME, new Runnable() {

            @Override
            public void run() {
                // the round may have been deactivated, or the node bootstrapped, while this task was waiting
                if (isRunning() == false) {
                    return;
                }

                // the master-eligible nodes amongst the discovered peers and the local node
                final Set<DiscoveryNode> discoveryNodes = Stream.concat(StreamSupport.stream(peersSupplier.get().spliterator(), false),
                    Stream.of(transportService.getLocalNode())).filter(DiscoveryNode::isMasterNode).collect(Collectors.toSet());

                // this set of nodes is reasonably fresh - the PeerFinder cleans up nodes to which the transport service is not
                // connected each time it wakes up (every second by default)

                logger.debug("upgrading={}, minimumMasterNodes={}, nodes={}", upgrading, minimumMasterNodes, discoveryNodes);

                if (discoveryNodes.stream().anyMatch(Coordinator::isZen1Node)) {
                    electBestOldMaster(discoveryNodes);
                } else if (canBootstrap(discoveryNodes)) {
                    // no Zen1 nodes found, but the last-known master was a Zen1 node, so this is a rolling upgrade
                    transportService.getThreadPool().generic().execute(() -> {
                        try {
                            Set<String> nodeIds = new HashSet<>();
                            discoveryNodes.forEach(n -> nodeIds.add(n.getId()));

                            // pad the configuration with previously-known master node IDs until it holds
                            // 2 * minimumMasterNodes - 1 entries or the known IDs run out
                            final Iterator<String> knownNodeIdIterator = knownMasterNodeIds.iterator();
                            while (nodeIds.size() < 2 * minimumMasterNodes - 1 && knownNodeIdIterator.hasNext()) {
                                nodeIds.add(knownNodeIdIterator.next());
                            }

                            final VotingConfiguration votingConfiguration = new VotingConfiguration(nodeIds);
                            assert votingConfiguration.hasQuorum(
                                discoveryNodes.stream().map(DiscoveryNode::getId).collect(Collectors.toList()));
                            assert 2 * minimumMasterNodes - 2 <= nodeIds.size() : nodeIds + " too small for " + minimumMasterNodes;

                            initialConfigurationConsumer.accept(votingConfiguration);
                        } catch (Exception e) {
                            logger.debug("exception during bootstrapping upgrade, retrying", e);
                        } finally {
                            // always reschedule; scheduleNextAttempt() returns immediately once the round stops running
                            scheduleNextAttempt();
                        }
                    });
                } else {
                    scheduleNextAttempt();
                }
            }

            /**
             * Ping all the old discovered masters one more time to obtain their cluster state versions, and then vote for the best one.
             * @param discoveryNodes The master nodes (old and new).
             */
            private void electBestOldMaster(Set<DiscoveryNode> discoveryNodes) {
                final Set<MasterCandidate> masterCandidates = newConcurrentSet();
                // expects one countDown() per discovered node: after the ping response for Zen1 nodes, immediately otherwise
                final ListenableCountDown listenableCountDown
                    = new ListenableCountDown(discoveryNodes.size(), new ActionListener<Void>() {

                    @Override
                    public void onResponse(Void value) {
                        assert masterCandidates.size() == discoveryNodes.size()
                            : masterCandidates + " does not match " + discoveryNodes;

                        // TODO we shouldn't elect a master with a version that's older than ours
                        // If the only Zen1 nodes left are stale, and we can bootstrap, maybe we should bootstrap?
                        // Do we ever need to elect a freshly-started Zen1 node?
                        if (isRunning()) {
                            final MasterCandidate electedMaster = electMasterService.electMaster(masterCandidates);
                            logger.debug("elected {}, sending join", electedMaster);
                            joinHelper.sendJoinRequest(electedMaster.getNode(), Optional.empty(),
                                JoiningRound.this::scheduleNextAttempt);
                        }
                    }

                    @Override
                    public void onFailure(Exception e) {
                        // a ping failed, so give up on this attempt and try again later
                        scheduleNextAttempt();
                    }
                });

                boolean foundOldMaster = false;
                for (final DiscoveryNode discoveryNode : discoveryNodes) {
                    assert discoveryNode.isMasterNode() : discoveryNode;
                    if (Coordinator.isZen1Node(discoveryNode)) {
                        foundOldMaster = true;
                        // the local node is advertised under an impossibly high ID in the ping
                        transportService.sendRequest(discoveryNode, UnicastZenPing.ACTION_NAME,
                            new UnicastPingRequest(0, TimeValue.ZERO,
                                new PingResponse(createDiscoveryNodeWithImpossiblyHighId(transportService.getLocalNode()),
                                    null, clusterName, UNKNOWN_VERSION)),
                            TransportRequestOptions.builder().withTimeout(bwcPingTimeout).build(),
                            new TransportResponseHandler<UnicastPingResponse>() {
                                @Override
                                public void handleResponse(UnicastPingResponse response) {
                                    // take the highest cluster state version reported for the pinged node itself
                                    long clusterStateVersion = UNKNOWN_VERSION;
                                    for (PingResponse pingResponse : response.pingResponses) {
                                        if (discoveryNode.equals(pingResponse.node())) {
                                            clusterStateVersion
                                                = max(clusterStateVersion, pingResponse.getClusterStateVersion());
                                        }
                                    }
                                    masterCandidates.add(new MasterCandidate(discoveryNode, clusterStateVersion));
                                    listenableCountDown.countDown();
                                }

                                @Override
                                public void handleException(TransportException exp) {
                                    logger.debug(
                                        new ParameterizedMessage("unexpected exception when pinging {}", discoveryNode), exp);
                                    listenableCountDown.onFailure(exp);
                                }

                                @Override
                                public String executor() {
                                    return Names.SAME;
                                }

                                @Override
                                public UnicastPingResponse read(StreamInput in) throws IOException {
                                    return new UnicastPingResponse(in);
                                }
                            });

                    } else {
                        // non-Zen1 nodes are not pinged: they become candidates with an impossibly high ID and an unknown version
                        masterCandidates.add(
                            new MasterCandidate(createDiscoveryNodeWithImpossiblyHighId(discoveryNode), UNKNOWN_VERSION));
                        listenableCountDown.countDown();
                    }
                }
                assert foundOldMaster;
            }

            @Override
            public String toString() {
                return "discovery upgrade service retry";
            }
        });
    }
}
/**
 * Builds a copy of the given node whose ID is prefixed so that it sorts above every genuine node ID.
 * <p>
 * Pre-7.0 nodes pick a master by taking, amongst the nodes with the best cluster state version, the one whose ID is
 * lowest when compared as a string. 7.0+ nodes should take part in such elections in a mixed cluster without ever
 * winning one, so they claim an ID that no real node can have and that compares above all real IDs.
 *
 * @param node the node to copy
 * @return a node identical to {@code node} except that its ID is {@code "{zen2}"} followed by the original ID
 */
public static DiscoveryNode createDiscoveryNodeWithImpossiblyHighId(DiscoveryNode node) {
    // IDs are base-64-encoded UUIDs, so they only use characters from [0-9A-Za-z_-]. The highest of those is 'z',
    // and 'z' < '{', so an ID that starts with '{' is guaranteed to be greater. This is terrible.
    final String impossiblyHighId = "{zen2}" + node.getId();
    return new DiscoveryNode(
        node.getName(),
        impossiblyHighId,
        node.getEphemeralId(),
        node.getHostName(),
        node.getHostAddress(),
        node.getAddress(),
        node.getAttributes(),
        node.getRoles(),
        node.getVersion());
}
}