org.opensearch.cluster.decommission.DecommissionService
OpenSearch subproject :server
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/
package org.opensearch.cluster.decommission;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.message.ParameterizedMessage;
import org.opensearch.OpenSearchTimeoutException;
import org.opensearch.action.admin.cluster.decommission.awareness.delete.DeleteDecommissionStateResponse;
import org.opensearch.action.admin.cluster.decommission.awareness.put.DecommissionRequest;
import org.opensearch.action.admin.cluster.decommission.awareness.put.DecommissionResponse;
import org.opensearch.cluster.ClusterState;
import org.opensearch.cluster.ClusterStateObserver;
import org.opensearch.cluster.ClusterStateObserver.Listener;
import org.opensearch.cluster.ClusterStateUpdateTask;
import org.opensearch.cluster.NotClusterManagerException;
import org.opensearch.cluster.coordination.CoordinationMetadata;
import org.opensearch.cluster.metadata.WeightedRoutingMetadata;
import org.opensearch.cluster.node.DiscoveryNode;
import org.opensearch.cluster.routing.WeightedRouting;
import org.opensearch.cluster.routing.allocation.AllocationService;
import org.opensearch.cluster.service.ClusterService;
import org.opensearch.common.Priority;
import org.opensearch.common.UUIDs;
import org.opensearch.common.inject.Inject;
import org.opensearch.common.settings.ClusterSettings;
import org.opensearch.common.settings.Settings;
import org.opensearch.common.unit.TimeValue;
import org.opensearch.core.action.ActionListener;
import org.opensearch.threadpool.ThreadPool;
import org.opensearch.transport.TransportService;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import static org.opensearch.action.admin.cluster.configuration.TransportAddVotingConfigExclusionsAction.MAXIMUM_VOTING_CONFIG_EXCLUSIONS_SETTING;
import static org.opensearch.action.admin.cluster.configuration.VotingConfigExclusionsHelper.clearExclusionsAndGetState;
import static org.opensearch.cluster.decommission.DecommissionHelper.addVotingConfigExclusionsForNodesToBeDecommissioned;
import static org.opensearch.cluster.decommission.DecommissionHelper.deleteDecommissionAttributeInClusterState;
import static org.opensearch.cluster.decommission.DecommissionHelper.filterNodesWithDecommissionAttribute;
import static org.opensearch.cluster.decommission.DecommissionHelper.nodeHasDecommissionedAttribute;
import static org.opensearch.cluster.decommission.DecommissionHelper.registerDecommissionAttributeInClusterState;
import static org.opensearch.cluster.routing.allocation.decider.AwarenessAllocationDecider.CLUSTER_ROUTING_ALLOCATION_AWARENESS_ATTRIBUTE_SETTING;
import static org.opensearch.cluster.routing.allocation.decider.AwarenessAllocationDecider.CLUSTER_ROUTING_ALLOCATION_AWARENESS_FORCE_GROUP_SETTING;
/**
* Service responsible for entire lifecycle of decommissioning and recommissioning an awareness attribute.
*
* Whenever a cluster manager initiates an operation to decommission an awareness attribute,
* the service makes a best-effort attempt to perform the following tasks -
*
* - Initiates nodes decommissioning by adding custom metadata with the attribute and state as {@link DecommissionStatus#INIT}
* - Removes to-be-decommissioned cluster-manager eligible nodes from the voting config and waits for their abdication if one of them is the active leader
* - After the draining timeout, the service triggers nodes decommission. This marks the decommission status as {@link DecommissionStatus#IN_PROGRESS}
* - Once the decommission is successful, the service clears the voting config and marks the status as {@link DecommissionStatus#SUCCESSFUL}
* - If the service fails at any step, it makes a best-effort attempt to mark the status as {@link DecommissionStatus#FAILED} and to clear the voting config exclusions
*
*
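* A minimal, illustrative usage sketch (the caller shown is an assumption, not part of this class):
* <pre>{@code
* // hypothetical caller, e.g. the transport handler backing the decommission API
* DecommissionRequest request = new DecommissionRequest(new DecommissionAttribute("zone", "zone-1"));
* decommissionService.startDecommissionAction(request, listener); // listener is an ActionListener<DecommissionResponse>
* }</pre>
*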
* @opensearch.internal
*/
public class DecommissionService {
private static final Logger logger = LogManager.getLogger(DecommissionService.class);
private final ClusterService clusterService;
private final TransportService transportService;
private final ThreadPool threadPool;
private final DecommissionController decommissionController;
private volatile List<String> awarenessAttributes;
private volatile Map<String, List<String>> forcedAwarenessAttributes;
private volatile int maxVotingConfigExclusions;
@Inject
public DecommissionService(
Settings settings,
ClusterSettings clusterSettings,
ClusterService clusterService,
TransportService transportService,
ThreadPool threadPool,
AllocationService allocationService
) {
this.clusterService = clusterService;
this.transportService = transportService;
this.threadPool = threadPool;
this.decommissionController = new DecommissionController(clusterService, transportService, allocationService, threadPool);
this.awarenessAttributes = CLUSTER_ROUTING_ALLOCATION_AWARENESS_ATTRIBUTE_SETTING.get(settings);
clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_AWARENESS_ATTRIBUTE_SETTING, this::setAwarenessAttributes);
setForcedAwarenessAttributes(CLUSTER_ROUTING_ALLOCATION_AWARENESS_FORCE_GROUP_SETTING.get(settings));
clusterSettings.addSettingsUpdateConsumer(
CLUSTER_ROUTING_ALLOCATION_AWARENESS_FORCE_GROUP_SETTING,
this::setForcedAwarenessAttributes
);
maxVotingConfigExclusions = MAXIMUM_VOTING_CONFIG_EXCLUSIONS_SETTING.get(settings);
clusterSettings.addSettingsUpdateConsumer(MAXIMUM_VOTING_CONFIG_EXCLUSIONS_SETTING, this::setMaxVotingConfigExclusions);
}
private void setAwarenessAttributes(List<String> awarenessAttributes) {
this.awarenessAttributes = awarenessAttributes;
}
private void setForcedAwarenessAttributes(Settings forceSettings) {
Map<String, List<String>> forcedAwarenessAttributes = new HashMap<>();
Map<String, Settings> forceGroups = forceSettings.getAsGroups();
for (Map.Entry<String, Settings> entry : forceGroups.entrySet()) {
List<String> aValues = entry.getValue().getAsList("values");
if (aValues.size() > 0) {
forcedAwarenessAttributes.put(entry.getKey(), aValues);
}
}
this.forcedAwarenessAttributes = forcedAwarenessAttributes;
}
private void setMaxVotingConfigExclusions(int maxVotingConfigExclusions) {
this.maxVotingConfigExclusions = maxVotingConfigExclusions;
}
/**
* Starts the new decommission request and registers the metadata with status as {@link DecommissionStatus#INIT}
* Once the status is updated, it tries to exclude to-be-decommissioned cluster manager eligible nodes from Voting Configuration
*
* @param decommissionRequest request for decommission action
* @param listener register decommission listener
*/
public void startDecommissionAction(
final DecommissionRequest decommissionRequest,
final ActionListener<DecommissionResponse> listener
) {
final DecommissionAttribute decommissionAttribute = decommissionRequest.getDecommissionAttribute();
// register the metadata with status as INIT as first step
clusterService.submitStateUpdateTask("decommission [" + decommissionAttribute + "]", new ClusterStateUpdateTask(Priority.URGENT) {
private Set<String> nodeIdsToBeExcluded;
@Override
public ClusterState execute(ClusterState currentState) {
// validates that the correct awareness attributes and forced awareness attributes are set on the cluster before starting the action
validateAwarenessAttribute(decommissionAttribute, awarenessAttributes, forcedAwarenessAttributes);
if (decommissionRequest.requestID() == null) {
decommissionRequest.setRequestID(UUIDs.randomBase64UUID());
}
DecommissionAttributeMetadata decommissionAttributeMetadata = currentState.metadata().decommissionAttributeMetadata();
// check that request is eligible to proceed and attribute is weighed away
ensureEligibleRequest(decommissionAttributeMetadata, decommissionRequest);
ensureToBeDecommissionedAttributeWeighedAway(currentState, decommissionAttribute);
ClusterState newState = registerDecommissionAttributeInClusterState(
currentState,
decommissionAttribute,
decommissionRequest.requestID()
);
// add all 'to-be-decommissioned' cluster manager eligible nodes to voting config exclusion
nodeIdsToBeExcluded = filterNodesWithDecommissionAttribute(currentState, decommissionAttribute, true).stream()
.map(DiscoveryNode::getId)
.collect(Collectors.toSet());
logger.info(
"resolved cluster manager eligible nodes [{}] that should be added to voting config exclusion",
nodeIdsToBeExcluded.toString()
);
newState = addVotingConfigExclusionsForNodesToBeDecommissioned(
newState,
nodeIdsToBeExcluded,
TimeValue.timeValueSeconds(120), // TODO - update it with request timeout
maxVotingConfigExclusions
);
logger.debug(
"registering decommission metadata [{}] to execute action",
newState.metadata().decommissionAttributeMetadata().toString()
);
return newState;
}
@Override
public void onFailure(String source, Exception e) {
logger.error(
() -> new ParameterizedMessage(
"failed to start decommission action for attribute [{}]",
decommissionAttribute.toString()
),
e
);
listener.onFailure(e);
}
@Override
public void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState) {
DecommissionAttributeMetadata decommissionAttributeMetadata = newState.metadata().decommissionAttributeMetadata();
assert decommissionAttribute.equals(decommissionAttributeMetadata.decommissionAttribute());
assert decommissionAttributeMetadata.status().equals(DecommissionStatus.INIT);
assert decommissionAttributeMetadata.requestID().equals(decommissionRequest.requestID());
assert newState.getVotingConfigExclusions()
.stream()
.map(CoordinationMetadata.VotingConfigExclusion::getNodeId)
.collect(Collectors.toSet())
.containsAll(nodeIdsToBeExcluded);
logger.debug(
"registered decommission metadata for attribute [{}] with status [{}]",
decommissionAttributeMetadata.decommissionAttribute(),
decommissionAttributeMetadata.status()
);
final ClusterStateObserver observer = new ClusterStateObserver(
clusterService,
TimeValue.timeValueSeconds(120), // TODO - update it with request timeout
logger,
threadPool.getThreadContext()
);
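// satisfied once none of the to-be-excluded nodes remain in the last committed voting configuration
// and an elected cluster manager exists that is not itself being decommissioned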
final Predicate<ClusterState> allNodesRemovedAndAbdicated = clusterState -> {
final Set<String> votingConfigNodeIds = clusterState.getLastCommittedConfiguration().getNodeIds();
return nodeIdsToBeExcluded.stream().noneMatch(votingConfigNodeIds::contains)
&& clusterState.nodes().getClusterManagerNodeId() != null
&& nodeIdsToBeExcluded.contains(clusterState.nodes().getClusterManagerNodeId()) == false;
};
final Listener clusterStateListener = new Listener() {
@Override
public void onNewClusterState(ClusterState state) {
logger.info(
"successfully removed decommissioned cluster manager eligible nodes [{}] from voting config ",
nodeIdsToBeExcluded.toString()
);
if (state.nodes().isLocalNodeElectedClusterManager()) {
if (nodeHasDecommissionedAttribute(clusterService.localNode(), decommissionAttribute)) {
// this is an unexpected state, as after exclusion of nodes having decommission attribute,
// this local node shouldn't have had the decommission attribute. Will send the failure response to the user
String errorMsg =
"unexpected state encountered [local node is to-be-decommissioned leader] while executing decommission request";
logger.error(errorMsg);
// will go ahead and clear the voting config and mark the status as failed
decommissionController.updateMetadataWithDecommissionStatus(
DecommissionStatus.FAILED,
statusUpdateListener()
);
listener.onFailure(new IllegalStateException(errorMsg));
} else {
logger.info("will proceed to drain decommissioned nodes as local node is eligible to process the request");
// we are good here to send the response now as the request is processed by an eligible active leader
// and to-be-decommissioned cluster manager is no more part of Voting Configuration
listener.onResponse(new DecommissionResponse(true));
drainNodesWithDecommissionedAttribute(decommissionRequest);
}
} else {
// explicitly calling listener.onFailure with NotClusterManagerException as the local node is not leader
// this will ensure that the request is retried until the cluster manager times out
logger.info(
"local node is not eligible to process the request, "
+ "throwing NotClusterManagerException to attempt a retry on an eligible node"
);
listener.onFailure(
new NotClusterManagerException(
"node ["
+ transportService.getLocalNode().toString()
+ "] not eligible to execute decommission request. Will retry until timeout."
)
);
}
}
@Override
public void onClusterServiceClose() {
String errorMsg = "cluster service closed while waiting for abdication of to-be-decommissioned leader";
logger.error(errorMsg);
listener.onFailure(new DecommissioningFailedException(decommissionAttribute, errorMsg));
}
@Override
public void onTimeout(TimeValue timeout) {
String errorMsg = "timed out ["
+ timeout.toString()
+ "] while removing to-be-decommissioned cluster manager eligible nodes ["
+ nodeIdsToBeExcluded.toString()
+ "] from voting config";
logger.error(errorMsg);
listener.onFailure(new OpenSearchTimeoutException(errorMsg));
// will go ahead and clear the voting config and mark the status as failed
decommissionController.updateMetadataWithDecommissionStatus(DecommissionStatus.FAILED, statusUpdateListener());
}
};
// the cluster state may already have been processed before this code executes,
// therefore test the predicate first before attaching the listener
if (allNodesRemovedAndAbdicated.test(newState)) {
clusterStateListener.onNewClusterState(newState);
} else {
logger.debug("waiting to abdicate to-be-decommissioned leader");
observer.waitForNextChange(clusterStateListener, allNodesRemovedAndAbdicated); // TODO add request timeout here
}
}
});
}
// TODO - after registering the new status check if any node which is not excluded still present in decommissioned zone. If yes, start
// the action again (retry)
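/**
* Drains the nodes matching the decommission attribute before failing them. If the request is marked no-delay,
* the nodes are failed immediately; otherwise the status is moved to {@link DecommissionStatus#DRAINING} and the
* actual decommission is scheduled after the configured delay timeout.
*
* @param decommissionRequest the decommission request being executed
*/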
void drainNodesWithDecommissionedAttribute(DecommissionRequest decommissionRequest) {
ClusterState state = clusterService.getClusterApplierService().state();
assert state.metadata().decommissionAttributeMetadata().requestID().equals(decommissionRequest.requestID());
Set<DiscoveryNode> decommissionedNodes = filterNodesWithDecommissionAttribute(
state,
decommissionRequest.getDecommissionAttribute(),
false
);
if (decommissionRequest.isNoDelay()) {
// Call to fail the decommission nodes
failDecommissionedNodes(decommissionedNodes, decommissionRequest.getDecommissionAttribute());
} else {
decommissionController.updateMetadataWithDecommissionStatus(DecommissionStatus.DRAINING, new ActionListener<>() {
@Override
public void onResponse(DecommissionStatus status) {
logger.info("updated the decommission status to [{}]", status);
// schedule the nodes decommission after the drain delay timeout
scheduleNodesDecommissionOnTimeout(decommissionedNodes, decommissionRequest.getDelayTimeout());
}
@Override
public void onFailure(Exception e) {
logger.error(
() -> new ParameterizedMessage(
"failed to update decommission status for attribute [{}] to [{}]",
decommissionRequest.getDecommissionAttribute().toString(),
DecommissionStatus.DRAINING
),
e
);
// This decommission state update call will most likely fail as the state update call to 'DRAINING'
// failed. But attempting it anyway as the FAILED update might still pass since it doesn't depend on
// the current state
decommissionController.updateMetadataWithDecommissionStatus(DecommissionStatus.FAILED, statusUpdateListener());
}
});
}
}
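/**
* Schedules the decommission of the given nodes once the draining timeout elapses. The task runs on the
* generic thread pool, logs the active request count on the to-be-decommissioned nodes and then fails them.
*
* @param decommissionedNodes nodes that are to be decommissioned
* @param timeoutForNodeDraining time to wait before failing the nodes
*/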
void scheduleNodesDecommissionOnTimeout(Set<DiscoveryNode> decommissionedNodes, TimeValue timeoutForNodeDraining) {
ClusterState state = clusterService.getClusterApplierService().state();
DecommissionAttributeMetadata decommissionAttributeMetadata = state.metadata().decommissionAttributeMetadata();
if (decommissionAttributeMetadata == null) {
return;
}
assert decommissionAttributeMetadata.status().equals(DecommissionStatus.DRAINING)
: "Unexpected status encountered while decommissioning nodes.";
// This method ensures that, no matter what, we always exit from this function after clearing the voting config exclusion
DecommissionAttribute decommissionAttribute = decommissionAttributeMetadata.decommissionAttribute();
// Wait for the timeout to elapse. Log the active connections before decommissioning the nodes.
transportService.getThreadPool().schedule(() -> {
// Log active connections.
decommissionController.getActiveRequestCountOnDecommissionedNodes(decommissionedNodes);
// Call to fail the decommission nodes
failDecommissionedNodes(decommissionedNodes, decommissionAttribute);
}, timeoutForNodeDraining, ThreadPool.Names.GENERIC);
}
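/**
* Moves the decommission status to {@link DecommissionStatus#IN_PROGRESS}, removes the decommissioned nodes from
* the cluster and finally marks the status as {@link DecommissionStatus#SUCCESSFUL}, or as
* {@link DecommissionStatus#FAILED} on any error, clearing the voting config exclusions as part of the terminal update.
*/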
private void failDecommissionedNodes(Set<DiscoveryNode> decommissionedNodes, DecommissionAttribute decommissionAttribute) {
// Weighing away is complete. We have allowed the nodes to be drained. Let's move decommission status to IN_PROGRESS.
decommissionController.updateMetadataWithDecommissionStatus(DecommissionStatus.IN_PROGRESS, new ActionListener<>() {
@Override
public void onResponse(DecommissionStatus status) {
logger.info("updated the decommission status to [{}]", status);
// execute nodes decommissioning
decommissionController.removeDecommissionedNodes(
decommissionedNodes,
"nodes-decommissioned",
TimeValue.timeValueSeconds(120L),
new ActionListener<Void>() {
@Override
public void onResponse(Void unused) {
// will clear the voting config exclusion and mark the status as successful
decommissionController.updateMetadataWithDecommissionStatus(
DecommissionStatus.SUCCESSFUL,
statusUpdateListener()
);
}
@Override
public void onFailure(Exception e) {
// will go ahead and clear the voting config and mark the status as failed
decommissionController.updateMetadataWithDecommissionStatus(DecommissionStatus.FAILED, statusUpdateListener());
}
}
);
}
@Override
public void onFailure(Exception e) {
logger.error(
() -> new ParameterizedMessage(
"failed to update decommission status for attribute [{}] to [{}]",
decommissionAttribute.toString(),
DecommissionStatus.IN_PROGRESS
),
e
);
// This decommission state update call will most likely fail as the state update call to 'IN_PROGRESS'
// failed. But attempting it anyway as the FAILED update might still pass since it doesn't depend on
// the current state
decommissionController.updateMetadataWithDecommissionStatus(DecommissionStatus.FAILED, statusUpdateListener());
}
});
}
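/**
* Validates that awareness attributes and forced awareness attributes are configured on the cluster and that the
* requested attribute is one of them, throwing {@link DecommissioningFailedException} otherwise.
*/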
private static void validateAwarenessAttribute(
final DecommissionAttribute decommissionAttribute,
List<String> awarenessAttributes,
Map<String, List<String>> forcedAwarenessAttributes
) {
String msg = null;
if (awarenessAttributes == null) {
msg = "awareness attribute not set to the cluster.";
} else if (forcedAwarenessAttributes == null) {
msg = "forced awareness attribute not set to the cluster.";
} else if (awarenessAttributes.contains(decommissionAttribute.attributeName()) == false) {
msg = "invalid awareness attribute requested for decommissioning";
} else if (forcedAwarenessAttributes.containsKey(decommissionAttribute.attributeName()) == false) {
msg = "forced awareness attribute [" + forcedAwarenessAttributes.toString() + "] doesn't have the decommissioning attribute";
}
// we don't need to check for the attribute value's presence in the forced awareness attributes because the weights API ensures
// that weights are set for all discovered routing attributes and forced attributes.
// So, if the weight is not present for the attribute value, it could mean it's a non-routing node (e.g. cluster manager)
// and in that case we are ok to proceed with the decommission. A routing node's attribute value being absent from the forced
// awareness attributes is a problem elsewhere
if (msg != null) {
throw new DecommissioningFailedException(decommissionAttribute, msg);
}
}
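/**
* Ensures that weighted routing weights are configured for the requested attribute and that the to-be-decommissioned
* attribute value, if weighed at all, carries a weight of zero.
*/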
private static void ensureToBeDecommissionedAttributeWeighedAway(ClusterState state, DecommissionAttribute decommissionAttribute) {
WeightedRoutingMetadata weightedRoutingMetadata = state.metadata().weightedRoutingMetadata();
if (weightedRoutingMetadata == null) {
throw new DecommissioningFailedException(
decommissionAttribute,
"no weights are set to the attribute. Please set appropriate weights before triggering decommission action"
);
}
WeightedRouting weightedRouting = weightedRoutingMetadata.getWeightedRouting();
if (weightedRouting.attributeName().equals(decommissionAttribute.attributeName()) == false) {
throw new DecommissioningFailedException(
decommissionAttribute,
"no weights are specified to attribute [" + decommissionAttribute.attributeName() + "]"
);
}
// in case the weight is not set for the attribute value, we know that the attribute value was not part of the discovered routing
// node attributes or the forced awareness attributes, and in that case we are ok if the attribute value's weight is not set. But if
// it is set, its weight has to be zero
Double attributeValueWeight = weightedRouting.weights().get(decommissionAttribute.attributeValue());
if (attributeValueWeight != null && attributeValueWeight.equals(0.0) == false) {
throw new DecommissioningFailedException(
decommissionAttribute,
"weight for decommissioned attribute is expected to be [0.0] but found [" + attributeValueWeight + "]"
);
}
}
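/**
* Checks the currently registered decommission metadata, if any, and rejects the request when another attribute has already
* been decommissioned or is being processed, or when the same attribute is already being processed (unless the previous
* attempt failed or this is an internal retry of the same request).
*/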
private static void ensureEligibleRequest(
DecommissionAttributeMetadata decommissionAttributeMetadata,
DecommissionRequest decommissionRequest
) {
String msg;
DecommissionAttribute requestedDecommissionAttribute = decommissionRequest.getDecommissionAttribute();
if (decommissionAttributeMetadata != null) {
// check if the same attribute is registered and handle it accordingly
if (decommissionAttributeMetadata.decommissionAttribute().equals(requestedDecommissionAttribute)) {
switch (decommissionAttributeMetadata.status()) {
// for INIT - check if it is an eligible internal retry
case INIT:
if (decommissionRequest.requestID().equals(decommissionAttributeMetadata.requestID()) == false) {
throw new DecommissioningFailedException(
requestedDecommissionAttribute,
"same request is already in status [INIT]"
);
}
break;
// for FAILED - we are good to process it again
case FAILED:
break;
case DRAINING:
case IN_PROGRESS:
case SUCCESSFUL:
msg = "same request is already in status [" + decommissionAttributeMetadata.status() + "]";
throw new DecommissioningFailedException(requestedDecommissionAttribute, msg);
default:
throw new IllegalStateException(
"unknown status [" + decommissionAttributeMetadata.status() + "] currently registered in metadata"
);
}
} else {
switch (decommissionAttributeMetadata.status()) {
case SUCCESSFUL:
// one awareness attribute is already decommissioned. We will reject the new request
msg = "one awareness attribute ["
+ decommissionAttributeMetadata.decommissionAttribute().toString()
+ "] already successfully decommissioned, recommission before triggering another decommission";
throw new DecommissioningFailedException(requestedDecommissionAttribute, msg);
case DRAINING:
case IN_PROGRESS:
case INIT:
// it means the decommission has been initiated or is in flight. In that case, the new request will be failed
msg = "an inflight decommission request for attribute ["
+ decommissionAttributeMetadata.decommissionAttribute().toString()
+ "] is in progress, cannot process this request";
throw new DecommissioningFailedException(requestedDecommissionAttribute, msg);
case FAILED:
break;
default:
throw new IllegalStateException(
"unknown status [" + decommissionAttributeMetadata.status() + "] currently registered in metadata"
);
}
}
}
}
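/**
* Listener used for fire-and-forget decommission status updates; it only logs the outcome.
*/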
private ActionListener<DecommissionStatus> statusUpdateListener() {
return new ActionListener<>() {
@Override
public void onResponse(DecommissionStatus status) {
logger.info("updated the decommission status to [{}]", status);
}
@Override
public void onFailure(Exception e) {
logger.error("unexpected failure occurred during decommission status update", e);
}
};
}
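/**
* Starts the recommission action by clearing any voting config exclusions and deleting the decommission attribute
* metadata from the cluster state.
*
* @param listener listener notified with the delete-decommission-state response
*/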
public void startRecommissionAction(final ActionListener<DeleteDecommissionStateResponse> listener) {
/*
* For abandoned requests, we might not really know whether the exclusion list was actually restored,
* and we can end up in cases where, even after recommission, exclusions remain set (which is unexpected).
* By definition, OpenSearch clusters should have no voting configuration exclusions in normal operation:
* once the excluded nodes have stopped, clear the voting configuration exclusions with DELETE /_cluster/voting_config_exclusions.
* Hence it is safe to remove the exclusions, if any. The user should make a conscious choice before decommissioning an awareness attribute.
*/
clusterService.submitStateUpdateTask("delete-decommission-state", new ClusterStateUpdateTask(Priority.URGENT) {
@Override
public ClusterState execute(ClusterState currentState) {
ClusterState newState = clearExclusionsAndGetState(currentState);
logger.info("Deleting the decommission attribute from the cluster state");
newState = deleteDecommissionAttributeInClusterState(newState);
return newState;
}
@Override
public void onFailure(String source, Exception e) {
logger.error(() -> new ParameterizedMessage("failure during recommission action [{}]", source), e);
listener.onFailure(e);
}
@Override
public void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState) {
logger.info("successfully cleared voting config exclusion and decommissioned attribute");
assert newState.metadata().decommissionAttributeMetadata() == null;
assert newState.coordinationMetadata().getVotingConfigExclusions().isEmpty();
listener.onResponse(new DeleteDecommissionStateResponse(true));
}
});
}
}