org.opensearch.test.disruption.NetworkDisruption Maven / Gradle / Ivy
Show all versions of framework Show documentation
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Modifications Copyright OpenSearch Contributors. See
* GitHub history for details.
*/
package org.opensearch.test.disruption;
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.opensearch.cluster.ClusterState;
import org.opensearch.cluster.NodeConnectionsService;
import org.opensearch.cluster.service.ClusterService;
import org.opensearch.common.unit.TimeValue;
import org.opensearch.common.util.set.Sets;
import org.opensearch.test.InternalTestCluster;
import org.opensearch.test.transport.MockTransportService;
import org.opensearch.transport.ConnectTransportException;
import org.opensearch.transport.TransportService;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Random;
import java.util.Set;
import java.util.concurrent.CountDownLatch;
import java.util.function.BiConsumer;
/**
* Network disruptions are modeled using two components:
* 1) the {@link DisruptedLinks} represents the links in the network that are to be disrupted
* 2) the {@link NetworkLinkDisruptionType} represents the failure mode that is to be applied to the links
*/
public class NetworkDisruption implements ServiceDisruptionScheme {
private static final Logger logger = LogManager.getLogger(NetworkDisruption.class);
private final DisruptedLinks disruptedLinks;
private final NetworkLinkDisruptionType networkLinkDisruptionType;
protected volatile InternalTestCluster cluster;
protected volatile boolean activeDisruption = false;
public NetworkDisruption(DisruptedLinks disruptedLinks, NetworkLinkDisruptionType networkLinkDisruptionType) {
this.disruptedLinks = disruptedLinks;
this.networkLinkDisruptionType = networkLinkDisruptionType;
}
public DisruptedLinks getDisruptedLinks() {
return disruptedLinks;
}
public NetworkLinkDisruptionType getNetworkLinkDisruptionType() {
return networkLinkDisruptionType;
}
@Override
public void applyToCluster(InternalTestCluster cluster) {
this.cluster = cluster;
}
@Override
public void removeFromCluster(InternalTestCluster cluster) {
stopDisrupting();
}
@Override
public void removeAndEnsureHealthy(InternalTestCluster cluster) {
removeFromCluster(cluster);
ensureHealthy(cluster);
}
/**
* ensures the cluster is healthy after the disruption
*/
public void ensureHealthy(InternalTestCluster cluster) {
assert activeDisruption == false;
ensureNodeCount(cluster);
ensureFullyConnectedCluster(cluster);
}
/**
* Ensures that all nodes in the cluster are connected to each other.
*
* Some network disruptions may leave nodes that are not the cluster-manager disconnected from each other.
* {@link org.opensearch.cluster.NodeConnectionsService} will eventually reconnect but it's
* handy to be able to ensure this happens faster
*/
public static void ensureFullyConnectedCluster(InternalTestCluster cluster) {
final String[] nodeNames = cluster.getNodeNames();
final CountDownLatch countDownLatch = new CountDownLatch(nodeNames.length);
for (String node : nodeNames) {
ClusterState stateOnNode = cluster.getInstance(ClusterService.class, node).state();
cluster.getInstance(NodeConnectionsService.class, node).reconnectToNodes(stateOnNode.nodes(), countDownLatch::countDown);
}
try {
countDownLatch.await();
} catch (InterruptedException e) {
throw new AssertionError(e);
}
}
protected void ensureNodeCount(InternalTestCluster cluster) {
cluster.validateClusterFormed();
}
@Override
public synchronized void applyToNode(String node, InternalTestCluster cluster) {
}
@Override
public synchronized void removeFromNode(String node1, InternalTestCluster cluster) {
logger.info("stop disrupting node (disruption type: {}, disrupted links: {})", networkLinkDisruptionType, disruptedLinks);
applyToNodes(new String[] { node1 }, cluster.getNodeNames(), networkLinkDisruptionType::removeDisruption);
applyToNodes(cluster.getNodeNames(), new String[] { node1 }, networkLinkDisruptionType::removeDisruption);
}
@Override
public synchronized void testClusterClosed() {
}
@Override
public synchronized void startDisrupting() {
logger.info("start disrupting (disruption type: {}, disrupted links: {})", networkLinkDisruptionType, disruptedLinks);
applyToNodes(cluster.getNodeNames(), cluster.getNodeNames(), networkLinkDisruptionType::applyDisruption);
activeDisruption = true;
}
@Override
public synchronized void stopDisrupting() {
if (!activeDisruption) {
return;
}
logger.info("stop disrupting (disruption scheme: {}, disrupted links: {})", networkLinkDisruptionType, disruptedLinks);
applyToNodes(cluster.getNodeNames(), cluster.getNodeNames(), networkLinkDisruptionType::removeDisruption);
activeDisruption = false;
}
/**
* Applies action to all disrupted links between two sets of nodes.
*/
private void applyToNodes(String[] nodes1, String[] nodes2, BiConsumer consumer) {
for (String node1 : nodes1) {
if (disruptedLinks.nodes().contains(node1)) {
for (String node2 : nodes2) {
if (disruptedLinks.nodes().contains(node2)) {
if (node1.equals(node2) == false) {
if (disruptedLinks.disrupt(node1, node2)) {
consumer.accept(transport(node1), transport(node2));
}
}
}
}
}
}
}
@Override
public TimeValue expectedTimeToHeal() {
return networkLinkDisruptionType.expectedTimeToHeal();
}
private MockTransportService transport(String node) {
return (MockTransportService) cluster.getInstance(TransportService.class, node);
}
@Override
public String toString() {
return "network disruption (disruption type: " + networkLinkDisruptionType + ", disrupted links: " + disruptedLinks + ")";
}
/**
* Represents a set of nodes with connections between nodes that are to be disrupted
*/
public abstract static class DisruptedLinks {
private final Set nodes;
protected DisruptedLinks(Set... nodeSets) {
Set allNodes = new HashSet<>();
for (Set nodeSet : nodeSets) {
allNodes.addAll(nodeSet);
}
this.nodes = allNodes;
}
/**
* Set of all nodes that can participate in disruptions
*/
public Set nodes() {
return nodes;
}
/**
* Returns true iff network should be disrupted between the two nodes
*/
public abstract boolean disrupt(String node1, String node2);
}
/**
* Creates two partitions with symmetric failures
*/
public static class TwoPartitions extends DisruptedLinks {
protected final Set nodesSideOne;
protected final Set nodesSideTwo;
public TwoPartitions(String node1, String node2) {
this(Collections.singleton(node1), Collections.singleton(node2));
}
public TwoPartitions(Set nodesSideOne, Set nodesSideTwo) {
super(nodesSideOne, nodesSideTwo);
this.nodesSideOne = nodesSideOne;
this.nodesSideTwo = nodesSideTwo;
assert nodesSideOne.isEmpty() == false;
assert nodesSideTwo.isEmpty() == false;
assert Sets.haveEmptyIntersection(nodesSideOne, nodesSideTwo);
}
public static TwoPartitions random(Random random, String... nodes) {
return random(random, Sets.newHashSet(nodes));
}
public static TwoPartitions random(Random random, Set nodes) {
assert nodes.size() >= 2 : "two partitions topology requires at least 2 nodes";
Set nodesSideOne = new HashSet<>();
Set nodesSideTwo = new HashSet<>();
for (String node : nodes) {
if (nodesSideOne.isEmpty()) {
nodesSideOne.add(node);
} else if (nodesSideTwo.isEmpty()) {
nodesSideTwo.add(node);
} else if (random.nextBoolean()) {
nodesSideOne.add(node);
} else {
nodesSideTwo.add(node);
}
}
return new TwoPartitions(nodesSideOne, nodesSideTwo);
}
@Override
public boolean disrupt(String node1, String node2) {
if (nodesSideOne.contains(node1) && nodesSideTwo.contains(node2)) {
return true;
}
if (nodesSideOne.contains(node2) && nodesSideTwo.contains(node1)) {
return true;
}
return false;
}
public Set getNodesSideOne() {
return Collections.unmodifiableSet(nodesSideOne);
}
public Set getNodesSideTwo() {
return Collections.unmodifiableSet(nodesSideTwo);
}
public Collection getMajoritySide() {
if (nodesSideOne.size() >= nodesSideTwo.size()) {
return getNodesSideOne();
} else {
return getNodesSideTwo();
}
}
public Collection getMinoritySide() {
if (nodesSideOne.size() >= nodesSideTwo.size()) {
return getNodesSideTwo();
} else {
return getNodesSideOne();
}
}
@Override
public String toString() {
return "two partitions (partition 1: " + nodesSideOne + " and partition 2: " + nodesSideTwo + ")";
}
}
/**
* Creates two partitions with symmetric failures and a bridge node that can connect to both of the partitions
*/
public static class Bridge extends DisruptedLinks {
private final String bridgeNode;
private final Set nodesSideOne;
private final Set nodesSideTwo;
public Bridge(String bridgeNode, Set nodesSideOne, Set nodesSideTwo) {
super(Collections.singleton(bridgeNode), nodesSideOne, nodesSideTwo);
this.bridgeNode = bridgeNode;
this.nodesSideOne = nodesSideOne;
this.nodesSideTwo = nodesSideTwo;
assert nodesSideOne.isEmpty() == false;
assert nodesSideTwo.isEmpty() == false;
assert Sets.haveEmptyIntersection(nodesSideOne, nodesSideTwo);
assert nodesSideOne.contains(bridgeNode) == false && nodesSideTwo.contains(bridgeNode) == false;
}
public static Bridge random(Random random, String... nodes) {
return random(random, Sets.newHashSet(nodes));
}
public static Bridge random(Random random, Set nodes) {
assert nodes.size() >= 3 : "bridge topology requires at least 3 nodes";
String bridgeNode = RandomPicks.randomFrom(random, nodes);
Set nodesSideOne = new HashSet<>();
Set nodesSideTwo = new HashSet<>();
for (String node : nodes) {
if (node.equals(bridgeNode) == false) {
if (nodesSideOne.isEmpty()) {
nodesSideOne.add(node);
} else if (nodesSideTwo.isEmpty()) {
nodesSideTwo.add(node);
} else if (random.nextBoolean()) {
nodesSideOne.add(node);
} else {
nodesSideTwo.add(node);
}
}
}
return new Bridge(bridgeNode, nodesSideOne, nodesSideTwo);
}
@Override
public boolean disrupt(String node1, String node2) {
if (nodesSideOne.contains(node1) && nodesSideTwo.contains(node2)) {
return true;
}
if (nodesSideOne.contains(node2) && nodesSideTwo.contains(node1)) {
return true;
}
return false;
}
public String getBridgeNode() {
return bridgeNode;
}
public Set getNodesSideOne() {
return nodesSideOne;
}
public Set getNodesSideTwo() {
return nodesSideTwo;
}
public String toString() {
return "bridge partition (super connected node: ["
+ bridgeNode
+ "], partition 1: "
+ nodesSideOne
+ " and partition 2: "
+ nodesSideTwo
+ ")";
}
}
public static class IsolateAllNodes extends DisruptedLinks {
public IsolateAllNodes(Set nodes) {
super(nodes);
}
@Override
public boolean disrupt(String node1, String node2) {
return true;
}
}
/**
* Abstract class representing various types of network disruptions. Instances of this class override the {@link #applyDisruption}
* method to apply their specific disruption type to requests that are send from a source to a target node.
*/
public abstract static class NetworkLinkDisruptionType {
/**
* Applies network disruption for requests send from the node represented by the source transport service to the node represented
* by the target transport service.
*
* @param sourceTransportService source transport service from which requests are sent
* @param targetTransportService target transport service to which requests are sent
*/
public abstract void applyDisruption(MockTransportService sourceTransportService, MockTransportService targetTransportService);
/**
* Removes network disruption that was added by {@link #applyDisruption}.
*
* @param sourceTransportService source transport service from which requests are sent
* @param targetTransportService target transport service to which requests are sent
*/
public void removeDisruption(MockTransportService sourceTransportService, MockTransportService targetTransportService) {
sourceTransportService.clearOutboundRules(targetTransportService);
}
/**
* Returns expected time to heal after disruption has been removed. Defaults to instant healing.
*/
public TimeValue expectedTimeToHeal() {
return TimeValue.timeValueMillis(0);
}
}
/**
* Simulates a network disconnect. Sending a request from source to target node throws a {@link ConnectTransportException}.
*/
public static final NetworkLinkDisruptionType DISCONNECT = new NetworkLinkDisruptionType() {
@Override
public void applyDisruption(MockTransportService sourceTransportService, MockTransportService targetTransportService) {
sourceTransportService.addFailToSendNoConnectRule(targetTransportService);
}
@Override
public String toString() {
return "network disconnects";
}
};
/**
* Simulates an unresponsive target node by dropping requests sent from source to target node.
*/
public static final NetworkLinkDisruptionType UNRESPONSIVE = new NetworkLinkDisruptionType() {
@Override
public void applyDisruption(MockTransportService sourceTransportService, MockTransportService targetTransportService) {
sourceTransportService.addUnresponsiveRule(targetTransportService);
}
@Override
public String toString() {
return "network unresponsive";
}
};
/**
* Simulates slow or congested network. Delivery of requests that are sent from source to target node are delayed by a configurable
* time amount.
*/
public static class NetworkDelay extends NetworkLinkDisruptionType {
public static TimeValue DEFAULT_DELAY_MIN = TimeValue.timeValueSeconds(10);
public static TimeValue DEFAULT_DELAY_MAX = TimeValue.timeValueSeconds(90);
private final TimeValue delay;
/**
* Delays requests by a fixed time value.
*
* @param delay time to delay requests
*/
public NetworkDelay(TimeValue delay) {
this.delay = delay;
}
/**
* Delays requests by a random but fixed time value between {@link #DEFAULT_DELAY_MIN} and {@link #DEFAULT_DELAY_MAX}.
*
* @param random instance to use for randomization of delay
*/
public static NetworkDelay random(Random random) {
return random(random, DEFAULT_DELAY_MIN, DEFAULT_DELAY_MAX);
}
/**
* Delays requests by a random but fixed time value between delayMin and delayMax.
*
* @param random instance to use for randomization of delay
* @param delayMin minimum delay
* @param delayMax maximum delay
*/
public static NetworkDelay random(Random random, TimeValue delayMin, TimeValue delayMax) {
return new NetworkDelay(
TimeValue.timeValueMillis(
delayMin.millis() == delayMax.millis()
? delayMin.millis()
: delayMin.millis() + random.nextInt((int) (delayMax.millis() - delayMin.millis()))
)
);
}
@Override
public void applyDisruption(MockTransportService sourceTransportService, MockTransportService targetTransportService) {
sourceTransportService.addUnresponsiveRule(targetTransportService, delay);
}
@Override
public TimeValue expectedTimeToHeal() {
return delay;
}
@Override
public String toString() {
return "network delays for [" + delay + "]";
}
}
}