org.opensearch.test.disruption.NetworkDisruption Maven / Gradle / Ivy
Show all versions of framework Show documentation
/*
 * SPDX-License-Identifier: Apache-2.0
 *
 * The OpenSearch Contributors require contributions made to
 * this file be licensed under the Apache-2.0 license or a
 * compatible open source license.
 */

/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*
 * Modifications Copyright OpenSearch Contributors. See
 * GitHub history for details.
 */
package org.opensearch.test.disruption;
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.opensearch.cluster.ClusterState;
import org.opensearch.cluster.NodeConnectionsService;
import org.opensearch.cluster.service.ClusterService;
import org.opensearch.common.unit.TimeValue;
import org.opensearch.common.util.set.Sets;
import org.opensearch.test.InternalTestCluster;
import org.opensearch.test.transport.MockTransportService;
import org.opensearch.transport.ConnectTransportException;
import org.opensearch.transport.TransportService;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Random;
import java.util.Set;
import java.util.concurrent.CountDownLatch;
import java.util.function.BiConsumer;
/**
 * Network disruptions are modeled using two components:
 * 1) the {@link DisruptedLinks} represents the links in the network that are to be disrupted
 * 2) the {@link NetworkLinkDisruptionType} represents the failure mode that is to be applied to the links
 */
public class NetworkDisruption implements ServiceDisruptionScheme {
private static final Logger logger = LogManager.getLogger(NetworkDisruption.class);
private final DisruptedLinks disruptedLinks;
private final NetworkLinkDisruptionType networkLinkDisruptionType;
protected volatile InternalTestCluster cluster;
protected volatile boolean activeDisruption = false;
/**
 * Creates a network disruption from its two components.
 *
 * @param disruptedLinks the links in the network that are to be disrupted
 * @param networkLinkDisruptionType the failure mode that is to be applied to those links
 */
public NetworkDisruption(DisruptedLinks disruptedLinks, NetworkLinkDisruptionType networkLinkDisruptionType) {
this.disruptedLinks = disruptedLinks;
this.networkLinkDisruptionType = networkLinkDisruptionType;
/**
 * Returns the disrupted links this disruption was constructed with.
 */
public DisruptedLinks getDisruptedLinks() {
return disruptedLinks;
/**
 * Returns the failure mode this disruption was constructed with.
 */
public NetworkLinkDisruptionType getNetworkLinkDisruptionType() {
return networkLinkDisruptionType;
/**
 * Remembers the cluster that this disruption operates on.
 * The stored reference is what {@link #startDisrupting()}, {@link #stopDisrupting()} and the
 * transport lookup read later on.
 *
 * @param cluster the test cluster to disrupt
 */
public void applyToCluster(InternalTestCluster cluster) {
this.cluster = cluster;
public void removeFromCluster(InternalTestCluster cluster) {
public void removeAndEnsureHealthy(InternalTestCluster cluster) {
/**
 * Ensures the cluster is healthy after the disruption.
 * Expected to be called only once the disruption is no longer active; the assertion below checks this
 * when assertions are enabled.
 *
 * @param cluster the test cluster to check
 */
public void ensureHealthy(InternalTestCluster cluster) {
assert activeDisruption == false;
* Ensures that all nodes in the cluster are connected to each other.
* Some network disruptions may leave nodes that are not the cluster-manager disconnected from each other.
* {@link org.opensearch.cluster.NodeConnectionsService} will eventually reconnect but it's
* handy to be able to ensure this happens faster
public static void ensureFullyConnectedCluster(InternalTestCluster cluster) {
final String[] nodeNames = cluster.getNodeNames();
final CountDownLatch countDownLatch = new CountDownLatch(nodeNames.length);
for (String node : nodeNames) {
ClusterState stateOnNode = cluster.getInstance(ClusterService.class, node).state();
cluster.getInstance(NodeConnectionsService.class, node).reconnectToNodes(stateOnNode.nodes(), countDownLatch::countDown);
try {
} catch (InterruptedException e) {
throw new AssertionError(e);
protected void ensureNodeCount(InternalTestCluster cluster) {
public synchronized void applyToNode(String node, InternalTestCluster cluster) {
public synchronized void removeFromNode(String node1, InternalTestCluster cluster) {"stop disrupting node (disruption type: {}, disrupted links: {})", networkLinkDisruptionType, disruptedLinks);
applyToNodes(new String[] { node1 }, cluster.getNodeNames(), networkLinkDisruptionType::removeDisruption);
applyToNodes(cluster.getNodeNames(), new String[] { node1 }, networkLinkDisruptionType::removeDisruption);
public synchronized void testClusterClosed() {
public synchronized void startDisrupting() {"start disrupting (disruption type: {}, disrupted links: {})", networkLinkDisruptionType, disruptedLinks);
applyToNodes(cluster.getNodeNames(), cluster.getNodeNames(), networkLinkDisruptionType::applyDisruption);
activeDisruption = true;
public synchronized void stopDisrupting() {
if (!activeDisruption) {
}"stop disrupting (disruption scheme: {}, disrupted links: {})", networkLinkDisruptionType, disruptedLinks);
applyToNodes(cluster.getNodeNames(), cluster.getNodeNames(), networkLinkDisruptionType::removeDisruption);
activeDisruption = false;
* Applies action to all disrupted links between two sets of nodes.
private void applyToNodes(String[] nodes1, String[] nodes2, BiConsumer consumer) {
for (String node1 : nodes1) {
if (disruptedLinks.nodes().contains(node1)) {
for (String node2 : nodes2) {
if (disruptedLinks.nodes().contains(node2)) {
if (node1.equals(node2) == false) {
if (disruptedLinks.disrupt(node1, node2)) {
consumer.accept(transport(node1), transport(node2));
/**
 * Returns the expected time to heal, as reported by the configured disruption type.
 */
public TimeValue expectedTimeToHeal() {
return networkLinkDisruptionType.expectedTimeToHeal();
/**
 * Looks up the {@link TransportService} instance of the given node on the current cluster.
 * The result is cast unconditionally, so the lookup is expected to yield a {@link MockTransportService}.
 *
 * @param node name of the node whose transport service is wanted
 */
private MockTransportService transport(String node) {
return (MockTransportService) cluster.getInstance(TransportService.class, node);
/**
 * Describes this disruption by its disruption type and its disrupted links.
 */
public String toString() {
return "network disruption (disruption type: " + networkLinkDisruptionType + ", disrupted links: " + disruptedLinks + ")";
* Represents a set of nodes with connections between nodes that are to be disrupted
public abstract static class DisruptedLinks {
private final Set nodes;
protected DisruptedLinks(Set... nodeSets) {
Set allNodes = new HashSet<>();
for (Set nodeSet : nodeSets) {
this.nodes = allNodes;
* Set of all nodes that can participate in disruptions
public Set nodes() {
return nodes;
* Returns true iff network should be disrupted between the two nodes
public abstract boolean disrupt(String node1, String node2);
* Creates two partitions with symmetric failures
public static class TwoPartitions extends DisruptedLinks {
protected final Set nodesSideOne;
protected final Set nodesSideTwo;
/**
 * Creates two single-node partitions: one containing only node1 and one containing only node2.
 */
public TwoPartitions(String node1, String node2) {
this(Collections.singleton(node1), Collections.singleton(node2));
/**
 * Creates two partitions from the given node sets.
 * The sets are stored by reference, not copied. When assertions are enabled, both sets must be
 * non-empty and must not share any node.
 *
 * @param nodesSideOne nodes of the first partition
 * @param nodesSideTwo nodes of the second partition
 */
public TwoPartitions(Set nodesSideOne, Set nodesSideTwo) {
super(nodesSideOne, nodesSideTwo);
this.nodesSideOne = nodesSideOne;
this.nodesSideTwo = nodesSideTwo;
assert nodesSideOne.isEmpty() == false;
assert nodesSideTwo.isEmpty() == false;
assert Sets.haveEmptyIntersection(nodesSideOne, nodesSideTwo);
/**
 * Convenience overload that collects the given node names into a set and delegates to the
 * set-based {@code random} factory.
 */
public static TwoPartitions random(Random random, String... nodes) {
return random(random, Sets.newHashSet(nodes));
public static TwoPartitions random(Random random, Set nodes) {
assert nodes.size() >= 2 : "two partitions topology requires at least 2 nodes";
Set nodesSideOne = new HashSet<>();
Set nodesSideTwo = new HashSet<>();
for (String node : nodes) {
if (nodesSideOne.isEmpty()) {
} else if (nodesSideTwo.isEmpty()) {
} else if (random.nextBoolean()) {
} else {
return new TwoPartitions(nodesSideOne, nodesSideTwo);
public boolean disrupt(String node1, String node2) {
if (nodesSideOne.contains(node1) && nodesSideTwo.contains(node2)) {
return true;
if (nodesSideOne.contains(node2) && nodesSideTwo.contains(node1)) {
return true;
return false;
/**
 * Returns an unmodifiable view of the nodes in the first partition.
 */
public Set getNodesSideOne() {
return Collections.unmodifiableSet(nodesSideOne);
/**
 * Returns an unmodifiable view of the nodes in the second partition.
 */
public Set getNodesSideTwo() {
return Collections.unmodifiableSet(nodesSideTwo);
public Collection getMajoritySide() {
if (nodesSideOne.size() >= nodesSideTwo.size()) {
return getNodesSideOne();
} else {
return getNodesSideTwo();
public Collection getMinoritySide() {
if (nodesSideOne.size() >= nodesSideTwo.size()) {
return getNodesSideTwo();
} else {
return getNodesSideOne();
/**
 * Describes the topology by listing the nodes of both partitions.
 */
public String toString() {
return "two partitions (partition 1: " + nodesSideOne + " and partition 2: " + nodesSideTwo + ")";
* Creates two partitions with symmetric failures and a bridge node that can connect to both of the partitions
public static class Bridge extends DisruptedLinks {
private final String bridgeNode;
private final Set nodesSideOne;
private final Set nodesSideTwo;
/**
 * Creates a bridge topology from a bridge node and the two sides it connects.
 * The side sets are stored by reference, not copied. When assertions are enabled, both sides must be
 * non-empty, must not share any node, and neither side may contain the bridge node.
 *
 * @param bridgeNode name of the node that stays connected to both sides
 * @param nodesSideOne nodes of the first side
 * @param nodesSideTwo nodes of the second side
 */
public Bridge(String bridgeNode, Set nodesSideOne, Set nodesSideTwo) {
super(Collections.singleton(bridgeNode), nodesSideOne, nodesSideTwo);
this.bridgeNode = bridgeNode;
this.nodesSideOne = nodesSideOne;
this.nodesSideTwo = nodesSideTwo;
assert nodesSideOne.isEmpty() == false;
assert nodesSideTwo.isEmpty() == false;
assert Sets.haveEmptyIntersection(nodesSideOne, nodesSideTwo);
assert nodesSideOne.contains(bridgeNode) == false && nodesSideTwo.contains(bridgeNode) == false;
/**
 * Convenience overload that collects the given node names into a set and delegates to the
 * set-based {@code random} factory.
 */
public static Bridge random(Random random, String... nodes) {
return random(random, Sets.newHashSet(nodes));
public static Bridge random(Random random, Set nodes) {
assert nodes.size() >= 3 : "bridge topology requires at least 3 nodes";
String bridgeNode = RandomPicks.randomFrom(random, nodes);
Set nodesSideOne = new HashSet<>();
Set nodesSideTwo = new HashSet<>();
for (String node : nodes) {
if (node.equals(bridgeNode) == false) {
if (nodesSideOne.isEmpty()) {
} else if (nodesSideTwo.isEmpty()) {
} else if (random.nextBoolean()) {
} else {
return new Bridge(bridgeNode, nodesSideOne, nodesSideTwo);
public boolean disrupt(String node1, String node2) {
if (nodesSideOne.contains(node1) && nodesSideTwo.contains(node2)) {
return true;
if (nodesSideOne.contains(node2) && nodesSideTwo.contains(node1)) {
return true;
return false;
/**
 * Returns the name of the bridge node given at construction.
 */
public String getBridgeNode() {
return bridgeNode;
public Set getNodesSideOne() {
return nodesSideOne;
public Set getNodesSideTwo() {
return nodesSideTwo;
/**
 * Describes the topology by naming the bridge node and listing the nodes of both sides.
 */
public String toString() {
return "bridge partition (super connected node: ["
+ bridgeNode
+ "], partition 1: "
+ nodesSideOne
+ " and partition 2: "
+ nodesSideTwo
+ ")";
public static class IsolateAllNodes extends DisruptedLinks {
public IsolateAllNodes(Set nodes) {
public boolean disrupt(String node1, String node2) {
return true;
* Abstract class representing various types of network disruptions. Instances of this class override the {@link #applyDisruption}
* method to apply their specific disruption type to requests that are send from a source to a target node.
public abstract static class NetworkLinkDisruptionType {
* Applies network disruption for requests send from the node represented by the source transport service to the node represented
* by the target transport service.
* @param sourceTransportService source transport service from which requests are sent
* @param targetTransportService target transport service to which requests are sent
public abstract void applyDisruption(MockTransportService sourceTransportService, MockTransportService targetTransportService);
* Removes network disruption that was added by {@link #applyDisruption}.
* @param sourceTransportService source transport service from which requests are sent
* @param targetTransportService target transport service to which requests are sent
public void removeDisruption(MockTransportService sourceTransportService, MockTransportService targetTransportService) {
* Returns expected time to heal after disruption has been removed. Defaults to instant healing.
public TimeValue expectedTimeToHeal() {
return TimeValue.timeValueMillis(0);
* Simulates a network disconnect. Sending a request from source to target node throws a {@link ConnectTransportException}.
public static final NetworkLinkDisruptionType DISCONNECT = new NetworkLinkDisruptionType() {
public void applyDisruption(MockTransportService sourceTransportService, MockTransportService targetTransportService) {
public String toString() {
return "network disconnects";
* Simulates an unresponsive target node by dropping requests sent from source to target node.
public static final NetworkLinkDisruptionType UNRESPONSIVE = new NetworkLinkDisruptionType() {
public void applyDisruption(MockTransportService sourceTransportService, MockTransportService targetTransportService) {
public String toString() {
return "network unresponsive";
* Simulates slow or congested network. Delivery of requests that are sent from source to target node are delayed by a configurable
* time amount.
public static class NetworkDelay extends NetworkLinkDisruptionType {
public static TimeValue DEFAULT_DELAY_MIN = TimeValue.timeValueSeconds(10);
public static TimeValue DEFAULT_DELAY_MAX = TimeValue.timeValueSeconds(90);
private final TimeValue delay;
* Delays requests by a fixed time value.
* @param delay time to delay requests
public NetworkDelay(TimeValue delay) {
this.delay = delay;
* Delays requests by a random but fixed time value between {@link #DEFAULT_DELAY_MIN} and {@link #DEFAULT_DELAY_MAX}.
* @param random instance to use for randomization of delay
public static NetworkDelay random(Random random) {
return random(random, DEFAULT_DELAY_MIN, DEFAULT_DELAY_MAX);
* Delays requests by a random but fixed time value between delayMin and delayMax.
* @param random instance to use for randomization of delay
* @param delayMin minimum delay
* @param delayMax maximum delay
public static NetworkDelay random(Random random, TimeValue delayMin, TimeValue delayMax) {
return new NetworkDelay(
delayMin.millis() == delayMax.millis()
? delayMin.millis()
: delayMin.millis() + random.nextInt((int) (delayMax.millis() - delayMin.millis()))
public void applyDisruption(MockTransportService sourceTransportService, MockTransportService targetTransportService) {
sourceTransportService.addUnresponsiveRule(targetTransportService, delay);
public TimeValue expectedTimeToHeal() {
return delay;
public String toString() {
return "network delays for [" + delay + "]";