org.swisspush.vertx.cluster.ClusterWatchdog Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of cluster-watchdog Show documentation
Show all versions of cluster-watchdog Show documentation
Checks if all your Hazelcast cluster members are receiving published messages over the bus.
package org.swisspush.vertx.cluster;
import io.vertx.core.AbstractVerticle;
import io.vertx.core.Future;
import io.vertx.core.Handler;
import io.vertx.core.eventbus.EventBus;
import io.vertx.core.eventbus.Message;
import io.vertx.core.json.JsonObject;
import io.vertx.core.logging.Logger;
import io.vertx.core.logging.LoggerFactory;
import java.text.SimpleDateFormat;
import java.util.*;
public class ClusterWatchdog extends AbstractVerticle {
private static final String BROADCAST = "clusterhealthcheck";
private static final String RESPONSE_ADDRESS_PREFIX = "responseAddress-";
private static final String RESPONSE_ADDRESS_KEY = "responseAddress";
private static final String RESULT_ADDRESS_PREFIX = "resultAddress-";
private static final int WATCHDOG_START_DELAY = 2000;
private static final int TIME_TO_WAIT_FOR_RESPONSE = 2000;
private final Logger log = LoggerFactory.getLogger(ClusterWatchdog.class);
private EventBus eb;
private String uniqueId;
private int intervalInMillis;
private boolean useInjectedClusterMembersCount = true;
private int clusterMemberCount;
private Map> healthCheckResponses;
private ClusterWatchdogHttpHandler clusterWatchdogHttpHandler;
@Override
public void start(Future fut) {
eb = vertx.eventBus();
JsonObject config = config();
log.info("started with config: \n" +config.encodePrettily());
// get the interval in seconds to execute the checks
intervalInMillis = config.getInteger("intervalInSec", 30) * 1000;
log.info("ClusterWatchdog interval in sec is: " + intervalInMillis / 1000);
// get the clusterMembers injected over the config, if available
int clusterMemberCountFromConfig = config.getInteger("clusterMemberCount", -1);
if(clusterMemberCountFromConfig == -1) {
useInjectedClusterMembersCount = false;
} else {
clusterMemberCount = clusterMemberCountFromConfig;
}
int resultQueueLength = config.getInteger("resultQueueLength", 100);
log.info("ClusterWatchdog used resultQueueLength: " + resultQueueLength);
int httport = config.getInteger("http.port", 7878);
log.info("ClusterWatchdog used http port: " + httport);
// initalize variables
healthCheckResponses = new HashMap<>();
clusterWatchdogHttpHandler = new ClusterWatchdogHttpHandler(vertx, log, resultQueueLength);
// create a unique ID per verticle to identify it
uniqueId = UUID.randomUUID().toString();
log.info("ClusterWatchdog started cluster check verticle: " + uniqueId);
// the handler for the broadcast event, reads the sender from the event and reply to him
eb.consumer(BROADCAST, new Handler>() {
public void handle(Message event) {
String responseAddress = event.body().getString(RESPONSE_ADDRESS_KEY);
String timestamp = event.body().getString("timestamp");
log.debug("got broadcast, i am: " + uniqueId + ", responseAddress is: " + responseAddress + " timestamp is: " + timestamp);
// respond to the sender
JsonObject responsePayload = new JsonObject();
responsePayload.put("senderId", uniqueId);
responsePayload.put("timestamp", timestamp);
eb.send(responseAddress, responsePayload);
}
});
// the handler for the reply of the broadcast handler, adds the result to the healthCheckResponses
eb.consumer(RESPONSE_ADDRESS_PREFIX + uniqueId, new Handler>() {
public void handle(Message event) {
String senderId = event.body().getString("senderId");
String timestamp = event.body().getString("timestamp");
log.debug("ClusterWatchdog got response, i am: " + uniqueId + ", senderId is: " + senderId);
if(healthCheckResponses.get(timestamp) == null) {
healthCheckResponses.put(timestamp, new ArrayList<>());
}
JsonObject response = new JsonObject();
response.put("senderId", senderId);
healthCheckResponses.get(timestamp).add(response);
}
});
// the handler to add the result from the other members
eb.consumer(RESULT_ADDRESS_PREFIX + uniqueId, new Handler>() {
public void handle(Message watchdogResultJsonObj) {
clusterWatchdogHttpHandler.resultQueue.add(WatchdogResult.fromJson(watchdogResultJsonObj.body()));
}
});
if(intervalInMillis == 0) {
// wait until all verticles are up and running
vertx.setTimer(WATCHDOG_START_DELAY, new ClusterCheckHandler());
}
if(intervalInMillis > 0) {
// wait until all verticles are up and running
vertx.setTimer(WATCHDOG_START_DELAY, event -> vertx.setPeriodic(intervalInMillis, new ClusterCheckHandler()));
}
vertx.createHttpServer().requestHandler(clusterWatchdogHttpHandler).listen(httport, result -> {
if(result.succeeded()){
fut.complete();
} else {
fut.fail(result.cause());
}
});
}
class ClusterCheckHandler implements Handler {
public void handle(Long event) {
JsonObject testpayload = new JsonObject();
testpayload.put(RESPONSE_ADDRESS_KEY, RESPONSE_ADDRESS_PREFIX + uniqueId);
log.debug("ClusterWatchdog send single broadcast healthcheck from: " + uniqueId);
final String timestamp = String.valueOf(System.currentTimeMillis());
testpayload.put("timestamp", timestamp);
// if the cluster
if(! useInjectedClusterMembersCount) {
ClusterInformation clusterInformation = new ClusterInformation();
try {
clusterMemberCount = clusterInformation.getMembers(log).size();
} catch (MoreThanOneHazelcastInstanceException e) {
log.error("ClusterWatchdog got more than one hazelcast instance, we can only handle one hazelcast instance, we abort");
return;
}
}
if(clusterMemberCount == 0) {
log.info("ClusterWatchdog no cluster members found, no watchdog will run");
return;
}
// publish the broadcast event which will us get the response of all the registered handlers
eb.publish(BROADCAST, testpayload);
// give the handlers 2sec to respond
// log an error message in the case if the response counts don't match the cluster member amount
vertx.setTimer(TIME_TO_WAIT_FOR_RESPONSE, event1 -> {
List responses = healthCheckResponses.remove(timestamp);
String time = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date());
WatchdogResult watchdogResult = new WatchdogResult();
watchdogResult.broadcastTimestamp = timestamp;
watchdogResult.time = time;
watchdogResult.verticleId = uniqueId;
watchdogResult.clusterMemberCount = clusterMemberCount;
if(responses == null) {
log.error("ClusterWatchdog found no responses for timestamp: " + timestamp);
watchdogResult.status = ClusterHealthStatus.INCONSISTENT;
watchdogResult.responders = null;
clusterWatchdogHttpHandler.resultQueue.add(watchdogResult);
} else if(clusterMemberCount != responses.size()){
watchdogResult.status = ClusterHealthStatus.INCONSISTENT;
watchdogResult.setResponders(responses);
log.error("ClusterWatchdog known cluster members: " + clusterMemberCount + " responses: " + responses.size());
clusterWatchdogHttpHandler.resultQueue.add(watchdogResult);
// send the result to the other members to have consistency over the cluster in the results
sendResultToOtherMembers(watchdogResult);
} else {
watchdogResult.status = ClusterHealthStatus.CONSISTENT;
watchdogResult.setResponders(responses);
log.debug("ClusterWatchdog all the cluster members (" + responses.size() + ") answered: " + responses.toString());
clusterWatchdogHttpHandler.resultQueue.add(watchdogResult);
// send the result to the other members to have consistency over the cluster in the results
sendResultToOtherMembers(watchdogResult);
}
});
}
private void sendResultToOtherMembers(WatchdogResult watchdogResult) {
for(String member : watchdogResult.responders) {
// only send it to the other members
if(uniqueId.equals(member)) {
continue;
}
eb.send(RESULT_ADDRESS_PREFIX+member, watchdogResult.toJson());
}
}
}
}