
// com.hazelcast.cluster.impl.ClusterHeartbeatManager Maven / Gradle / Ivy
/*
* Copyright (c) 2008-2016, Hazelcast, Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.hazelcast.cluster.impl;
import com.hazelcast.cluster.impl.operations.HeartbeatOperation;
import com.hazelcast.cluster.impl.operations.MasterConfirmationOperation;
import com.hazelcast.cluster.impl.operations.MemberInfoUpdateOperation;
import com.hazelcast.instance.GroupProperties;
import com.hazelcast.instance.GroupProperty;
import com.hazelcast.instance.MemberImpl;
import com.hazelcast.instance.Node;
import com.hazelcast.instance.NodeState;
import com.hazelcast.internal.metrics.Probe;
import com.hazelcast.logging.ILogger;
import com.hazelcast.nio.Address;
import com.hazelcast.nio.Connection;
import com.hazelcast.spi.ExecutionService;
import com.hazelcast.spi.impl.NodeEngineImpl;
import com.hazelcast.util.Clock;
import com.hazelcast.util.EmptyStatement;
import java.net.ConnectException;
import java.text.SimpleDateFormat;
import java.util.Collection;
import java.util.Date;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.TimeUnit;
import static com.hazelcast.cluster.impl.ClusterServiceImpl.EXECUTOR_NAME;
import static com.hazelcast.cluster.impl.ClusterServiceImpl.createMemberInfoList;
import static java.lang.String.format;
/**
 * ClusterHeartbeatManager manages the heartbeat sending and receiving
 * process of a node.
 * <p>
 * It periodically sends heartbeats to the other nodes and stores a heartbeat timestamp
 * per node whenever a heartbeat is received from another node. If enabled and required, it can also
 * send ping packets (an ICMP ping or an echo packet depending on the environment and settings).
 * <p>
 * If it detects that a member is no longer alive, that member is kicked out of the cluster.
 * <p>
 * Another job of ClusterHeartbeatManager is to send (if not the master node) and track (if master)
 * master-confirmation requests. Each slave node sends a master-confirmation periodically and
 * the master node stores them with timestamps. A slave node which does not send a master-confirmation
 * within the timeout will be kicked out of the cluster by the master node.
 */
public class ClusterHeartbeatManager {
/**
 * Deviation (in milliseconds) between the expected and the observed spacing of two heartbeat runs
 * above which the system clock is considered to have jumped.
 */
private static final long CLOCK_JUMP_THRESHOLD = 10000L;
/** Multiplier applied to the heartbeat interval to derive {@link #pingIntervalMillis}. */
private static final int HEART_BEAT_INTERVAL_FACTOR = 10;
/** Maximum number of reachability attempts made by a single ping task. */
private static final int MAX_PING_RETRY_COUNT = 5;

private final ILogger logger;
private final Node node;
private final NodeEngineImpl nodeEngine;
private final ClusterServiceImpl clusterService;
private final ClusterClockImpl clusterClock;

// The maps are explicitly parameterized: with raw types, get() returns Object and the
// "Long value = map.get(member)" reads elsewhere in this class would not type-check.
/** Cluster time at which the last heartbeat of each member was accepted (or the timestamps were reset). */
private final ConcurrentMap<MemberImpl, Long> heartbeatTimes = new ConcurrentHashMap<MemberImpl, Long>();
/** Cluster time at which the last master confirmation of each member was accepted (or reset). */
private final ConcurrentMap<MemberImpl, Long> masterConfirmationTimes = new ConcurrentHashMap<MemberImpl, Long>();

/** A member silent for longer than this is removed from the cluster. */
private final long maxNoHeartbeatMillis;
/** A member whose last master confirmation is older than this is removed by the master. */
private final long maxNoMasterConfirmationMillis;
/** Delay between two heartbeat rounds; never less than or equal to zero. */
private final long heartbeatIntervalMillis;
/** Silence period after which a member is pinged and its connection is checked. */
private final long pingIntervalMillis;
private final boolean icmpEnabled;
private final int icmpTtl;
private final int icmpTimeoutMillis;

/** System time of the previous clock-drift check; {@code 0} until the first check has run. */
@Probe(name = "lastHeartBeat")
private volatile long lastHeartBeat;
/** Cluster time diff observed at the end of the previous clock-drift check. */
private volatile long lastClusterTimeDiff;
public ClusterHeartbeatManager(Node node, ClusterServiceImpl clusterService) {
this.node = node;
this.clusterService = clusterService;
this.nodeEngine = node.getNodeEngine();
clusterClock = clusterService.getClusterClock();
logger = node.getLogger(getClass());
maxNoHeartbeatMillis = node.groupProperties.getMillis(GroupProperty.MAX_NO_HEARTBEAT_SECONDS);
maxNoMasterConfirmationMillis = node.groupProperties.getMillis(GroupProperty.MAX_NO_MASTER_CONFIRMATION_SECONDS);
heartbeatIntervalMillis = getHeartBeatInterval(node.groupProperties);
pingIntervalMillis = heartbeatIntervalMillis * HEART_BEAT_INTERVAL_FACTOR;
icmpEnabled = node.groupProperties.getBoolean(GroupProperty.ICMP_ENABLED);
icmpTtl = node.groupProperties.getInteger(GroupProperty.ICMP_TTL);
icmpTimeoutMillis = (int) node.groupProperties.getMillis(GroupProperty.ICMP_TIMEOUT);
}
/**
 * Reads the heartbeat interval from the given properties.
 *
 * @param groupProperties the properties to read from
 * @return the configured interval in milliseconds, or one second (in milliseconds)
 *         when the configured value is not positive
 */
private static long getHeartBeatInterval(GroupProperties groupProperties) {
    final long configuredMillis = groupProperties.getMillis(GroupProperty.HEARTBEAT_INTERVAL_SECONDS);
    if (configuredMillis > 0) {
        return configuredMillis;
    }
    return TimeUnit.SECONDS.toMillis(1);
}
/**
 * Schedules the three periodic tasks of this manager on the cluster executor:
 * the heartbeat round, the master confirmation and the member list publication.
 * Non-positive master-confirmation and member-list intervals are replaced by one second.
 */
void init() {
    final ExecutionService scheduler = nodeEngine.getExecutionService();

    // 1) heartbeat round
    final Runnable heartbeatTask = new Runnable() {
        public void run() {
            heartBeat();
        }
    };
    scheduler.scheduleWithFixedDelay(EXECUTOR_NAME, heartbeatTask,
            heartbeatIntervalMillis, heartbeatIntervalMillis, TimeUnit.MILLISECONDS);

    // 2) master confirmation
    final long confirmationSeconds = Math.max(1L,
            node.groupProperties.getSeconds(GroupProperty.MASTER_CONFIRMATION_INTERVAL_SECONDS));
    final Runnable confirmationTask = new Runnable() {
        public void run() {
            sendMasterConfirmation();
        }
    };
    scheduler.scheduleWithFixedDelay(EXECUTOR_NAME, confirmationTask,
            confirmationSeconds, confirmationSeconds, TimeUnit.SECONDS);

    // 3) member list publication
    final long publishSeconds = Math.max(1L,
            node.groupProperties.getSeconds(GroupProperty.MEMBER_LIST_PUBLISH_INTERVAL_SECONDS));
    final Runnable publishTask = new Runnable() {
        public void run() {
            sendMemberListToOthers();
        }
    };
    scheduler.scheduleWithFixedDelay(EXECUTOR_NAME, publishTask,
            publishSeconds, publishSeconds, TimeUnit.SECONDS);
}
/**
 * Records a heartbeat received from the given member.
 * <p>
 * A heartbeat whose timestamp is older than half of the heartbeat timeout is
 * logged and ignored. A heartbeat coming from the master is additionally
 * handed to the cluster clock as the master time.
 *
 * @param member    the sender of the heartbeat; nothing happens when {@code null}
 * @param timestamp the cluster time carried by the heartbeat
 */
public void onHeartbeat(MemberImpl member, long timestamp) {
    if (member == null) {
        return;
    }

    final long receivedAt = clusterClock.getClusterTime();
    if (logger.isFineEnabled()) {
        logger.fine(format("Received heartbeat from %s (now: %s, timestamp: %s)",
                member, new Date(receivedAt), new Date(timestamp)));
    }

    final long age = receivedAt - timestamp;
    if (age > maxNoHeartbeatMillis / 2) {
        logger.warning(format("Ignoring heartbeat from %s since it is expired (now: %s, timestamp: %s)", member,
                new Date(receivedAt), new Date(timestamp)));
        return;
    }

    if (isMaster(member)) {
        clusterClock.setMasterTime(timestamp);
    }
    // the cluster time is read again on purpose: it is fetched after the master time was handed over
    heartbeatTimes.put(member, clusterClock.getClusterTime());
}
/**
 * Records a master confirmation received from the given member.
 * <p>
 * A confirmation whose timestamp is older than half of the master-confirmation
 * timeout is logged and ignored.
 *
 * @param member    the sender of the confirmation; nothing happens when {@code null}
 * @param timestamp the cluster time carried by the confirmation
 */
public void acceptMasterConfirmation(MemberImpl member, long timestamp) {
    if (member == null) {
        return;
    }

    if (logger.isFinestEnabled()) {
        logger.finest("MasterConfirmation has been received from " + member);
    }

    final long receivedAt = clusterClock.getClusterTime();
    final boolean expired = receivedAt - timestamp > maxNoMasterConfirmationMillis / 2;
    if (expired) {
        logger.warning(
                format("Ignoring master confirmation from %s, since it is expired (now: %s, timestamp: %s)",
                        member, new Date(receivedAt), new Date(timestamp)));
        return;
    }

    masterConfirmationTimes.put(member, receivedAt);
}
/**
 * Runs one heartbeat round: checks for system clock jumps and then executes
 * the master or the slave variant of the round, depending on the role of this node.
 * Does nothing while the node has not joined a cluster.
 */
void heartBeat() {
    if (node.joined()) {
        checkClockDrift(heartbeatIntervalMillis);

        final long roundTime = clusterClock.getClusterTime();
        if (node.isMaster()) {
            heartBeatWhenMaster(roundTime);
        } else {
            heartBeatWhenSlave(roundTime);
        }
    }
}
/**
 * Detects abrupt system clock jumps (in either direction) between two consecutive runs
 * and compensates for them.
 * <p>
 * The jump is the difference between the elapsed system time since the previous run
 * and the expected interval. Depending on its magnitude the cluster time diff is
 * adjusted and/or the stored master-confirmation and heartbeat timestamps are reset.
 *
 * @param intervalMillis the expected time between two runs, in milliseconds
 */
private void checkClockDrift(long intervalMillis) {
    final long systemNow = Clock.currentTimeMillis();

    // nothing to compare against on the very first run
    if (lastHeartBeat != 0L) {
        final long jump = systemNow - lastHeartBeat - intervalMillis;
        final long jumpMagnitude = Math.abs(jump);

        if (jumpMagnitude > CLOCK_JUMP_THRESHOLD) {
            final SimpleDateFormat timestampFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS");
            logger.info(format("System clock apparently jumped from %s to %s since last heartbeat (%+d ms)",
                    timestampFormat.format(new Date(lastHeartBeat)), timestampFormat.format(new Date(systemNow)), jump));

            // The cluster clock is only touched when the jump exceeds the threshold.
            // When the diff read now differs significantly from the one remembered from the previous
            // run, it has already been adjusted by a master heartbeat and must not be adjusted again.
            final long diffNow = clusterClock.getClusterTimeDiff();
            final boolean notAdjustedYet = Math.abs(lastClusterTimeDiff - diffNow) < CLOCK_JUMP_THRESHOLD;
            if (notAdjustedYet) {
                // compensate the cluster clock for the jump
                clusterClock.setClusterTimeDiff(diffNow - jump);
            }
        }

        // timestamps stored before a huge jump are no longer comparable; start over
        if (jumpMagnitude >= maxNoMasterConfirmationMillis / 2) {
            logger.warning(format("Resetting master confirmation timestamps because of huge system clock jump!"
                    + " Clock-Jump: %d ms, Master-Confirmation-Timeout: %d ms", jump, maxNoMasterConfirmationMillis));
            resetMemberMasterConfirmations();
        }

        if (jumpMagnitude >= maxNoHeartbeatMillis / 2) {
            logger.warning(format("Resetting heartbeat timestamps because of huge system clock jump!"
                    + " Clock-Jump: %d ms, Heartbeat-Timeout: %d ms", jump, maxNoHeartbeatMillis));
            resetHeartbeats();
        }
    }

    // remember the current state for the next run
    lastClusterTimeDiff = clusterClock.getClusterTimeDiff();
    lastHeartBeat = systemNow;
}
/**
 * Sends a heartbeat to each of the other cluster members.
 * <p>
 * Before sending, checks whether the member has failed to send a heartbeat or a
 * master-confirmation in time (see {@link #maxNoHeartbeatMillis} and
 * {@link #maxNoMasterConfirmationMillis}); such a member is removed from the cluster
 * and receives neither a ping nor a heartbeat in this round.
 * <p>
 * This method is only called on the master member. A failure while handling one member
 * is logged and does not prevent the remaining members from being processed.
 *
 * @param now the cluster time of this heartbeat round
 */
private void heartBeatWhenMaster(long now) {
    // explicitly typed: iterating a raw Collection as MemberImpl does not type-check
    Collection<MemberImpl> members = clusterService.getMemberImpls();
    for (MemberImpl member : members) {
        if (member.localMember()) {
            continue;
        }
        try {
            logIfConnectionToEndpointIsMissing(now, member);
            if (removeMemberIfNotHeartBeating(now, member)) {
                continue;
            }
            if (removeMemberIfMasterConfirmationExpired(now, member)) {
                continue;
            }
            pingMemberIfRequired(now, member);
            sendHeartbeat(member.getAddress());
        } catch (Throwable e) {
            logger.severe(e);
        }
    }
}
/**
 * Removes the given member from the cluster when its last recorded heartbeat is
 * older than {@link #maxNoHeartbeatMillis}.
 *
 * @param now    the cluster time of the current heartbeat round
 * @param member the member to check
 * @return {@code true} when the member has been removed, {@code false} otherwise
 */
private boolean removeMemberIfNotHeartBeating(long now, MemberImpl member) {
    final long lastSeen = getHeartbeatTime(member);
    final long silentFor = now - lastSeen;

    if (silentFor <= maxNoHeartbeatMillis) {
        // still within the timeout; only trace a suspiciously long silence
        if (logger.isFinestEnabled() && silentFor > heartbeatIntervalMillis * HEART_BEAT_INTERVAL_FACTOR) {
            logger.finest(format("Not receiving any heartbeats from %s since %s", member, new Date(lastSeen)));
        }
        return false;
    }

    logger.warning(format("Removing %s because it has not sent any heartbeats for %d ms."
            + " Now: %s, last heartbeat time was %s", member, maxNoHeartbeatMillis,
            new Date(now), new Date(lastSeen)));
    clusterService.removeAddress(member.getAddress());
    return true;
}
/**
 * Removes the given member from the cluster when its last recorded master
 * confirmation is older than {@link #maxNoMasterConfirmationMillis}. A member
 * without any recorded confirmation is treated as having confirmed at time {@code 0}.
 *
 * @param now    the cluster time of the current heartbeat round
 * @param member the member to check
 * @return {@code true} when the member has been removed, {@code false} otherwise
 */
private boolean removeMemberIfMasterConfirmationExpired(long now, MemberImpl member) {
    final Long recorded = masterConfirmationTimes.get(member);
    final long confirmedAt = (recorded != null) ? recorded : 0L;

    final boolean withinTimeout = now - confirmedAt <= maxNoMasterConfirmationMillis;
    if (withinTimeout) {
        return false;
    }

    logger.warning(format("Removing %s because it has not sent any master confirmation for %d ms. "
            + " Last confirmation time was %s", member, maxNoMasterConfirmationMillis, new Date(confirmedAt)));
    clusterService.removeAddress(member.getAddress());
    return true;
}
/**
 * Sends a heartbeat to each of the other cluster members.
 * <p>
 * Before sending, checks whether the master member has failed to send a heartbeat in time
 * (see {@link #maxNoHeartbeatMillis}); if so, the master is removed from the cluster and
 * receives neither a ping nor a heartbeat in this round. Non-master members are not
 * checked for heartbeat timeouts here.
 * <p>
 * This method is called on NON-master members. A failure while handling one member
 * is logged and does not prevent the remaining members from being processed.
 *
 * @param now the cluster time of this heartbeat round
 */
private void heartBeatWhenSlave(long now) {
    // explicitly typed: iterating a raw Collection as MemberImpl does not type-check
    Collection<MemberImpl> members = clusterService.getMemberImpls();
    for (MemberImpl member : members) {
        if (member.localMember()) {
            continue;
        }
        try {
            logIfConnectionToEndpointIsMissing(now, member);
            if (isMaster(member) && removeMemberIfNotHeartBeating(now, member)) {
                continue;
            }
            pingMemberIfRequired(now, member);
            sendHeartbeat(member.getAddress());
        } catch (Throwable e) {
            logger.severe(e);
        }
    }
}
/**
 * Tells whether the given member is the master known to this node.
 *
 * @param member the member to check
 * @return {@code true} when the member's address equals this node's master address
 */
private boolean isMaster(MemberImpl member) {
    final Address memberAddress = member.getAddress();
    return memberAddress.equals(node.getMasterAddress());
}
/**
 * Pings the given member when ICMP is enabled and no heartbeat has been recorded
 * from it for at least {@link #pingIntervalMillis}.
 *
 * @param now    the cluster time of the current heartbeat round
 * @param member the member to ping if necessary
 */
private void pingMemberIfRequired(long now, MemberImpl member) {
    // the heartbeat time is only looked up when ICMP is enabled
    if (icmpEnabled && (now - getHeartbeatTime(member)) >= pingIntervalMillis) {
        ping(member);
    }
}
/**
 * Asynchronously checks, on the system executor, whether the given member is reachable.
 * <p>
 * Up to {@link #MAX_PING_RETRY_COUNT} reachability attempts are made. When none of them
 * succeeds the member's address is removed from the cluster. Any unexpected failure
 * aborts the check without removing the member; it is traced at finest level instead of
 * being dropped silently.
 *
 * @param memberImpl the member to ping
 */
private void ping(final MemberImpl memberImpl) {
    nodeEngine.getExecutionService().execute(ExecutionService.SYSTEM_EXECUTOR, new Runnable() {
        public void run() {
            try {
                Address address = memberImpl.getAddress();
                logger.warning(format("%s will ping %s", node.getThisAddress(), address));
                for (int i = 0; i < MAX_PING_RETRY_COUNT; i++) {
                    try {
                        if (address.getInetAddress().isReachable(null, icmpTtl, icmpTimeoutMillis)) {
                            logger.info(format("%s pinged %s successfully", node.getThisAddress(), address));
                            return;
                        }
                    } catch (ConnectException ignored) {
                        // no route to host, means we cannot connect anymore;
                        // counts as a failed attempt and the loop retries
                        EmptyStatement.ignore(ignored);
                    }
                }
                // host not reachable
                logger.warning(format("%s could not ping %s", node.getThisAddress(), address));
                clusterService.removeAddress(address);
            } catch (Throwable t) {
                // Still best-effort: the failure must not escape the executor task,
                // but leave a trace so that a broken ping can be diagnosed.
                if (logger.isFinestEnabled()) {
                    logger.finest(format("Error while pinging %s -> %s[%s]",
                            memberImpl, t.getClass().getName(), t.getMessage()));
                }
            }
        }
    });
}
/**
 * Sends a heartbeat operation carrying the current cluster time to the given address.
 * A failure to send is only traced at finest level.
 *
 * @param target the address to send the heartbeat to; nothing is sent when {@code null}
 */
private void sendHeartbeat(Address target) {
    if (target != null) {
        try {
            node.nodeEngine.getOperationService().send(new HeartbeatOperation(clusterClock.getClusterTime()), target);
        } catch (Exception sendFailure) {
            if (logger.isFinestEnabled()) {
                final String failureType = sendFailure.getClass().getName();
                logger.finest(format("Error while sending heartbeat -> %s[%s]", failureType, sendFailure.getMessage()));
            }
        }
    }
}
/**
 * Logs a warning when no heartbeat has been recorded from the given member for at least
 * {@link #pingIntervalMillis} and this node has no live connection to it. The connection
 * is looked up via {@code getOrConnect} on the connection manager.
 *
 * @param now    the cluster time of the current heartbeat round
 * @param member the member whose connection is checked
 */
private void logIfConnectionToEndpointIsMissing(long now, MemberImpl member) {
    final long silentFor = now - getHeartbeatTime(member);
    if (silentFor < pingIntervalMillis) {
        return;
    }

    final Connection connection = node.connectionManager.getOrConnect(member.getAddress());
    final boolean connected = connection != null && connection.isAlive();
    if (!connected) {
        logger.warning("This node does not have a connection to " + member);
    }
}
/**
 * Looks up the time of the last recorded heartbeat of the given member.
 *
 * @param member the member to look up
 * @return the recorded heartbeat time, or {@code 0} when none is recorded
 */
private long getHeartbeatTime(MemberImpl member) {
    final Long recorded = heartbeatTimes.get(member);
    if (recorded == null) {
        return 0L;
    }
    return recorded;
}
/**
 * Sends a master confirmation carrying the current cluster time to the master.
 * <p>
 * Nothing is sent when this node has not joined, is shut down, is the master itself,
 * or when the master address or the master member is unknown.
 */
public void sendMasterConfirmation() {
    if (!node.joined()) {
        return;
    }
    if (node.getState() == NodeState.SHUT_DOWN) {
        return;
    }
    if (node.isMaster()) {
        return;
    }

    final Address master = node.getMasterAddress();
    if (master == null) {
        logger.finest("Could not send MasterConfirmation, masterAddress is null!");
        return;
    }

    final MemberImpl masterMember = clusterService.getMember(master);
    if (masterMember == null) {
        logger.finest("Could not send MasterConfirmation, masterMember is null!");
        return;
    }

    if (logger.isFinestEnabled()) {
        logger.finest("Sending MasterConfirmation to " + masterMember);
    }
    nodeEngine.getOperationService().send(new MasterConfirmationOperation(clusterClock.getClusterTime()),
            master);
}
/**
 * Publishes the current member list to all other members.
 * <p>
 * Only the master publishes; on any other node this method does nothing. A single
 * {@link MemberInfoUpdateOperation}, built from the current members and the current
 * cluster time, is sent to every member except the local one.
 */
private void sendMemberListToOthers() {
    if (!node.isMaster()) {
        return;
    }
    // explicitly typed: iterating a raw Collection as MemberImpl does not type-check
    Collection<MemberImpl> members = clusterService.getMemberImpls();
    MemberInfoUpdateOperation op = new MemberInfoUpdateOperation(
            createMemberInfoList(members), clusterClock.getClusterTime(), false);
    for (MemberImpl member : members) {
        if (member.localMember()) {
            continue;
        }
        nodeEngine.getOperationService().send(op, member.getAddress());
    }
}
/**
 * Sets the master-confirmation timestamp of every current member to the current cluster time.
 * <p>
 * Called just before this node becomes the master and when a system clock jump is detected.
 */
void resetMemberMasterConfirmations() {
    final long resetTime = clusterClock.getClusterTime();
    for (MemberImpl current : clusterService.getMemberImpls()) {
        masterConfirmationTimes.put(current, resetTime);
    }
}
/**
 * Sets the heartbeat timestamp of every current member to the current cluster time.
 * <p>
 * Called when a system clock jump is detected.
 */
private void resetHeartbeats() {
    final long resetTime = clusterClock.getClusterTime();
    for (MemberImpl current : clusterService.getMemberImpls()) {
        heartbeatTimes.put(current, resetTime);
    }
}
/**
 * Forgets the master-confirmation and heartbeat timestamps recorded for the given member.
 *
 * @param member the member whose timestamps are discarded
 */
void removeMember(MemberImpl member) {
    this.masterConfirmationTimes.remove(member);
    this.heartbeatTimes.remove(member);
}
/**
 * Discards all recorded master-confirmation and heartbeat timestamps.
 */
void reset() {
    this.masterConfirmationTimes.clear();
    this.heartbeatTimes.clear();
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy