net.i2p.router.tasks.RouterWatchdog Maven / Gradle / Ivy
package net.i2p.router.tasks;
import net.i2p.data.DataHelper;
import net.i2p.router.Job;
import net.i2p.router.CommSystemFacade.Status;
import net.i2p.router.Router;
import net.i2p.router.RouterContext;
import net.i2p.router.util.EventLog;
import net.i2p.stat.Rate;
import net.i2p.stat.RateStat;
import net.i2p.util.Log;
import net.i2p.util.SystemVersion;
/**
* Periodically check to make sure things haven't gone totally haywire (and if
* they have, restart the JVM)
*
*/
public class RouterWatchdog implements Runnable {
private final Log _log;
private final RouterContext _context;
private int _consecutiveErrors;
private volatile boolean _isRunning;
private long _lastDump;
private static final long MAX_JOB_RUN_LAG = 60*1000;
private static final long MIN_DUMP_INTERVAL= 6*60*60*1000;
public RouterWatchdog(RouterContext ctx) {
_context = ctx;
_log = ctx.logManager().getLog(RouterWatchdog.class);
_isRunning = true;
}
/** @since 0.8.8 */
public void shutdown() {
_isRunning = false;
}
public boolean verifyJobQueueLiveliness() {
long when = _context.jobQueue().getLastJobBegin();
if (when < 0)
return true;
long howLongAgo = _context.clock().now() - when;
if (howLongAgo > MAX_JOB_RUN_LAG) {
Job cur = _context.jobQueue().getLastJob();
if (cur != null) {
if (_log.shouldLog(Log.ERROR))
_log.error("Last job was queued up " + DataHelper.formatDuration(howLongAgo)
+ " ago: " + cur);
return false;
} else {
// no prob, just normal lag
return true;
}
} else {
return true;
}
}
public boolean verifyClientLiveliness() {
return _context.clientManager().verifyClientLiveliness();
}
private boolean shutdownOnHang() {
// prop default false
if (!_context.getBooleanProperty("watchdog.haltOnHang"))
return false;
// Client manager starts complaining after 10 minutes, and we run every minute,
// so this will restart 30 minutes after we lose a lease, if the wrapper is present.
if (_consecutiveErrors >= 20 && SystemVersion.hasWrapper())
return true;
return false;
}
private void dumpStatus() {
if (_log.shouldLog(Log.ERROR)) {
/*
Job cur = _context.jobQueue().getLastJob();
if (cur != null)
_log.error("Most recent job: " + cur);
_log.error("Last job began: "
+ DataHelper.formatDuration(_context.clock().now()-_context.jobQueue().getLastJobBegin())
+ " ago");
_log.error("Last job ended: "
+ DataHelper.formatDuration(_context.clock().now()-_context.jobQueue().getLastJobEnd())
+ " ago");
*/
_log.error("Ready and waiting jobs: " + _context.jobQueue().getReadyCount());
_log.error("Job lag: " + _context.jobQueue().getMaxLag());
_log.error("Participating tunnel count: " + _context.tunnelManager().getParticipatingCount());
RateStat rs = _context.statManager().getRate("transport.sendProcessingTime");
Rate r = null;
if (rs != null)
r = rs.getRate(60*1000);
double processTime = (r != null ? r.getAverageValue() : 0);
_log.error("1 minute send processing time: " + DataHelper.formatDuration((long)processTime));
rs = _context.statManager().getRate("bw.sendBps");
r = null;
if (rs != null)
r = rs.getRate(60*1000);
double bps = (r != null ? r.getAverageValue() : 0);
_log.error("Outbound send rate: " + DataHelper.formatSize((long)bps) + "Bps");
long max = Runtime.getRuntime().maxMemory();
long used = Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory();
_log.error("Memory: " + DataHelper.formatSize(used) + "B / " + DataHelper.formatSize(max) + 'B');
if (_consecutiveErrors == 1) {
_log.log(Log.CRIT, "Router appears hung, or there is severe network congestion. Watchdog starts barking!");
_context.router().eventLog().addEvent(EventLog.WATCHDOG);
// This works on linux...
// It won't on windows, and we can't call i2prouter.bat either, it does something
// completely different...
long now = _context.clock().now();
if (now - _lastDump > MIN_DUMP_INTERVAL) {
_lastDump = now;
ThreadDump.dump(_context, 10);
}
}
}
}
public void run() {
while (_isRunning) {
try { Thread.sleep(60*1000); } catch (InterruptedException ie) {}
monitorRouter();
}
}
public void monitorRouter() {
boolean ok = verifyJobQueueLiveliness();
// If we aren't connected to the network that's why there's nobody to talk to
long netErrors = 0;
if (_context.commSystem().getStatus() == Status.DISCONNECTED) {
netErrors = 10;
} else {
RateStat rs = _context.statManager().getRate("udp.sendException");
if (rs != null) {
Rate r = rs.getRate(60*1000);
if (r != null)
netErrors = r.getLastEventCount();
}
}
ok = ok && (verifyClientLiveliness() || netErrors >= 5);
if (ok) {
_consecutiveErrors = 0;
} else {
_consecutiveErrors++;
dumpStatus();
if (shutdownOnHang()) {
_log.log(Log.CRIT, "Router hung! Restart forced by watchdog!");
try { Thread.sleep(30*1000); } catch (InterruptedException ie) {}
// halt and not system.exit, since some of the shutdown hooks might be misbehaving
Runtime.getRuntime().halt(Router.EXIT_HARD_RESTART);
}
}
}
}