// org.archive.crawler.monitor.DiskSpaceMonitor
package org.archive.crawler.monitor;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.archive.crawler.event.StatSnapshotEvent;
import org.archive.crawler.framework.CrawlController;
import org.archive.spring.ConfigPath;
import org.archive.spring.ConfigPathConfigurer;
import org.archive.util.ArchiveUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.ApplicationEvent;
import org.springframework.context.ApplicationListener;
/**
 * Monitors the available space on the paths configured. If the available space
 * drops below a specified threshold a crawl pause is requested.
 * <p>
 * Monitoring is done via the {@code java.io.File.getUsableSpace()} method.
 * This method will sometimes fail on network attached storage, returning 0
 * bytes available even if that is not actually the case.
 * <p>
 * Paths that do not resolve to actual filesystem folders or files will not be
 * evaluated (i.e. if {@code java.io.File.exists()} returns {@code false}
 * no further processing is carried out on that File).
 * <p>
 * Paths are checked for available space whenever a {@link StatSnapshotEvent} occurs.
 *
 * @author Kristinn Sigurðsson
 */
public class DiskSpaceMonitor implements ApplicationListener<ApplicationEvent> {
    private static final Logger logger = Logger.getLogger(DiskSpaceMonitor.class.getName());

    /** Number of bytes in one mebibyte; used to convert the MiB threshold to bytes. */
    private static final long BYTES_PER_MIB = 1024L * 1024L;

    /** Explicitly configured filesystem paths to check for usable space. */
    protected List<String> monitorPaths = new ArrayList<String>();

    /** Minimum usable space, in MiB, below which a crawl pause is requested. */
    protected long pauseThresholdMiB = 8192;

    /** Controller used to query crawl activity and to request a pause. */
    protected CrawlController controller;

    /** Source of the config paths that are checked when {@link #monitorConfigPaths} is set. */
    protected ConfigPathConfigurer configPathConfigurer;

    /** Whether paths known to the {@link ConfigPathConfigurer} are checked as well. */
    protected boolean monitorConfigPaths = true;

    /**
     * @param monitorPaths List of filesystem paths that should be monitored for available space.
     *                     A {@code null} list, or {@code null} entries within it, are
     *                     skipped when the paths are checked.
     */
    public void setMonitorPaths(List<String> monitorPaths) {
        this.monitorPaths = monitorPaths;
    }

    /**
     * @return The list of explicitly configured paths, exactly as last set.
     */
    public List<String> getMonitorPaths() {
        return this.monitorPaths;
    }

    /**
     * Set the minimum amount of space that must be available on all monitored paths.
     * If the amount falls below this pause threshold on any path the crawl will be paused.
     *
     * @param pauseThresholdMiB The desired pause threshold value.
     *                          Specified in megabytes (MiB).
     */
    public void setPauseThresholdMiB(long pauseThresholdMiB) {
        this.pauseThresholdMiB = pauseThresholdMiB;
    }

    /**
     * @return The pause threshold in megabytes (MiB); 8192 unless changed.
     */
    public long getPauseThresholdMiB() {
        return this.pauseThresholdMiB;
    }

    /**
     * If enabled, all the paths returned by {@link ConfigPathConfigurer#getAllConfigPaths()}
     * will be monitored in addition to any paths explicitly specified via
     * {@link #setMonitorPaths(List)}.
     * <p>
     * {@code true} by default.
     * <p>
     * Note: This is not guaranteed to contain all paths that Heritrix writes to.
     * It is the responsibility of modules that write to disk to register their activity
     * with the {@link ConfigPathConfigurer} and some may not do so.
     *
     * @param monitorConfigPaths If config paths should be monitored for usable space.
     */
    public void setMonitorConfigPaths(boolean monitorConfigPaths) {
        this.monitorConfigPaths = monitorConfigPaths;
    }

    /**
     * @return {@code true} if config paths are monitored in addition to the explicit paths.
     */
    public boolean getMonitorConfigPaths() {
        return this.monitorConfigPaths;
    }

    /** Autowire access to CrawlController **/
    @Autowired
    public void setCrawlController(CrawlController controller) {
        this.controller = controller;
    }

    public CrawlController getCrawlController() {
        return this.controller;
    }

    /** Autowire access to ConfigPathConfigurer **/
    @Autowired
    public void setConfigPathConfigurer(ConfigPathConfigurer configPathConfigurer) {
        this.configPathConfigurer = configPathConfigurer;
    }

    public ConfigPathConfigurer getConfigPathConfigurer() {
        return this.configPathConfigurer;
    }

    /**
     * Checks available space on {@link StatSnapshotEvent}s. All other events
     * are ignored.
     *
     * @param event The application event being delivered.
     */
    @Override
    public void onApplicationEvent(ApplicationEvent event) {
        if (!(event instanceof StatSnapshotEvent)) {
            return;
        }
        // Check available space every time the statistics tracker
        // updates its sample, by default every 20 sec.
        List<String> explicitPaths = getMonitorPaths();
        if (explicitPaths != null) {
            for (String path : explicitPaths) {
                // new File(null) would throw; a missing entry is simply skipped.
                if (path != null) {
                    checkAvailableSpace(new File(path));
                }
            }
        }
        if (monitorConfigPaths) {
            for (ConfigPath path : configPathConfigurer.getAllConfigPaths().values()) {
                checkAvailableSpace(path.getFile());
            }
        }
    }

    /**
     * Probe via File.getUsableSpace to see if monitored paths have fallen below
     * the pause threshold. If so, request a crawl pause.
     * <p>
     * A pause is only requested, and the condition only logged, while the
     * controller reports itself as active.
     *
     * @param path The filesystem path to check for usable space. A {@code null}
     *             or non-existent path is ignored.
     */
    protected void checkAvailableSpace(File path) {
        if (path == null) {
            // Nothing to probe; avoid a NullPointerException in the event listener.
            return;
        }
        if (!path.exists()) {
            // Paths that can not be resolved will not report accurate
            // available space. Log and ignore.
            logger.fine("Ignoring non-existent path " + path.getAbsolutePath());
            return;
        }

        long availBytes = path.getUsableSpace();
        long thresholdBytes = getPauseThresholdMiB() * BYTES_PER_MIB;

        if (availBytes < thresholdBytes && controller.isActive()) {
            // Enact pause
            controller.requestCrawlPause();

            // Log issue
            String errorMsg = "Low Disk Pause - %d bytes (%s) available on %s, "
                    + "this is below the minimum threshold of %d bytes (%s)";
            logger.log(Level.SEVERE, String.format(errorMsg, availBytes,
                    ArchiveUtils.formatBytesForDisplay(availBytes),
                    path.getAbsolutePath(), thresholdBytes,
                    ArchiveUtils.formatBytesForDisplay(thresholdBytes)));
        }
    }
}