// org.archive.crawler.postprocessor.LowDiskPauseProcessor Maven / Gradle / Ivy
// The newest version!
/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.crawler.postprocessor;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
import org.archive.crawler.framework.CrawlController;
import org.archive.modules.ProcessResult;
import org.archive.modules.Processor;
import org.archive.modules.CrawlURI;
import org.springframework.beans.factory.annotation.Autowired;
/**
 * Processor module which uses 'df -k', where available and with
 * the expected output format (on Linux), to monitor available
 * disk space and pause the crawl if free space on monitored
 * filesystems falls below certain thresholds.
 *
 * @deprecated Is highly system dependent.
 * Use {@link org.archive.crawler.monitor.DiskSpaceMonitor} instead.
 */
@Deprecated
public class LowDiskPauseProcessor extends Processor {
    @SuppressWarnings("unused")
    private static final long serialVersionUID = 3L;

    /**
     * Logger.
     */
    private static final Logger logger =
        Logger.getLogger(LowDiskPauseProcessor.class.getName());

    /** Controller that is asked to pause the crawl when space runs low. */
    protected CrawlController controller;

    /**
     * @return the controller that receives pause requests
     */
    public CrawlController getCrawlController() {
        return this.controller;
    }

    /**
     * @param controller the controller that should receive pause requests
     */
    @Autowired
    public void setCrawlController(CrawlController controller) {
        this.controller = controller;
    }

    /**
     * List of filesystem mounts whose 'available' space should be monitored
     * via 'df' (if available). Entries are compared against the last
     * column ("Mounted on") of the 'df -k' output.
     */
    protected List<String> monitorMounts = new ArrayList<String>();

    /**
     * @return the mounts currently being monitored
     */
    public List<String> getMonitorMounts() {
        return this.monitorMounts;
    }

    /**
     * @param monitorMounts the mounts to monitor
     */
    public void setMonitorMounts(List<String> monitorMounts) {
        this.monitorMounts = monitorMounts;
    }

    /**
     * When available space on any monitored mounts falls below this threshold,
     * the crawl will be paused.
     */
    protected int pauseThresholdKb = 500*1024; // 500MB

    /**
     * @return the pause threshold, in kilobytes
     */
    public int getPauseThresholdKb() {
        return this.pauseThresholdKb;
    }

    /**
     * @param pauseThresholdKb the pause threshold, in kilobytes
     */
    public void setPauseThresholdKb(int pauseThresholdKb) {
        this.pauseThresholdKb = pauseThresholdKb;
    }

    /**
     * Available space via 'df' is rechecked after every increment of this much
     * content (uncompressed) is observed.
     */
    protected int recheckThresholdKb = 200*1024; // 200MB

    /**
     * @return the recheck increment, in kilobytes
     */
    public int getRecheckThresholdKb() {
        return this.recheckThresholdKb;
    }

    /**
     * @param recheckThresholdKb the recheck increment, in kilobytes
     */
    public void setRecheckThresholdKb(int recheckThresholdKb) {
        this.recheckThresholdKb = recheckThresholdKb;
    }

    /**
     * Running tally of content sizes seen since the last space check.
     * Kept as an {@code int} for compatibility with subclasses; additions
     * saturate at {@link Integer#MAX_VALUE} instead of wrapping around.
     */
    protected int contentSinceCheck = 0;

    /** Matches 'df -k' output that starts with the expected header line. */
    public static final Pattern VALID_DF_OUTPUT =
        Pattern.compile("(?s)^Filesystem\\s+1K-blocks\\s+Used\\s+Available\\s+Use%\\s+Mounted on\\n.*");

    /**
     * Finds, at the end of each line, the 'Available' figure (group 1)
     * followed by a percentage and the mount point (group 2).
     */
    public static final Pattern AVAILABLE_EXTRACTOR =
        Pattern.compile("(?m)\\s(\\d+)\\s+\\d+%\\s+(\\S+)$");

    public LowDiskPauseProcessor() {
    }

    /**
     * Every URI is processed.
     *
     * @param curi CrawlURI under consideration.
     * @return always {@code true}
     */
    @Override
    protected boolean shouldProcess(CrawlURI curi) {
        return true;
    }

    /**
     * Not supported by this class; the work is done in
     * {@link #innerProcessResult(CrawlURI)}.
     *
     * @param uri ignored
     * @throws AssertionError always
     */
    @Override
    protected void innerProcess(CrawlURI uri) {
        throw new AssertionError();
    }

    /**
     * Notes a CrawlURI's content size in its running tally. If the
     * recheck increment of content has passed through since the last
     * available-space check, checks available space and pauses the
     * crawl if any monitored mounts are below the configured threshold.
     *
     * @param curi CrawlURI to process.
     * @return the result of the space check when one is performed,
     *         otherwise {@link ProcessResult#PROCEED}
     */
    @Override
    protected ProcessResult innerProcessResult(CrawlURI curi) {
        synchronized (this) {
            // Sum in long arithmetic: a narrowing '+=' into the int field
            // could wrap negative and postpone the next check.
            long tally = (long) contentSinceCheck + curi.getContentSize();
            if (tally / 1024 > getRecheckThresholdKb()) {
                ProcessResult result = checkAvailableSpace(curi);
                contentSinceCheck = 0;
                return result;
            }
            contentSinceCheck = (int) Math.min(tally, (long) Integer.MAX_VALUE);
            return ProcessResult.PROCEED;
        }
    }

    /**
     * Probe via 'df' to see if monitored mounts have fallen
     * below the pause available threshold. If so, request a
     * crawl pause. An {@link IOException} while running 'df' is recorded
     * in the URI's non-fatal failures rather than propagated.
     *
     * @param curi Current context.
     * @return always {@link ProcessResult#PROCEED}
     */
    private ProcessResult checkAvailableSpace(CrawlURI curi) {
        try {
            String df = readDfOutput();
            if (!VALID_DF_OUTPUT.matcher(df).matches()) {
                logger.severe("'df -k' output unacceptable for low-disk checking");
                return ProcessResult.PROCEED;
            }
            List<String> monitoredMounts = getMonitorMounts();
            if (monitoredMounts == null) {
                // Nothing configured to watch.
                return ProcessResult.PROCEED;
            }
            Matcher matcher = AVAILABLE_EXTRACTOR.matcher(df);
            while (matcher.find()) {
                String mount = matcher.group(2);
                if (!monitoredMounts.contains(mount)) {
                    continue;
                }
                long availKilobytes = Long.parseLong(matcher.group(1));
                int thresholdKilobytes = getPauseThresholdKb();
                if (availKilobytes < thresholdKilobytes) {
                    // The detail text is passed as the log record's
                    // parameter; the message has no {0} placeholder.
                    logger.log(Level.SEVERE, "Low Disk Pause",
                            availKilobytes + "K available on " + mount
                                    + " (below threshold "
                                    + thresholdKilobytes + "K)");
                    controller.requestCrawlPause();
                    return ProcessResult.PROCEED;
                }
            }
        } catch (IOException e) {
            curi.getNonFatalFailures().add(e);
        }
        return ProcessResult.PROCEED;
    }

    /**
     * Runs 'df -k' and returns everything it wrote to standard output.
     * The child's pipes are closed and the process destroyed before
     * returning, whether or not reading succeeded.
     *
     * @return the captured standard output of 'df -k'
     * @throws IOException if the command cannot be started or read
     */
    private static String readDfOutput() throws IOException {
        Process process = new ProcessBuilder("df", "-k").start();
        try {
            return IOUtils.toString(process.getInputStream());
        } finally {
            IOUtils.closeQuietly(process.getInputStream());
            IOUtils.closeQuietly(process.getErrorStream());
            IOUtils.closeQuietly(process.getOutputStream());
            process.destroy();
        }
    }
}