All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.archive.crawler.postprocessor.LowDiskPauseProcessor Maven / Gradle / Ivy

The newest version!
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.crawler.postprocessor;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.IOUtils;
import org.archive.crawler.framework.CrawlController;
import org.archive.modules.ProcessResult;
import org.archive.modules.Processor;
import org.archive.modules.CrawlURI;
import org.springframework.beans.factory.annotation.Autowired;

/**
 * Processor module which uses 'df -k', where available and with
 * the expected output format (on Linux), to monitor available 
 * disk space and pause the crawl if free space on  monitored 
 * filesystems falls below certain thresholds.
 * 
 * @deprecated Is highly system dependent. 
 *             Use {@link org.archive.crawler.monitor.DiskSpaceMonitor} instead.
 */
@Deprecated
public class LowDiskPauseProcessor extends Processor {

    @SuppressWarnings("unused")
    private static final long serialVersionUID = 3L;

    /**
     * Logger.
     */
    private static final Logger logger =
        Logger.getLogger(LowDiskPauseProcessor.class.getName());


    protected CrawlController controller;
    public CrawlController getCrawlController() {
        return this.controller;
    }
    @Autowired
    public void setCrawlController(CrawlController controller) {
        this.controller = controller;
    }
    
    /**
     * List of filessystem mounts whose 'available' space should be monitored
     * via 'df' (if available).
     */
    protected List monitorMounts = new ArrayList();
    public List getMonitorMounts() {
        return this.monitorMounts;
    }
    public void setMonitorMounts(List monitorMounts) {
        this.monitorMounts = monitorMounts;
    }

    /**
     * When available space on any monitored mounts falls below this threshold,
     * the crawl will be paused.
     */
    protected int pauseThresholdKb = 500*1024; // 500MB 
    public int getPauseThresholdKb() {
        return this.pauseThresholdKb;
    }
    public void setPauseThresholdKb(int pauseThresholdKb) {
        this.pauseThresholdKb = pauseThresholdKb;
    }
    
    /**
     * Available space via 'df' is rechecked after every increment of this much
     * content (uncompressed) is observed.
     */
    protected int recheckThresholdKb = 200*1024; // 200MB 
    public int getRecheckThresholdKb() {
        return this.recheckThresholdKb;
    }
    public void setRecheckThresholdKb(int recheckThresholdKb) {
        this.recheckThresholdKb = recheckThresholdKb;
    }
    
    protected int contentSinceCheck = 0;
    
    public static final Pattern VALID_DF_OUTPUT = 
        Pattern.compile("(?s)^Filesystem\\s+1K-blocks\\s+Used\\s+Available\\s+Use%\\s+Mounted on\\n.*");
    public static final Pattern AVAILABLE_EXTRACTOR = 
        Pattern.compile("(?m)\\s(\\d+)\\s+\\d+%\\s+(\\S+)$");
    
    public LowDiskPauseProcessor() {
    } 
    
    
    @Override
    protected boolean shouldProcess(CrawlURI curi) {
        return true;
    }

    @Override
    protected void innerProcess(CrawlURI uri) {
        throw new AssertionError();
    }
    
    /**
     * Notes a CrawlURI's content size in its running tally. If the 
     * recheck increment of content has passed through since the last
     * available-space check, checks available space and pauses the 
     * crawl if any monitored mounts are below the configured threshold. 
     * 
     * @param curi CrawlURI to process.
     */
    @Override
    protected ProcessResult innerProcessResult(CrawlURI curi) {
        synchronized (this) {
            contentSinceCheck += curi.getContentSize();
            if (contentSinceCheck/1024 > getRecheckThresholdKb()) {
                ProcessResult r = checkAvailableSpace(curi);
                contentSinceCheck = 0;
                return r;
            } else {
                return ProcessResult.PROCEED;
            }
        }
    }


    /**
     * Probe via 'df' to see if monitored mounts have fallen
     * below the pause available threshold. If so, request a 
     * crawl pause. 
     * @param curi Current context.
     */
    private ProcessResult checkAvailableSpace(CrawlURI curi) {
        try {
            String df = IOUtils.toString(Runtime.getRuntime().exec(
                    "df -k").getInputStream());
            Matcher matcher = VALID_DF_OUTPUT.matcher(df);
            if(!matcher.matches()) {
                logger.severe("'df -k' output unacceptable for low-disk checking");
                return ProcessResult.PROCEED;
            }
            List monitoredMounts = getMonitorMounts();
            matcher = AVAILABLE_EXTRACTOR.matcher(df);
            while (matcher.find()) {
                String mount = matcher.group(2);
                if (monitoredMounts.contains(mount)) {
                    long availKilobytes = Long.parseLong(matcher.group(1));
                    int thresholdKilobytes = getPauseThresholdKb();
                    if (availKilobytes < thresholdKilobytes ) {
                        logger.log(Level.SEVERE, "Low Disk Pause",
                                availKilobytes + "K available on " + mount
                                        + " (below threshold "
                                        + thresholdKilobytes + "K)");
                        controller.requestCrawlPause();
                        return ProcessResult.PROCEED;
                    }
                }
            }
        } catch (IOException e) {
            curi.getNonFatalFailures().add(e);
        }
        return ProcessResult.PROCEED;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy