// org.archive.crawler.reporting.CrawlStatSnapshot (Heritrix web crawler)
/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.crawler.reporting;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.util.CrawledBytesHistotable;
import org.archive.util.ArchiveUtils;
import org.archive.util.PaddingStringBuffer;
/**
* Frozen snapshot of a variety of crawl statistics. Used for
* obtaining a consistent set of stats and a short log of stats
* for calculating rates.
*
* @author gojomo
*/
public class CrawlStatSnapshot {
    /** Wall-clock time (ms since epoch) at which this snapshot was collected. */
    public long timestamp;
    // NOTE(review): never assigned by collect() in this file; retained for
    // API compatibility — confirm whether external code writes it.
    public long urisFetched;
    /** Total bytes crawled, from StatisticsTracker.crawledBytes. */
    public long bytesProcessed;
    public long discoveredUriCount;
    public long queuedUriCount;
    public long futureUriCount;
    public long finishedUriCount;
    public long downloadedUriCount;
    public long downloadFailures;
    public long downloadDisregards;
    /** Crawl elapsed time (ms) as reported by StatisticsTracker. */
    public long elapsedMilliseconds;
    /** Overall docs/sec averaged over the whole crawl. */
    public double docsPerSecond;
    /** Docs/sec over the interval since the previous snapshot. */
    public double currentDocsPerSecond;
    /** Overall KiB/sec averaged over the whole crawl. */
    public long totalKiBPerSec;
    /** KiB/sec over the interval since the previous snapshot. */
    public long currentKiBPerSec;
    public int busyThreads;
    public float congestionRatio;
    public long deepestUri;
    public long averageDepth;
    public long novelBytes;
    public long novelUriCount;
    public long warcNovelBytes;
    public long warcNovelUriCount;

    /**
     * Collect all relevant snapshot samples, from the given CrawlController
     * and StatisticsTracker (which also provides the previous snapshot
     * for rate-calculations).
     *
     * @param controller source of frontier counts and active-thread count
     * @param stats source of byte tallies, elapsed time, and prior snapshots
     */
    public void collect(CrawlController controller, StatisticsTracker stats) {
        // TODO: reconsider names of these methods, inline?
        downloadedUriCount = controller.getFrontier().succeededFetchCount();
        bytesProcessed = stats.crawledBytes.getTotalBytes();
        timestamp = System.currentTimeMillis();
        novelBytes = stats.crawledBytes.get(CrawledBytesHistotable.NOVEL);
        novelUriCount = stats.crawledBytes.get(CrawledBytesHistotable.NOVELCOUNT);
        warcNovelBytes = stats.crawledBytes.get(CrawledBytesHistotable.WARC_NOVEL_CONTENT_BYTES);
        warcNovelUriCount = stats.crawledBytes.get(CrawledBytesHistotable.WARC_NOVEL_URLS);
        elapsedMilliseconds = stats.getCrawlElapsedTime();
        discoveredUriCount = controller.getFrontier().discoveredUriCount();
        finishedUriCount = controller.getFrontier().finishedUriCount();
        queuedUriCount = controller.getFrontier().queuedUriCount();
        futureUriCount = controller.getFrontier().futureUriCount();
        downloadFailures = controller.getFrontier().failedFetchCount();
        downloadDisregards = controller.getFrontier().disregardedUriCount();
        busyThreads = controller.getActiveToeCount();
        congestionRatio = controller.getFrontier().congestionRatio();
        deepestUri = controller.getFrontier().deepestUri();
        averageDepth = controller.getFrontier().averageDepth();

        // Overall rates. Use the already-sampled elapsedMilliseconds (rather
        // than re-querying stats) so all derived figures agree, and add 1ms
        // so a just-started crawl (elapsed == 0) yields 0 instead of NaN /
        // Infinity. (The original guarded only the KiB rate this way.)
        docsPerSecond = (double) downloadedUriCount /
            ((elapsedMilliseconds + 1) / 1000d);
        totalKiBPerSec = (long) ((bytesProcessed / 1024d) /
            ((elapsedMilliseconds + 1) / 1000d));

        CrawlStatSnapshot lastSnapshot = stats.snapshots.peek();
        if (lastSnapshot == null) {
            // no previous snapshot; unable to calculate current rates
            return;
        }

        // Last-sample-period rates. Clamp the interval to >= 1ms so two
        // snapshots taken within the same millisecond cannot divide by zero.
        long sampleTime = Math.max(1, timestamp - lastSnapshot.timestamp);
        currentDocsPerSecond =
            (double) (downloadedUriCount - lastSnapshot.downloadedUriCount)
                / (sampleTime / 1000d);
        // 1024d (not integer 1024) for consistency with totalKiBPerSec.
        currentKiBPerSec =
            (long) (((bytesProcessed - lastSnapshot.bytesProcessed) / 1024d)
                / (sampleTime / 1000d));
    }

    /**
     * Return one line of current progress-statistics, with fields padded
     * to fixed columns for alignment in the progress log.
     *
     * @return String of stats
     */
    public String getProgressStatisticsLine() {
        return new PaddingStringBuffer()
            .append(ArchiveUtils.getLog14Date(timestamp))
            .raAppend(32, discoveredUriCount)
            .raAppend(44, queuedUriCount)
            .raAppend(57, downloadedUriCount)
            .raAppend(74, ArchiveUtils.
                doubleToString(currentDocsPerSecond, 2) +
                "(" + ArchiveUtils.doubleToString(docsPerSecond, 2) + ")")
            .raAppend(85, currentKiBPerSec + "(" + totalKiBPerSec + ")")
            .raAppend(99, downloadFailures)
            .raAppend(113, busyThreads)
            // used-heap KiB = (total - free) / 1024
            .raAppend(126, (Runtime.getRuntime().totalMemory() -
                Runtime.getRuntime().freeMemory()) / 1024)
            .raAppend(140, Runtime.getRuntime().totalMemory() / 1024)
            .raAppend(153, ArchiveUtils.doubleToString(congestionRatio, 2))
            .raAppend(165, deepestUri)
            .raAppend(177, averageDepth)
            .toString();
    }

    /**
     * Sum of queued, in-progress (busy threads), and downloaded URIs —
     * a rough total of all URIs the crawl has taken responsibility for.
     *
     * @return total URI count
     */
    public long totalCount() {
        return queuedUriCount + busyThreads +
            downloadedUriCount;
    }

    /**
     * This returns the number of completed URIs as a percentage of the total
     * number of URIs encountered (should be inverse to the discovery curve).
     * Returns 0 when nothing has been discovered yet (avoids divide-by-zero).
     *
     * @return The number of completed URIs as a percentage of the total
     * number of URIs encountered
     */
    public int percentOfDiscoveredUrisCompleted() {
        long total = discoveredUriCount;
        if (total == 0) {
            return 0;
        }
        return (int) (100 * finishedUriCount / total);
    }

    /**
     * Return true if this snapshot shows no tangible progress in
     * its URI counts over the supplied snapshot. May be used to
     * suppress unnecessary redundant reporting/checkpointing.
     *
     * @param lastSnapshot previous snapshot to compare against; may be null
     * @return true if this snapshot's stats are essentially same as previous
     */
    public boolean sameProgressAs(CrawlStatSnapshot lastSnapshot) {
        if (lastSnapshot == null) {
            // no baseline to compare against: treat as progress
            return false;
        }
        return (finishedUriCount == lastSnapshot.finishedUriCount)
            && (queuedUriCount == lastSnapshot.queuedUriCount)
            && (downloadDisregards == lastSnapshot.downloadDisregards);
    }
}