// org.archive.crawler.reporting.CrawlStatSnapshot (Heritrix web crawler)
/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.crawler.reporting;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.util.CrawledBytesHistotable;
import org.archive.util.ArchiveUtils;
import org.archive.util.PaddingStringBuffer;
/**
* Frozen snapshot of a variety of crawl statistics. Used for
* obtaining a consistent set of stats and a short log of stats
* for calculating rates.
*
* @author gojomo
*/
public class CrawlStatSnapshot {
    /** Wall-clock time (ms since epoch) at which this snapshot was collected. */
    public long timestamp;
    // NOTE(review): never assigned by collect() in this file; retained for
    // API compatibility — confirm whether external code writes it.
    public long urisFetched;
    /** Total bytes crawled, from StatisticsTracker.crawledBytes. */
    public long bytesProcessed;
    public long discoveredUriCount;
    public long queuedUriCount;
    public long futureUriCount;
    public long finishedUriCount;
    public long downloadedUriCount;
    public long downloadFailures;
    public long downloadDisregards;
    /** Crawl elapsed time (ms) as reported by StatisticsTracker. */
    public long elapsedMilliseconds;
    /** Overall docs/sec averaged over the whole crawl. */
    public double docsPerSecond;
    /** Docs/sec over the interval since the previous snapshot. */
    public double currentDocsPerSecond;
    /** Overall KiB/sec averaged over the whole crawl. */
    public long totalKiBPerSec;
    /** KiB/sec over the interval since the previous snapshot. */
    public long currentKiBPerSec;
    public int busyThreads;
    public float congestionRatio;
    public long deepestUri;
    public long averageDepth;
    public long novelBytes;
    public long novelUriCount;
    public long warcNovelBytes;
    public long warcNovelUriCount;

    /**
     * Collect all relevant snapshot samples, from the given CrawlController
     * and StatisticsTracker (which also provides the previous snapshot
     * for rate-calculations).
     *
     * @param controller source of frontier counts and active-thread count
     * @param stats source of byte tallies, elapsed time, and prior snapshots
     */
    public void collect(CrawlController controller, StatisticsTracker stats) {
        // TODO: reconsider names of these methods, inline?
        downloadedUriCount = controller.getFrontier().succeededFetchCount();
        bytesProcessed = stats.crawledBytes.getTotalBytes();
        timestamp = System.currentTimeMillis();
        novelBytes = stats.crawledBytes.get(CrawledBytesHistotable.NOVEL);
        novelUriCount = stats.crawledBytes.get(CrawledBytesHistotable.NOVELCOUNT);
        warcNovelBytes = stats.crawledBytes.get(CrawledBytesHistotable.WARC_NOVEL_CONTENT_BYTES);
        warcNovelUriCount = stats.crawledBytes.get(CrawledBytesHistotable.WARC_NOVEL_URLS);
        elapsedMilliseconds = stats.getCrawlElapsedTime();
        discoveredUriCount = controller.getFrontier().discoveredUriCount();
        finishedUriCount = controller.getFrontier().finishedUriCount();
        queuedUriCount = controller.getFrontier().queuedUriCount();
        futureUriCount = controller.getFrontier().futureUriCount();
        downloadFailures = controller.getFrontier().failedFetchCount();
        downloadDisregards = controller.getFrontier().disregardedUriCount();
        busyThreads = controller.getActiveToeCount();
        congestionRatio = controller.getFrontier().congestionRatio();
        deepestUri = controller.getFrontier().deepestUri();
        averageDepth = controller.getFrontier().averageDepth();

        // Overall rates. Use the already-sampled elapsedMilliseconds (rather
        // than re-querying stats) so all derived figures agree, and add 1ms
        // so a just-started crawl (elapsed == 0) yields 0 instead of NaN /
        // Infinity. (The original guarded only the KiB rate this way.)
        docsPerSecond = (double) downloadedUriCount /
            ((elapsedMilliseconds + 1) / 1000d);
        totalKiBPerSec = (long) ((bytesProcessed / 1024d) /
            ((elapsedMilliseconds + 1) / 1000d));

        CrawlStatSnapshot lastSnapshot = stats.snapshots.peek();
        if (lastSnapshot == null) {
            // no previous snapshot; unable to calculate current rates
            return;
        }

        // Last-sample-period rates. Clamp the interval to >= 1ms so two
        // snapshots taken within the same millisecond cannot divide by zero.
        long sampleTime = Math.max(1, timestamp - lastSnapshot.timestamp);
        currentDocsPerSecond =
            (double) (downloadedUriCount - lastSnapshot.downloadedUriCount)
                / (sampleTime / 1000d);
        // 1024d (not integer 1024) for consistency with totalKiBPerSec.
        currentKiBPerSec =
            (long) (((bytesProcessed - lastSnapshot.bytesProcessed) / 1024d)
                / (sampleTime / 1000d));
    }

    /**
     * Return one line of current progress-statistics, with fields padded
     * to fixed columns for alignment in the progress log.
     *
     * @return String of stats
     */
    public String getProgressStatisticsLine() {
        return new PaddingStringBuffer()
            .append(ArchiveUtils.getLog14Date(timestamp))
            .raAppend(32, discoveredUriCount)
            .raAppend(44, queuedUriCount)
            .raAppend(57, downloadedUriCount)
            .raAppend(74, ArchiveUtils.
                doubleToString(currentDocsPerSecond, 2) +
                "(" + ArchiveUtils.doubleToString(docsPerSecond, 2) + ")")
            .raAppend(85, currentKiBPerSec + "(" + totalKiBPerSec + ")")
            .raAppend(99, downloadFailures)
            .raAppend(113, busyThreads)
            // used-heap KiB = (total - free) / 1024
            .raAppend(126, (Runtime.getRuntime().totalMemory() -
                Runtime.getRuntime().freeMemory()) / 1024)
            .raAppend(140, Runtime.getRuntime().totalMemory() / 1024)
            .raAppend(153, ArchiveUtils.doubleToString(congestionRatio, 2))
            .raAppend(165, deepestUri)
            .raAppend(177, averageDepth)
            .toString();
    }

    /**
     * Sum of queued, in-progress (busy threads), and downloaded URIs —
     * a rough total of all URIs the crawl has taken responsibility for.
     *
     * @return total URI count
     */
    public long totalCount() {
        return queuedUriCount + busyThreads +
            downloadedUriCount;
    }

    /**
     * This returns the number of completed URIs as a percentage of the total
     * number of URIs encountered (should be inverse to the discovery curve).
     * Returns 0 when nothing has been discovered yet (avoids divide-by-zero).
     *
     * @return The number of completed URIs as a percentage of the total
     * number of URIs encountered
     */
    public int percentOfDiscoveredUrisCompleted() {
        long total = discoveredUriCount;
        if (total == 0) {
            return 0;
        }
        return (int) (100 * finishedUriCount / total);
    }

    /**
     * Return true if this snapshot shows no tangible progress in
     * its URI counts over the supplied snapshot. May be used to
     * suppress unnecessary redundant reporting/checkpointing.
     *
     * @param lastSnapshot previous snapshot to compare against; may be null
     * @return true if this snapshot's stats are essentially same as previous
     */
    public boolean sameProgressAs(CrawlStatSnapshot lastSnapshot) {
        if (lastSnapshot == null) {
            // no baseline to compare against: treat as progress
            return false;
        }
        return (finishedUriCount == lastSnapshot.finishedUriCount)
            && (queuedUriCount == lastSnapshot.queuedUriCount)
            && (downloadDisregards == lastSnapshot.downloadDisregards);
    }
}