org.archive.modules.fetcher.FetchStats Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of heritrix-modules Show documentation
Show all versions of heritrix-modules Show documentation
This project contains some of the configurable modules used within the
Heritrix application to crawl the web. The modules in this project can
be used in applications other than Heritrix, however.
/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.modules.fetcher;
import java.io.PrintWriter;
import java.io.Serializable;
import java.util.LinkedHashMap;
import java.util.Map;
import org.archive.crawler.util.CrawledBytesHistotable;
import org.archive.modules.CrawlURI;
import org.archive.util.ArchiveUtils;
import org.archive.util.ReportUtils;
import org.archive.util.Reporter;
/**
 * Collector of statistics for a 'subset' of a crawl,
 * such as a server (host:port), host, or frontier group
 * (eg queue).
 *
 * <p>Counters are kept as named entries of the inherited
 * {@link CrawledBytesHistotable}; the completion time of the most recent
 * successful fetch is kept separately in {@link #lastSuccessTime}. Within
 * this class only {@link #tally(CrawlURI, Stage)} is declared
 * {@code synchronized}; the read accessors are not.
 *
 * @author gojomo
 */
public class FetchStats extends CrawledBytesHistotable implements Serializable, FetchStatusCodes, Reporter {
    private static final long serialVersionUID = 2L;

    /** Lifecycle points at which a URI may be tallied. */
    public enum Stage {SCHEDULED, RELOCATED, RETRIED, SUCCEEDED, DISREGARDED, FAILED}

    /** Anything initially scheduled; {@link #getRemaining()} is derived from this. */
    public static final String TOTAL_SCHEDULED = "totalScheduled";
    /** Anything disposed-success (HTTP 2XX response codes, other non-errors). */
    public static final String FETCH_SUCCESSES = "fetchSuccesses";
    /** Anything disposed-failure. */
    public static final String FETCH_FAILURES = "fetchFailures";
    /** Anything disposed-disregard. */
    public static final String FETCH_DISREGARDS = "fetchDisregards";
    /** All positive responses (incl. 3XX, 4XX, 5XX). */
    public static final String FETCH_RESPONSES = "fetchResponses";
    /** All robots-precluded failures. */
    public static final String ROBOTS_DENIALS = "robotsDenials";
    /** Total size of all success responses. */
    public static final String SUCCESS_BYTES = "successBytes";
    /** Total size of all responses. */
    public static final String TOTAL_BYTES = "totalBytes";
    /** Processing attempts resulting in no response (both failures and temp deferrals). */
    public static final String FETCH_NONRESPONSES = "fetchNonResponses";

    /**
     * Counter keys in the order they appear in the short report line; must
     * stay in step with the column names of {@link #shortReportLegend()}.
     */
    private static final String[] REPORT_KEYS = {
        TOTAL_SCHEDULED,
        FETCH_SUCCESSES,
        FETCH_FAILURES,
        FETCH_DISREGARDS,
        FETCH_RESPONSES,
        ROBOTS_DENIALS,
        SUCCESS_BYTES,
        TOTAL_BYTES,
        FETCH_NONRESPONSES,
    };

    /** Implemented by objects that expose a {@link FetchStats} instance. */
    public interface HasFetchStats {
        public FetchStats getSubstats();
    }

    /** Implemented by objects that can be told about a URI reaching a {@link Stage}. */
    public interface CollectsFetchStats {
        public void tally(CrawlURI curi, Stage stage);
    }

    /** Value of {@code curi.getFetchCompletedTime()} from the latest SUCCEEDED tally. */
    protected long lastSuccessTime;

    /**
     * Updates the counters for a URI that has reached the given stage.
     *
     * <ul>
     * <li>SCHEDULED: increments {@link #TOTAL_SCHEDULED}.</li>
     * <li>RETRIED: increments {@link #FETCH_NONRESPONSES} when the fetch
     *     status is zero or negative.</li>
     * <li>SUCCEEDED: increments {@link #FETCH_SUCCESSES} and
     *     {@link #FETCH_RESPONSES}, adds the content size to
     *     {@link #TOTAL_BYTES} and {@link #SUCCESS_BYTES}, and records the
     *     fetch completion time.</li>
     * <li>DISREGARDED: increments {@link #FETCH_DISREGARDS}, plus
     *     {@link #ROBOTS_DENIALS} when the status is
     *     {@code S_ROBOTS_PRECLUDED}.</li>
     * <li>FAILED: increments {@link #FETCH_FAILURES}, and either
     *     {@link #FETCH_NONRESPONSES} (status zero or negative) or
     *     {@link #FETCH_RESPONSES} and {@link #TOTAL_BYTES} (positive status).</li>
     * <li>Any other stage (e.g. RELOCATED): no stage-specific counter changes.</li>
     * </ul>
     *
     * Independently of the stage, a URI whose fetch status is positive is
     * also passed to the inherited {@code accumulate(CrawlURI)}.
     *
     * @param curi the URI being accounted for
     * @param stage the stage the URI has reached
     */
    public synchronized void tally(CrawlURI curi, Stage stage) {
        switch (stage) {
            case SCHEDULED:
                tally(TOTAL_SCHEDULED, 1);
                break;
            case RETRIED:
                if (curi.getFetchStatus() <= 0) {
                    tally(FETCH_NONRESPONSES, 1);
                }
                break;
            case SUCCEEDED:
                tally(FETCH_SUCCESSES, 1);
                tally(FETCH_RESPONSES, 1);
                tally(TOTAL_BYTES, curi.getContentSize());
                tally(SUCCESS_BYTES, curi.getContentSize());
                lastSuccessTime = curi.getFetchCompletedTime();
                break;
            case DISREGARDED:
                tally(FETCH_DISREGARDS, 1);
                if (curi.getFetchStatus() == S_ROBOTS_PRECLUDED) {
                    tally(ROBOTS_DENIALS, 1);
                }
                break;
            case FAILED:
                if (curi.getFetchStatus() <= 0) {
                    tally(FETCH_NONRESPONSES, 1);
                } else {
                    tally(FETCH_RESPONSES, 1);
                    tally(TOTAL_BYTES, curi.getContentSize());
                }
                tally(FETCH_FAILURES, 1);
                break;
            default:
                break;
        }
        // Runs for every stage, not just the terminal ones.
        if (curi.getFetchStatus() > 0) {
            this.accumulate(curi);
        }
    }

    /** @return the {@link #FETCH_SUCCESSES} counter */
    public long getFetchSuccesses() {
        return get(FETCH_SUCCESSES);
    }

    /** @return the {@link #FETCH_FAILURES} counter */
    public long getFetchFailures() {
        return get(FETCH_FAILURES);
    }

    /** @return the {@link #FETCH_RESPONSES} counter */
    public long getFetchResponses() {
        return get(FETCH_RESPONSES);
    }

    /** @return the {@link #SUCCESS_BYTES} counter */
    public long getSuccessBytes() {
        return get(SUCCESS_BYTES);
    }

    /** @return the {@link #TOTAL_BYTES} counter */
    public long getTotalBytes() {
        return get(TOTAL_BYTES);
    }

    /** @return the {@link #FETCH_NONRESPONSES} counter */
    public long getFetchNonResponses() {
        return get(FETCH_NONRESPONSES);
    }

    /** @return the {@link #TOTAL_SCHEDULED} counter */
    public long getTotalScheduled() {
        return get(TOTAL_SCHEDULED);
    }

    /** @return the {@link #FETCH_DISREGARDS} counter */
    public long getFetchDisregards() {
        return get(FETCH_DISREGARDS);
    }

    /** @return the {@link #ROBOTS_DENIALS} counter */
    public long getRobotsDenials() {
        return get(ROBOTS_DENIALS);
    }

    /**
     * @return the scheduled count minus the sum of successes, failures and
     *         disregards
     */
    public long getRemaining() {
        return get(TOTAL_SCHEDULED) - (get(FETCH_SUCCESSES) + get(FETCH_FAILURES) + get(FETCH_DISREGARDS));
    }

    /**
     * @return successes plus failures; disregards are not included in this
     *         figure
     */
    public long getRecordedFinishes() {
        return get(FETCH_SUCCESSES) + get(FETCH_FAILURES);
    }

    /** @return the value stored under the inherited {@code NOVEL} key */
    public long getNovelBytes() {
        return get(NOVEL);
    }

    /** @return the value stored under the inherited {@code NOVELCOUNT} key */
    public long getNovelUrls() {
        return get(NOVELCOUNT);
    }

    /** @return the value stored under the inherited {@code NOTMODIFIED} key */
    public long getNotModifiedBytes() {
        return get(NOTMODIFIED);
    }

    /** @return the value stored under the inherited {@code NOTMODIFIEDCOUNT} key */
    public long getNotModifiedUrls() {
        return get(NOTMODIFIEDCOUNT);
    }

    /** @return the value stored under the inherited {@code DUPLICATE} key */
    public long getDupByHashBytes() {
        return get(DUPLICATE);
    }

    /** @return the value stored under the inherited {@code DUPLICATECOUNT} key */
    public long getDupByHashUrls() {
        return get(DUPLICATECOUNT);
    }

    /** @return the value stored under the inherited {@code OTHERDUPLICATE} key */
    public long getOtherDupBytes() {
        return get(OTHERDUPLICATE);
    }

    /** @return the value stored under the inherited {@code OTHERDUPLICATECOUNT} key */
    public long getOtherDupUrls() {
        return get(OTHERDUPLICATECOUNT);
    }

    /**
     * Writes the legend on its own line, followed by the short report line
     * (which is not newline-terminated).
     *
     * @see org.archive.util.Reporter#reportTo(java.io.PrintWriter)
     */
    @Override // Reporter
    public void reportTo(PrintWriter writer) {
        writer.println(shortReportLegend());
        shortReportLineTo(writer);
    }

    /**
     * @return the space-separated column names matching the values written
     *         by {@link #shortReportLineTo(PrintWriter)}
     */
    @Override
    public String shortReportLegend() {
        return "totalScheduled fetchSuccesses fetchFailures fetchDisregards " +
               "fetchResponses robotsDenials successBytes totalBytes " +
               "fetchNonResponses lastSuccessTime";
    }

    /** @return the result of {@code ReportUtils.shortReportLine(this)} */
    public String shortReportLine() {
        return ReportUtils.shortReportLine(this);
    }

    /**
     * Writes the counters in legend order, each followed by a single space,
     * then the last success time formatted by
     * {@code ArchiveUtils.getLog17Date}. No trailing newline is written.
     *
     * @param writer destination for the report line
     */
    @Override
    public void shortReportLineTo(PrintWriter writer) {
        for (String key : REPORT_KEYS) {
            writer.print(get(key));
            writer.print(" ");
        }
        writer.print(ArchiveUtils.getLog17Date(lastSuccessTime));
    }

    /**
     * @return a new insertion-ordered map holding a copy of every entry of
     *         this table plus a {@code "lastSuccessTime"} entry; changes to
     *         the returned map do not affect this object
     */
    @Override
    public Map<String, Object> shortReportMap() {
        Map<String, Object> map = new LinkedHashMap<String, Object>(this);
        map.put("lastSuccessTime", lastSuccessTime);
        return map;
    }

    /**
     * @return the fetch completion time recorded by the most recent
     *         SUCCEEDED tally (the field's initial value if there has been none)
     */
    public long getLastSuccessTime() {
        return lastSuccessTime;
    }
}