/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.crawler.reporting;
import static org.archive.modules.CoreAttributeConstants.A_SOURCE_TAG;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Date;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.collections.Closure;
import org.archive.bdb.BdbModule;
import org.archive.bdb.DisposableStoredSortedMap;
import org.archive.checkpointing.Checkpoint;
import org.archive.checkpointing.Checkpointable;
import org.archive.crawler.event.CrawlStateEvent;
import org.archive.crawler.event.CrawlURIDispositionEvent;
import org.archive.crawler.event.StatSnapshotEvent;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.Engine;
import org.archive.crawler.util.CrawledBytesHistotable;
import org.archive.modules.CrawlURI;
import org.archive.modules.net.CrawlHost;
import org.archive.modules.net.ServerCache;
import org.archive.modules.seeds.SeedListener;
import org.archive.modules.seeds.SeedModule;
import org.archive.spring.ConfigPath;
import org.archive.util.ArchiveUtils;
import org.archive.util.FileUtils;
import org.archive.util.JSONUtils;
import org.archive.util.MimetypeUtils;
import org.archive.util.ObjectIdentityCache;
import org.archive.util.ObjectIdentityMemCache;
import org.archive.util.PaddingStringBuffer;
import org.archive.util.Supplier;
import org.json.JSONException;
import org.json.JSONObject;
import org.springframework.beans.BeansException;
import org.springframework.beans.factory.BeanNameAware;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.ApplicationContext;
import org.springframework.context.ApplicationContextAware;
import org.springframework.context.ApplicationEvent;
import org.springframework.context.ApplicationListener;
import org.springframework.context.Lifecycle;
import org.xbill.DNS.DClass;
import org.xbill.DNS.Lookup;
import com.sleepycat.je.DatabaseException;
/**
* This is an implementation of the AbstractTracker. It is designed to
* function with the WUI as well as to perform various logging activities.
*
* At the end of each snapshot a line is written to the
* 'progress-statistics.log' file.
*
* The header of that file is as follows:
* <pre>
* [timestamp] [discovered] [queued] [downloaded] [doc/s(avg)] [KB/s(avg)] [dl-failures] [busy-thread] [mem-use-KB]
* </pre>
* First there is a timestamp, accurate down to 1 second.
*
* discovered, queued, downloaded and dl-failures
* are (respectively) the discovered URI count, pending URI count, successfully
* fetched count and failed fetch count from the frontier at the time of the
* snapshot.
*
* KB/s(avg) is the bandwidth usage. We use the total bytes downloaded
* to calculate average bandwidth usage (KB/sec). Since we also note the value
* each time a snapshot is made we can calculate the average bandwidth usage
* during the last snapshot period to gain a "current" rate. The first number
* is the current rate and the average is in parentheses.
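*
* A sketch of the arithmetic (the actual computation lives in
* CrawlStatSnapshot; the names here are illustrative):
* <pre>
* avgKBps     = totalCrawledKB / elapsedSeconds
* currentKBps = (totalCrawledKB - kbAtLastSnapshot) / intervalSeconds
* </pre>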
*
* doc/s(avg) works the same way as KB/s(avg) except it shows the number of
* documents (URIs) downloaded rather than KB.
*
* busy-threads is the total number of ToeThreads that are not available
* (and thus presumably busy processing a URI). This information is extracted
* from the crawl controller.
*
* Finally mem-use-KB is extracted from the run time environment
* (Runtime.getRuntime().totalMemory()).
*
* In addition to the data collected for the above logs, various other data
* is gathered and stored by this tracker:
*
* - Successfully downloaded documents per fetch status code
* - Successfully downloaded documents per document mime type
* - Amount of data per mime type
* - Successfully downloaded documents per host
* - Amount of data per host
* - Disposition of all seeds (written to 'reports.log' at end of crawl)
* - Successfully downloaded documents per host per source
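*
* For example, a report or script with a reference to this tracker could
* read the mime-type tally like so (an illustrative sketch, not part of
* this class):
* <pre>{@code
* for (Map.Entry<String, AtomicLong> e : tracker.getFileDistribution().entrySet()) {
*     System.out.println(e.getKey() + " " + e.getValue().get());
* }
* }</pre>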
*
* @author Parker Thompson
* @author Kristinn Sigurdsson
* @author gojomo
*/
public class StatisticsTracker
implements
ApplicationContextAware,
ApplicationListener,
SeedListener,
Lifecycle,
Runnable,
Checkpointable,
BeanNameAware {
@SuppressWarnings("unused")
private static final long serialVersionUID = 6L;
protected SeedModule seeds;
public SeedModule getSeeds() {
return this.seeds;
}
@Autowired
public void setSeeds(SeedModule seeds) {
this.seeds = seeds;
}
protected BdbModule bdb;
@Autowired
public void setBdbModule(BdbModule bdb) {
this.bdb = bdb;
}
protected ConfigPath reportsDir = new ConfigPath(Engine.REPORTS_DIR_NAME,"${launchId}/reports");
public ConfigPath getReportsDir() {
return reportsDir;
}
public void setReportsDir(ConfigPath reportsDir) {
this.reportsDir = reportsDir;
}
protected ServerCache serverCache;
public ServerCache getServerCache() {
return this.serverCache;
}
@Autowired
public void setServerCache(ServerCache serverCache) {
this.serverCache = serverCache;
}
protected int liveHostReportSize = 20;
public int getLiveHostReportSize() {
return liveHostReportSize;
}
public void setLiveHostReportSize(int liveHostReportSize) {
this.liveHostReportSize = liveHostReportSize;
}
protected ApplicationContext appCtx;
public void setApplicationContext(ApplicationContext appCtx) throws BeansException {
this.appCtx = appCtx;
}
/**
* Messages from the StatisticsTracker.
*/
private final static Logger logger =
Logger.getLogger(StatisticsTracker.class.getName());
/**
* Whether to maintain seed disposition records (expensive in
* crawls with millions of seeds)
*/
protected boolean trackSeeds = true;
public boolean getTrackSeeds() {
return this.trackSeeds;
}
public void setTrackSeeds(boolean trackSeeds) {
this.trackSeeds = trackSeeds;
}
/**
* Whether to maintain hosts-per-source-tag records; very expensive in
* crawls with large numbers of source-tags (seeds) or large crawls
* over many hosts
*/
protected boolean trackSources = true;
public boolean getTrackSources() {
return this.trackSources;
}
public void setTrackSources(boolean trackSources) {
this.trackSources = trackSources;
}
/**
* The interval between writing progress information to log.
*/
protected int intervalSeconds = 20;
public int getIntervalSeconds() {
return this.intervalSeconds;
}
public void setIntervalSeconds(int interval) {
this.intervalSeconds = interval;
}
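// For reference, a sketch of how this bean is typically wired in a
// crawler-beans.cxml job configuration (bean id and values illustrative;
// property names match this class's setters):
//
// <bean id="statisticsTracker"
//       class="org.archive.crawler.reporting.StatisticsTracker">
//   <property name="intervalSeconds" value="20" />
//   <property name="keepSnapshotsCount" value="5" />
// </bean>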
/**
* Number of crawl-stat sample snapshots to keep for calculation
* purposes.
*/
protected int keepSnapshotsCount = 5;
public int getKeepSnapshotsCount() {
return this.keepSnapshotsCount;
}
public void setKeepSnapshotsCount(int count) {
this.keepSnapshotsCount = count;
}
protected CrawlController controller;
public CrawlController getCrawlController() {
return this.controller;
}
@Autowired
public void setCrawlController(CrawlController controller) {
this.controller = controller;
}
/** wall-clock time the crawl started */
protected long crawlStartTime;
/** wall-clock time the crawl ended */
protected long crawlEndTime = -1; // Until crawl ends, this value is -1.
/** wall-clock time of last pause, while pause in progress */
protected long crawlPauseStarted = 0;
/** duration tally of all time spent in paused state */
protected long crawlTotalPausedTime = 0;
/** snapshots of crawl tallies and rates */
protected LinkedList<CrawlStatSnapshot> snapshots = new LinkedList<CrawlStatSnapshot>();
protected ScheduledExecutorService executor = Executors.newSingleThreadScheduledExecutor();
/*
* Cumulative data
*/
/** tally sizes novel, verified (same hash), vouched (not-modified) */
protected CrawledBytesHistotable crawledBytes = new CrawledBytesHistotable();
public CrawledBytesHistotable getCrawledBytes() {
return crawledBytes;
}
// TODO: fortify these against key explosion with bigmaps like other tallies
/** Keep track of the file types we see (mime type -> count) */
protected ConcurrentMap<String, AtomicLong> mimeTypeDistribution
= new ConcurrentHashMap<String, AtomicLong>();
protected ConcurrentMap<String, AtomicLong> mimeTypeBytes
= new ConcurrentHashMap<String, AtomicLong>();
/** Keep track of fetch status codes */
protected ConcurrentMap<String, AtomicLong> statusCodeDistribution
= new ConcurrentHashMap<String, AtomicLong>();
/** Keep track of URL counts per host per seed */
// TODO: restore spill-to-disk, like with processedSeedsRecords
protected ConcurrentHashMap<String, ConcurrentMap<String, AtomicLong>> sourceHostDistribution =
new ConcurrentHashMap<String, ConcurrentMap<String, AtomicLong>>();
/** Keep track of crawled bytes stats per seed */
// TODO: spill-to-disk (requires bdb replacement for Histotable, or some
// other refactoring)
protected ConcurrentHashMap<String, CrawledBytesHistotable> statsBySource =
new ConcurrentHashMap<String, CrawledBytesHistotable>();
/**
* Record of seeds and latest results
*/
protected ObjectIdentityCache<SeedRecord> processedSeedsRecords =
new ObjectIdentityMemCache<SeedRecord>();
protected long seedsTotal = -1;
protected long seedsCrawled = -1;
public StatisticsTracker() {
}
protected List<Report> reports;
public List<Report> getReports() {
// lazy initialization so we don't pointlessly create a bunch of beans
// right before setReports is called
if (reports == null) {
reports = new LinkedList<Report>();
reports.add(new CrawlSummaryReport());
reports.add(new SeedsReport());
reports.add(new HostsReport());
reports.add(new SourceTagsReport());
reports.add(new MimetypesReport());
reports.add(new ResponseCodeReport());
reports.add(new ProcessorsReport());
reports.add(new FrontierSummaryReport());
reports.add(new ToeThreadsReport());
}
return reports;
}
public void setReports(List<Report> reports) {
this.reports = reports;
}
protected boolean isRunning = false;
public boolean isRunning() {
return isRunning;
}
public void stop() {
isRunning = false;
executor.shutdownNow();
progressStatisticsEvent();
dumpReports();
}
@SuppressWarnings("unchecked")
public void start() {
isRunning = true;
boolean isRecover = (recoveryCheckpoint != null);
try {
this.processedSeedsRecords = bdb.getObjectCache("processedSeedsRecords",
isRecover, SeedRecord.class);
if(isRecover) {
JSONObject json = recoveryCheckpoint.loadJson(beanName);
crawlStartTime = json.getLong("crawlStartTime");
crawlEndTime = json.getLong("crawlEndTime");
crawlTotalPausedTime = json.getLong("crawlTotalPausedTime");
crawlPauseStarted = json.getLong("crawlPauseStarted");
tallyCurrentPause();
JSONUtils.putAllAtomicLongs(
mimeTypeDistribution,
json.getJSONObject("mimeTypeDistribution"));
JSONUtils.putAllAtomicLongs(
mimeTypeBytes,
json.getJSONObject("mimeTypeBytes"));
JSONUtils.putAllAtomicLongs(
statusCodeDistribution,
json.getJSONObject("statusCodeDistribution"));
JSONObject shd = json.getJSONObject("sourceHostDistribution");
Iterator<String> keyIter = shd.keys();
for(; keyIter.hasNext();) {
String source = keyIter.next();
ConcurrentHashMap<String, AtomicLong> hostUriCount = new ConcurrentHashMap<String, AtomicLong>();
JSONUtils.putAllAtomicLongs(hostUriCount,shd.getJSONObject(source));
sourceHostDistribution.put(source, hostUriCount);
}
// optional so we can still recover checkpoints from earlier versions of heritrix
JSONObject ss = json.optJSONObject("statsBySource");
if (ss != null) {
keyIter = ss.keys();
for(; keyIter.hasNext();) {
String source = keyIter.next();
CrawledBytesHistotable cb = new CrawledBytesHistotable();
JSONUtils.putAllLongs(cb, ss.getJSONObject(source));
statsBySource.put(source, cb);
}
}
JSONUtils.putAllLongs(
crawledBytes,
json.getJSONObject("crawledBytes"));
}
} catch (DatabaseException e) {
throw new IllegalStateException(e);
} catch (JSONException e) {
throw new IllegalStateException(e);
}
// Log the legend
this.controller.logProgressStatistics(progressStatisticsLegend());
executor.scheduleAtFixedRate(this, 0, getIntervalSeconds(), TimeUnit.SECONDS);
}
/**
* Do activity. Called by the ScheduledExecutorService at intervals
* specified by intervalSeconds.
*/
public void run() {
try {
progressStatisticsEvent();
} catch (Throwable e) {
logger.log(Level.SEVERE, "unexpected exception from progressStatisticsEvent()", e);
}
}
/**
* @return legend for progress-statistics lines/log
*/
public String progressStatisticsLegend() {
return " timestamp" +
" discovered " +
" queued downloaded doc/s(avg) KB/s(avg) " +
" dl-failures busy-thread mem-use-KB heap-size-KB " +
" congestion max-depth avg-depth";
}
public String getProgressStamp() {
return
progressStatisticsLegend()
+ "\n"
+ getSnapshot().getProgressStatisticsLine();
}
/**
* Notify tracker that crawl has begun. Must be called
* outside tracker's own thread, to ensure it is noted
* before other threads start interacting with tracker.
*/
public void noteStart() {
if (this.crawlStartTime == 0) {
// Note the time the crawl starts (only if not already set)
this.crawlStartTime = System.currentTimeMillis();
}
}
/**
* A method for logging current crawler state.
*
* This method will be called by run() at intervals specified by
* intervalSeconds. It is also invoked when pausing or
* stopping a crawl to capture the state at that point. Default behavior is
* a call to {@link CrawlController#logProgressStatistics} so the
* CrawlController can act on the progress-statistics event.
*
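* Snapshots are also published as {@link StatSnapshotEvent}s, so other
* beans may observe them; an illustrative sketch (assumes StatSnapshotEvent
* exposes its snapshot via getSnapshot()):
* <pre>{@code
* public class SnapshotLogger implements ApplicationListener {
*     public void onApplicationEvent(ApplicationEvent event) {
*         if (event instanceof StatSnapshotEvent) {
*             CrawlStatSnapshot snap = ((StatSnapshotEvent) event).getSnapshot();
*             System.out.println(snap.getProgressStatisticsLine());
*         }
*     }
* }
* }</pre>
*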
* For implementations of this method, consider carefully whether it should
* be synchronized in whole or in part.
*/
protected synchronized void progressStatisticsEvent() {
CrawlStatSnapshot snapshot = getSnapshot();
if (this.controller != null) {
this.controller.logProgressStatistics(snapshot.getProgressStatisticsLine());
}
snapshots.addFirst(snapshot);
while(snapshots.size()>getKeepSnapshotsCount()) {
snapshots.removeLast();
}
// publish app event
appCtx.publishEvent(new StatSnapshotEvent(this,snapshot));
// temporary workaround for
// [ 996161 ] Fix DNSJava issues (memory) -- replace with JNDI-DNS?
// http://sourceforge.net/support/tracker.php?aid=996161
Lookup.getDefaultCache(DClass.IN).clearCache();
}
public CrawlStatSnapshot getSnapshot() {
// TODO: take snapshot implementation from a spring prototype?
CrawlStatSnapshot snapshot = new CrawlStatSnapshot();
snapshot.collect(controller,this);
return snapshot;
}
public LinkedList<CrawlStatSnapshot> listSnapshots() {
// not named getSnapshots to avoid autodiscovery as a (invalid) bean-property
return snapshots;
}
public CrawlStatSnapshot getLastSnapshot() {
CrawlStatSnapshot snap = snapshots.peek();
return snap == null ? getSnapshot() : snap;
}
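/**
* Elapsed crawl time in milliseconds, excluding time spent paused. For
* example (illustrative numbers): a crawl started at t=0 that paused from
* t=60s to t=90s reports 70000 ms when queried at t=100s.
*/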
public long getCrawlElapsedTime() {
if (crawlStartTime == 0) {
// if no start time set yet, consider elapsed time zero
return 0;
}
if (crawlPauseStarted != 0) {
// currently paused, calculate time up to last pause
return crawlPauseStarted - crawlTotalPausedTime - crawlStartTime;
}
// not paused, calculate total time to end or (if running) now
return ((crawlEndTime>0)?crawlEndTime:System.currentTimeMillis())
- crawlTotalPausedTime - crawlStartTime;
}
public void crawlPausing(String statusMessage) {
logNote("CRAWL WAITING - " + statusMessage);
}
protected void logNote(final String note) {
this.controller.logProgressStatistics(new PaddingStringBuffer()
.append(ArchiveUtils.getLog14Date(new Date()))
.append(" ")
.append(note)
.toString());
}
public void crawlPaused(String statusMessage) {
crawlPauseStarted = System.currentTimeMillis();
progressStatisticsEvent();
logNote("CRAWL PAUSED - " + statusMessage);
}
public void crawlResuming(String statusMessage) {
tallyCurrentPause();
if (this.crawlStartTime == 0) {
noteStart();
}
logNote("CRAWL RUNNING - " + statusMessage);
}
public void crawlEmpty(String statusMessage) {
logNote("CRAWL EMPTY - " + statusMessage);
}
/**
* For a current pause (if any), add paused time to total and reset
*/
protected void tallyCurrentPause() {
if (this.crawlPauseStarted > 0) {
// Ok, we managed to actually pause before resuming.
this.crawlTotalPausedTime
+= (System.currentTimeMillis() - this.crawlPauseStarted);
}
this.crawlPauseStarted = 0;
}
public void crawlEnding(String sExitMessage) {
logNote("CRAWL ENDING - " + sExitMessage);
}
public void crawlEnded(String sExitMessage) {
crawlEndTime = System.currentTimeMillis();
logNote("CRAWL ENDED - " + sExitMessage);
}
/**
* Returns how long the current crawl has been running *including*
* time paused (contrast with getCrawlElapsedTime()).
*
* @return The length of time - in msec - that this crawl has been running.
*/
public long getCrawlDuration() {
return ((crawlEndTime>0)?crawlEndTime:System.currentTimeMillis())
- crawlStartTime;
}
/** Returns a Map that contains information about distributions of
* encountered mime types. Key/value pairs represent
* mime type -> count.
*
* Note: All the values are wrapped with a {@link AtomicLong AtomicLong}
* @return mimeTypeDistribution
*/
public Map<String, AtomicLong> getFileDistribution() {
return mimeTypeDistribution;
}
/**
* Increment a counter for a key in a given map. Used for various
* aggregate data.
*
* @param map The ConcurrentMap holding the tallies
* @param key The key for the counter to be incremented; if it does not
* exist it will be added (set to 1). If null it will
* increment the counter "unknown".
*/
protected static void incrementMapCount(ConcurrentMap<String, AtomicLong> map,
String key) {
incrementMapCount(map,key,1);
}
/**
* Increment a counter for a key in a given map by an arbitrary amount.
* Used for various aggregate data. The increment amount can be negative.
*
*
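* For example (illustrative): {@code incrementMapCount(mimeTypeBytes, "text/html", 2048)}
* adds 2048 to the byte tally for "text/html".
*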
* @param map
* The map holding the tallies
* @param key
* The key for the counter to be incremented; if it does not exist
* it will be added (set equal to increment).
* If null it will increment the counter "unknown".
* @param increment
* The amount by which to increment the counter for the key.
*/
protected static void incrementMapCount(ConcurrentMap<String, AtomicLong> map,
String key, long increment) {
if (key == null) {
key = "unknown";
}
AtomicLong lw = map.get(key);
if(lw == null) {
lw = new AtomicLong(0);
// putIfAbsent resolves creation races: if another thread won, tally there
AtomicLong prevVal = map.putIfAbsent(key, lw);
if(prevVal != null) {
lw = prevVal;
}
}
lw.addAndGet(increment);
}
/**
* Sort the entries of the given Map in descending order by their
* values, which must be longs wrapped with AtomicLong.
*
* Elements are sorted by value from largest to smallest. Equal values are
* sorted by their keys. The returned map is a StoredSortedMap, and
* thus may include duplicate keys.
*
* If the passed-in map requires access to be synchronized, the caller
* should ensure this synchronization.
*
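* The returned map is backed by a temporary BDB database; callers should
* release it when finished, e.g. (sketch) {@code sortedMap.dispose()}.
*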
* @param mapOfAtomicLongValues
* Assumes values are wrapped with AtomicLong.
* @return a sorted map containing the same elements as the input map.
*/
public DisposableStoredSortedMap<Long, String> getReverseSortedCopy(
final Map<String, AtomicLong> mapOfAtomicLongValues) {
DisposableStoredSortedMap<Long, String> sortedMap =
bdb.getStoredMap(
null,
Long.class,
String.class,
true,
false);
for(String k : mapOfAtomicLongValues.keySet()) {
// negate values so ascending key order iterates largest-count-first
sortedMap.put(-mapOfAtomicLongValues.get(k).longValue(), k);
}
return sortedMap;
}
/**
* Return a map representing the distribution of status codes for
* successfully fetched curis, where key ->
* val represents (string)code -> (long)count.
*
* Note: All the values are wrapped with a
* {@link AtomicLong AtomicLong}
*
* @return statusCodeDistribution
*/
public Map<String, AtomicLong> getStatusCodeDistribution() {
return statusCodeDistribution;
}
/**
* Returns the time (in millisec) when a URI belonging to a given host was
* last finished processing.
*
* @param host The host to look up time of last completed URI.
* @return Returns the time (in millisec) when a URI belonging to a given
* host was last finished processing. If no URI has been completed for host
* -1 will be returned.
*/
public long getHostLastFinished(String host){
return serverCache.getHostFor(host).getSubstats().getLastSuccessTime();
}
/**
* Returns the accumulated number of bytes downloaded from a given host.
* @param host name of the host
* @return the accumulated number of bytes downloaded from a given host
*/
public long getBytesPerHost(String host){
return serverCache.getHostFor(host).getSubstats().getTotalBytes();
}
/**
* Returns the accumulated number of bytes from files of a given file type.
* @param filetype Filetype to check.
* @return the accumulated number of bytes from files of a given mime type
*/
public long getBytesPerFileType(String filetype){
return getReportValue(mimeTypeBytes, filetype);
}
/**
* Get the total number of ToeThreads (sleeping and active)
*
* @return The total number of ToeThreads
*/
public int threadCount() {
return this.controller != null? controller.getToeCount(): 0;
}
public String crawledBytesSummary() {
return crawledBytes.summary();
}
/**
* If the curi is a seed, we update the processedSeeds cache.
*
* @param curi The CrawlURI that may be a seed.
* @param disposition The disposition of the CrawlURI.
*/
protected void handleSeed(final CrawlURI curi, final String disposition) {
if(getTrackSeeds()) {
if(curi.isSeed()){
SeedRecord sr = processedSeedsRecords.getOrUse(
curi.getURI(),
new Supplier<SeedRecord>() {
public SeedRecord get() {
return new SeedRecord(curi, disposition);
}});
sr.updateWith(curi,disposition);
}
} // else ignore
}
public void crawledURISuccessful(CrawlURI curi) {
handleSeed(curi,"Seed successfully crawled");
// save crawled bytes tally
crawledBytes.accumulate(curi);
// Save status codes
incrementMapCount(statusCodeDistribution,
Integer.toString(curi.getFetchStatus()));
// Save mime types
String mime = MimetypeUtils.truncate(curi.getContentType());
incrementMapCount(mimeTypeDistribution, mime);
incrementMapCount(mimeTypeBytes, mime, curi.getContentSize());
ServerCache sc = serverCache;
if (getTrackSources() && curi.getData().containsKey(A_SOURCE_TAG)) {
saveSourceStats(curi.getSourceTag(),
sc.getHostFor(curi.getUURI()).getHostName());
tallySourceStats(curi);
}
}
protected void saveSourceStats(String source, String hostname) {
ConcurrentMap<String, AtomicLong> hostUriCount = sourceHostDistribution.get(source);
if(hostUriCount == null) {
hostUriCount = new ConcurrentHashMap<String, AtomicLong>();
ConcurrentMap<String, AtomicLong> prevVal = sourceHostDistribution.putIfAbsent(source, hostUriCount);
if (prevVal!=null) {
hostUriCount = prevVal;
}
}
incrementMapCount(hostUriCount, hostname);
}
protected void tallySourceStats(CrawlURI curi) {
String source = curi.getSourceTag();
CrawledBytesHistotable sourceStats = statsBySource.get(source);
if (sourceStats == null) {
sourceStats = new CrawledBytesHistotable();
// use putIfAbsent so a concurrent first tally for this source isn't lost
CrawledBytesHistotable prevVal = statsBySource.putIfAbsent(source, sourceStats);
if (prevVal != null) {
sourceStats = prevVal;
}
}
sourceStats.accumulate(curi);
}
public void crawledURINeedRetry(CrawlURI curi) {
handleSeed(curi,"Failed to crawl seed, will retry");
}
public void crawledURIDisregard(CrawlURI curi) {
handleSeed(curi,"Seed was disregarded");
}
public void crawledURIFailure(CrawlURI curi) {
handleSeed(curi,"Failed to crawl seed");
}
/**
* Get a seed iterator for the job being monitored. Only reports
* known seeds from processedSeedsRecords -- but as a SeedListener,
* that should be complete.
*
* Note: This iterator will iterate over a list of strings not
* UURIs like the Scope seed iterator. The strings are equal to the URIs'
* getURIString() values.
* @return the seed iterator
*/
public Iterator<String> getSeedsIterator() {
return processedSeedsRecords.keySet().iterator();
}
public DisposableStoredSortedMap<Integer, SeedRecord> calcSeedRecordsSortedByStatusCode() {
Iterator<String> i = getSeedsIterator();
DisposableStoredSortedMap<Integer, SeedRecord> sortedMap =
bdb.getStoredMap(
null,
Integer.class,
SeedRecord.class,
true,
false);
while (i.hasNext()) {
String seed = i.next();
SeedRecord sr = processedSeedsRecords.get(seed);
if(sr==null) {
sr = new SeedRecord(seed,"Seed has not been processed");
// no need to retain synthesized record
}
sortedMap.put(sr.sortShiftStatusCode(), sr);
}
return sortedMap;
}
/**
* Return a copy of the hosts distribution in reverse-sorted (largest first)
* order.
*
* @return SortedMap of hosts distribution
*/
public DisposableStoredSortedMap<Long, String> getReverseSortedHostCounts(
Map<String, AtomicLong> hostCounts) {
synchronized(hostCounts){
return getReverseSortedCopy(hostCounts);
}
}
/**
* Return a copy of the hosts distribution in reverse-sorted
* (largest first) order.
* @return SortedMap of hosts distribution
*/
public DisposableStoredSortedMap<Long, String> calcReverseSortedHostsDistribution() {
final DisposableStoredSortedMap<Long, String> sortedMap =
bdb.getStoredMap(
null,
Long.class,
String.class,
true,
false);
serverCache.forAllHostsDo(new Closure() {
@Override
public void execute(Object hostObj) {
CrawlHost host = (CrawlHost) hostObj;
sortedMap.put(-host.getSubstats().getFetchSuccesses(), host.getHostName());
}
});
return sortedMap;
}
public File writeReportFile(String reportName) {
for(Report report: getReports()) {
if (report.getClass().getSimpleName().equals(reportName)) {
return writeReportFile(report, false);
}
}
return null;
}
protected File writeReportFile(Report report, boolean force) {
File f = new File(getReportsDir().getFile(), report.getFilename());
if(f.exists() && !controller.isRunning() && controller.hasStarted() && !force
&& !(report instanceof CrawlSummaryReport)) {
// controller already started and stopped
// and file exists
// and force not requested
// so, don't overwrite
// except for crawlReport
logger.info("reusing report: " + f.getAbsolutePath());
return f;
}
try {
FileUtils.ensureWriteableDirectory(f.getParentFile());
PrintWriter bw = new PrintWriter(new FileWriter(f));
report.write(bw, this);
bw.close();
addToManifest(f.getAbsolutePath(),
CrawlerLoggerModule.MANIFEST_REPORT_FILE, true);
} catch (IOException e) {
logger.log(Level.SEVERE, "Unable to write " + f.getAbsolutePath() +
" at the end of crawl.", e);
}
logger.info("wrote report: " + f.getAbsolutePath());
return f;
}
protected void addToManifest(String absolutePath, char manifest_report_file, boolean b) {
// TODO Auto-generated method stub
}
/**
* Run the reports.
*/
public void dumpReports() {
// TODO: sooner than here! Add all files mentioned in the crawl
// order to the manifest set.
//controller.addOrderToManifest();
for (Report report: getReports()) {
if (report.getShouldReportAtEndOfCrawl()) {
try {
writeReportFile(report, true);
} catch (RuntimeException re) {
logger.log(Level.SEVERE, re.getMessage(), re);
}
}
}
}
public void crawlCheckpoint(/*StateProvider*/ Object def, File cpDir) throws Exception {
// CrawlController is managing the checkpointing of this object.
logNote("CRAWL CHECKPOINTING TO " + cpDir.toString());
}
private long getReportValue(Map<String, AtomicLong> map, String key) {
if (key == null) {
return -1;
}
Object o = map.get(key);
if (o == null) {
return -2;
}
if (!(o instanceof AtomicLong)) {
throw new IllegalStateException("Expected AtomicLong but got "
+ o.getClass() + " for " + key);
}
return ((AtomicLong)o).get();
}
public void onApplicationEvent(ApplicationEvent event) {
if(event instanceof CrawlStateEvent) {
CrawlStateEvent event1 = (CrawlStateEvent)event;
switch(event1.getState()) {
case PAUSED:
this.crawlPaused(event1.getMessage());
break;
case RUNNING:
this.crawlResuming(event1.getMessage());
break;
case EMPTY:
this.crawlEmpty(event1.getMessage());
break;
case PAUSING:
this.crawlPausing(event1.getMessage());
break;
case STOPPING:
this.crawlEnding(event1.getMessage());
break;
case FINISHED:
this.crawlEnded(event1.getMessage());
break;
case PREPARING:
this.crawlResuming(event1.getMessage());
break;
default:
throw new RuntimeException("Unknown state: " + event1.getState());
}
}
if(event instanceof CrawlURIDispositionEvent) {
CrawlURIDispositionEvent dvent = (CrawlURIDispositionEvent)event;
switch(dvent.getDisposition()) {
case SUCCEEDED:
this.crawledURISuccessful(dvent.getCrawlURI());
break;
case FAILED:
this.crawledURIFailure(dvent.getCrawlURI());
break;
case DISREGARDED:
this.crawledURIDisregard(dvent.getCrawlURI());
break;
case DEFERRED_FOR_RETRY:
this.crawledURINeedRetry(dvent.getCrawlURI());
break;
default:
throw new RuntimeException("Unknown disposition: " + dvent.getDisposition());
}
}
}
public void tallySeeds() {
seedsTotal = 0;
seedsCrawled = 0;
if(processedSeedsRecords==null) {
// nothing to tally
return;
}
for (Iterator<String> i = getSeedsIterator(); i.hasNext();) {
SeedRecord sr = processedSeedsRecords.get(i.next());
seedsTotal++;
if(sr!=null &&(sr.getStatusCode() > 0)) {
seedsCrawled++;
}
}
}
/**
* Create a seed record, even on initial notification (before
* any real attempt/processing).
*
* @see org.archive.modules.seeds.SeedListener#addedSeed(org.archive.modules.CrawlURI)
*/
public void addedSeed(CrawlURI curi) {
// record even undisposed-seeds for reporting purposes
handleSeed(curi, "");
}
/**
* Do nothing with nonseed lines.
*
* @see org.archive.modules.seeds.SeedListener#nonseedLine(java.lang.String)
*/
public boolean nonseedLine(String line) {
return false;
}
public void concludedSeedBatch() {
// do nothing;
}
// BeanNameAware
protected String beanName;
public void setBeanName(String name) {
this.beanName = name;
}
// Checkpointable
public void startCheckpoint(Checkpoint checkpointInProgress) {}
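/**
* Save this tracker's tallies into the checkpoint as JSON under this
* bean's name. A sketch of the resulting shape (values elided; the exact
* encoding is handled by org.json and JSONUtils):
* <pre>
* { "crawlStartTime": ..., "crawlTotalPausedTime": ...,
*   "mimeTypeDistribution": { "text/html": ..., ... }, ... }
* </pre>
*/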
public void doCheckpoint(Checkpoint checkpointInProgress) throws IOException {
JSONObject json = new JSONObject();
try {
json.put("crawlStartTime",crawlStartTime);
json.put("crawlEndTime",crawlEndTime);
long virtualCrawlPauseStarted = crawlPauseStarted;
if(virtualCrawlPauseStarted<1) {
// TODO: use instant checkpoint started?
virtualCrawlPauseStarted = System.currentTimeMillis();
}
json.put("crawlPauseStarted",virtualCrawlPauseStarted);
json.put("crawlTotalPausedTime",crawlTotalPausedTime);
json.put("mimeTypeDistribution", mimeTypeDistribution);
json.put("mimeTypeBytes", mimeTypeBytes);
json.put("statusCodeDistribution", statusCodeDistribution);
json.put("sourceHostDistribution", sourceHostDistribution);
json.put("statsBySource", statsBySource);
json.put("crawledBytes", crawledBytes);
// TODO: save crawledBytesHistotable
checkpointInProgress.saveJson(beanName, json);
} catch (JSONException e) {
// impossible
throw new RuntimeException(e);
}
}
public void finishCheckpoint(Checkpoint checkpointInProgress) {}
protected Checkpoint recoveryCheckpoint;
public void setRecoveryCheckpoint(Checkpoint recoveryCheckpoint) {
this.recoveryCheckpoint = recoveryCheckpoint;
}
public CrawledBytesHistotable getSourceStats(String source) {
return statsBySource.get(source);
}
}