/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.archive.crawler.reporting;

import static org.archive.modules.CoreAttributeConstants.A_SOURCE_TAG;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Date;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.collections.Closure;
import org.archive.bdb.BdbModule;
import org.archive.bdb.DisposableStoredSortedMap;
import org.archive.checkpointing.Checkpoint;
import org.archive.checkpointing.Checkpointable;
import org.archive.crawler.event.CrawlStateEvent;
import org.archive.crawler.event.CrawlURIDispositionEvent;
import org.archive.crawler.event.StatSnapshotEvent;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.Engine;
import org.archive.crawler.util.CrawledBytesHistotable;
import org.archive.modules.CrawlURI;
import org.archive.modules.net.CrawlHost;
import org.archive.modules.net.ServerCache;
import org.archive.modules.seeds.SeedListener;
import org.archive.modules.seeds.SeedModule;
import org.archive.spring.ConfigPath;
import org.archive.util.ArchiveUtils;
import org.archive.util.FileUtils;
import org.archive.util.JSONUtils;
import org.archive.util.MimetypeUtils;
import org.archive.util.ObjectIdentityCache;
import org.archive.util.ObjectIdentityMemCache;
import org.archive.util.PaddingStringBuffer;
import org.archive.util.Supplier;
import org.json.JSONException;
import org.json.JSONObject;
import org.springframework.beans.BeansException;
import org.springframework.beans.factory.BeanNameAware;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.ApplicationContext;
import org.springframework.context.ApplicationContextAware;
import org.springframework.context.ApplicationEvent;
import org.springframework.context.ApplicationListener;
import org.springframework.context.Lifecycle;
import org.xbill.DNS.DClass;
import org.xbill.DNS.Lookup;

import com.sleepycat.je.DatabaseException;

/**
 * This is an implementation of the AbstractTracker. It is designed to function
 * with the WUI as well as performing various logging activity.
 * <p>
 * At the end of each snapshot a line is written to the
 * 'progress-statistics.log' file.
 * <p>
 * The header of that file is as follows:
 * <pre> [timestamp] [discovered]    [queued] [downloaded] [doc/s(avg)]  [KB/s(avg)] [dl-failures] [busy-thread] [mem-use-KB]</pre>
 * <p>
 * First there is a <b>timestamp</b>, accurate down to 1 second.
 * <p>
 * <b>discovered</b>, <b>queued</b>, <b>downloaded</b> and <b>dl-failures</b>
 * are (respectively) the discovered URI count, pending URI count, successfully
 * fetched count and failed fetch count from the frontier at the time of the
 * snapshot.
 * <p>
 * <b>KB/s(avg)</b> is the bandwidth usage. We use the total bytes downloaded
 * to calculate average bandwidth usage (KB/sec). Since we also note the value
 * each time a snapshot is made, we can calculate the average bandwidth usage
 * during the last snapshot period to gain a "current" rate. The first number
 * is the current rate and the average is in parentheses.
 * <p>
 * <b>doc/s(avg)</b> works the same way, except it shows the number of
 * documents (URIs) rather than KB downloaded.
 * <p>
 * <b>busy-thread</b> is the total number of ToeThreads that are not available
 * (and thus presumably busy processing a URI). This information is extracted
 * from the crawl controller.
 * <p>
 * Finally, <b>mem-use-KB</b> is extracted from the run time environment
 * (<code>Runtime.getRuntime().totalMemory()</code>).
 * <p>
 * In addition to the data collected for the above log, various other data
 * are gathered and stored by this tracker:
 * <ul>
 *   <li>Successfully downloaded documents per fetch status code</li>
 *   <li>Successfully downloaded documents per document mime type</li>
 *   <li>Amount of data per mime type</li>
 *   <li>Successfully downloaded documents per host</li>
 *   <li>Amount of data per host</li>
 *   <li>Disposition of all seeds (this is written to 'reports.log' at end of
 *       crawl)</li>
 *   <li>Successfully downloaded documents per host per source</li>
 * </ul>
 *
 * @author Parker Thompson
 * @author Kristinn Sigurdsson
 * @author gojomo
 */
public class StatisticsTracker
    implements
        ApplicationContextAware,
        ApplicationListener,
        SeedListener,
        Lifecycle,
        Runnable,
        Checkpointable,
        BeanNameAware {
    @SuppressWarnings("unused")
    private static final long serialVersionUID = 6L;

    protected SeedModule seeds;
    public SeedModule getSeeds() {
        return this.seeds;
    }
    @Autowired
    public void setSeeds(SeedModule seeds) {
        this.seeds = seeds;
    }

    protected BdbModule bdb;
    @Autowired
    public void setBdbModule(BdbModule bdb) {
        this.bdb = bdb;
    }

    protected ConfigPath reportsDir =
        new ConfigPath(Engine.REPORTS_DIR_NAME, "${launchId}/reports");
    public ConfigPath getReportsDir() {
        return reportsDir;
    }
    public void setReportsDir(ConfigPath reportsDir) {
        this.reportsDir = reportsDir;
    }

    protected ServerCache serverCache;
    public ServerCache getServerCache() {
        return this.serverCache;
    }
    @Autowired
    public void setServerCache(ServerCache serverCache) {
        this.serverCache = serverCache;
    }

    protected int liveHostReportSize = 20;
    public int getLiveHostReportSize() {
        return liveHostReportSize;
    }
    public void setLiveHostReportSize(int liveHostReportSize) {
        this.liveHostReportSize = liveHostReportSize;
    }

    protected ApplicationContext appCtx;
    public void setApplicationContext(ApplicationContext appCtx) throws BeansException {
        this.appCtx = appCtx;
    }

    /**
     * Messages from the StatisticsTracker.
     */
    private final static Logger logger =
        Logger.getLogger(StatisticsTracker.class.getName());

    /**
     * Whether to maintain seed disposition records (expensive in
     * crawls with millions of seeds).
     */
    protected boolean trackSeeds = true;
    public boolean getTrackSeeds() {
        return this.trackSeeds;
    }
    public void setTrackSeeds(boolean trackSeeds) {
        this.trackSeeds = trackSeeds;
    }

    /**
     * Whether to maintain hosts-per-source-tag records; very expensive in
     * crawls with large numbers of source-tags (seeds) or large crawls
     * over many hosts.
     */
    protected boolean trackSources = true;
    public boolean getTrackSources() {
        return this.trackSources;
    }
    public void setTrackSources(boolean trackSources) {
        this.trackSources = trackSources;
    }

    /**
     * The interval between writing progress information to log.
     */
    protected int intervalSeconds = 20;
    public int getIntervalSeconds() {
        return this.intervalSeconds;
    }
    public void setIntervalSeconds(int interval) {
        this.intervalSeconds = interval;
    }

    /**
     * Number of crawl-stat sample snapshots to keep for calculation
     * purposes.
     */
    protected int keepSnapshotsCount = 5;
    public int getKeepSnapshotsCount() {
        return this.keepSnapshotsCount;
    }
    public void setKeepSnapshotsCount(int count) {
        this.keepSnapshotsCount = count;
    }

    protected CrawlController controller;
    public CrawlController getCrawlController() {
        return this.controller;
    }
    @Autowired
    public void setCrawlController(CrawlController controller) {
        this.controller = controller;
    }

    /** wall-clock time the crawl started */
    protected long crawlStartTime;
    /** wall-clock time the crawl ended; until the crawl ends, this value is -1 */
    protected long crawlEndTime = -1;
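
    /*
     * Illustrative sketch (not part of the original source): the doc/s(avg)
     * and KB/s(avg) figures described in the class comment are derived from
     * cumulative totals and elapsed crawl time, roughly like the following
     * (downloadedUriCount and totalKbWritten are hypothetical names):
     *
     *   long elapsedMs = getCrawlElapsedTime();        // excludes paused time
     *   double avgDocsPerSec = downloadedUriCount * 1000.0 / elapsedMs;
     *   long avgKbPerSec = (long) (totalKbWritten * 1000.0 / elapsedMs);
     *
     * The "current" rates compare the latest tallies against the previous
     * snapshot over the snapshot interval; the actual arithmetic is performed
     * in CrawlStatSnapshot.collect(), not in this class.
     */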

    /** wall-clock time of last pause, while pause in progress */
    protected long crawlPauseStarted = 0;
    /** duration tally of all time spent in paused state */
    protected long crawlTotalPausedTime = 0;

    /** snapshots of crawl tallies and rates */
    protected LinkedList<CrawlStatSnapshot> snapshots =
        new LinkedList<CrawlStatSnapshot>();

    protected ScheduledExecutorService executor =
        Executors.newSingleThreadScheduledExecutor();

    /*
     * Cumulative data
     */
    /** tally sizes novel, verified (same hash), vouched (not-modified) */
    protected CrawledBytesHistotable crawledBytes = new CrawledBytesHistotable();
    public CrawledBytesHistotable getCrawledBytes() {
        return crawledBytes;
    }

    // TODO: fortify these against key explosion with bigmaps like other tallies
    /** Keep track of the file types we see (mime type -> count) */
    protected ConcurrentMap<String,AtomicLong> mimeTypeDistribution =
        new ConcurrentHashMap<String,AtomicLong>();
    protected ConcurrentMap<String,AtomicLong> mimeTypeBytes =
        new ConcurrentHashMap<String,AtomicLong>();

    /** Keep track of fetch status codes */
    protected ConcurrentMap<String,AtomicLong> statusCodeDistribution =
        new ConcurrentHashMap<String,AtomicLong>();

    /** Keep track of URL counts per host per seed */
    // TODO: restore spill-to-disk, like with processedSeedsRecords
    protected ConcurrentHashMap<String,ConcurrentMap<String,AtomicLong>>
        sourceHostDistribution =
            new ConcurrentHashMap<String,ConcurrentMap<String,AtomicLong>>();

    /** Keep track of crawled bytes stats per seed */
    // TODO: spill-to-disk (requires bdb replacement for Histotable, or some
    // other refactoring)
    protected ConcurrentHashMap<String,CrawledBytesHistotable> statsBySource =
        new ConcurrentHashMap<String,CrawledBytesHistotable>();

    /**
     * Record of seeds and latest results
     */
    protected ObjectIdentityCache<SeedRecord> processedSeedsRecords =
        new ObjectIdentityMemCache<SeedRecord>();

    protected long seedsTotal = -1;
    protected long seedsCrawled = -1;

    public StatisticsTracker() {
    }

    protected List<Report> reports;
    public List<Report> getReports() {
        // lazy initialization so we don't pointlessly create a bunch of beans
        // right before setReports is called
        if (reports == null) {
            reports = new LinkedList<Report>();
            reports.add(new CrawlSummaryReport());
            reports.add(new SeedsReport());
            reports.add(new HostsReport());
            reports.add(new SourceTagsReport());
            reports.add(new MimetypesReport());
            reports.add(new ResponseCodeReport());
            reports.add(new ProcessorsReport());
            reports.add(new FrontierSummaryReport());
            reports.add(new ToeThreadsReport());
        }
        return reports;
    }
    public void setReports(List<Report> reports) {
        this.reports = reports;
    }

    protected boolean isRunning = false;
    public boolean isRunning() {
        return isRunning;
    }

    public void stop() {
        isRunning = false;
        executor.shutdownNow();
        progressStatisticsEvent();
        dumpReports();
    }

    @SuppressWarnings("unchecked")
    public void start() {
        isRunning = true;
        boolean isRecover = (recoveryCheckpoint != null);
        try {
            this.processedSeedsRecords = bdb.getObjectCache(
                "processedSeedsRecords", isRecover, SeedRecord.class);
            if (isRecover) {
                JSONObject json = recoveryCheckpoint.loadJson(beanName);
                crawlStartTime = json.getLong("crawlStartTime");
                crawlEndTime = json.getLong("crawlEndTime");
                crawlTotalPausedTime = json.getLong("crawlTotalPausedTime");
                crawlPauseStarted = json.getLong("crawlPauseStarted");
                tallyCurrentPause();
                JSONUtils.putAllAtomicLongs(
                    mimeTypeDistribution,
                    json.getJSONObject("mimeTypeDistribution"));
                JSONUtils.putAllAtomicLongs(
                    mimeTypeBytes,
                    json.getJSONObject("mimeTypeBytes"));
                JSONUtils.putAllAtomicLongs(
                    statusCodeDistribution,
                    json.getJSONObject("statusCodeDistribution"));
                JSONObject shd = json.getJSONObject("sourceHostDistribution");
                Iterator<String> keyIter = shd.keys();
                for (; keyIter.hasNext();) {
                    String source = keyIter.next();
                    ConcurrentHashMap<String,AtomicLong> hostUriCount =
                        new ConcurrentHashMap<String,AtomicLong>();
                    JSONUtils.putAllAtomicLongs(hostUriCount,
                        shd.getJSONObject(source));
                    sourceHostDistribution.put(source, hostUriCount);
                }
                // optional so we can still recover checkpoints from earlier
                // versions of heritrix
                JSONObject ss = json.optJSONObject("statsBySource");
                if (ss != null) {
                    keyIter = ss.keys();
                    for (; keyIter.hasNext();) {
                        String source = keyIter.next();
                        CrawledBytesHistotable cb = new CrawledBytesHistotable();
                        JSONUtils.putAllLongs(cb, ss.getJSONObject(source));
                        statsBySource.put(source, cb);
                    }
                }
                JSONUtils.putAllLongs(
                    crawledBytes,
                    json.getJSONObject("crawledBytes"));
            }
        } catch (DatabaseException e) {
            throw new IllegalStateException(e);
        } catch (JSONException e) {
            throw new IllegalStateException(e);
        }

        // Log the legend
        this.controller.logProgressStatistics(progressStatisticsLegend());
        executor.scheduleAtFixedRate(this, 0, getIntervalSeconds(), TimeUnit.SECONDS);
    }

    /**
     * Do activity. Is called by the ScheduledExecutorService at intervals
     * specified by intervalSeconds.
     */
    public void run() {
        try {
            progressStatisticsEvent();
        } catch (Throwable e) {
            logger.log(Level.SEVERE,
                "unexpected exception from progressStatisticsEvent()", e);
        }
    }

    /**
     * @return legend for progress-statistics lines/log
     */
    public String progressStatisticsLegend() {
        return " timestamp" +
            " discovered " +
            " queued downloaded doc/s(avg) KB/s(avg) " +
            " dl-failures busy-thread mem-use-KB heap-size-KB " +
            " congestion max-depth avg-depth";
    }

    public String getProgressStamp() {
        return progressStatisticsLegend() + "\n"
            + getSnapshot().getProgressStatisticsLine();
    }

    /**
     * Notify tracker that crawl has begun. Must be called
     * outside tracker's own thread, to ensure it is noted
     * before other threads start interacting with tracker.
     */
    public void noteStart() {
        if (this.crawlStartTime == 0) {
            // Note the time the crawl starts (only if not already set)
            this.crawlStartTime = System.currentTimeMillis();
        }
    }
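
    /*
     * Illustrative sketch (an assumption, not the actual JSONUtils source):
     * the putAllAtomicLongs() calls in start() above presumably rebuild an
     * AtomicLong tally map from a checkpoint JSONObject along these lines:
     *
     *   Iterator<String> keys = json.keys();
     *   while (keys.hasNext()) {
     *       String key = keys.next();
     *       map.put(key, new AtomicLong(json.getLong(key)));
     *   }
     */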

    /**
     * A method for logging current crawler state.
     *
     * This method will be called by run() at intervals specified in
     * the crawl order file. It is also invoked when pausing or
     * stopping a crawl to capture the state at that point. Default behavior is
     * a call to {@link CrawlController#logProgressStatistics} so CrawlController
     * can act on the progress statistics event.
     * <p>
     * It is recommended that implementations of this method carefully consider
     * whether it should be synchronized in whole or in part.
     */
    protected synchronized void progressStatisticsEvent() {
        CrawlStatSnapshot snapshot = getSnapshot();

        if (this.controller != null) {
            this.controller.logProgressStatistics(
                snapshot.getProgressStatisticsLine());
        }
        snapshots.addFirst(snapshot);
        while (snapshots.size() > getKeepSnapshotsCount()) {
            snapshots.removeLast();
        }

        // publish app event
        appCtx.publishEvent(new StatSnapshotEvent(this, snapshot));

        // temporary workaround for
        // [ 996161 ] Fix DNSJava issues (memory) -- replace with JNDI-DNS?
        // http://sourceforge.net/support/tracker.php?aid=996161
        Lookup.getDefaultCache(DClass.IN).clearCache();
    }

    public CrawlStatSnapshot getSnapshot() {
        // TODO: take snapshot implementation from a spring prototype?
        CrawlStatSnapshot snapshot = new CrawlStatSnapshot();
        snapshot.collect(controller, this);
        return snapshot;
    }

    public LinkedList<CrawlStatSnapshot> listSnapshots() {
        // not named getSnapshots to avoid autodiscovery as a (invalid) bean-property
        return snapshots;
    }

    public CrawlStatSnapshot getLastSnapshot() {
        CrawlStatSnapshot snap = snapshots.peek();
        return snap == null ? getSnapshot() : snap;
    }

    public long getCrawlElapsedTime() {
        if (crawlStartTime == 0) {
            // if no start time set yet, consider elapsed time zero
            return 0;
        }
        if (crawlPauseStarted != 0) {
            // currently paused, calculate time up to last pause
            return crawlPauseStarted - crawlTotalPausedTime - crawlStartTime;
        }
        // not paused, calculate total time to end or (if running) now
        return ((crawlEndTime > 0) ? crawlEndTime : System.currentTimeMillis())
            - crawlTotalPausedTime - crawlStartTime;
    }

    public void crawlPausing(String statusMessage) {
        logNote("CRAWL WAITING - " + statusMessage);
    }

    protected void logNote(final String note) {
        this.controller.logProgressStatistics(new PaddingStringBuffer()
            .append(ArchiveUtils.getLog14Date(new Date()))
            .append(" ")
            .append(note)
            .toString());
    }

    public void crawlPaused(String statusMessage) {
        crawlPauseStarted = System.currentTimeMillis();
        progressStatisticsEvent();
        logNote("CRAWL PAUSED - " + statusMessage);
    }

    public void crawlResuming(String statusMessage) {
        tallyCurrentPause();
        if (this.crawlStartTime == 0) {
            noteStart();
        }
        logNote("CRAWL RUNNING - " + statusMessage);
    }

    public void crawlEmpty(String statusMessage) {
        logNote("CRAWL EMPTY - " + statusMessage);
    }

    /**
     * For a current pause (if any), add paused time to total and reset.
     */
    protected void tallyCurrentPause() {
        if (this.crawlPauseStarted > 0) {
            // Ok, we managed to actually pause before resuming.
            this.crawlTotalPausedTime +=
                (System.currentTimeMillis() - this.crawlPauseStarted);
        }
        this.crawlPauseStarted = 0;
    }

    public void crawlEnding(String sExitMessage) {
        logNote("CRAWL ENDING - " + sExitMessage);
    }

    public void crawlEnded(String sExitMessage) {
        crawlEndTime = System.currentTimeMillis();
        logNote("CRAWL ENDED - " + sExitMessage);
    }

    /**
     * Returns how long the current crawl has been running *including*
     * time paused (contrast with getCrawlElapsedTime()).
     *
     * @return The length of time - in msec - that this crawl has been running.
     */
    public long getCrawlDuration() {
        return ((crawlEndTime > 0) ? crawlEndTime : System.currentTimeMillis())
            - crawlStartTime;
    }
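
    /*
     * Illustrative sketch (not part of the original source): because
     * progressStatisticsEvent() publishes a StatSnapshotEvent through the
     * application context, any bean in the crawl context can observe the
     * periodic snapshots. A minimal listener might look like the following
     * (the getSnapshot() accessor on the event is an assumption):
     *
     *   public class SnapshotLogger implements ApplicationListener<StatSnapshotEvent> {
     *       public void onApplicationEvent(StatSnapshotEvent event) {
     *           // react to each statistics snapshot as it is published
     *           System.out.println(event.getSnapshot().getProgressStatisticsLine());
     *       }
     *   }
     */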

    /**
     * Returns a HashMap that contains information about distributions of
     * encountered mime types. Key/value pairs represent
     * mime type -> count.
     * <p>
     * Note: All the values are wrapped with a {@link AtomicLong AtomicLong}
     * @return mimeTypeDistribution
     */
    public Map<String,AtomicLong> getFileDistribution() {
        return mimeTypeDistribution;
    }

    /**
     * Increment a counter for a key in a given HashMap. Used for various
     * aggregate data.
     *
     * @param map The Map or ConcurrentMap
     * @param key The key for the counter to be incremented; if it does not
     *            exist it will be added (set to 1). If null, the counter
     *            "unknown" is incremented.
     */
    protected static void incrementMapCount(ConcurrentMap<String,AtomicLong> map,
            String key) {
        incrementMapCount(map, key, 1);
    }

    /**
     * Increment a counter for a key in a given HashMap by an arbitrary amount.
     * Used for various aggregate data. The increment amount can be negative.
     *
     * @param map
     *            The HashMap
     * @param key
     *            The key for the counter to be incremented; if it does not
     *            exist it will be added (set equal to increment).
     *            If null, the counter "unknown" is incremented.
     * @param increment
     *            The amount to increment the counter related to the key.
     */
    protected static void incrementMapCount(ConcurrentMap<String,AtomicLong> map,
            String key, long increment) {
        if (key == null) {
            key = "unknown";
        }
        AtomicLong lw = map.get(key);
        if (lw == null) {
            lw = new AtomicLong(0);
            AtomicLong prevVal = map.putIfAbsent(key, lw);
            if (prevVal != null) {
                lw = prevVal;
            }
        }
        lw.addAndGet(increment);
    }
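
    /*
     * Illustrative note (not part of the original source): the putIfAbsent()
     * dance above is the classic race-safe lazy initialization for a
     * ConcurrentMap counter -- if two threads miss simultaneously, only one
     * AtomicLong "wins" and both threads increment the winner. On Java 8+
     * the same pattern can be written as:
     *
     *   map.computeIfAbsent(key, k -> new AtomicLong(0)).addAndGet(increment);
     */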

    /**
     * Sort the entries of the given Map in descending order by their
     * values, which must be longs wrapped with AtomicLong.
     * <p>
     * Elements are sorted by value from largest to smallest. Equal values are
     * sorted by their keys. The returned map is a StoredSortedMap, and
     * thus may include duplicate keys.
     *
     * If the passed-in map requires access to be synchronized, the caller
     * should ensure this synchronization.
     *
     * @param mapOfAtomicLongValues
     *            Assumes values are wrapped with AtomicLong.
     * @return a sorted set containing the same elements as the map.
     */
    public DisposableStoredSortedMap<Long,String> getReverseSortedCopy(
            final Map<String,AtomicLong> mapOfAtomicLongValues) {
        DisposableStoredSortedMap<Long,String> sortedMap = bdb.getStoredMap(
            null, Long.class, String.class, true, false);
        for (String k : mapOfAtomicLongValues.keySet()) {
            sortedMap.put(-mapOfAtomicLongValues.get(k).longValue(), k);
        }
        return sortedMap;
    }

    /**
     * Return an objectCache representing the distribution of status codes for
     * successfully fetched curis, as represented by a cache where key ->
     * val represents (string)code -> (integer)count.
     *
     * Note: All the values are wrapped with a
     * {@link AtomicLong AtomicLong}
     *
     * @return statusCodeDistribution
     */
    public Map<String,AtomicLong> getStatusCodeDistribution() {
        return statusCodeDistribution;
    }

    /**
     * Returns the time (in millisec) when a URI belonging to a given host was
     * last finished processing.
     *
     * @param host The host to look up time of last completed URI.
     * @return Returns the time (in millisec) when a URI belonging to a given
     * host was last finished processing. If no URI has been completed for host,
     * -1 will be returned.
     */
    public long getHostLastFinished(String host) {
        return serverCache.getHostFor(host).getSubstats().getLastSuccessTime();
    }

    /**
     * Returns the accumulated number of bytes downloaded from a given host.
     * @param host name of the host
     * @return the accumulated number of bytes downloaded from a given host
     */
    public long getBytesPerHost(String host) {
        return serverCache.getHostFor(host).getSubstats().getTotalBytes();
    }

    /**
     * Returns the accumulated number of bytes from files of a given file type.
     * @param filetype Filetype to check.
     * @return the accumulated number of bytes from files of a given mime type
     */
    public long getBytesPerFileType(String filetype) {
        return getReportValue(mimeTypeBytes, filetype);
    }

    /**
     * Get the total number of ToeThreads (sleeping and active)
     *
     * @return The total number of ToeThreads
     */
    public int threadCount() {
        return this.controller != null ? controller.getToeCount() : 0;
    }

    public String crawledBytesSummary() {
        return crawledBytes.summary();
    }
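
    /*
     * Illustrative note (not part of the original source): storing each count
     * negated, as getReverseSortedCopy() does above, lets the naturally
     * ascending StoredSortedMap iterate largest-count-first without a custom
     * comparator. For example:
     *
     *   counts:  {"a.org"=5, "b.org"=9}
     *   stored:  {-9="b.org", -5="a.org"}   // ascending keys == descending counts
     *
     * Readers negate the key again when displaying the count.
     */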

    /**
     * If the curi is a seed, we update the processedSeeds cache.
     *
     * @param curi The CrawlURI that may be a seed.
     * @param disposition The disposition of the CrawlURI.
     */
    protected void handleSeed(final CrawlURI curi, final String disposition) {
        if (getTrackSeeds()) {
            if (curi.isSeed()) {
                SeedRecord sr = processedSeedsRecords.getOrUse(
                    curi.getURI(),
                    new Supplier<SeedRecord>() {
                        public SeedRecord get() {
                            return new SeedRecord(curi, disposition);
                        }});
                sr.updateWith(curi, disposition);
            }
        } // else ignore
    }

    public void crawledURISuccessful(CrawlURI curi) {
        handleSeed(curi, "Seed successfully crawled");
        // save crawled bytes tally
        crawledBytes.accumulate(curi);

        // Save status codes
        incrementMapCount(statusCodeDistribution,
            Integer.toString(curi.getFetchStatus()));

        // Save mime types
        String mime = MimetypeUtils.truncate(curi.getContentType());
        incrementMapCount(mimeTypeDistribution, mime);
        incrementMapCount(mimeTypeBytes, mime, curi.getContentSize());

        ServerCache sc = serverCache;
        if (getTrackSources() && curi.getData().containsKey(A_SOURCE_TAG)) {
            saveSourceStats(curi.getSourceTag(),
                sc.getHostFor(curi.getUURI()).getHostName());
            tallySourceStats(curi);
        }
    }

    protected void saveSourceStats(String source, String hostname) {
        ConcurrentMap<String,AtomicLong> hostUriCount =
            sourceHostDistribution.get(source);
        if (hostUriCount == null) {
            hostUriCount = new ConcurrentHashMap<String,AtomicLong>();
            ConcurrentMap<String,AtomicLong> prevVal =
                sourceHostDistribution.putIfAbsent(source, hostUriCount);
            if (prevVal != null) {
                hostUriCount = prevVal;
            }
        }
        incrementMapCount(hostUriCount, hostname);
    }

    protected void tallySourceStats(CrawlURI curi) {
        String source = curi.getSourceTag();
        CrawledBytesHistotable sourceStats = statsBySource.get(source);
        if (sourceStats == null) {
            sourceStats = new CrawledBytesHistotable();
            statsBySource.put(source, sourceStats);
        }
        sourceStats.accumulate(curi);
    }

    public void crawledURINeedRetry(CrawlURI curi) {
        handleSeed(curi, "Failed to crawl seed, will retry");
    }

    public void crawledURIDisregard(CrawlURI curi) {
        handleSeed(curi, "Seed was disregarded");
    }

    public void crawledURIFailure(CrawlURI curi) {
        handleSeed(curi, "Failed to crawl seed");
    }

    /**
     * Get a seed iterator for the job being monitored. Only reports
     * known seeds from processedSeedsRecords -- but as a SeedListener,
     * that should be complete.
     *
     * Note: This iterator will iterate over a list of strings not
     * UURIs like the Scope seed iterator. The strings are equal to the URIs'
     * getURIString() values.
     * @return the seed iterator
     */
    public Iterator<String> getSeedsIterator() {
        return processedSeedsRecords.keySet().iterator();
    }

    public DisposableStoredSortedMap<Integer,SeedRecord> calcSeedRecordsSortedByStatusCode() {
        Iterator<String> i = getSeedsIterator();
        DisposableStoredSortedMap<Integer,SeedRecord> sortedMap = bdb.getStoredMap(
            null, Integer.class, SeedRecord.class, true, false);
        while (i.hasNext()) {
            String seed = i.next();
            SeedRecord sr = processedSeedsRecords.get(seed);
            if (sr == null) {
                sr = new SeedRecord(seed, "Seed has not been processed");
                // no need to retain synthesized record
            }
            sortedMap.put(sr.sortShiftStatusCode(), sr);
        }
        return sortedMap;
    }

    /**
     * Return a copy of the hosts distribution in reverse-sorted (largest first)
     * order.
     *
     * @return SortedMap of hosts distribution
     */
    public DisposableStoredSortedMap<Long,String> getReverseSortedHostCounts(
            Map<String,AtomicLong> hostCounts) {
        synchronized (hostCounts) {
            return getReverseSortedCopy(hostCounts);
        }
    }
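
    /*
     * Illustrative note (not part of the original source): unlike
     * saveSourceStats() above, tallySourceStats() lazily initializes its
     * per-source entry with a plain get()/put() rather than putIfAbsent(),
     * so two threads racing on a brand-new source could each install a fresh
     * CrawledBytesHistotable and one tally could be lost. A putIfAbsent()
     * variant of the same lazy init would look like:
     *
     *   CrawledBytesHistotable stats = statsBySource.get(source);
     *   if (stats == null) {
     *       stats = new CrawledBytesHistotable();
     *       CrawledBytesHistotable prev = statsBySource.putIfAbsent(source, stats);
     *       if (prev != null) {
     *           stats = prev;
     *       }
     *   }
     *   stats.accumulate(curi);
     */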

    /**
     * Return a copy of the hosts distribution in reverse-sorted
     * (largest first) order.
     * @return SortedMap of hosts distribution
     */
    public DisposableStoredSortedMap<Long,String> calcReverseSortedHostsDistribution() {
        final DisposableStoredSortedMap<Long,String> sortedMap = bdb.getStoredMap(
            null, Long.class, String.class, true, false);
        serverCache.forAllHostsDo(new Closure() {
            @Override
            public void execute(Object hostObj) {
                CrawlHost host = (CrawlHost) hostObj;
                sortedMap.put(-host.getSubstats().getFetchSuccesses(),
                    host.getHostName());
            }
        });
        return sortedMap;
    }

    public File writeReportFile(String reportName) {
        for (Report report : getReports()) {
            if (report.getClass().getSimpleName().equals(reportName)) {
                return writeReportFile(report, false);
            }
        }
        return null;
    }

    protected File writeReportFile(Report report, boolean force) {
        File f = new File(getReportsDir().getFile(), report.getFilename());

        if (f.exists() && !controller.isRunning() && controller.hasStarted()
                && !force && !(report instanceof CrawlSummaryReport)) {
            // controller already started and stopped
            // and file exists
            // and force not requested
            // so, don't overwrite
            // except for crawlReport
            logger.info("reusing report: " + f.getAbsolutePath());
            return f;
        }

        try {
            FileUtils.ensureWriteableDirectory(f.getParentFile());
            PrintWriter bw = new PrintWriter(new FileWriter(f));
            report.write(bw, this);
            bw.close();
            addToManifest(f.getAbsolutePath(),
                CrawlerLoggerModule.MANIFEST_REPORT_FILE, true);
        } catch (IOException e) {
            logger.log(Level.SEVERE, "Unable to write " + f.getAbsolutePath()
                + " at the end of crawl.", e);
        }
        logger.info("wrote report: " + f.getAbsolutePath());
        return f;
    }

    protected void addToManifest(String absolutePath,
            char manifest_report_file, boolean b) {
        // TODO Auto-generated method stub
    }

    /**
     * Run the reports.
     */
    public void dumpReports() {
        // TODO: sooner than here! Add all files mentioned in the crawl
        // order to the manifest set.
        // controller.addOrderToManifest();
        for (Report report : getReports()) {
            if (report.getShouldReportAtEndOfCrawl()) {
                try {
                    writeReportFile(report, true);
                } catch (RuntimeException re) {
                    logger.log(Level.SEVERE, re.getMessage(), re);
                }
            }
        }
    }
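
    /*
     * Illustrative sketch (not part of the original source): a hypothetical
     * custom report plugged in via setReports(). The Report methods used here
     * are the ones the code above relies on (getFilename(), write(), and
     * getShouldReportAtEndOfCrawl()); their exact declarations in the Report
     * base class are an assumption.
     *
     *   public class CrawledBytesReport extends Report {
     *       public String getFilename() {
     *           return "crawled-bytes-report.txt";
     *       }
     *       public void write(PrintWriter writer, StatisticsTracker stats) {
     *           // one-line summary of novel/duplicate byte tallies
     *           writer.println(stats.crawledBytesSummary());
     *       }
     *   }
     */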
logNote("CRAWL CHECKPOINTING TO " + cpDir.toString()); } private long getReportValue(Map map, String key) { if (key == null) { return -1; } Object o = map.get(key); if (o == null) { return -2; } if (!(o instanceof AtomicLong)) { throw new IllegalStateException("Expected AtomicLong but got " + o.getClass() + " for " + key); } return ((AtomicLong)o).get(); } public void onApplicationEvent(ApplicationEvent event) { if(event instanceof CrawlStateEvent) { CrawlStateEvent event1 = (CrawlStateEvent)event; switch(event1.getState()) { case PAUSED: this.crawlPaused(event1.getMessage()); break; case RUNNING: this.crawlResuming(event1.getMessage()); break; case EMPTY: this.crawlEmpty(event1.getMessage()); break; case PAUSING: this.crawlPausing(event1.getMessage()); break; case STOPPING: this.crawlEnding(event1.getMessage()); break; case FINISHED: this.crawlEnded(event1.getMessage()); break; case PREPARING: this.crawlResuming(event1.getMessage()); break; default: throw new RuntimeException("Unknown state: " + event1.getState()); } } if(event instanceof CrawlURIDispositionEvent) { CrawlURIDispositionEvent dvent = (CrawlURIDispositionEvent)event; switch(dvent.getDisposition()) { case SUCCEEDED: this.crawledURISuccessful(dvent.getCrawlURI()); break; case FAILED: this.crawledURIFailure(dvent.getCrawlURI()); break; case DISREGARDED: this.crawledURIDisregard(dvent.getCrawlURI()); break; case DEFERRED_FOR_RETRY: this.crawledURINeedRetry(dvent.getCrawlURI()); break; default: throw new RuntimeException("Unknown disposition: " + dvent.getDisposition()); } } } public void tallySeeds() { seedsTotal = 0; seedsCrawled = 0; if(processedSeedsRecords==null) { // nothing to tally return; } for (Iterator i = getSeedsIterator();i.hasNext();) { SeedRecord sr = processedSeedsRecords.get(i.next()); seedsTotal++; if(sr!=null &&(sr.getStatusCode() > 0)) { seedsCrawled++; } } } /** * Create a seed record, even on initial notification (before * any real attempt/processing. * * @see org.archive.modules.seeds.SeedListener#addedSeed(org.archive.modules.CrawlURI) */ public void addedSeed(CrawlURI curi) { // record even undisposed-seeds for reporting purposes handleSeed((CrawlURI) curi, ""); } /** * Do nothing with nonseed lines. * * @see org.archive.modules.seeds.SeedListener#nonseedLine(java.lang.String) */ public boolean nonseedLine(String line) { return false; } public void concludedSeedBatch() { // do nothing; } // BeanNameAware protected String beanName; public void setBeanName(String name) { this.beanName = name; } // Checkpointable public void startCheckpoint(Checkpoint checkpointInProgress) {} public void doCheckpoint(Checkpoint checkpointInProgress) throws IOException { JSONObject json = new JSONObject(); try { json.put("crawlStartTime",crawlStartTime); json.put("crawlEndTime",crawlEndTime); long virtualCrawlPauseStarted = crawlPauseStarted; if(virtualCrawlPauseStarted<1) { // TODO: use instant checkpoint started? 

    public void doCheckpoint(Checkpoint checkpointInProgress) throws IOException {
        JSONObject json = new JSONObject();
        try {
            json.put("crawlStartTime", crawlStartTime);
            json.put("crawlEndTime", crawlEndTime);
            long virtualCrawlPauseStarted = crawlPauseStarted;
            if (virtualCrawlPauseStarted < 1) {
                // TODO: use instant checkpoint started?
                virtualCrawlPauseStarted = System.currentTimeMillis();
            }
            json.put("crawlPauseStarted", virtualCrawlPauseStarted);
            json.put("crawlTotalPausedTime", crawlTotalPausedTime);
            json.put("mimeTypeDistribution", mimeTypeDistribution);
            json.put("mimeTypeBytes", mimeTypeBytes);
            json.put("statusCodeDistribution", statusCodeDistribution);
            json.put("sourceHostDistribution", sourceHostDistribution);
            json.put("statsBySource", statsBySource);
            json.put("crawledBytes", crawledBytes);
            // TODO: save crawledBytesHistotable
            checkpointInProgress.saveJson(beanName, json);
        } catch (JSONException e) {
            // impossible
            throw new RuntimeException(e);
        }
    }

    public void finishCheckpoint(Checkpoint checkpointInProgress) {}

    protected Checkpoint recoveryCheckpoint;
    public void setRecoveryCheckpoint(Checkpoint recoveryCheckpoint) {
        this.recoveryCheckpoint = recoveryCheckpoint;
    }

    public CrawledBytesHistotable getSourceStats(String source) {
        return statsBySource.get(source);
    }
}