/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.crawler.framework;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.Semaphore;
import java.util.logging.FileHandler;
import java.util.logging.Formatter;
import java.util.logging.Handler;
import java.util.logging.Level;
import java.util.logging.LogRecord;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.commons.collections.ListUtils;
import org.apache.commons.collections.MapUtils;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.archive.crawler.event.CheckpointSuccessEvent;
import org.archive.crawler.event.CrawlStateEvent;
import org.archive.crawler.framework.CrawlController.StopCompleteEvent;
import org.archive.crawler.frontier.WorkQueue;
import org.archive.crawler.reporting.AlertThreadGroup;
import org.archive.crawler.reporting.CrawlStatSnapshot;
import org.archive.crawler.reporting.StatisticsTracker;
import org.archive.spring.ConfigPath;
import org.archive.spring.ConfigPathConfigurer;
import org.archive.spring.PathSharingContext;
import org.archive.util.ArchiveUtils;
import org.archive.util.ObjectIdentityCache;
import org.archive.util.TextUtils;
import org.joda.time.DateTime;
import org.springframework.beans.BeanWrapperImpl;
import org.springframework.beans.BeansException;
import org.springframework.beans.factory.BeanCreationException;
import org.springframework.beans.factory.NoSuchBeanDefinitionException;
import org.springframework.context.ApplicationEvent;
import org.springframework.context.ApplicationListener;
import org.springframework.validation.Errors;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
/**
* CrawlJob represents a crawl configuration, including its
* configuration files, instantiated/running ApplicationContext, and
* disk output, potentially across multiple runs.
*
* CrawlJob provides convenience methods for an administrative
* interface to assemble, launch, monitor, and manage crawls.
*
* @author gojomo
*/
public class CrawlJob implements Comparable<CrawlJob>, ApplicationListener<ApplicationEvent> {
private final static Logger LOGGER =
Logger.getLogger(CrawlJob.class.getName());
protected File primaryConfig;
protected PathSharingContext ac;
protected int launchCount;
protected boolean isLaunchInfoPartial;
protected DateTime lastLaunch;
protected AlertThreadGroup alertThreadGroup;
protected DateTime xmlOkAt = new DateTime(0L);
protected Logger jobLogger;
public CrawlJob(File cxml) {
primaryConfig = cxml;
isLaunchInfoPartial = false;
scanJobLog(); // XXX look at launch directories instead/first?
}
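// Typical administrative usage, as a sketch (paths hypothetical): build a
// CrawlJob from its primary Spring config, validate it, then launch:
//
// CrawlJob job = new CrawlJob(new File("/jobs/myjob/crawler-beans.cxml"));
// job.validateConfiguration();
// if (job.isLaunchable()) {
//     job.launch();
// }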
public File getPrimaryConfig() {
return primaryConfig;
}
public File getJobDir() {
return getPrimaryConfig().getParentFile();
}
public String getShortName() {
return getJobDir().getName();
}
public File getJobLog() {
return new File(getJobDir(),"job.log");
}
public synchronized PathSharingContext getJobContext() {
return ac;
}
public boolean isLaunchInfoPartial() {
return isLaunchInfoPartial;
}
/**
* Get a logger to a distinguished file, job.log in the job's
* directory, into which job-specific events may be reported.
*
* @return Logger writing to the job-specific log
*/
public Logger getJobLogger() {
if(jobLogger == null) {
jobLogger = Logger.getLogger(getShortName());
try {
mainJobLogHandler = new FileHandler(getJobLog().getAbsolutePath(),true);
mainJobLogHandler.setFormatter(new JobLogFormatter());
jobLogger.addHandler(mainJobLogHandler);
} catch (SecurityException e) {
throw new RuntimeException(e);
} catch (IOException e) {
throw new RuntimeException(e);
}
jobLogger.setLevel(Level.INFO);
}
return jobLogger;
}
public DateTime getLastLaunch() {
return lastLaunch;
}
public int getLaunchCount() {
return launchCount;
}
/**
* Refresh knowledge of total launched and last launch by scanning
* the job.log.
*/
protected void scanJobLog() {
File jobLog = getJobLog();
launchCount = 0;
if(!jobLog.exists()) return;
try {
Pattern launchLine = Pattern.compile("(\\S+) (\\S+) Job launched");
long startPosition = 0;
if (jobLog.length() > FileUtils.ONE_KB * 100) {
isLaunchInfoPartial = true;
startPosition = jobLog.length()-(FileUtils.ONE_KB * 100);
}
FileInputStream jobLogIn = new FileInputStream(jobLog);
jobLogIn.getChannel().position(startPosition);
BufferedReader jobLogReader = new BufferedReader(
new InputStreamReader(jobLogIn));
String line;
// If we sliced into the file, make sure we skip to the next line:
// (See https://github.com/internetarchive/heritrix3/issues/239)
if (startPosition != 0) {
line = jobLogReader.readLine();
}
// Parse lines looking for launch details:
while ((line = jobLogReader.readLine()) != null) {
Matcher m = launchLine.matcher(line);
if (m.matches()) {
launchCount++;
lastLaunch = new DateTime(m.group(1));
}
}
jobLogReader.close();
} catch (IOException e) {
LOGGER.log(Level.WARNING, "problem reading job.log", e);
}
}
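// For reference, job.log lines are written by JobLogFormatter (below) and look
// roughly like "2014-05-07T13:52:11.204Z INFO Job launched"; the launchLine
// pattern captures the timestamp as group(1) and the level as group(2).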
/**
* Is this job a 'profile' (or template), meaning it may be edited
* or copied to create another job, but should not be launched. Profiles
* are marked by the convention that their short name
* (job directory name) begins "profile-".
*
* @return true if this job is a 'profile'
*/
public boolean isProfile() {
return primaryConfig.getName().startsWith("profile-");
}
//
// writing a basic HTML representation
//
public void writeHtmlTo(PrintWriter pw) {
writeHtmlTo(pw,"./");
}
public void writeHtmlTo(PrintWriter pw, String uriPrefix) {
pw.println("<div>");
pw.println("<a href='"+uriPrefix+TextUtils.urlEscape(getShortName())+"'>"+getShortName()+"</a>");
if(isProfile()) {
pw.println("(profile)");
}
if(hasApplicationContext()) {
pw.println("«"+getJobStatusDescription()+"»");
}
if (isLaunchInfoPartial) {
pw.print(" at least ");
} else {
pw.print(" ");
}
pw.println(getLaunchCount() + " launches");
pw.println("</div>");
pw.println("<div>");
pw.println(getPrimaryConfig());
pw.println("</div>");
if(lastLaunch!=null) {
pw.println("(last at "+lastLaunch+")");
}
}
/**
* Is the primary XML config minimally well-formed?
*/
public void checkXML() {
// TODO: suppress check if XML unchanged? job.log when XML changed?
DateTime testTime = new DateTime(getPrimaryConfig().lastModified());
Document doc = getDomDocument(getPrimaryConfig());
// TODO: check for other minimal requirements, like
// presence of a few key components (CrawlController etc.)?
if(doc!=null) {
xmlOkAt = testTime;
} else {
xmlOkAt = new DateTime(0L);
}
}
/**
* Read a file to a DOM Document; return null if this isn't possible
* for any reason.
*
* @param f File of XML
* @return org.w3c.dom.Document or null if problems encountered
*/
protected Document getDomDocument(File f) {
try {
DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory.newInstance();
DocumentBuilder docBuilder = docBuilderFactory.newDocumentBuilder();
return docBuilder.parse(f);
} catch (ParserConfigurationException e) {
return null;
} catch (SAXException e) {
return null;
} catch (IOException e) {
return null;
}
}
/**
* Is the primary config file legal XML?
*
* @return true if the primary configuration file passed XML testing
*/
public boolean isXmlOk() {
return xmlOkAt.getMillis() >= getPrimaryConfig().lastModified();
}
/**
* Can the configuration yield an assembled ApplicationContext?
*/
public synchronized void instantiateContainer() {
checkXML();
if(ac==null) {
try {
ac = new PathSharingContext(new String[] {primaryConfig.toURI().toString()}, false, null);
ac.addApplicationListener(this);
ac.refresh();
getCrawlController(); // trigger NoSuchBeanDefinitionException if no CC
getJobLogger().log(Level.INFO,"Job instantiated");
} catch (BeansException be) {
// Calling doTeardown() and therefore ac.close() here sometimes
// triggers an IllegalStateException and logs stack trace from
// within spring, even if ac.isActive(). So, just null it.
ac = null;
beansException(be);
}
}
}
/**
* Report a BeansException during instantiation; report chain in
* reverse order (so root cause is first); ignore non-BeansExceptions
* or messages without a useful compact message.
* @param be BeansException
*/
protected void beansException(BeansException be) {
LinkedList<String> beMsgs = new LinkedList<String>();
Throwable t = be;
while (t!=null) {
if(t instanceof BeansException) {
String msg = shortMessage((BeansException)t);
if(msg!=null) {
beMsgs.add(msg);
}
}
t = t.getCause();
}
Collections.reverse(beMsgs);
String shortMessage = StringUtils.join(beMsgs,"; ");
getJobLogger().log(Level.SEVERE,shortMessage,be);
}
/**
* Return a short useful message for common BeansExceptions.
* @param ex BeansException
* @return String short descriptive message
*/
protected String shortMessage(BeansException ex) {
if(ex instanceof NoSuchBeanDefinitionException) {
NoSuchBeanDefinitionException nsbde = (NoSuchBeanDefinitionException)ex;
return "Missing required bean: "
+ (nsbde.getBeanName()!=null ? "\""+nsbde.getBeanName()+"\" " : "")
+ (nsbde.getBeanType()!=null ? "\""+nsbde.getBeanType()+"\" " : "");
}
if(ex instanceof BeanCreationException) {
BeanCreationException bce = (BeanCreationException)ex;
return bce.getBeanName()== null
? ""
: "Can't create bean '"+bce.getBeanName()+"'";
}
return ex.getMessage().replace('\n', ' ');
}
public synchronized boolean hasApplicationContext() {
return ac!=null;
}
/**
* Does the assembled ApplicationContext self-validate? Any failures
* are reported as WARNING log events in the job log.
*
* TODO: make these severe?
*/
public synchronized void validateConfiguration() {
instantiateContainer();
if(ac==null) {
// fatal errors already encountered and reported
return;
}
ac.validate();
HashMap<String,Errors> allErrors = ac.getAllErrors();
for(String name : allErrors.keySet()) {
for(Object err : allErrors.get(name).getAllErrors()) {
LOGGER.log(Level.WARNING,err.toString());
}
}
}
/**
* Did the ApplicationContext self-validate?
* @return true if validation passed without errors
*/
public synchronized boolean hasValidApplicationContext() {
if(ac==null) {
return false;
}
HashMap<String,Errors> allErrors = ac.getAllErrors();
return allErrors != null && allErrors.isEmpty();
}
//
// Valid job lifecycle operations
//
/**
* Launch a crawl into 'running' status, assembling if necessary.
*
* (Note the crawl may have been configured to start in a 'paused'
* state.)
*/
public void launch() {
if (isProfile()) {
throw new IllegalArgumentException("Can't launch profile: " + this);
}
if(isRunning()) {
getJobLogger().log(Level.SEVERE,"Can't relaunch running job");
return;
} else {
CrawlController cc = getCrawlController();
if(cc!=null && cc.hasStarted()) {
getJobLogger().log(Level.SEVERE,"Can't relaunch previously-launched assembled job");
return;
}
}
validateConfiguration();
if(!hasValidApplicationContext()) {
getJobLogger().log(Level.SEVERE,"Can't launch problem configuration");
return;
}
//final String job = changeState(j, ACTIVE);
// this temporary thread ensures all crawl-created threads
// land in the AlertThreadGroup, to assist crawl-wide
// logging/alerting
alertThreadGroup = new AlertThreadGroup(getShortName());
alertThreadGroup.addLogger(getJobLogger());
Thread launcher = new Thread(alertThreadGroup, getShortName()+" launchthread") {
public void run() {
CrawlController cc = getCrawlController();
startContext();
if(cc!=null) {
cc.requestCrawlStart();
}
}
};
getJobLogger().log(Level.INFO,"Job launched");
scanJobLog();
launcher.start();
try {
launcher.join();
} catch (InterruptedException e) {
// do nothing
}
}
protected transient Handler mainJobLogHandler;
protected transient Handler currentLaunchJobLogHandler;
protected boolean needTeardown = false;
/**
* Start the context, catching and reporting any BeansExceptions.
*/
protected synchronized void startContext() {
try {
ac.start();
// job log file covering just this launch
getJobLogger().removeHandler(currentLaunchJobLogHandler);
File f = new File(ac.getCurrentLaunchDir(), "job.log");
currentLaunchJobLogHandler = new FileHandler(f.getAbsolutePath(), true);
currentLaunchJobLogHandler.setFormatter(new JobLogFormatter());
getJobLogger().addHandler(currentLaunchJobLogHandler);
} catch (BeansException be) {
doTeardown();
beansException(be);
} catch (Exception e) {
LOGGER.log(Level.SEVERE,e.getClass().getSimpleName()+": "+e.getMessage(),e);
try {
doTeardown();
} catch (Exception e2) {
e2.printStackTrace(System.err);
}
}
}
/**
* Sort for reverse-chronological listing.
*
* @see java.lang.Comparable#compareTo(java.lang.Object)
*/
public int compareTo(CrawlJob o) {
// prefer reverse-chronological ordering
return -((Long)getLastActivityTime()).compareTo(o.getLastActivityTime());
}
public long getLastActivityTime() {
return Math.max(getPrimaryConfig().lastModified(), getJobLog().lastModified());
}
public synchronized boolean isRunning() {
return this.ac != null && this.ac.isActive() && this.ac.isRunning();
}
public synchronized CrawlController getCrawlController() {
if(ac==null) {
return null;
}
return (CrawlController) ac.getBean("crawlController");
}
public boolean isPausable() {
CrawlController cc = getCrawlController();
if(cc==null) {
return false;
}
return cc.isActive();
}
public boolean isUnpausable() {
CrawlController cc = getCrawlController();
if(cc==null) {
return false;
}
return cc.isPaused() || cc.isPausing();
}
/**
* Return the configured Checkpointer instance, if there is exactly
* one, otherwise null.
*
* @return Checkpointer
*/
public synchronized CheckpointService getCheckpointService() {
if(ac==null) {
return null;
}
Map<String, CheckpointService> beans =
getJobContext().getBeansOfType(CheckpointService.class);
return (beans.size() == 1) ? beans.values().iterator().next() : null;
}
/**
* Ensure a fresh start for any configuration changes or relaunches,
* by stopping and discarding an existing ApplicationContext.
*
* @return true if teardown is complete when method returns, false if still in progress
*/
public synchronized boolean teardown() {
CrawlController cc = getCrawlController();
if (cc != null) {
cc.requestCrawlStop();
needTeardown = true;
// wait up to 3 seconds for stop
for(int i = 0; i < 11; i++) {
if(cc.isStopComplete()) {
break;
}
try {
Thread.sleep(300);
} catch (InterruptedException e) {
// do nothing
}
}
if (cc.isStopComplete()) {
doTeardown();
}
}
assert needTeardown == (ac != null);
return !needTeardown;
}
// ac guaranteed to be null after this method is called
protected synchronized void doTeardown() {
needTeardown = false;
try {
if (ac != null) {
ac.close();
}
} finally {
// all this stuff should happen even in case ac.close() bugs out
ac = null;
xmlOkAt = new DateTime(0);
if (currentLaunchJobLogHandler != null) {
getJobLogger().removeHandler(currentLaunchJobLogHandler);
currentLaunchJobLogHandler.close();
currentLaunchJobLogHandler = null;
}
getJobLogger().log(Level.INFO, "Job instance discarded");
if (mainJobLogHandler != null) {
getJobLogger().removeHandler(mainJobLogHandler);
mainJobLogHandler.close();
mainJobLogHandler = null;
}
jobLogger = null;
}
}
/**
* Formatter for job.log
*/
public class JobLogFormatter extends Formatter {
@Override
public String format(LogRecord record) {
StringBuilder sb = new StringBuilder();
sb
.append(new DateTime(record.getMillis()))
.append(" ")
.append(record.getLevel())
.append(" ")
.append(record.getMessage())
.append("\n");
return sb.toString();
}
}
/**
* Return all config files included via 'import' statements in the
* primary config (or other included configs).
*
* @param xml File to examine
* @return List<File> of all transitively-imported Files
*/
@SuppressWarnings("unchecked")
public List<File> getImportedConfigs(File xml) {
List<File> imports = new LinkedList<File>();
Document doc = getDomDocument(xml);
if(doc==null) {
return ListUtils.EMPTY_LIST;
}
NodeList importElements = doc.getElementsByTagName("import");
for(int i = 0; i < importElements.getLength(); i++) {
File imported = new File(
getJobDir(),
importElements.item(i).getAttributes().getNamedItem("resource").getTextContent());
imports.add(imported);
imports.addAll(getImportedConfigs(imported));
}
return imports;
}
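// The scan above picks up Spring-style import elements in the cxml, e.g.
// <import resource="shared-settings.xml"/> (filename hypothetical), with
// each resource path resolved relative to the job directory.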
/**
* Return all known ConfigPaths, as an aid to viewing or editing.
*
* @return all ConfigPaths known to the ApplicationContext, in a
* map by name, or an empty map if no ApplicationContext
*/
@SuppressWarnings("unchecked")
public synchronized Map<String,ConfigPath> getConfigPaths() {
if(ac==null) {
return MapUtils.EMPTY_MAP;
}
ConfigPathConfigurer cpc =
(ConfigPathConfigurer)ac.getBean("configPathConfigurer");
return cpc.getAllConfigPaths();
}
/**
* Compute a path relative to the job directory for all contained
* files, or null if the File is not inside the job directory.
*
* @param f File
* @return path relative to the job directory, or null if File not
* inside job dir
*/
public String jobDirRelativePath(File f) {
try {
String filePath = f.getCanonicalPath();
String jobPath = getJobDir().getCanonicalPath();
if(filePath.startsWith(jobPath)) {
String jobRelative = filePath.substring(jobPath.length()).replace(File.separatorChar, '/');
if(jobRelative.startsWith("/")) {
jobRelative = jobRelative.substring(1);
}
return jobRelative;
}
} catch (IOException e) {
getJobLogger().log(Level.WARNING,"bad file: "+f);
}
return null;
}
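// Example with hypothetical paths: if the job dir is /jobs/myjob, then
// /jobs/myjob/latest/crawl.log maps to "latest/crawl.log", while a file
// outside the job dir (say /tmp/other.log) yields null.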
/**
* Log note of all ApplicationEvents.
*
* @see org.springframework.context.ApplicationListener#onApplicationEvent(org.springframework.context.ApplicationEvent)
*/
public void onApplicationEvent(ApplicationEvent event) {
if(event instanceof CrawlStateEvent) {
getJobLogger().log(Level.INFO, ((CrawlStateEvent)event).getState() +
(ac.getCurrentLaunchId() != null ? " " + ac.getCurrentLaunchId() : ""));
}
if (event instanceof StopCompleteEvent) {
synchronized (this) {
if (needTeardown) {
doTeardown();
}
}
}
if(event instanceof CheckpointSuccessEvent) {
getJobLogger().log(Level.INFO, "CHECKPOINTED "+((CheckpointSuccessEvent)event).getCheckpoint().getName());
}
}
/**
* Is it reasonable to offer a launch button
* @return true if launchable
*/
public boolean isLaunchable() {
if (!hasApplicationContext()) {
// ok to try launch if not yet built
return true;
}
if (!hasValidApplicationContext()) {
// never launch if specifically invalid
return false;
}
// launchable if cc not yet instantiated or not yet started
CrawlController cc = getCrawlController();
return cc == null || !cc.hasStarted();
}
public int getAlertCount() {
if (alertThreadGroup != null) {
return alertThreadGroup.getAlertCount();
} else {
return 0;
}
}
protected StatisticsTracker getStats() {
CrawlController cc = getCrawlController();
return cc!=null ? cc.getStatisticsTracker() : null;
}
public Map<String,Object> rateReportData() {
StatisticsTracker stats = getStats();
if (stats == null) {
return null;
}
CrawlStatSnapshot snapshot = stats.getSnapshot();
Map<String,Object> map = new LinkedHashMap<String,Object>();
map.put("currentDocsPerSecond", snapshot.currentDocsPerSecond);
map.put("averageDocsPerSecond", snapshot.docsPerSecond);
map.put("currentKiBPerSec", snapshot.currentKiBPerSec);
map.put("averageKiBPerSec", snapshot.totalKiBPerSec);
return map;
}
public Object rateReport() {
StatisticsTracker stats = getStats();
if(stats==null) {
return "n/a";
}
CrawlStatSnapshot snapshot = stats.getSnapshot();
StringBuilder sb = new StringBuilder();
sb
.append(ArchiveUtils.doubleToString(snapshot.currentDocsPerSecond,2))
.append(" URIs/sec (")
.append(ArchiveUtils.doubleToString(snapshot.docsPerSecond,2))
.append(" avg); ")
.append(snapshot.currentKiBPerSec)
.append(" KB/sec (")
.append(snapshot.totalKiBPerSec)
.append(" avg)");
return sb.toString();
}
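// A rendered rateReport() looks something like (values illustrative):
// "0.35 URIs/sec (1.02 avg); 12 KB/sec (48 avg)"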
public Map<String,Object> loadReportData() {
StatisticsTracker stats = getStats();
if (stats == null) {
return null;
}
CrawlStatSnapshot snapshot = stats.getSnapshot();
Map<String,Object> map = new LinkedHashMap<String,Object>();
map.put("busyThreads", snapshot.busyThreads);
map.put("totalThreads", stats.threadCount());
map.put("congestionRatio", snapshot.congestionRatio);
map.put("averageQueueDepth", snapshot.averageDepth);
map.put("deepestQueueDepth", snapshot.deepestUri);
return map;
}
public Object loadReport() {
StatisticsTracker stats = getStats();
if(stats==null) {
return "n/a";
}
CrawlStatSnapshot snapshot = stats.getSnapshot();
StringBuilder sb = new StringBuilder();
sb
.append(snapshot.busyThreads)
.append(" active of ")
.append(stats.threadCount())
.append(" threads; ")
.append(ArchiveUtils.doubleToString(snapshot.congestionRatio,2))
.append(" congestion ratio; ")
.append(snapshot.deepestUri)
.append(" deepest queue; ")
.append(snapshot.averageDepth)
.append(" average depth");
return sb.toString();
}
public Map<String,Long> uriTotalsReportData() {
StatisticsTracker stats = getStats();
if (stats == null) {
return null;
}
CrawlStatSnapshot snapshot = stats.getSnapshot();
Map<String,Long> totals = new LinkedHashMap<String,Long>();
totals.put("downloadedUriCount", snapshot.downloadedUriCount);
totals.put("queuedUriCount", snapshot.queuedUriCount);
totals.put("totalUriCount", snapshot.totalCount());
totals.put("futureUriCount", snapshot.futureUriCount);
return totals;
}
public String uriTotalsReport() {
Map<String,Long> uriTotals = uriTotalsReportData();
if (uriTotals == null) {
return "n/a";
}
StringBuilder sb = new StringBuilder(64);
sb
.append(uriTotals.get("downloadedUriCount"))
.append(" downloaded + ")
.append(uriTotals.get("queuedUriCount"))
.append(" queued = ")
.append(uriTotals.get("totalUriCount"))
.append(" total");
if(uriTotals.get("futureUriCount") >0) {
sb
.append(" (")
.append(uriTotals.get("futureUriCount"))
.append(" future)");
}
return sb.toString();
}
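// Sample output (illustrative): "1200 downloaded + 3400 queued = 4600 total",
// with " (5 future)" appended only when futureUriCount is greater than zero.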
public Map<String,Long> sizeTotalsReportData() {
StatisticsTracker stats = getStats();
if(stats==null) {
return null;
}
// stats.crawledBytesSummary() also includes totals, so add those in here
TreeMap<String,Long> map = new TreeMap<String,Long>(stats.getCrawledBytes());
map.put("total", stats.getCrawledBytes().getTotalBytes());
map.put("totalCount", stats.getCrawledBytes().getTotalUrls());
return map;
}
public String sizeTotalsReport() {
StatisticsTracker stats = getStats();
if(stats==null) {
return "n/a";
}
return stats.crawledBytesSummary();
}
public Map<String,Object> elapsedReportData() {
StatisticsTracker stats = getStats();
if(stats==null) {
return null;
}
Map<String,Object> map = new LinkedHashMap<String,Object>();
long timeElapsed = stats.getCrawlElapsedTime();
map.put("elapsedMilliseconds", timeElapsed);
map.put("elapsedPretty", ArchiveUtils.formatMillisecondsToConventional(timeElapsed));
return map;
}
public String elapsedReport() {
StatisticsTracker stats = getStats();
if(stats==null) {
return "n/a";
}
long timeElapsed = stats.getCrawlElapsedTime();
return ArchiveUtils.formatMillisecondsToConventional(timeElapsed);
}
public Map<String,Object> threadReportData() {
CrawlController cc = getCrawlController();
if (cc == null) {
return null;
}
return cc.getToeThreadReportShortData();
}
public String threadReport() {
CrawlController cc = getCrawlController();
if(cc==null) {
return "n/a";
}
return cc.getToeThreadReportShort();
}
public Map<String,Object> frontierReportData() {
CrawlController cc = getCrawlController();
if (cc == null) {
return null;
}
return cc.getFrontier().shortReportMap();
}
public String frontierReport() {
CrawlController cc = getCrawlController();
if(cc==null) {
return "n/a";
}
return cc.getFrontierReportShort();
}
public void terminate() {
if (getCrawlController() != null) {
getCrawlController().requestCrawlStop();
}
}
/**
* Utility method for getting a bean or any other object addressable
* with a 'bean path' -- a property-path string (with dots and
* [] indexes) starting with a bean name.
*
* TODO: move elsewhere? on the appContext? a util class?
*
* @param beanPath String 'property-path' with bean name as first segment
* @return Object targeted by beanPath, or null if none
*/
public Object getBeanpathTarget(String beanPath) {
try {
int i = beanPath.indexOf(".");
String beanName = i<0?beanPath:beanPath.substring(0,i);
Object namedBean = ac.getBean(beanName);
if (i<0) {
return namedBean;
} else {
BeanWrapperImpl bwrap = new BeanWrapperImpl(namedBean);
String propPath = beanPath.substring(i+1);
return bwrap.getPropertyValue(propPath);
}
} catch (BeansException e) {
return null;
}
}
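// Example: getBeanpathTarget("crawlController") returns the CrawlController
// bean itself; a dotted path such as "crawlController.maxToeThreads" (property
// name hypothetical) would walk into that bean's properties.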
public String getJobStatusDescription() {
if(!hasApplicationContext()) {
return "Unbuilt";
} else if(isRunning()) {
if (!getCrawlController().isRunning()) {
return "Not running: " +
getCrawlController().getCrawlExitStatus();
}
return "Active: "+getCrawlController().getState();
} else if(isLaunchable()){
return "Ready";
} else {
return "Finished: "+getCrawlController().getCrawlExitStatus();
}
}
protected Semaphore exportLock = new Semaphore(1);
public long exportPendingUris() {
CrawlController cc = getCrawlController();
if (cc==null) {
return -1L;
}
if (!cc.isPaused()) {
cc.requestCrawlPause();
return -2L;
}
Frontier f = cc.getFrontier();
if (f == null) {
return -3L;
}
long pendingUrisCount = 0L;
boolean bLocked = exportLock.tryAcquire();
if (bLocked) {
try {
File outFile = new File(getJobDir(), "pendingUris.txt");
if (outFile.exists()) {
outFile.delete();
}
FileOutputStream out = new FileOutputStream(outFile);
OutputStreamWriter outStreamWriter = new OutputStreamWriter(out, StandardCharsets.UTF_8);
PrintWriter writer = new PrintWriter(new BufferedWriter(outStreamWriter, 65536));
pendingUrisCount = f.exportPendingUris(writer);
writer.close();
outStreamWriter.close();
out.close();
}
catch (IOException e) {
LOGGER.log(Level.SEVERE, e.getMessage(), e);
}
finally {
exportLock.release();
}
}
else {
return -4L;
}
return pendingUrisCount;
}
}//EOC