/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.crawler.framework;
import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.filefilter.FileFilterUtils;
import org.archive.util.ArchiveUtils;
/**
* Implementation for Engine. Jobs and profiles are stored in a
* directory called the jobsDir. The jobs are contained as subdirectories of
 * the jobsDir.
*
* @author pjack
* @author gojomo
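 *
 * <p>Illustrative usage, a minimal sketch only; the jobs path and job
 * name below are made-up examples, not defaults of this class:
 * <pre>
 * Engine engine = new Engine(new File("/var/heritrix/jobs"));
 * CrawlJob job = engine.getJob("weekly-crawl"); // rescans jobsDir if unknown
 * if (job != null) {
 *     engine.requestLaunch("weekly-crawl");
 * }
 * // ... later, terminate any running jobs and wait for them to stop
 * engine.shutdown();
 * </pre>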
*/
public class Engine {
@SuppressWarnings("unused")
private static final long serialVersionUID = 4L;
final public static String LOGS_DIR_NAME = "logs subdirectory";
final public static String REPORTS_DIR_NAME = "reports subdirectory";
final private static Logger LOGGER =
Logger.getLogger(Engine.class.getName());
/** directory where job directories are expected */
protected File jobsDir;
/** map of job short names -> CrawlJob instances */
    protected HashMap<String,CrawlJob> jobConfigs = new HashMap<String,CrawlJob>();
protected String profileCxmlPath =
"/org/archive/crawler/restlet/profile-crawler-beans.cxml";
public Engine(File jobsDir) {
this.jobsDir = jobsDir;
try {
org.archive.util.FileUtils.ensureWriteableDirectory(jobsDir);
} catch (IOException e) {
throw new IllegalStateException(e);
}
findJobConfigs();
// TODO: cleanup any cruft from improperly ended jobs
}
/**
* Find all job configurations in the usual place -- subdirectories
* of the jobs directory with files ending '.cxml', and from jobPathFiles
* (previously added by user) found in the jobs directory
*/
public void findJobConfigs() {
// TODO: allow other places/paths to be scanned/added as well?
// remove crawljobs whose directories have disappeared
// TODO: try a more delicate cleanup; eg: if appCtx exists?
for(String jobName: jobConfigs.keySet().toArray(new String[0])) {
CrawlJob cj = jobConfigs.get(jobName);
if(!cj.getJobDir().exists()) {
jobConfigs.remove(jobName);
}
}
// just in case...
if (! jobsDir.exists()) {
LOGGER.log(Level.SEVERE,"jobsDir has disappeared: "+jobsDir.toString());
return;
}
// discover any new job directories
for (File candidateFile: jobsDir.listFiles()) {
File jobFile = candidateFile;
if (candidateFile.getName().endsWith(".jobpath")) {
// convert .jobpaths to the referenced external directory
jobFile = getJobDirectoryFrom(candidateFile);
}
            // skip null jobFile (bad .jobpath) here; addJobDirectory(null)
            // below returns false, so the warning still fires
            if (jobFile != null && jobConfigs.containsKey(jobFile.getName())) {
continue;
}
if(!addJobDirectory(jobFile)) {
LOGGER.log(Level.WARNING,"invalid job directory: " + jobFile
+ " where job expected from: " + candidateFile);
}
}
}
/**
* Return the job directory File read from the supplied ".jobpath" file,
* or null on any error.
*/
protected File getJobDirectoryFrom(File jobPathFile) {
try {
return new File(FileUtils.readFileToString(jobPathFile).trim());
} catch (IOException e) {
LOGGER.log(Level.SEVERE,"bad .jobpath: "+jobPathFile, e);
return null;
}
}
/**
 * Adds a job directory to the Engine's known jobConfigs, if not already present.
*
* @param dir directory to be added
* @return true if directory successfully added, false for any failure
*/
public boolean addJobDirectory(File dir) {
if(dir==null) {
return false;
}
File[] candidateConfigs = dir.listFiles(new FilenameFilter() {
public boolean accept(File dir, String name) {
return name.endsWith(".cxml");
}});
if(candidateConfigs==null || candidateConfigs.length == 0) {
// no CXML file found!
return false;
}
if(jobConfigs.containsKey(dir.getName())) {
// same-name job already exists
return false;
}
for (File cxml : candidateConfigs) {
try {
CrawlJob cj = new CrawlJob(cxml);
if(!cj.getJobDir().getParentFile().equals(getJobsDir())) {
writeJobPathFile(cj);
}
jobConfigs.put(cj.getShortName(),cj);
LOGGER.log(Level.INFO,"added crawl job: " + cj.getShortName());
return true;
            } catch (IOException ioe) {
                LOGGER.log(Level.SEVERE,"unable to add job directory: " + dir, ioe);
} catch (IllegalArgumentException iae) {
LOGGER.log(Level.SEVERE,"bad cxml: "+cxml,iae);
}
}
// path rejected for some reason
return false;
}
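    /**
     * @return live map of job short names to known CrawlJob instances
     */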
    public Map<String,CrawlJob> getJobConfigs() {
return jobConfigs;
}
/**
* Copy a job to a new location, possibly making a job
* a profile or a profile a runnable job.
*
* @param orig CrawlJob representing source
* @param destDir File location destination
* @param asProfile true if destination should become a profile
* @throws IOException
*/
public synchronized void copy(CrawlJob orig, File destDir, boolean asProfile)
throws IOException {
org.archive.util.FileUtils.ensureWriteableDirectory(destDir);
if(destDir.list().length>0) {
throw new IOException("destination dir not empty");
}
File srcDir = orig.getPrimaryConfig().getParentFile();
// FIXME: Add option for only copying history DB
// FIXME: Don't hardcode these names
// FIXME: (?) copy any referenced file (ConfigFile/ConfigPath),
// even outside the job directory?
// copy all simple files except the 'job.log' and its '.lck' (if any)
FileUtils.copyDirectory(srcDir, destDir,
FileFilterUtils.andFileFilter(
FileFilterUtils.fileFileFilter(),
FileFilterUtils.notFileFilter(
FileFilterUtils.prefixFileFilter("job.log"))));
// ...and all contents of 'resources' subdir...
File srcResources = new File(srcDir, "resources");
if (srcResources.isDirectory()) {
FileUtils.copyDirectory(srcResources, new File(destDir, "resources"));
}
File newPrimaryConfig = new File(destDir, orig.getPrimaryConfig().getName());
if(asProfile) {
if(!orig.isProfile()) {
// rename cxml to have 'profile-' prefix
FileUtils.moveFile(
newPrimaryConfig,
new File(destDir, "profile-"+newPrimaryConfig.getName()));
}
} else {
if(orig.isProfile()) {
// rename cxml to remove 'profile-' prefix
FileUtils.moveFile(
newPrimaryConfig,
new File(destDir, newPrimaryConfig.getName().substring(8)));
}
}
findJobConfigs();
}
/**
* Copy a job to a new location, possibly making a job
* a profile or a profile a runnable job.
*
* @param cj CrawlJob representing source
* @param copyTo String location destination; interpreted relative to jobsDir
* @param asProfile true if destination should become a profile
* @throws IOException
*/
public void copy(CrawlJob cj, String copyTo, boolean asProfile) throws IOException {
File dest = new File(copyTo);
if(!dest.isAbsolute()) {
dest = new File(jobsDir,copyTo);
}
copy(cj,dest,asProfile);
}
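    /**
     * @return Heritrix version string, per {@link ArchiveUtils#VERSION}
     */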
public String getHeritrixVersion(){
return ArchiveUtils.VERSION;
}
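    /**
     * Deletes the given job's directory from disk; the next
     * findJobConfigs() scan prunes the corresponding jobConfigs entry.
     */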
public synchronized void deleteJob(CrawlJob job) throws IOException {
FileUtils.deleteDirectory(job.getJobDir());
}
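    /**
     * Launch the job with the given short name. Note: assumes the name is
     * already known to this Engine; an unknown name causes a
     * NullPointerException.
     */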
public void requestLaunch(String shortName) {
jobConfigs.get(shortName).launch();
}
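    /**
     * Get the CrawlJob of the given short name, rescanning the jobs
     * directory first if the name is not yet known.
     *
     * @param shortName job short name
     * @return the CrawlJob, or null if none is found even after a rescan
     */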
public CrawlJob getJob(String shortName) {
if(!jobConfigs.containsKey(shortName)) {
// try a rescan if not already present
findJobConfigs();
}
return jobConfigs.get(shortName);
}
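    /**
     * @return the directory in which job directories are expected
     */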
public File getJobsDir() {
return jobsDir;
}
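    /**
     * Heap figures as a map, suitable for structured reporting.
     *
     * @return map with usedBytes, totalBytes, and maxBytes entries
     */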
    public Map<String,Long> heapReportData() {
        Map<String,Long> map = new LinkedHashMap<String,Long>();
map.put("usedBytes", Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory());
map.put("totalBytes", Runtime.getRuntime().totalMemory());
map.put("maxBytes", Runtime.getRuntime().maxMemory());
return map;
}
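    /**
     * One-line human-readable heap summary; for example (illustrative
     * values only):
     * "51200 KiB used; 131072 KiB current heap; 524288 KiB max heap"
     */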
public String heapReport() {
long totalMemory = Runtime.getRuntime().totalMemory();
long freeMemory = Runtime.getRuntime().freeMemory();
long maxMemory = Runtime.getRuntime().maxMemory();
StringBuilder sb = new StringBuilder(64);
sb
.append((totalMemory-freeMemory)/1024)
.append(" KiB used; ")
.append(totalMemory/1024)
.append(" KiB current heap; ")
.append(maxMemory/1024)
.append(" KiB max heap");
return sb.toString();
}
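    /**
     * Terminate any running jobs and wait (without timeout) for them to
     * reach a non-running state.
     */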
public void shutdown() {
// TODO stop everything
for(CrawlJob job : jobConfigs.values()) {
if(job.isRunning()) {
job.terminate();
}
}
waitForNoRunningJobs(0);
}
/**
* Wait for all jobs to be in non-running state, or until timeout
* (given in ms) elapses. Use '0' for no timeout (wait as long as
 * necessary).
*
* @param timeout
* @return true if timeout occurred and a job is (possibly) still running
*/
public boolean waitForNoRunningJobs(long timeout) {
long startTime = System.currentTimeMillis();
// wait for all jobs to not be running
outer: while(true) {
            if(timeout>0 && System.currentTimeMillis()-startTime > timeout) {
return true;
}
try {
Thread.sleep(250);
} catch (InterruptedException e) {
break;
}
for(CrawlJob job : jobConfigs.values()) {
if(job.isRunning()) {
continue outer;
}
}
break;
}
try {
// wait an extra quarter second for good measure
Thread.sleep(250);
} catch (InterruptedException e) {
// ignore
}
return false;
}
/**
* @return InputStream resource from defined profile CXML path
*/
protected InputStream getProfileCxmlResource() {
return getClass().getResourceAsStream(profileCxmlPath);
}
    /**
     * Create a new job directory and copy the profile CXML into it as a
     * non-profile CXML.
     *
     * @param newJobDir new job directory
     * @return true if the job was created; false on any IO error
     */
public boolean createNewJobWithDefaults(File newJobDir) {
try {
// get crawler-beans template into string
InputStream inStream = getProfileCxmlResource();
            String defaultCxmlStr = IOUtils.toString(inStream);
inStream.close();
// write default crawler-beans string to new job dir
org.archive.util.FileUtils.ensureWriteableDirectory(newJobDir);
File newJobCxml = new File(newJobDir,"crawler-beans.cxml");
FileUtils.writeStringToFile(newJobCxml, defaultCxmlStr);
return true;
} catch (IOException e) {
LOGGER.log(Level.SEVERE,"failed to create new job: "
+ newJobDir.getAbsolutePath());
return false;
}
}
/**
* Writes a .jobpath file for the new CrawlJob, whose directory is
* outside the main Engine jobs directory.
*
* @param job CrawlJob whose main directory the .jobpath should point to
* @throws IOException for any IO error
*/
public void writeJobPathFile(CrawlJob job) throws IOException {
String jobpathFileName = job.getShortName()+".jobpath";
File jobpathFile = new File(jobsDir,jobpathFileName);
FileUtils.writeStringToFile(jobpathFile, job.getJobDir().getAbsolutePath()+"\n");
LOGGER.log(Level.INFO, "wrote jobpath file: " + jobpathFileName);
}
}