org.archive.modules.ScriptedProcessor Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of heritrix-modules Show documentation
Show all versions of heritrix-modules Show documentation
This project contains some of the configurable modules used within the
Heritrix application to crawl the web. The modules in this project can
be used in applications other than Heritrix, however.
/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.modules;
import java.io.Reader;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import javax.script.ScriptException;
import org.apache.commons.io.IOUtils;
import org.archive.io.ReadSource;
import org.springframework.beans.BeansException;
import org.springframework.beans.factory.BeanInitializationException;
import org.springframework.beans.factory.InitializingBean;
import org.springframework.beans.factory.annotation.Required;
import org.springframework.context.ApplicationContext;
import org.springframework.context.ApplicationContextAware;
/**
* A processor which runs a JSR-223 script on the CrawlURI.
*
* Script source may be provided via a file local to the crawler or
* an inline configuration string.
*
* The source must define a one-argument function "process"; it is
* invoked as "process(curi)" for each processed CrawlURI.
*
* Other variables available to the script include 'self' (this
* ScriptedProcessor instance) and 'context' (the crawl's
* ApplicationContext instance, from which all named beans are
* reachable).
*
* TODO: provide way to trigger reload of script mid-crawl; perhaps
* by watching for a certain applicationEvent?
*
* @author gojomo
* @version $Date$, $Revision$
*/
public class ScriptedProcessor extends Processor
        implements ApplicationContextAware, InitializingBean {
    @SuppressWarnings("unused")
    private static final long serialVersionUID = 3L;

    private static final Logger logger =
            Logger.getLogger(ScriptedProcessor.class.getName());

    /** Name used to look up the JSR-223 script engine; default "beanshell". */
    protected String engineName = "beanshell";

    /**
     * @return the name used to look up the script engine
     */
    public String getEngineName() {
        return this.engineName;
    }

    /**
     * @param name the name used to look up the script engine
     */
    public void setEngineName(String name) {
        this.engineName = name;
    }

    /** Source from which the script text is read when an engine is created. */
    protected ReadSource scriptSource = null;

    /**
     * @return the configured script source, or null if none has been set
     */
    public ReadSource getScriptSource() {
        return this.scriptSource;
    }

    /**
     * @param source the source from which the script text is read
     */
    @Required
    public void setScriptSource(ReadSource source) {
        this.scriptSource = source;
    }

    /**
     * Whether each ToeThread should get its own independent script
     * engine, or they should share synchronized access to one
     * engine. Default is true, meaning each thread gets its own
     * isolated engine.
     */
    protected boolean isolateThreads = true;

    /**
     * @return true if each thread gets its own engine, false if one engine
     *         is shared
     */
    public boolean getIsolateThreads() {
        return isolateThreads;
    }

    /**
     * @param isolateThreads true to give each thread its own engine, false
     *                       to share a single engine
     */
    public void setIsolateThreads(boolean isolateThreads) {
        this.isolateThreads = isolateThreads;
    }

    /** Application context handed to scripts as 'context' and 'appCtx'. */
    protected ApplicationContext appCtx;

    /**
     * Stores the application context so it can be exposed to scripts.
     *
     * @param applicationContext the context supplied by the container
     */
    @Override
    public void setApplicationContext(ApplicationContext applicationContext) throws BeansException {
        this.appCtx = applicationContext;
    }

    /**
     * Per-thread engine, used when {@link #getIsolateThreads()} is true.
     * Typed (rather than raw) so that {@link #getEngine()} can return its
     * value without a cast.
     */
    transient protected ThreadLocal<ScriptEngine> threadEngine =
            new ThreadLocal<ScriptEngine>();

    /** Single engine used when {@link #getIsolateThreads()} is false. */
    protected ScriptEngine sharedEngine;

    /**
     * Constructor.
     */
    public ScriptedProcessor() {
        super();
    }

    /**
     * Verifies that an engine can be obtained for the configured engine
     * name.
     *
     * @throws BeanInitializationException if no engine is available under
     *                                     the configured name
     */
    @Override
    public void afterPropertiesSet() throws Exception {
        // fail at build-time if script engine not available
        if (null == new ScriptEngineManager().getEngineByName(engineName)) {
            throw new BeanInitializationException(
                    "named ScriptEngine not available: " + engineName);
        }
    }

    /**
     * @param curi the URI under consideration
     * @return always true
     */
    @Override
    protected boolean shouldProcess(CrawlURI curi) {
        return true;
    }

    /**
     * Runs the script's "process" function on the given URI.
     *
     * The URI is bound as 'curi' and the application context as 'appCtx'
     * for the duration of the call; both bindings are reset to null
     * afterwards. A {@link ScriptException} raised by the call is logged at
     * WARNING level and not rethrown.
     *
     * @param curi the URI to hand to the script
     */
    @Override
    protected void innerProcess(CrawlURI curi) {
        // depending on previous configuration, engine may
        // be local to this thread or shared
        ScriptEngine engine = getEngine();
        synchronized (engine) {
            // synchronization is harmless for local thread engine,
            // necessary for shared engine
            engine.put("curi", curi);
            engine.put("appCtx", appCtx);
            try {
                engine.eval("process(curi)");
            } catch (ScriptException e) {
                logger.log(Level.WARNING, e.getMessage(), e);
            } finally {
                // drop the references so the engine does not keep them alive
                engine.put("curi", null);
                engine.put("appCtx", null);
            }
        }
    }

    /**
     * Get the proper ScriptEngine instance -- either shared or local
     * to this thread. The engine is created on first request.
     *
     * @return ScriptEngine to use
     */
    protected synchronized ScriptEngine getEngine() {
        if (getIsolateThreads()) {
            ScriptEngine engine = threadEngine.get();
            if (engine == null) {
                engine = newEngine();
                threadEngine.set(engine);
            }
            return engine;
        }
        if (sharedEngine == null) {
            sharedEngine = newEngine();
        }
        return sharedEngine;
    }

    /**
     * Create a new {@link ScriptEngine} instance, preloaded with any supplied
     * source file and the variables 'self' (this {@link ScriptedProcessor}) and
     * 'context' (the {@link ApplicationContext}).
     *
     * A {@link ScriptException} raised while evaluating the source is logged
     * at SEVERE level and not rethrown; the engine is still returned.
     *
     * @return the new ScriptEngine instance
     * @throws IllegalStateException if no engine is available under the
     *                               configured name
     */
    protected ScriptEngine newEngine() {
        ScriptEngine interpreter = new ScriptEngineManager().getEngineByName(engineName);
        if (interpreter == null) {
            // the lookup yields null for an unknown name; fail with a clear
            // message instead of a NullPointerException on the next line
            throw new IllegalStateException(
                    "named ScriptEngine not available: " + engineName);
        }
        interpreter.put("self", this);
        interpreter.put("context", appCtx);
        Reader reader = null;
        try {
            reader = getScriptSource().obtainReader();
            interpreter.eval(reader);
        } catch (ScriptException e) {
            logger.log(Level.SEVERE, "script problem", e);
        } finally {
            IOUtils.closeQuietly(reader);
        }
        return interpreter;
    }
}