All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.archive.modules.ScriptedProcessor Maven / Gradle / Ivy

Go to download

This project contains some of the configurable modules used within the Heritrix application to crawl the web. The modules in this project can be used in applications other than Heritrix, however.

There is a newer version: 3.5.0
Show newest version
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.archive.modules;

import java.io.Reader;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import javax.script.ScriptException;

import org.apache.commons.io.IOUtils;
import org.archive.io.ReadSource;
import org.springframework.beans.BeansException;
import org.springframework.beans.factory.BeanInitializationException;
import org.springframework.beans.factory.InitializingBean;
import org.springframework.beans.factory.annotation.Required;
import org.springframework.context.ApplicationContext;
import org.springframework.context.ApplicationContextAware;

/**
 * A processor which runs a JSR-223 script on the CrawlURI. 
 *
 * Script source may be provided via a file local to the crawler or
 * an inline configuration string. 
 * 
 * The source must include a function "run()" taking one argument. 
 * Each processed CrawlURI is passed to this script function. 
 * 
 * Other variables available to the script include 'self' (this 
 * ScriptedProcessor instance) and 'context' (the crawl's 
 * ApplicationContext instance, from which all named beans are
 * reachable). 
 * 
 * TODO: provide way to trigger reload of script mid-crawl; perhaps
 * by watching for a certain applicationEvent? 
 * 
 * @author gojomo
 * @version $Date$, $Revision$
 */
public class ScriptedProcessor extends Processor 
implements ApplicationContextAware, InitializingBean {

    @SuppressWarnings("unused")
    private static final long serialVersionUID = 3L;

    private static final Logger logger =
        Logger.getLogger(ScriptedProcessor.class.getName());

    /** engine name; default "beanshell" */
    protected String engineName = "beanshell";
    public String getEngineName() {
        return this.engineName;
    }
    public void setEngineName(String name) {
        this.engineName = name;
    }
    
    protected ReadSource scriptSource = null;
    public ReadSource getScriptSource() {
        return this.scriptSource;
    }
    @Required
    public void setScriptSource(ReadSource source) {
        this.scriptSource = source; 
    }

    /**
     * Whether each ToeThread should get its own independent script 
     * engine, or they should share synchronized access to one 
     * engine. Default is true, meaning each thread gets its own 
     * isolated engine.
     */
    protected boolean isolateThreads = true; 
    public boolean getIsolateThreads() {
        return isolateThreads;
    }
    public void setIsolateThreads(boolean isolateThreads) {
        this.isolateThreads = isolateThreads;
    }

    protected ApplicationContext appCtx;
    public void setApplicationContext(ApplicationContext applicationContext) throws BeansException {
        this.appCtx = applicationContext;
    }
    
    transient protected ThreadLocal threadEngine = 
        new ThreadLocal();
    protected ScriptEngine sharedEngine;

    /**
     * Constructor.
     */
    public ScriptedProcessor() {
        super();
    }

    public void afterPropertiesSet() throws Exception {
        // fail at build-time if script engine not available
        if(null == new ScriptEngineManager().getEngineByName(engineName)) {
            throw new BeanInitializationException("named ScriptEngine not available");
        }
    }
    protected boolean shouldProcess(CrawlURI curi) {
        return true;
    }
    
    @Override
    protected void innerProcess(CrawlURI curi) {
        // depending on previous configuration, engine may 
        // be local to this thread or shared
        ScriptEngine engine = getEngine(); 
        synchronized(engine) {
            // synchronization is harmless for local thread engine,
            // necessary for shared engine
            engine.put("curi",curi);
            engine.put("appCtx", appCtx);
            try {
                engine.eval("process(curi)");
            } catch (ScriptException e) {
                logger.log(Level.WARNING,e.getMessage(),e);
            } finally { 
                engine.put("curi", null);
                engine.put("appCtx", null);
            }
        }
    }

    /**
     * Get the proper ScriptEngine instance -- either shared or local 
     * to this thread. 
     * @return ScriptEngine to use
     */
    protected synchronized ScriptEngine getEngine() {
        if (getIsolateThreads()) {
            if (threadEngine.get() == null) {
                threadEngine.set(newEngine());
            }
            return threadEngine.get();
        } else {
            if (sharedEngine == null) {
                sharedEngine = newEngine();
            }
            return sharedEngine;
        }
    }

    /**
     * Create a new {@link ScriptEngine} instance, preloaded with any supplied
     * source file and the variables 'self' (this {@link ScriptedProcessor}) and
     * 'context' (the {@link ApplicationContext}).
     * 
     * @return the new ScriptEngine instance
     */
    protected ScriptEngine newEngine() {
        ScriptEngine interpreter = new ScriptEngineManager().getEngineByName(engineName);

        interpreter.put("self", this);
        interpreter.put("context", appCtx);
        
        Reader reader = null;
        try {
            reader = getScriptSource().obtainReader();
            interpreter.eval(reader);
        } catch (ScriptException e) {
            logger.log(Level.SEVERE,"script problem",e);
        } finally {
            IOUtils.closeQuietly(reader);
        }

        return interpreter; 
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy