All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.pig.scripting.jruby.JrubyScriptEngine Maven / Gradle / Ivy

There is a newer version: 0.17.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.pig.scripting.jruby;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pig.FuncSpec;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.DataType;
import org.apache.pig.impl.PigContext;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.util.Utils;
import org.apache.pig.scripting.ScriptEngine;
import org.apache.pig.tools.pigstats.PigStats;

import org.jruby.Ruby;
import org.jruby.RubyArray;
import org.jruby.RubyBoolean;
import org.jruby.embed.LocalContextScope;
import org.jruby.embed.LocalVariableBehavior;
import org.jruby.embed.ScriptingContainer;
import org.jruby.javasupport.JavaEmbedUtils.EvalUnit;
import org.jruby.CompatVersion;

import com.google.common.collect.Maps;

/**
 * Implementation of the script engine for Jruby, which facilitates the registration
 * of scripts as UDFs, and also provides information (via the nested class RubyFunctions)
 * on the registered functions.
 */
public class JrubyScriptEngine extends ScriptEngine {
    private static final Log LOG = LogFactory.getLog(JrubyScriptEngine.class);

    //TODO test if it is necessary to have a per script (or even per method) runtime. PRO: avoid collisions CON: a bunch of runtimes, which could be slow
    /**
     * The single JRuby scripting container used by this class and by the nested
     * {@code RubyFunctions} cache. It is static, so it is shared by every engine instance.
     */
    protected static final ScriptingContainer rubyEngine =
            new ScriptingContainer(LocalContextScope.SINGLETHREAD, LocalVariableBehavior.PERSISTENT);

    static {
        // Configure the shared container for Ruby 1.9 compatibility.
        rubyEngine.setCompatVersion(CompatVersion.RUBY1_9);
    }

    /** Becomes true once registerFunctions has added the JRuby jar and pigudf.rb to a PigContext. */
    private boolean isInitialized = false;

    /**
     * This is a static class which provides functionality around the functions that are registered with Pig.
     */
    static class RubyFunctions {
        /**
         * This cache maps function type to a map that maps path to a map of function name to the object
         * which contains information about that function. In the case of an EvalFunc, there is a special
         * function which encapsulates information about the function. In the case of an Accumulator or
         * Algebraic, it is just an instance of the Class object that extends AccumulatorPigUdf or
         * AlgebraicPigUdf, respectively.
         */
        private static Map>> functionsCache = Maps.newHashMap();

        private static Map alreadyRunCache = Maps.newHashMap();

        private static Map cacheFunction = Maps.newHashMap();

        static {
            //TODO use an enum instead?
            cacheFunction.put("evalfunc", "PigUdf.get_functions_to_register");
            cacheFunction.put("accumulator", "AccumulatorPigUdf.classes_to_register");
            cacheFunction.put("algebraic", "AlgebraicPigUdf.classes_to_register");

            functionsCache.put("evalfunc", new HashMap>());
            functionsCache.put("accumulator", new HashMap>());
            functionsCache.put("algebraic", new HashMap>());
        }

        @SuppressWarnings("unchecked")
        private static Map getFromCache(String path, Map> cacheToUpdate, String regCommand) {
            Boolean runCheck = alreadyRunCache.get(path);
            if (runCheck == null || !runCheck.booleanValue()) {
                for (Map.Entry>> entry : functionsCache.entrySet())
                    entry.getValue().remove(path);

                rubyEngine.runScriptlet(getScriptAsStream(path), path);

                alreadyRunCache.put(path, true);
            }

            Map funcMap = cacheToUpdate.get(path);

            if (funcMap == null) {
                funcMap = (Map)rubyEngine.runScriptlet(regCommand);
                cacheToUpdate.put(path, funcMap);
            }

            return funcMap;
        }

        public static Map getFunctions(String cache, String path) {
            return getFromCache(path, functionsCache.get(cache), cacheFunction.get(cache));
        }
    }

    /**
     * Evaluates the script containing ruby udfs to determine what udfs are defined as well as
     * what libraries and other external resources are necessary. These libraries and resources
     * are then packaged with the job jar itself.
     *
     * @param path       path of the ruby script that defines the udfs
     * @param namespace  prefix under which each function is registered, as {@code namespace.method}
     * @param pigContext context that receives the function registrations, script files and script jars
     * @throws IOException if an algebraic udf declares an output type with no matching EvalFunc class,
     *                     or if a called helper fails
     */
    @Override
    public void registerFunctions(String path, String namespace, PigContext pigContext) throws IOException {
        if (!isInitialized) {
            pigContext.scriptJars.add(getJarPath(Ruby.class));
            pigContext.addScriptFile("pigudf.rb", "pigudf.rb");
            isInitialized = true;
        }

        Map<String, Object> evalFuncs = RubyFunctions.getFunctions("evalfunc", path);
        for (Map.Entry<String, Object> entry : evalFuncs.entrySet()) {
            String method = entry.getKey();

            // The name is fetched from the ruby object but is not used below; the call is kept as it was.
            String functionType = rubyEngine.callMethod(entry.getValue(), "name", String.class);

            FuncSpec funcspec = new FuncSpec(JrubyEvalFunc.class.getCanonicalName() + "('" + path + "','" + method +"')");
            pigContext.registerFunction(namespace + "." + method, funcspec);
        }

        Map<String, Object> accumulators = RubyFunctions.getFunctions("accumulator", path);
        for (Map.Entry<String, Object> entry : accumulators.entrySet()) {
            String method = entry.getKey();

            if (rubyEngine.callMethod(entry.getValue(), "check_if_necessary_methods_present", RubyBoolean.class).isFalse()) {
                throw new RuntimeException("Method " + method + " does not have all of the required methods present!");
            }

            pigContext.registerFunction(namespace + "." + method, new FuncSpec(JrubyAccumulatorEvalFunc.class.getCanonicalName() + "('" + path + "','" + method +"')"));
        }

        Map<String, Object> algebraics = RubyFunctions.getFunctions("algebraic", path);
        for (Map.Entry<String, Object> entry : algebraics.entrySet()) {
            String method = entry.getKey();

            if (rubyEngine.callMethod(entry.getValue(), "check_if_necessary_methods_present", RubyBoolean.class).isFalse()) {
                throw new RuntimeException("Method " + method + " does not have all of the required methods present!");
            }

            Schema schema = PigJrubyLibrary.rubyToPig(rubyEngine.callMethod(entry.getValue(), "get_output_schema", RubySchema.class));
            String canonicalName = JrubyAlgebraicEvalFunc.class.getCanonicalName() + "$";

            // In the case of an Algebraic UDF, a type specific EvalFunc is necessary (ie not EvalFunc<Object>), so we
            // inspect the type and instantiate the proper class.
            switch (schema.getField(0).type) {
                case DataType.BAG: canonicalName += "Bag"; break;
                case DataType.TUPLE: canonicalName += "Tuple"; break;
                case DataType.CHARARRAY: canonicalName += "Chararray"; break;
                case DataType.DOUBLE: canonicalName += "Double"; break;
                case DataType.FLOAT: canonicalName += "Float"; break;
                case DataType.INTEGER: canonicalName += "Integer"; break;
                case DataType.LONG: canonicalName += "Long"; break;
                case DataType.DATETIME: canonicalName += "DateTime"; break;
                case DataType.MAP: canonicalName += "Map"; break;
                case DataType.BYTEARRAY: canonicalName += "DataByteArray"; break;
                default: throw new ExecException("Unable to instantiate Algebraic EvalFunc " + method + " as schema type is invalid");
            }

            canonicalName += "JrubyAlgebraicEvalFunc";

            pigContext.registerFunction(namespace + "." + method, new FuncSpec(canonicalName + "('" + path + "','" + method +"')"));
        }

        // Determine what external dependencies to ship with the job jar
        HashSet<String> toShip = libsToShip();
        for (String lib : toShip) {
            File libFile = new File(lib);
            if (lib.endsWith(".rb")) {
                pigContext.addScriptFile(lib);
            } else if (libFile.isDirectory()) {
                //
                // Need to package entire contents of directory
                //
                String dirPath = libFile.getPath();
                List<File> files = listRecursively(libFile);
                for (File file : files) {
                    if (file.isDirectory()) {
                        continue;
                    }
                    String filePath = file.getPath();
                    if (file.getName().endsWith(".jar") || file.getName().endsWith(".zip")) {
                        pigContext.scriptJars.add(filePath);
                    } else {
                        // Strip the directory prefix literally. String.replaceFirst would interpret the
                        // directory path as a regular expression, which breaks on paths containing
                        // characters such as '+', '(' or '\'.
                        String relative = filePath.startsWith(dirPath) ? filePath.substring(dirPath.length()) : filePath;
                        String localPath = libFile.getName() + relative;
                        pigContext.addScriptFile(localPath, filePath);
                    }
                }
            } else {
                pigContext.scriptJars.add(lib);
            }
        }
    }
    /**
     * Consults the scripting container, after the script has been evaluated, to
     * determine what dependencies to ship.
     * 

* FIXME: Corner cases like the following: "def foobar; require 'json'; end" * are NOT dealt with using this method */ private HashSet libsToShip() { RubyArray loadedLibs = (RubyArray)rubyEngine.get("$\""); RubyArray loadPaths = (RubyArray)rubyEngine.get("$LOAD_PATH"); // Current directory first loadPaths.add(0, ""); HashSet toShip = new HashSet(); HashSet shippedLib = new HashSet(); for (Object loadPath : loadPaths) { for (Object lib : loadedLibs) { if (lib.toString().equals("pigudf.rb")) continue; if (shippedLib.contains(lib)) continue; String possiblePath = (loadPath.toString().isEmpty()?"":loadPath.toString() + File.separator) + lib.toString(); if ((new File(possiblePath)).exists()) { // remove prefix ./ toShip.add(possiblePath.startsWith("./")?possiblePath.substring(2): possiblePath); shippedLib.add(lib); } } } return toShip; } private static List listRecursively(File directory) { File[] entries = directory.listFiles(); ArrayList files = new ArrayList(); // Go over entries for (File entry : entries) { files.add(entry); if (entry.isDirectory()) { files.addAll(listRecursively(entry)); } } return files; } @Override protected Map> main(PigContext pigContext, String scriptFile) throws IOException { throw new UnsupportedOperationException("Unimplemented"); } @Override protected String getScriptingLang() { return "jruby"; } @Override protected Map getParamsFromVariables() throws IOException { Map vars = Maps.newHashMap(); return vars; } }