org.apache.pig.scripting.Pig Maven / Gradle / Ivy
The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pig.scripting;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.LineNumberReader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FsShell;
import org.apache.pig.PigServer;
import org.apache.pig.backend.hadoop.datastorage.ConfigurationUtil;
import org.apache.pig.impl.PigContext;
import org.apache.pig.tools.grunt.GruntParser;
/**
* The class being used in scripts to interact with Pig
*/
public class Pig {
// Logger shared by the static entry points of this class.
private static final Log LOG = LogFactory.getLog(Pig.class);
// Cache of define clauses. Presumably filled by addDefineClause and read by
// getDefineClauses (both defined outside this view) -- TODO confirm.
// NOTE(review): raw List; the element type is not visible here.
private static List defineCache = new ArrayList();
// Cache of script-UDF register clauses. Presumably filled by
// addRegisterScriptUDFClause and read by getRegisterScriptUDFClauses
// (both defined outside this view) -- TODO confirm.
// NOTE(review): raw List; the element type is not visible here.
private static List scriptUDFCache = new ArrayList();
/**
 * Run a filesystem command. Any output from this command is written to
 * stdout or stderr as appropriate.
 * @param cmd Filesystem command to run along with its arguments as one
 * string. Leading and trailing whitespace is ignored, and a "-" is
 * prepended to the command name when it does not already start with one.
 * @return the exit code returned by the filesystem shell, or -1 when
 * <code>cmd</code> is null (nothing is run in that case)
 * @throws IOException if running the command raises any exception; the
 * original exception is attached as the cause
 */
public static int fs(String cmd) throws IOException {
    ScriptPigContext ctx = getScriptContext();
    FsShell shell = new FsShell(ConfigurationUtil.toConfiguration(ctx
            .getPigContext().getProperties()));
    if (cmd == null) {
        return -1;
    }
    // Trim before splitting: split() on a string with leading whitespace
    // yields an empty first token, which would turn the command name into
    // a bare "-" and shift the real command into the argument list.
    String[] cmdTokens = cmd.trim().split("\\s+");
    if (!cmdTokens[0].startsWith("-")) {
        cmdTokens[0] = "-" + cmdTokens[0];
    }
    try {
        return shell.run(cmdTokens);
    } catch (Exception e) {
        throw new IOException("Run filesystem command failed", e);
    }
}
/**
 * Run a sql command. Any output from this command is written to
 * stdout or stderr as appropriate.
 * @param cmd sql command to run along with its arguments as one
 * string. Currently only hcat is supported as a sql backend
 * @return the value returned by {@code GruntParser.runSQLCommand}
 * @throws IOException if the 'pig.sql.type' property is not "hcat"
 * (including when it is unset), if 'hcat.bin' is not defined, or if the
 * file named by 'hcat.bin' does not exist
 */
public static int sql(String cmd) throws IOException {
    PigContext pigContext = getScriptContext().getPigContext();
    // Constant-first equals: an unset 'pig.sql.type' must be reported as an
    // unsupported backend rather than failing with a NullPointerException.
    if (!"hcat".equals(pigContext.getProperties().get("pig.sql.type"))) {
        throw new IOException("sql command only support hcat currently");
    }
    Object hcatBinSetting = pigContext.getProperties().get("hcat.bin");
    if (hcatBinSetting == null) {
        throw new IOException("hcat.bin is not defined. Define it to be your hcat script (Usually $HCAT_HOME/bin/hcat");
    }
    String hcatBin = (String) hcatBinSetting;
    // Check the configured path itself (not the literal name "hcat.bin"),
    // and fail only when that file is missing.
    if (!new File(hcatBin).exists()) {
        throw new IOException(hcatBin + " does not exist. Please check your 'hcat.bin' setting in pig.properties.");
    }
    return GruntParser.runSQLCommand(hcatBin, cmd, false);
}
/**
 * Register a jar for use in Pig. After this call the jar is registered
 * for every subsequent Pig pipeline in this script. To register a jar
 * for a single pipeline only, use register inside that pipeline's
 * definition instead.
 * @param jarfile Path of jar to include.
 * @throws IOException if the indicated jarfile cannot be found.
 */
public static void registerJar(String jarfile) throws IOException {
    LOG.info("Register jar: "+ jarfile);
    final PigContext pigContext = getScriptContext().getPigContext();
    // The server is created without connecting (second argument false).
    new PigServer(pigContext, false).registerJar(jarfile);
}
/**
 * Register scripting UDFs for use in Pig. After this call all UDFs
 * defined in the file are available to every subsequent Pig pipeline
 * in this script. To register UDFs for a single pipeline only, use
 * register inside that pipeline's definition instead.
 * @param udffile Path of the script UDF file
 * @param namespace namespace of the UDFs; an empty string is treated
 * the same as null
 * @throws IOException
 */
public static void registerUDF(String udffile, String namespace)
        throws IOException {
    LOG.info("Register script UDF file: "+ udffile);
    final ScriptPigContext scriptCtx = getScriptContext();
    final ScriptEngine scriptEngine = scriptCtx.getScriptEngine();
    // The script file contains only functions, so there is no need to
    // separate functions from control flow code.
    final String effectiveNamespace =
            (namespace == null || namespace.isEmpty()) ? null : namespace;
    scriptEngine.registerFunctions(udffile, effectiveNamespace,
            scriptCtx.getPigContext());
    addRegisterScriptUDFClause(udffile, effectiveNamespace);
}
/**
 * Define an alias for a UDF or a streaming command. The definition is
 * then present for every subsequent Pig pipeline defined in this
 * script. To define it for a single pipeline only, use define inside
 * that pipeline's definition instead.
 * @param alias name of the defined alias
 * @param definition string this alias is defined as
 * @throws IOException
 */
public static void define(String alias, String definition)
        throws IOException {
    final String logLine = "Add define clause: "+ alias + " -- " + definition;
    LOG.info(logLine);
    addDefineClause(alias, definition);
}
/**
 * Set a variable for use in Pig Latin. The setting is then present for
 * every subsequent Pig pipeline defined in this script. To set it for
 * a single pipeline only, use set inside that pipeline's definition
 * instead.
 * @param var variable to set
 * @param value to set it to
 * @throws IOException
 */
public static void set(String var, String value) throws IOException {
    final PigContext scriptPigContext = getScriptContext().getPigContext();
    // A non-connecting PigServer is built around the script's context and
    // the property is written through the context that server exposes.
    final PigServer server = new PigServer(scriptPigContext, false);
    server.getPigContext().getProperties().setProperty(var, value);
}
/**
 * Define an unnamed Pig pipeline. Delegates to
 * {@code compile(String, String)} with a null name.
 * @param pl Pig Latin definition of the pipeline.
 * @return Pig object representing this pipeline.
 * @throws IOException if the Pig Latin does not compile.
 */
public static Pig compile(String pl) throws IOException {
    final String unnamed = null;
    return compile(unnamed, pl);
}
/**
 * Define a named portion of a Pig pipeline, which allows it to be
 * imported into another pipeline. The resulting script text is the
 * registered script-UDF clauses, then the define clauses, then the
 * supplied Pig Latin followed by a newline.
 * @param name Name that will be used to define this pipeline.
 * The namespace is global.
 * @param pl Pig Latin definition of the pipeline.
 * @return Pig object representing this pipeline.
 * @throws IOException if the Pig Latin does not compile.
 */
public static Pig compile(String name, String pl) throws IOException {
    final ScriptPigContext scriptCtx = getScriptContext();
    final StringBuilder pipeline = new StringBuilder();
    pipeline.append(getRegisterScriptUDFClauses());
    pipeline.append(getDefineClauses());
    pipeline.append(pl);
    pipeline.append("\n");
    return new Pig(pipeline.toString(), scriptCtx, name);
}
/**
 * Define an unnamed Pig pipeline based on Pig Latin in a separate file.
 * Delegates to {@code compileFromFile(String, String)} with a null name.
 * @param filename File to read Pig Latin from. This must be a purely
 * Pig Latin file. It cannot contain host language constructs in it.
 * @return Pig object representing this pipeline.
 * @throws IOException if the Pig Latin does not compile or the file
 * cannot be found.
 */
public static Pig compileFromFile(String filename)
        throws IOException {
    final String unnamed = null;
    return compileFromFile(unnamed, filename);
}
/**
 * Define a named Pig pipeline based on Pig Latin in a separate file,
 * which allows it to be imported into another pipeline. The file's
 * contents are loaded and handed to {@code compile(String, String)}.
 * @param name Name that will be used to define this pipeline.
 * The namespace is global.
 * @param filename File to read Pig Latin from. This must be a purely
 * Pig Latin file. It cannot contain host language constructs in it.
 * @return Pig object representing this pipeline.
 * @throws IOException if the Pig Latin does not compile or the file
 * cannot be found.
 */
public static Pig compileFromFile(String name,
        String filename) throws IOException {
    final String pigLatin = getScriptFromFile(filename);
    return compile(name, pigLatin);
}
//-------------------------------------------------------------------------
/**
 * Bind this pipeline to a set of variables. A value must be provided
 * for every Pig Latin parameter.
 * @param vars map of variables to bind. Keys should be parameters defined
 * in the Pig Latin. Values should be strings that provide values for those
 * parameters. They can be either constants or variables from the host
 * language. Host language variables must contain strings.
 * @return a {@link BoundScript} object
 * @throws IOException if there is not a key for each
 * Pig Latin parameter or if they contain unsupported types.
 */
public BoundScript bind(Map vars) throws IOException {
    return new BoundScript(
            replaceParameters(this.script, vars),
            this.scriptContext,
            this.name);
}
/**
* Bind this to multiple sets of variables. This will
* cause the Pig Latin script to be executed in parallel over these sets of
* variables.
* @param vars list of maps of variables to bind. Keys should be parameters defined
* in the Pig Latin. Values should be strings that provide values for those
* variables. They can be either constants or variables from the host
* language. Host language variables must be strings.
* @return a {@link BoundScript} object
* @throws IOException if there is not a key for each
* Pig Latin parameter or if they contain unsupported types.
*/
public BoundScript bind(List