/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pig.impl.builtin;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import org.apache.commons.io.FileUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.pig.EvalFunc;
import org.apache.pig.ExecType;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.Launcher;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.io.BufferedPositionedInputStream;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.streaming.InputHandler;
import org.apache.pig.impl.streaming.OutputHandler;
import org.apache.pig.impl.streaming.PigStreamingUDF;
import org.apache.pig.impl.streaming.StreamingCommand;
import org.apache.pig.impl.streaming.StreamingUDFException;
import org.apache.pig.impl.streaming.StreamingUDFInputHandler;
import org.apache.pig.impl.streaming.StreamingUDFOutputHandler;
import org.apache.pig.impl.streaming.StreamingUDFOutputSchemaException;
import org.apache.pig.impl.streaming.StreamingUtil;
import org.apache.pig.impl.util.UDFContext;
import org.apache.pig.impl.util.Utils;
import org.apache.pig.parser.ParserException;
import org.apache.pig.scripting.ScriptingOutputCapturer;
import com.google.common.base.Charsets;
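/**
 * EvalFunc that runs a UDF written in an external scripting language
 * (currently CPython) by streaming tuples to and from a controller process
 * over stdin/stdout.
 *
 * A minimal construction sketch (illustrative only; the file path, function
 * name, and schema below are placeholders, and in practice Pig's scripting
 * layer creates these instances when a script is registered, e.g. via a
 * streaming_python registration):
 *
 * <pre>
 * StreamingUDF udf = new StreamingUDF("python", "/tmp/test.py", "helloworld",
 *         "word:chararray", "1", "local", "false");
 * </pre>
 */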
public class StreamingUDF extends EvalFunc<Object> {
private static final Log log = LogFactory.getLog(StreamingUDF.class);
private static final String PYTHON_CONTROLLER_JAR_PATH = "/python/streaming/controller.py"; //Relative to root of pig jar.
private static final String PYTHON_PIG_UTIL_PATH = "/python/streaming/pig_util.py"; //Relative to root of pig jar.
//Indexes for arguments being passed to external process
private static final int UDF_LANGUAGE = 0;
private static final int PATH_TO_CONTROLLER_FILE = 1;
private static final int UDF_FILE_NAME = 2; //Name of file where UDF function is defined
private static final int UDF_FILE_PATH = 3; //Path to directory containing file where UDF function is defined
private static final int UDF_NAME = 4; //Name of UDF function being called.
private static final int PATH_TO_FILE_CACHE = 5; //Directory where required files (like pig_util) are cached on cluster nodes.
    private static final int STD_OUT_OUTPUT_PATH = 6; //File capturing anything user code writes to standard output.
    private static final int STD_ERR_OUTPUT_PATH = 7; //File capturing anything user code writes to standard error.
    private static final int CONTROLLER_LOG_FILE_PATH = 8; //Log tracking the controller script's own progress, not user code.
    private static final int IS_ILLUSTRATE = 9; //The controller captures output differently when running under illustrate.
private String language;
private String filePath;
private String funcName;
private Schema schema;
private ExecType execType;
private String isIllustrate;
private boolean initialized = false;
private ScriptingOutputCapturer soc;
private Process process; // Handle to the external process
private ProcessErrorThread stderrThread; // thread to get process stderr
private ProcessInputThread stdinThread; // thread to send input to process
private ProcessOutputThread stdoutThread; //thread to read output from process
private InputHandler inputHandler;
private OutputHandler outputHandler;
    private BlockingQueue<Tuple> inputQueue;
    private BlockingQueue<Object> outputQueue;
private DataOutputStream stdin; // stdin of the process
private InputStream stdout; // stdout of the process
private InputStream stderr; // stderr of the process
private static final Object ERROR_OUTPUT = new Object();
    private static final Object NULL_OBJECT = new Object(); //BlockingQueue can't hold null; use a placeholder object instead.
private volatile StreamingUDFException outerrThreadsError;
public static final String TURN_ON_OUTPUT_CAPTURING = "TURN_ON_OUTPUT_CAPTURING";
public StreamingUDF(String language,
String filePath, String funcName,
String outputSchemaString, String schemaLineNumber,
String execType, String isIllustrate)
throws StreamingUDFOutputSchemaException, ExecException {
this.language = language;
this.filePath = filePath;
this.funcName = funcName;
try {
this.schema = Utils.getSchemaFromString(outputSchemaString);
//ExecTypeProvider.fromString doesn't seem to load the ExecTypes in
//mapreduce mode so we'll try to figure out the exec type ourselves.
if (execType.equals("local")) {
this.execType = ExecType.LOCAL;
} else if (execType.equals("mapreduce")) {
this.execType = ExecType.MAPREDUCE;
} else {
//Not sure what exec type - try to get it from the string.
this.execType = ExecType.fromString(execType);
}
} catch (ParserException pe) {
throw new StreamingUDFOutputSchemaException(pe.getMessage(), Integer.valueOf(schemaLineNumber));
} catch (IOException ioe) {
String errorMessage = "Invalid exectype passed to StreamingUDF. Should be local or mapreduce";
log.error(errorMessage, ioe);
throw new ExecException(errorMessage, ioe);
}
this.isIllustrate = isIllustrate;
}
@Override
public Object exec(Tuple input) throws IOException {
if (!initialized) {
initialize();
initialized = true;
}
return getOutput(input);
}
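    /*
     * Lifecycle: the first exec() call lazily spawns the external controller
     * process plus three daemon threads, after which each call is a single
     * blocking round trip:
     *
     *   exec(tuple) -> inputQueue -> stdinThread -> process stdin
     *   process stdout -> stdoutThread -> outputQueue -> exec() returns
     *
     * stderrThread drains the process's standard error and surfaces it to
     * the main thread as a StreamingUDFException via outerrThreadsError.
     */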
private void initialize() throws ExecException, IOException {
        inputQueue = new ArrayBlockingQueue<Tuple>(1);
        outputQueue = new ArrayBlockingQueue<Object>(2);
soc = new ScriptingOutputCapturer(execType);
startUdfController();
createInputHandlers();
setStreams();
startThreads();
}
private StreamingCommand startUdfController() throws IOException {
StreamingCommand sc = new StreamingCommand(null, constructCommand());
ProcessBuilder processBuilder = StreamingUtil.createProcess(sc);
process = processBuilder.start();
        Runtime.getRuntime().addShutdownHook(new Thread(new ProcessKiller()));
return sc;
}
private String[] constructCommand() throws IOException {
String[] command = new String[10];
Configuration conf = UDFContext.getUDFContext().getJobConf();
String jarPath = conf.get("mapred.jar");
String jobDir;
if (jarPath != null) {
jobDir = new File(jarPath).getParent();
} else {
jobDir = "";
}
String standardOutputRootWriteLocation = soc.getStandardOutputRootWriteLocation();
String controllerLogFileName, outFileName, errOutFileName;
if (execType == ExecType.LOCAL) {
controllerLogFileName = standardOutputRootWriteLocation + funcName + "_python.log";
outFileName = standardOutputRootWriteLocation + "cpython_" + funcName + "_" + ScriptingOutputCapturer.getRunId() + ".out";
errOutFileName = standardOutputRootWriteLocation + "cpython_" + funcName + "_" + ScriptingOutputCapturer.getRunId() + ".err";
} else {
controllerLogFileName = standardOutputRootWriteLocation + funcName + "_python.log";
outFileName = standardOutputRootWriteLocation + funcName + ".out";
errOutFileName = standardOutputRootWriteLocation + funcName + ".err";
}
soc.registerOutputLocation(funcName, outFileName);
command[UDF_LANGUAGE] = language;
command[PATH_TO_CONTROLLER_FILE] = getControllerPath(jobDir);
int lastSeparator = filePath.lastIndexOf(File.separator) + 1;
command[UDF_FILE_NAME] = filePath.substring(lastSeparator);
command[UDF_FILE_PATH] = lastSeparator <= 0 ?
"." :
filePath.substring(0, lastSeparator - 1);
command[UDF_NAME] = funcName;
command[PATH_TO_FILE_CACHE] = "\"" + jobDir + filePath.substring(0, lastSeparator) + "\"";
command[STD_OUT_OUTPUT_PATH] = outFileName;
command[STD_ERR_OUTPUT_PATH] = errOutFileName;
command[CONTROLLER_LOG_FILE_PATH] = controllerLogFileName;
command[IS_ILLUSTRATE] = isIllustrate;
return command;
}
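    /*
     * For illustration, a hypothetical local-mode run of a UDF helloworld()
     * defined in /tmp/test.py (all paths below are made up) would yield an
     * argument vector along these lines:
     *
     *   { "python", "/tmp/controller1234.py", "test.py", "/tmp", "helloworld",
     *     "\"/tmp/\"", "/tmp/out/cpython_helloworld_1.out",
     *     "/tmp/out/cpython_helloworld_1.err",
     *     "/tmp/out/helloworld_python.log", "false" }
     */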
private void createInputHandlers() throws ExecException, FrontendException {
PigStreamingUDF serializer = new PigStreamingUDF();
this.inputHandler = new StreamingUDFInputHandler(serializer);
PigStreamingUDF deserializer = new PigStreamingUDF(schema.getField(0));
this.outputHandler = new StreamingUDFOutputHandler(deserializer);
}
private void setStreams() throws IOException {
stdout = new DataInputStream(new BufferedInputStream(process
.getInputStream()));
outputHandler.bindTo("", new BufferedPositionedInputStream(stdout),
0, Long.MAX_VALUE);
stdin = new DataOutputStream(new BufferedOutputStream(process
.getOutputStream()));
inputHandler.bindTo(stdin);
stderr = new DataInputStream(new BufferedInputStream(process
.getErrorStream()));
}
private void startThreads() {
stdinThread = new ProcessInputThread();
stdinThread.start();
stdoutThread = new ProcessOutputThread();
stdoutThread.start();
stderrThread = new ProcessErrorThread();
stderrThread.start();
}
    /**
     * Find the path to the controller file for the streaming language.
     *
     * First check the path next to the job jar; if the file is not found
     * there (as when running hadoop in standalone mode), write the necessary
     * files to temporary files and return that path instead.
     *
     * @param jarPath directory containing the job jar
     * @return path to the controller file
     * @throws IOException
     */
private String getControllerPath(String jarPath) throws IOException {
        if (language.equalsIgnoreCase("python")) {
String controllerPath = jarPath + PYTHON_CONTROLLER_JAR_PATH;
File controller = new File(controllerPath);
if (!controller.exists()) {
File controllerFile = File.createTempFile("controller", ".py");
InputStream pythonControllerStream = Launcher.class.getResourceAsStream(PYTHON_CONTROLLER_JAR_PATH);
try {
FileUtils.copyInputStreamToFile(pythonControllerStream, controllerFile);
} finally {
pythonControllerStream.close();
}
controllerFile.deleteOnExit();
            File pigUtilFile = new File(controllerFile.getParent(), "pig_util.py");
pigUtilFile.deleteOnExit();
InputStream pythonUtilStream = Launcher.class.getResourceAsStream(PYTHON_PIG_UTIL_PATH);
try {
FileUtils.copyInputStreamToFile(pythonUtilStream, pigUtilFile);
} finally {
pythonUtilStream.close();
}
controllerPath = controllerFile.getAbsolutePath();
}
return controllerPath;
} else {
throw new ExecException("Invalid language: " + language);
}
}
    /**
     * Returns a list of file names (relative to the root of the pig jar) of
     * files that need to be included in the jar shipped to the cluster.
     *
     * Will need to be smarter as more languages are added and the controller
     * files grow large.
     *
     * @return resource paths, relative to the root of the pig jar
     */
    public static List<String> getResourcesForJar() {
        List<String> files = new ArrayList<String>();
files.add(PYTHON_CONTROLLER_JAR_PATH);
files.add(PYTHON_PIG_UTIL_PATH);
return files;
}
private Object getOutput(Tuple input) throws ExecException {
if (outputQueue == null) {
throw new ExecException("Process has already been shut down. No way to retrieve output for input: " + input);
}
if (ScriptingOutputCapturer.isClassCapturingOutput() &&
!soc.isInstanceCapturingOutput()) {
Tuple t = TupleFactory.getInstance().newTuple(TURN_ON_OUTPUT_CAPTURING);
try {
inputQueue.put(t);
} catch (InterruptedException e) {
throw new ExecException("Failed adding capture input flag to inputQueue");
}
soc.setInstanceCapturingOutput(true);
}
try {
if (this.getInputSchema() == null || this.getInputSchema().size() == 0) {
//When nothing is passed into the UDF the tuple
//being sent is the full tuple for the relation.
//We want it to be nothing (since that's what the user wrote).
input = TupleFactory.getInstance().newTuple(0);
}
inputQueue.put(input);
} catch (Exception e) {
throw new ExecException("Failed adding input to inputQueue", e);
}
Object o = null;
try {
if (outputQueue != null) {
o = outputQueue.take();
if (o == NULL_OBJECT) {
o = null;
}
}
} catch (Exception e) {
throw new ExecException("Problem getting output", e);
}
if (o == ERROR_OUTPUT) {
outputQueue = null;
if (outerrThreadsError == null) {
outerrThreadsError = new StreamingUDFException(this.language, "Problem with streaming udf. Can't recreate exception.");
}
throw outerrThreadsError;
}
return o;
}
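    /*
     * Queue protocol: each exec() call enqueues exactly one tuple and then
     * blocks on outputQueue.take(), so calls are strictly one-in/one-out.
     * NULL_OBJECT stands in for a null result (BlockingQueue rejects nulls),
     * and ERROR_OUTPUT is the sentinel the worker threads use to wake this
     * thread so it can rethrow outerrThreadsError.
     */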
@Override
public Schema outputSchema(Schema input) {
return this.schema;
}
    /**
     * The thread which consumes input and feeds it to the Process
     */
class ProcessInputThread extends Thread {
ProcessInputThread() {
setDaemon(true);
}
public void run() {
try {
log.debug("Starting PIT");
while (true) {
Tuple inputTuple = inputQueue.take();
inputHandler.putNext(inputTuple);
try {
stdin.flush();
} catch(Exception e) {
return;
}
}
} catch (Exception e) {
log.error(e);
}
}
}
private static final int WAIT_FOR_ERROR_LENGTH = 500;
private static final int MAX_WAIT_FOR_ERROR_ATTEMPTS = 5;
/**
* The thread which consumes output from process
*/
class ProcessOutputThread extends Thread {
ProcessOutputThread() {
setDaemon(true);
}
public void run() {
Object o = null;
try{
log.debug("Starting POT");
//StreamUDFToPig wraps object in single element tuple
o = outputHandler.getNext().get(0);
while (o != OutputHandler.END_OF_OUTPUT) {
                    if (o != null) {
                        outputQueue.put(o);
                    } else {
                        outputQueue.put(NULL_OBJECT);
                    }
o = outputHandler.getNext().get(0);
}
} catch(Exception e) {
if (outputQueue != null) {
try {
//Give error thread a chance to check the standard error output
//for an exception message.
int attempt = 0;
while (stderrThread.isAlive() && attempt < MAX_WAIT_FOR_ERROR_ATTEMPTS) {
Thread.sleep(WAIT_FOR_ERROR_LENGTH);
attempt++;
}
//Only write this if no other error. Don't want to overwrite
//an error from the error thread.
if (outerrThreadsError == null) {
outerrThreadsError = new StreamingUDFException(language, "Error deserializing output. Please check that the declared outputSchema for function " +
funcName + " matches the data type being returned.", e);
}
outputQueue.put(ERROR_OUTPUT); //Need to wake main thread.
} catch(InterruptedException ie) {
log.error(ie);
}
}
}
}
}
class ProcessErrorThread extends Thread {
public ProcessErrorThread() {
setDaemon(true);
}
public void run() {
try {
log.debug("Starting PET");
Integer lineNumber = null;
StringBuffer error = new StringBuffer();
String errInput;
BufferedReader reader = new BufferedReader(
new InputStreamReader(stderr, Charsets.UTF_8));
while ((errInput = reader.readLine()) != null) {
                    //The first line of the error stream is usually the line number of the error.
                    //If it's not a number, treat it as the first line of the error message.
if (lineNumber == null) {
try {
lineNumber = Integer.valueOf(errInput);
} catch (NumberFormatException nfe) {
error.append(errInput + "\n");
}
} else {
error.append(errInput + "\n");
}
}
outerrThreadsError = new StreamingUDFException(language, error.toString(), lineNumber);
if (outputQueue != null) {
outputQueue.put(ERROR_OUTPUT); //Need to wake main thread.
}
if (stderr != null) {
stderr.close();
stderr = null;
}
} catch (IOException e) {
log.debug("Process Ended");
} catch (Exception e) {
log.error("standard error problem", e);
}
}
}
public class ProcessKiller implements Runnable {
public void run() {
process.destroy();
}
}
}