org.apache.hadoop.hive.ql.exec.ScriptOperator (hive-exec)
Hive is a data warehouse infrastructure built on top of Hadoop; see http://wiki.apache.org/hadoop/Hive
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.exec;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Map;
import java.util.Timer;
import java.util.TimerTask;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.ErrorMsg;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.ScriptDesc;
import org.apache.hadoop.hive.ql.plan.api.OperatorType;
import org.apache.hadoop.hive.serde2.Deserializer;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.Serializer;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.Shell;
import org.apache.hadoop.util.StringUtils;
/**
* ScriptOperator runs a user-supplied script as a child process, feeds each
* input row to the script's stdin, reads transformed rows back from its stdout,
* and forwards them to the child operators.
*
*/
public class ScriptOperator extends Operator<ScriptDesc> implements
Serializable {
private static final long serialVersionUID = 1L;
/**
* Counter.
*
*/
public static enum Counter {
DESERIALIZE_ERRORS, SERIALIZE_ERRORS
}
private final transient LongWritable deserialize_error_count = new LongWritable();
private final transient LongWritable serialize_error_count = new LongWritable();
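// Handle to the child script process and the daemon threads that drain its
// stdout and stderr while rows are written to its stdin.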
transient Thread outThread = null;
transient Thread errThread = null;
transient Process scriptPid = null;
transient Configuration hconf;
// Input to the script
transient Serializer scriptInputSerializer;
// Output from the script
transient Deserializer scriptOutputDeserializer;
transient volatile Throwable scriptError = null;
transient RecordWriter scriptOutWriter = null;
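// Exception messages used to recognize that the script closed its end of the
// pipe; Windows reports different text than the POSIX "Broken pipe" message.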
static final String IO_EXCEPTION_BROKEN_PIPE_STRING = "Broken pipe";
static final String IO_EXCEPTION_STREAM_CLOSED = "Stream closed";
static final String IO_EXCEPTION_PIPE_ENDED_WIN = "The pipe has been ended";
static final String IO_EXCEPTION_PIPE_CLOSED_WIN = "The pipe is being closed";
/**
* sends periodic reports back to the tracker.
*/
transient AutoProgressor autoProgressor;
// The child process is started lazily, on the first row, since starting it
// when it is not needed may conflict with some of the user's assumptions.
transient boolean firstRow;
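/**
* Maps an arbitrary configuration key to a legal environment variable name by
* replacing every character outside [0-9A-Za-z] with an underscore.
*/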
String safeEnvVarName(String name) {
StringBuilder safe = new StringBuilder();
int len = name.length();
for (int i = 0; i < len; i++) {
char c = name.charAt(i);
char s;
if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z')
|| (c >= 'a' && c <= 'z')) {
s = c;
} else {
s = '_';
}
safe.append(s);
}
return safe.toString();
}
/**
* Most UNIX implementations impose some limit on the total size of environment
* variables and on the size of individual strings. To stay within this limit we
* sometimes need to truncate strings.
* @param value environment variable value to check
* @param name name of the variable (used only for logging purposes)
* @param truncate whether to truncate the value
* @return the original value, or a truncated one if its length exceeds 20KB and
* the truncate flag is set
* @see the Linux man page for more details
*/
String safeEnvVarValue(String value, String name, boolean truncate) {
final int lenLimit = 20*1024;
if (truncate && value.length() > lenLimit) {
value = value.substring(0, lenLimit);
LOG.warn("Length of environment variable " + name + " was truncated to " + lenLimit
+ " bytes to fit system limits.");
}
return value;
}
/**
* addJobConfToEnvironment is mostly shamelessly copied from Hadoop Streaming,
* with an additional check on environment variable length.
*/
void addJobConfToEnvironment(Configuration conf, Map<String, String> env) {
Iterator<Map.Entry<String, String>> it = conf.iterator();
while (it.hasNext()) {
Map.Entry<String, String> en = it.next();
String name = en.getKey();
// String value = (String)en.getValue(); // does not apply variable
// expansion
String value = conf.get(name); // does variable expansion
name = safeEnvVarName(name);
boolean truncate = conf.getBoolean(HiveConf.ConfVars.HIVESCRIPTTRUNCATEENV.toString(), false);
value = safeEnvVarValue(value, name, truncate);
env.put(name, value);
}
}
/**
* Maps a relative pathname to an absolute pathname using the PATH environment.
*/
public class PathFinder {
String pathenv; // a string of pathnames
String pathSep; // the path separator
String fileSep; // the file separator in a directory
/**
* Construct a PathFinder object using the path from the specified system
* environment variable.
*/
public PathFinder(String envpath) {
pathenv = System.getenv(envpath);
pathSep = System.getProperty("path.separator");
fileSep = System.getProperty("file.separator");
}
/**
* Prepends the specified component to the path list.
*/
public void prependPathComponent(String str) {
pathenv = str + pathSep + pathenv;
}
/**
* Returns the full path name of this file if it is listed in the path.
*/
public File getAbsolutePath(String filename) {
if (pathenv == null || pathSep == null || fileSep == null) {
return null;
}
int val = -1;
String classvalue = pathenv + pathSep;
while (((val = classvalue.indexOf(pathSep)) >= 0)
&& classvalue.length() > 0) {
//
// Extract each entry from the pathenv
//
String entry = classvalue.substring(0, val).trim();
File f = new File(entry);
try {
if (f.isDirectory()) {
//
// this entry in the pathenv is a directory.
// see if the required file is in this directory
//
f = new File(entry + fileSep + filename);
}
//
// see if the filename matches and we can read it
//
if (f.isFile() && f.canRead()) {
return f;
}
} catch (Exception exp) {
// ignore this entry and move on to the next path component
}
classvalue = classvalue.substring(val + 1).trim();
}
return null;
}
}
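/**
* Sets up the serializer that feeds rows to the script and the deserializer
* that reads rows back from it, then initializes the child operators. The
* script process itself is started lazily in processOp, on the first row.
*/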
@Override
protected void initializeOp(Configuration hconf) throws HiveException {
firstRow = true;
statsMap.put(Counter.DESERIALIZE_ERRORS, deserialize_error_count);
statsMap.put(Counter.SERIALIZE_ERRORS, serialize_error_count);
try {
this.hconf = hconf;
scriptOutputDeserializer = conf.getScriptOutputInfo()
.getDeserializerClass().newInstance();
scriptOutputDeserializer.initialize(hconf, conf.getScriptOutputInfo()
.getProperties());
scriptInputSerializer = (Serializer) conf.getScriptInputInfo()
.getDeserializerClass().newInstance();
scriptInputSerializer.initialize(hconf, conf.getScriptInputInfo()
.getProperties());
outputObjInspector = scriptOutputDeserializer.getObjectInspector();
// initialize all children before starting the script
initializeChildren(hconf);
} catch (Exception e) {
throw new HiveException(ErrorMsg.SCRIPT_INIT_ERROR.getErrorCodedMsg(), e);
}
}
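/**
* Checks whether an IOException indicates that the script closed its input,
* by matching the exception message against the platform-specific broken-pipe
* strings defined above.
*/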
boolean isBrokenPipeException(IOException e) {
if (Shell.WINDOWS) {
String errMsg = e.getMessage();
return errMsg.equalsIgnoreCase(IO_EXCEPTION_PIPE_CLOSED_WIN) ||
errMsg.equalsIgnoreCase(IO_EXCEPTION_PIPE_ENDED_WIN);
}
return (e.getMessage().equalsIgnoreCase(IO_EXCEPTION_BROKEN_PIPE_STRING) ||
e.getMessage().equalsIgnoreCase(IO_EXCEPTION_STREAM_CLOSED));
}
boolean allowPartialConsumption() {
return HiveConf.getBoolVar(hconf, HiveConf.ConfVars.ALLOWPARTIALCONSUMP);
}
void displayBrokenPipeInfo() {
LOG.info("The script did not consume all input data. This is considered as an error.");
LOG.info("set " + HiveConf.ConfVars.ALLOWPARTIALCONSUMP.toString()
+ "=true; to ignore it.");
return;
}
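/**
* Serializes each input row and writes it to the script's stdin. On the first
* row, it launches the script process (wrapped if a script wrapper is
* configured, with the job configuration exported into its environment) and
* starts the stdout/stderr reader threads.
*/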
@Override
public void processOp(Object row, int tag) throws HiveException {
// initialize the user's process only when we receive the first row
if (firstRow) {
firstRow = false;
try {
String[] cmdArgs = splitArgs(conf.getScriptCmd());
String prog = cmdArgs[0];
File currentDir = new File(".").getAbsoluteFile();
if (!new File(prog).isAbsolute()) {
PathFinder finder = new PathFinder("PATH");
finder.prependPathComponent(currentDir.toString());
File f = finder.getAbsolutePath(prog);
if (f != null) {
cmdArgs[0] = f.getAbsolutePath();
}
f = null;
}
String[] wrappedCmdArgs = addWrapper(cmdArgs);
LOG.info("Executing " + Arrays.asList(wrappedCmdArgs));
LOG.info("tablename="
+ hconf.get(HiveConf.ConfVars.HIVETABLENAME.varname));
LOG.info("partname="
+ hconf.get(HiveConf.ConfVars.HIVEPARTITIONNAME.varname));
LOG.info("alias=" + alias);
ProcessBuilder pb = new ProcessBuilder(wrappedCmdArgs);
Map<String, String> env = pb.environment();
addJobConfToEnvironment(hconf, env);
env.put(safeEnvVarName(HiveConf.ConfVars.HIVEALIAS.varname), String
.valueOf(alias));
// Create an environment variable that uniquely identifies this script
// operator
String idEnvVarName = HiveConf.getVar(hconf,
HiveConf.ConfVars.HIVESCRIPTIDENVVAR);
String idEnvVarVal = getOperatorId();
env.put(safeEnvVarName(idEnvVarName), idEnvVarVal);
scriptPid = pb.start(); // Runtime.getRuntime().exec(wrappedCmdArgs);
DataOutputStream scriptOut = new DataOutputStream(
new BufferedOutputStream(scriptPid.getOutputStream()));
DataInputStream scriptIn = new DataInputStream(new BufferedInputStream(
scriptPid.getInputStream()));
DataInputStream scriptErr = new DataInputStream(
new BufferedInputStream(scriptPid.getErrorStream()));
scriptOutWriter = conf.getInRecordWriterClass().newInstance();
scriptOutWriter.initialize(scriptOut, hconf);
RecordReader scriptOutputReader = conf.getOutRecordReaderClass()
.newInstance();
scriptOutputReader.initialize(scriptIn, hconf, conf
.getScriptOutputInfo().getProperties());
outThread = new StreamThread(scriptOutputReader,
new OutputStreamProcessor(scriptOutputDeserializer
.getObjectInspector()), "OutputProcessor");
RecordReader scriptErrReader = conf.getErrRecordReaderClass()
.newInstance();
scriptErrReader.initialize(scriptErr, hconf, conf.getScriptErrInfo()
.getProperties());
errThread = new StreamThread(scriptErrReader, new ErrorStreamProcessor(
HiveConf.getIntVar(hconf, HiveConf.ConfVars.SCRIPTERRORLIMIT)),
"ErrorProcessor");
if (HiveConf.getBoolVar(hconf, HiveConf.ConfVars.HIVESCRIPTAUTOPROGRESS)) {
autoProgressor = new AutoProgressor(this.getClass().getName(),
reporter, Utilities.getDefaultNotificationInterval(hconf),
HiveConf.getIntVar(hconf, HiveConf.ConfVars.HIVES_AUTO_PROGRESS_TIMEOUT) * 1000);
autoProgressor.go();
}
outThread.start();
errThread.start();
} catch (Exception e) {
throw new HiveException(ErrorMsg.SCRIPT_INIT_ERROR.getErrorCodedMsg(), e);
}
}
if (scriptError != null) {
throw new HiveException(ErrorMsg.SCRIPT_GENERIC_ERROR.getErrorCodedMsg(), scriptError);
}
try {
Writable res = scriptInputSerializer.serialize(row,
inputObjInspectors[tag]);
scriptOutWriter.write(res);
} catch (SerDeException e) {
LOG.error("Error in serializing the row: " + e.getMessage());
scriptError = e;
serialize_error_count.set(serialize_error_count.get() + 1);
throw new HiveException(e);
} catch (IOException e) {
if (isBrokenPipeException(e) && allowPartialConsumption()) {
// Give the outThread a chance to finish before marking the operator as done
try {
scriptPid.waitFor();
} catch (InterruptedException interruptedException) {
// ignore; continue with the best-effort join below
}
// best effort attempt to write all output from the script before marking the operator
// as done
try {
if (outThread != null) {
outThread.join(0);
}
} catch (Exception e2) {
LOG.warn("Exception in closing outThread: "
+ StringUtils.stringifyException(e2));
}
setDone(true);
LOG.warn("Got broken pipe during write: ignoring exception and setting operator to done");
} else {
LOG.error("Error in writing to script: " + e.getMessage());
if (isBrokenPipeException(e)) {
displayBrokenPipeInfo();
}
scriptError = e;
throw new HiveException(ErrorMsg.SCRIPT_IO_ERROR.getErrorCodedMsg(), e);
}
}
}
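/**
* Shuts the script down: closes its stdin so it sees end-of-input, waits for it
* to exit, joins the stdout/stderr reader threads, and finally destroys the
* process. A non-zero exit code turns the close into an abort.
*/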
@Override
public void close(boolean abort) throws HiveException {
boolean new_abort = abort;
if (!abort) {
if (scriptError != null) {
throw new HiveException(ErrorMsg.SCRIPT_GENERIC_ERROR.getErrorCodedMsg(), scriptError);
}
// everything ok. try normal shutdown
try {
try {
if (scriptOutWriter != null) {
scriptOutWriter.close();
}
} catch (IOException e) {
if (isBrokenPipeException(e) && allowPartialConsumption()) {
LOG.warn("Got broken pipe: ignoring exception");
} else {
if (isBrokenPipeException(e)) {
displayBrokenPipeInfo();
}
throw e;
}
}
int exitVal = 0;
if (scriptPid != null) {
exitVal = scriptPid.waitFor();
}
if (exitVal != 0) {
LOG.error("Script failed with code " + exitVal);
new_abort = true;
}
} catch (IOException e) {
LOG.error("Got ioexception: " + e.getMessage());
e.printStackTrace();
new_abort = true;
} catch (InterruptedException e) {
// ignore
}
} else {
// Error already occurred, but we still want to get the
// error code of the child process if possible.
try {
// Interrupt the current thread after 1 second
final Thread mythread = Thread.currentThread();
Timer timer = new Timer(true);
timer.schedule(new TimerTask() {
@Override
public void run() {
mythread.interrupt();
}
}, 1000);
// Wait for the child process to finish
int exitVal = 0;
if (scriptPid != null) {
exitVal = scriptPid.waitFor();
}
// Cancel the timer
timer.cancel();
// Output the exit code
LOG.error("Script exited with code " + exitVal);
} catch (InterruptedException e) {
// Ignore
LOG.error("Script has not exited yet. It will be killed.");
}
}
// try these best effort
try {
if (outThread != null) {
outThread.join(0);
}
} catch (Exception e) {
LOG.warn("Exception in closing outThread: "
+ StringUtils.stringifyException(e));
}
try {
if (errThread != null) {
errThread.join(0);
}
} catch (Exception e) {
LOG.warn("Exception in closing errThread: "
+ StringUtils.stringifyException(e));
}
try {
if (scriptPid != null) {
scriptPid.destroy();
}
} catch (Exception e) {
LOG.warn("Exception in destroying scriptPid: "
+ StringUtils.stringifyException(e));
}
super.close(new_abort);
if (new_abort && !abort) {
throw new HiveException(ErrorMsg.SCRIPT_CLOSING_ERROR.getErrorCodedMsg());
}
}
interface StreamProcessor {
void processLine(Writable line) throws HiveException;
void close() throws HiveException;
}
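/**
* Deserializes each record read from the script's stdout and forwards the
* resulting row to the child operators; records that fail to deserialize are
* counted and skipped.
*/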
class OutputStreamProcessor implements StreamProcessor {
Object row;
ObjectInspector rowInspector;
public OutputStreamProcessor(ObjectInspector rowInspector) {
this.rowInspector = rowInspector;
}
public void processLine(Writable line) throws HiveException {
try {
row = scriptOutputDeserializer.deserialize(line);
} catch (SerDeException e) {
deserialize_error_count.set(deserialize_error_count.get() + 1);
return;
}
forward(row, rowInspector);
}
public void close() {
}
}
/**
* The processor for stderr stream.
*
* TODO: In the future when we move to hadoop 0.18 and above, we should borrow
* the logic from HadoopStreaming: PipeMapRed.java MRErrorThread to support
* counters and status updates.
*/
class ErrorStreamProcessor implements StreamProcessor {
private long bytesCopied = 0;
private final long maxBytes;
private long lastReportTime;
public ErrorStreamProcessor(int maxBytes) {
this.maxBytes = maxBytes;
lastReportTime = 0;
}
public void processLine(Writable line) throws HiveException {
String stringLine = line.toString();
int len = 0;
if (line instanceof Text) {
len = ((Text) line).getLength();
} else if (line instanceof BytesWritable) {
len = ((BytesWritable) line).getSize();
}
// Report progress for each stderr line, but no more frequently than once
// per minute.
long now = System.currentTimeMillis();
// reporter is a member variable of the Operator class.
if (now - lastReportTime > 60 * 1000 && reporter != null) {
LOG.info("ErrorStreamProcessor calling reporter.progress()");
lastReportTime = now;
reporter.progress();
}
if ((maxBytes < 0) || (bytesCopied < maxBytes)) {
System.err.println(stringLine);
}
if (bytesCopied < maxBytes && bytesCopied + len >= maxBytes) {
System.err.println("Operator " + id + " " + getName()
+ ": exceeding stderr limit of " + maxBytes
+ " bytes, will truncate stderr messages.");
}
bytesCopied += len;
}
public void close() {
}
}
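/**
* Daemon thread that reads records from one of the script's output streams and
* hands each record to a StreamProcessor until end-of-stream; any throwable is
* recorded in scriptError so that processOp/close can fail the query.
*/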
class StreamThread extends Thread {
RecordReader in;
StreamProcessor proc;
String name;
StreamThread(RecordReader in, StreamProcessor proc, String name) {
this.in = in;
this.proc = proc;
this.name = name;
setDaemon(true);
}
@Override
public void run() {
try {
Writable row = in.createRow();
while (true) {
long bytes = in.next(row);
if (bytes <= 0) {
break;
}
proc.processLine(row);
}
LOG.info("StreamThread " + name + " done");
} catch (Throwable th) {
scriptError = th;
LOG.warn("Exception in StreamThread.run(): " + th.getMessage() +
"\nCause: " + th.getCause());
LOG.warn(StringUtils.stringifyException(th));
} finally {
try {
if (in != null) {
in.close();
}
} catch (Exception e) {
LOG.warn(name + ": error in closing ..");
LOG.warn(StringUtils.stringifyException(e));
}
try {
if (null != proc) {
proc.close();
}
} catch (Exception e) {
LOG.warn(": error in closing .." + StringUtils.stringifyException(e));
}
}
}
}
/**
* Wraps the script command in a wrapper command that admins can configure via
* HiveConf.ConfVars.SCRIPTWRAPPER, if one is set.
*/
protected String[] addWrapper(String[] inArgs) {
String wrapper = HiveConf.getVar(hconf, HiveConf.ConfVars.SCRIPTWRAPPER);
if (wrapper == null) {
return inArgs;
}
String[] wrapComponents = splitArgs(wrapper);
int totallength = wrapComponents.length + inArgs.length;
String[] finalArgv = new String[totallength];
for (int i = 0; i < wrapComponents.length; i++) {
finalArgv[i] = wrapComponents[i];
}
for (int i = 0; i < inArgs.length; i++) {
finalArgv[wrapComponents.length + i] = inArgs[i];
}
return (finalArgv);
}
// Code below shamelessly borrowed from Hadoop Streaming
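/**
* Splits a command line into arguments with a small state machine
* (OUTSIDE / SINGLEQ / DOUBLEQ): spaces end a token only outside quotes, and
* single or double quotes group characters into a single token.
*/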
public static String[] splitArgs(String args) {
final int OUTSIDE = 1;
final int SINGLEQ = 2;
final int DOUBLEQ = 3;
ArrayList<String> argList = new ArrayList<String>();
char[] ch = args.toCharArray();
int clen = ch.length;
int state = OUTSIDE;
int argstart = 0;
for (int c = 0; c <= clen; c++) {
boolean last = (c == clen);
int lastState = state;
boolean endToken = false;
if (!last) {
if (ch[c] == '\'') {
if (state == OUTSIDE) {
state = SINGLEQ;
} else if (state == SINGLEQ) {
state = OUTSIDE;
}
endToken = (state != lastState);
} else if (ch[c] == '"') {
if (state == OUTSIDE) {
state = DOUBLEQ;
} else if (state == DOUBLEQ) {
state = OUTSIDE;
}
endToken = (state != lastState);
} else if (ch[c] == ' ') {
if (state == OUTSIDE) {
endToken = true;
}
}
}
if (last || endToken) {
if (c == argstart) {
// unquoted space
} else {
String a;
a = args.substring(argstart, c);
argList.add(a);
}
argstart = c + 1;
lastState = state;
}
}
return (String[]) argList.toArray(new String[0]);
}
@Override
public String getName() {
return getOperatorName();
}
static public String getOperatorName() {
return "SCR";
}
@Override
public OperatorType getType() {
return OperatorType.SCRIPT;
}
}