/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.zeppelin.spark;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.zeppelin.interpreter.ZeppelinContext;
import org.apache.zeppelin.interpreter.InterpreterContext;
import org.apache.zeppelin.interpreter.InterpreterException;
import org.apache.zeppelin.interpreter.InterpreterResult;
import org.apache.zeppelin.python.IPythonInterpreter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.PrintStream;
import java.util.Map;
import java.util.Properties;
/**
 * A PySparkInterpreter variant that uses IPython as the underlying execution engine.
 * Spark-specific concerns (SparkContext access, job groups, progress and cancellation)
 * are delegated to the SparkInterpreter in the same session.
 */
public class IPySparkInterpreter extends IPythonInterpreter {
private static final Logger LOGGER = LoggerFactory.getLogger(IPySparkInterpreter.class);
private SparkInterpreter sparkInterpreter;
private boolean opened = false;
private InterpreterContext curIntpContext;
public IPySparkInterpreter(Properties property) {
super(property);
}
@Override
public synchronized void open() throws InterpreterException {
// IPySparkInterpreter may already be opened in PySparkInterpreter when ipython is available.
if (opened) {
return;
}
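    // Obtain the sibling interpreters of this session; the `false` argument fetches
    // the PySparkInterpreter without opening it.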
this.sparkInterpreter = getInterpreterInTheSameSessionByClassName(SparkInterpreter.class);
PySparkInterpreter pySparkInterpreter =
getInterpreterInTheSameSessionByClassName(PySparkInterpreter.class, false);
setProperty("zeppelin.python", pySparkInterpreter.getPythonExec(sparkInterpreter.getSparkContext().conf()));
SparkConf conf = sparkInterpreter.getSparkContext().getConf();
    // Only set PYTHONPATH in embedded, local or yarn-client mode;
    // yarn-cluster mode sets up PYTHONPATH automatically.
if (!conf.contains(SparkStringConstants.SUBMIT_DEPLOY_MODE_PROP_NAME) ||
!conf.get(SparkStringConstants.SUBMIT_DEPLOY_MODE_PROP_NAME).equals("cluster")) {
setAdditionalPythonPath(PythonUtils.sparkPythonPath());
}
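    // PySpark ships its own py4j under $SPARK_HOME/python/lib (added to PYTHONPATH
    // above), so skip Zeppelin's built-in py4j.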
setUseBuiltinPy4j(false);
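    // zeppelin_ipyspark.py runs inside the IPython kernel at startup and wires up
    // variables such as sc, sqlContext and spark through py4j.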
setAdditionalPythonInitFile("python/zeppelin_ipyspark.py");
super.open();
opened = true;
}
@Override
  protected Map<String, String> setupKernelEnv() throws IOException {
    Map<String, String> env = super.setupKernelEnv();
    // Mirror spark.pyspark.python into the kernel environment as PYSPARK_PYTHON,
    // so the kernel uses the same Python executable as Spark.
SparkConf conf = sparkInterpreter.getSparkContext().getConf();
if (conf.contains("spark.pyspark.python")) {
env.put("PYSPARK_PYTHON", conf.get("spark.pyspark.python"));
}
return env;
}
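
  // Reuse the SparkInterpreter's ZeppelinContext so that the z object is shared
  // between the Scala and Python sides of the same session.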
@Override
public ZeppelinContext buildZeppelinContext() {
return sparkInterpreter.getZeppelinContext();
}
@Override
public InterpreterResult interpret(String st,
InterpreterContext context) throws InterpreterException {
    // Redirect Java stdout/stderr to the interpreter output, because pyspark may
    // call into Java code.
PrintStream originalStdout = System.out;
PrintStream originalStderr = System.err;
try {
System.setOut(new PrintStream(context.out));
System.setErr(new PrintStream(context.out));
Utils.printDeprecateMessage(sparkInterpreter.getSparkVersion(), context, properties);
InterpreterContext.set(context);
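      // Tag all Spark jobs launched from this paragraph with a job group so that
      // cancel() and getProgress() can target just this paragraph's jobs.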
String jobGroupId = Utils.buildJobGroupId(context);
String jobDesc = Utils.buildJobDesc(context);
String setJobGroupStmt = "sc.setJobGroup('" + jobGroupId + "', '" + jobDesc + "')";
InterpreterResult result = super.interpret(setJobGroupStmt, context);
if (result.code().equals(InterpreterResult.Code.ERROR)) {
return new InterpreterResult(InterpreterResult.Code.ERROR, "Fail to setJobGroup");
}
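      // Bind this paragraph to a fair-scheduler pool when the `pool` local property
      // is set; otherwise reset the pool to None.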
String pool = "None";
if (context.getLocalProperties().containsKey("pool")) {
pool = "'" + context.getLocalProperties().get("pool") + "'";
}
String setPoolStmt = "sc.setLocalProperty('spark.scheduler.pool', " + pool + ")";
result = super.interpret(setPoolStmt, context);
if (result.code().equals(InterpreterResult.Code.ERROR)) {
return new InterpreterResult(InterpreterResult.Code.ERROR, "Fail to setPool");
}
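      // Remember the context so setInterpreterContextInPython() can re-register it
      // on the Python-side thread (see below).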
      this.curIntpContext = context;
      String setIntpContextStmt = "intp.setInterpreterContextInPython()";
      result = super.interpret(setIntpContextStmt, context);
      if (result.code().equals(InterpreterResult.Code.ERROR)) {
        return new InterpreterResult(InterpreterResult.Code.ERROR, "Fail to setCurIntpContext");
      }
return super.interpret(st, context);
} finally {
System.setOut(originalStdout);
System.setErr(originalStderr);
}
}
  // The Python side calls InterpreterContext.get() as well, but it runs in a
  // different thread from the Java interpreter thread, so the context has to be
  // set again from the Python side via this method.
public void setInterpreterContextInPython() {
InterpreterContext.set(curIntpContext);
}
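
  // Cancel both the running IPython execution and the Spark jobs in this
  // paragraph's job group.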
@Override
public void cancel(InterpreterContext context) throws InterpreterException {
super.cancel(context);
sparkInterpreter.cancel(context);
}
@Override
public void close() throws InterpreterException {
LOGGER.info("Close IPySparkInterpreter");
super.close();
}
@Override
public int getProgress(InterpreterContext context) throws InterpreterException {
return sparkInterpreter.getProgress(context);
}
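
  // Used by the Python side to branch on the Spark major version.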
public boolean isSpark3() {
return sparkInterpreter.getSparkVersion().getMajorVersion() == 3;
}
// Used by PySpark
public boolean isAfterSpark33() {
return sparkInterpreter.getSparkVersion().newerThanEquals(SparkVersion.SPARK_3_3_0);
}
public JavaSparkContext getJavaSparkContext() {
return sparkInterpreter.getJavaSparkContext();
}
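
  // Exposed as Object to avoid compile-time dependencies on version-specific Spark
  // classes; the Python side retrieves these through the py4j gateway.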
public Object getSQLContext() {
return sparkInterpreter.getSQLContext();
}
public Object getSparkSession() {
return sparkInterpreter.getSparkSession();
}
}