/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.sysml.api;


import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map.Entry;
import java.util.Scanner;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.rdd.RDD;
import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM;
import org.apache.sysml.api.jmlc.JMLCUtils;
import org.apache.sysml.api.monitoring.SparkMonitoringUtil;
import org.apache.sysml.conf.ConfigurationManager;
import org.apache.sysml.conf.DMLConfig;
import org.apache.sysml.hops.OptimizerUtils;
import org.apache.sysml.hops.OptimizerUtils.OptimizationLevel;
import org.apache.sysml.hops.globalopt.GlobalOptimizerWrapper;
import org.apache.sysml.hops.rewrite.ProgramRewriter;
import org.apache.sysml.hops.rewrite.RewriteRemovePersistentReadWrite;
import org.apache.sysml.parser.AParserWrapper;
import org.apache.sysml.parser.DMLProgram;
import org.apache.sysml.parser.DMLTranslator;
import org.apache.sysml.parser.DataExpression;
import org.apache.sysml.parser.Expression;
import org.apache.sysml.parser.IntIdentifier;
import org.apache.sysml.parser.LanguageException;
import org.apache.sysml.parser.StringIdentifier;
import org.apache.sysml.parser.Expression.ValueType;
import org.apache.sysml.parser.ParseException;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.LocalVariableMap;
import org.apache.sysml.runtime.controlprogram.Program;
import org.apache.sysml.runtime.controlprogram.caching.CacheableData;
import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
import org.apache.sysml.runtime.controlprogram.context.ExecutionContextFactory;
import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext;
import org.apache.sysml.runtime.instructions.Instruction;
import org.apache.sysml.runtime.instructions.cp.Data;
import org.apache.sysml.runtime.instructions.cp.VariableCPInstruction;
import org.apache.sysml.runtime.instructions.spark.data.RDDObject;
import org.apache.sysml.runtime.instructions.spark.data.RDDProperties;
import org.apache.sysml.runtime.instructions.spark.functions.ConvertStringToLongTextPair;
import org.apache.sysml.runtime.instructions.spark.functions.CopyBlockPairFunction;
import org.apache.sysml.runtime.instructions.spark.functions.CopyTextInputFunction;
import org.apache.sysml.runtime.instructions.spark.functions.SparkListener;
import org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt;
import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
import org.apache.sysml.runtime.matrix.MatrixFormatMetaData;
import org.apache.sysml.runtime.matrix.data.InputInfo;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.MatrixIndexes;
import org.apache.sysml.runtime.matrix.data.OutputInfo;
import org.apache.sysml.utils.Explain;
import org.apache.sysml.utils.Statistics;
import org.apache.sysml.utils.Explain.ExplainCounts;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;

/**
 * MLContext is useful for passing RDDs as input/output to SystemML. This API avoids the need to read/write
 * from HDFS (which is another way to pass inputs to SystemML).
 * <p>
 * Typical usage for MLContext is as follows:
 * <pre>
 * scala> import org.apache.sysml.api.MLContext
 * </pre>
 * <p>
 * Create input DataFrames from CSV files and potentially perform some feature transformation:
 * <pre>
 * scala> val W = sqlContext.load("com.databricks.spark.csv", Map("path" -> "W.csv", "header" -> "false"))
 * scala> val H = sqlContext.load("com.databricks.spark.csv", Map("path" -> "H.csv", "header" -> "false"))
 * scala> val V = sqlContext.load("com.databricks.spark.csv", Map("path" -> "V.csv", "header" -> "false"))
 * </pre>
 * <p>
 * Create the MLContext:
 * <pre>
 * scala> val ml = new MLContext(sc)
 * </pre>
 * <p>
 * Register input and output DataFrames/RDDs. Supported formats:
 * <ol>
 * <li> DataFrame
 * <li> CSV/Text (as JavaRDD&lt;String&gt; or JavaPairRDD&lt;LongWritable, Text&gt;)
 * <li> Binary blocked RDD (JavaPairRDD&lt;MatrixIndexes, MatrixBlock&gt;)
 * </ol>
 * The registerInput methods are also overloaded to support metadata information such as format, rlen, clen, etc.
 * Please note that the variable names given below in quotes correspond to the variables in the DML script.
 * These variables need to have corresponding read/write statements in the DML script.
 * Currently, only matrix variables are supported through the registerInput/registerOutput interface.
 * To pass scalar variables, use named/positional arguments (described later) or wrap them in a matrix variable.
 * <pre>
 * scala> ml.registerInput("V", V)
 * scala> ml.registerInput("W", W)
 * scala> ml.registerInput("H", H)
 * scala> ml.registerOutput("H")
 * scala> ml.registerOutput("W")
 * </pre>
 * <p>
 * Call the script with default arguments:
 * <pre>
 * scala> val outputs = ml.execute("GNMF.dml")
 * </pre>
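 * <p>
 * The registered outputs can then be retrieved from the returned MLOutput, for example as binary blocked RDDs
 * along with their matrix characteristics (a sketch; these accessors are also used internally by the read method of this class):
 * <pre>
 * scala> val hOut = outputs.getBinaryBlockedRDD("H")
 * scala> val hMeta = outputs.getMatrixCharacteristics("H")
 * </pre>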

 * <p>
 * Also supported: calling the script with positional arguments (args) and named arguments (nargs):
 * <pre>
 * scala> val args = Array("V.mtx", "W.mtx", "H.mtx", "2000", "1500", "50", "1", "WOut.mtx", "HOut.mtx")
 * scala> val nargs = Map("maxIter" -> "1", "V" -> "")
 * scala> val outputs = ml.execute("GNMF.dml", args) // or ml.execute("GNMF_namedArgs.dml", nargs)
 * </pre>
 * <p>
 * To run the script again using different (or even the same) arguments, but with the same registered inputs/outputs:
 * <pre>
 * scala> val new_outputs = ml.execute("GNMF.dml", new_args)
 * </pre>
 * <p>
 * However, to register new inputs/outputs, you need to first reset the MLContext:
 * <pre>
 * scala> ml.reset()
 * scala> ml.registerInput("V", newV)
 * </pre>
 * <p>
 * Experimental API: To monitor performance (only supported for Spark 1.4.0 or higher):
 * <pre>
 * scala> val ml = new MLContext(sc, true)
 * </pre>
 * <p>
 * If performance monitoring is enabled:
 * <pre>
 * scala> print(ml.getMonitoringUtil().getExplainOutput())
 * scala> ml.getMonitoringUtil().getRuntimeInfoInHTML("runtime.html")
 * </pre>
 * <p>
 * Note: The execute(...) methods do not support parallel calls from the same or different MLContext objects,
 * because the current SystemML engine does not allow multiple invocations in the same JVM.
 * So, if you plan to create a system which potentially creates multiple MLContext objects,
 * it is recommended to guard the execute(...) calls using:
 * <pre>
 * synchronized(MLContext.class) { ml.execute(...); }
 * </pre>
 */
public class MLContext {

	// ----------------------------------------------------
	// TODO: To make MLContext multi-threaded, track getCurrentMLContext and also all singletons and
	// static variables in the SystemML codebase.
	private static MLContext _activeMLContext = null;

	// Package protected so as to maintain a clean public API for MLContext.
	// Use MLContextProxy.getActiveMLContext() if necessary.
	static MLContext getActiveMLContext() {
		return _activeMLContext;
	}
	// ----------------------------------------------------

	private SparkContext _sc = null; // Read while creating SystemML's spark context

	public SparkContext getSparkContext() {
		if(_sc == null) {
			throw new RuntimeException("No spark context set in MLContext");
		}
		return _sc;
	}

	private ArrayList<String> _inVarnames = null;
	private ArrayList<String> _outVarnames = null;
	private LocalVariableMap _variables = null; // temporary symbol table
	private Program _rtprog = null;
	private HashMap<String, String> _additionalConfigs = new HashMap<String, String>();

	// --------------------------------------------------
	// _monitorUtils is set only when MLContext(sc, true)
	private SparkMonitoringUtil _monitorUtils = null;

	/**
	 * Experimental API. Not supported in Python MLContext API.
	 * @return the SparkMonitoringUtil instance, or null if monitoring is not enabled
	 */
	public SparkMonitoringUtil getMonitoringUtil() {
		return _monitorUtils;
	}
	// --------------------------------------------------

	/**
	 * Create an associated MLContext for the given spark context.
	 * @param sc SparkContext
	 * @throws DMLRuntimeException
	 */
	public MLContext(SparkContext sc) throws DMLRuntimeException {
		initializeSpark(sc, false, false);
	}

	/**
	 * Create an associated MLContext for the given spark context.
	 * @param sc JavaSparkContext
	 * @throws DMLRuntimeException
	 */
	public MLContext(JavaSparkContext sc) throws DMLRuntimeException {
		initializeSpark(sc.sc(), false, false);
	}

	/**
	 * Experimental API: Create an associated MLContext with performance monitoring enabled
	 * (this is the "new MLContext(sc, true)" constructor referenced in the class documentation above).
	 * @param sc SparkContext
	 * @param monitorPerformance whether to enable performance monitoring
	 * @throws DMLRuntimeException
	 */
	public MLContext(SparkContext sc, boolean monitorPerformance) throws DMLRuntimeException {
		initializeSpark(sc, monitorPerformance, false);
	}

	/**
	 * Allow users to provide custom named-value configuration.
	 * @param paramName parameter name
	 * @param paramVal parameter value
	 */
	public void setConfig(String paramName, String paramVal) {
		_additionalConfigs.put(paramName, paramVal);
	}
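
	// Example (sketch): configuration properties correspond to entries in SystemML-config.xml;
	// e.g., assuming the "localtmpdir" property:
	//   scala> ml.setConfig("localtmpdir", "/tmp/systemml")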

	// ====================================================================================
	// Register input APIs
	// 1. DataFrame

	/**
	 * Register DataFrame as input. The DataFrame is assumed to be in row format and each cell can be converted into
	 * a double through Double.parseDouble(cell.toString()). This is suitable for passing dense matrices. For sparse
	 * matrices, consider passing through text format (using JavaRDD&lt;String&gt;, format="text").
	 * <p>
	 * Marks the variable in the DML script as input variable.
	 * Note that this expects a "varName = read(...)" statement in the DML script which, in a non-MLContext invocation,
	 * would have been created by reading an HDFS file.
	 * @param varName variable name in the DML script
	 * @param df input DataFrame
	 * @throws DMLRuntimeException
	 */
	public void registerInput(String varName, DataFrame df) throws DMLRuntimeException {
		registerInput(varName, df, false);
	}

	/**
	 * Register DataFrame as input.
	 * Marks the variable in the DML script as input variable (expects a corresponding "varName = read(...)" statement).
	 * @param varName variable name in the DML script
	 * @param df input DataFrame
	 * @param containsID true if the DataFrame has a column "ID" which denotes the row ID
	 * @throws DMLRuntimeException
	 */
	public void registerInput(String varName, DataFrame df, boolean containsID) throws DMLRuntimeException {
		MatrixCharacteristics mcOut = new MatrixCharacteristics();
		JavaPairRDD<MatrixIndexes, MatrixBlock> rdd = RDDConverterUtilsExt.dataFrameToBinaryBlock(new JavaSparkContext(_sc), df, mcOut, containsID);
		registerInput(varName, rdd, mcOut);
	}

	/**
	 * Experimental API. Not supported in Python MLContext API.
	 * @param varName variable name in the DML script
	 * @param df input MLMatrix
	 * @throws DMLRuntimeException
	 */
	public void registerInput(String varName, MLMatrix df) throws DMLRuntimeException {
		registerInput(varName, MLMatrix.getRDDLazily(df), df.mc);
	}

	// ------------------------------------------------------------------------------------
	// 2. CSV/Text: Usually JavaRDD<String>, but also supports JavaPairRDD<LongWritable, Text>

	/**
	 * Register CSV/Text as input: Method for supplying csv file format properties, but without dimensions or nnz.
	 * Marks the variable in the DML script as input variable (expects a corresponding "varName = read(...)" statement).
	 * @throws DMLRuntimeException
	 */
	public void registerInput(String varName, JavaRDD<String> rdd, String format, boolean hasHeader,
			String delim, boolean fill, double missingValue) throws DMLRuntimeException {
		registerInput(varName, rdd, format, hasHeader, delim, fill, missingValue, -1, -1, -1);
	}

	/**
	 * Register CSV/Text as input: Method for supplying csv file format properties, but without dimensions or nnz.
	 * Marks the variable in the DML script as input variable (expects a corresponding "varName = read(...)" statement).
	 * @throws DMLRuntimeException
	 */
	public void registerInput(String varName, RDD<String> rdd, String format, boolean hasHeader,
			String delim, boolean fill, double missingValue) throws DMLRuntimeException {
		registerInput(varName, rdd.toJavaRDD(), format, hasHeader, delim, fill, missingValue, -1, -1, -1);
	}

	/**
	 * Register CSV/Text as input: Method for supplying csv file format properties along with dimensions and nnz.
	 * Marks the variable in the DML script as input variable (expects a corresponding "varName = read(...)" statement).
	 * @throws DMLRuntimeException
	 */
	public void registerInput(String varName, RDD<String> rdd, String format, boolean hasHeader,
			String delim, boolean fill, double missingValue, long rlen, long clen, long nnz) throws DMLRuntimeException {
		registerInput(varName, rdd.toJavaRDD(), format, hasHeader, delim, fill, missingValue, rlen, clen, nnz);
	}

	/**
	 * Register CSV/Text as input: Method for supplying csv file format properties along with dimensions and nnz.
	 * Marks the variable in the DML script as input variable (expects a corresponding "varName = read(...)" statement).
	 * @throws DMLRuntimeException
	 */
	public void registerInput(String varName, JavaRDD<String> rdd, String format, boolean hasHeader,
			String delim, boolean fill, double missingValue, long rlen, long clen, long nnz) throws DMLRuntimeException {
		RDDProperties properties = new RDDProperties();
		properties.setHasHeader(hasHeader);
		properties.setFill(fill);
		properties.setDelim(delim);
		properties.setMissingValue(missingValue);
		registerInput(varName, rdd.mapToPair(new ConvertStringToLongTextPair()), format, rlen, clen, nnz, properties);
	}
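
	// Example (sketch): register a CSV JavaRDD[String] with no header, comma delimiter, fill of
	// missing values with 0, and known dimensions (csvLines is an assumed, pre-existing RDD):
	//   scala> ml.registerInput("V", csvLines, "csv", false, ",", true, 0.0, 10000, 1000, -1)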

	/**
	 * Register CSV/Text as input: Convenience method without dimensions and nnz; uses default file properties (delim, fill, ...).
	 * Marks the variable in the DML script as input variable (expects a corresponding "varName = read(...)" statement).
	 * @throws DMLRuntimeException
	 */
	public void registerInput(String varName, RDD<String> rdd, String format) throws DMLRuntimeException {
		registerInput(varName, rdd.toJavaRDD().mapToPair(new ConvertStringToLongTextPair()), format, -1, -1, -1, null);
	}

	/**
	 * Register CSV/Text as input: Convenience method without dimensions and nnz; uses default file properties (delim, fill, ...).
	 * Marks the variable in the DML script as input variable (expects a corresponding "varName = read(...)" statement).
	 * @throws DMLRuntimeException
	 */
	public void registerInput(String varName, JavaRDD<String> rdd, String format) throws DMLRuntimeException {
		registerInput(varName, rdd.mapToPair(new ConvertStringToLongTextPair()), format, -1, -1, -1, null);
	}

	/**
	 * Register CSV/Text as input: Convenience method with dimensions but no nnz; uses default file properties (delim, fill, ...).
	 * Marks the variable in the DML script as input variable (expects a corresponding "varName = read(...)" statement).
	 * @throws DMLRuntimeException
	 */
	public void registerInput(String varName, JavaRDD<String> rdd, String format, long rlen, long clen) throws DMLRuntimeException {
		registerInput(varName, rdd.mapToPair(new ConvertStringToLongTextPair()), format, rlen, clen, -1, null);
	}

	/**
	 * Register CSV/Text as input: Convenience method with dimensions but no nnz; uses default file properties (delim, fill, ...).
	 * Marks the variable in the DML script as input variable (expects a corresponding "varName = read(...)" statement).
	 * @throws DMLRuntimeException
	 */
	public void registerInput(String varName, RDD<String> rdd, String format, long rlen, long clen) throws DMLRuntimeException {
		registerInput(varName, rdd.toJavaRDD().mapToPair(new ConvertStringToLongTextPair()), format, rlen, clen, -1, null);
	}

	/**
	 * Register CSV/Text as input: with dimensions and nnz; uses default file properties (delim, fill, ...).
	 * Marks the variable in the DML script as input variable (expects a corresponding "varName = read(...)" statement).
	 * @throws DMLRuntimeException
	 */
	public void registerInput(String varName, JavaRDD<String> rdd, String format, long rlen, long clen, long nnz) throws DMLRuntimeException {
		registerInput(varName, rdd.mapToPair(new ConvertStringToLongTextPair()), format, rlen, clen, nnz, null);
	}

	/**
	 * Register CSV/Text as input: with dimensions and nnz; uses default file properties (delim, fill, ...).
	 * Marks the variable in the DML script as input variable (expects a corresponding "varName = read(...)" statement).
	 * @throws DMLRuntimeException
	 */
	public void registerInput(String varName, RDD<String> rdd, String format, long rlen, long clen, long nnz) throws DMLRuntimeException {
		registerInput(varName, rdd.toJavaRDD().mapToPair(new ConvertStringToLongTextPair()), format, rlen, clen, nnz, null);
	}

	// All CSV-related methods call this. It provides access to dimensions, nnz and file properties.
	private void registerInput(String varName, JavaPairRDD<LongWritable, Text> textOrCsv_rdd, String format,
			long rlen, long clen, long nnz, RDDProperties properties) throws DMLRuntimeException {
		if(!(DMLScript.rtplatform == RUNTIME_PLATFORM.SPARK || DMLScript.rtplatform == RUNTIME_PLATFORM.HYBRID_SPARK)) {
			throw new DMLRuntimeException("The registerInput functionality is only supported for the spark runtime. Please use MLContext(sc) instead of the default constructor.");
		}

		if(_variables == null)
			_variables = new LocalVariableMap();
		if(_inVarnames == null)
			_inVarnames = new ArrayList<String>();

		MatrixObject mo = null;
		if(format.compareTo("csv") == 0) {
			MatrixCharacteristics mc = new MatrixCharacteristics(rlen, clen, DMLTranslator.DMLBlockSize, DMLTranslator.DMLBlockSize, nnz);
			mo = new MatrixObject(ValueType.DOUBLE, null, new MatrixFormatMetaData(mc, OutputInfo.CSVOutputInfo, InputInfo.CSVInputInfo));
		}
		else if(format.compareTo("text") == 0) {
			if(rlen == -1 || clen == -1) {
				throw new DMLRuntimeException("The metadata is required in registerInput for format:" + format);
			}
			MatrixCharacteristics mc = new MatrixCharacteristics(rlen, clen, DMLTranslator.DMLBlockSize, DMLTranslator.DMLBlockSize, nnz);
			mo = new MatrixObject(ValueType.DOUBLE, null, new MatrixFormatMetaData(mc, OutputInfo.TextCellOutputInfo, InputInfo.TextCellInputInfo));
		}
		else if(format.compareTo("mm") == 0) {
			// TODO: Handle matrix market
			throw new DMLRuntimeException("Matrixmarket format is not yet implemented in registerInput: " + format);
		}
		else {
			throw new DMLRuntimeException("Incorrect format in registerInput: " + format);
		}

		JavaPairRDD<LongWritable, Text> rdd = textOrCsv_rdd.mapToPair(new CopyTextInputFunction());
		if(properties != null) {
			mo.setRddProperties(properties);
		}
		mo.setRDDHandle(new RDDObject(rdd, varName));
		_variables.put(varName, mo);
		_inVarnames.add(varName);
		checkIfRegisteringInputAllowed();
	}

	// ------------------------------------------------------------------------------------
	// 3. Binary blocked RDD: Support JavaPairRDD<MatrixIndexes, MatrixBlock>

	/**
	 * Register binary blocked RDD with given dimensions, default block sizes and no nnz.
	 * Marks the variable in the DML script as input variable (expects a corresponding "varName = read(...)" statement).
	 * @param varName variable name in the DML script
	 * @param rdd input RDD
	 * @param rlen number of rows
	 * @param clen number of columns
	 * @throws DMLRuntimeException
	 */
	public void registerInput(String varName, JavaPairRDD<MatrixIndexes, MatrixBlock> rdd, long rlen, long clen) throws DMLRuntimeException {
		registerInput(varName, rdd, rlen, clen, DMLTranslator.DMLBlockSize, DMLTranslator.DMLBlockSize);
	}

	/**
	 * Register binary blocked RDD with given dimensions, given block sizes and no nnz.
	 * Marks the variable in the DML script as input variable (expects a corresponding "varName = read(...)" statement).
	 * @param varName variable name in the DML script
	 * @param rdd input RDD
	 * @param rlen number of rows
	 * @param clen number of columns
	 * @param brlen number of rows per block
	 * @param bclen number of columns per block
	 * @throws DMLRuntimeException
	 */
	public void registerInput(String varName, JavaPairRDD<MatrixIndexes, MatrixBlock> rdd, long rlen, long clen, int brlen, int bclen) throws DMLRuntimeException {
		registerInput(varName, rdd, rlen, clen, brlen, bclen, -1);
	}

	/**
	 * Register binary blocked RDD with given dimensions, given block sizes and given nnz (preferred).
	 * Marks the variable in the DML script as input variable (expects a corresponding "varName = read(...)" statement).
	 * @param varName variable name in the DML script
	 * @param rdd input RDD
	 * @param rlen number of rows
	 * @param clen number of columns
	 * @param brlen number of rows per block
	 * @param bclen number of columns per block
	 * @param nnz number of non-zero values, or -1 if unknown
	 * @throws DMLRuntimeException
	 */
	public void registerInput(String varName, JavaPairRDD<MatrixIndexes, MatrixBlock> rdd, long rlen, long clen, int brlen, int bclen, long nnz) throws DMLRuntimeException {
		if(rlen == -1 || clen == -1) {
			throw new DMLRuntimeException("The metadata is required in registerInput for binary format");
		}

		MatrixCharacteristics mc = new MatrixCharacteristics(rlen, clen, brlen, bclen, nnz);
		registerInput(varName, rdd, mc);
	}

	// All binary blocked methods call this.
	public void registerInput(String varName, JavaPairRDD<MatrixIndexes, MatrixBlock> rdd, MatrixCharacteristics mc) throws DMLRuntimeException {
		if(_variables == null)
			_variables = new LocalVariableMap();
		if(_inVarnames == null)
			_inVarnames = new ArrayList<String>();

		// A bug in Spark messes up blocks and indexes due to too-eager reuse of data structures,
		// hence create a copy of the input RDD.
		JavaPairRDD<MatrixIndexes, MatrixBlock> copyRDD = rdd.mapToPair( new CopyBlockPairFunction() );

		MatrixObject mo = new MatrixObject(ValueType.DOUBLE, "temp", new MatrixFormatMetaData(mc, OutputInfo.BinaryBlockOutputInfo, InputInfo.BinaryBlockInputInfo));
		mo.setRDDHandle(new RDDObject(copyRDD, varName));
		_variables.put(varName, mo);
		_inVarnames.add(varName);
		checkIfRegisteringInputAllowed();
	}

	// =============================================================================================

	/**
	 * Marks the variable in the DML script as output variable.
	 * Note that this expects a "write(varName, ...)" statement in the DML script which, in a non-MLContext invocation,
	 * would have written the matrix to HDFS.
	 * @param varName variable name in the DML script
	 * @throws DMLRuntimeException
	 */
	public void registerOutput(String varName) throws DMLRuntimeException {
		if(!(DMLScript.rtplatform == RUNTIME_PLATFORM.SPARK || DMLScript.rtplatform == RUNTIME_PLATFORM.HYBRID_SPARK)) {
			throw new DMLRuntimeException("The registerOutput functionality is only supported for the spark runtime. Please use MLContext(sc) instead of the default constructor.");
		}
		if(_outVarnames == null)
			_outVarnames = new ArrayList<String>();
		_outVarnames.add(varName);
		if(_variables == null)
			_variables = new LocalVariableMap();
	}
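
	// Example (sketch): register an existing binary blocked RDD (JavaPairRDD[MatrixIndexes, MatrixBlock])
	// with known dimensions, and mark a DML variable as output (binBlocks is an assumed, pre-existing RDD):
	//   scala> ml.registerInput("X", binBlocks, 10000L, 1000L)
	//   scala> ml.registerOutput("Y")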
	// =============================================================================================

	/**
	 * Execute DML script by passing named arguments using specified config file.
	 * @param dmlScriptFilePath the dml script can be in the local filesystem or in HDFS
	 * @param namedArgs named arguments
	 * @param parsePyDML true if PyDML, false if DML
	 * @param configFilePath path to the config file
	 * @throws IOException
	 * @throws DMLException
	 * @throws ParseException
	 */
	public MLOutput execute(String dmlScriptFilePath, HashMap<String, String> namedArgs, boolean parsePyDML, String configFilePath)
			throws IOException, DMLException, ParseException {
		String [] args = new String[namedArgs.size()];
		int i = 0;
		for(Entry<String, String> entry : namedArgs.entrySet()) {
			if(entry.getValue().trim().compareTo("") == 0)
				args[i] = entry.getKey() + "=\"" + entry.getValue() + "\"";
			else
				args[i] = entry.getKey() + "=" + entry.getValue();
			i++;
		}
		return compileAndExecuteScript(dmlScriptFilePath, args, true, parsePyDML, configFilePath);
	}

	/**
	 * Execute DML script by passing named arguments using specified config file.
	 * @param dmlScriptFilePath the dml script can be in the local filesystem or in HDFS
	 */
	public MLOutput execute(String dmlScriptFilePath, HashMap<String, String> namedArgs, String configFilePath)
			throws IOException, DMLException, ParseException {
		return execute(dmlScriptFilePath, namedArgs, false, configFilePath);
	}

	/**
	 * Execute DML script by passing named arguments with default configuration.
	 * @param dmlScriptFilePath the dml script can be in the local filesystem or in HDFS
	 */
	public MLOutput execute(String dmlScriptFilePath, HashMap<String, String> namedArgs)
			throws IOException, DMLException, ParseException {
		return execute(dmlScriptFilePath, namedArgs, false, null);
	}

	/**
	 * Execute DML script by passing named arguments (Scala map variant).
	 */
	public MLOutput execute(String dmlScriptFilePath, scala.collection.immutable.Map<String, String> namedArgs)
			throws IOException, DMLException, ParseException {
		return execute(dmlScriptFilePath, new HashMap<String, String>(scala.collection.JavaConversions.mapAsJavaMap(namedArgs)));
	}

	/**
	 * Experimental: Execute PyDML script by passing named arguments if parsePyDML=true.
	 */
	public MLOutput execute(String dmlScriptFilePath, HashMap<String, String> namedArgs, boolean parsePyDML)
			throws IOException, DMLException, ParseException {
		return execute(dmlScriptFilePath, namedArgs, parsePyDML, null);
	}

	/**
	 * Experimental: Execute PyDML script by passing named arguments if parsePyDML=true (Scala map variant).
	 */
	public MLOutput execute(String dmlScriptFilePath, scala.collection.immutable.Map<String, String> namedArgs, boolean parsePyDML)
			throws IOException, DMLException, ParseException {
		return execute(dmlScriptFilePath, new HashMap<String, String>(scala.collection.JavaConversions.mapAsJavaMap(namedArgs)), parsePyDML);
	}

	/**
	 * Execute DML script by passing positional arguments using specified config file.
	 */
	public MLOutput execute(String dmlScriptFilePath, String [] args, String configFilePath)
			throws IOException, DMLException, ParseException {
		return execute(dmlScriptFilePath, args, false, configFilePath);
	}

	/**
	 * Execute DML script by passing positional arguments using specified config file.
	 * This method is implemented for compatibility with the Python MLContext;
	 * Java/Scala users should use 'MLOutput execute(String dmlScriptFilePath, String [] args, String configFilePath)' instead,
	 * as equivalent scala collections (Seq/ArrayBuffer) are not implemented.
	 */
	public MLOutput execute(String dmlScriptFilePath, ArrayList<String> args, String configFilePath)
			throws IOException, DMLException, ParseException {
		String [] argsArr = args.toArray(new String[args.size()]);
		return execute(dmlScriptFilePath, argsArr, false, configFilePath);
	}

	/**
	 * Execute DML script by passing positional arguments using default configuration.
	 */
	public MLOutput execute(String dmlScriptFilePath, String [] args)
			throws IOException, DMLException, ParseException {
		return execute(dmlScriptFilePath, args, false, null);
	}

	/**
	 * Execute DML script by passing positional arguments using default configuration
	 * (Python-compatibility variant; see above).
	 */
	public MLOutput execute(String dmlScriptFilePath, ArrayList<String> args)
			throws IOException, DMLException, ParseException {
		String [] argsArr = args.toArray(new String[args.size()]);
		return execute(dmlScriptFilePath, argsArr, false, null);
	}

	/**
	 * Experimental: Execute DML script by passing positional arguments if parsePyDML=true, using default configuration
	 * (Python-compatibility variant; see above).
	 */
	public MLOutput execute(String dmlScriptFilePath, ArrayList<String> args, boolean parsePyDML)
			throws IOException, DMLException, ParseException {
		String [] argsArr = args.toArray(new String[args.size()]);
		return execute(dmlScriptFilePath, argsArr, parsePyDML, null);
	}

	/**
	 * Experimental: Execute DML script by passing positional arguments if parsePyDML=true, using specified config file
	 * (Python-compatibility variant; see above).
	 */
	public MLOutput execute(String dmlScriptFilePath, ArrayList<String> args, boolean parsePyDML, String configFilePath)
			throws IOException, DMLException, ParseException {
		String [] argsArr = args.toArray(new String[args.size()]);
		return execute(dmlScriptFilePath, argsArr, parsePyDML, configFilePath);
	}

	/**
	 * Experimental: Execute DML script by passing positional arguments if parsePyDML=true, using specified config file.
	 */
	public MLOutput execute(String dmlScriptFilePath, String [] args, boolean parsePyDML, String configFilePath)
			throws IOException, DMLException, ParseException {
		return compileAndExecuteScript(dmlScriptFilePath, args, false, parsePyDML, configFilePath);
	}

	/**
	 * Experimental: Execute DML script by passing positional arguments if parsePyDML=true, using default configuration.
	 */
	public MLOutput execute(String dmlScriptFilePath, String [] args, boolean parsePyDML)
			throws IOException, DMLException, ParseException {
		return execute(dmlScriptFilePath, args, parsePyDML, null);
	}

	/**
	 * Execute DML script without any arguments using specified config path.
	 */
	public MLOutput execute(String dmlScriptFilePath, String configFilePath)
			throws IOException, DMLException, ParseException {
		return execute(dmlScriptFilePath, false, configFilePath);
	}

	/**
	 * Execute DML script without any arguments using default configuration.
	 */
	public MLOutput execute(String dmlScriptFilePath)
			throws IOException, DMLException, ParseException {
		return execute(dmlScriptFilePath, false, null);
	}

	/**
	 * Experimental: Execute DML script without any arguments if parsePyDML=true, using specified config path.
	 */
	public MLOutput execute(String dmlScriptFilePath, boolean parsePyDML, String configFilePath)
			throws IOException, DMLException, ParseException {
		return compileAndExecuteScript(dmlScriptFilePath, null, false, parsePyDML, configFilePath);
	}

	/**
	 * Experimental: Execute DML script without any arguments if parsePyDML=true, using default configuration.
	 */
	public MLOutput execute(String dmlScriptFilePath, boolean parsePyDML)
			throws IOException, DMLException, ParseException {
		return execute(dmlScriptFilePath, parsePyDML, null);
	}
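
	// Example (sketch): execute with positional arguments and an explicit SystemML config file
	// (assuming a SystemML-config.xml in the working directory):
	//   scala> val outputs = ml.execute("GNMF.dml", args, "SystemML-config.xml")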
	// -------------------------------- Utility methods begin ----------------------------------------------------------

	/**
	 * Call this method if you want to clear any RDDs set via registerInput or registerOutput.
	 * This is required if ml.execute(..) has been called earlier and you want to call a new DML script.
	 * Note: By default this does not clean up the configuration set using the setConfig method.
	 * To clean up the configuration along with the registered inputs/outputs, use reset(true).
	 * @throws DMLRuntimeException
	 */
	public void reset() throws DMLRuntimeException {
		reset(false);
	}

	public void reset(boolean cleanupConfig) throws DMLRuntimeException {
		// Cleanup variables from the buffer pool, including evicted files
		// (otherwise there is a memory leak because the buffer pool holds references)
		CacheableData.cleanupCacheDir();

		// Clear MLContext state
		_inVarnames = null;
		_outVarnames = null;
		_variables = null;
		if(cleanupConfig)
			_additionalConfigs.clear();
	}

	/**
	 * Used internally.
	 * @param source the expression of the read statement being validated
	 * @param target the variable name assigned by the read statement
	 * @throws LanguageException
	 */
	void setAppropriateVarsForRead(Expression source, String target) throws LanguageException {
		boolean isTargetRegistered = isRegisteredAsInput(target);
		boolean isReadExpression = (source instanceof DataExpression && ((DataExpression) source).isRead());
		if(isTargetRegistered && isReadExpression) {
			// Do not check the metadata file for registered reads
			((DataExpression) source).setCheckMetadata(false);

			MatrixObject mo = null;
			try {
				mo = getMatrixObject(target);
				int blp = source.getBeginLine();
				int bcp = source.getBeginColumn();
				int elp = source.getEndLine();
				int ecp = source.getEndColumn();
				((DataExpression) source).addVarParam(DataExpression.READROWPARAM, new IntIdentifier(mo.getNumRows(), source.getFilename(), blp, bcp, elp, ecp));
				((DataExpression) source).addVarParam(DataExpression.READCOLPARAM, new IntIdentifier(mo.getNumColumns(), source.getFilename(), blp, bcp, elp, ecp));
				((DataExpression) source).addVarParam(DataExpression.READNUMNONZEROPARAM, new IntIdentifier(mo.getNnz(), source.getFilename(), blp, bcp, elp, ecp));
				((DataExpression) source).addVarParam(DataExpression.DATATYPEPARAM, new StringIdentifier("matrix", source.getFilename(), blp, bcp, elp, ecp));
				((DataExpression) source).addVarParam(DataExpression.VALUETYPEPARAM, new StringIdentifier("double", source.getFilename(), blp, bcp, elp, ecp));

				if(mo.getMetaData() instanceof MatrixFormatMetaData) {
					MatrixFormatMetaData metaData = (MatrixFormatMetaData) mo.getMetaData();
					if(metaData.getOutputInfo() == OutputInfo.CSVOutputInfo) {
						((DataExpression) source).addVarParam(DataExpression.FORMAT_TYPE, new StringIdentifier(DataExpression.FORMAT_TYPE_VALUE_CSV, source.getFilename(), blp, bcp, elp, ecp));
					}
					else if(metaData.getOutputInfo() == OutputInfo.TextCellOutputInfo) {
						((DataExpression) source).addVarParam(DataExpression.FORMAT_TYPE, new StringIdentifier(DataExpression.FORMAT_TYPE_VALUE_TEXT, source.getFilename(), blp, bcp, elp, ecp));
					}
					else if(metaData.getOutputInfo() == OutputInfo.BinaryBlockOutputInfo) {
						((DataExpression) source).addVarParam(DataExpression.ROWBLOCKCOUNTPARAM, new IntIdentifier(mo.getNumRowsPerBlock(), source.getFilename(), blp, bcp, elp, ecp));
						((DataExpression) source).addVarParam(DataExpression.COLUMNBLOCKCOUNTPARAM, new IntIdentifier(mo.getNumColumnsPerBlock(), source.getFilename(), blp, bcp, elp, ecp));
						((DataExpression) source).addVarParam(DataExpression.FORMAT_TYPE, new StringIdentifier(DataExpression.FORMAT_TYPE_VALUE_BINARY, source.getFilename(), blp, bcp, elp, ecp));
					}
					else {
						throw new LanguageException("Unsupported format through MLContext");
					}
				}
			}
			catch (DMLRuntimeException e) {
				throw new LanguageException(e);
			}
		}
	}

	/**
	 * Used internally.
	 * @param tmp runtime instructions of the last-level program block
	 * @return the instructions with remove-variable instructions for registered outputs removed
	 */
	ArrayList<Instruction> performCleanupAfterRecompilation(ArrayList<Instruction> tmp) {
		String [] outputs = null;
		if(_outVarnames != null) {
			outputs = _outVarnames.toArray(new String[0]);
		}
		else {
			outputs = new String[0];
		}

		// No need to clean up the entire program
		// as this method is only called for the last-level program block:
		// JMLCUtils.cleanupRuntimeProgram(_rtprog, outputs);

		for( int i = 0; i < tmp.size(); i++ ) {
			Instruction linst = tmp.get(i);
			if( linst instanceof VariableCPInstruction && ((VariableCPInstruction)linst).isRemoveVariable() ) {
				VariableCPInstruction varinst = (VariableCPInstruction) linst;
				for( String var : outputs ) {
					if( varinst.isRemoveVariable(var) ) {
						tmp.remove(i);
						i--;
						break;
					}
				}
			}
		}

		return tmp;
	}

	private boolean isRegisteredAsInput(String varName) {
		if(_inVarnames != null) {
			for(String v : _inVarnames) {
				if(v.compareTo(varName) == 0) {
					return true;
				}
			}
		}
		return false;
	}

	private MatrixObject getMatrixObject(String varName) throws DMLRuntimeException {
		if(_variables != null) {
			Data mo = _variables.get(varName);
			if(mo instanceof MatrixObject) {
				return (MatrixObject) mo;
			}
			else {
				throw new DMLRuntimeException("ERROR: Incorrect type");
			}
		}
		throw new DMLRuntimeException("ERROR: getMatrixObject not set for variable:" + varName);
	}

	private int compareVersion(String versionStr1, String versionStr2) {
		Scanner s1 = null;
		Scanner s2 = null;
		try {
			s1 = new Scanner(versionStr1); s1.useDelimiter("\\.");
			s2 = new Scanner(versionStr2); s2.useDelimiter("\\.");
			while(s1.hasNextInt() && s2.hasNextInt()) {
				int version1 = s1.nextInt();
				int version2 = s2.nextInt();
				if(version1 < version2) {
					return -1;
				} else if(version1 > version2) {
					return 1;
				}
			}
			if(s1.hasNextInt()) return 1;
		}
		finally {
			if(s1 != null) s1.close();
			if(s2 != null) s2.close();
		}
		return 0;
	}

	private void initializeSpark(SparkContext sc, boolean monitorPerformance, boolean setForcedSparkExecType) throws DMLRuntimeException {
		MLContextProxy.setActive(true);

		this._sc = sc;

		if(compareVersion(sc.version(), "1.3.0") < 0 ) {
			throw new DMLRuntimeException("Expected spark version >= 1.3.0 for running SystemML");
		}

		if(setForcedSparkExecType)
			DMLScript.rtplatform = RUNTIME_PLATFORM.SPARK;
		else
			DMLScript.rtplatform = RUNTIME_PLATFORM.HYBRID_SPARK;

		if(monitorPerformance) {
			initializeSparkListener(sc);
		}
	}

	private void initializeSparkListener(SparkContext sc) throws DMLRuntimeException {
		if(compareVersion(sc.version(), "1.4.0") < 0 ) {
			throw new DMLRuntimeException("Expected spark version >= 1.4.0 for monitoring MLContext performance");
		}
		SparkListener sparkListener = new SparkListener(sc);
		_monitorUtils = new SparkMonitoringUtil(sparkListener);
		sc.addSparkListener(sparkListener);
	}

	/**
	 * Experimental API. Not supported in Python MLContext API.
	 * Execute a DML script given as a string (rather than a file path) using default configuration.
	 * @param dmlScript the DML script as a string
	 * @throws IOException
	 * @throws DMLException
	 * @throws ParseException
	 */
	public MLOutput executeScript(String dmlScript) throws IOException, DMLException, ParseException {
		return compileAndExecuteScript(dmlScript, null, false, false, false, null);
	}

	public MLOutput executeScript(String dmlScript, String configFilePath) throws IOException, DMLException, ParseException {
		return compileAndExecuteScript(dmlScript, null, false, false, false, configFilePath);
	}
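
	// Example (sketch): execute a DML string directly against previously registered inputs/outputs
	// (assumes "A" was registered as input and "B" as output; the empty read/write paths are
	// placeholders that are resolved from the registered variables):
	//   scala> val out = ml.executeScript("A = read(\"\"); B = A * 2; write(B, \"\");")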
	private void checkIfRegisteringInputAllowed() throws DMLRuntimeException {
		if(!(DMLScript.rtplatform == RUNTIME_PLATFORM.SPARK || DMLScript.rtplatform == RUNTIME_PLATFORM.HYBRID_SPARK)) {
			throw new DMLRuntimeException("ERROR: registerInput is only allowed for spark execution mode");
		}
	}

	private MLOutput compileAndExecuteScript(String dmlScriptFilePath, String [] args, boolean isNamedArgument, boolean isPyDML, String configFilePath)
			throws IOException, DMLException, ParseException {
		return compileAndExecuteScript(dmlScriptFilePath, args, true, isNamedArgument, isPyDML, configFilePath);
	}

	/**
	 * All the execute() methods call this, which, after setting appropriate input/output variables,
	 * calls executeUsingSimplifiedCompilationChain.
	 * We have explicitly synchronized this function because MLContext/SystemML does not yet support multi-threading.
	 * @throws IOException
	 * @throws DMLException
	 * @throws ParseException
	 */
	private synchronized MLOutput compileAndExecuteScript(String dmlScriptFilePath, String [] args, boolean isFile, boolean isNamedArgument, boolean isPyDML, String configFilePath)
			throws IOException, DMLException, ParseException {
		try {
			if(getActiveMLContext() != null) {
				throw new DMLRuntimeException("SystemML (and hence by definition MLContext) does not support parallel execute() calls from the same or different MLContexts. "
						+ "As a temporary fix, please do explicit synchronization, i.e. synchronized(MLContext.class) { ml.execute(...) } ");
			}

			// Set active MLContext.
			_activeMLContext = this;

			// Setup parser parameters.
			// TODO: In the process of hardening mlcontext, we should also reinvestigate if we
			// could be more restrictive and require known dimensions (rm REJECT_READ_WRITE_UNKNOWNS).
			AParserWrapper.IGNORE_UNSPECIFIED_ARGS = true;
			DataExpression.REJECT_READ_WRITE_UNKNOWNS = false;

			if(_monitorUtils != null) {
				_monitorUtils.resetMonitoringData();
			}

			if(DMLScript.rtplatform == RUNTIME_PLATFORM.SPARK || DMLScript.rtplatform == RUNTIME_PLATFORM.HYBRID_SPARK) {
				HashMap<String, JavaPairRDD<MatrixIndexes, MatrixBlock>> retVal = null;

				// Depending on whether registerInput/registerOutput was called, initialize the variables.
				String[] inputs = null;
				String[] outputs = null;
				if(_inVarnames != null) {
					inputs = _inVarnames.toArray(new String[0]);
				}
				else {
					inputs = new String[0];
				}
				if(_outVarnames != null) {
					outputs = _outVarnames.toArray(new String[0]);
				}
				else {
					outputs = new String[0];
				}
				HashMap<String, MatrixCharacteristics> outMetadata = new HashMap<String, MatrixCharacteristics>();

				HashMap<String, String> argVals = DMLScript.createArgumentsMap(isNamedArgument, args);

				// Run the DML script.
				ExecutionContext ec = executeUsingSimplifiedCompilationChain(dmlScriptFilePath, isFile, argVals, isPyDML, inputs, outputs, _variables, configFilePath);

				// Now collect the output.
				if(_outVarnames != null) {
					if(_variables == null) {
						throw new DMLRuntimeException("The symbol table returned after executing the script is empty");
					}

					for( String ovar : _outVarnames ) {
						if( _variables.keySet().contains(ovar) ) {
							if(retVal == null) {
								retVal = new HashMap<String, JavaPairRDD<MatrixIndexes, MatrixBlock>>();
							}
							retVal.put(ovar, ((SparkExecutionContext) ec).getBinaryBlockRDDHandleForVariable(ovar));
							// For converting output to dataframe
							outMetadata.put(ovar, ((SparkExecutionContext) ec).getMatrixCharacteristics(ovar));
						}
						else {
							throw new DMLException("Error: The variable " + ovar + " is not available as output after the execution of the DMLScript.");
						}
					}
				}

				return new MLOutput(retVal, outMetadata);
			}
			else {
				throw new DMLRuntimeException("Unsupported runtime:" + DMLScript.rtplatform.name());
			}
		}
		finally {
			// Reset active MLContext.
			_activeMLContext = null;

			// Reset parser parameters.
			AParserWrapper.IGNORE_UNSPECIFIED_ARGS = false;
			DataExpression.REJECT_READ_WRITE_UNKNOWNS = true;
		}
	}
	/**
	 * This runs the DML script and returns the ExecutionContext for the caller to extract the output variables.
	 * The caller (which is compileAndExecuteScript) is expected to set inputSymbolTable with an appropriate matrix
	 * representation (RDD, MatrixObject).
	 * @throws IOException
	 * @throws DMLException
	 * @throws ParseException
	 */
	private ExecutionContext executeUsingSimplifiedCompilationChain(String dmlScriptFilePath, boolean isFile, HashMap<String, String> argVals, boolean parsePyDML,
			String[] inputs, String[] outputs, LocalVariableMap inputSymbolTable, String configFilePath)
			throws IOException, DMLException, ParseException {
		DMLConfig config = null;
		if(configFilePath == null) {
			config = new DMLConfig();
		}
		else {
			config = new DMLConfig(configFilePath);
		}

		for(Entry<String, String> param : _additionalConfigs.entrySet()) {
			config.setTextValue(param.getKey(), param.getValue());
		}

		ConfigurationManager.setConfig(config);

		String dmlScriptStr = null;
		if(isFile)
			dmlScriptStr = DMLScript.readDMLScript("-f", dmlScriptFilePath);
		else
			dmlScriptStr = DMLScript.readDMLScript("-s", dmlScriptFilePath);

		if(_monitorUtils != null) {
			_monitorUtils.setDMLString(dmlScriptStr);
		}

		// Simplified compilation chain
		_rtprog = null;

		// Parsing
		AParserWrapper parser = AParserWrapper.createParser(parsePyDML);
		DMLProgram prog = parser.parse(dmlScriptFilePath, dmlScriptStr, argVals);

		// Language validate
		DMLTranslator dmlt = new DMLTranslator(prog);
		dmlt.liveVariableAnalysis(prog);
		dmlt.validateParseTree(prog);

		// HOP construct/rewrite
		dmlt.constructHops(prog);
		dmlt.rewriteHopsDAG(prog);

		Explain.explain(prog);

		// Rewrite persistent reads/writes
		if(inputSymbolTable != null) {
			RewriteRemovePersistentReadWrite rewrite = new RewriteRemovePersistentReadWrite(inputs, outputs);
			ProgramRewriter rewriter2 = new ProgramRewriter(rewrite);
			rewriter2.rewriteProgramHopDAGs(prog);
		}

		// LOP construct and runtime program generation
		dmlt.constructLops(prog);
		_rtprog = prog.getRuntimeProgram(config);

		// Optional global data flow optimization
		if(OptimizerUtils.isOptLevel(OptimizationLevel.O4_GLOBAL_TIME_MEMORY)) {
			_rtprog = GlobalOptimizerWrapper.optimizeProgram(prog, _rtprog);
		}

		// Launching a SystemML appmaster is not required as it is already launched

		// Count number of compiled MR jobs / SP instructions
		ExplainCounts counts = Explain.countDistributedOperations(_rtprog);
		Statistics.resetNoOfCompiledJobs( counts.numJobs );

		// Initialize caching and scratch space
		DMLScript.initHadoopExecution(config);

		// Final cleanup of the runtime program
		JMLCUtils.cleanupRuntimeProgram(_rtprog, outputs);

		// Create and populate the execution context
		ExecutionContext ec = ExecutionContextFactory.createContext(_rtprog);
		if(inputSymbolTable != null) {
			ec.setVariables(inputSymbolTable);
		}

		// Core execution of the runtime program
		_rtprog.execute( ec );

		if(_monitorUtils != null)
			_monitorUtils.setExplainOutput(Explain.explain(_rtprog));

		return ec;
	}

	// -------------------------------- Private methods end ----------------------------------------------------------
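
	// Example (sketch) for the experimental read API defined below:
	//   scala> val X = ml.read(sqlContext, "V.csv", "csv")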
	// TODO: Add an additional create method to provide sep, missing values, etc. for CSV
	/**
	 * Experimental API: Might be discontinued in future release.
	 * @param sqlContext the SQLContext
	 * @param filePath path to the matrix file
	 * @param format file format passed to the DML read() statement (e.g., "csv" or "text")
	 * @throws IOException
	 * @throws DMLException
	 * @throws ParseException
	 */
	public MLMatrix read(SQLContext sqlContext, String filePath, String format) throws IOException, DMLException, ParseException {
		this.reset();
		this.registerOutput("output");
		MLOutput out = this.executeScript("output = read(\"" + filePath + "\", format=\"" + format + "\"); " + MLMatrix.writeStmt);
		JavaPairRDD<MatrixIndexes, MatrixBlock> blocks = out.getBinaryBlockedRDD("output");
		MatrixCharacteristics mcOut = out.getMatrixCharacteristics("output");
		return MLMatrix.createMLMatrix(this, sqlContext, blocks, mcOut);
	}

//	// TODO: Test this in different scenarios: sparse/dense/mixed
//	/**
//	 * Experimental unstable API: Might be discontinued in future release
//	 * @param ml
//	 * @param sqlContext
//	 * @param mllibMatrix
//	 * @return
//	 * @throws DMLRuntimeException
//	 */
//	public MLMatrix read(SQLContext sqlContext, BlockMatrix mllibMatrix) throws DMLRuntimeException {
//		long nnz = -1; // TODO: Find the number of non-zeros from mllibMatrix ... This is important !!
//
//		JavaPairRDD<Tuple2<Object, Object>, Matrix> mllibBlocks = JavaPairRDD.fromJavaRDD(mllibMatrix.blocks().toJavaRDD());
//		long rlen = mllibMatrix.numRows(); long clen = mllibMatrix.numCols();
//		int brlen = mllibMatrix.numRowBlocks();
//		int bclen = mllibMatrix.numColBlocks();
//		if(mllibMatrix.numRowBlocks() != DMLTranslator.DMLBlockSize && mllibMatrix.numColBlocks() != DMLTranslator.DMLBlockSize) {
//			System.err.println("WARNING: Since the block size of the mllib matrix is not " + DMLTranslator.DMLBlockSize + ", it may cause "
//					+ "reblocks");
//		}
//
//		JavaPairRDD<MatrixIndexes, MatrixBlock> blocks = mllibBlocks
//				.mapToPair(new ConvertMLLibBlocksToBinaryBlocks(rlen, clen, brlen, bclen));
//
//		MatrixCharacteristics mc = new MatrixCharacteristics(rlen, clen, brlen, bclen, nnz);
//		return MLMatrix.createMLMatrix(this, sqlContext, blocks, mc);
//	}
}




