org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.sysml.runtime.instructions.spark.utils;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Scanner;
import org.apache.hadoop.io.Text;
import org.apache.spark.Accumulator;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.VectorUDT;
import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix;
import org.apache.spark.mllib.linalg.distributed.MatrixEntry;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import scala.Tuple2;
import org.apache.sysml.api.MLOutput.ConvertDoubleArrayToRows;
import org.apache.sysml.api.MLOutput.ProjectRows;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.instructions.spark.functions.ConvertMatrixBlockToIJVLines;
import org.apache.sysml.runtime.io.IOUtilFunctions;
import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.MatrixCell;
import org.apache.sysml.runtime.matrix.data.MatrixIndexes;
import org.apache.sysml.runtime.matrix.mapred.IndexedMatrixValue;
import org.apache.sysml.runtime.matrix.mapred.ReblockBuffer;
import org.apache.sysml.runtime.util.FastStringTokenizer;
import org.apache.sysml.runtime.util.UtilFunctions;
/**
* NOTE: These are experimental converter utils. Once thoroughly tested, they
* can be moved to RDDConverterUtils.
*/
@SuppressWarnings("unused")
public class RDDConverterUtilsExt
{
public enum RDDConverterTypes {
TEXT_TO_MATRIX_CELL,
MATRIXENTRY_TO_MATRIXCELL,
TEXT_TO_DOUBLEARR,
ROW_TO_DOUBLEARR,
VECTOR_TO_DOUBLEARR
}
/**
* Converts a CoordinateMatrix of MatrixEntry records into a binary-blocked
* RDD of (MatrixIndexes, MatrixBlock) pairs.
*
* Example usage:
*
* import org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt
* import org.apache.sysml.runtime.matrix.MatrixCharacteristics
* import org.apache.spark.api.java.JavaSparkContext
* import org.apache.spark.mllib.linalg.distributed.MatrixEntry
* import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix
* val matRDD = sc.textFile("ratings.text").map(_.split(" ")).map(x => new MatrixEntry(x(0).toLong, x(1).toLong, x(2).toDouble)).filter(_.value != 0).cache
* require(matRDD.filter(x => x.i == 0 || x.j == 0).count == 0, "Expected 1-based ratings file")
* val nnz = matRDD.count
* val numRows = matRDD.map(_.i).max
* val numCols = matRDD.map(_.j).max
* val coordinateMatrix = new CoordinateMatrix(matRDD, numRows, numCols)
* val mc = new MatrixCharacteristics(numRows, numCols, 1000, 1000, nnz)
* val binBlocks = RDDConverterUtilsExt.coordinateMatrixToBinaryBlock(new JavaSparkContext(sc), coordinateMatrix, mc, true)
*
* @param sc java spark context
* @param input coordinate matrix of matrix entries
* @param mcIn matrix characteristics of the input (dimensions and block sizes must be known)
* @param outputEmptyBlocks if true, inject empty blocks where the input has no entries
* @return binary-blocked matrix as {@code JavaPairRDD<MatrixIndexes, MatrixBlock>}
* @throws DMLRuntimeException if the conversion fails
*/
public static JavaPairRDD<MatrixIndexes, MatrixBlock> coordinateMatrixToBinaryBlock(JavaSparkContext sc,
CoordinateMatrix input, MatrixCharacteristics mcIn, boolean outputEmptyBlocks) throws DMLRuntimeException
{
//convert matrix entry rdd to binary block rdd (w/ partial blocks)
JavaPairRDD<MatrixIndexes, MatrixBlock> out = input.entries().toJavaRDD()
.mapPartitionsToPair(new MatrixEntryToBinaryBlockFunction(mcIn));
//inject empty blocks (if necessary)
if( outputEmptyBlocks && mcIn.mightHaveEmptyBlocks() ) {
out = out.union(
SparkUtils.getEmptyBlockRDD(sc, mcIn) );
}
//aggregate partial matrix blocks
out = RDDAggregateUtils.mergeByKey( out );
return out;
}
public static JavaPairRDD<MatrixIndexes, MatrixBlock> coordinateMatrixToBinaryBlock(SparkContext sc,
CoordinateMatrix input, MatrixCharacteristics mcIn, boolean outputEmptyBlocks) throws DMLRuntimeException
{
return coordinateMatrixToBinaryBlock(new JavaSparkContext(sc), input, mcIn, outputEmptyBlocks);
}
// Useful for printing and testing a binary-blocked RDD, and for external use.
public static JavaRDD<String> binaryBlockToStringRDD(JavaPairRDD<MatrixIndexes, MatrixBlock> input, MatrixCharacteristics mcIn, String format) throws DMLRuntimeException {
if(format.compareTo("text") == 0) {
JavaRDD<String> ijv = input.flatMap(new ConvertMatrixBlockToIJVLines(mcIn.getRowsPerBlock(), mcIn.getColsPerBlock()));
return ijv;
}
// else if(format.compareTo("csv") == 0) {
//
// }
else {
throw new DMLRuntimeException("The output format: " + format + " is not implemented yet.");
}
}
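/**
* Converts a DataFrame with a single MLlib Vector column into a binary-blocked
* matrix RDD. If containsID is set, the rows are sorted by the "ID" column, which
* is then dropped before conversion.
*
* Illustrative usage (a sketch, not taken from the original source; assumes a
* SparkContext {@code sc} and a DataFrame {@code df} with a vector column named
* "features"; mcOut is filled in because the dimensions are passed as unknown):
*
* MatrixCharacteristics mc = new MatrixCharacteristics(-1, -1, 1000, 1000, -1);
* JavaPairRDD<MatrixIndexes, MatrixBlock> blocks =
* RDDConverterUtilsExt.vectorDataFrameToBinaryBlock(sc, df, mc, false, "features");
*
* @param sc spark context
* @param inputDF input data frame
* @param mcOut matrix characteristics; updated in-place if dimensions are unknown
* @param containsID true if the data frame has an "ID" column
* @param vectorColumnName name of the vector column
* @return binary-blocked matrix
* @throws DMLRuntimeException if the conversion fails
*/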
public static JavaPairRDD<MatrixIndexes, MatrixBlock> vectorDataFrameToBinaryBlock(SparkContext sc,
DataFrame inputDF, MatrixCharacteristics mcOut, boolean containsID, String vectorColumnName) throws DMLRuntimeException {
return vectorDataFrameToBinaryBlock(new JavaSparkContext(sc), inputDF, mcOut, containsID, vectorColumnName);
}
public static JavaPairRDD<MatrixIndexes, MatrixBlock> vectorDataFrameToBinaryBlock(JavaSparkContext sc,
DataFrame inputDF, MatrixCharacteristics mcOut, boolean containsID, String vectorColumnName)
throws DMLRuntimeException {
if(containsID) {
inputDF = dropColumn(inputDF.sort("ID"), "ID");
}
DataFrame df = inputDF.select(vectorColumnName);
//determine unknown dimensions and sparsity if required
if( !mcOut.dimsKnown(true) ) {
Accumulator<Double> aNnz = sc.accumulator(0L);
JavaRDD<Row> tmp = df.javaRDD().map(new DataFrameAnalysisFunction(aNnz, true));
long rlen = tmp.count();
long clen = ((Vector) tmp.first().get(0)).size();
long nnz = UtilFunctions.toLong(aNnz.value());
mcOut.set(rlen, clen, mcOut.getRowsPerBlock(), mcOut.getColsPerBlock(), nnz);
}
JavaPairRDD<Row, Long> prepinput = df.javaRDD()
.zipWithIndex(); //zip row index
//convert vector rdd to binary block rdd (w/ partial blocks)
JavaPairRDD<MatrixIndexes, MatrixBlock> out =
prepinput.mapPartitionsToPair(
new DataFrameToBinaryBlockFunction(mcOut, true));
//aggregate partial matrix blocks
out = RDDAggregateUtils.mergeByKey( out );
return out;
}
/**
* Utility for dropping a column from a data frame; provided because
* DataFrame.drop(String) is not exposed in older Spark versions (e.g., 1.3.0).
* @param df input data frame
* @param column name of the column to drop
* @return data frame without the given column
* @throws DMLRuntimeException if the column is absent or is the only column
*/
public static DataFrame dropColumn(DataFrame df, String column) throws DMLRuntimeException {
ArrayList<String> columnToSelect = new ArrayList<String>();
String firstCol = null;
boolean colPresent = false;
for(String col : df.columns()) {
if(col.compareTo(column) == 0) {
colPresent = true;
}
else if(firstCol == null) {
firstCol = col;
}
else {
columnToSelect.add(col);
}
}
if(!colPresent) {
throw new DMLRuntimeException("The column \"" + column + "\" is not present in the dataframe.");
}
else if(firstCol == null) {
throw new DMLRuntimeException("No column other than \"" + column + "\" present in the dataframe.");
}
// Roundabout way to do this in Java, since DataFrame.drop(String) is not exposed in Spark 1.3.0: df = df.drop("ID");
return df.select(firstCol, scala.collection.JavaConversions.asScalaBuffer(columnToSelect).toList());
}
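/**
* Projects the given columns of a data frame, preserving their order. The first
* column name is passed separately because DataFrame.select requires at least
* one column.
* @param df input data frame
* @param columns names of the columns to select (must be non-empty)
* @return data frame restricted to the given columns
* @throws DMLRuntimeException if the projection fails
*/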
public static DataFrame projectColumns(DataFrame df, ArrayList<String> columns) throws DMLRuntimeException {
ArrayList<String> columnToSelect = new ArrayList<String>();
for(int i = 1; i < columns.size(); i++) {
columnToSelect.add(columns.get(i));
}
return df.select(columns.get(0), scala.collection.JavaConversions.asScalaBuffer(columnToSelect).toList());
}
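// Convenience overloads for SparkContext vs. JavaSparkContext inputs and optional
// ID/column arguments; all of them delegate to the full dataFrameToBinaryBlock
// variant below.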
public static JavaPairRDD<MatrixIndexes, MatrixBlock> dataFrameToBinaryBlock(SparkContext sc,
DataFrame df, MatrixCharacteristics mcOut, boolean containsID) throws DMLRuntimeException {
return dataFrameToBinaryBlock(new JavaSparkContext(sc), df, mcOut, containsID, null);
}
public static JavaPairRDD<MatrixIndexes, MatrixBlock> dataFrameToBinaryBlock(SparkContext sc,
DataFrame df, MatrixCharacteristics mcOut, String [] columns) throws DMLRuntimeException {
ArrayList<String> columns1 = new ArrayList<String>(Arrays.asList(columns));
return dataFrameToBinaryBlock(new JavaSparkContext(sc), df, mcOut, false, columns1);
}
public static JavaPairRDD<MatrixIndexes, MatrixBlock> dataFrameToBinaryBlock(SparkContext sc,
DataFrame df, MatrixCharacteristics mcOut, ArrayList<String> columns) throws DMLRuntimeException {
return dataFrameToBinaryBlock(new JavaSparkContext(sc), df, mcOut, false, columns);
}
public static JavaPairRDD<MatrixIndexes, MatrixBlock> dataFrameToBinaryBlock(SparkContext sc,
DataFrame df, MatrixCharacteristics mcOut, boolean containsID, String [] columns)
throws DMLRuntimeException {
ArrayList<String> columns1 = new ArrayList<String>(Arrays.asList(columns));
return dataFrameToBinaryBlock(new JavaSparkContext(sc), df, mcOut, containsID, columns1);
}
public static JavaPairRDD<MatrixIndexes, MatrixBlock> dataFrameToBinaryBlock(SparkContext sc,
DataFrame df, MatrixCharacteristics mcOut, boolean containsID, ArrayList<String> columns)
throws DMLRuntimeException {
return dataFrameToBinaryBlock(new JavaSparkContext(sc), df, mcOut, containsID, columns);
}
public static JavaPairRDD<MatrixIndexes, MatrixBlock> dataFrameToBinaryBlock(JavaSparkContext sc,
DataFrame df, MatrixCharacteristics mcOut, boolean containsID) throws DMLRuntimeException {
return dataFrameToBinaryBlock(sc, df, mcOut, containsID, null);
}
public static JavaPairRDD<MatrixIndexes, MatrixBlock> dataFrameToBinaryBlock(JavaSparkContext sc,
DataFrame df, MatrixCharacteristics mcOut, ArrayList<String> columns) throws DMLRuntimeException {
return dataFrameToBinaryBlock(sc, df, mcOut, false, columns);
}
/**
* Converts a DataFrame into a binary-blocked RDD.
* Note: mcOut is updated in-place if the dimensions are not yet known.
* @param sc java spark context
* @param df input data frame
* @param mcOut matrix characteristics; updated in-place if dimensions are unknown
* @param containsID true if the data frame has an "ID" column
* @param columns names of the columns to convert, or null for all columns
* @return binary-blocked matrix
* @throws DMLRuntimeException if the conversion fails
*/
public static JavaPairRDD<MatrixIndexes, MatrixBlock> dataFrameToBinaryBlock(JavaSparkContext sc,
DataFrame df, MatrixCharacteristics mcOut, boolean containsID, ArrayList<String> columns)
throws DMLRuntimeException {
if(columns != null) {
df = projectColumns(df, columns);
}
if(containsID) {
df = dropColumn(df.sort("ID"), "ID");
}
//determine unknown dimensions and sparsity if required
if( !mcOut.dimsKnown(true) ) {
Accumulator<Double> aNnz = sc.accumulator(0L);
JavaRDD<Row> tmp = df.javaRDD().map(new DataFrameAnalysisFunction(aNnz, false));
long rlen = tmp.count();
long clen = df.columns().length; //the "ID" column (if any) was already dropped above
long nnz = UtilFunctions.toLong(aNnz.value());
mcOut.set(rlen, clen, mcOut.getRowsPerBlock(), mcOut.getColsPerBlock(), nnz);
}
JavaPairRDD<Row, Long> prepinput = df.javaRDD()
.zipWithIndex(); //zip row index
//convert row rdd to binary block rdd (w/ partial blocks)
JavaPairRDD<MatrixIndexes, MatrixBlock> out =
prepinput.mapPartitionsToPair(
new DataFrameToBinaryBlockFunction(mcOut, false));
//aggregate partial matrix blocks
out = RDDAggregateUtils.mergeByKey( out );
return out;
}
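/**
* Converts a binary-blocked matrix RDD into a data frame with a double "ID"
* column and a single Vector column "C1". Note: the groupByKey used internally
* can be expensive when the number of rows is large.
* @param binaryBlockRDD binary-blocked matrix
* @param mc matrix characteristics (dimensions must be known)
* @param sqlContext SQL context used to create the output data frame
* @return data frame with columns "ID" and "C1"
* @throws DMLRuntimeException if the output dimensions are unknown
*/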
public static DataFrame binaryBlockToVectorDataFrame(JavaPairRDD<MatrixIndexes, MatrixBlock> binaryBlockRDD,
MatrixCharacteristics mc, SQLContext sqlContext) throws DMLRuntimeException {
long rlen = mc.getRows(); long clen = mc.getCols();
int brlen = mc.getRowsPerBlock(); int bclen = mc.getColsPerBlock();
// Very expensive operation here: groupByKey (where number of keys might be too large)
JavaRDD<Row> rowsRDD = binaryBlockRDD.flatMapToPair(new ProjectRows(rlen, clen, brlen, bclen))
.groupByKey().map(new ConvertDoubleArrayToRows(clen, bclen, true));
int numColumns = (int) clen;
if(numColumns <= 0) {
throw new DMLRuntimeException("Output dimensions unknown after executing the script and hence cannot create the dataframe");
}
List<StructField> fields = new ArrayList<StructField>();
// LongTypes throw an error: java.lang.Double incompatible with java.lang.Long
fields.add(DataTypes.createStructField("ID", DataTypes.DoubleType, false));
fields.add(DataTypes.createStructField("C1", new VectorUDT(), false));
// fields.add(DataTypes.createStructField("C1", DataTypes.createArrayType(DataTypes.DoubleType), false));
// This will cause infinite recursion due to a bug in Spark:
// https://issues.apache.org/jira/browse/SPARK-6999
// return sqlContext.createDataFrame(rowsRDD, colNames); // where ArrayList<String> colNames
return sqlContext.createDataFrame(rowsRDD.rdd(), DataTypes.createStructType(fields));
}
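// Appends a 1-based row index (as a double) to each row; used by addIDToDataFrame.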
public static class AddRowID implements Function<Tuple2<Row, Long>, Row> {
private static final long serialVersionUID = -3733816995375745659L;
@Override
public Row call(Tuple2<Row, Long> arg0) throws Exception {
int oldNumCols = arg0._1.length();
Object [] fields = new Object[oldNumCols + 1];
for(int i = 0; i < oldNumCols; i++) {
fields[i] = arg0._1.get(i);
}
fields[oldNumCols] = new Double(arg0._2 + 1);
return RowFactory.create(fields);
}
}
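/**
* Appends a 1-based row-index column (of type double) to the given data frame.
*
* Illustrative usage (a sketch, not taken from the original source; assumes an
* existing DataFrame {@code df} and SQLContext {@code sqlContext}):
*
* DataFrame dfWithID = RDDConverterUtilsExt.addIDToDataFrame(df, sqlContext, "ID");
*
* @param df input data frame
* @param sqlContext SQL context used to create the output data frame
* @param nameOfCol name of the new index column
* @return data frame with the appended index column
*/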
public static DataFrame addIDToDataFrame(DataFrame df, SQLContext sqlContext, String nameOfCol) {
StructField[] oldSchema = df.schema().fields();
StructField[] newSchema = new StructField[oldSchema.length + 1];
for(int i = 0; i < oldSchema.length; i++) {
newSchema[i] = oldSchema[i];
}
newSchema[oldSchema.length] = DataTypes.createStructField(nameOfCol, DataTypes.DoubleType, false);
// JavaRDD<Row> newRows = df.rdd().toJavaRDD().map(new AddRowID());
JavaRDD<Row> newRows = df.rdd().toJavaRDD().zipWithIndex().map(new AddRowID());
return sqlContext.createDataFrame(newRows, new StructType(newSchema));
}
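/**
* Converts a binary-blocked matrix RDD into a data frame with a double "ID"
* column followed by one double column per matrix column ("C1", "C2", ...).
* Note: the groupByKey used internally can be expensive when the number of rows is large.
* @param binaryBlockRDD binary-blocked matrix
* @param mc matrix characteristics (dimensions must be known)
* @param sqlContext SQL context used to create the output data frame
* @return data frame representation of the matrix
* @throws DMLRuntimeException if the output dimensions are unknown
*/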
public static DataFrame binaryBlockToDataFrame(JavaPairRDD<MatrixIndexes, MatrixBlock> binaryBlockRDD,
MatrixCharacteristics mc, SQLContext sqlContext) throws DMLRuntimeException {
long rlen = mc.getRows(); long clen = mc.getCols();
int brlen = mc.getRowsPerBlock(); int bclen = mc.getColsPerBlock();
// Very expensive operation here: groupByKey (where number of keys might be too large)
JavaRDD<Row> rowsRDD = binaryBlockRDD.flatMapToPair(new ProjectRows(rlen, clen, brlen, bclen))
.groupByKey().map(new ConvertDoubleArrayToRows(clen, bclen, false));
int numColumns = (int) clen;
if(numColumns <= 0) {
// numColumns = rowsRDD.first().length() - 1; // Ugly, so instead prefer to throw
throw new DMLRuntimeException("Output dimensions unknown after executing the script and hence cannot create the dataframe");
}
List<StructField> fields = new ArrayList<StructField>();
// LongTypes throw an error: java.lang.Double incompatible with java.lang.Long
fields.add(DataTypes.createStructField("ID", DataTypes.DoubleType, false));
for(int i = 1; i <= numColumns; i++) {
fields.add(DataTypes.createStructField("C" + i, DataTypes.DoubleType, false));
}
// This will cause infinite recursion due to a bug in Spark:
// https://issues.apache.org/jira/browse/SPARK-6999
// return sqlContext.createDataFrame(rowsRDD, colNames); // where ArrayList<String> colNames
return sqlContext.createDataFrame(rowsRDD.rdd(), DataTypes.createStructType(fields));
}
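// Maps partitions of MatrixEntry records to partial binary blocks via the shared IJV helper.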
private static class MatrixEntryToBinaryBlockFunction implements PairFlatMapFunction<Iterator<MatrixEntry>,MatrixIndexes,MatrixBlock>
{
private static final long serialVersionUID = 4907483236186747224L;
private IJVToBinaryBlockFunctionHelper helper = null;
public MatrixEntryToBinaryBlockFunction(MatrixCharacteristics mc) throws DMLRuntimeException {
helper = new IJVToBinaryBlockFunctionHelper(mc);
}
@Override
public Iterable<Tuple2<MatrixIndexes, MatrixBlock>> call(Iterator<MatrixEntry> arg0) throws Exception {
return helper.convertToBinaryBlock(arg0, RDDConverterTypes.MATRIXENTRY_TO_MATRIXCELL);
}
}
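// Pass-through analysis function used to count non-zeros of either plain Row-based
// or single-Vector-column data frames.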
private static class DataFrameAnalysisFunction implements Function<Row, Row> {
private static final long serialVersionUID = 5705371332119770215L;
private RowAnalysisFunctionHelper helper = null;
boolean isVectorBasedRDD;
public DataFrameAnalysisFunction( Accumulator<Double> aNnz, boolean isVectorBasedRDD) {
helper = new RowAnalysisFunctionHelper(aNnz);
this.isVectorBasedRDD = isVectorBasedRDD;
}
@Override
public Row call(Row arg0) throws Exception {
if(isVectorBasedRDD)
return helper.analyzeVector(arg0);
else
return helper.analyzeRow(arg0);
}
}
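// Maps partitions of (Text, row-index) pairs holding CSV lines to partial binary blocks.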
private static class CSVToBinaryBlockFunction implements PairFlatMapFunction<Iterator<Tuple2<Text,Long>>,MatrixIndexes,MatrixBlock> {
private static final long serialVersionUID = 1501589201971233542L;
private RowToBinaryBlockFunctionHelper helper = null;
public CSVToBinaryBlockFunction(MatrixCharacteristics mc, String delim, boolean fill, double fillValue) {
helper = new RowToBinaryBlockFunctionHelper(mc, delim, fill, fillValue);
}
@Override
public Iterable<Tuple2<MatrixIndexes, MatrixBlock>> call(Iterator<Tuple2<Text, Long>> arg0) throws Exception {
return helper.convertToBinaryBlock(arg0, RDDConverterTypes.TEXT_TO_DOUBLEARR);
}
}
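// Maps partitions of (Row, row-index) pairs to partial binary blocks, treating each
// row either as a set of double columns or as a single Vector column.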
private static class DataFrameToBinaryBlockFunction implements PairFlatMapFunction<Iterator<Tuple2<Row,Long>>,MatrixIndexes,MatrixBlock> {
private static final long serialVersionUID = 653447740362447236L;
private RowToBinaryBlockFunctionHelper helper = null;
boolean isVectorBasedDF;
public DataFrameToBinaryBlockFunction(MatrixCharacteristics mc, boolean isVectorBasedDF) {
helper = new RowToBinaryBlockFunctionHelper(mc);
this.isVectorBasedDF = isVectorBasedDF;
}
@Override
public Iterable<Tuple2<MatrixIndexes, MatrixBlock>> call(Iterator<Tuple2<Row, Long>> arg0) throws Exception {
if(isVectorBasedDF)
return helper.convertToBinaryBlock(arg0, RDDConverterTypes.VECTOR_TO_DOUBLEARR);
else
return helper.convertToBinaryBlock(arg0, RDDConverterTypes.ROW_TO_DOUBLEARR);
}
}
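// Shared helper that counts the non-zero values of a row (CSV text, Row, or Vector)
// into an accumulator while passing the input through unchanged; used to derive
// dimensions and sparsity.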
private static class RowAnalysisFunctionHelper implements Serializable
{
private static final long serialVersionUID = 2310303223289674477L;
private Accumulator<Double> _aNnz = null;
private String _delim = null;
public RowAnalysisFunctionHelper( Accumulator<Double> aNnz ) {
_aNnz = aNnz;
}
public RowAnalysisFunctionHelper( Accumulator<Double> aNnz, String delim ) {
_aNnz = aNnz;
_delim = delim;
}
public String analyzeText(Text v1) throws Exception {
//parse input line
String line = v1.toString();
String[] cols = IOUtilFunctions.split(line, _delim);
//determine number of non-zeros of row (w/o string parsing)
long lnnz = 0;
for( String col : cols ) {
if( !col.isEmpty() && !col.equals("0") && !col.equals("0.0") ) {
lnnz++;
}
}
//update counters
_aNnz.add( (double)lnnz );
return line;
}
public Row analyzeRow(Row arg0) throws Exception {
//determine number of non-zeros of row
long lnnz = 0;
if(arg0 != null) {
for(int i = 0; i < arg0.length(); i++) {
if(RowToBinaryBlockFunctionHelper.getDoubleValue(arg0, i) != 0) {
lnnz++;
}
}
}
else {
throw new Exception("Error while analyzing row");
}
//update counters
_aNnz.add( (double)lnnz );
return arg0;
}
public Row analyzeVector(Row row) {
Vector vec = (Vector) row.get(0); // assumption: 1 column DF
long lnnz = 0;
for(int i = 0; i < vec.size(); i++) {
if(vec.apply(i) != 0) {
lnnz++;
}
}
//update counters
_aNnz.add( (double)lnnz );
return row;
}
}
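// Shared helper that converts IJV-style input (MatrixMarket text lines or
// MatrixEntry records) into binary blocks via a bounded-size ReblockBuffer.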
private static class IJVToBinaryBlockFunctionHelper implements Serializable {
private static final long serialVersionUID = -7952801318564745821L;
//internal buffer size (aligned w/ default matrix block size)
private static final int BUFFER_SIZE = 4 * 1000 * 1000; //4M elements (32MB)
private int _bufflen = -1;
private long _rlen = -1;
private long _clen = -1;
private int _brlen = -1;
private int _bclen = -1;
public IJVToBinaryBlockFunctionHelper(MatrixCharacteristics mc) throws DMLRuntimeException
{
if(!mc.dimsKnown()) {
throw new DMLRuntimeException("The dimensions need to be known in given MatrixCharacteristics for given input RDD");
}
_rlen = mc.getRows();
_clen = mc.getCols();
_brlen = mc.getRowsPerBlock();
_bclen = mc.getColsPerBlock();
//determine upper bounded buffer len
_bufflen = (int) Math.min(_rlen*_clen, BUFFER_SIZE);
}
// ----------------------------------------------------
// Can extend this by having type hierarchy
public Tuple2<MatrixIndexes, MatrixCell> textToMatrixCell(Text txt) {
FastStringTokenizer st = new FastStringTokenizer(' ');
//get input string (ignore matrix market comments)
String strVal = txt.toString();
if( strVal.startsWith("%") )
return null;
//parse input ijv triple
st.reset( strVal );
long row = st.nextLong();
long col = st.nextLong();
double val = st.nextDouble();
MatrixIndexes indx = new MatrixIndexes(row, col);
MatrixCell cell = new MatrixCell(val);
return new Tuple2<MatrixIndexes, MatrixCell>(indx, cell);
}
public Tuple2<MatrixIndexes, MatrixCell> matrixEntryToMatrixCell(MatrixEntry entry) {
MatrixIndexes indx = new MatrixIndexes(entry.i(), entry.j());
MatrixCell cell = new MatrixCell(entry.value());
return new Tuple2<MatrixIndexes, MatrixCell>(indx, cell);
}
// ----------------------------------------------------
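// Converts an iterator over IJV records into partial binary blocks, flushing
// the reblock buffer to the output list whenever it reaches capacity.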
Iterable<Tuple2<MatrixIndexes, MatrixBlock>> convertToBinaryBlock(Object arg0, RDDConverterTypes converter) throws Exception {
ArrayList<Tuple2<MatrixIndexes, MatrixBlock>> ret = new ArrayList<Tuple2<MatrixIndexes, MatrixBlock>>();
ReblockBuffer rbuff = new ReblockBuffer(_bufflen, _rlen, _clen, _brlen, _bclen);
Iterator<?> iter = (Iterator<?>) arg0;
while( iter.hasNext() ) {
Tuple2<MatrixIndexes, MatrixCell> cell = null;
switch(converter) {
case MATRIXENTRY_TO_MATRIXCELL:
cell = matrixEntryToMatrixCell((MatrixEntry) iter.next());
break;
case TEXT_TO_MATRIX_CELL:
cell = textToMatrixCell((Text) iter.next());
break;
default:
throw new Exception("Invalid converter for IJV data:" + converter.toString());
}
if(cell == null) {
continue;
}
//flush buffer if necessary
if( rbuff.getSize() >= rbuff.getCapacity() )
flushBufferToList(rbuff, ret);
//add value to reblock buffer
rbuff.appendCell(cell._1.getRowIndex(), cell._1.getColumnIndex(), cell._2.getValue());
}
//final flush buffer
flushBufferToList(rbuff, ret);
return ret;
}
/**
* Flushes the reblock buffer into binary blocks and appends them to the output list.
* @param rbuff reblock buffer to flush
* @param ret output list of (matrix indexes, matrix block) pairs
* @throws IOException if flushing the buffer fails
* @throws DMLRuntimeException if creating the binary blocks fails
*/
private void flushBufferToList( ReblockBuffer rbuff, ArrayList<Tuple2<MatrixIndexes, MatrixBlock>> ret )
throws IOException, DMLRuntimeException
{
//temporary list of indexed matrix values to prevent library dependencies
ArrayList<IndexedMatrixValue> rettmp = new ArrayList<IndexedMatrixValue>();
rbuff.flushBufferToBinaryBlocks(rettmp);
ret.addAll(SparkUtils.fromIndexedMatrixBlock(rettmp));
}
}
/**
* This function maps RDD partitions of CSV (or data frame) rows into sets of
* partial binary blocks.
*
* NOTE: For this CSV-to-binary-block function, we need to hold all output blocks
* per partition in memory. Hence, we keep state for all column blocks and aggregate
* row segments into these blocks. In terms of memory consumption, this is better
* than creating partial blocks of row segments.
*/
private static class RowToBinaryBlockFunctionHelper implements Serializable
{
private static final long serialVersionUID = -4948430402942717043L;
private long _rlen = -1;
private long _clen = -1;
private int _brlen = -1;
private int _bclen = -1;
private String _delim = null;
private boolean _fill = false;
private double _fillValue = 0;
public RowToBinaryBlockFunctionHelper(MatrixCharacteristics mc)
{
_rlen = mc.getRows();
_clen = mc.getCols();
_brlen = mc.getRowsPerBlock();
_bclen = mc.getColsPerBlock();
}
public RowToBinaryBlockFunctionHelper(MatrixCharacteristics mc, String delim, boolean fill, double fillValue)
{
_rlen = mc.getRows();
_clen = mc.getCols();
_brlen = mc.getRowsPerBlock();
_bclen = mc.getColsPerBlock();
_delim = delim;
_fill = fill;
_fillValue = fillValue;
}
boolean emptyFound = false;
// ----------------------------------------------------
public double[] textToDoubleArray(Text row) {
String[] parts = IOUtilFunctions.split(row.toString(), _delim);
double[] ret = new double[parts.length];
int ix = 0;
for(String part : parts) {
emptyFound |= part.isEmpty() && !_fill;
double val = (part.isEmpty() && _fill) ?
_fillValue : Double.parseDouble(part);
ret[ix++] = val;
}
return ret;
}
public double[] rowToDoubleArray(Row row) throws Exception {
double[] ret = new double[row.length()];
for(int i = 0; i < row.length(); i++) {
ret[i] = getDoubleValue(row, i);
}
return ret;
}
public double[] vectorToDoubleArray(Vector arg) throws Exception {
return arg.toDense().values();
}
// ----------------------------------------------------
public Iterable<Tuple2<MatrixIndexes, MatrixBlock>> convertToBinaryBlock(Object arg0, RDDConverterTypes converter)
throws Exception
{
ArrayList<Tuple2<MatrixIndexes, MatrixBlock>> ret = new ArrayList<Tuple2<MatrixIndexes, MatrixBlock>>();
int ncblks = (int)Math.ceil((double)_clen/_bclen);
MatrixIndexes[] ix = new MatrixIndexes[ncblks];
MatrixBlock[] mb = new MatrixBlock[ncblks];
@SuppressWarnings("unchecked")
Iterator<Tuple2<?, Long>> iter = (Iterator<Tuple2<?, Long>>) arg0;
while( iter.hasNext() )
{
Tuple2<?, Long> tmp = iter.next();
// String row = tmp._1();
long rowix = tmp._2() + 1;
long rix = UtilFunctions.computeBlockIndex(rowix, _brlen);
int pos = UtilFunctions.computeCellInBlock(rowix, _brlen);
//create new blocks for entire row
if( ix[0] == null || ix[0].getRowIndex() != rix ) {
if( ix[0] !=null )
flushBlocksToList(ix, mb, ret);
long len = UtilFunctions.computeBlockSize(_rlen, rix, _brlen);
createBlocks(rowix, (int)len, ix, mb);
}
//process row data
emptyFound = false;
double[] parts = null;
switch(converter) {
case TEXT_TO_DOUBLEARR:
parts = textToDoubleArray((Text) tmp._1());
break;
case ROW_TO_DOUBLEARR:
parts = rowToDoubleArray((Row) tmp._1());
break;
case VECTOR_TO_DOUBLEARR:
parts = vectorToDoubleArray((Vector) ((Row) tmp._1()).get(0));
break;
default:
throw new Exception("Invalid converter for row-based data:" + converter.toString());
}
for( int cix=1, pix=0; cix<=ncblks; cix++ )
{
int lclen = (int)UtilFunctions.computeBlockSize(_clen, cix, _bclen);
for( int j=0; j<lclen; j++ )
mb[cix-1].appendValue(pos, j, parts[pix++]);
}
}
//flush last blocks
flushBlocksToList(ix, mb, ret);
return ret;
}
// Creates a new set of empty column blocks for the given global row index.
private void createBlocks(long rowix, int lrlen, MatrixIndexes[] ix, MatrixBlock[] mb)
{
//compute row block index and number of column blocks
long rix = UtilFunctions.computeBlockIndex(rowix, _brlen);
int ncblks = (int)Math.ceil((double)_clen/_bclen);
//create all column blocks (assume dense since csv is dense text format)
for( int cix=1; cix<=ncblks; cix++ ) {
int lclen = (int)UtilFunctions.computeBlockSize(_clen, cix, _bclen);
ix[cix-1] = new MatrixIndexes(rix, cix);
mb[cix-1] = new MatrixBlock(lrlen, lclen, false);
}
}
// Flushes the current set of column blocks to the output list.
private void flushBlocksToList( MatrixIndexes[] ix, MatrixBlock[] mb, ArrayList<Tuple2<MatrixIndexes, MatrixBlock>> ret )
throws DMLRuntimeException
{
int len = ix.length;
for( int i=0; i<len; i++ )
if( mb[i] != null ) {
ret.add(new Tuple2<MatrixIndexes, MatrixBlock>(ix[i],mb[i]));
mb[i].examSparsity(); //ensure right representation
}
}
public static double getDoubleValue(Row row, int index) throws Exception {
try {
return row.getDouble(index);
} catch(Exception e) {
try {
// Causes lock-contention for Java 7
return Double.parseDouble(row.get(index).toString());
}
catch(Exception e1) {
throw new Exception("Only double types are supported as input to SystemML. The input argument is \'" + row.get(index) + "\'");
}
}
}
}
}