/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.sysml.runtime.instructions.spark.utils;

import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Scanner;

import org.apache.hadoop.io.Text;
import org.apache.spark.Accumulator;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.VectorUDT;
import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix;
import org.apache.spark.mllib.linalg.distributed.MatrixEntry;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import scala.Tuple2;

import org.apache.sysml.api.MLOutput.ConvertDoubleArrayToRows;
import org.apache.sysml.api.MLOutput.ProjectRows;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.instructions.spark.functions.ConvertMatrixBlockToIJVLines;
import org.apache.sysml.runtime.io.IOUtilFunctions;
import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.MatrixCell;
import org.apache.sysml.runtime.matrix.data.MatrixIndexes;
import org.apache.sysml.runtime.matrix.mapred.IndexedMatrixValue;
import org.apache.sysml.runtime.matrix.mapred.ReblockBuffer;
import org.apache.sysml.runtime.util.FastStringTokenizer;
import org.apache.sysml.runtime.util.UtilFunctions;

/**
 * NOTE: These are experimental converter utils. Once thoroughly tested, they
 * can be moved to RDDConverterUtils.
 */
@SuppressWarnings("unused")
public class RDDConverterUtilsExt 
{
	public enum RDDConverterTypes {
		TEXT_TO_MATRIX_CELL, 
		MATRIXENTRY_TO_MATRIXCELL,
		TEXT_TO_DOUBLEARR, 
		ROW_TO_DOUBLEARR, 
		VECTOR_TO_DOUBLEARR
	}
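	// How these converter types are dispatched in this class: TEXT_TO_MATRIX_CELL and
	// MATRIXENTRY_TO_MATRIXCELL are handled by IJVToBinaryBlockFunctionHelper, while
	// TEXT_TO_DOUBLEARR, ROW_TO_DOUBLEARR, and VECTOR_TO_DOUBLEARR are handled by
	// RowToBinaryBlockFunctionHelper (see the private helper classes below).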
	
	
	/**
	 * Example usage:
	 * 
	 * <pre><code>
	 * import org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtilsExt
	 * import org.apache.sysml.runtime.matrix.MatrixCharacteristics
	 * import org.apache.spark.api.java.JavaSparkContext
	 * import org.apache.spark.mllib.linalg.distributed.MatrixEntry
	 * import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix
	 * val matRDD = sc.textFile("ratings.text").map(_.split(" ")).map(x => new MatrixEntry(x(0).toLong, x(1).toLong, x(2).toDouble)).filter(_.value != 0).cache
	 * require(matRDD.filter(x => x.i == 0 || x.j == 0).count == 0, "Expected 1-based ratings file")
	 * val nnz = matRDD.count
	 * val numRows = matRDD.map(_.i).max
	 * val numCols = matRDD.map(_.j).max
	 * val coordinateMatrix = new CoordinateMatrix(matRDD, numRows, numCols)
	 * val mc = new MatrixCharacteristics(numRows, numCols, 1000, 1000, nnz)
	 * val binBlocks = RDDConverterUtilsExt.coordinateMatrixToBinaryBlock(new JavaSparkContext(sc), coordinateMatrix, mc, true)
	 * </code></pre>
	 * 
	 * @param sc
	 * @param input
	 * @param mcIn
	 * @param outputEmptyBlocks
	 * @return
	 * @throws DMLRuntimeException
	 */
	public static JavaPairRDD<MatrixIndexes, MatrixBlock> coordinateMatrixToBinaryBlock(JavaSparkContext sc,
			CoordinateMatrix input, MatrixCharacteristics mcIn, boolean outputEmptyBlocks) throws DMLRuntimeException 
	{
		//convert matrix entry rdd to binary block rdd (w/ partial blocks)
		JavaPairRDD<MatrixIndexes, MatrixBlock> out = input.entries().toJavaRDD()
				.mapPartitionsToPair(new MatrixEntryToBinaryBlockFunction(mcIn));
		
		//inject empty blocks (if necessary)
		if( outputEmptyBlocks && mcIn.mightHaveEmptyBlocks() ) {
			out = out.union(
				SparkUtils.getEmptyBlockRDD(sc, mcIn) );
		}
		
		//aggregate partial matrix blocks
		out = RDDAggregateUtils.mergeByKey( out );
		
		return out;
	}
	
	public static JavaPairRDD<MatrixIndexes, MatrixBlock> coordinateMatrixToBinaryBlock(SparkContext sc,
			CoordinateMatrix input, MatrixCharacteristics mcIn, boolean outputEmptyBlocks) throws DMLRuntimeException 
	{
		return coordinateMatrixToBinaryBlock(new JavaSparkContext(sc), input, mcIn, outputEmptyBlocks);
	}
	
	// Useful for printing and testing a binary-blocked RDD, and also for external use.
	public static JavaRDD<String> binaryBlockToStringRDD(JavaPairRDD<MatrixIndexes, MatrixBlock> input,
			MatrixCharacteristics mcIn, String format) throws DMLRuntimeException 
	{
		if(format.compareTo("text") == 0) {
			JavaRDD<String> ijv = input.flatMap(new ConvertMatrixBlockToIJVLines(
					mcIn.getRowsPerBlock(), mcIn.getColsPerBlock()));
			return ijv;
		}
		// else if(format.compareTo("csv") == 0) {
		// 
		// }
		else {
			throw new DMLRuntimeException("The output format:" + format + " is not implemented yet.");
		}
	}
	
	public static JavaPairRDD<MatrixIndexes, MatrixBlock> vectorDataFrameToBinaryBlock(SparkContext sc,
			DataFrame inputDF, MatrixCharacteristics mcOut, boolean containsID, String vectorColumnName)
		throws DMLRuntimeException 
	{
		return vectorDataFrameToBinaryBlock(new JavaSparkContext(sc), inputDF, mcOut, containsID, vectorColumnName);
	}
	
	public static JavaPairRDD<MatrixIndexes, MatrixBlock> vectorDataFrameToBinaryBlock(JavaSparkContext sc,
			DataFrame inputDF, MatrixCharacteristics mcOut, boolean containsID, String vectorColumnName)
		throws DMLRuntimeException 
	{
		if(containsID) {
			inputDF = dropColumn(inputDF.sort("ID"), "ID");
		}
		
		DataFrame df = inputDF.select(vectorColumnName);
		
		//determine unknown dimensions and sparsity if required
		if( !mcOut.dimsKnown(true) ) {
			Accumulator<Double> aNnz = sc.accumulator(0L);
			JavaRDD<Row> tmp = df.javaRDD().map(new DataFrameAnalysisFunction(aNnz, true));
			long rlen = tmp.count();
			long clen = ((Vector) tmp.first().get(0)).size();
			long nnz = UtilFunctions.toLong(aNnz.value());
			mcOut.set(rlen, clen, mcOut.getRowsPerBlock(), mcOut.getColsPerBlock(), nnz);
		}
		
		JavaPairRDD<Row, Long> prepinput = df.javaRDD()
				.zipWithIndex(); //zip row index
		
		//convert vector rdd to binary block rdd (w/ partial blocks)
		JavaPairRDD<MatrixIndexes, MatrixBlock> out = prepinput.mapPartitionsToPair(
				new DataFrameToBinaryBlockFunction(mcOut, true));
		
		//aggregate partial matrix blocks
		out = RDDAggregateUtils.mergeByKey( out );
		
		return out;
	}
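	/*
	 * Usage sketch (illustrative, not part of the original sources): converting a
	 * DataFrame with a single Vector column into a binary-block RDD. The names
	 * jsc, df, and "features" are hypothetical placeholders; -1 marks the matrix
	 * dimensions as unknown so that they are inferred during conversion.
	 *
	 *   MatrixCharacteristics mc = new MatrixCharacteristics(-1, -1, 1000, 1000);
	 *   JavaPairRDD<MatrixIndexes, MatrixBlock> blocks =
	 *       RDDConverterUtilsExt.vectorDataFrameToBinaryBlock(jsc, df, mc, false, "features");
	 */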
	/**
	 * Utility for dropping a given column, needed because DataFrame.drop() is
	 * not exposed in older Spark versions.
	 * 
	 * @param df
	 * @param column
	 * @return
	 * @throws DMLRuntimeException
	 */
	public static DataFrame dropColumn(DataFrame df, String column) throws DMLRuntimeException {
		ArrayList<String> columnToSelect = new ArrayList<String>();
		String firstCol = null;
		boolean colPresent = false;
		for(String col : df.columns()) {
			if(col.compareTo(column) == 0) {
				colPresent = true;
			}
			else if(firstCol == null) {
				firstCol = col;
			}
			else {
				columnToSelect.add(col);
			}
		}
		
		if(!colPresent) {
			throw new DMLRuntimeException("The column \"" + column + "\" is not present in the dataframe.");
		}
		else if(firstCol == null) {
			throw new DMLRuntimeException("No column other than \"" + column + "\" present in the dataframe.");
		}
		
		// Roundabout way to do this in Java (df.drop("ID") is not exposed in Spark 1.3.0)
		return df.select(firstCol, scala.collection.JavaConversions.asScalaBuffer(columnToSelect).toList());
	}
	
	public static DataFrame projectColumns(DataFrame df, ArrayList<String> columns) throws DMLRuntimeException {
		ArrayList<String> columnToSelect = new ArrayList<String>();
		for(int i = 1; i < columns.size(); i++) {
			columnToSelect.add(columns.get(i));
		}
		return df.select(columns.get(0), scala.collection.JavaConversions.asScalaBuffer(columnToSelect).toList());
	}
	
	public static JavaPairRDD<MatrixIndexes, MatrixBlock> dataFrameToBinaryBlock(SparkContext sc,
			DataFrame df, MatrixCharacteristics mcOut, boolean containsID) throws DMLRuntimeException {
		return dataFrameToBinaryBlock(new JavaSparkContext(sc), df, mcOut, containsID, null);
	}
	
	public static JavaPairRDD<MatrixIndexes, MatrixBlock> dataFrameToBinaryBlock(SparkContext sc,
			DataFrame df, MatrixCharacteristics mcOut, String [] columns) throws DMLRuntimeException {
		ArrayList<String> columns1 = new ArrayList<String>(Arrays.asList(columns));
		return dataFrameToBinaryBlock(new JavaSparkContext(sc), df, mcOut, false, columns1);
	}
	
	public static JavaPairRDD<MatrixIndexes, MatrixBlock> dataFrameToBinaryBlock(SparkContext sc,
			DataFrame df, MatrixCharacteristics mcOut, ArrayList<String> columns) throws DMLRuntimeException {
		return dataFrameToBinaryBlock(new JavaSparkContext(sc), df, mcOut, false, columns);
	}
	
	public static JavaPairRDD<MatrixIndexes, MatrixBlock> dataFrameToBinaryBlock(SparkContext sc,
			DataFrame df, MatrixCharacteristics mcOut, boolean containsID, String [] columns) throws DMLRuntimeException {
		ArrayList<String> columns1 = new ArrayList<String>(Arrays.asList(columns));
		return dataFrameToBinaryBlock(new JavaSparkContext(sc), df, mcOut, containsID, columns1);
	}
	
	public static JavaPairRDD<MatrixIndexes, MatrixBlock> dataFrameToBinaryBlock(SparkContext sc,
			DataFrame df, MatrixCharacteristics mcOut, boolean containsID, ArrayList<String> columns) throws DMLRuntimeException {
		return dataFrameToBinaryBlock(new JavaSparkContext(sc), df, mcOut, containsID, columns);
	}
	
	public static JavaPairRDD<MatrixIndexes, MatrixBlock> dataFrameToBinaryBlock(JavaSparkContext sc,
			DataFrame df, MatrixCharacteristics mcOut, boolean containsID) throws DMLRuntimeException {
		return dataFrameToBinaryBlock(sc, df, mcOut, containsID, null);
	}
	
	public static JavaPairRDD<MatrixIndexes, MatrixBlock> dataFrameToBinaryBlock(JavaSparkContext sc,
			DataFrame df, MatrixCharacteristics mcOut, ArrayList<String> columns) throws DMLRuntimeException {
		return dataFrameToBinaryBlock(sc, df, mcOut, false, columns);
	}
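	/*
	 * Usage sketch (illustrative, not part of the original sources): dropping and
	 * projecting columns. df is a hypothetical DataFrame with columns "ID", "C1",
	 * and "C2".
	 *
	 *   DataFrame noID = RDDConverterUtilsExt.dropColumn(df, "ID"); // keeps C1, C2
	 *   ArrayList<String> cols = new ArrayList<String>(Arrays.asList("C1", "C2"));
	 *   DataFrame proj = RDDConverterUtilsExt.projectColumns(df, cols);
	 */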
	/**
	 * Converts a DataFrame into a binary-blocked RDD.
	 * Note: mcOut will be set if the dimensions are unknown.
	 * 
	 * @param sc
	 * @param df
	 * @param mcOut
	 * @param containsID
	 * @param columns
	 * @return
	 * @throws DMLRuntimeException
	 */
	public static JavaPairRDD<MatrixIndexes, MatrixBlock> dataFrameToBinaryBlock(JavaSparkContext sc,
			DataFrame df, MatrixCharacteristics mcOut, boolean containsID, ArrayList<String> columns)
		throws DMLRuntimeException 
	{
		if(columns != null) {
			df = projectColumns(df, columns);
		}
		
		if(containsID) {
			df = dropColumn(df.sort("ID"), "ID");
		}
		
		//determine unknown dimensions and sparsity if required
		if( !mcOut.dimsKnown(true) ) {
			Accumulator<Double> aNnz = sc.accumulator(0L);
			JavaRDD<Row> tmp = df.javaRDD().map(new DataFrameAnalysisFunction(aNnz, false));
			long rlen = tmp.count();
			long clen = containsID ? (df.columns().length - 1) : df.columns().length;
			long nnz = UtilFunctions.toLong(aNnz.value());
			mcOut.set(rlen, clen, mcOut.getRowsPerBlock(), mcOut.getColsPerBlock(), nnz);
		}
		
		JavaPairRDD<Row, Long> prepinput = df.javaRDD()
				.zipWithIndex(); //zip row index
		
		//convert row rdd to binary block rdd (w/ partial blocks)
		JavaPairRDD<MatrixIndexes, MatrixBlock> out = prepinput.mapPartitionsToPair(
				new DataFrameToBinaryBlockFunction(mcOut, false));
		
		//aggregate partial matrix blocks
		out = RDDAggregateUtils.mergeByKey( out );
		
		return out;
	}
	
	public static DataFrame binaryBlockToVectorDataFrame(JavaPairRDD<MatrixIndexes, MatrixBlock> binaryBlockRDD,
			MatrixCharacteristics mc, SQLContext sqlContext) throws DMLRuntimeException 
	{
		long rlen = mc.getRows(); long clen = mc.getCols();
		int brlen = mc.getRowsPerBlock(); int bclen = mc.getColsPerBlock();
		
		// Very expensive operation here: groupByKey (where number of keys might be too large)
		JavaRDD<Row> rowsRDD = binaryBlockRDD.flatMapToPair(new ProjectRows(rlen, clen, brlen, bclen))
				.groupByKey().map(new ConvertDoubleArrayToRows(clen, bclen, true));
		
		int numColumns = (int) clen;
		if(numColumns <= 0) {
			throw new DMLRuntimeException("Output dimensions unknown after executing the script and hence cannot create the dataframe");
		}
		
		List<StructField> fields = new ArrayList<StructField>();
		// LongTypes throw an error: java.lang.Double incompatible with java.lang.Long
		fields.add(DataTypes.createStructField("ID", DataTypes.DoubleType, false));
		fields.add(DataTypes.createStructField("C1", new VectorUDT(), false));
		// fields.add(DataTypes.createStructField("C1", DataTypes.createArrayType(DataTypes.DoubleType), false));
		
		// This will cause infinite recursion due to bug in Spark
		// https://issues.apache.org/jira/browse/SPARK-6999
		// return sqlContext.createDataFrame(rowsRDD, colNames); // where ArrayList<String> colNames
		return sqlContext.createDataFrame(rowsRDD.rdd(), DataTypes.createStructType(fields));
	}
	
	public static class AddRowID implements Function<Tuple2<Row, Long>, Row> {
		private static final long serialVersionUID = -3733816995375745659L;

		@Override
		public Row call(Tuple2<Row, Long> arg0) throws Exception {
			int oldNumCols = arg0._1.length();
			Object [] fields = new Object[oldNumCols + 1];
			for(int i = 0; i < oldNumCols; i++) {
				fields[i] = arg0._1.get(i);
			}
			fields[oldNumCols] = new Double(arg0._2 + 1);
			return RowFactory.create(fields);
		}
	}
	
	public static DataFrame addIDToDataFrame(DataFrame df, SQLContext sqlContext, String nameOfCol) {
		StructField[] oldSchema = df.schema().fields();
		StructField[] newSchema = new StructField[oldSchema.length + 1];
		for(int i = 0; i < oldSchema.length; i++) {
			newSchema[i] = oldSchema[i];
		}
		newSchema[oldSchema.length] = DataTypes.createStructField(nameOfCol, DataTypes.DoubleType, false);
		// JavaRDD<Row> newRows = df.rdd().toJavaRDD().map(new AddRowID());
		JavaRDD<Row> newRows = df.rdd().toJavaRDD().zipWithIndex().map(new AddRowID());
		return sqlContext.createDataFrame(newRows, new StructType(newSchema));
	}
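	/*
	 * Usage sketch (illustrative, not part of the original sources): appending a
	 * 1-based row-ID column and converting the result to binary blocks. The names
	 * jsc, df, and sqlContext are hypothetical placeholders.
	 *
	 *   DataFrame dfWithID = RDDConverterUtilsExt.addIDToDataFrame(df, sqlContext, "ID");
	 *   MatrixCharacteristics mc = new MatrixCharacteristics(-1, -1, 1000, 1000);
	 *   JavaPairRDD<MatrixIndexes, MatrixBlock> blocks =
	 *       RDDConverterUtilsExt.dataFrameToBinaryBlock(jsc, dfWithID, mc, true);
	 */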
	public static DataFrame binaryBlockToDataFrame(JavaPairRDD<MatrixIndexes, MatrixBlock> binaryBlockRDD,
			MatrixCharacteristics mc, SQLContext sqlContext) throws DMLRuntimeException 
	{
		long rlen = mc.getRows(); long clen = mc.getCols();
		int brlen = mc.getRowsPerBlock(); int bclen = mc.getColsPerBlock();
		
		// Very expensive operation here: groupByKey (where number of keys might be too large)
		JavaRDD<Row> rowsRDD = binaryBlockRDD.flatMapToPair(new ProjectRows(rlen, clen, brlen, bclen))
				.groupByKey().map(new ConvertDoubleArrayToRows(clen, bclen, false));
		
		int numColumns = (int) clen;
		if(numColumns <= 0) {
			// numColumns = rowsRDD.first().length() - 1; // Ugly, so instead prefer to throw
			throw new DMLRuntimeException("Output dimensions unknown after executing the script and hence cannot create the dataframe");
		}
		
		List<StructField> fields = new ArrayList<StructField>();
		// LongTypes throw an error: java.lang.Double incompatible with java.lang.Long
		fields.add(DataTypes.createStructField("ID", DataTypes.DoubleType, false));
		for(int i = 1; i <= numColumns; i++) {
			fields.add(DataTypes.createStructField("C" + i, DataTypes.DoubleType, false));
		}
		
		// This will cause infinite recursion due to bug in Spark
		// https://issues.apache.org/jira/browse/SPARK-6999
		// return sqlContext.createDataFrame(rowsRDD, colNames); // where ArrayList<String> colNames
		return sqlContext.createDataFrame(rowsRDD.rdd(), DataTypes.createStructType(fields));
	}
	
	private static class MatrixEntryToBinaryBlockFunction implements PairFlatMapFunction<Iterator<MatrixEntry>,MatrixIndexes,MatrixBlock> {
		private static final long serialVersionUID = 4907483236186747224L;
		
		private IJVToBinaryBlockFunctionHelper helper = null;
		
		public MatrixEntryToBinaryBlockFunction(MatrixCharacteristics mc) throws DMLRuntimeException {
			helper = new IJVToBinaryBlockFunctionHelper(mc);
		}

		@Override
		public Iterable<Tuple2<MatrixIndexes, MatrixBlock>> call(Iterator<MatrixEntry> arg0) throws Exception {
			return helper.convertToBinaryBlock(arg0, RDDConverterTypes.MATRIXENTRY_TO_MATRIXCELL);
		}
	}
	
	private static class DataFrameAnalysisFunction implements Function<Row,Row> {
		private static final long serialVersionUID = 5705371332119770215L;
		
		private RowAnalysisFunctionHelper helper = null;
		boolean isVectorBasedRDD;
		
		public DataFrameAnalysisFunction( Accumulator<Double> aNnz, boolean isVectorBasedRDD) {
			helper = new RowAnalysisFunctionHelper(aNnz);
			this.isVectorBasedRDD = isVectorBasedRDD;
		}

		@Override
		public Row call(Row arg0) throws Exception {
			if(isVectorBasedRDD)
				return helper.analyzeVector(arg0);
			else
				return helper.analyzeRow(arg0);
		}
	}
	
	private static class CSVToBinaryBlockFunction implements PairFlatMapFunction<Iterator<Tuple2<Text,Long>>,MatrixIndexes,MatrixBlock> {
		private static final long serialVersionUID = 1501589201971233542L;
		
		private RowToBinaryBlockFunctionHelper helper = null;
		
		public CSVToBinaryBlockFunction(MatrixCharacteristics mc, String delim, boolean fill, double fillValue) {
			helper = new RowToBinaryBlockFunctionHelper(mc, delim, fill, fillValue);
		}
		
		@Override
		public Iterable<Tuple2<MatrixIndexes, MatrixBlock>> call(Iterator<Tuple2<Text, Long>> arg0) throws Exception {
			return helper.convertToBinaryBlock(arg0, RDDConverterTypes.TEXT_TO_DOUBLEARR);
		}
	}
	private static class DataFrameToBinaryBlockFunction implements PairFlatMapFunction<Iterator<Tuple2<Row,Long>>,MatrixIndexes,MatrixBlock> {
		private static final long serialVersionUID = 653447740362447236L;
		
		private RowToBinaryBlockFunctionHelper helper = null;
		boolean isVectorBasedDF;
		
		public DataFrameToBinaryBlockFunction(MatrixCharacteristics mc, boolean isVectorBasedDF) {
			helper = new RowToBinaryBlockFunctionHelper(mc);
			this.isVectorBasedDF = isVectorBasedDF;
		}
		
		@Override
		public Iterable<Tuple2<MatrixIndexes, MatrixBlock>> call(Iterator<Tuple2<Row, Long>> arg0) throws Exception {
			if(isVectorBasedDF)
				return helper.convertToBinaryBlock(arg0, RDDConverterTypes.VECTOR_TO_DOUBLEARR);
			else
				return helper.convertToBinaryBlock(arg0, RDDConverterTypes.ROW_TO_DOUBLEARR);
		}
	}
	
	private static class RowAnalysisFunctionHelper implements Serializable 
	{
		private static final long serialVersionUID = 2310303223289674477L;

		private Accumulator<Double> _aNnz = null;
		private String _delim = null;
		
		public RowAnalysisFunctionHelper( Accumulator<Double> aNnz ) {
			_aNnz = aNnz;
		}
		
		public RowAnalysisFunctionHelper( Accumulator<Double> aNnz, String delim ) {
			_aNnz = aNnz;
			_delim = delim;
		}
		
		public String analyzeText(Text v1) throws Exception {
			//parse input line
			String line = v1.toString();
			String[] cols = IOUtilFunctions.split(line, _delim);
			
			//determine number of non-zeros of row (w/o string parsing)
			long lnnz = 0;
			for( String col : cols ) {
				if( !col.isEmpty() && !col.equals("0") && !col.equals("0.0") ) {
					lnnz++;
				}
			}
			
			//update counters
			_aNnz.add( (double)lnnz );
			return line;
		}
		
		public Row analyzeRow(Row arg0) throws Exception {
			//determine number of non-zeros of row
			long lnnz = 0;
			if(arg0 != null) {
				for(int i = 0; i < arg0.length(); i++) {
					if(RowToBinaryBlockFunctionHelper.getDoubleValue(arg0, i) != 0) {
						lnnz++;
					}
				}
			}
			else {
				throw new Exception("Error while analyzing row");
			}
			
			//update counters
			_aNnz.add( (double)lnnz );
			return arg0;
		}
		
		public Row analyzeVector(Row row) {
			Vector vec = (Vector) row.get(0); // assumption: 1 column DF
			long lnnz = 0;
			for(int i = 0; i < vec.size(); i++) {
				if(vec.apply(i) != 0) {
					lnnz++;
				}
			}
			
			//update counters
			_aNnz.add( (double)lnnz );
			return row;
		}
	}
	private static class IJVToBinaryBlockFunctionHelper implements Serializable 
	{
		private static final long serialVersionUID = -7952801318564745821L;
		
		//internal buffer size (aligned w/ default matrix block size)
		private static final int BUFFER_SIZE = 4 * 1000 * 1000; //4M elements (32MB)
		private int _bufflen = -1;
		
		private long _rlen = -1;
		private long _clen = -1;
		private int _brlen = -1;
		private int _bclen = -1;
		
		public IJVToBinaryBlockFunctionHelper(MatrixCharacteristics mc) throws DMLRuntimeException {
			if(!mc.dimsKnown()) {
				throw new DMLRuntimeException("The dimensions need to be known in given MatrixCharacteristics for given input RDD");
			}
			_rlen = mc.getRows();
			_clen = mc.getCols();
			_brlen = mc.getRowsPerBlock();
			_bclen = mc.getColsPerBlock();
			
			//determine upper bounded buffer len
			_bufflen = (int) Math.min(_rlen*_clen, BUFFER_SIZE);
		}
		
		// ----------------------------------------------------
		// Can extend this by having type hierarchy
		public Tuple2<MatrixIndexes, MatrixCell> textToMatrixCell(Text txt) {
			FastStringTokenizer st = new FastStringTokenizer(' ');
			//get input string (ignore matrix market comments)
			String strVal = txt.toString();
			if( strVal.startsWith("%") )
				return null;
			
			//parse input ijv triple
			st.reset( strVal );
			long row = st.nextLong();
			long col = st.nextLong();
			double val = st.nextDouble();
			MatrixIndexes indx = new MatrixIndexes(row, col);
			MatrixCell cell = new MatrixCell(val);
			return new Tuple2<MatrixIndexes, MatrixCell>(indx, cell);
		}
		
		public Tuple2<MatrixIndexes, MatrixCell> matrixEntryToMatrixCell(MatrixEntry entry) {
			MatrixIndexes indx = new MatrixIndexes(entry.i(), entry.j());
			MatrixCell cell = new MatrixCell(entry.value());
			return new Tuple2<MatrixIndexes, MatrixCell>(indx, cell);
		}
		
		// ----------------------------------------------------
		
		Iterable<Tuple2<MatrixIndexes, MatrixBlock>> convertToBinaryBlock(Object arg0, RDDConverterTypes converter) throws Exception {
			ArrayList<Tuple2<MatrixIndexes,MatrixBlock>> ret = new ArrayList<Tuple2<MatrixIndexes,MatrixBlock>>();
			ReblockBuffer rbuff = new ReblockBuffer(_bufflen, _rlen, _clen, _brlen, _bclen);
			
			Iterator<?> iter = (Iterator<?>) arg0;
			while( iter.hasNext() ) {
				Tuple2<MatrixIndexes, MatrixCell> cell = null;
				switch(converter) {
					case MATRIXENTRY_TO_MATRIXCELL:
						cell = matrixEntryToMatrixCell((MatrixEntry) iter.next());
						break;
					
					case TEXT_TO_MATRIX_CELL:
						cell = textToMatrixCell((Text) iter.next());
						break;
					
					default:
						throw new Exception("Invalid converter for IJV data:" + converter.toString());
				}
				
				if(cell == null) {
					continue;
				}
				
				//flush buffer if necessary
				if( rbuff.getSize() >= rbuff.getCapacity() )
					flushBufferToList(rbuff, ret);
				
				//add value to reblock buffer
				rbuff.appendCell(cell._1.getRowIndex(), cell._1.getColumnIndex(), cell._2.getValue());
			}
			
			//final flush buffer
			flushBufferToList(rbuff, ret);
			
			return ret;
		}
		
		/**
		 * 
		 * @param rbuff
		 * @param ret
		 * @throws IOException
		 * @throws DMLRuntimeException
		 */
		private void flushBufferToList( ReblockBuffer rbuff, ArrayList<Tuple2<MatrixIndexes,MatrixBlock>> ret )
			throws IOException, DMLRuntimeException
		{
			//temporary list of indexed matrix values to prevent library dependencies
			ArrayList<IndexedMatrixValue> rettmp = new ArrayList<IndexedMatrixValue>();
			rbuff.flushBufferToBinaryBlocks(rettmp);
			ret.addAll(SparkUtils.fromIndexedMatrixBlock(rettmp));
		}
	}
	/**
	 * This function allows mapping RDD partitions of csv rows into a set of partial binary blocks.
	 * 
	 * NOTE: For this csv to binary block function, we need to hold all output blocks per partition
	 * in-memory. Hence, we keep state of all column blocks and aggregate row segments into these blocks.
	 * In terms of memory consumption this is better than creating partial blocks of row segments.
	 */
	private static class RowToBinaryBlockFunctionHelper implements Serializable 
	{
		private static final long serialVersionUID = -4948430402942717043L;
		
		private long _rlen = -1;
		private long _clen = -1;
		private int _brlen = -1;
		private int _bclen = -1;
		private String _delim = null;
		private boolean _fill = false;
		private double _fillValue = 0;
		
		public RowToBinaryBlockFunctionHelper(MatrixCharacteristics mc) {
			_rlen = mc.getRows();
			_clen = mc.getCols();
			_brlen = mc.getRowsPerBlock();
			_bclen = mc.getColsPerBlock();
		}
		
		public RowToBinaryBlockFunctionHelper(MatrixCharacteristics mc, String delim, boolean fill, double fillValue) {
			_rlen = mc.getRows();
			_clen = mc.getCols();
			_brlen = mc.getRowsPerBlock();
			_bclen = mc.getColsPerBlock();
			_delim = delim;
			_fill = fill;
			_fillValue = fillValue;
		}
		
		boolean emptyFound = false;
		
		// ----------------------------------------------------
		public double[] textToDoubleArray(Text row) {
			String[] parts = IOUtilFunctions.split(row.toString(), _delim);
			double[] ret = new double[parts.length];
			int ix = 0;
			for(String part : parts) {
				emptyFound |= part.isEmpty() && !_fill;
				double val = (part.isEmpty() && _fill) ? _fillValue : Double.parseDouble(part);
				ret[ix++] = val;
			}
			return ret;
		}
		
		public double[] rowToDoubleArray(Row row) throws Exception {
			double[] ret = new double[row.length()];
			for(int i = 0; i < row.length(); i++) {
				ret[i] = getDoubleValue(row, i);
			}
			return ret;
		}
		
		public double[] vectorToDoubleArray(Vector arg) throws Exception {
			return arg.toDense().values();
		}
		// ----------------------------------------------------
		
		public Iterable<Tuple2<MatrixIndexes, MatrixBlock>> convertToBinaryBlock(Object arg0, RDDConverterTypes converter) throws Exception {
			ArrayList<Tuple2<MatrixIndexes,MatrixBlock>> ret = new ArrayList<Tuple2<MatrixIndexes,MatrixBlock>>();
			
			int ncblks = (int)Math.ceil((double)_clen/_bclen);
			MatrixIndexes[] ix = new MatrixIndexes[ncblks];
			MatrixBlock[] mb = new MatrixBlock[ncblks];
			
			@SuppressWarnings("unchecked")
			Iterator<Tuple2<?, Long>> iter = (Iterator<Tuple2<?, Long>>) arg0;
			while( iter.hasNext() )
			{
				Tuple2<?, Long> tmp = iter.next();
				// String row = tmp._1();
				long rowix = tmp._2() + 1;
				
				long rix = UtilFunctions.computeBlockIndex(rowix, _brlen);
				int pos = UtilFunctions.computeCellInBlock(rowix, _brlen);
				
				//create new blocks for entire row
				if( ix[0] == null || ix[0].getRowIndex() != rix ) {
					if( ix[0] != null )
						flushBlocksToList(ix, mb, ret);
					long len = UtilFunctions.computeBlockSize(_rlen, rix, _brlen);
					createBlocks(rowix, (int)len, ix, mb);
				}
				
				//process row data
				emptyFound = false;
				double[] parts = null;
				switch(converter) {
					case TEXT_TO_DOUBLEARR:
						parts = textToDoubleArray((Text) tmp._1());
						break;
					case ROW_TO_DOUBLEARR:
						parts = rowToDoubleArray((Row) tmp._1());
						break;
					case VECTOR_TO_DOUBLEARR:
						parts = vectorToDoubleArray((Vector) ((Row) tmp._1()).get(0));
						break;
					default:
						throw new Exception("Invalid converter for row-based data:" + converter.toString());
				}
				
				//append row values to the column blocks of the current row block
				for( int cix=1, pix=0; cix<=ncblks; cix++ ) {
					int lclen = (int)UtilFunctions.computeBlockSize(_clen, cix, _bclen);
					for( int j=0; j<lclen; j++ ) {
						mb[cix-1].appendValue(pos, j, parts[pix++]);
					}
				}
			}
			
			//flush last blocks
			flushBlocksToList(ix, mb, ret);
			
			return ret;
		}
		
		// Creates new state of empty column blocks for current global row index.
		private void createBlocks(long rowix, int lrlen, MatrixIndexes[] ix, MatrixBlock[] mb) {
			//compute row block index and number of column blocks
			long rix = UtilFunctions.computeBlockIndex(rowix, _brlen);
			int ncblks = (int)Math.ceil((double)_clen/_bclen);
			
			//create all column blocks (assume dense since csv is a dense text format)
			for( int cix=1; cix<=ncblks; cix++ ) {
				int lclen = (int)UtilFunctions.computeBlockSize(_clen, cix, _bclen);
				ix[cix-1] = new MatrixIndexes(rix, cix);
				mb[cix-1] = new MatrixBlock(lrlen, lclen, false);
			}
		}
		
		// Flushes current state of filled column blocks to output list.
		private void flushBlocksToList( MatrixIndexes[] ix, MatrixBlock[] mb, ArrayList<Tuple2<MatrixIndexes,MatrixBlock>> ret )
			throws DMLRuntimeException
		{
			int len = ix.length;
			for( int i=0; i<len; i++ )
				if( mb[i] != null ) {
					ret.add(new Tuple2<MatrixIndexes,MatrixBlock>(ix[i],mb[i]));
					mb[i].examSparsity(); //ensure right representation
				}
		}
		
		public static double getDoubleValue(Row row, int index) throws Exception {
			try {
				return row.getDouble(index);
			} catch(Exception e) {
				try {
					// Causes lock-contention for Java 7
					return Double.parseDouble(row.get(index).toString());
				}
				catch(Exception e1) {
					throw new Exception("Only double types are supported as input to SystemML. The input argument is \'" + row.get(index) + "\'");
				}
			}
		}
	}
}
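/*
 * Round-trip sketch (illustrative, not part of the original sources): converting
 * a binary-block RDD back to a DataFrame with an "ID" column plus C1..Cn double
 * columns, or to an IJV text RDD. The names blocks, mc, and sqlContext are the
 * hypothetical placeholders from the sketches above.
 *
 *   DataFrame result = RDDConverterUtilsExt.binaryBlockToDataFrame(blocks, mc, sqlContext);
 *   JavaRDD<String> ijv = RDDConverterUtilsExt.binaryBlockToStringRDD(blocks, mc, "text");
 */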



