/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.sysml.runtime.controlprogram.caching;
import java.io.IOException;
import java.lang.ref.SoftReference;
import org.apache.commons.lang.mutable.MutableBoolean;
import org.apache.sysml.api.DMLScript;
import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM;
import org.apache.sysml.conf.ConfigurationManager;
import org.apache.sysml.hops.OptimizerUtils;
import org.apache.sysml.lops.Lop;
import org.apache.sysml.parser.Expression.DataType;
import org.apache.sysml.parser.Expression.ValueType;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.ParForProgramBlock.PDataPartitionFormat;
import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext;
import org.apache.sysml.runtime.instructions.spark.data.RDDObject;
import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
import org.apache.sysml.runtime.matrix.MatrixDimensionsMetaData;
import org.apache.sysml.runtime.matrix.MatrixFormatMetaData;
import org.apache.sysml.runtime.matrix.MetaData;
import org.apache.sysml.runtime.matrix.data.FileFormatProperties;
import org.apache.sysml.runtime.matrix.data.InputInfo;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.OutputInfo;
import org.apache.sysml.runtime.util.DataConverter;
import org.apache.sysml.runtime.util.IndexRange;
import org.apache.sysml.runtime.util.MapReduceTool;
/**
* Represents a matrix in the control program. This class contains methods to
* read matrices from HDFS and convert them to a specific format/representation.
* It can also write several formats/representations of matrices to HDFS.
* IMPORTANT: Preserve the one-to-one correspondence between {@link MatrixObject}
* and {@link MatrixBlock} objects, for caching purposes. Do not change a
* {@link MatrixBlock} object without informing its {@link MatrixObject} object.
*
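* A minimal construction sketch (hedged: path, sizes, and variable names are
* hypothetical, and a (rows, cols, brlen, bclen, nnz) characteristics
* constructor is assumed; the metadata wiring mirrors the copy constructor below):
* <pre>{@code
* MatrixCharacteristics mc = new MatrixCharacteristics(1000, 100, 1000, 1000, -1);
* MetaData md = new MatrixFormatMetaData(mc,
*     OutputInfo.BinaryBlockOutputInfo, InputInfo.BinaryBlockInputInfo);
* MatrixObject mo = new MatrixObject(ValueType.DOUBLE, "hdfs:/tmp/X", md);
* }</pre>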
*/
public class MatrixObject extends CacheableData
{
private static final long serialVersionUID = 6374712373206495637L;
public enum UpdateType {
COPY,
INPLACE,
INPLACE_PINNED;
public boolean isInPlace() {
return (this != COPY);
}
}
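//e.g., UpdateType.COPY.isInPlace() returns false, while INPLACE and
//INPLACE_PINNED both return true and hence permit in-place block updates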
//additional matrix-specific flags
private UpdateType _updateType = UpdateType.COPY;
//information relevant to partitioned matrices.
private boolean _partitioned = false; //indicates if obj partitioned
private PDataPartitionFormat _partitionFormat = null; //indicates how obj partitioned
private int _partitionSize = -1; //indicates n for BLOCKWISE_N
private String _partitionCacheName = null; //name of cache block
private MatrixBlock _partitionInMemory = null;
/**
* Constructor that takes the value type and the HDFS filename.
*
* @param vt value type
* @param file file name
*/
public MatrixObject (ValueType vt, String file) {
this (vt, file, null); //HDFS file path
}
/**
* Constructor that takes the value type, HDFS filename and associated metadata.
*
* @param vt value type
* @param file file name
* @param mtd metadata
*/
public MatrixObject( ValueType vt, String file, MetaData mtd ) {
super (DataType.MATRIX, vt);
_metaData = mtd;
_hdfsFileName = file;
_cache = null;
_data = null;
}
/**
* Copy constructor that copies meta data but NO data.
*
* @param mo matrix object
*/
public MatrixObject( MatrixObject mo )
{
//base copy constructor
super(mo);
MatrixFormatMetaData metaOld = (MatrixFormatMetaData)mo.getMetaData();
_metaData = new MatrixFormatMetaData(new MatrixCharacteristics(metaOld.getMatrixCharacteristics()),
metaOld.getOutputInfo(), metaOld.getInputInfo());
_updateType = mo._updateType;
_partitioned = mo._partitioned;
_partitionFormat = mo._partitionFormat;
_partitionSize = mo._partitionSize;
_partitionCacheName = mo._partitionCacheName;
}
public void setUpdateType(UpdateType flag) {
_updateType = flag;
}
public UpdateType getUpdateType() {
return _updateType;
}
@Override
public void updateMatrixCharacteristics (MatrixCharacteristics mc) {
((MatrixDimensionsMetaData)_metaData).setMatrixCharacteristics( mc );
}
/**
* Make the matrix metadata consistent with the in-memory matrix data
*
* @throws CacheException if CacheException occurs
*/
@Override
public void refreshMetaData()
throws CacheException
{
if( _data == null || _metaData == null ) //refresh only for existing data
throw new CacheException("Cannot refresh meta data because there is no data or meta data. ");
//we need to throw an exception, otherwise input/output format cannot be inferred
MatrixCharacteristics mc = ((MatrixDimensionsMetaData) _metaData).getMatrixCharacteristics();
mc.setDimension( _data.getNumRows(), _data.getNumColumns() );
mc.setNonZeros( _data.getNonZeros() );
}
public long getNumRows() {
MatrixCharacteristics mc = getMatrixCharacteristics();
return mc.getRows();
}
public long getNumColumns() {
MatrixCharacteristics mc = getMatrixCharacteristics();
return mc.getCols();
}
public long getNumRowsPerBlock() {
MatrixCharacteristics mc = getMatrixCharacteristics();
return mc.getRowsPerBlock();
}
public long getNumColumnsPerBlock() {
MatrixCharacteristics mc = getMatrixCharacteristics();
return mc.getColsPerBlock();
}
public long getNnz() {
MatrixCharacteristics mc = getMatrixCharacteristics();
return mc.getNonZeros();
}
public double getSparsity() {
MatrixCharacteristics mc = getMatrixCharacteristics();
return OptimizerUtils.getSparsity(mc);
}
// *********************************************
// *** ***
// *** HIGH-LEVEL PUBLIC METHODS ***
// *** FOR PARTITIONED MATRIX ACCESS ***
// *** (all other methods still usable) ***
// *** ***
// *********************************************
public void setPartitioned( PDataPartitionFormat format, int n )
{
_partitioned = true;
_partitionFormat = format;
_partitionSize = n;
}
public void unsetPartitioned()
{
_partitioned = false;
_partitionFormat = null;
_partitionSize = -1;
}
public boolean isPartitioned()
{
return _partitioned;
}
public PDataPartitionFormat getPartitionFormat()
{
return _partitionFormat;
}
public int getPartitionSize()
{
return _partitionSize;
}
public synchronized void setInMemoryPartition(MatrixBlock block)
{
_partitionInMemory = block;
}
/**
* NOTE: for reading matrix partitions, we could cache (in its real sense) the read block
* with soft references (no need for eviction, as partitioning is only applied to read-only matrices).
* However, since we currently only support row-wise and column-wise partitioning, caching is not applied yet.
* This could be changed once we also support column-block-wise and row-block-wise partitioning. Furthermore,
* as we refuse to partition vectors and support only full row or column indexing, no metadata (apart from
* the partition flag) is required.
*
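* A usage sketch (hedged: names are illustrative, and the index-range
* constructor is assumed to take (rowStart, rowEnd, colStart, colEnd),
* matching the fields accessed in this method):
* <pre>{@code
* mo.setPartitioned(PDataPartitionFormat.ROW_WISE, -1); //n unused for ROW_WISE
* IndexRange pred = new IndexRange(5, 5, 1, mo.getNumColumns()); //5th row
* MatrixBlock row = mo.readMatrixPartition(pred);
* }</pre>
*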
* @param pred index range
* @return matrix block
* @throws CacheException if CacheException occurs
*/
public synchronized MatrixBlock readMatrixPartition( IndexRange pred )
throws CacheException
{
if( LOG.isTraceEnabled() )
LOG.trace("Acquire partition "+getVarName()+" "+pred);
long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
if ( !_partitioned )
throw new CacheException ("MatrixObject not available for indexed read.");
//return static partition if set from outside of the program
if( _partitionInMemory != null )
return _partitionInMemory;
MatrixBlock mb = null;
try
{
boolean blockwise = (_partitionFormat==PDataPartitionFormat.ROW_BLOCK_WISE || _partitionFormat==PDataPartitionFormat.COLUMN_BLOCK_WISE);
//preparations for block wise access
MatrixFormatMetaData iimd = (MatrixFormatMetaData) _metaData;
MatrixCharacteristics mc = iimd.getMatrixCharacteristics();
int brlen = mc.getRowsPerBlock();
int bclen = mc.getColsPerBlock();
//get filename depending on format
String fname = getPartitionFileName( pred, brlen, bclen );
//probe cache
if( blockwise && _partitionCacheName != null && _partitionCacheName.equals(fname) )
{
mb = _cache.get(); //try getting block from cache
}
if( mb == null ) //block not in cache
{
//get rows and cols
long rows = -1;
long cols = -1;
switch( _partitionFormat )
{
case ROW_WISE:
rows = 1;
cols = mc.getCols();
break;
case ROW_BLOCK_WISE:
rows = brlen;
cols = mc.getCols();
break;
case ROW_BLOCK_WISE_N:
rows = _partitionSize;
cols = mc.getCols();
break;
case COLUMN_WISE:
rows = mc.getRows();
cols = 1;
break;
case COLUMN_BLOCK_WISE:
rows = mc.getRows();
cols = bclen;
break;
case COLUMN_BLOCK_WISE_N:
rows = mc.getRows();
cols = _partitionSize;
break;
default:
throw new CacheException("Unsupported partition format: "+_partitionFormat);
}
//read the partition from HDFS (create an empty block if the file does not exist)
if( MapReduceTool.existsFileOnHDFS(fname) )
mb = readBlobFromHDFS( fname, rows, cols );
else
{
mb = new MatrixBlock((int)rows, (int)cols, true);
LOG.warn("Reading empty matrix partition "+fname);
}
}
//post processing
if( blockwise )
{
//put block into cache
_partitionCacheName = fname;
_cache = new SoftReference(mb);
if( _partitionFormat == PDataPartitionFormat.ROW_BLOCK_WISE )
{
int rix = (int)((pred.rowStart-1)%brlen);
mb = mb.sliceOperations(rix, rix, (int)(pred.colStart-1), (int)(pred.colEnd-1), new MatrixBlock());
}
if( _partitionFormat == PDataPartitionFormat.COLUMN_BLOCK_WISE )
{
int cix = (int)((pred.colStart-1)%bclen);
mb = mb.sliceOperations((int)(pred.rowStart-1), (int)(pred.rowEnd-1), cix, cix, new MatrixBlock());
}
}
//NOTE: currently no special treatment of non-existing partitions necessary
// because empty blocks are written anyway
}
catch(Exception ex)
{
throw new CacheException(ex);
}
if( DMLScript.STATISTICS ){
long t1 = System.nanoTime();
CacheStatistics.incrementAcquireRTime(t1-t0);
}
return mb;
}
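//Worked example of the name mapping implemented below (values hypothetical):
//for ROW_BLOCK_WISE with brlen=1000 and pred.rowStart=1500, the 1-based
//partition index is (1500-1)/1000+1 = 2, i.e., the partition file is
//_hdfsFileName + Lop.FILE_SEPARATOR + "2".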
public String getPartitionFileName( IndexRange pred, int brlen, int bclen )
throws CacheException
{
if ( !_partitioned )
throw new CacheException ("MatrixObject not available for indexed read.");
StringBuilder sb = new StringBuilder();
sb.append(_hdfsFileName);
switch( _partitionFormat )
{
case ROW_WISE:
sb.append(Lop.FILE_SEPARATOR);
sb.append(pred.rowStart);
break;
case ROW_BLOCK_WISE:
sb.append(Lop.FILE_SEPARATOR);
sb.append((pred.rowStart-1)/brlen+1);
break;
case ROW_BLOCK_WISE_N:
sb.append(Lop.FILE_SEPARATOR);
sb.append((pred.rowStart-1)/_partitionSize+1);
break;
case COLUMN_WISE:
sb.append(Lop.FILE_SEPARATOR);
sb.append(pred.colStart);
break;
case COLUMN_BLOCK_WISE:
sb.append(Lop.FILE_SEPARATOR);
sb.append((pred.colStart-1)/bclen+1);
break;
case COLUMN_BLOCK_WISE_N:
sb.append(Lop.FILE_SEPARATOR);
sb.append((pred.colStart-1)/_partitionSize+1);
break;
default:
throw new CacheException ("MatrixObject not available to indexed read.");
}
return sb.toString();
}
// *********************************************
// *** ***
// *** LOW-LEVEL PROTECTED METHODS ***
// *** EXTEND CACHEABLE DATA ***
// *** ONLY CALLED BY THE SUPERCLASS ***
// *** ***
// *********************************************
@Override
protected boolean isBelowCachingThreshold() {
return super.isBelowCachingThreshold()
|| getUpdateType() == UpdateType.INPLACE_PINNED;
}
@Override
protected MatrixBlock readBlobFromCache(String fname) throws IOException {
return (MatrixBlock)LazyWriteBuffer.readBlock(fname, true);
}
@Override
protected MatrixBlock readBlobFromHDFS(String fname, long rlen, long clen)
throws IOException
{
MatrixFormatMetaData iimd = (MatrixFormatMetaData) _metaData;
MatrixCharacteristics mc = iimd.getMatrixCharacteristics();
long begin = 0;
if( LOG.isTraceEnabled() ) {
LOG.trace("Reading matrix from HDFS... " + getVarName() + " Path: " + fname
+ ", dimensions: [" + mc.getRows() + ", " + mc.getCols() + ", " + mc.getNonZeros() + "]");
begin = System.currentTimeMillis();
}
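//estimate read sparsity from metadata; e.g., 10,000 non-zeros in a
//1,000 x 1,000 matrix yield 0.01, while unknown nnz (-1) conservatively
//assumes a dense read (sparsity 1.0)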
double sparsity = ( mc.getNonZeros() >= 0 ? ((double)mc.getNonZeros())/(mc.getRows()*mc.getCols()) : 1.0d) ;
MatrixBlock newData = DataConverter.readMatrixFromHDFS(fname, iimd.getInputInfo(), rlen, clen,
mc.getRowsPerBlock(), mc.getColsPerBlock(), sparsity, getFileFormatProperties());
//sanity check correct output
if( newData == null )
throw new IOException("Unable to load matrix from file: "+fname);
if( LOG.isTraceEnabled() )
LOG.trace("Reading Completed: " + (System.currentTimeMillis()-begin) + " msec.");
return newData;
}
@Override
protected MatrixBlock readBlobFromRDD(RDDObject rdd, MutableBoolean writeStatus)
throws IOException
{
//note: the read of a matrix block from an RDD might trigger
//lazy evaluation of pending transformations.
RDDObject lrdd = rdd;
//prepare return status (by default only collect)
writeStatus.setValue(false);
MatrixFormatMetaData iimd = (MatrixFormatMetaData) _metaData;
MatrixCharacteristics mc = iimd.getMatrixCharacteristics();
InputInfo ii = iimd.getInputInfo();
MatrixBlock mb = null;
try
{
//prevent unnecessary collect through rdd checkpoint
if( rdd.allowsShortCircuitCollect() ) {
lrdd = (RDDObject)rdd.getLineageChilds().get(0);
}
//obtain matrix block from RDD
int rlen = (int)mc.getRows();
int clen = (int)mc.getCols();
int brlen = (int)mc.getRowsPerBlock();
int bclen = (int)mc.getColsPerBlock();
long nnz = mc.getNonZeros();
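//three collect paths are distinguished below: (1) a binary block RDD that
//would exceed the collect memory budget is first written to HDFS and read
//back, (2) a binary cell RDD is collected directly, and (3) a binary block
//RDD within budget is collected directly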
//guarded rdd collect
if( ii == InputInfo.BinaryBlockInputInfo && //guarded collect not for binary cell
!OptimizerUtils.checkSparkCollectMemoryBudget(mc, getPinnedSize()+getBroadcastSize()) ) {
//write RDD to HDFS and read it back to prevent invalid collect memory consumption
//note: lazy, partition-at-a-time collect (toLocalIterator) was significantly slower
if( !MapReduceTool.existsFileOnHDFS(_hdfsFileName) ) { //prevent overwriting an existing file
long newnnz = SparkExecutionContext.writeRDDtoHDFS(lrdd, _hdfsFileName, iimd.getOutputInfo());
((MatrixDimensionsMetaData) _metaData).getMatrixCharacteristics().setNonZeros(newnnz);
((RDDObject)rdd).setHDFSFile(true); //mark rdd as hdfs file (for restore)
writeStatus.setValue(true); //mark for no cache-write on read
}
mb = readBlobFromHDFS(_hdfsFileName);
}
else if( ii == InputInfo.BinaryCellInputInfo ) {
//collect matrix block from binary cell RDD
mb = SparkExecutionContext.toMatrixBlock(lrdd, rlen, clen, nnz);
}
else {
//collect matrix block from binary block RDD
mb = SparkExecutionContext.toMatrixBlock(lrdd, rlen, clen, brlen, bclen, nnz);
}
}
catch(DMLRuntimeException ex) {
throw new IOException(ex);
}
//sanity check correct output
if( mb == null ) {
throw new IOException("Unable to load matrix from rdd: "+lrdd.getVarName());
}
return mb;
}
/**
* Writes the in-memory matrix to HDFS in the specified format.
*/
@Override
protected void writeBlobToHDFS(String fname, String ofmt, int rep, FileFormatProperties fprop)
throws IOException, DMLRuntimeException
{
long begin = 0;
if( LOG.isTraceEnabled() ){
LOG.trace (" Writing matrix to HDFS... " + getVarName() + " Path: " + fname + ", Format: " +
(ofmt != null ? ofmt : "inferred from metadata"));
begin = System.currentTimeMillis();
}
MatrixFormatMetaData iimd = (MatrixFormatMetaData) _metaData;
if (_data != null)
{
// Get the dimension information from the metadata stored within MatrixObject
MatrixCharacteristics mc = iimd.getMatrixCharacteristics ();
// Write the matrix to HDFS in requested format
OutputInfo oinfo = (ofmt != null ? OutputInfo.stringToOutputInfo (ofmt) :
InputInfo.getMatchingOutputInfo (iimd.getInputInfo ()));
// when outputFormat is binaryblock, make sure that matrixCharacteristics has correct blocking dimensions
// note: this is only required if singlenode (due to binarycell default)
if ( oinfo == OutputInfo.BinaryBlockOutputInfo && DMLScript.rtplatform == RUNTIME_PLATFORM.SINGLE_NODE &&
(mc.getRowsPerBlock() != ConfigurationManager.getBlocksize() || mc.getColsPerBlock() != ConfigurationManager.getBlocksize()) )
{
DataConverter.writeMatrixToHDFS(_data, fname, oinfo, new MatrixCharacteristics(mc.getRows(), mc.getCols(), ConfigurationManager.getBlocksize(), ConfigurationManager.getBlocksize(), mc.getNonZeros()), rep, fprop);
}
else {
DataConverter.writeMatrixToHDFS(_data, fname, oinfo, mc, rep, fprop);
}
if( LOG.isTraceEnabled() )
LOG.trace("Writing matrix to HDFS ("+fname+") - COMPLETED... " + (System.currentTimeMillis()-begin) + " msec.");
}
else if( LOG.isTraceEnabled() ) {
LOG.trace ("Writing matrix to HDFS ("+fname+") - NOTHING TO WRITE (_data == null).");
}
if( DMLScript.STATISTICS )
CacheStatistics.incrementHDFSWrites();
}
@Override
protected void writeBlobFromRDDtoHDFS(RDDObject rdd, String fname, String outputFormat)
throws IOException, DMLRuntimeException
{
//prepare output info
MatrixFormatMetaData iimd = (MatrixFormatMetaData) _metaData;
OutputInfo oinfo = (outputFormat != null ? OutputInfo.stringToOutputInfo (outputFormat)
: InputInfo.getMatchingOutputInfo (iimd.getInputInfo ()));
//note: the write of an RDD to HDFS might trigger
//lazy evaluation of pending transformations.
long newnnz = SparkExecutionContext.writeRDDtoHDFS(rdd, fname, oinfo);
((MatrixDimensionsMetaData) _metaData).getMatrixCharacteristics().setNonZeros(newnnz);
}
}