// org.apache.sysml.runtime.controlprogram.caching.CacheableData (Maven / Gradle / Ivy)
// Artifact: systemml -- "Declarative Machine Learning"
// (Repository-page navigation text retained as a comment so the file compiles.)
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.sysml.runtime.controlprogram.caching;
import java.io.File;
import java.io.IOException;
import java.lang.ref.SoftReference;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.commons.lang.mutable.MutableBoolean;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.sysml.api.DMLScript;
import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM;
import org.apache.sysml.conf.ConfigurationManager;
import org.apache.sysml.parser.Expression.DataType;
import org.apache.sysml.parser.Expression.ValueType;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.caching.LazyWriteBuffer.RPolicy;
import org.apache.sysml.runtime.instructions.gpu.context.GPUObject;
import org.apache.sysml.runtime.controlprogram.parfor.util.IDSequence;
import org.apache.sysml.runtime.instructions.cp.Data;
import org.apache.sysml.runtime.instructions.spark.data.BroadcastObject;
import org.apache.sysml.runtime.instructions.spark.data.RDDObject;
import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
import org.apache.sysml.runtime.matrix.MatrixDimensionsMetaData;
import org.apache.sysml.runtime.matrix.MatrixFormatMetaData;
import org.apache.sysml.runtime.matrix.MetaData;
import org.apache.sysml.runtime.matrix.data.FileFormatProperties;
import org.apache.sysml.runtime.matrix.data.InputInfo;
import org.apache.sysml.runtime.matrix.data.NumItemsByEachReducerMetaData;
import org.apache.sysml.runtime.matrix.data.OutputInfo;
import org.apache.sysml.runtime.util.LocalFileUtils;
import org.apache.sysml.runtime.util.MapReduceTool;
/**
* Each object of this class is a cache envelope for some large piece of data
* called "cache block". For example, the body of a matrix can be the cache block.
* The term cache block refers strictly to the cacheable portion of the data object,
* often excluding metadata and auxiliary parameters, as defined in the subclasses.
* Under the protection of the envelope, the data blob may be evicted to
* the file system; then the subclass must set its reference to null
* to allow Java garbage collection. If other parts of the system continue
* keep references to the cache block, its eviction will not release any memory.
*/
//note: the generic type parameter <T extends CacheBlock> and the generic
//arguments of SoftReference/ThreadLocal were lost during text extraction and
//are restored here; T is bounded by CacheBlock, as evidenced by the calls to
//_data.compactEmptyBlock() and _data.getInMemorySize() below.
public abstract class CacheableData<T extends CacheBlock> extends Data
{
    private static final long serialVersionUID = -413810592207212835L;

    /** Global logging instance for all subclasses of CacheableData */
    protected static final Log LOG = LogFactory.getLog(CacheableData.class.getName());

    // global constant configuration parameters
    public static final long CACHING_THRESHOLD = 4*1024; //obj not subject to caching if below threshold [in bytes]
    public static final double CACHING_BUFFER_SIZE = 0.15;
    public static final RPolicy CACHING_BUFFER_POLICY = RPolicy.FIFO;
    public static final boolean CACHING_BUFFER_PAGECACHE = false;
    public static final boolean CACHING_WRITE_CACHE_ON_READ = false;
    public static final String CACHING_COUNTER_GROUP_NAME = "SystemML Caching Counters";
    public static final String CACHING_EVICTION_FILEEXTENSION = ".dat";
    public static final boolean CACHING_ASYNC_FILECLEANUP = true;

    /**
     * Defines all possible cache status types for a data blob.
     * An object of class {@link CacheableData} can be in one of the following
     * five status types:
     *
     * EMPTY: Either there is no data blob at all, or the data blob
     *   resides in a specified import file and has never been downloaded yet.
     * READ: The data blob is in main memory; one or more threads are
     *   referencing and reading it (shared "read-only" lock). This status uses a
     *   counter. Eviction is NOT allowed.
     * MODIFY: The data blob is in main memory; exactly one thread is
     *   referencing and modifying it (exclusive "write" lock). Eviction is NOT allowed.
     * CACHED: The data blob is in main memory, and nobody is using nor referencing it.
     *   There is always a persistent recovery object for it.
     * CACHED_NOWRITE: Like CACHED, but without a local eviction write
     *   (see release with cacheNoWrite flag).
     */
    protected enum CacheStatus {
        EMPTY,
        READ,
        MODIFY,
        CACHED,
        CACHED_NOWRITE,
    };

    /** Global flag indicating if caching is enabled (controls eviction) */
    private static boolean _activeFlag = false;

    /** Global sequence for generating unique ids. */
    private static IDSequence _seq = null;

    // Global eviction path and prefix (prefix used for isolation purposes)
    public static String cacheEvictionLocalFilePath = null; //set during init
    public static String cacheEvictionLocalFilePrefix = "cache";

    /** Current size of pinned variables, required for guarded collect. */
    private static ThreadLocal<Long> sizePinned = new ThreadLocal<Long>() {
        @Override protected Long initialValue() { return 0L; }
    };

    //current size of live broadcast objects (because Spark's ContextCleaner maintains
    //a buffer with references to prevent eager cleanup by GC); note that this is an
    //overestimate, because we maintain partitioned broadcasts as soft references, which
    //might be collected by the GC and subsequently cleaned up by Spark's ContextCleaner.
    private static AtomicLong _refBCs = new AtomicLong(0);

    static {
        _seq = new IDSequence();
    }

    /**
     * The unique (JVM-wide) ID of a cacheable data object; to ensure unique IDs across JVMs, we
     * concatenate filenames with a unique prefix (map task ID).
     */
    private final int _uniqueID;

    /** The cache status of the data blob (whether it can be or is evicted, etc.) */
    private CacheStatus _cacheStatus = null;

    /** Cache for actual data, evicted by garbage collector. */
    protected SoftReference<T> _cache = null;

    /** Container object that holds the actual data. */
    protected T _data = null;

    /**
     * Object that holds the metadata associated with the matrix, which
     * includes: 1) Matrix dimensions, if available 2) Number of non-zeros, if
     * available 3) Block dimensions, if applicable 4) InputInfo -- subsequent
     * operations that use this Matrix expect it to be in this format.
     *
     * When the matrix is written to HDFS (local file system, as well?), one
     * must get the OutputInfo that matches with InputInfo stored inside _mtd.
     */
    protected MetaData _metaData = null;

    /** The name of HDFS file in which the data is backed up. */
    protected String _hdfsFileName = null; // file name and path

    /**
     * Flag that indicates whether or not the hdfs file exists. It is used
     * for improving the performance of "rmvar" instruction. When it has
     * value false, one can skip file system existence checks which can
     * be expensive.
     */
    private boolean _hdfsFileExists = false;

    /** Information relevant to specific external file formats. */
    private FileFormatProperties _formatProps = null;

    /**
     * True if the in-memory or evicted matrix may be different from
     * the matrix located at {@link #_hdfsFileName}; false if the two
     * matrices should be the same.
     */
    private boolean _dirtyFlag = false;

    // additional private flags and meta data
    private int _numReadThreads = 0; //number of threads for read from HDFS
    private boolean _cleanupFlag = true; //flag if obj unpinned (cleanup enabled)
    private String _varName = ""; //plan variable name
    private String _cacheFileName = null; //local eviction file name
    private boolean _requiresLocalWrite = false; //flag if local write for read obj
    private boolean _isAcquireFromEmpty = false; //flag if read from status empty

    //spark-specific handles
    //note: we use the abstraction of LineageObjects for two reasons: (1) to keep track of cleanup
    //for lazily evaluated RDDs, and (2) as abstraction for environments that do not necessarily have spark libraries available
    private RDDObject _rddHandle = null; //RDD handle
    private BroadcastObject _bcHandle = null; //Broadcast handle
    protected GPUObject _gpuHandle = null;
/**
 * Basic constructor for any cacheable data.
 *
 * @param dt data type
 * @param vt value type
 */
protected CacheableData(DataType dt, ValueType vt) {
    super (dt, vt);
    // JVM-wide unique id, later used to derive the local eviction file name
    _uniqueID = (int)_seq.getNextID();
    // fresh objects start without any in-memory or on-disk representation
    _cacheStatus = CacheStatus.EMPTY;
    _numReadThreads = 0;
}
/**
 * Copy constructor for cacheable data (of same type).
 * Note: copies only selected state (cleanup flag, hdfs file info, var name,
 * gpu handle); cache status, dirty flag, and metadata are not copied and
 * start fresh via the delegated basic constructor.
 *
 * @param that cacheable data object
 */
protected CacheableData(CacheableData that) {
    this( that.getDataType(), that.getValueType() );
    _cleanupFlag = that._cleanupFlag;
    _hdfsFileName = that._hdfsFileName;
    _hdfsFileExists = that._hdfsFileExists;
    _varName = that._varName;
    _gpuHandle = that._gpuHandle;
}
/**
 * Enables or disables the cleanup of the associated
 * data object on clearData().
 *
 * @param flag true if cleanup
 */
public void enableCleanup(boolean flag) {
    _cleanupFlag = flag;
}

/**
 * Indicates if cleanup of the associated data object
 * is enabled on clearData().
 *
 * @return true if cleanup enabled
 */
public boolean isCleanupEnabled() {
    return _cleanupFlag;
}

/** Sets the plan variable name this object is bound to. */
public void setVarName(String s) {
    _varName = s;
}

/** @return the plan variable name this object is bound to */
public String getVarName() {
    return _varName;
}

/** @return cached flag of hdfs file existence (avoids expensive fs probes, e.g., for rmvar) */
public boolean isHDFSFileExists() {
    return _hdfsFileExists;
}

/** Sets the cached flag of hdfs file existence. */
public void setHDFSFileExists( boolean flag ) {
    _hdfsFileExists = flag;
}
/** @return the name of the HDFS file backing this object (may be null) */
public String getFileName() {
    return _hdfsFileName;
}

/**
 * Rebinds this object to a (possibly different) HDFS file. If the file name
 * actually changes while this object still holds data (not EMPTY), the object
 * is marked dirty because the in-memory representation no longer matches the
 * newly assigned file.
 *
 * @param file new hdfs file name
 */
public synchronized void setFileName( String file ) {
    //explicit braces keep the dirty-flag update clearly scoped to a real rename
    //(the original dangling-if nesting was correct but fragile to edit)
    if( _hdfsFileName != null && !_hdfsFileName.equals(file) ) {
        if( !isEmpty(true) )
            _dirtyFlag = true;
    }
    _hdfsFileName = file;
}
/**
 * Indicates whether the in-memory or evicted matrix may be different from
 * the matrix located at {@link #_hdfsFileName}; false if the two
 * matrices are supposed to be the same.
 *
 * @return true if dirty
 */
public boolean isDirty() {
    return _dirtyFlag;
}

/** Sets the dirty flag (in-memory state differs from the hdfs file). */
public void setDirty(boolean flag) {
    _dirtyFlag = flag;
}

/** @return format properties for external file formats (may be null) */
public FileFormatProperties getFileFormatProperties() {
    return _formatProps;
}

/** Sets format properties for external file formats. */
public void setFileFormatProperties(FileFormatProperties props) {
    _formatProps = props;
}
@Override
public void setMetaData(MetaData md) {
    _metaData = md;
}

@Override
public MetaData getMetaData() {
    return _metaData;
}

@Override
public void removeMetaData() {
    _metaData = null;
}

/**
 * Convenience accessor for the matrix characteristics of the meta data.
 * NOTE(review): assumes _metaData is a non-null MatrixDimensionsMetaData;
 * fails with NPE/ClassCastException otherwise -- callers must set meta data first.
 */
public MatrixCharacteristics getMatrixCharacteristics() {
    MatrixDimensionsMetaData meta = (MatrixDimensionsMetaData) _metaData;
    return meta.getMatrixCharacteristics();
}

/** Refreshes the meta data from the in-memory representation (subclass-specific). */
public abstract void refreshMetaData()
    throws CacheException;
/** @return the RDD lineage handle, or null if none is attached */
public RDDObject getRDDHandle() {
    return _rddHandle;
}

/**
 * Replaces the RDD lineage handle, maintaining back references
 * for lineage-based cleanup.
 *
 * @param rdd new RDD handle (may be null to detach)
 */
public void setRDDHandle( RDDObject rdd ) {
    //detach the back reference of the previous handle, if any
    if( _rddHandle != null )
        _rddHandle.setBackReference(null);
    //attach the new handle and point it back at this object
    _rddHandle = rdd;
    if( rdd != null )
        rdd.setBackReference(this);
}

/** @return the broadcast lineage handle, or null if none is attached */
public BroadcastObject getBroadcastHandle() {
    return _bcHandle;
}

/**
 * Replaces the broadcast lineage handle, maintaining back references
 * for lineage-based cleanup.
 *
 * @param bc new broadcast handle (may be null to detach)
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
public void setBroadcastHandle( BroadcastObject bc ) {
    //detach the back reference of the previous handle, if any
    if( _bcHandle != null )
        _bcHandle.setBackReference(null);
    //attach the new handle and point it back at this object
    _bcHandle = bc;
    if( bc != null )
        bc.setBackReference(this);
}

/** @return the GPU object handle, or null if none is attached */
public GPUObject getGPUObject() {
    return _gpuHandle;
}

/** Sets the GPU object handle. */
public void setGPUObject(GPUObject handle) {
    _gpuHandle = handle;
}
// *********************************************
// *** ***
// *** HIGH-LEVEL METHODS THAT SPECIFY ***
// *** THE LOCKING AND CACHING INTERFACE ***
// *** ***
// *********************************************
/**
 * Acquires a shared "read-only" lock, produces the reference to the cache block,
 * restores the cache block to main memory, reads from HDFS if needed.
 *
 * Synchronized because there might be parallel threads (parfor local) that
 * access the same object (in case it was created before the loop).
 *
 * In-Status: EMPTY, EVICTABLE, EVICTED, READ;
 * Out-Status: READ(+1).
 *
 * @return cacheable data
 * @throws CacheException if CacheException occurs
 */
public synchronized T acquireRead()
    throws CacheException
{
    if( LOG.isTraceEnabled() )
        LOG.trace("Acquire read "+getVarName());
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
    if ( !isAvailableToRead() )
        throw new CacheException ("MatrixObject not available to read.");
    //get object from cache (soft reference of previously released data)
    if( _data == null )
        getCache();
    //call acquireHostRead if gpuHandle is set as well as is allocated
    //(transfers device data back to host before reading)
    if( _gpuHandle != null && _gpuHandle.isAllocated()) {
        _gpuHandle.acquireHostRead();
        if( _data == null )
            getCache();
    }
    //read data from HDFS/RDD if required
    //(probe data for cache_nowrite / jvm_reuse)
    if( isEmpty(true) && _data==null )
    {
        try
        {
            if( DMLScript.STATISTICS )
                CacheStatistics.incrementHDFSHits();
            if( getRDDHandle()==null || getRDDHandle().allowsShortCircuitRead() )
            {
                //check filename
                if( _hdfsFileName == null )
                    throw new CacheException("Cannot read matrix for empty filename.");
                //read cacheable data from hdfs
                _data = readBlobFromHDFS( _hdfsFileName );
                //mark for initial local write despite read operation
                _requiresLocalWrite = CACHING_WRITE_CACHE_ON_READ;
            }
            else
            {
                //read matrix from rdd (incl execute pending rdd operations)
                MutableBoolean writeStatus = new MutableBoolean();
                _data = readBlobFromRDD( getRDDHandle(), writeStatus );
                //mark for initial local write (prevent repeated execution of rdd operations)
                if( writeStatus.booleanValue() )
                    _requiresLocalWrite = CACHING_WRITE_CACHE_ON_READ;
                else
                    _requiresLocalWrite = true;
            }
            //freshly read data matches its hdfs source
            setDirty(false);
        }
        catch (IOException e) {
            throw new CacheException("Reading of " + _hdfsFileName + " ("+getVarName()+") failed.", e);
        }
        _isAcquireFromEmpty = true;
    }
    else if( DMLScript.STATISTICS )
    {
        if( _data!=null )
            CacheStatistics.incrementMemHits();
    }
    //cache status maintenance (shared read lock; restore blob if still evicted)
    acquire( false, _data==null );
    updateStatusPinned(true);
    if( DMLScript.STATISTICS ){
        long t1 = System.nanoTime();
        CacheStatistics.incrementAcquireRTime(t1-t0);
    }
    return _data;
}
/**
 * Acquires the exclusive "write" lock for a thread that wants to change cache block
 * cell values. Produces the reference to the cache block, restores the cache block
 * to main memory, reads from HDFS if needed.
 *
 * In-Status: EMPTY, EVICTABLE, EVICTED;
 * Out-Status: MODIFY.
 *
 * @return cacheable data
 * @throws CacheException if CacheException occurs
 */
public synchronized T acquireModify()
    throws CacheException
{
    if( LOG.isTraceEnabled() )
        LOG.trace("Acquire modify "+getVarName());
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
    if ( !isAvailableToModify() )
        throw new CacheException("MatrixObject not available to modify.");
    //get object from cache (soft reference of previously released data)
    if( _data == null )
        getCache();
    //read data from HDFS if required
    if( isEmpty(true) && _data == null )
    {
        //check filename
        if( _hdfsFileName == null )
            throw new CacheException("Cannot read matrix for empty filename.");
        //load data
        try
        {
            _data = readBlobFromHDFS( _hdfsFileName );
        }
        catch (IOException e)
        {
            throw new CacheException("Reading of " + _hdfsFileName + " ("+getVarName()+") failed.", e);
        }
    }
    //cache status maintenance (exclusive lock; restore blob if still evicted)
    acquire( true, _data==null );
    updateStatusPinned(true);
    //the caller may modify the data, so it no longer matches hdfs
    setDirty(true);
    _isAcquireFromEmpty = false;
    if( DMLScript.STATISTICS ){
        long t1 = System.nanoTime();
        CacheStatistics.incrementAcquireMTime(t1-t0);
    }
    return _data;
}
/**
 * Acquires the exclusive "write" lock for a thread that wants to throw away the
 * old cache block data and link up with new cache block data. Abandons the old data
 * without reading it and sets the new data reference.
 *
 * In-Status: EMPTY, EVICTABLE, EVICTED;
 * Out-Status: MODIFY.
 *
 * @param newData new data (must not be null)
 * @return cacheable data
 * @throws CacheException if not available to modify or newData is null
 */
public synchronized T acquireModify(T newData)
    throws CacheException
{
    if( LOG.isTraceEnabled() )
        LOG.trace("Acquire modify newdata "+getVarName());
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
    if (! isAvailableToModify ())
        throw new CacheException ("CacheableData not available to modify.");
    //clear old data (incl. evicted representation and lineage back refs)
    clearData();
    //cache status maintenance
    acquire (true, false); //no need to load evicted matrix
    setDirty(true);
    _isAcquireFromEmpty = false;
    //set references to new data
    if (newData == null)
        throw new CacheException("acquireModify with empty cache block.");
    _data = newData;
    updateStatusPinned(true);
    if( DMLScript.STATISTICS ){
        long t1 = System.nanoTime();
        CacheStatistics.incrementAcquireMTime(t1-t0);
    }
    return _data;
}
/**
 * Releases the shared ("read-only") or exclusive ("write") lock. Updates
 * size information, last-access time, metadata, etc.
 *
 * Synchronized because there might be parallel threads (parfor local) that
 * access the same object (in case it was created before the loop).
 *
 * In-Status: READ, MODIFY;
 * Out-Status: READ(-1), EVICTABLE, EMPTY.
 *
 * @throws CacheException if CacheException occurs
 */
public synchronized void release()
    throws CacheException
{
    if( LOG.isTraceEnabled() )
        LOG.trace("Release "+getVarName());
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
    boolean write = false;
    if ( isModify() )
    {
        //set flags for write
        write = true;
        setDirty(true);
        //update meta data
        refreshMetaData();
    }
    //compact empty in-memory block
    //NOTE(review): assumes _data != null here (pinned by a matching acquire*);
    //a release without prior acquire would fail with an NPE -- confirm callers.
    _data.compactEmptyBlock();
    //cache status maintenance (pass cacheNoWrite flag)
    release(_isAcquireFromEmpty && !_requiresLocalWrite);
    updateStatusPinned(false);
    if( isCachingActive() //only if caching is enabled (otherwise keep everything in mem)
        && isCached(true) //not empty and not read/modify
        && !isBelowCachingThreshold() ) //min size for caching
    {
        if( write || _requiresLocalWrite )
        {
            //evict blob through the lazy write buffer
            String filePath = getCacheFilePathAndName();
            try {
                LazyWriteBuffer.writeBlock(filePath, _data);
            }
            catch (Exception e)
            {
                throw new CacheException("Eviction to local path " + filePath + " ("+getVarName()+") failed.", e);
            }
            _requiresLocalWrite = false;
        }
        //create cache (soft reference) and drop the strong reference to enable GC
        createCache();
        _data = null;
    }
    else if( LOG.isTraceEnabled() ){
        LOG.trace("Var "+getVarName()+" not subject to caching, state="+getStatusAsString());
    }
    if( DMLScript.STATISTICS ){
        long t1 = System.nanoTime();
        CacheStatistics.incrementReleaseTime(t1-t0);
    }
}
/** Hook for subclasses to clear reusable data structures (default: no-op). */
protected void clearReusableData() {}

/**
 * Sets the cache block reference to null, abandons the old block.
 * Makes the "envelope" empty. Run it to finalize the object (otherwise the
 * evicted cache block file may remain undeleted).
 *
 * In-Status: EMPTY, EVICTABLE, EVICTED;
 * Out-Status: EMPTY.
 *
 * @throws CacheException if CacheException occurs
 */
public synchronized void clearData()
    throws CacheException
{
    if( LOG.isTraceEnabled() )
        LOG.trace("Clear data "+getVarName());
    // check if cleanup enabled and possible
    if( !isCleanupEnabled() )
        return; // do nothing
    if( !isAvailableToModify() )
        throw new CacheException ("CacheableData (" + getDebugName() + ") not available to "
            + "modify. Status = " + getStatusAsString() + ".");
    // clear existing WB / FS representation (but prevent unnecessary probes)
    // i.e., skip the delete if empty, below the caching threshold, or caching inactive
    if( !(isEmpty(true)||(_data!=null && isBelowCachingThreshold())
        ||(_data!=null && !isCachingActive()) )) //additional condition for JMLC
        freeEvictedBlob();
    // clear the in-memory data
    clearReusableData();
    _data = null;
    clearCache();
    // clear rdd/broadcast back refs
    if( _rddHandle != null )
        _rddHandle.setBackReference(null);
    if( _bcHandle != null )
        _bcHandle.setBackReference(null);
    if( _gpuHandle != null )
        _gpuHandle.clearData();
    // change object state EMPTY
    setDirty(false);
    setEmpty();
}
/** Exports to the bound hdfs file with default replication. */
public synchronized void exportData() throws CacheException {
    exportData( -1 );
}

/**
 * Writes, or flushes, the cache block data to HDFS.
 *
 * In-Status: EMPTY, EVICTABLE, EVICTED, READ;
 * Out-Status: EMPTY, EVICTABLE, EVICTED, READ.
 *
 * @param replication hdfs replication factor (-1 presumably selects the
 *        default -- the other overloads pass -1; confirm downstream)
 * @throws CacheException if CacheException occurs
 */
public synchronized void exportData( int replication )
    throws CacheException
{
    exportData(_hdfsFileName, null, replication, null);
    _hdfsFileExists = true;
}

/** Exports to the given file in the given format with default replication. */
public synchronized void exportData(String fName, String outputFormat)
    throws CacheException
{
    exportData(fName, outputFormat, -1, null);
}

/** Exports to the given file/format with explicit format properties. */
public synchronized void exportData(String fName, String outputFormat, FileFormatProperties formatProperties)
    throws CacheException
{
    exportData(fName, outputFormat, -1, formatProperties);
}
/**
 * Synchronized because there might be parallel threads (parfor local) that
 * access the same object (in case it was created before the loop).
 * If all threads export the same data object concurrently it results in errors
 * because they all write to the same file. Efficiency for loops and parallel threads
 * is achieved by checking if the in-memory block is dirty.
 *
 * NOTE: MB: we do not use dfs copy from local (evicted) to HDFS because this would ignore
 * the output format and most importantly would bypass reblocking during write (which effects the
 * potential degree of parallelism). However, we copy files on HDFS if certain criteria are given.
 *
 * @param fName file name
 * @param outputFormat format (null to use the format recorded in meta data)
 * @param replication hdfs replication factor (-1 in delegating overloads)
 * @param formatProperties file format properties
 * @throws CacheException if CacheException occurs
 */
public synchronized void exportData (String fName, String outputFormat, int replication, FileFormatProperties formatProperties)
    throws CacheException
{
    if( LOG.isTraceEnabled() )
        LOG.trace("Export data "+getVarName()+" "+fName);
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
    //prevent concurrent modifications
    if ( !isAvailableToRead() )
        throw new CacheException ("MatrixObject not available to read.");
    LOG.trace("Exporting " + this.getDebugName() + " to " + fName + " in format " + outputFormat);
    //TODO remove
    if( getGPUObject() != null && getGPUObject().isAllocated() ) {
        getGPUObject().acquireHostRead();
    }
    //pWrite distinguishes a persistent "write" instruction (different target
    //file) from flushing back to the object's own hdfs file
    boolean pWrite = false; // !fName.equals(_hdfsFileName); //persistent write flag
    if ( fName.equals(_hdfsFileName) ) {
        setHDFSFileExists(true);
        pWrite = false;
    }
    else {
        pWrite = true; // i.e., export is called from "write" instruction
    }
    //actual export (note: no direct transfer of local copy in order to ensure blocking (and hence, parallelism))
    if( isDirty() || //use dirty for skipping parallel exports
        (pWrite && !isEqualOutputFormat(outputFormat)) )
    {
        // CASE 1: dirty in-mem matrix or pWrite w/ different format (write matrix to fname; load into memory if evicted)
        // a) get the matrix
        if( isEmpty(true) )
        {
            //read data from HDFS if required (never read before), this applies only to pWrite w/ different output formats
            //note: for large rdd outputs, we compile dedicated writespinstructions (no need to handle this here)
            try
            {
                if( getRDDHandle()==null || getRDDHandle().allowsShortCircuitRead() )
                    _data = readBlobFromHDFS( _hdfsFileName );
                else
                    _data = readBlobFromRDD( getRDDHandle(), new MutableBoolean() );
                setDirty(false);
            }
            catch (IOException e)
            {
                throw new CacheException("Reading of " + _hdfsFileName + " ("+getVarName()+") failed.", e);
            }
        }
        //get object from cache
        if( _data == null )
            getCache();
        acquire( false, _data==null ); //incl. read matrix if evicted
        // b) write the matrix
        try
        {
            writeMetaData( fName, outputFormat, formatProperties );
            writeBlobToHDFS( fName, outputFormat, replication, formatProperties );
            if ( !pWrite )
                setDirty(false);
        }
        catch (Exception e)
        {
            throw new CacheException ("Export to " + fName + " failed.", e);
        }
        finally
        {
            //always release the read lock acquired above
            release();
        }
    }
    else if( pWrite ) // pwrite with same output format
    {
        //CASE 2: matrix already in same format but different file on hdfs (copy matrix to fname)
        try
        {
            MapReduceTool.deleteFileIfExistOnHDFS(fName);
            MapReduceTool.deleteFileIfExistOnHDFS(fName+".mtd");
            if( getRDDHandle()==null || getRDDHandle().allowsShortCircuitRead() )
                MapReduceTool.copyFileOnHDFS( _hdfsFileName, fName );
            else //write might trigger rdd operations and nnz maintenance
                writeBlobFromRDDtoHDFS(getRDDHandle(), fName, outputFormat);
            writeMetaData( fName, outputFormat, formatProperties );
        }
        catch (Exception e) {
            throw new CacheException ("Export to " + fName + " failed.", e);
        }
    }
    else if( getRDDHandle()!=null && //pending rdd operation
        !getRDDHandle().allowsShortCircuitRead() )
    {
        //CASE 3: pending rdd operation (other than checkpoints)
        try
        {
            writeBlobFromRDDtoHDFS(getRDDHandle(), fName, outputFormat);
            writeMetaData( fName, outputFormat, formatProperties );
        }
        catch (Exception e) {
            throw new CacheException ("Export to " + fName + " failed.", e);
        }
    }
    else
    {
        //CASE 4: data already in hdfs (do nothing, no need for export)
        LOG.trace(this.getDebugName() + ": Skip export to hdfs since data already exists.");
    }
    if( DMLScript.STATISTICS ){
        long t1 = System.nanoTime();
        CacheStatistics.incrementExportTime(t1-t0);
    }
}
// --------- ABSTRACT LOW-LEVEL CACHE I/O OPERATIONS ----------
/**
 * Reports whether the data blob reference currently points to an in-memory
 * object. Called when releasing the (last) lock; do not call this method
 * for a blob that has been evicted.
 *
 * @return true iff the in-memory data reference is non-null
 */
protected boolean isBlobPresent() {
    return _data != null;
}
/**
 * Low-level cache I/O method that physically restores the data blob to
 * main memory from the local eviction file (via readBlobFromCache).
 * Must be defined by a subclass, never called by users.
 *
 * @throws CacheException if data is already in memory or the restore fails
 */
protected void restoreBlobIntoMemory()
    throws CacheException
{
    String cacheFilePathAndName = getCacheFilePathAndName();
    long begin = LOG.isTraceEnabled() ? System.currentTimeMillis() : 0;
    if( LOG.isTraceEnabled() )
        LOG.trace ("CACHE: Restoring matrix... " + getVarName() + " HDFS path: " +
            (_hdfsFileName == null ? "null" : _hdfsFileName) + ", Restore from path: " + cacheFilePathAndName);
    //guard against clobbering live in-memory data
    if (_data != null)
        throw new CacheException (cacheFilePathAndName + " : Cannot restore on top of existing in-memory data.");
    try {
        _data = readBlobFromCache(cacheFilePathAndName);
    }
    catch (IOException e) {
        throw new CacheException (cacheFilePathAndName + " : Restore failed.", e);
    }
    //check for success
    if (_data == null)
        throw new CacheException (cacheFilePathAndName + " : Restore failed.");
    if( LOG.isTraceEnabled() )
        LOG.trace("Restoring matrix - COMPLETED ... " + (System.currentTimeMillis()-begin) + " msec.");
}
/** Reads the cache block back from the local eviction file (subclass-specific). */
protected abstract T readBlobFromCache(String fname)
    throws IOException;
/**
 * Low-level cache I/O method that deletes the file containing the
 * evicted data blob, without reading it.
 * Must be defined by a subclass, never called by users.
 */
protected void freeEvictedBlob() {
    String cacheFilePathAndName = getCacheFilePathAndName();
    long begin = LOG.isTraceEnabled() ? System.currentTimeMillis() : 0;
    if( LOG.isTraceEnabled() )
        LOG.trace("CACHE: Freeing evicted matrix... " + getVarName() + " HDFS path: " +
            (_hdfsFileName == null ? "null" : _hdfsFileName) + " Eviction path: " + cacheFilePathAndName);
    //delegate deletion to the write buffer (covers buffered and spilled blocks)
    LazyWriteBuffer.deleteBlock(cacheFilePathAndName);
    if( LOG.isTraceEnabled() )
        LOG.trace("Freeing evicted matrix - COMPLETED ... " + (System.currentTimeMillis()-begin) + " msec.");
}
/**
 * Checks if the in-memory size of the current data blob is at most
 * CACHING_THRESHOLD bytes; such small objects are not subject to caching.
 * NOTE(review): requires _data != null -- callers appear to guard this; confirm.
 */
protected boolean isBelowCachingThreshold() {
    return (_data.getInMemorySize() <= CACHING_THRESHOLD);
}

/** @return the value-type schema; null by default (subclasses may override). */
protected ValueType[] getSchema() {
    return null;
}
/**
 * Builds a compact debug name of the form "varname filename", where the
 * hdfs file name is abbreviated to at most 23 characters ("..." prefix)
 * or rendered as "null" if unset.
 */
@Override //Data
public synchronized String getDebugName() {
    final int maxLength = 23;
    String fileNamePart;
    if( _hdfsFileName == null )
        fileNamePart = "null";
    else if( _hdfsFileName.length() < maxLength )
        fileNamePart = _hdfsFileName;
    else //keep the tail of long paths, prefixed by an ellipsis
        fileNamePart = "..." + _hdfsFileName.substring (_hdfsFileName.length() - maxLength + 3);
    return getVarName() + " " + fileNamePart;
}
/**
 * Reads the cache block from HDFS using the row/column dimensions from the
 * attached meta data.
 * NOTE(review): assumes _metaData is a non-null MatrixFormatMetaData.
 */
protected T readBlobFromHDFS(String fname)
    throws IOException
{
    MatrixFormatMetaData iimd = (MatrixFormatMetaData) _metaData;
    MatrixCharacteristics mc = iimd.getMatrixCharacteristics();
    return readBlobFromHDFS(fname, mc.getRows(), mc.getCols());
}

/** Reads the cache block of given dimensions from HDFS (subclass-specific). */
protected abstract T readBlobFromHDFS(String fname, long rlen, long clen)
    throws IOException;

/** Reads the cache block from an RDD, reporting write status (subclass-specific). */
protected abstract T readBlobFromRDD(RDDObject rdd, MutableBoolean status)
    throws IOException;

/** Writes the in-memory cache block to HDFS in the given format (subclass-specific). */
protected abstract void writeBlobToHDFS(String fname, String ofmt, int rep, FileFormatProperties fprop)
    throws IOException, DMLRuntimeException;

/** Writes the given RDD directly to HDFS (subclass-specific). */
protected abstract void writeBlobFromRDDtoHDFS(RDDObject rdd, String fname, String ofmt)
    throws IOException, DMLRuntimeException;
/**
 * Writes the ".mtd" meta data file for the given output file, unless the
 * output format is matrix market (which carries its own header).
 *
 * @param filePathAndName target data file (".mtd" suffix is appended)
 * @param outputFormat requested format string, or null to derive from input info
 * @param formatProperties format-specific properties
 * @throws DMLRuntimeException if the attached meta data is null
 * @throws IOException if writing the meta data file fails
 */
protected void writeMetaData (String filePathAndName, String outputFormat, FileFormatProperties formatProperties)
    throws DMLRuntimeException, IOException
{
    MatrixFormatMetaData iimd = (MatrixFormatMetaData) _metaData;
    if (iimd == null)
        throw new DMLRuntimeException("Unexpected error while writing mtd file (" + filePathAndName + ") -- metadata is null.");
    // Write the matrix to HDFS in requested format
    OutputInfo oinfo = (outputFormat != null ? OutputInfo.stringToOutputInfo (outputFormat)
        : InputInfo.getMatchingOutputInfo (iimd.getInputInfo ()));
    if ( oinfo != OutputInfo.MatrixMarketOutputInfo ) {
        // Get the dimension information from the metadata stored within MatrixObject
        MatrixCharacteristics mc = iimd.getMatrixCharacteristics ();
        // when outputFormat is binaryblock, make sure that matrixCharacteristics has correct blocking dimensions
        // note: this is only required if singlenode (due to binarycell default)
        if ( oinfo == OutputInfo.BinaryBlockOutputInfo && DMLScript.rtplatform == RUNTIME_PLATFORM.SINGLE_NODE &&
            (mc.getRowsPerBlock() != ConfigurationManager.getBlocksize() || mc.getColsPerBlock() != ConfigurationManager.getBlocksize()) )
        {
            mc = new MatrixCharacteristics(mc.getRows(), mc.getCols(), ConfigurationManager.getBlocksize(), ConfigurationManager.getBlocksize(), mc.getNonZeros());
        }
        //write the actual meta data file
        MapReduceTool.writeMetaDataFile (filePathAndName + ".mtd", valueType,
            getSchema(), dataType, mc, oinfo, formatProperties);
    }
}
/**
 * Checks if the requested output format matches the format recorded in this
 * object's meta data; a null format is always considered a match.
 *
 * @param outputFormat requested output format string (may be null)
 * @return true if the formats agree or no format was requested
 */
protected boolean isEqualOutputFormat( String outputFormat )
{
    //no requested format -> trivially equal
    if( outputFormat == null )
        return true;
    try {
        MatrixFormatMetaData iimd = (MatrixFormatMetaData) _metaData;
        OutputInfo current = InputInfo.getMatchingOutputInfo( iimd.getInputInfo() );
        OutputInfo requested = OutputInfo.stringToOutputInfo( outputFormat );
        return (current == requested);
    }
    catch(Exception ex) {
        //unknown format strings or missing meta data count as a mismatch
        return false;
    }
}
// ------------- IMPLEMENTED CACHE LOGIC METHODS --------------
protected int getUniqueCacheID() {
return _uniqueID;
}
protected String getCacheFilePathAndName () {
if( _cacheFileName==null ) {
StringBuilder sb = new StringBuilder();
sb.append(CacheableData.cacheEvictionLocalFilePath);
sb.append(CacheableData.cacheEvictionLocalFilePrefix);
sb.append(String.format ("%09d", getUniqueCacheID()));
sb.append(CacheableData.CACHING_EVICTION_FILEEXTENSION);
_cacheFileName = sb.toString();
}
return _cacheFileName;
}
/**
 * "Acquires the lock" to ensure that the data blob is in main memory
 * (not evicted) while it is being accessed; restores the blob if it has
 * been evicted. Two kinds of locks: a shared "read" lock (isModify ==
 * false) or the exclusive "modify" lock (isModify == true).
 *
 * The method can fail in three ways:
 * (1) if there is a lock status conflict;
 * (2) if there is not enough cache memory to restore the blob;
 * (3) if the restore method returns an error.
 * The method locks the data blob in memory (which disables eviction) and updates
 * its last-access timestamp. For the shared "read" lock, acquiring a new lock
 * increments the associated count. The "read" count has to be decremented once
 * the blob is no longer used, which may re-enable eviction. This method has to
 * be called only once per matrix operation and coupled with {@link #release()},
 * because it increments the lock count and the other method decrements this count.
 *
 * @param isModify true for the exclusive "modify" lock, false for a shared "read" lock
 * @param restore true if the blob should be restored from the local eviction file
 * @throws CacheException on lock conflicts or restore failures
 */
protected void acquire (boolean isModify, boolean restore)
    throws CacheException
{
    switch ( _cacheStatus )
    {
        case CACHED:
            if(restore)
                restoreBlobIntoMemory();
            //note: deliberate fall-through into the shared locking logic below
        case CACHED_NOWRITE:
        case EMPTY:
            if (isModify)
                setModify();
            else
                addOneRead();
            break;
        case READ:
            //read locks are shared; a write lock on a read-locked blob conflicts
            if (isModify)
                throw new CacheException ("READ-MODIFY not allowed.");
            else
                addOneRead();
            break;
        case MODIFY:
            //write locks are exclusive
            throw new CacheException ("MODIFY-MODIFY not allowed.");
    }
    if( LOG.isTraceEnabled() )
        LOG.trace("Acquired lock on " + this.getDebugName() + ", status: " + this.getStatusAsString() );
}
/**
 * Call this method to permit eviction for the stored data blob, or to
 * decrement its "read" count if it is "read"-locked by other threads.
 * It is expected that you eliminate all external references to the blob
 * prior to calling this method, because otherwise eviction will
 * duplicate the blob, but not release memory. This method has to be
 * called only once per process and coupled with {@link #acquire(boolean, boolean)},
 * because it decrements the lock count and the other method increments
 * the lock count.
 *
 * @param cacheNoWrite if true and this was the last read lock, transition to
 *                     CACHED_NOWRITE instead of CACHED (blob kept in memory
 *                     but excluded from the write-back buffer)
 * @throws CacheException if called without a matching acquire (redundant release)
 */
protected void release(boolean cacheNoWrite)
throws CacheException
{
switch ( _cacheStatus )
{
case EMPTY:
case CACHED:
case CACHED_NOWRITE:
//no lock held in these states -> unmatched release call
throw new CacheException("Redundant release.");
case READ:
removeOneRead( isBlobPresent(), cacheNoWrite );
break;
case MODIFY:
//exclusive lock: new status depends on whether a blob remains
if ( isBlobPresent() )
setCached();
else
setEmpty();
break;
}
if( LOG.isTraceEnabled() )
LOG.trace("Released lock on " + this.getDebugName() + ", status: " + this.getStatusAsString());
}
// **************************************************
// *** ***
// *** CACHE STATUS FIELD - CLASSES AND METHODS ***
// *** ***
// **************************************************
/**
 * Returns the current cache status (EMPTY, READ, MODIFY, CACHED,
 * CACHED_NOWRITE) as a string, mainly for logging and debugging.
 *
 * @return string representation of the current cache status
 */
public String getStatusAsString() {
return _cacheStatus.toString();
}
/**
 * Indicates if this object is in status CACHED, optionally also
 * counting CACHED_NOWRITE as cached.
 *
 * @param inclCachedNoWrite if true, CACHED_NOWRITE also counts as cached
 * @return true if the current status qualifies as cached
 */
public boolean isCached(boolean inclCachedNoWrite) {
    return _cacheStatus == CacheStatus.CACHED
        || (inclCachedNoWrite && _cacheStatus == CacheStatus.CACHED_NOWRITE);
}
/** Public entry point to reset the cache status to EMPTY (delegates to {@link #setEmpty()}). */
public void setEmptyStatus() {
setEmpty();
}
/**
 * Indicates if this object is in status EMPTY, optionally also
 * counting CACHED_NOWRITE as empty (no pending write-back).
 *
 * @param inclCachedNoWrite if true, CACHED_NOWRITE also counts as empty
 * @return true if the current status qualifies as empty
 */
protected boolean isEmpty(boolean inclCachedNoWrite) {
    return _cacheStatus == CacheStatus.EMPTY
        || (inclCachedNoWrite && _cacheStatus == CacheStatus.CACHED_NOWRITE);
}
/** Indicates if this object currently holds the exclusive MODIFY lock. */
protected boolean isModify() {
return (_cacheStatus == CacheStatus.MODIFY);
}
/** Sets the cache status to EMPTY (no blob, no locks). */
protected void setEmpty() {
_cacheStatus = CacheStatus.EMPTY;
}
/** Sets the cache status to MODIFY (exclusive lock, eviction disabled). */
protected void setModify() {
_cacheStatus = CacheStatus.MODIFY;
}
/** Sets the cache status to CACHED (blob in memory, eviction enabled). */
protected void setCached() {
_cacheStatus = CacheStatus.CACHED;
}
/**
 * Registers one additional shared read lock: increments the reader
 * count and moves the status to READ.
 */
protected void addOneRead() {
    ++_numReadThreads;
    _cacheStatus = CacheStatus.READ;
}
/**
 * Releases one shared read lock. When the last reader leaves, the status
 * transitions to EMPTY if the blob is gone, otherwise to CACHED_NOWRITE
 * (if requested) or CACHED.
 *
 * @param doesBlobExist true if the in-memory blob is still present
 * @param cacheNoWrite  true to transition to CACHED_NOWRITE instead of CACHED
 */
protected void removeOneRead(boolean doesBlobExist, boolean cacheNoWrite) {
    //only the last departing reader changes the status
    if( --_numReadThreads != 0 )
        return;
    if( !doesBlobExist )
        _cacheStatus = CacheStatus.EMPTY;
    else
        _cacheStatus = cacheNoWrite ?
            CacheStatus.CACHED_NOWRITE : CacheStatus.CACHED;
}
/**
 * Indicates if a shared read lock may be acquired in the current status,
 * i.e., any status except MODIFY.
 *
 * @return true if acquiring a read lock is currently permitted
 */
protected boolean isAvailableToRead() {
    switch( _cacheStatus ) {
        case EMPTY:
        case CACHED:
        case CACHED_NOWRITE:
        case READ:
            return true;
        default: //MODIFY
            return false;
    }
}
/**
 * Indicates if the exclusive modify lock may be acquired in the current
 * status, i.e., when neither readers (READ) nor a writer (MODIFY) hold it.
 *
 * @return true if acquiring the modify lock is currently permitted
 */
protected boolean isAvailableToModify() {
    switch( _cacheStatus ) {
        case EMPTY:
        case CACHED:
        case CACHED_NOWRITE:
            return true;
        default: //READ, MODIFY
            return false;
    }
}
// *******************************************
// *** ***
// *** LOW-LEVEL PRIVATE METHODS ***
// *** FOR SOFTREFERENCE CACHE ***
// *** ***
// *******************************************
/**
 * Creates a new cache soft reference to the currently
 * referenced cache block, allowing the GC to reclaim the block
 * under memory pressure while keeping it reachable otherwise.
 */
protected void createCache( ) {
    //diamond operator instead of raw SoftReference construction
    //(avoids unchecked-assignment warning, no behavioral change)
    _cache = new SoftReference<>( _data );
}
/**
 * Tries to get the cache block from the cache soft reference
 * and subsequently clears the cache soft reference if existing.
 * If the referent was already collected, {@code _data} becomes null.
 */
protected void getCache() {
    if( _cache == null )
        return;
    //pull the (possibly collected) referent, then drop the reference
    _data = _cache.get();
    clearCache();
}
/** Clears the cache soft reference if existing. */
protected void clearCache() {
    if( _cache == null )
        return;
    _cache.clear();
    _cache = null;
}
/**
 * Adds or subtracts the in-memory size of the current data blob from the
 * shared pinned-size accounting, clamping the total at zero.
 *
 * @param add true to add the blob size, false to subtract it
 */
protected void updateStatusPinned(boolean add) {
    if( _data == null ) //data should never be null
        return;
    long delta = (add ? 1 : -1) * _data.getInMemorySize();
    //atomic read-modify-write via CAS loop; the original get()/set() pair
    //was not thread-safe and could lose concurrent updates
    long size, newSize;
    do {
        size = sizePinned.get();
        newSize = Math.max(size + delta, 0);
    }
    while( !sizePinned.compareAndSet(size, newSize) );
}
/**
 * Returns the total size of currently pinned (lock-protected) objects.
 *
 * @return accumulated pinned size in bytes
 */
protected long getPinnedSize() {
return sizePinned.get();
}
/**
 * Adds the given (possibly negative) delta to the global accounting of
 * live broadcast object sizes.
 *
 * @param size delta in bytes to add to the broadcast size counter
 */
public static void addBroadcastSize(long size) {
    //return value intentionally ignored
    _refBCs.getAndAdd(size);
}
/**
 * Returns the current global accounting of live broadcast object sizes.
 *
 * @return accumulated broadcast size in bytes
 */
public static long getBroadcastSize() {
    return _refBCs.get();
}
// --------- STATIC CACHE INIT/CLEANUP OPERATIONS ----------
/**
 * Cleans up the entire caching infrastructure: flushes/clears the lazy
 * write buffer first (it may still hold pending evictions), then deletes
 * the cache directory including its files.
 */
public synchronized static void cleanupCacheDir() {
//cleanup remaining cached writes
LazyWriteBuffer.cleanup();
//delete cache dir and files
cleanupCacheDir(true);
}
/**
 * Deletes the DML-script-specific caching working dir.
 *
 * All files carrying the cache eviction prefix are removed; the directory
 * itself is removed only if requested (and only succeeds if empty).
 * Finally, caching is flagged inactive.
 *
 * @param withDir if true, delete directory
 */
public synchronized static void cleanupCacheDir(boolean withDir)
{
    //get directory name
    String dir = cacheEvictionLocalFilePath;
    //clean files with cache prefix
    if( dir != null ) //if previous init cache
    {
        File fdir = new File(dir);
        if( fdir.exists() ) { //just for robustness
            //listFiles() returns null on I/O errors or if fdir is not a
            //directory -> guard to avoid NullPointerException
            File[] files = fdir.listFiles();
            if( files != null )
                for( File f : files )
                    if( f.getName().startsWith(cacheEvictionLocalFilePrefix) )
                        f.delete();
            if( withDir )
                fdir.delete(); //deletes dir only if empty
        }
    }
    _activeFlag = false;
}
/**
 * Inits caching with the default uuid of DMLScript
 *
 * @throws IOException if IOException occurs
 */
public synchronized static void initCaching()
throws IOException
{
initCaching(DMLScript.getUUID());
}
/**
 * Creates the DML-script-specific caching working dir.
 *
 * Takes the UUID in order to allow for custom uuid, e.g., for remote parfor caching
 *
 * @param uuid ID
 * @throws IOException if IOException occurs
 */
public synchronized static void initCaching( String uuid )
throws IOException
{
try
{
//NOTE(review): uuid is not referenced in this body; presumably
//LocalFileUtils.getWorkingDir derives the path from it indirectly
//or via global state -- TODO confirm against LocalFileUtils
String dir = LocalFileUtils.getWorkingDir( LocalFileUtils.CATEGORY_CACHE );
LocalFileUtils.createLocalFileIfNotExist(dir);
cacheEvictionLocalFilePath = dir;
}
catch(DMLRuntimeException e)
{
//wrap as IOException to match the declared contract
throw new IOException(e);
}
//init write-ahead buffer
LazyWriteBuffer.init();
_refBCs.set(0);
_activeFlag = true; //turn on caching
}
/** Indicates if caching (buffer pool and local eviction) is currently active. */
public static synchronized boolean isCachingActive() {
return _activeFlag;
}
/** Globally disables caching (does not clean up existing cache state). */
public static synchronized void disableCaching() {
_activeFlag = false;
}
/** Globally (re-)enables caching; assumes the cache dir was initialized before. */
public static synchronized void enableCaching() {
_activeFlag = true;
}
/**
 * Moves this object's persisted data to the given HDFS file name, either by
 * exporting (re-writing) the data or by renaming the existing HDFS file.
 *
 * Export is chosen when the in-memory state is dirty, when the requested
 * output format differs (and the object is empty in memory), or when an RDD
 * handle exists but the backing HDFS file is missing; a cheap rename is used
 * when the target format matches the current one.
 *
 * @param fName target HDFS file name
 * @param outputFormat requested output format string
 * @return true if data was moved (exported or renamed); false if neither
 *         branch applied
 * @throws CacheException wrapping any underlying failure during export,
 *         rename, or metadata write
 */
public synchronized boolean moveData(String fName, String outputFormat)
throws CacheException
{
boolean ret = false;
try
{
//export or rename to target file on hdfs
if( (isDirty() || (!isEqualOutputFormat(outputFormat) && isEmpty(true))) ||
(getRDDHandle() != null && !MapReduceTool.existsFileOnHDFS(_hdfsFileName)))
{
exportData(fName, outputFormat);
ret = true;
}
else if( isEqualOutputFormat(outputFormat) )
{
//same format: rename is sufficient, but remove any stale target first
MapReduceTool.deleteFileIfExistOnHDFS(fName);
MapReduceTool.deleteFileIfExistOnHDFS(fName+".mtd");
MapReduceTool.renameFileOnHDFS( _hdfsFileName, fName );
writeMetaData( fName, outputFormat, null );
ret = true;
}
}
catch (Exception e)
{
throw new CacheException ("Move to " + fName + " failed.", e);
}
return ret;
}
/**
 * Returns a human-readable summary of this object: class name, HDFS file
 * name, matrix characteristics and input format (if matrix metadata is
 * available), and the dirty flag.
 *
 * @return debug string representation of this cacheable data object
 */
@Override //was missing on this Object.toString override
public String toString() {
    StringBuilder str = new StringBuilder();
    str.append(getClass().getSimpleName());
    str.append(": ");
    //separate appends instead of string concat inside append
    str.append(_hdfsFileName);
    str.append(", ");
    if (_metaData instanceof NumItemsByEachReducerMetaData) {
        str.append("NumItemsByEachReducerMetaData");
    } else {
        try {
            MatrixFormatMetaData md = (MatrixFormatMetaData) _metaData;
            if (md != null) {
                MatrixCharacteristics mc = ((MatrixDimensionsMetaData) _metaData).getMatrixCharacteristics();
                str.append(mc.toString());
                InputInfo ii = md.getInputInfo();
                if (ii == null)
                    str.append("null");
                else {
                    str.append(", ");
                    str.append(InputInfo.inputInfoToString(ii));
                }
            } else {
                str.append("null, null");
            }
        } catch (Exception ex) {
            //best-effort debug output: never fail in toString
            LOG.error(ex);
        }
    }
    str.append(", ");
    str.append(isDirty() ? "dirty" : "not-dirty");
    return str.toString();
}
}