org.apache.sysml.hops.OptimizerUtils Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of systemml Show documentation
Show all versions of systemml Show documentation
Declarative Machine Learning
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.sysml.hops;
import java.util.HashMap;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.sysml.api.DMLScript;
import org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM;
import org.apache.sysml.conf.ConfigurationManager;
import org.apache.sysml.conf.DMLConfig;
import org.apache.sysml.hops.Hop.DataOpTypes;
import org.apache.sysml.hops.Hop.FileFormatTypes;
import org.apache.sysml.hops.Hop.OpOp2;
import org.apache.sysml.hops.rewrite.HopRewriteUtils;
import org.apache.sysml.lops.LopProperties.ExecType;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.LocalVariableMap;
import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext;
import org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
import org.apache.sysml.runtime.instructions.cp.Data;
import org.apache.sysml.runtime.instructions.cp.ScalarObject;
import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.OutputInfo;
import org.apache.sysml.runtime.matrix.data.SparseRow;
import org.apache.sysml.runtime.util.UtilFunctions;
import org.apache.sysml.yarn.ropt.YarnClusterAnalyzer;
public class OptimizerUtils
{
private static final Log LOG = LogFactory.getLog(OptimizerUtils.class.getName());
////////////////////////////////////////////////////////
// Optimizer constants and flags (incl tuning knobs) //
////////////////////////////////////////////////////////
/**
* Utilization factor used in deciding whether an operation to be scheduled on CP or MR.
* NOTE: it is important that MEM_UTIL_FACTOR+CacheableData.CACHING_BUFFER_SIZE < 1.0
*/
public static double MEM_UTIL_FACTOR = 0.7d;
/**
* Default memory size, which is used the actual estimate can not be computed
* -- for example, when input/output dimensions are unknown. In case of ROBUST,
* the default is set to a large value so that operations are scheduled on MR.
*/
public static double DEFAULT_SIZE;
public static final long DOUBLE_SIZE = 8;
public static final long INT_SIZE = 4;
public static final long CHAR_SIZE = 1;
public static final long BOOLEAN_SIZE = 1;
public static final double BIT_SIZE = (double)1/8;
public static final double INVALID_SIZE = -1d; // memory estimate not computed
public static final long MAX_NUMCELLS_CP_DENSE = Integer.MAX_VALUE;
/**
* Enables/disables dynamic re-compilation of lops/instructions.
* If enabled, we recompile each program block that contains at least
* one hop that requires re-compilation (e.g., unknown statistics
* during compilation, or program blocks in functions).
*/
public static boolean ALLOW_DYN_RECOMPILATION = true;
public static boolean ALLOW_PARALLEL_DYN_RECOMPILATION = ALLOW_DYN_RECOMPILATION && true;
/**
* Enables/disables to put operations with data-dependent output
* size into individual statement blocks / program blocks.
* Since recompilation is done on the granularity of program blocks
* this enables recompilation of subsequent operations according
* to the actual output size. This rewrite might limit the opportunity
* for piggybacking and therefore should only be applied if
* dyanmic recompilation is enabled as well.
*/
public static boolean ALLOW_INDIVIDUAL_SB_SPECIFIC_OPS = ALLOW_DYN_RECOMPILATION && true;
/**
* Enables common subexpression elimination in dags. There is however, a potential tradeoff
* between computation redundancy and data transfer between MR jobs. Since, we do not reason
* about transferred data yet, this rewrite rule is enabled by default.
*/
public static boolean ALLOW_COMMON_SUBEXPRESSION_ELIMINATION = true;
/**
* Enables constant folding in dags. Constant folding computes simple expressions of binary
* operations and literals and replaces the hop sub-DAG with a new literal operator.
*/
public static boolean ALLOW_CONSTANT_FOLDING = true;
/**
*
*/
public static boolean ALLOW_ALGEBRAIC_SIMPLIFICATION = true;
/**
* Enables if-else branch removal for constant predicates (original literals or
* results of constant folding).
*
*/
public static boolean ALLOW_BRANCH_REMOVAL = true;
/**
*
*/
public static boolean ALLOW_AUTO_VECTORIZATION = true;
/**
* Enables simple expression evaluation for datagen parameters 'rows', 'cols'. Simple
* expressions are defined as binary operations on literals and nrow/ncol. This applies
* only to exact size information.
*/
public static boolean ALLOW_SIZE_EXPRESSION_EVALUATION = true;
/**
* Enables simple expression evaluation for datagen parameters 'rows', 'cols'. Simple
* expressions are defined as binary operations on literals and b(+) or b(*) on nrow/ncol.
* This applies also to worst-case size information.
*/
public static boolean ALLOW_WORSTCASE_SIZE_EXPRESSION_EVALUATION = true;
/**
*
*/
public static boolean ALLOW_RAND_JOB_RECOMPILE = true;
/**
* Enables CP-side data transformation for small files.
*/
public static boolean ALLOW_TRANSFORM_RECOMPILE = true;
/**
* Enables parfor runtime piggybacking of MR jobs into the packed jobs for
* scan sharing.
*/
public static boolean ALLOW_RUNTIME_PIGGYBACKING = true;
/**
* Enables interprocedural analysis between main script and functions as well as functions
* and other functions. This includes, for example, to propagate statistics into functions
* if save to do so (e.g., if called once).
*/
public static boolean ALLOW_INTER_PROCEDURAL_ANALYSIS = true;
/**
* Enables sum product rewrites such as mapmultchains. In the future, this will cover
* all sum-product related rewrites.
*/
public static boolean ALLOW_SUM_PRODUCT_REWRITES = true;
/**
* Enables a specific hop dag rewrite that splits hop dags after csv persistent reads with
* unknown size in order to allow for recompile.
*/
public static boolean ALLOW_SPLIT_HOP_DAGS = true;
/**
* Enables parallel read/write of all text formats (textcell, csv, mm)
* and binary formats (binary block).
*
*/
public static boolean PARALLEL_CP_READ_TEXTFORMATS = true;
public static boolean PARALLEL_CP_WRITE_TEXTFORMATS = true;
public static boolean PARALLEL_CP_READ_BINARYFORMATS = true;
public static boolean PARALLEL_CP_WRITE_BINARYFORMATS = true;
/**
* Specifies a multiplier computing the degree of parallelism of parallel
* text read/write out of the available degree of parallelism. Set it to 1.0
* to get a number of threads equal the number of virtual cores.
*
*/
public static final double PARALLEL_CP_READ_PARALLELISM_MULTIPLIER = 1.0;
public static final double PARALLEL_CP_WRITE_PARALLELISM_MULTIPLIER = 1.0;
/**
* Enables multi-threaded matrix multiply for mm, mmchain, and tsmm.
*
*/
public static boolean PARALLEL_CP_MATRIX_MULTIPLY = true;
/**
* Enables the use of CombineSequenceFileInputFormat with splitsize = 2x hdfs blocksize,
* if sort buffer size large enough and parallelism not hurt. This solves to issues:
* (1) it combines small files (depending on producers), and (2) it reduces task
* latency of large jobs with many tasks by factor 2.
*
*/
public static final boolean ALLOW_COMBINE_FILE_INPUT_FORMAT = true;
//////////////////////
// Optimizer levels //
//////////////////////
private static OptimizationLevel _optLevel = OptimizationLevel.O2_LOCAL_MEMORY_DEFAULT;
/**
* Optimization Types for Compilation
*
* O0 STATIC - Decisions for scheduling operations on CP/MR are based on
* predefined set of rules, which check if the dimensions are below a
* fixed/static threshold (OLD Method of choosing between CP and MR).
* The optimization scope is LOCAL, i.e., per statement block.
* Advanced rewrites like constant folding, common subexpression elimination,
* or inter procedural analysis are NOT applied.
*
* O1 MEMORY_BASED - Every operation is scheduled on CP or MR, solely
* based on the amount of memory required to perform that operation.
* It does NOT take the execution time into account.
* The optimization scope is LOCAL, i.e., per statement block.
* Advanced rewrites like constant folding, common subexpression elimination,
* or inter procedural analysis are NOT applied.
*
* O2 MEMORY_BASED - Every operation is scheduled on CP or MR, solely
* based on the amount of memory required to perform that operation.
* It does NOT take the execution time into account.
* The optimization scope is LOCAL, i.e., per statement block.
* All advanced rewrites are applied. This is the default optimization
* level of SystemML.
*
* O3 GLOBAL TIME_MEMORY_BASED - Operation scheduling on CP or MR as well as
* many other rewrites of data flow properties such as block size, partitioning,
* replication, vectorization, etc are done with the optimization objective of
* minimizing execution time under hard memory constraints per operation and
* execution context. The optimization scope if GLOBAL, i.e., program-wide.
* All advanced rewrites are applied. This optimization level requires more
* optimization time but has higher optimization potential.
*
* O4 DEBUG MODE - All optimizations, global and local, which interfere with
* breakpoints are NOT applied. This optimization level is REQUIRED for the
* compiler running in debug mode.
*/
public enum OptimizationLevel {
O0_LOCAL_STATIC,
O1_LOCAL_MEMORY_MIN,
O2_LOCAL_MEMORY_DEFAULT,
O3_LOCAL_RESOURCE_TIME_MEMORY,
O4_GLOBAL_TIME_MEMORY,
O5_DEBUG_MODE,
};
public static OptimizationLevel getOptLevel() {
return _optLevel;
}
public static boolean isMemoryBasedOptLevel() {
return (_optLevel != OptimizationLevel.O0_LOCAL_STATIC);
}
public static boolean isOptLevel( OptimizationLevel level ){
return (_optLevel == level);
}
/**
*
* @param optlevel
* @throws DMLRuntimeException
*/
public static void setOptimizationLevel( int optlevel )
throws DMLRuntimeException
{
if( optlevel < 0 || optlevel > 5 )
throw new DMLRuntimeException("Error: invalid optimization level '"+optlevel+"' (valid values: 0-5).");
// This overrides any optimization level that is present in the configuration file.
// Why ? This simplifies the calling logic: User doesnot have to maintain two config file or worse
// edit config file everytime he/she is trying to call the debugger.
if(DMLScript.ENABLE_DEBUG_MODE) {
optlevel = 5;
}
switch( optlevel )
{
// opt level 0: static dimensionality
case 0:
_optLevel = OptimizationLevel.O0_LOCAL_STATIC;
ALLOW_CONSTANT_FOLDING = false;
ALLOW_COMMON_SUBEXPRESSION_ELIMINATION = false;
ALLOW_ALGEBRAIC_SIMPLIFICATION = false;
ALLOW_AUTO_VECTORIZATION = false;
ALLOW_INTER_PROCEDURAL_ANALYSIS = false;
ALLOW_BRANCH_REMOVAL = false;
ALLOW_SUM_PRODUCT_REWRITES = false;
break;
// opt level 1: memory-based (no advanced rewrites)
case 1:
_optLevel = OptimizationLevel.O1_LOCAL_MEMORY_MIN;
ALLOW_CONSTANT_FOLDING = false;
ALLOW_COMMON_SUBEXPRESSION_ELIMINATION = false;
ALLOW_ALGEBRAIC_SIMPLIFICATION = false;
ALLOW_AUTO_VECTORIZATION = false;
ALLOW_INTER_PROCEDURAL_ANALYSIS = false;
ALLOW_BRANCH_REMOVAL = false;
ALLOW_SUM_PRODUCT_REWRITES = false;
break;
// opt level 2: memory-based (all advanced rewrites)
case 2:
_optLevel = OptimizationLevel.O2_LOCAL_MEMORY_DEFAULT;
break;
// opt level 3: resource optimization, time- and memory-based (2 w/ resource optimizat)
case 3:
_optLevel = OptimizationLevel.O3_LOCAL_RESOURCE_TIME_MEMORY;
break;
// opt level 3: global, time- and memory-based (all advanced rewrites)
case 4:
_optLevel = OptimizationLevel.O4_GLOBAL_TIME_MEMORY;
break;
// opt level 4: debug mode (no interfering rewrites)
case 5:
_optLevel = OptimizationLevel.O5_DEBUG_MODE;
ALLOW_CONSTANT_FOLDING = false;
ALLOW_COMMON_SUBEXPRESSION_ELIMINATION = false;
ALLOW_ALGEBRAIC_SIMPLIFICATION = false;
ALLOW_INTER_PROCEDURAL_ANALYSIS = false;
ALLOW_BRANCH_REMOVAL = false;
ALLOW_DYN_RECOMPILATION = false;
ALLOW_SIZE_EXPRESSION_EVALUATION = false;
ALLOW_WORSTCASE_SIZE_EXPRESSION_EVALUATION = false;
ALLOW_RAND_JOB_RECOMPILE = false;
ALLOW_SUM_PRODUCT_REWRITES = false;
ALLOW_SPLIT_HOP_DAGS = false;
break;
}
setDefaultSize();
//handle parallel text io (incl awareness of thread contention in 1 )
{
LOG.warn("Auto-disable multi-threaded text read for 'text' and 'csv' due to thread contention on JRE < 1.8"
+ " (java.version="+ System.getProperty("java.version")+").");
//disable parallel text read
PARALLEL_CP_READ_TEXTFORMATS = false;
}
//handle parallel matrix mult / rand configuration
if (!ConfigurationManager.getConfig().getBooleanValue(DMLConfig.CP_PARALLEL_MATRIXMULT)) {
PARALLEL_CP_MATRIX_MULTIPLY = false;
}
}
/**
*
*/
public static void setDefaultSize()
{
//we need to set default_size larger than any execution context
//memory budget, however, it should not produce overflows on sum
DEFAULT_SIZE = Math.max( InfrastructureAnalyzer.getLocalMaxMemory(),
Math.max(InfrastructureAnalyzer.getRemoteMaxMemoryMap(),
InfrastructureAnalyzer.getRemoteMaxMemoryReduce()));
}
/**
* Returns memory budget (according to util factor) in bytes
*
* @param localOnly specifies if only budget of current JVM or also MR JVMs
* @return
*/
public static double getLocalMemBudget()
{
double ret = InfrastructureAnalyzer.getLocalMaxMemory();
return ret * OptimizerUtils.MEM_UTIL_FACTOR;
}
/**
*
* @return
*/
public static double getRemoteMemBudgetMap()
{
return getRemoteMemBudgetMap(false);
}
/**
*
* @return
*/
public static double getRemoteMemBudgetMap(boolean substractSortBuffer)
{
double ret = InfrastructureAnalyzer.getRemoteMaxMemoryMap();
if( substractSortBuffer )
ret -= InfrastructureAnalyzer.getRemoteMaxMemorySortBuffer();
return ret * OptimizerUtils.MEM_UTIL_FACTOR;
}
/**
*
* @return
*/
public static double getRemoteMemBudgetReduce()
{
double ret = InfrastructureAnalyzer.getRemoteMaxMemoryReduce();
return ret * OptimizerUtils.MEM_UTIL_FACTOR;
}
/**
*
* @param size
* @return
*/
public static boolean checkSparkBroadcastMemoryBudget( double size )
{
double memBudgetExec = SparkExecutionContext.getBroadcastMemoryBudget();
double memBudgetLocal = OptimizerUtils.getLocalMemBudget();
//basic requirement: the broadcast needs to to fit once in the remote broadcast memory
//and twice into the local memory budget because we have to create a partitioned broadcast
//memory and hand it over to the spark context as in-memory object
return ( size < memBudgetExec && 2*size < memBudgetLocal );
}
/**
*
* @param rlen
* @param clen
* @param brlen
* @param bclen
* @param nnz
* @return
*/
public static boolean checkSparkBroadcastMemoryBudget( long rlen, long clen, long brlen, long bclen, long nnz )
{
double memBudgetExec = SparkExecutionContext.getBroadcastMemoryBudget();
double memBudgetLocal = OptimizerUtils.getLocalMemBudget();
double sp = getSparsity(rlen, clen, nnz);
double size = estimateSizeExactSparsity(rlen, clen, sp);
double sizeP = estimatePartitionedSizeExactSparsity(rlen, clen, brlen, bclen, sp);
//basic requirement: the broadcast needs to to fit once in the remote broadcast memory
//and twice into the local memory budget because we have to create a partitioned broadcast
//memory and hand it over to the spark context as in-memory object
return ( OptimizerUtils.isValidCPDimensions(rlen, clen)
&& sizeP < memBudgetExec && size+sizeP < memBudgetLocal );
}
/**
*
* @param mc
* @param memPinned
* @return
*/
public static boolean checkSparkCollectMemoryBudget( MatrixCharacteristics mc, long memPinned )
{
return checkSparkCollectMemoryBudget(
mc.getRows(),
mc.getCols(),
mc.getRowsPerBlock(),
mc.getColsPerBlock(),
mc.getNonZeros(), memPinned);
}
/**
*
* @param rlen
* @param clen
* @param brlen
* @param bclen
* @param nnz
* @return
*/
public static boolean checkSparkCollectMemoryBudget( long rlen, long clen, int brlen, int bclen, long nnz, long memPinned )
{
//compute size of output matrix and its blocked representation
double sp = getSparsity(rlen, clen, nnz);
double memMatrix = estimateSizeExactSparsity(rlen, clen, sp);
double memPMatrix = estimatePartitionedSizeExactSparsity(rlen, clen, brlen, bclen, sp);
//check if both output matrix and partitioned matrix fit into local mem budget
return (memPinned + memMatrix + memPMatrix < getLocalMemBudget());
}
/**
* Returns the number of reducers that potentially run in parallel.
* This is either just the configured value (SystemML config) or
* the minimum of configured value and available reduce slots.
*
* @param configOnly
* @return
*/
public static int getNumReducers( boolean configOnly )
{
int ret = ConfigurationManager.getConfig().getIntValue(DMLConfig.NUM_REDUCERS);
if( !configOnly ) {
ret = Math.min(ret,InfrastructureAnalyzer.getRemoteParallelReduceTasks());
//correction max number of reducers on yarn clusters
if( InfrastructureAnalyzer.isYarnEnabled() )
ret = (int)Math.max( ret, YarnClusterAnalyzer.getNumCores()/2 );
}
return ret;
}
/**
*
* @return
*/
public static int getNumMappers()
{
int ret = InfrastructureAnalyzer.getRemoteParallelMapTasks();
//correction max number of reducers on yarn clusters
if( InfrastructureAnalyzer.isYarnEnabled() )
ret = (int)Math.max( ret, YarnClusterAnalyzer.getNumCores() );
return ret;
}
/**
*
* @return
*/
public static boolean isSparkExecutionMode() {
return ( DMLScript.rtplatform == RUNTIME_PLATFORM.SPARK
|| DMLScript.rtplatform == RUNTIME_PLATFORM.HYBRID_SPARK);
}
/**
*
* @return
*/
public static boolean isHybridExecutionMode() {
return ( DMLScript.rtplatform == RUNTIME_PLATFORM.HYBRID
|| DMLScript.rtplatform == RUNTIME_PLATFORM.HYBRID_SPARK );
}
/**
* Returns the degree of parallelism used for parallel text read.
* This is computed as the number of virtual cores scales by the
* PARALLEL_READ_PARALLELISM_MULTIPLIER. If PARALLEL_READ_TEXTFORMATS
* is disabled, this method returns 1.
*
* @return
*/
public static int getParallelTextReadParallelism()
{
if( !PARALLEL_CP_READ_TEXTFORMATS )
return 1; // sequential execution
//compute degree of parallelism for parallel text read
double dop = InfrastructureAnalyzer.getLocalParallelism()
* PARALLEL_CP_READ_PARALLELISM_MULTIPLIER;
return (int) Math.round(dop);
}
/**
*
* @return
*/
public static int getParallelBinaryReadParallelism()
{
if( !PARALLEL_CP_READ_BINARYFORMATS )
return 1; // sequential execution
//compute degree of parallelism for parallel text read
double dop = InfrastructureAnalyzer.getLocalParallelism()
* PARALLEL_CP_READ_PARALLELISM_MULTIPLIER;
return (int) Math.round(dop);
}
/**
* Returns the degree of parallelism used for parallel text write.
* This is computed as the number of virtual cores scales by the
* PARALLEL_WRITE_PARALLELISM_MULTIPLIER. If PARALLEL_WRITE_TEXTFORMATS
* is disabled, this method returns 1.
*
* @return
*/
public static int getParallelTextWriteParallelism()
{
if( !PARALLEL_CP_WRITE_TEXTFORMATS )
return 1; // sequential execution
//compute degree of parallelism for parallel text read
double dop = InfrastructureAnalyzer.getLocalParallelism()
* PARALLEL_CP_WRITE_PARALLELISM_MULTIPLIER;
return (int) Math.round(dop);
}
/**
*
* @return
*/
public static int getParallelBinaryWriteParallelism()
{
if( !PARALLEL_CP_WRITE_BINARYFORMATS )
return 1; // sequential execution
//compute degree of parallelism for parallel text read
double dop = InfrastructureAnalyzer.getLocalParallelism()
* PARALLEL_CP_WRITE_PARALLELISM_MULTIPLIER;
return (int) Math.round(dop);
}
////////////////////////
// Memory Estimates //
////////////////////////
/**
*
* @param mc
* @return
*/
public static long estimateSizeExactSparsity(MatrixCharacteristics mc)
{
return estimateSizeExactSparsity(
mc.getRows(),
mc.getCols(),
mc.getNonZeros());
}
/**
* Estimates the footprint (in bytes) for an in-memory representation of a
* matrix with dimensions=(nrows,ncols) and and number of non-zeros nnz.
*
* @param nrows
* @param ncols
* @param sp
* @return
*/
public static long estimateSizeExactSparsity(long nrows, long ncols, long nnz)
{
double sp = getSparsity(nrows, ncols, nnz);
return estimateSizeExactSparsity(nrows, ncols, sp);
}
/**
* Estimates the footprint (in bytes) for an in-memory representation of a
* matrix with dimensions=(nrows,ncols) and sparsity=sp.
*
* This function can be used directly in Hops, when the actual sparsity is
* known i.e., sp
is guaranteed to give worst-case estimate
* (e.g., Rand with a fixed sparsity). In all other cases, estimateSize()
* must be used so that worst-case estimates are computed, whenever
* applicable.
*
* @param nrows
* @param ncols
* @param sp
* @return
*/
public static long estimateSizeExactSparsity(long nrows, long ncols, double sp)
{
return MatrixBlock.estimateSizeInMemory(nrows,ncols,sp);
}
/**
* Estimates the footprint (in bytes) for a partitioned in-memory representation of a
* matrix with the given matrix characteristics
*
* @param mc
* @return
*/
public static long estimatePartitionedSizeExactSparsity(MatrixCharacteristics mc)
{
return estimatePartitionedSizeExactSparsity(
mc.getRows(),
mc.getCols(),
mc.getRowsPerBlock(),
mc.getColsPerBlock(),
mc.getNonZeros());
}
/**
* Estimates the footprint (in bytes) for a partitioned in-memory representation of a
* matrix with dimensions=(nrows,ncols) and number of non-zeros nnz.
*
* @param nrows
* @param ncols
* @param sp
* @return
*/
public static long estimatePartitionedSizeExactSparsity(long rlen, long clen, long brlen, long bclen, long nnz)
{
double sp = getSparsity(rlen, clen, nnz);
return estimatePartitionedSizeExactSparsity(rlen, clen, brlen, bclen, sp);
}
/**
* Estimates the footprint (in bytes) for a partitioned in-memory representation of a
* matrix with dimensions=(nrows,ncols) and sparsity=sp.
*
* @param nrows
* @param ncols
* @param sp
* @return
*/
public static long estimatePartitionedSizeExactSparsity(long rlen, long clen, long brlen, long bclen, double sp)
{
long ret = 0;
//check for guaranteed existence of empty blocks (less nnz than total number of blocks)
long tnrblks = (long)Math.ceil((double)rlen/brlen);
long tncblks = (long)Math.ceil((double)clen/bclen);
long nnz = (long) Math.ceil(sp * rlen * clen);
if( nnz < tnrblks * tncblks ) {
long lrlen = Math.min(rlen, brlen);
long lclen = Math.min(clen, bclen);
return nnz * estimateSizeExactSparsity(lrlen, lclen, 1)
+ (tnrblks * tncblks - nnz) * estimateSizeEmptyBlock(lrlen, lclen);
}
//estimate size of full brlen x bclen blocks
long nrblks = rlen / brlen;
long ncblks = clen / bclen;
if( nrblks * ncblks > 0 )
ret += nrblks * ncblks * estimateSizeExactSparsity(brlen, bclen, sp);
//estimate size of bottom boundary blocks
long lrlen = rlen % brlen;
if( ncblks > 0 && lrlen > 0 )
ret += ncblks * estimateSizeExactSparsity(lrlen, bclen, sp);
//estimate size of right boundary blocks
long lclen = clen % bclen;
if( nrblks > 0 && lclen > 0 )
ret += nrblks * estimateSizeExactSparsity(brlen, lclen, sp);
//estimate size of bottom right boundary block
if( lrlen > 0 && lclen > 0 )
ret += estimateSizeExactSparsity(lrlen, lclen, sp);
return ret;
}
/**
* Similar to estimate() except that it provides worst-case estimates
* when the optimization type is ROBUST.
*
* @param nrows
* @param ncols
* @param sp
* @return
*/
public static long estimateSize(long nrows, long ncols)
{
return estimateSizeExactSparsity(nrows, ncols, 1.0);
}
/**
*
* @param nrows
* @param ncols
* @return
*/
public static long estimateSizeEmptyBlock(long nrows, long ncols)
{
return estimateSizeExactSparsity(0, 0, 0.0d);
}
/**
* Estimates the memory footprint of a SparseRow with clen
* columns and sp
sparsity. This method accounts for the
* overhead incurred by extra cells allocated (but not used) for SparseRow.
* It assumes that non-zeros are uniformly distributed in the matrix --
* i.e., #estimated nnz in a given SparseRow = clen*sp.
*
* @param clen
* @param sp
* @return estimated size in bytes
*/
public static long estimateRowSize(long clen, double sp)
{
if ( sp == 0 )
return 0;
int basicSize = 28;
int cellSize = 12; // every cell takes 12 (8+4) bytes
if ( sp == 1 ) {
return clen * cellSize;
}
long numCells = SparseRow.initialCapacity;
if ( (long) (sp*clen) > numCells ) {
numCells = (long) (sp*clen);
}
long allocatedCells = (long)Math.pow(2, Math.ceil(Math.log(numCells)/Math.log(2)) );
long rowSize = basicSize + allocatedCells * cellSize;
return rowSize;
}
public static long estimateSizeTextOutput( long rows, long cols, long nnz, OutputInfo oinfo )
{
long bsize = MatrixBlock.estimateSizeOnDisk(rows, cols, nnz);
if( oinfo == OutputInfo.TextCellOutputInfo || oinfo == OutputInfo.MatrixMarketOutputInfo )
return bsize * 3;
else if( oinfo == OutputInfo.CSVOutputInfo )
return bsize * 2;
//unknown output info
return bsize;
}
/**
* Returns false if dimensions known to be invalid; other true
*
* @param rows
* @param cols
* @return
*/
public static boolean isValidCPDimensions( long rows, long cols )
{
//the current CP runtime implementation requires that rows and cols
//are integers since we use a single matrixblock to represent the
//entire matrix
return (rows <= Integer.MAX_VALUE && cols<=Integer.MAX_VALUE);
}
/**
* Determines if valid matrix size to be represented in CP data structures. Note that
* sparsity needs to be specified as rows*cols if unknown.
*
* @param rows
* @param cols
* @param sparsity
* @return
*/
public static boolean isValidCPMatrixSize( long rows, long cols, double sparsity )
{
boolean ret = true;
//the current CP runtime implementation has several limitations:
//1) for dense: 16GB because we use a linearized array (bounded to int in java)
//2) for sparse: 2G x 2G nnz because (1) nnz maintained as long, (2) potential changes
// to dense, and (3) sparse row arrays also of max int size (worst case in case of skew)
long nnz = (long)(sparsity * rows * cols);
boolean sparse = MatrixBlock.evalSparseFormatInMemory(rows, cols, nnz);
if( sparse ) //SPARSE
{
//check max nnz
ret = (nnz <= Long.MAX_VALUE);
}
else //DENSE
{
//check number of matrix cell
ret = ((rows * cols) <= MAX_NUMCELLS_CP_DENSE);
}
return ret;
}
/**
*
* @return
* @throws HopsException
*/
public static boolean allowsToFilterEmptyBlockOutputs( Hop hop )
throws HopsException
{
boolean ret = true;
for( Hop p : hop.getParent() ) {
p.optFindExecType(); //ensure exec type evaluated
ret &= ( p.getExecType()==ExecType.CP
||(p instanceof AggBinaryOp && allowsToFilterEmptyBlockOutputs(p) )
||(p instanceof DataOp && ((DataOp)p).getDataOpType()==DataOpTypes.PERSISTENTWRITE && ((DataOp)p).getInputFormatType()==FileFormatTypes.TEXT))
&& !(p instanceof FunctionOp || (p instanceof DataOp && ((DataOp)p).getInputFormatType()!=FileFormatTypes.TEXT) ); //no function call or transient write
}
return ret;
}
/**
*
* @return
*/
public static int getConstrainedNumThreads(int maxNumThreads)
{
//by default max local parallelism (vcores)
int ret = InfrastructureAnalyzer.getLocalParallelism();
//apply external max constraint (e.g., set by parfor or other rewrites)
if( maxNumThreads > 0 ) {
ret = Math.min(ret, maxNumThreads);
}
//apply global multi-threading constraint
if( !PARALLEL_CP_MATRIX_MULTIPLY ) {
ret = 1;
}
return ret;
}
////////////////////////
// Sparsity Estimates //
////////////////////////
/**
* Estimates the result sparsity for Matrix Multiplication A %*% B.
*
* @param sp1 -- sparsity of A
* @param sp2 -- sparsity of B
* @param m -- nrow(A)
* @param k -- ncol(A), nrow(B)
* @param n -- ncol(B)
* @return
*/
public static double getMatMultSparsity(double sp1, double sp2, long m, long k, long n, boolean worstcase)
{
if( worstcase ){
double nnz1 = sp1 * m * k;
double nnz2 = sp2 * k * n;
return Math.min(1, nnz1/m) * Math.min(1, nnz2/n);
}
else
return (1 - Math.pow(1-sp1*sp2, k) );
}
/**
*
* @param rlen1
* @param clen1
* @param nnz1
* @param rlen2
* @param clen2
* @param nnz2
* @return
*/
public static double getLeftIndexingSparsity( long rlen1, long clen1, long nnz1, long rlen2, long clen2, long nnz2 )
{
boolean scalarRhs = (rlen2==0 && clen2==0);
//infer output worstcase output nnz
long lnnz = -1;
if( nnz1>=0 && scalarRhs )
lnnz = nnz1+1; // nnz(left) + scalar
else if( nnz1>=0 && nnz2>=0 )
lnnz = nnz1 + nnz2; // nnz(left) + nnz(right)
else if( nnz1>=0 && rlen2>0 && clen2>0 )
lnnz = nnz1 + rlen2*clen2; // nnz(left) + nnz(right_dense)
lnnz = Math.min(lnnz, rlen1*clen1);
return getSparsity(rlen1, clen1, (lnnz>=0) ? lnnz : rlen1*clen1);
}
/**
* Determines if a given binary op is potentially conditional sparse safe.
*
* @param op
* @return
*/
public static boolean isBinaryOpConditionalSparseSafe( OpOp2 op )
{
return ( op==OpOp2.GREATER
|| op==OpOp2.LESS
|| op==OpOp2.NOTEQUAL
|| op==OpOp2.EQUAL
|| op==OpOp2.MINUS);
}
/**
* Determines if a given binary op with scalar literal guarantee an output
* sparsity which is exactly the same as its matrix input sparsity.
*
* @param op
* @param lit
* @return
*/
public static boolean isBinaryOpConditionalSparseSafeExact( OpOp2 op, LiteralOp lit )
{
double val = HopRewriteUtils.getDoubleValueSafe(lit);
return ( op==OpOp2.NOTEQUAL && val==0);
}
/**
*
* @param sp1
* @param op
* @param lit
* @return
*/
public static double getBinaryOpSparsityConditionalSparseSafe( double sp1, OpOp2 op, LiteralOp lit )
{
double val = HopRewriteUtils.getDoubleValueSafe(lit);
return ( (op==OpOp2.GREATER && val==0)
||(op==OpOp2.LESS && val==0)
||(op==OpOp2.NOTEQUAL && val==0)
||(op==OpOp2.EQUAL && val!=0)
||(op==OpOp2.MINUS && val==0)) ? sp1 : 1.0;
}
/**
* Estimates the result sparsity for matrix-matrix binary operations (A op B)
*
* @param sp1 -- sparsity of A
* @param sp2 -- sparsity of B
* @param op -- binary operation
* @return
*
* NOTE: append has specific computation
*/
public static double getBinaryOpSparsity(double sp1, double sp2, OpOp2 op, boolean worstcase)
{
// default is worst-case estimate for robustness
double ret = 1.0;
if( worstcase )
{
//NOTE: for matrix-scalar operations this estimate is too conservative, because
//Math.min(1, sp1 + sp2) will always give a sparsity 1 if we pass sp2=1 for scalars.
//In order to do better (with guarantees), we need to take the actual values into account
switch(op)
{
case PLUS:
case MINUS:
case LESS:
case GREATER:
case NOTEQUAL:
case MIN:
case MAX:
case OR:
ret = Math.min(1, sp1 + sp2); break;
case MULT:
case AND:
ret = Math.min(sp1, sp2); break;
case DIV:
case MODULUS:
case POW:
case MINUS_NZ:
case LOG_NZ:
ret = sp1; break;
//case EQUAL: //doesnt work on worstcase estimates, but on
// ret = 1-Math.abs(sp1-sp2); break;
default:
ret = 1.0;
}
}
else
{
switch(op) {
case PLUS:
case MINUS:
// result[i,j] != 0 iff A[i,j] !=0 || B[i,j] != 0
// worst case estimate = sp1+sp2
ret = (1 - (1-sp1)*(1-sp2));
break;
case MULT:
// result[i,j] != 0 iff A[i,j] !=0 && B[i,j] != 0
// worst case estimate = min(sp1,sp2)
ret = sp1 * sp2;
break;
case DIV:
ret = 1.0; // worst case estimate
break;
case LESS:
case LESSEQUAL:
case GREATER:
case GREATEREQUAL:
case EQUAL:
case NOTEQUAL:
ret = 1.0; // purely data-dependent operations, and hence worse-case estimate
break;
//MIN, MAX, AND, OR, LOG, POW
default:
ret = 1.0;
}
}
return ret;
}
public static double getSparsity( long dim1, long dim2, long nnz )
{
if( dim1<=0 || dim2<=0 || nnz<0 )
return 1.0;
else
return Math.min(((double)nnz)/dim1/dim2, 1.0);
}
public static String toMB(double inB) {
if ( inB < 0 )
return "-";
return String.format("%.0f", inB/(1024*1024) );
}
/**
* Function to evaluate simple size expressions over literals and now/ncol.
*
* It returns the exact results of this expressions if known, otherwise
* Long.MAX_VALUE if unknown.
*
* @param root
* @return
* @throws HopsException
*/
public static long rEvalSimpleLongExpression( Hop root, HashMap valMemo )
throws HopsException
{
long ret = Long.MAX_VALUE;
//for simplicity and robustness call double and cast.
HashMap dvalMemo = new HashMap();
double tmp = rEvalSimpleDoubleExpression(root, dvalMemo);
if( tmp!=Double.MAX_VALUE )
ret = UtilFunctions.toLong( tmp );
return ret;
}
/**
*
* @param root
* @param valMemo
* @param vars
* @return
* @throws HopsException
*/
public static long rEvalSimpleLongExpression( Hop root, HashMap valMemo, LocalVariableMap vars )
throws HopsException
{
long ret = Long.MAX_VALUE;
//for simplicity and robustness call double and cast.
HashMap dvalMemo = new HashMap();
double tmp = rEvalSimpleDoubleExpression(root, dvalMemo, vars);
if( tmp!=Double.MAX_VALUE )
ret = UtilFunctions.toLong( tmp );
return ret;
}
/**
*
* @param root
* @param valMemo
* @return
* @throws HopsException
*/
public static double rEvalSimpleDoubleExpression( Hop root, HashMap valMemo )
throws HopsException
{
//memoization (prevent redundant computation of common subexpr)
if( valMemo.containsKey(root.getHopID()) )
return valMemo.get(root.getHopID());
double ret = Double.MAX_VALUE;
//always use constants
if( root instanceof LiteralOp )
ret = HopRewriteUtils.getDoubleValue((LiteralOp)root);
//advanced size expression evaluation
if( OptimizerUtils.ALLOW_SIZE_EXPRESSION_EVALUATION )
{
if( root instanceof UnaryOp )
ret = rEvalSimpleUnaryDoubleExpression(root, valMemo);
else if( root instanceof BinaryOp )
ret = rEvalSimpleBinaryDoubleExpression(root, valMemo);
}
valMemo.put(root.getHopID(), ret);
return ret;
}
/**
*
* @param root
* @param valMemo
* @param vars
* @return
* @throws HopsException
*/
public static double rEvalSimpleDoubleExpression( Hop root, HashMap valMemo, LocalVariableMap vars )
throws HopsException
{
//memoization (prevent redundant computation of common subexpr)
if( valMemo.containsKey(root.getHopID()) )
return valMemo.get(root.getHopID());
double ret = Double.MAX_VALUE;
if( OptimizerUtils.ALLOW_SIZE_EXPRESSION_EVALUATION )
{
if( root instanceof LiteralOp )
ret = HopRewriteUtils.getDoubleValue((LiteralOp)root);
else if( root instanceof UnaryOp )
ret = rEvalSimpleUnaryDoubleExpression(root, valMemo, vars);
else if( root instanceof BinaryOp )
ret = rEvalSimpleBinaryDoubleExpression(root, valMemo, vars);
else if( root instanceof DataOp ) {
String name = root.getName();
Data dat = vars.get(name);
if( dat!=null && dat instanceof ScalarObject )
ret = ((ScalarObject)dat).getDoubleValue();
}
}
valMemo.put(root.getHopID(), ret);
return ret;
}
/**
*
* @param root
* @param valMemo
* @return
* @throws HopsException
*/
protected static double rEvalSimpleUnaryDoubleExpression( Hop root, HashMap valMemo )
throws HopsException
{
//memoization (prevent redundant computation of common subexpr)
if( valMemo.containsKey(root.getHopID()) )
return valMemo.get(root.getHopID());
double ret = Double.MAX_VALUE;
UnaryOp uroot = (UnaryOp) root;
Hop input = uroot.getInput().get(0);
if(uroot.getOp() == Hop.OpOp1.NROW)
ret = (input.getDim1()>0) ? input.getDim1() : Double.MAX_VALUE;
else if( uroot.getOp() == Hop.OpOp1.NCOL )
ret = (input.getDim2()>0) ? input.getDim2() : Double.MAX_VALUE;
else
{
double lval = rEvalSimpleDoubleExpression(uroot.getInput().get(0), valMemo);
if( lval != Double.MAX_VALUE )
{
switch( uroot.getOp() )
{
case SQRT: ret = Math.sqrt(lval); break;
case ROUND: ret = Math.round(lval); break;
case CAST_AS_BOOLEAN: ret = (lval!=0)? 1 : 0; break;
case CAST_AS_INT: ret = UtilFunctions.toLong(lval); break;
case CAST_AS_DOUBLE: ret = lval; break;
default: ret = Double.MAX_VALUE;
}
}
}
valMemo.put(root.getHopID(), ret);
return ret;
}
/**
*
* @param root
* @param valMemo
* @param vars
* @return
* @throws HopsException
*/
protected static double rEvalSimpleUnaryDoubleExpression( Hop root, HashMap valMemo, LocalVariableMap vars )
throws HopsException
{
//memoization (prevent redundant computation of common subexpr)
if( valMemo.containsKey(root.getHopID()) )
return valMemo.get(root.getHopID());
double ret = Double.MAX_VALUE;
UnaryOp uroot = (UnaryOp) root;
Hop input = uroot.getInput().get(0);
if(uroot.getOp() == Hop.OpOp1.NROW)
ret = (input.getDim1()>0) ? input.getDim1() : Double.MAX_VALUE;
else if( uroot.getOp() == Hop.OpOp1.NCOL )
ret = (input.getDim2()>0) ? input.getDim2() : Double.MAX_VALUE;
else
{
double lval = rEvalSimpleDoubleExpression(uroot.getInput().get(0), valMemo, vars);
if( lval != Double.MAX_VALUE )
{
switch( uroot.getOp() )
{
case SQRT: ret = Math.sqrt(lval); break;
case ROUND: ret = Math.round(lval); break;
case CAST_AS_BOOLEAN: ret = (lval!=0)? 1 : 0; break;
case CAST_AS_INT: ret = UtilFunctions.toLong(lval); break;
case CAST_AS_DOUBLE: ret = lval; break;
default: ret = Double.MAX_VALUE;
}
}
}
valMemo.put(root.getHopID(), ret);
return ret;
}
/**
*
* @param root
* @param valMemo
* @return
* @throws HopsException
*/
protected static double rEvalSimpleBinaryDoubleExpression( Hop root, HashMap valMemo )
throws HopsException
{
//memoization (prevent redundant computation of common subexpr)
if( valMemo.containsKey(root.getHopID()) )
return valMemo.get(root.getHopID());
double ret = Double.MAX_VALUE;
BinaryOp broot = (BinaryOp) root;
double lret = rEvalSimpleDoubleExpression(broot.getInput().get(0), valMemo);
double rret = rEvalSimpleDoubleExpression(broot.getInput().get(1), valMemo);
//note: positive and negative values might be valid subexpressions
if( lret!=Double.MAX_VALUE && rret!=Double.MAX_VALUE ) //if known
{
switch( broot.getOp() )
{
case PLUS: ret = lret + rret; break;
case MINUS: ret = lret - rret; break;
case MULT: ret = lret * rret; break;
case DIV: ret = lret / rret; break;
case MIN: ret = Math.min(lret, rret); break;
case MAX: ret = Math.max(lret, rret); break;
case POW: ret = Math.pow(lret, rret); break;
default: ret = Double.MAX_VALUE;
}
}
valMemo.put(root.getHopID(), ret);
return ret;
}
/**
*
* @param root
* @param valMemo
* @param vars
* @return
* @throws HopsException
*/
protected static double rEvalSimpleBinaryDoubleExpression( Hop root, HashMap valMemo, LocalVariableMap vars )
throws HopsException
{
//memoization (prevent redundant computation of common subexpr)
if( valMemo.containsKey(root.getHopID()) )
return valMemo.get(root.getHopID());
double ret = Double.MAX_VALUE;
BinaryOp broot = (BinaryOp) root;
double lret = rEvalSimpleDoubleExpression(broot.getInput().get(0), valMemo, vars);
double rret = rEvalSimpleDoubleExpression(broot.getInput().get(1), valMemo, vars);
//note: positive and negative values might be valid subexpressions
if( lret!=Double.MAX_VALUE && rret!=Double.MAX_VALUE ) //if known
{
switch( broot.getOp() )
{
case PLUS: ret = lret + rret; break;
case MINUS: ret = lret - rret; break;
case MULT: ret = lret * rret; break;
case DIV: ret = lret / rret; break;
case MIN: ret = Math.min(lret, rret); break;
case MAX: ret = Math.max(lret, rret); break;
case POW: ret = Math.pow(lret, rret); break;
default: ret = Double.MAX_VALUE;
}
}
valMemo.put(root.getHopID(), ret);
return ret;
}
}