org.apache.sysml.runtime.matrix.data.LibMatrixMult Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of systemml Show documentation
Declarative Machine Learning
There is a newer version: 1.2.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */


package org.apache.sysml.runtime.matrix.data;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

import org.apache.commons.math3.util.FastMath;
import org.apache.sysml.lops.MapMultChain.ChainType;
import org.apache.sysml.lops.WeightedCrossEntropy.WCeMMType;
import org.apache.sysml.lops.WeightedDivMM.WDivMMType;
import org.apache.sysml.lops.WeightedSigmoid.WSigmoidType;
import org.apache.sysml.lops.WeightedSquaredLoss.WeightsType;
import org.apache.sysml.lops.WeightedUnaryMM.WUMMType;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.functionobjects.SwapIndex;
import org.apache.sysml.runtime.functionobjects.ValueFunction;
import org.apache.sysml.runtime.matrix.operators.ReorgOperator;
import org.apache.sysml.runtime.util.UtilFunctions;

/**
 * MB:
 * Library for matrix multiplications including MM, MV, VV for all
 * combinations of dense, sparse, ultrasparse representations and special
 * operations such as transpose-self matrix multiplication.
 * 
 * In general all implementations use internally dense outputs
 * for direct access, but change the final result to sparse if necessary.
 * The only exceptions are ultra-sparse matrix mult, wsloss and wsigmoid.  
 * 
 * NOTES on BLAS:
 * * Experiments in 04/2013 showed that even on dense-dense this implementation 
 *   is 3x faster than f2j-BLAS-DGEMM, 2x faster than f2c-BLAS-DGEMM, and
 *   level (+10% after JIT) with a native C implementation. 
 * * Calling native BLAS would loose platform independence and would require 
 *   JNI calls incl data transfer. Furthermore, BLAS does not support sparse 
 *   matrices (except Sparse BLAS, with dedicated function calls and matrix formats) 
 *   and would be an external dependency. 
 * * Experiments in 02/2014 showed that on dense-dense this implementation now achieves
 *   almost 30% peak FP performance. Compared to Intel MKL 11.1 (dgemm, N=1000) it is
 *   just 3.2x (sparsity=1.0) and 1.9x (sparsity=0.5) slower, respectively.  
 *  
 */
public class LibMatrixMult 
{
	//internal configuration
	private static final boolean LOW_LEVEL_OPTIMIZATION = true;
	private static final long MEM_OVERHEAD_THRESHOLD = 2L*1024*1024; //MAX 2 MB
	private static final long PAR_MINFLOP_THRESHOLD = 2L*1024*1024; //MIN 2 MFLOP
	private static final int L2_CACHESIZE = 256 *1024; //256KB (common size)
	
	private LibMatrixMult() {
		//prevent instantiation via private constructor
	}
	
	////////////////////////////////
	// public matrix mult interface
	////////////////////////////////
	
	/**
	 * Performs a matrix multiplication and stores the result in the output matrix.
	 * 
	 * All variants use a IKJ access pattern, and internally use dense output. After the
	 * actual computation, we recompute nnz and check for sparse/dense representation.
	 *  
	 * 
	 * @param m1 first matrix
	 * @param m2 second matrix
	 * @param ret result matrix
	 * @throws DMLRuntimeException if DMLRuntimeException occurs
	 */
	public static void matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret) 
		throws DMLRuntimeException
	{	
		matrixMult(m1, m2, ret, 0, m1.rlen);
	}
	
	/**
	 * This method allows one to disabling exam sparsity. This feature is useful if matrixMult is used as an intermediate
	 * operation (for example: LibMatrixDNN). It makes sense for LibMatrixDNN because the output is internally
	 * consumed by another dense instruction, which makes repeated conversion to sparse wasteful.
	 * This should be used in rare cases and if you are unsure,
	 * use the method 'matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret)' instead.
	 * 
	 * @param m1 first matrix
	 * @param m2 second matrix
	 * @param ret result matrix
	 * @param examSparsity if false, sparsity examination is disabled
	 * @throws DMLRuntimeException if DMLRuntimeException occurs
	 */
	public static void matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean examSparsity) 
			throws DMLRuntimeException
	{	
		matrixMult(m1, m2, ret, 0, m1.rlen, examSparsity);
	}
	
	public static void matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int rl, int ru) 
			throws DMLRuntimeException
	{
		matrixMult(m1, m2, ret, rl, ru, true);
	}

	public static void matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int rl, int ru, boolean examSparsity) 
		throws DMLRuntimeException
	{	
		//check inputs / outputs
		if( m1.isEmptyBlock(false) || m2.isEmptyBlock(false) ) {
			ret.examSparsity(); //turn empty dense into sparse
			return;
		}
		
		//Timing time = new Timing(true);
		
		//pre-processing: output allocation
		boolean tm2 = checkPrepMatrixMultRightInput(m1,m2);
		m2 = prepMatrixMultRightInput(m1, m2);
		ret.sparse = (m1.isUltraSparse() || m2.isUltraSparse());
		if( !ret.sparse )
			ret.allocateDenseBlock();
		
		//prepare row-upper for special cases of vector-matrix
		boolean pm2 = checkParMatrixMultRightInputRows(m1, m2, Integer.MAX_VALUE);
		int ru2 = (pm2 && ru==m1.rlen) ? m2.rlen : ru; 
		int cu = m2.clen;
		
		//core matrix mult computation
		if( m1.isUltraSparse() || m2.isUltraSparse() )
			matrixMultUltraSparse(m1, m2, ret, 0, ru2);
		else if(!m1.sparse && !m2.sparse)
			matrixMultDenseDense(m1, m2, ret, tm2, pm2, 0, ru2, 0, cu);
		else if(m1.sparse && m2.sparse)
			matrixMultSparseSparse(m1, m2, ret, pm2, 0, ru2);
		else if(m1.sparse)
			matrixMultSparseDense(m1, m2, ret, pm2, 0, ru2);
		else
			matrixMultDenseSparse(m1, m2, ret, pm2, 0, ru2);
		
		//post-processing: nnz/representation
		if( !ret.sparse )
			ret.recomputeNonZeros();
		
		if(examSparsity)
			ret.examSparsity();
		
		//System.out.println("MM ("+m1.isInSparseFormat()+","+m1.getNumRows()+","+m1.getNumColumns()+","+m1.getNonZeros()+")x" +
		//		              "("+m2.isInSparseFormat()+","+m2.getNumRows()+","+m2.getNumColumns()+","+m2.getNonZeros()+") in "+time.stop());
	}
	
	/**
	 * Performs a multi-threaded matrix multiplication and stores the result in the output matrix.
	 * The parameter k (k>=1) determines the max parallelism k' with k'=min(k, vcores, m1.rlen).
	 * 
	 * @param m1 first matrix
	 * @param m2 second matrix
	 * @param ret result matrix
	 * @param k maximum parallelism
	 * @throws DMLRuntimeException if DMLRuntimeException occurs
	 */
	public static void matrixMult(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, int k) 
		throws DMLRuntimeException
	{	
		//check inputs / outputs
		if( m1.isEmptyBlock(false) || m2.isEmptyBlock(false) ) {
			ret.examSparsity(); //turn empty dense into sparse
			return;
		}
		
		//check too high additional vector-matrix memory requirements (fallback to sequential)
		//check too small workload in terms of flops (fallback to sequential too)
		if( m1.rlen == 1 && (8L * m2.clen * k > MEM_OVERHEAD_THRESHOLD || !LOW_LEVEL_OPTIMIZATION || m2.clen==1 || m1.isUltraSparse() || m2.isUltraSparse()) 
			|| 2L * m1.rlen * m1.clen * m2.clen < PAR_MINFLOP_THRESHOLD ) 
		{ 
			matrixMult(m1, m2, ret);
			return;
		}
		
		//Timing time = new Timing(true);
		
		//pre-processing: output allocation (in contrast to single-threaded,
		//we need to allocate sparse as well in order to prevent synchronization)
		boolean tm2 = checkPrepMatrixMultRightInput(m1,m2);
		m2 = prepMatrixMultRightInput(m1, m2);
		ret.sparse = (m1.isUltraSparse() || m2.isUltraSparse());
		if( !ret.sparse )
			ret.allocateDenseBlock();
		else
			ret.allocateSparseRowsBlock();
		
		if (!ret.isThreadSafe()){
			matrixMult(m1, m2, ret);
			return;
		}
		
		//prepare row-upper for special cases of vector-matrix / matrix-matrix
		boolean pm2r = checkParMatrixMultRightInputRows(m1, m2, k);
		boolean pm2c = checkParMatrixMultRightInputCols(m1, m2, k, pm2r);
		int num = pm2r ? m2.rlen : pm2c ? m2.clen : m1.rlen; 
		
		//core multi-threaded matrix mult computation
		//(currently: always parallelization over number of rows)
		try {
			ExecutorService pool = Executors.newFixedThreadPool( k );
			ArrayList tasks = new ArrayList();
			int nk = (pm2r||pm2c) ? k : UtilFunctions.roundToNext(Math.min(8*k,num/32), k);
			ArrayList blklens = getBalancedBlockSizes(num, nk);
			for( int i=0, lb=0; i> taskret = pool.invokeAll(tasks);	
			pool.shutdown();
			//aggregate partial results (nnz, ret for vector/matrix)
			ret.nonZeros = 0; //reset after execute
			for( Future