org.apache.sysml.runtime.matrix.data.LibMatrixCUDA Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of systemml Show documentation
Show all versions of systemml Show documentation
Declarative Machine Learning
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.sysml.runtime.matrix.data;
import jcuda.Pointer;
import jcuda.Sizeof;
import jcuda.jcublas.JCublas2;
import jcuda.jcublas.cublasFillMode;
import jcuda.jcublas.cublasHandle;
import jcuda.jcublas.cublasOperation;
import jcuda.jcudnn.cudnnActivationDescriptor;
import jcuda.jcudnn.cudnnBatchNormMode;
import jcuda.jcudnn.cudnnConvolutionDescriptor;
import jcuda.jcudnn.cudnnConvolutionFwdPreference;
import jcuda.jcudnn.cudnnFilterDescriptor;
import jcuda.jcudnn.cudnnHandle;
import jcuda.jcudnn.cudnnPoolingDescriptor;
import jcuda.jcudnn.cudnnStatus;
import jcuda.jcudnn.cudnnTensorDescriptor;
import jcuda.jcusparse.JCusparse;
import jcuda.jcusparse.cusparseHandle;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.sysml.api.DMLScript;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
import org.apache.sysml.runtime.functionobjects.And;
import org.apache.sysml.runtime.functionobjects.Builtin;
import org.apache.sysml.runtime.functionobjects.CM;
import org.apache.sysml.runtime.functionobjects.Divide;
import org.apache.sysml.runtime.functionobjects.Equals;
import org.apache.sysml.runtime.functionobjects.GreaterThan;
import org.apache.sysml.runtime.functionobjects.GreaterThanEquals;
import org.apache.sysml.runtime.functionobjects.IndexFunction;
import org.apache.sysml.runtime.functionobjects.KahanPlus;
import org.apache.sysml.runtime.functionobjects.KahanPlusSq;
import org.apache.sysml.runtime.functionobjects.LessThan;
import org.apache.sysml.runtime.functionobjects.LessThanEquals;
import org.apache.sysml.runtime.functionobjects.Mean;
import org.apache.sysml.runtime.functionobjects.Minus;
import org.apache.sysml.runtime.functionobjects.Multiply;
import org.apache.sysml.runtime.functionobjects.Multiply2;
import org.apache.sysml.runtime.functionobjects.NotEquals;
import org.apache.sysml.runtime.functionobjects.Or;
import org.apache.sysml.runtime.functionobjects.Plus;
import org.apache.sysml.runtime.functionobjects.Power;
import org.apache.sysml.runtime.functionobjects.Power2;
import org.apache.sysml.runtime.functionobjects.ReduceAll;
import org.apache.sysml.runtime.functionobjects.ReduceCol;
import org.apache.sysml.runtime.functionobjects.ReduceDiag;
import org.apache.sysml.runtime.functionobjects.ReduceRow;
import org.apache.sysml.runtime.functionobjects.ValueFunction;
import org.apache.sysml.runtime.instructions.cp.DoubleObject;
import org.apache.sysml.runtime.instructions.gpu.GPUInstruction;
import org.apache.sysml.runtime.instructions.gpu.context.ExecutionConfig;
import org.apache.sysml.runtime.instructions.gpu.context.GPUContext;
import org.apache.sysml.runtime.instructions.gpu.context.JCudaContext;
import org.apache.sysml.runtime.instructions.gpu.context.JCudaKernels;
import org.apache.sysml.runtime.instructions.gpu.context.JCudaObject;
import org.apache.sysml.runtime.instructions.gpu.context.JCudaObject.CSRPointer;
import org.apache.sysml.runtime.matrix.operators.AggregateOperator;
import org.apache.sysml.runtime.matrix.operators.AggregateUnaryOperator;
import org.apache.sysml.runtime.matrix.operators.BinaryOperator;
import org.apache.sysml.runtime.matrix.operators.CMOperator;
import org.apache.sysml.runtime.matrix.operators.LeftScalarOperator;
import org.apache.sysml.runtime.matrix.operators.RightScalarOperator;
import org.apache.sysml.runtime.matrix.operators.ScalarOperator;
import org.apache.sysml.utils.GPUStatistics;
import org.apache.sysml.utils.Statistics;
import static jcuda.jcublas.cublasOperation.CUBLAS_OP_N;
import static jcuda.jcublas.cublasOperation.CUBLAS_OP_T;
import static jcuda.jcudnn.JCudnn.cudnnActivationForward;
import static jcuda.jcudnn.JCudnn.cudnnBatchNormalizationBackward;
import static jcuda.jcudnn.JCudnn.cudnnBatchNormalizationForwardInference;
import static jcuda.jcudnn.JCudnn.cudnnBatchNormalizationForwardTraining;
import static jcuda.jcudnn.JCudnn.cudnnConvolutionBackwardData;
import static jcuda.jcudnn.JCudnn.cudnnConvolutionBackwardFilter;
import static jcuda.jcudnn.JCudnn.cudnnConvolutionForward;
import static jcuda.jcudnn.JCudnn.cudnnCreateActivationDescriptor;
import static jcuda.jcudnn.JCudnn.cudnnCreateConvolutionDescriptor;
import static jcuda.jcudnn.JCudnn.cudnnCreateFilterDescriptor;
import static jcuda.jcudnn.JCudnn.cudnnCreatePoolingDescriptor;
import static jcuda.jcudnn.JCudnn.cudnnCreateTensorDescriptor;
import static jcuda.jcudnn.JCudnn.cudnnDestroyConvolutionDescriptor;
import static jcuda.jcudnn.JCudnn.cudnnDestroyFilterDescriptor;
import static jcuda.jcudnn.JCudnn.cudnnDestroyPoolingDescriptor;
import static jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardDataWorkspaceSize;
import static jcuda.jcudnn.JCudnn.cudnnGetConvolutionBackwardFilterWorkspaceSize;
import static jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardWorkspaceSize;
import static jcuda.jcudnn.JCudnn.cudnnPoolingBackward;
import static jcuda.jcudnn.JCudnn.cudnnPoolingForward;
import static jcuda.jcudnn.JCudnn.cudnnSetActivationDescriptor;
import static jcuda.jcudnn.JCudnn.cudnnSetConvolution2dDescriptor;
import static jcuda.jcudnn.JCudnn.cudnnSetFilter4dDescriptor;
import static jcuda.jcudnn.JCudnn.cudnnSetPooling2dDescriptor;
import static jcuda.jcudnn.JCudnn.cudnnSetTensor4dDescriptor;
import static jcuda.jcudnn.cudnnActivationMode.CUDNN_ACTIVATION_RELU;
import static jcuda.jcudnn.cudnnConvolutionMode.CUDNN_CROSS_CORRELATION;
import static jcuda.jcudnn.cudnnDataType.CUDNN_DATA_DOUBLE;
import static jcuda.jcudnn.cudnnNanPropagation.CUDNN_PROPAGATE_NAN;
import static jcuda.jcudnn.cudnnPoolingMode.CUDNN_POOLING_MAX;
import static jcuda.jcudnn.cudnnTensorFormat.CUDNN_TENSOR_NCHW;
import static jcuda.jcusparse.JCusparse.cusparseDcsrgemm;
import static jcuda.jcusparse.JCusparse.cusparseDcsrmv;
import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_NON_TRANSPOSE;
import static jcuda.jcusparse.cusparseOperation.CUSPARSE_OPERATION_TRANSPOSE;
import static jcuda.runtime.JCuda.cudaDeviceSynchronize;
import static jcuda.runtime.JCuda.cudaMemcpy;
import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToDevice;
import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost;
import static jcuda.runtime.cudaMemcpyKind.cudaMemcpyHostToDevice;
import static org.apache.sysml.runtime.instructions.gpu.context.JCudaObject.allocate;
import static org.apache.sysml.runtime.instructions.gpu.context.JCudaObject.cudaFreeHelper;
//FIXME move could to respective instructions, this is not a block library
public class LibMatrixCUDA {
// Assume Compute Capability 3.0
// MAX BLOCKS is 2^31 - 1 For compute capability > 3.0
// MAX_THREADS is 1024 For compute capability > 3.0
private static int _MAX_THREADS = -1;
private static int _MAX_BLOCKS = -1;
private static int _WARP_SIZE = -1;
/**
* Utility function to get maximum number of threads supported by the active CUDA device.
* This is put into a singleton style method because the GPUContext is not fully initialized when
* the LibMatrixCUDA class is loaded. If the {@link GPUContext#getGPUContext()} is invoked in a
* static block in this class, it will access the {@link JCudaContext} instance when it is not
* properly initialized. Due to the proper checks in place, a deadlock occurs.
* @return max threads
* @throws DMLRuntimeException if exception occurs
*/
static int getMaxThreads() throws DMLRuntimeException{
if (_MAX_THREADS == -1){
_MAX_THREADS = JCudaContext.getMaxThreadsPerBlock();
}
return _MAX_THREADS;
}
/**
* Utility function to get maximum number of blocks supported by the active CUDA device.
* This is put into a singleton style method because the GPUContext is not fully initialized when
* the LibMatrixCUDA class is loaded. If the {@link GPUContext#getGPUContext()} is invoked in a
* static block in this class, it will access the {@link JCudaContext} instance when it is not
* properly initialized. Due to the proper checks in place, a deadlock occurs.
* @return max blocks
* @throws DMLRuntimeException if exception occurs
*/
static int getMaxBlocks() throws DMLRuntimeException{
if (_MAX_BLOCKS == -1){
_MAX_BLOCKS = JCudaContext.getMaxBlocks();
}
return _MAX_BLOCKS;
}
/**
* Utility function to get the warp size supported by the active CUDA device.
* This is put into a singleton style method because the GPUContext is not fully initialized when
* the LibMatrixCUDA class is loaded. If the {@link GPUContext#getGPUContext()} is invoked in a
* static block in this class, it will access the {@link JCudaContext} instance when it is not
* properly initialized. Due to the proper checks in place, a deadlock occurs.
* @return warp size
* @throws DMLRuntimeException if exception occurs
*/
static int getWarpSize() throws DMLRuntimeException {
if (_WARP_SIZE == -1) {
_WARP_SIZE = JCudaContext.getWarpSize();
}
return _WARP_SIZE;
}
public static boolean isInSparseFormat(MatrixObject mo) {
if(mo.getGPUObject() != null && mo.getGPUObject().isAllocated())
return mo.getGPUObject().isInSparseFormat();
return MatrixBlock.evalSparseFormatInMemory(mo.getNumRows(), mo.getNumColumns(), mo.getNnz());
}
//********************************************************************/
//***************** DEEP LEARNING Operators **************************/
//********************************************************************/
public static cudnnHandle cudnnHandle;
public static cublasHandle cublasHandle;
public static cusparseHandle cusparseHandle;
public static JCudaKernels kernels; // Used to launch custom kernels
private static final Log LOG = LogFactory.getLog(LibMatrixCUDA.class.getName());
private static int CONVOLUTION_PREFERENCE = cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
private static Pointer _one;
private static Pointer _zero;
/**
* Convenience method to get a pointer to value '1.0' on device. Instead of allocating and deallocating it for every kernel invocation.
* @return jcuda pointer
*/
private static Pointer one() {
if(_one == null) {
_one = pointerTo(1.0);
}
return _one;
}
/**
* Convenience method to get a pointer to value '0.0f' on device. Instead of allocating and deallocating it for every kernel invocation.
* @return jcuda pointer
*/
private static Pointer zero() {
if(_zero == null) {
_zero = pointerTo(0.0f);
}
return _zero;
}
/**
* Convenience method to get tensor descriptor from underlying JCudaObject
* @param mat matrix object
* @param N number of images
* @param C number of channels
* @param H height
* @param W width
* @return cudnn tensor descriptor
* @throws DMLRuntimeException if the input descriptor and matrix dimensions don't match
*/
private static cudnnTensorDescriptor allocateTensorDescriptor(MatrixObject mat, int N, int C, int H, int W) throws DMLRuntimeException {
if(mat.getNumRows() != N || mat.getNumColumns() != C*H*W) {
throw new DMLRuntimeException("Mismatch descriptor-matrix dimensions:" + mat.getNumRows() + " != " + N
+ " || " + mat.getNumColumns() + " != " + (C*H*W));
}
return ((JCudaObject)mat.getGPUObject()).allocateTensorDescriptor(N, C, H, W);
}
/**
* Convenience method to get jcudaDenseMatrixPtr. This method explicitly converts sparse to dense format, so use it judiciously.
* @param image input matrix object
* @param isForCuDNN true if the dense pointer is to be used by a CuDNN kernel
* @return jcuda pointer
* @throws DMLRuntimeException if error occurs while sparse to dense conversion
*/
private static Pointer getDensePointer(MatrixObject image, boolean isForCuDNN, String instName) throws DMLRuntimeException {
if(isForCuDNN && image.getNumRows()*image.getNumColumns() > numDoublesIn2GB) {
throw new DMLRuntimeException("CuDNN restriction: the size of input tensor cannot be greater than 2GB. Hint: try reducing the mini-batch size.");
}
return getDensePointer(image, instName);
}
/**
* Convenience method to get jcudaDenseMatrixPtr. This method explicitly converts sparse to dense format, so use it judiciously.
* @param image input matrix object
* @return jcuda pointer
* @throws DMLRuntimeException if error occurs while sparse to dense conversion
*/
private static Pointer getDensePointer(MatrixObject image, String instName) throws DMLRuntimeException {
if(isInSparseFormat(image)) {
((JCudaObject)image.getGPUObject()).sparseToDense(instName);
}
return ((JCudaObject)image.getGPUObject()).jcudaDenseMatrixPtr;
}
/**
* Convenience method for checking the status of CuDNN kernel.
*
* @param status status returned by CuDNN
* @throws DMLRuntimeException if status is not CUDNN_STATUS_SUCCESS
*/
private static void checkStatus(int status) throws DMLRuntimeException {
if(status != cudnnStatus.CUDNN_STATUS_SUCCESS)
throw new DMLRuntimeException("Error status returned by CuDNN:" + jcuda.jcudnn.cudnnStatus.stringFor(status));
}
public static void conv2dBiasAdd(String instName, MatrixObject image, MatrixObject bias, MatrixObject filter, MatrixObject outputBlock, int N, int C, int H, int W,
int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q)
throws DMLRuntimeException {
conv2d(instName, image, filter, outputBlock, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
biasAdd(instName, outputBlock, bias, outputBlock);
}
public static void conv2d(String instName, MatrixObject image, MatrixObject filter, MatrixObject outputBlock, int N, int C, int H, int W,
int K, int R, int S, int pad_h, int pad_w, int stride_h, int stride_w, int P, int Q)
throws DMLRuntimeException {
cudnnFilterDescriptor filterDesc = null;
cudnnConvolutionDescriptor convDesc = null;
Pointer workSpace = null;
long sizeInBytes = 0;
try {
long t1=0, t2=0;
// Allocate descriptors
if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
cudnnTensorDescriptor srcTensorDesc = allocateTensorDescriptor(image, N, C, H, W);
cudnnTensorDescriptor dstTensorDesc = allocateTensorDescriptor(outputBlock, N, K, P, Q);
filterDesc = allocateFilterDescriptor(K, C, R, S);
Pointer imagePointer = getDensePointer(image, true, instName);
Pointer filterPointer = getDensePointer(filter, true, instName);
Pointer dstPointer = getDensePointer(outputBlock, true, instName);
int padding [] = { pad_h, pad_w };
int strides [] = { stride_h, stride_w };
convDesc = allocateConvolutionDescriptor(padding, strides);
// Select the best algorithm depending on the data and supported CUDA
int algo = -1;
workSpace = new Pointer();
if(CONVOLUTION_PREFERENCE == cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_NO_WORKSPACE) {
algo = jcuda.jcudnn.cudnnConvolutionFwdAlgo.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
}
else if(CONVOLUTION_PREFERENCE == cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_PREFER_FASTEST) {
int [] algos = {
jcuda.jcudnn.cudnnConvolutionFwdAlgo.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM,
jcuda.jcudnn.cudnnConvolutionFwdAlgo.CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
jcuda.jcudnn.cudnnConvolutionFwdAlgo.CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM
};
// TODO: Look into FFt, Winograd, etc
// Also ensure that GPU has enough memory to allocate memory
long sizeInBytesArray[] = { 0 };
algo = jcuda.jcudnn.JCudnn.cudnnGetConvolutionForwardAlgorithm(cudnnHandle, srcTensorDesc, filterDesc, convDesc, dstTensorDesc,
CONVOLUTION_PREFERENCE, sizeInBytesArray[0], algos);
cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle, srcTensorDesc, filterDesc, convDesc, dstTensorDesc, algo, sizeInBytesArray);
if(sizeInBytesArray[0] != 0)
workSpace = allocate(sizeInBytesArray[0]);
sizeInBytes = sizeInBytesArray[0];
}
else if(CONVOLUTION_PREFERENCE == cudnnConvolutionFwdPreference.CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT) {
throw new DMLRuntimeException("CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT is not implemented");
}
else {
throw new DMLRuntimeException("Unsupported preference criteria for convolution");
}
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
int status = cudnnConvolutionForward(cudnnHandle, one(),
srcTensorDesc, imagePointer,
filterDesc, filterPointer,
convDesc, algo, workSpace, sizeInBytes, zero(),
dstTensorDesc, dstPointer);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_FORWARD_LIB, System.nanoTime() - t2);
if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
throw new DMLRuntimeException("Could not executed cudnnConvolutionForward: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
}
}
finally {
long t3=0;
if (GPUStatistics.DISPLAY_STATISTICS) t3 = System.nanoTime();
if(filterDesc != null)
cudnnDestroyFilterDescriptor(filterDesc);
if(convDesc != null)
cudnnDestroyConvolutionDescriptor(convDesc);
if(workSpace != null && sizeInBytes != 0)
cudaFreeHelper(instName, workSpace);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3);
}
}
private static cudnnConvolutionDescriptor allocateConvolutionDescriptor(int padding [], int strides []) {
cudnnConvolutionDescriptor convDesc = new cudnnConvolutionDescriptor();
cudnnCreateConvolutionDescriptor(convDesc);
cudnnSetConvolution2dDescriptor(convDesc, padding[0], padding[1], strides[0], strides[1], 1, 1, CUDNN_CROSS_CORRELATION);
return convDesc;
}
public static Pointer pointerTo(double value) {
return Pointer.to(new double[] { value });
}
private static cudnnFilterDescriptor allocateFilterDescriptor(int K, int C, int R, int S) {
cudnnFilterDescriptor filterDesc = new cudnnFilterDescriptor();
cudnnCreateFilterDescriptor(filterDesc);
cudnnSetFilter4dDescriptor(filterDesc, CUDNN_DATA_DOUBLE, CUDNN_TENSOR_NCHW, K, C, R, S);
return filterDesc;
}
/**
* allocates pooling descriptor, used in poolingForward and poolingBackward
* @param R pooling window height
* @param S pooling window width
* @param pad_h vertical padding
* @param pad_w horizontal padding
* @param stride_h pooling vertical stride
* @param stride_w pooling horizontal stride
* @return cudnn pooling descriptor
*/
private static cudnnPoolingDescriptor allocatePoolingDescriptor(int R, int S, int pad_h, int pad_w, int stride_h, int stride_w) {
cudnnPoolingDescriptor poolingDesc = new cudnnPoolingDescriptor();
cudnnCreatePoolingDescriptor(poolingDesc);
cudnnSetPooling2dDescriptor(poolingDesc, CUDNN_POOLING_MAX, CUDNN_PROPAGATE_NAN, R, S, pad_h, pad_w, stride_h, stride_w);
return poolingDesc;
}
/**
* This method computes the backpropagation errors for previous layer of relu operation
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param input input image
* @param dout next layer error propogation
* @param outputBlock output
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
public static void reluBackward(String instName, MatrixObject input, MatrixObject dout, MatrixObject outputBlock) throws DMLRuntimeException {
long rows = input.getNumRows();
long cols = input.getNumColumns();
Pointer imagePointer = getDensePointer(input, instName);
Pointer doutPointer = getDensePointer(dout, instName);
Pointer outputPointer = getDensePointer(outputBlock, instName);
long t1=0;
if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
kernels.launchKernel("relu_backward",
ExecutionConfig.getConfigForSimpleMatrixOperations((int)rows, (int)cols),
imagePointer, doutPointer, outputPointer, (int)rows, (int)cols);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_BIAS_ADD_LIB, System.nanoTime() - t1);
}
/**
* Performs the operation corresponding to the DML script:
* ones = matrix(1, rows=1, cols=Hout*Wout)
* output = input * matrix(bias %*% ones, rows=1, cols=F*Hout*Wout)
* This operation is often followed by conv2d and hence we have introduced bias_add(input, bias) built-in function
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param input input image
* @param bias bias
* @param outputBlock output
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
public static void biasMultiply(String instName, MatrixObject input, MatrixObject bias, MatrixObject outputBlock) throws DMLRuntimeException {
if(isInSparseFormat(input)) {
((JCudaObject)input.getGPUObject()).sparseToDense(instName);
}
if(isInSparseFormat(bias)) {
((JCudaObject)bias.getGPUObject()).sparseToDense(instName);
}
long rows = input.getNumRows();
long cols = input.getNumColumns();
long K = bias.getNumRows();
long PQ = cols / K;
if(bias.getNumColumns() != 1 || cols % K != 0) {
throw new DMLRuntimeException("Incorrect inputs for bias_multiply: input[" + rows + " X " + cols + "] and bias[" + K + " X " + bias.getNumColumns() + "]");
}
Pointer imagePointer = ((JCudaObject)input.getGPUObject()).jcudaDenseMatrixPtr;
Pointer biasPointer = ((JCudaObject)bias.getGPUObject()).jcudaDenseMatrixPtr;
Pointer outputPointer = ((JCudaObject)outputBlock.getGPUObject()).jcudaDenseMatrixPtr;
long t1 = 0;
if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
kernels.launchKernel("bias_multiply",
ExecutionConfig.getConfigForSimpleMatrixOperations((int)rows, (int)cols),
imagePointer, biasPointer, outputPointer, (int)rows, (int)cols, (int) PQ);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RELU_BACKWARD_KERNEL, System.nanoTime() - t1);
}
/**
* Performs the operation corresponding to the DML script:
* ones = matrix(1, rows=1, cols=Hout*Wout)
* output = input + matrix(bias %*% ones, rows=1, cols=F*Hout*Wout)
* This operation is often followed by conv2d and hence we have introduced bias_add(input, bias) built-in function
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param input input image
* @param bias bias
* @param outputBlock output
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
public static void biasAdd(String instName, MatrixObject input, MatrixObject bias, MatrixObject outputBlock) throws DMLRuntimeException {
long rows = input.getNumRows();
long cols = input.getNumColumns();
long K = bias.getNumRows();
long PQ = cols / K;
if(bias.getNumColumns() != 1 || cols % K != 0) {
throw new DMLRuntimeException("Incorrect inputs for bias_add: input[" + rows + " X " + cols + "] and bias[" + K + " X " + bias.getNumColumns() + "]");
}
Pointer imagePointer = getDensePointer(input, instName);
Pointer biasPointer = getDensePointer(bias, instName);
Pointer outputPointer = getDensePointer(outputBlock, instName);
long t1 = 0;
if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
kernels.launchKernel("bias_add",
ExecutionConfig.getConfigForSimpleMatrixOperations((int)rows, (int)cols),
imagePointer, biasPointer, outputPointer, (int)rows, (int)cols, (int) PQ);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RELU_BACKWARD_KERNEL, System.nanoTime() - t1);
}
private static void validateBatchNormalizationDimensions(MatrixObject scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar, int C) throws DMLRuntimeException {
if(scale.getNumRows() != 1 || scale.getNumColumns() != C) {
throw new DMLRuntimeException("Incorrect dimensions for scale");
}
if(bias.getNumRows() != 1 || bias.getNumColumns() != C) {
throw new DMLRuntimeException("Incorrect dimensions for bias");
}
if(runningMean.getNumRows() != 1 || runningMean.getNumColumns() != C) {
throw new DMLRuntimeException("Incorrect dimensions for running mean");
}
if(runningVar.getNumRows() != 1 || runningVar.getNumColumns() != C) {
throw new DMLRuntimeException("Incorrect dimensions for running variance");
}
}
/**
* Performs the forward BatchNormalization layer computation for inference
*
* @param instName name of the instruction
* @param image input image
* @param scale scale (as per CuDNN) and gamma as per original paper: shape [1, C, 1, 1]
* @param bias bias (as per CuDNN) and beta as per original paper: shape [1, C, 1, 1]
* @param runningMean running mean accumulated during training phase: shape [1, C, 1, 1]
* @param runningVar running variance accumulated during training phase: shape [1, C, 1, 1]
* @param ret normalized input
* @param epsilon epsilon value used in the batch normalization formula
* @throws DMLRuntimeException if error occurs
*/
public static void batchNormalizationForwardInference(String instName, MatrixObject image,
MatrixObject scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar,
MatrixObject ret, double epsilon) throws DMLRuntimeException {
int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
int N = (int) image.getNumRows();
int C = (int) scale.getNumColumns();
long CHW = image.getNumColumns();
validateBatchNormalizationDimensions(scale, bias, runningMean, runningVar, C);
// Allocate descriptors
cudnnTensorDescriptor nCHWDescriptor = allocateNCHWDescriptors(N, C, CHW,
new MatrixObject[] {image}, new MatrixObject[] {ret});
cudnnTensorDescriptor scaleTensorDesc = allocateTensorDescriptor(scale, 1, C, 1, 1);
// Get underlying dense pointer
Pointer imagePtr = getDensePointer(image, true, instName);
Pointer retPtr = getDensePointer(ret, true, instName);
Pointer biasPtr = getDensePointer(bias, true, instName);
Pointer scalePtr = getDensePointer(scale, true, instName);
Pointer runningMeanPtr = getDensePointer(runningMean, true, instName);
Pointer runningVarPtr = getDensePointer(runningVar, true, instName);
checkStatus(cudnnBatchNormalizationForwardInference(cudnnHandle, mode, one(), zero(),
nCHWDescriptor, imagePtr, nCHWDescriptor, retPtr,
scaleTensorDesc, scalePtr, biasPtr,
runningMeanPtr, runningVarPtr, epsilon));
}
/**
* Performs the forward BatchNormalization layer computation for training
*
* @param instName name of the instruction
* @param image input image
* @param scale scale (as per CuDNN) and gamma as per original paper: shape [1, C, 1, 1]
* @param bias bias (as per CuDNN) and beta as per original paper: shape [1, C, 1, 1]
* @param runningMean running mean accumulated during training phase: shape [1, C, 1, 1]
* @param runningVar running variance accumulated during training phase: shape [1, C, 1, 1]
* @param ret (output) normalized input
* @param retRunningMean (output) running mean accumulated during training phase: shape [1, C, 1, 1]
* @param retRunningVar (output) running variance accumulated during training phase: shape [1, C, 1, 1]
* @param epsilon epsilon value used in the batch normalization formula
* @param exponentialAverageFactor factor used in the moving average computation
* @throws DMLRuntimeException if error occurs
*/
public static void batchNormalizationForwardTraining(String instName, MatrixObject image,
MatrixObject scale, MatrixObject bias, MatrixObject runningMean, MatrixObject runningVar,
MatrixObject ret, MatrixObject retRunningMean, MatrixObject retRunningVar, double epsilon, double exponentialAverageFactor) throws DMLRuntimeException {
int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
int N = (int) image.getNumRows();
int C = (int) scale.getNumColumns();
long CHW = image.getNumColumns();
validateBatchNormalizationDimensions(scale, bias, runningMean, runningVar, C);
// Allocate descriptors
cudnnTensorDescriptor nCHWDescriptor = allocateNCHWDescriptors(N, C, CHW,
new MatrixObject[] {image}, new MatrixObject[] {ret});
cudnnTensorDescriptor scaleTensorDesc = allocateTensorDescriptor(scale, 1, C, 1, 1);
// Get underlying dense pointer
Pointer imagePtr = getDensePointer(image, true, instName);
Pointer retPtr = getDensePointer(ret, true, instName);
Pointer biasPtr = getDensePointer(bias, true, instName);
Pointer scalePtr = getDensePointer(scale, true, instName);
Pointer runningMeanPtr = getDensePointer(runningMean, true, instName);
Pointer runningVarPtr = getDensePointer(runningVar, true, instName);
// To allow for copy-on-write
Pointer retRunningMeanPtr = getDensePointer(retRunningMean, true, instName);
Pointer retRunningVarPtr = getDensePointer(retRunningVar, true, instName);
cudaMemcpy(retRunningMeanPtr, runningMeanPtr, C * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
cudaMemcpy(retRunningVarPtr, runningVarPtr, C * Sizeof.DOUBLE, cudaMemcpyDeviceToDevice);
// ignoring resultSaveMean and resultSaveVariance as it requires state management
checkStatus(cudnnBatchNormalizationForwardTraining(cudnnHandle, mode, one(), zero(),
nCHWDescriptor, imagePtr, nCHWDescriptor, retPtr,
scaleTensorDesc, scalePtr, biasPtr, exponentialAverageFactor,
retRunningMeanPtr, retRunningVarPtr, epsilon, new Pointer(), new Pointer()));
}
/**
* Convenient utility for batch normalization that returns a NCHW descriptor
*
* @param N number of images
* @param C number of channels
* @param CHW channels*height*width
* @param input input matrix objects
* @param output output matrix objects
* @return one of the NCHW descriptor
* @throws DMLRuntimeException if error occurs
*/
private static cudnnTensorDescriptor allocateNCHWDescriptors(int N, int C, long CHW, MatrixObject [] input, MatrixObject [] output) throws DMLRuntimeException {
cudnnTensorDescriptor ret = null; // Return any one
if(CHW > ((long)Integer.MAX_VALUE)*C) {
throw new DMLRuntimeException("image size (height*width) should be less than " + Integer.MAX_VALUE);
}
cudnnTensorDescriptor knownNCHWdescriptor = null;
int H = -1; int W = -1;
for(int i = 0; i < input.length; i++) {
knownNCHWdescriptor = ((JCudaObject)input[i].getGPUObject()).getTensorDescriptor();
if(knownNCHWdescriptor != null) {
int [] shape = ((JCudaObject)input[i].getGPUObject()).getTensorShape();
if(shape[0] != N || shape[1] != C) {
throw new DMLRuntimeException("Incorrect N and C:" + shape[0] + " != " + N + " || " + shape[1] + " != " + C);
}
H = shape[2];
W = shape[3];
break;
}
}
if(knownNCHWdescriptor != null) {
// We precisely know N, C, H, W
for(int i = 0; i < input.length; i++) {
ret = allocateTensorDescriptor(input[i], N, C, H, W);
}
for(int i = 0; i < output.length; i++) {
ret = allocateTensorDescriptor(output[i], N, C, H, W);
}
}
else {
int HW = (int) (CHW / C);
H = HW; W = 1; // If not known
double potentialH = Math.sqrt(HW);
if(potentialH == ((int) potentialH)) {
H = (int) potentialH;
W = H;
}
// We are not sure about H and W, hence don't allocate them.
ret = new cudnnTensorDescriptor();
cudnnCreateTensorDescriptor(ret);
cudnnSetTensor4dDescriptor(ret, CUDNN_TENSOR_NCHW, CUDNN_DATA_DOUBLE, N, C, H, W);
}
return ret;
}
/**
* This method computes the backpropagation errors for image, scale and bias of batch normalization layer
*
* @param instName name of the instruction
* @param image input image
* @param dout input errors of shape C, H, W
* @param scale scale (as per CuDNN) and gamma as per original paper: shape [1, C, 1, 1]
* @param ret (output) backpropagation errors for previous layer
* @param retScale backpropagation error for scale
* @param retBias backpropagation error for bias
* @param epsilon epsilon value used in the batch normalization formula
* @throws DMLRuntimeException if error occurs
*/
public static void batchNormalizationBackward(String instName, MatrixObject image, MatrixObject dout,
MatrixObject scale, MatrixObject ret, MatrixObject retScale, MatrixObject retBias,
double epsilon) throws DMLRuntimeException {
int mode = cudnnBatchNormMode.CUDNN_BATCHNORM_SPATIAL;
int N = (int) image.getNumRows();
int C = (int) scale.getNumColumns();
long CHW = image.getNumColumns();
// Allocate descriptors
cudnnTensorDescriptor nCHWDescriptor = allocateNCHWDescriptors(N, C, CHW,
new MatrixObject[] {image, dout}, new MatrixObject[] {ret});
cudnnTensorDescriptor scaleTensorDesc = allocateTensorDescriptor(scale, 1, C, 1, 1);
// Get underlying dense pointer
Pointer imagePtr = getDensePointer(image, true, instName);
Pointer doutPtr = getDensePointer(dout, true, instName);
Pointer scalePtr = getDensePointer(scale, true, instName);
Pointer retPtr = getDensePointer(ret, true, instName);
Pointer retScalePtr = getDensePointer(retScale, true, instName);
Pointer retBiasPtr = getDensePointer(retBias, true, instName);
// ignoring resultSaveMean and resultSaveVariance as it requires state management
checkStatus(cudnnBatchNormalizationBackward(cudnnHandle, mode, one(), zero(), one(), zero(),
nCHWDescriptor, imagePtr, nCHWDescriptor, doutPtr, nCHWDescriptor, retPtr,
scaleTensorDesc, scalePtr, retScalePtr, retBiasPtr, epsilon, new Pointer(), new Pointer()));
}
/**
* This method computes the backpropogation errors for filter of convolution operation
*
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param image input image
* @param dout errors from next layer
* @param outputBlock output errors
* @param N number of images
* @param C number of channels
* @param H height
* @param W width
* @param K number of filters
* @param R filter height
* @param S filter width
* @param pad_h pad height
* @param pad_w pad width
* @param stride_h stride height
* @param stride_w stride width
* @param P output activation height
* @param Q output activation width
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
public static void conv2dBackwardFilter(String instName, MatrixObject image, MatrixObject dout,
MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
int Q) throws DMLRuntimeException {
cudnnFilterDescriptor dwDesc = null;
cudnnConvolutionDescriptor convDesc = null;
Pointer workSpace = null;
long sizeInBytes = 0;
try {
long t1=0, t2=0;
if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
// Allocate descriptors
cudnnTensorDescriptor xTensorDesc = allocateTensorDescriptor(image, N, C, H, W);
cudnnTensorDescriptor doutTensorDesc = allocateTensorDescriptor(dout, N, K, P, Q);
dwDesc = allocateFilterDescriptor(K, C, R, S);
// Allocate data
Pointer imagePointer = getDensePointer(image, true, instName);
Pointer doutPointer = getDensePointer(dout, true, instName);
Pointer dwPointer = getDensePointer(outputBlock, true, instName);
int padding [] = { pad_h, pad_w };
int strides [] = { stride_h, stride_w };
convDesc = allocateConvolutionDescriptor(padding, strides);
long sizeInBytesArray[] = { 0 };
// TODO: Select the best algorithm depending on the data and supported CUDA
int algo = jcuda.jcudnn.cudnnConvolutionBwdFilterAlgo.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0;
workSpace = new Pointer();
cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnnHandle,
xTensorDesc, doutTensorDesc, convDesc, dwDesc, algo, sizeInBytesArray);
if (GPUStatistics.DISPLAY_STATISTICS)GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
int status = cudnnConvolutionBackwardFilter(cudnnHandle, one(), xTensorDesc, imagePointer,
doutTensorDesc, doutPointer, convDesc, algo, workSpace, sizeInBytes, zero(), dwDesc, dwPointer);
if (GPUStatistics.DISPLAY_STATISTICS)GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_BACKWARD_FILTER_LIB, System.nanoTime() - t2);
if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
throw new DMLRuntimeException("Could not executed cudnnConvolutionBackwardFilter: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
}
}
finally {
long t3=0;
if (GPUStatistics.DISPLAY_STATISTICS) t3 = System.nanoTime();
if(workSpace != null && sizeInBytes != 0)
cudaFreeHelper(instName, workSpace);
if(dwDesc != null)
cudnnDestroyFilterDescriptor(dwDesc);
if(convDesc != null)
cudnnDestroyConvolutionDescriptor(convDesc);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3);
}
}
private static long numDoublesIn2GB = 125000000;
/**
* This method computes the backpropogation errors for previous layer of convolution operation
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param filter filter used in conv2d
* @param dout errors from next layer
* @param output output errors
* @param N number of images
* @param C number of channels
* @param H height
* @param W width
* @param K number of filters
* @param R filter height
* @param S filter width
* @param pad_h pad height
* @param pad_w pad width
* @param stride_h stride height
* @param stride_w stride width
* @param P output activation height
* @param Q output activation width
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
public static void conv2dBackwardData(String instName, MatrixObject filter, MatrixObject dout,
MatrixObject output, int N, int C, int H, int W, int K, int R,
int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
int Q) throws DMLRuntimeException {
cudnnFilterDescriptor wDesc = null;
cudnnConvolutionDescriptor convDesc = null;
Pointer workSpace = null;
long sizeInBytes = 0;
try {
long t1=0, t2=0;
if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
// Allocate descriptors
wDesc = allocateFilterDescriptor(K, C, R, S);
cudnnTensorDescriptor dyDesc = allocateTensorDescriptor(dout, N, K, P, Q);
cudnnTensorDescriptor dxDesc = allocateTensorDescriptor(output, N, C, H, W);
// Allocate data
Pointer w = getDensePointer(filter, true, instName);
Pointer dy = getDensePointer(dout, true, instName);
Pointer dx = getDensePointer(output, true, instName);
int padding [] = { pad_h, pad_w };
int strides [] = { stride_h, stride_w };
convDesc = allocateConvolutionDescriptor(padding, strides);
long sizeInBytesArray[] = { 0 };
// TODO: Select the best algorithm depending on the data and supported CUDA
int algo = jcuda.jcudnn.cudnnConvolutionBwdDataAlgo.CUDNN_CONVOLUTION_BWD_DATA_ALGO_0;
workSpace = new Pointer();
cudnnGetConvolutionBackwardDataWorkspaceSize(cudnnHandle,
wDesc, dyDesc, convDesc, dxDesc, algo, sizeInBytesArray);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
int status = cudnnConvolutionBackwardData(cudnnHandle, one(), wDesc, w,
dyDesc, dy, convDesc, algo, workSpace, sizeInBytes, zero(), dxDesc, dx);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CONVOLUTION_BACKWARD_DATA_LIB, System.nanoTime() - t2);
if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
throw new DMLRuntimeException("Could not executed cudnnConvolutionBackwardData: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
}
}
finally {
long t3=0;
if (GPUStatistics.DISPLAY_STATISTICS) t3 = System.nanoTime();
if(workSpace != null && sizeInBytes != 0)
cudaFreeHelper(instName, workSpace);
if(wDesc != null)
cudnnDestroyFilterDescriptor(wDesc);
if(convDesc != null)
cudnnDestroyConvolutionDescriptor(convDesc);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3);
}
}
/**
* performs maxpooling on GPU by exploiting cudnnPoolingForward(...)
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param image image as matrix object
* @param outputBlock output matrix
* @param N batch size
* @param C number of channels
* @param H height of image
* @param W width of image
* @param K number of filters
* @param R height of filter
* @param S width of filter
* @param pad_h vertical padding
* @param pad_w horizontal padding
* @param stride_h horizontal stride
* @param stride_w vertical stride
* @param P (H - R + 1 + 2*pad_h)/stride_h
* @param Q (W - S + 1 + 2*pad_w)/stride_w
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
public static void maxpooling(String instName, MatrixObject image,
MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
int Q) throws DMLRuntimeException {
Pointer x = getDensePointer(image, true, instName);
cudnnTensorDescriptor xDesc = allocateTensorDescriptor(image, N, C, H, W);
performMaxpooling(instName, x, xDesc, outputBlock, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
}
public static void performMaxpooling(String instName, Pointer x, cudnnTensorDescriptor xDesc,
MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
int Q) throws DMLRuntimeException {
Pointer y = getDensePointer(outputBlock, true, instName);
cudnnPoolingDescriptor poolingDesc = null;
try {
long t1=0,t2=0;
if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
// Allocate descriptors
cudnnTensorDescriptor yDesc = allocateTensorDescriptor(outputBlock, N, C, P, Q);
poolingDesc = allocatePoolingDescriptor(R, S, pad_h, pad_w, stride_h, stride_w);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
int status = cudnnPoolingForward(cudnnHandle, poolingDesc, one(), xDesc, x, zero(), yDesc, y);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MAXPOOLING_FORWARD_LIB, System.nanoTime() - t2);
if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
throw new DMLRuntimeException("Could not executed cudnnPoolingForward: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
}
}
finally {
long t3=0;
if (GPUStatistics.DISPLAY_STATISTICS) t3 = System.nanoTime();
if(poolingDesc != null)
cudnnDestroyPoolingDescriptor(poolingDesc);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t3);
}
}
/**
* performs relu followed by maxpooling on GPU by exploiting cudnnPoolingForward(...)
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param image image as matrix object
* @param outputBlock output matrix
* @param N batch size
* @param C number of channels
* @param H height of image
* @param W width of image
* @param K number of filters
* @param R height of filter
* @param S width of filter
* @param pad_h vertical padding
* @param pad_w horizontal padding
* @param stride_h horizontal stride
* @param stride_w vertical stride
* @param P (H - R + 1 + 2*pad_h)/stride_h
* @param Q (W - S + 1 + 2*pad_w)/stride_w
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
public static void reluMaxpooling(String instName, MatrixObject image,
MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
int Q) throws DMLRuntimeException {
cudnnTensorDescriptor srcTensorDesc = allocateTensorDescriptor(image, N, C, H, W);
long size = image.getNumRows() * image.getNumColumns() * Sizeof.DOUBLE;
Pointer tmp = allocate(size);
performCuDNNReLU(instName, image, tmp, srcTensorDesc);
cudaDeviceSynchronize(); // It seemed like the cudnn operation in performCuDNNReLU was being done aysnchronously, this adds the neccesary sync
performMaxpooling(instName, tmp, srcTensorDesc, outputBlock, N, C, H, W, K, R, S, pad_h, pad_w, stride_h, stride_w, P, Q);
cudaFreeHelper(tmp);
}
/**
* Performs maxpoolingBackward on GPU by exploiting cudnnPoolingBackward(...)
* This method computes the backpropogation errors for previous layer of maxpooling operation
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param image image as matrix object
* @param dout delta matrix, output of previous layer
* @param outputBlock output matrix
* @param N batch size
* @param C number of channels
* @param H height of image
* @param W width of image
* @param K number of filters
* @param R height of filter
* @param S width of filter
* @param pad_h vertical padding
* @param pad_w horizontal padding
* @param stride_h horizontal stride
* @param stride_w vertical stride
* @param P (H - R + 1 + 2*pad_h)/stride_h
* @param Q (W - S + 1 + 2*pad_w)/stride_w
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
public static void maxpoolingBackward(String instName, MatrixObject image, MatrixObject dout,
MatrixObject outputBlock, int N, int C, int H, int W, int K, int R,
int S, int pad_h, int pad_w, int stride_h, int stride_w, int P,
int Q) throws DMLRuntimeException {
Pointer y = null;
cudnnPoolingDescriptor poolingDesc = null;
try {
long t1=0, t2=0, t3=0;
if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
// Allocate descriptors
cudnnTensorDescriptor xDesc = allocateTensorDescriptor(image, N, C, H, W);
cudnnTensorDescriptor yDesc = allocateTensorDescriptor(dout, N, C, P, Q);
cudnnTensorDescriptor dxDesc = allocateTensorDescriptor(outputBlock, N, C, H, W);
cudnnTensorDescriptor dyDesc = allocateTensorDescriptor(dout, N, C, P, Q);
poolingDesc = allocatePoolingDescriptor(R, S, pad_h, pad_w, stride_h, stride_w);
// Calling PoolForward first, y is one of the inputs for poolBackward
// TODO: Remove calling poolForward after necessary changes at language level for poolBackward
long numBytes = N*C*P*Q*Sizeof.DOUBLE;
y = allocate(numBytes);
// Allocate data
Pointer x = getDensePointer(image, true, instName);
Pointer dx = getDensePointer(outputBlock, true, instName);
Pointer dy = getDensePointer(dout, true, instName);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_INIT, System.nanoTime() - t1);
if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
int status = cudnnPoolingForward(cudnnHandle, poolingDesc, one(), xDesc, x, zero(), yDesc, y);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MAXPOOLING_FORWARD_LIB, System.nanoTime() - t2);
if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
throw new DMLRuntimeException("Could not executed cudnnPoolingForward before cudnnPoolingBackward: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
}
if (GPUStatistics.DISPLAY_STATISTICS) t3 = System.nanoTime();
status = cudnnPoolingBackward(cudnnHandle, poolingDesc, one(), yDesc, y, dyDesc, dy, xDesc, x, zero(), dxDesc, dx);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MAXPOOLING_BACKWARD_LIB, System.nanoTime() - t3);
if(status != jcuda.jcudnn.cudnnStatus.CUDNN_STATUS_SUCCESS) {
throw new DMLRuntimeException("Could not executed cudnnPoolingBackward: " + jcuda.jcudnn.cudnnStatus.stringFor(status));
}
}
finally {
long t4=0;
if (GPUStatistics.DISPLAY_STATISTICS) t4 = System.nanoTime();
if(y != null)
cudaFreeHelper(instName, y);
if(poolingDesc != null)
cudnnDestroyPoolingDescriptor(poolingDesc);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t4);
}
}
private static void performCuDNNReLU(String instName, MatrixObject in, Pointer dstData, cudnnTensorDescriptor srcTensorDesc) throws DMLRuntimeException {
long t0=0;
try {
cudnnTensorDescriptor dstTensorDesc = srcTensorDesc;
Pointer srcData = getDensePointer(in, true, instName);
cudnnActivationDescriptor activationDescriptor = new cudnnActivationDescriptor();
cudnnCreateActivationDescriptor(activationDescriptor);
double dummy = -1;
cudnnSetActivationDescriptor(activationDescriptor, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, dummy);
if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
cudnnActivationForward(cudnnHandle, activationDescriptor,
one(), srcTensorDesc, srcData,
zero(), dstTensorDesc, dstData);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ACTIVATION_FORWARD_LIB, System.nanoTime() - t0);
}
finally {
long t1=0;
if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDNN_CLEANUP, System.nanoTime() - t1);
}
}
/**
* Performs the relu operation on the GPU.
* @param ec currently active {@link ExecutionContext}
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param in input matrix
* @param outputName name of the output matrix
* @throws DMLRuntimeException if an error occurs
*/
public static void relu(ExecutionContext ec, String instName, MatrixObject in, String outputName) throws DMLRuntimeException {
long N = in.getNumRows();
long CHW = in.getNumColumns();
MatrixObject output = ec.getMatrixObject(outputName);
getDenseMatrixOutputForGPUInstruction(ec, instName, outputName); // Allocated the dense output matrix
long t0=0;
cudnnTensorDescriptor srcTensorDesc = ((JCudaObject)in.getGPUObject()).getTensorDescriptor();
if(N*CHW >= numDoublesIn2GB || srcTensorDesc == null) {
// Invokes relu(double* A, double* ret, int rlen, int clen)
if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
Pointer dstData = getDensePointer(output, instName);
Pointer srcData = getDensePointer(in, instName); // TODO: FIXME: Add sparse kernel support for relu
kernels.launchKernel("relu",
ExecutionConfig.getConfigForSimpleMatrixOperations((int)N, (int)CHW),
srcData, dstData, (int)N, (int) CHW);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_RELU_KERNEL, System.nanoTime() - t0);
}
else {
performCuDNNReLU(instName, in, getDensePointer(output, true, instName), srcTensorDesc);
}
}
//********************************************************************/
//************* End of DEEP LEARNING Operators ***********************/
//********************************************************************/
//********************************************************************/
//********** TRANSPOSE SELF MATRIX MULTIPLY Functions ****************/
//********************************************************************/
/**
* Performs tsmm, A %*% A' or A' %*% A, on GPU by exploiting cublasDsyrk(...)
*
* @param ec execution context
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param left input matrix, as in a tsmm expression like A %*% A' or A' %*% A, we just need to check whether the left one is transposed or not, I named it 'left'
* @param outputName output matrix name
* @param isLeftTransposed if true, left transposed
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
public static void matmultTSMM(ExecutionContext ec, String instName, MatrixObject left, String outputName,
boolean isLeftTransposed) throws DMLRuntimeException {
if(isInSparseFormat(left)) {
// For sparse TSMM, invoke matmult (TODO: possible performance improvement)
matmult(ec, instName, left, left, outputName, isLeftTransposed, !isLeftTransposed);
return;
}
// For dense TSMM, exploit cublasDsyrk(...) and call custom kernel to flip the matrix
MatrixObject output = ec.getMatrixObject(outputName);
getDenseMatrixOutputForGPUInstruction(ec, instName, outputName); // Allocated the dense output matrix
// Since CuBLAS expects inputs in column-major format,
// reverse the order of matrix-multiplication and take care of dimension mismatch.
int transa = isLeftTransposed ? cublasOperation.CUBLAS_OP_N : cublasOperation.CUBLAS_OP_T;
// Note: the dimensions are swapped
int m = (int) (isLeftTransposed ? left.getNumColumns() : left.getNumRows());
int k = (int) (isLeftTransposed ? left.getNumRows() : left.getNumColumns());
if(m == -1)
throw new DMLRuntimeException("Incorrect dimensions");
int lda = (int) (isLeftTransposed ? m : k);
int ldc = m;
if(!left.getGPUObject().isAllocated())
throw new DMLRuntimeException("Input is not allocated:" + left.getGPUObject().isAllocated());
if(!output.getGPUObject().isAllocated())
throw new DMLRuntimeException("Output is not allocated:" + output.getGPUObject().isAllocated());
Pointer A = getDensePointer(left, instName);
Pointer C = getDensePointer(output, instName);
long t0=0, t1=0;
if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
JCublas2.cublasDsyrk(cublasHandle, cublasFillMode.CUBLAS_FILL_MODE_LOWER,transa, m, k, one(), A, lda, zero(), C, ldc);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SYRK_LIB, System.nanoTime() - t0);
if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
copyUpperToLowerTriangle(instName, output);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_UPPER_TO_LOWER_TRIANGLE_KERNEL, System.nanoTime() - t1);
}
/**
* Used for all version of TSMM where the result is known to be symmetric.
* Hence, we compute only the upper triangular matrix and copy this partial
* result down to lower triangular matrix once.
*
* @param instName instruction name
* @param ret upper triangular matrix
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
private static void copyUpperToLowerTriangle(String instName, MatrixObject ret) throws DMLRuntimeException {
if(isInSparseFormat(ret)) {
throw new DMLRuntimeException("Sparse GPU copyUpperToLowerTriangle is not implemented");
}
if(ret.getNumRows() != ret.getNumColumns()) {
throw new DMLRuntimeException("Only square matrix kernel is implemented for copyUpperToLowerTriangle");
}
int dim = (int) ret.getNumRows();
kernels.launchKernel("copy_u2l_dense",
ExecutionConfig.getConfigForSimpleMatrixOperations(dim, dim),
getDensePointer(ret, instName), dim, dim*dim);
}
//********************************************************************/
//******** End of TRANSPOSE SELF MATRIX MULTIPLY Functions ***********/
//********************************************************************/
//********************************************************************/
//***************** MATRIX MULTIPLY Functions ************************/
//********************************************************************/
/**
* Matrix multiply on GPU
* Examines sparsity and shapes and routes call to appropriate method
* from cuBLAS or cuSparse
* C = op(A) x op(B)
* @param ec Current {@link ExecutionContext} instance
* @param instName name of the invoking instruction to record{@link Statistics}.
* @param left1 Matrix A
* @param right1 Matrix B
* @param outputName Name of the output matrix C (in code generated after LOP layer)
* @param isLeftTransposed1 op for A, transposed or not
* @param isRightTransposed1 op for B, tranposed or not
* @return output of matrix multiply
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
public static MatrixObject matmult(ExecutionContext ec, String instName, MatrixObject left1, MatrixObject right1, String outputName,
boolean isLeftTransposed1, boolean isRightTransposed1) throws DMLRuntimeException {
if(!left1.getGPUObject().isAllocated() || !right1.getGPUObject().isAllocated())
throw new DMLRuntimeException("One of input is not allocated:" + left1.getGPUObject().isAllocated() + " " + right1.getGPUObject().isAllocated());
boolean bothDense = !left1.getGPUObject().isInSparseFormat() && !right1.getGPUObject().isInSparseFormat();
boolean bothSparse = left1.getGPUObject().isInSparseFormat() && right1.getGPUObject().isInSparseFormat();
MatrixObject output = ec.getMatrixObject(outputName);
if (bothDense) { // Dense C = Dense A * Dense B
// For both dense, do cuBLAS
getDenseMatrixOutputForGPUInstruction(ec, instName, outputName); // Allocated the dense output matrix
denseDenseMatmult(instName, output, left1, right1, isLeftTransposed1, isRightTransposed1);
}
else if (bothSparse){ // Sparse C = Sparse A * Sparse B
ec.allocateGPUMatrixObject(outputName);
bothSparseMatmult(instName, output, left1, right1, isLeftTransposed1, isRightTransposed1);
}
else { // Either of A or B is sparse, Sparse C = Sparse/Dense A * Dense/Sparse B
// Convert the dense to sparse and use the cusparseDcsrgemm routine
ec.allocateGPUMatrixObject(outputName);
eitherSparseMatmult(instName, output, left1, right1, isLeftTransposed1, isRightTransposed1);
}
return output;
}
/**
* One of the matrices is sparse, the other dense
* C = op(A) x op(B)
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param output allocated output object for C on host to which GPU output will be attached
* @param left Matrix A on host
* @param right Matrix B on host
* @param isLeftTransposed op for A, tranposed or not
* @param isRightTransposed op for B, transposed or not
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
protected static void eitherSparseMatmult(String instName, MatrixObject output, MatrixObject left, MatrixObject right,
boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
int transA = isLeftTransposed ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
int transB = isRightTransposed ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
int m = (int) (isLeftTransposed ? left.getNumColumns() : left.getNumRows()) ;
int n = (int) (isRightTransposed ? right.getNumRows() : right.getNumColumns());
int k = (int) (isLeftTransposed ? left.getNumRows() : left.getNumColumns());
int k1 = (int) (isRightTransposed ? right.getNumColumns() : right.getNumRows());
if(k != k1)
throw new DMLRuntimeException("Dimension mismatch: " + k + " != " + k1);
if(m == -1 || n == -1 || k == -1)
throw new DMLRuntimeException("Incorrect dimensions");
if (left.getGPUObject().isInSparseFormat()) {
// Left sparse, right dense
sparseDenseMatmult(instName, output, left, right, isLeftTransposed, isRightTransposed, transA, transB, m, n, k);
} else {
// Left dense, right sparse
denseSparseMatmult(instName, output, right, left, isLeftTransposed, isRightTransposed, transA, transB, m, n, k);
}
}
/**
* C = op(A) * op(B) where A is dense and B is sparse
* If B is ultrasparse, A is converted to a sparse matrix and {@code sparseSparseMatmult(MatrixObject, int, int, int, int, int, CSRPointer, CSRPointer)} is invoked
* otherwise B is converted to a dense matrix and {@code denseDenseMatmult(Pointer, int, int, int, int, boolean, boolean, Pointer, Pointer)} is invoked.
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param output ?
* @param right ?
* @param left ?
* @param isLeftTransposed ?
* @param isRightTransposed ?
* @param transA ?
* @param transB ?
* @param m ?
* @param n ?
* @param k ?
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
protected static void denseSparseMatmult(String instName, MatrixObject output, MatrixObject right, MatrixObject left,
boolean isLeftTransposed, boolean isRightTransposed, int transA, int transB, int m, int n, int k)
throws DMLRuntimeException {
// right sparse, left dense
CSRPointer B = ((JCudaObject)right.getGPUObject()).jcudaSparseMatrixPtr;
Pointer ADense = getDensePointer(left, instName);
if (B.isUltraSparse(k, n)){
LOG.debug(" GPU Dense-Sparse Matrix Multiplication (Converted to Sparse-Sparse)");
// Convert left to CSR and do cuSparse matmul
int rowsA = (int)left.getNumRows();
int colsA = (int)left.getNumColumns();
long t0=0,t1=0, t2=0;
if (DMLScript.STATISTICS) t0 = System.nanoTime();
Pointer AT = JCudaObject.transpose(ADense, rowsA, colsA, colsA, rowsA);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_TRANSPOSE_LIB, System.nanoTime() - t0);
if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
CSRPointer A = JCudaObject.columnMajorDenseToRowMajorSparse(cusparseHandle, rowsA, colsA, AT);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t1);
if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseTime.getAndAdd(System.nanoTime() - t0);
if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseCount.getAndAdd(1);
sparseSparseMatmult(instName, output, transA, transB, m, n, k, A, B);
if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
A.deallocate();
cudaFreeHelper(AT);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t2, 2);
} else {
LOG.debug(" GPU Dense-Sparse Matrix Multiplication (Converted to Dense-Dense)");
// Convert right to dense and do a cuBlas matmul
// BDenseTransposed is a column major matrix
// Note the arguments to denseDenseMatmult to accommodate for this.
long t0=0, t1=0;
if (DMLScript.STATISTICS) t0 = System.nanoTime();
Pointer BDenseTransposed = B.toColumnMajorDenseMatrix(cusparseHandle, cublasHandle, (int)right.getNumRows(), (int)right.getNumColumns());
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0);
if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseTime.getAndAdd(System.nanoTime() - t0);
if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseCount.getAndAdd(System.nanoTime() - t0);
if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
boolean allocated = output.getGPUObject().acquireDeviceModifyDense(); // To allocate the dense matrix
if (GPUStatistics.DISPLAY_STATISTICS && allocated) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ALLOCATE_DENSE_OUTPUT, System.nanoTime() - t1);
Pointer C = getDensePointer(output, instName);
denseDenseMatmult(instName, C,
(int) left.getNumRows(), (int) left.getNumColumns(),
(int) right.getNumColumns(), (int) right.getNumRows(),
isLeftTransposed, !isRightTransposed,
ADense, BDenseTransposed);
cudaFreeHelper(instName, BDenseTransposed);
}
}
/**
* * C = op(A) * op(B) where A is sparse and B is dense
* If A is ultrasparse, B is converted to a sparse matrix and {@code sparseSparseMatmult(MatrixObject, int, int, int, int, int, CSRPointer, CSRPointer)} is invoked
* otherwise A is converted to a dense matrix and {@code denseDenseMatmult(Pointer, int, int, int, int, boolean, boolean, Pointer, Pointer)} is invoked.
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param output ?
* @param left ?
* @param right ?
* @param isLeftTransposed ?
* @param isRightTransposed ?
* @param transA ?
* @param transB ?
* @param m ?
* @param n ?
* @param k ?
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
protected static void sparseDenseMatmult(String instName, MatrixObject output, MatrixObject left, MatrixObject right,
boolean isLeftTransposed, boolean isRightTransposed, int transA, int transB, int m, int n, int k)
throws DMLRuntimeException {
CSRPointer A = ((JCudaObject)left.getGPUObject()).jcudaSparseMatrixPtr;
Pointer BDense = getDensePointer(right, instName);
if (n == 1){
// Sparse Matrix - Dense Vector multiply
LOG.debug(" GPU Sparse Matrix - Dense Vector Mutliply");
sparseMatrixDenseVectorMult(instName, output, A, BDense, transA, (int)left.getNumRows(), (int)left.getNumColumns());
} else {
long t0=0, t1=0, t2=0;
// Sparse Matrix Dense Matrix multiply
if (A.isUltraSparse(m, k)){
LOG.debug(" GPU Sparse-Dense Matrix Multiplication (Converted to Sparse-Sparse)");
// Convert right to CSR and do cuSparse matmul
int rowsB = (int)right.getNumRows();
int colsB = (int)right.getNumColumns();
if (DMLScript.STATISTICS) t0 = System.nanoTime();
Pointer BT = JCudaObject.transpose(BDense, rowsB, colsB, colsB, rowsB);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_TRANSPOSE_LIB, System.nanoTime() - t0);
if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
CSRPointer B = JCudaObject.columnMajorDenseToRowMajorSparse(cusparseHandle, rowsB, colsB, BT);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t1);
if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseTime.getAndAdd(System.nanoTime() - t0);
if (DMLScript.STATISTICS) GPUStatistics.cudaDenseToSparseCount.getAndAdd(1);
sparseSparseMatmult(instName, output, transA, transB, m, n, k, A, B);
if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
B.deallocate();
cudaFreeHelper(BT);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_CUDA_FREE, System.nanoTime() - t2, 2);
} else {
LOG.debug(" GPU Sparse-Dense Matrix Multiplication (Converted to Dense-Dense)");
// Convert left to dense and do a cuBlas matmul
// ADenseTransposed is a column major matrix
// Note the arguments to denseDenseMatmult to accommodate for this.
if (DMLScript.STATISTICS) t0 = System.nanoTime();
Pointer ADenseTransposed = A.toColumnMajorDenseMatrix(cusparseHandle, cublasHandle, (int)left.getNumRows(), (int)left.getNumColumns());
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0);
if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseTime.getAndAdd(System.nanoTime() - t0);
if (DMLScript.STATISTICS) GPUStatistics.cudaSparseToDenseCount.getAndAdd(System.nanoTime() - t0);
if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
boolean allocated = output.getGPUObject().acquireDeviceModifyDense(); // To allocate the dense matrix
if (allocated && GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ALLOCATE_DENSE_OUTPUT, System.nanoTime() - t1);
Pointer C = getDensePointer(output, instName);
denseDenseMatmult(instName, C,
(int) left.getNumColumns(), (int) left.getNumRows(),
(int) right.getNumRows(), (int) right.getNumColumns(),
!isLeftTransposed, isRightTransposed,
ADenseTransposed, BDense);
cudaFreeHelper(instName, ADenseTransposed);
}
}
}
/**
* C = op(A) x B
* A is a sparse matrix, B is a dense vector
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param output allocated output on the host, to which the GPU output C will be attached
* @param A sparse matrix A on the GPU
* @param B_dense dense matrix/vector B on the GPU
* @param transA op for A, tranposed or not
* @param m number of rows in A (not op(A))
* @param k number of cols in A or number of rows in B (not op(A) or op(B))
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
protected static void sparseMatrixDenseVectorMult(String instName, MatrixObject output, CSRPointer A, Pointer B_dense, int transA,
int m, int k) throws DMLRuntimeException {
long size = m * Sizeof.DOUBLE;
if (transA == CUSPARSE_OPERATION_TRANSPOSE){
size = k * Sizeof.DOUBLE;
}
Pointer C_dense = JCudaObject.allocate(instName, (int)size);
long t1=0;
if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
cusparseDcsrmv(cusparseHandle, transA, m, k, (int)A.nnz, one(), A.descr, A.val, A.rowPtr, A.colInd, B_dense, zero(), C_dense);
cudaDeviceSynchronize(); // Since cusparseDcsrmv is asynchronously executed
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_MATRIX_DENSE_VECTOR_LIB, System.nanoTime() - t1);
((JCudaObject)(output.getGPUObject())).setDenseMatrixCudaPointer(C_dense);
output.getGPUObject().setDeviceModify(size);
}
/**
* Sparse C = Sparse op(A) * Sparse op(B)
* Reroutes call to sparse matrix-vector mult if needed
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param output ?
* @param instName name of the invoking instruction to record{@link Statistics}.
* @param left ?
* @param right ?
* @param isLeftTransposed ?
* @param isRightTransposed ?
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
protected static void bothSparseMatmult(String instName, MatrixObject output, MatrixObject left, MatrixObject right,
boolean isLeftTransposed, boolean isRightTransposed) throws DMLRuntimeException {
int transA = isLeftTransposed ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
int transB = isRightTransposed ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
int m = (int) (isLeftTransposed ? left.getNumColumns() : left.getNumRows()) ;
int n = (int) (isRightTransposed ? right.getNumRows() : right.getNumColumns());
int k = (int) (isLeftTransposed ? left.getNumRows() : left.getNumColumns());
int k1 = (int) (isRightTransposed ? right.getNumColumns() : right.getNumRows());
if(k != k1)
throw new DMLRuntimeException("Dimension mismatch: " + k + " != " + k1);
if(m == -1 || n == -1 || k == -1)
throw new DMLRuntimeException("Incorrect dimensions");
CSRPointer A = ((JCudaObject)left.getGPUObject()).jcudaSparseMatrixPtr;
CSRPointer B = ((JCudaObject)right.getGPUObject()).jcudaSparseMatrixPtr;
// TODO if (m == 1) { // Vector-matrix multiplication
if (!isRightTransposed && right.getNumColumns() == 1){ // Matrix-Vector multiplication
sparseMatrixVectorMult(instName, output, transA, (int)left.getNumRows(), (int)left.getNumColumns(), (int)right.getNumRows(), A, B);
} else { // Matrix-Matrix multiplication
sparseSparseMatmult(instName, output, transA, transB, m, n, k, A, B);
}
}
/**
* Does a sparse matrix-vector multiply.
* C = op(A) x B, A is a sparse matrix, B is a sparse vector with numCols = 1.
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param output allocated output object C to which the GPU output matrix will be attached
* @param transA if A is to be transposed or not (the op in op(A))
* @param m number of rows in A (not op(A))
* @param n number of cols in A (not op(A))
* @param k number of rows in B, (cols in B is assumed to be 1)
* @param A left sparse matrix on GPU
* @param B right sparse vector on GPU
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
protected static void sparseMatrixVectorMult(String instName, MatrixObject output, int transA, int m, int n, int k,
CSRPointer A, CSRPointer B) throws DMLRuntimeException {
LOG.debug(" GPU Sparse Matrix Sparse Vector Multiply (Converted to Sparse Matrix Dense Vector Multiply)");
long t0=0;
if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
Pointer BDenseVector = B.toColumnMajorDenseMatrix(cusparseHandle, cublasHandle, k, 1);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_TO_DENSE, System.nanoTime() - t0);
sparseMatrixDenseVectorMult(instName, output, A, BDenseVector, transA, m, k);
}
/**
* Does a sparse-sparse Matrix multiply
* C = op(A) x op(B), A, B are sparse matrices
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param output allocated output object on host to which the GPU output matrix will be attached
* @param transA op for A - to be transposed or not
* @param transB op for B
* @param m number of rows in op(A)
* @param n number of cols in op(B)
* @param k number of cols in op(A) or rows in op(B)
* @param A left sparse matrix on GPU
* @param B right sparse matrix on GPU
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
protected static void sparseSparseMatmult(String instName, MatrixObject output, int transA, int transB, int m, int n, int k,
CSRPointer A, CSRPointer B) throws DMLRuntimeException {
LOG.debug(" GPU Sparse-Sparse Matrix Multiply ");
long t0=0, t1=0;
if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
CSRPointer C = CSRPointer.allocateForMatrixMultiply(cusparseHandle, A, transA, B, transB, m, n, k);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_ALLOCATE_LIB, System.nanoTime() - t0);
((JCudaObject)output.getGPUObject()).setSparseMatrixCudaPointer(C);
long sizeOfC = CSRPointer.estimateSize(C.nnz, output.getNumRows());
output.getGPUObject().setDeviceModify(sizeOfC);
if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
cusparseDcsrgemm(cusparseHandle, transA, transB, m, n, k,
A.descr, (int)A.nnz, A.val, A.rowPtr, A.colInd,
B.descr, (int)B.nnz, B.val, B.rowPtr, B.colInd,
C.descr, C.val, C.rowPtr, C.colInd);
cudaDeviceSynchronize();
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_MATRIX_SPARSE_MATRIX_LIB, System.nanoTime() - t1);
}
/**
* Dense dense matrix multiply
* C = op(A) * op(B), A and B are dense matrices
* @param instName name of the invoking instruction to record{@link Statistics}.
* @param output output object C on host with GPU data allocated
* @param left1 left matrix A on host (in row-major order)
* @param right1 right matrix B on host (in row-major order)
* @param isLeftTransposed1 op for A, transposed or not
* @param isRightTransposed1 op for B, transposed or not
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
protected static void denseDenseMatmult(String instName, MatrixObject output, MatrixObject left1, MatrixObject right1,
boolean isLeftTransposed1, boolean isRightTransposed1) throws DMLRuntimeException {
Pointer leftPtr = getDensePointer(left1, instName);
Pointer rightPtr = getDensePointer(right1, instName);
int leftRows = (int) left1.getNumRows();
int leftCols = (int) left1.getNumColumns();
int rightRows = (int) right1.getNumRows();
int rightCols = (int) right1.getNumColumns();
Pointer C = getDensePointer(output, instName);
denseDenseMatmult(instName, C, leftRows, leftCols, rightRows, rightCols, isLeftTransposed1, isRightTransposed1,
leftPtr, rightPtr);
}
/**
* Dense-dense matrix multiply
* C = op(A) * op(B), A and B are dense matrices
* On the host, the matrices are in row-major format; cuBLAS expects them in column-major format.
* What we have as input is t(A) and t(B), t(X) = transpose of X.
* We do t(B) %*% t(A) to get t(C);
* If we were to calculate t(t(C), we would get the resultant matrix C, but this would be in column-major format.
* What we really want is t(C). This we already have as the result of t(B) %*% t(A).
* @param instName name of the invoking instruction to record{@link Statistics}.
* @param output output allocated on GPU in column major format
* @param leftRows1 number of rows in A
* @param leftCols1 number of cols in A
* @param rightRows1 number of rows in B
* @param rightCols1 number of cols in B
* @param isLeftTransposed1 op for A, transposed or not
* @param isRightTransposed1 op for B, transposed or not
* @param leftPtr A allocated on the GPU in row-major format
* @param rightPtr B allocated on the GPU in row-major format
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
public static void denseDenseMatmult(String instName, Pointer output, int leftRows1, int leftCols1, int rightRows1,
int rightCols1, boolean isLeftTransposed1, boolean isRightTransposed1, Pointer leftPtr, Pointer rightPtr)
throws DMLRuntimeException {
Pointer A = rightPtr;
Pointer B = leftPtr;
int leftRows = rightCols1;
int leftCols = rightRows1;
int rightRows = leftCols1;
int rightCols = leftRows1;
boolean isLeftTransposed = isRightTransposed1;
boolean isRightTransposed = isLeftTransposed1;
// Note: the dimensions are swapped
int m = (int) (isLeftTransposed ? leftCols : leftRows) ;
int n = (int) (isRightTransposed ? rightRows : rightCols);
int k = (int) (isLeftTransposed ? leftRows : leftCols);
int k1 = (int) (isRightTransposed ? rightCols : rightRows);
if(k != k1)
throw new DMLRuntimeException("Dimension mismatch: " + k + " != " + k1);
if(m == -1 || n == -1 || k == -1)
throw new DMLRuntimeException("Incorrect dimensions");
double[] one = { 1 };
double[] zero = { 0 };
//int lda = leftRows;
//int ldb = leftCols;
int lda = isLeftTransposed ? k : m;
int ldb = isRightTransposed ? n : k;
int ldc = m;
int transa = isLeftTransposed ? cublasOperation.CUBLAS_OP_T : cublasOperation.CUBLAS_OP_N;
int transb = isRightTransposed ? cublasOperation.CUBLAS_OP_T : cublasOperation.CUBLAS_OP_N;
long t0=0;
if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
Pointer C = output;
if (m == 1 && n == 1){
// Vector product
LOG.debug(" GPU Dense-dense Vector Product");
double[] result = {0};
JCublas2.cublasDdot(cublasHandle, k, A, 1, B, 1, Pointer.to(result));
// By default in CuBlas V2, cublas pointer mode is set to CUBLAS_POINTER_MODE_HOST.
// This means that scalar values passed are on host (as opposed to on device).
// The result is copied from the host back to the device so that the rest of
// infrastructure can treat it uniformly.
cudaMemcpy(C, Pointer.to(result), 1 * Sizeof.DOUBLE, cudaMemcpyHostToDevice);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_DOT_LIB, System.nanoTime() - t0);
} else if (m == 1) {
// Vector-matrix multiply
LOG.debug(" GPU Dense Vector-Matrix Multiply");
transb = isRightTransposed ? cublasOperation.CUBLAS_OP_N : cublasOperation.CUBLAS_OP_T;
JCublas2.cublasDgemv(cublasHandle, transb, rightRows, rightCols, Pointer.to(one), B, ldb, A, 1, Pointer.to(zero), C, 1);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_VECTOR_DENSE_MATRIX_LIB, System.nanoTime() - t0);
} else if (n == 1){
// Matrix-vector multiply
LOG.debug(" GPU Dense Matrix-Vector Multiply");
JCublas2.cublasDgemv(cublasHandle, transa, leftRows, leftCols, Pointer.to(one), A, lda, B, 1, Pointer.to(zero), C, 1);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_MATRIX_DENSE_VECTOR_LIB, System.nanoTime() - t0);
} else {
LOG.debug(" GPU Dense-Dense Matrix Multiply ");
JCublas2.cublasDgemm(cublasHandle, transa, transb, m, n, k, Pointer.to(one), A, lda, B, ldb, Pointer.to(zero), C, ldc);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_MATRIX_DENSE_MATRIX_LIB, System.nanoTime() - t0);
}
}
//********************************************************************/
//***************** END OF MATRIX MULTIPLY Functions *****************/
//********************************************************************/
//********************************************************************/
//**************** UNARY AGGREGATE Functions ************************/
//********************************************************************/
/**
* Entry point to perform Unary aggregate operations on the GPU.
* The execution context object is used to allocate memory for the GPU.
* @param ec Instance of {@link ExecutionContext}, from which the output variable will be allocated
* @param instName name of the invoking instruction to record{@link Statistics}.
* @param in1 input matrix
* @param output output matrix/scalar name
* @param op Instance of {@link AggregateUnaryOperator} which encapsulates the direction of reduction/aggregation and the reduction operation.
* @throws DMLRuntimeException if {@link DMLRuntimeException} occurs
*/
public static void unaryAggregate(ExecutionContext ec, String instName, MatrixObject in1, String output, AggregateUnaryOperator op)
throws DMLRuntimeException {
final int REDUCTION_ALL = 1;
final int REDUCTION_ROW = 2;
final int REDUCTION_COL = 3;
final int REDUCTION_DIAG = 4;
// A kahan sum implemention is not provided. is a "uak+" or other kahan operator is encountered,
// it just does regular summation reduction.
final int OP_PLUS = 1;
final int OP_PLUS_SQ = 2;
final int OP_MEAN = 3;
final int OP_VARIANCE = 4;
final int OP_MULTIPLY = 5;
final int OP_MAX = 6;
final int OP_MIN = 7;
final int OP_MAXINDEX = 8;
final int OP_MININDEX = 9;
// Sanity Checks
if(!in1.getGPUObject().isAllocated())
throw new DMLRuntimeException("Internal Error - The input is not allocated for a GPU Aggregate Unary:" + in1.getGPUObject().isAllocated());
boolean isSparse = in1.getGPUObject().isInSparseFormat();
IndexFunction indexFn = op.indexFn;
AggregateOperator aggOp = op.aggOp;
// Convert Reduction direction to a number to pass to CUDA kernel
int reductionDirection = -1;
if (indexFn instanceof ReduceAll){
reductionDirection = REDUCTION_ALL;
} else if (indexFn instanceof ReduceRow){
reductionDirection = REDUCTION_ROW;
} else if (indexFn instanceof ReduceCol){
reductionDirection = REDUCTION_COL;
} else if (indexFn instanceof ReduceDiag){
reductionDirection = REDUCTION_DIAG;
} else {
throw new DMLRuntimeException("Internal Error - Invalid index function type, only reducing along rows, columns, diagonals or all elements is supported in Aggregate Unary operations");
}
assert reductionDirection !=-1 : "Internal Error - Incorrect type of reduction direction set for aggregate unary GPU instruction";
// Convert function type to a number to pass to the CUDA Kernel
int opIndex = -1;
if (aggOp.increOp.fn instanceof KahanPlus) {
opIndex = OP_PLUS;
} else if (aggOp.increOp.fn instanceof KahanPlusSq) {
opIndex = OP_PLUS_SQ;
} else if (aggOp.increOp.fn instanceof Mean) {
opIndex = OP_MEAN;
} else if (aggOp.increOp.fn instanceof CM) {
assert ((CM)aggOp.increOp.fn).getAggOpType() == CMOperator.AggregateOperationTypes.VARIANCE : "Internal Error - Invalid Type of CM operator for Aggregate Unary operation on GPU";
opIndex = OP_VARIANCE;
} else if (aggOp.increOp.fn instanceof Plus) {
opIndex = OP_PLUS;
} else if (aggOp.increOp.fn instanceof Multiply) {
opIndex = OP_MULTIPLY;
} else if (aggOp.increOp.fn instanceof Builtin) {
Builtin b = (Builtin)aggOp.increOp.fn;
switch(b.bFunc) {
case MAX: opIndex = OP_MAX; break;
case MIN: opIndex = OP_MIN; break;
case MAXINDEX: opIndex = OP_MAXINDEX; break;
case MININDEX: opIndex = OP_MININDEX;break;
default:
new DMLRuntimeException("Internal Error - Unsupported Builtin Function for Aggregate unary being done on GPU");
}
} else {
throw new DMLRuntimeException("Internal Error - Aggregate operator has invalid Value function");
}
assert opIndex != -1 : "Internal Error - Incorrect type of operation set for aggregate unary GPU instruction";
int rlen = (int)in1.getNumRows();
int clen = (int)in1.getNumColumns();
if (isSparse){
// The strategy for the time being is to convert sparse to dense
// until a sparse specific kernel is written.
((JCudaObject)in1.getGPUObject()).sparseToDense(instName);
// long nnz = in1.getNnz();
// assert nnz > 0 : "Internal Error - number of non zeroes set to " + nnz + " in Aggregate Binary for GPU";
// MatrixObject out = ec.getSparseMatrixOutputForGPUInstruction(output, nnz);
// throw new DMLRuntimeException("Internal Error - Not implemented");
}
Pointer out = null;
if (reductionDirection == REDUCTION_COL || reductionDirection == REDUCTION_ROW) {
// Matrix output
MatrixObject out1 = getDenseMatrixOutputForGPUInstruction(ec, instName, output);
out = getDensePointer(out1, instName);
}
Pointer in = getDensePointer(in1, instName);
int size = rlen * clen;
// For scalars, set the scalar output in the Execution Context object
switch (opIndex){
case OP_PLUS: {
switch(reductionDirection) {
case REDUCTION_ALL : {
double result = reduceAll(instName, "reduce_sum", in, size);
ec.setScalarOutput(output, new DoubleObject(result));
break;
}
case REDUCTION_COL : { // The names are a bit misleading, REDUCTION_COL refers to the direction (reduce all elements in a column)
reduceRow(instName, "reduce_row_sum", in, out, rlen, clen);
break;
}
case REDUCTION_ROW : {
reduceCol(instName, "reduce_col_sum", in, out, rlen, clen);
break;
}
case REDUCTION_DIAG :
throw new DMLRuntimeException("Internal Error - Row, Column and Diag summation not implemented yet");
}
break;
}
case OP_PLUS_SQ : {
// Calculate the squares in a temporary object tmp
Pointer tmp = JCudaObject.allocate(instName, size * Sizeof.DOUBLE);
squareMatrix(instName, in, tmp, rlen, clen);
// Then do the sum on the temporary object and free it
switch(reductionDirection) {
case REDUCTION_ALL : {
double result = reduceAll(instName, "reduce_sum", tmp, size);
ec.setScalarOutput(output, new DoubleObject(result));
break;
}
case REDUCTION_COL : { // The names are a bit misleading, REDUCTION_COL refers to the direction (reduce all elements in a column)
reduceRow(instName, "reduce_row_sum", tmp, out, rlen, clen);
break;
}
case REDUCTION_ROW : {
reduceCol(instName, "reduce_col_sum", tmp, out, rlen, clen);
break;
}
default:
throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for summation squared");
}
cudaFreeHelper(instName, tmp);
break;
}
case OP_MEAN:{
switch(reductionDirection) {
case REDUCTION_ALL: {
double result = reduceAll(instName, "reduce_sum", in, size);
double mean = result / size;
ec.setScalarOutput(output, new DoubleObject(mean));
break;
}
case REDUCTION_COL: {
reduceRow(instName, "reduce_row_mean", in, out, rlen, clen);
break;
}
case REDUCTION_ROW: {
reduceCol(instName, "reduce_col_mean", in, out, rlen, clen);
break;
}
default:
throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for mean");
}
break;
}
case OP_MULTIPLY : {
switch (reductionDirection) {
case REDUCTION_ALL: {
double result = reduceAll(instName, "reduce_prod", in, size);
ec.setScalarOutput(output, new DoubleObject(result));
break;
}
default:
throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for multiplication");
}
break;
}
case OP_MAX :{
switch(reductionDirection) {
case REDUCTION_ALL: {
double result = reduceAll(instName, "reduce_max", in, size);
ec.setScalarOutput(output, new DoubleObject(result));
break;
}
case REDUCTION_COL: {
reduceRow(instName, "reduce_row_max", in, out, rlen, clen);
break;
}
case REDUCTION_ROW: {
reduceCol(instName, "reduce_col_max", in, out, rlen, clen);
break;
}
default:
throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for max");
}
break;
}
case OP_MIN :{
switch(reductionDirection) {
case REDUCTION_ALL: {
double result = reduceAll(instName, "reduce_min", in, size);
ec.setScalarOutput(output, new DoubleObject(result));
break;
}
case REDUCTION_COL: {
reduceRow(instName, "reduce_row_min", in, out, rlen, clen);
break;
}
case REDUCTION_ROW: {
reduceCol(instName, "reduce_col_min", in, out, rlen, clen);
break;
}
default:
throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for min");
}
break;
}
case OP_VARIANCE : {
// Temporary GPU array for
Pointer tmp = JCudaObject.allocate(instName, size * Sizeof.DOUBLE);
Pointer tmp2 = JCudaObject.allocate(instName, size * Sizeof.DOUBLE);
switch(reductionDirection) {
case REDUCTION_ALL: {
double result = reduceAll(instName, "reduce_sum", in, size);
double mean = result / size;
// Subtract mean from every element in the matrix
ScalarOperator minusOp = new RightScalarOperator(Minus.getMinusFnObject(), mean);
matrixScalarOp(instName, in, mean, rlen, clen, tmp, minusOp);
squareMatrix(instName, tmp, tmp2, rlen, clen);
double result2 = reduceAll(instName, "reduce_sum", tmp2, size);
double variance = result2 / (size - 1);
ec.setScalarOutput(output, new DoubleObject(variance));
break;
}
case REDUCTION_COL: {
reduceRow(instName, "reduce_row_mean", in, out, rlen, clen);
// Subtract the row-wise mean from every element in the matrix
BinaryOperator minusOp = new BinaryOperator(Minus.getMinusFnObject());
matrixMatrixOp(instName, in, out, rlen, clen, VectorShape.NONE.code(), VectorShape.COLUMN.code(), tmp, minusOp);
squareMatrix(instName, tmp, tmp2, rlen, clen);
Pointer tmpRow = JCudaObject.allocate(instName, rlen * Sizeof.DOUBLE);
reduceRow(instName, "reduce_row_sum", tmp2, tmpRow, rlen, clen);
ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), clen - 1);
matrixScalarOp(instName, tmpRow, clen - 1, rlen, clen, out, divideOp);
cudaFreeHelper(instName, tmpRow);
break;
}
case REDUCTION_ROW: {
reduceCol(instName, "reduce_col_mean", in, out, rlen, clen);
// Subtract the columns-wise mean from every element in the matrix
BinaryOperator minusOp = new BinaryOperator(Minus.getMinusFnObject());
matrixMatrixOp(instName, in, out, rlen, clen, VectorShape.NONE.code(), VectorShape.ROW.code(), tmp, minusOp);
squareMatrix(instName, tmp, tmp2, rlen, clen);
Pointer tmpCol = JCudaObject.allocate(instName, clen * Sizeof.DOUBLE);
reduceCol(instName, "reduce_col_sum", tmp2, tmpCol, rlen, clen);
ScalarOperator divideOp = new RightScalarOperator(Divide.getDivideFnObject(), rlen - 1);
matrixScalarOp(instName, tmpCol, rlen - 1, rlen, clen, out, divideOp);
cudaFreeHelper(instName, tmpCol);
break;
}
default:
throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for variance");
}
cudaFreeHelper(instName, tmp);
cudaFreeHelper(instName, tmp2);
break;
}
case OP_MAXINDEX : {
switch(reductionDirection) {
case REDUCTION_COL:
throw new DMLRuntimeException("Internal Error - Column maxindex of matrix not implemented yet for GPU ");
default:
throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for maxindex");
}
// break;
}
case OP_MININDEX : {
switch(reductionDirection) {
case REDUCTION_COL:
throw new DMLRuntimeException("Internal Error - Column minindex of matrix not implemented yet for GPU ");
default:
throw new DMLRuntimeException("Internal Error - Unsupported reduction direction for minindex");
}
// break;
}
default : throw new DMLRuntimeException("Internal Error - Invalid GPU Unary aggregate function!");
}
}
/**
* Helper method to square a matrix in GPU memory
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param in input matrix on GPU
* @param out output matrix on GPU
* @param rlen row length
* @param clen column length
* @throws DMLRuntimeException if error
*/
private static void squareMatrix(String instName, Pointer in, Pointer out, int rlen, int clen) throws DMLRuntimeException {
ScalarOperator power2op = new RightScalarOperator(Power.getPowerFnObject(), 2);
matrixScalarOp(instName, in, 2, rlen, clen, out, power2op);
}
/**
* Do a simple reduction, the output of which is a single value
* @param kernelFunction name of the kernel function to invoke
* @param in {@link Pointer} to matrix in device memory
* @param n size of array
* @return the reduced value
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
private static double reduceAll(String instName, String kernelFunction, Pointer in, int n) throws DMLRuntimeException {
int[] tmp = getKernelParamsForReduceAll(n);
int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2];
Pointer tempOut = JCudaObject.allocate(instName, n * Sizeof.DOUBLE);
long t1=0,t2=0,t3=0;
if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
kernels.launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem), in, tempOut, n);
cudaDeviceSynchronize();
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_REDUCE_ALL_KERNEL, System.nanoTime() - t1);
int s = blocks;
while (s > 1) {
tmp = getKernelParamsForReduceAll(s);
blocks = tmp[0]; threads = tmp[1]; sharedMem = tmp[2];
if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
kernels.launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem),
tempOut, tempOut, s);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_REDUCE_ALL_KERNEL, System.nanoTime() - t2);
s = (s + (threads*2-1)) / (threads*2);
}
double[] result = {-1f};
if (GPUStatistics.DISPLAY_STATISTICS) t3 = System.nanoTime();
cudaMemcpy(Pointer.to(result), tempOut, Sizeof.DOUBLE, cudaMemcpyDeviceToHost);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DEVICE_TO_HOST, System.nanoTime() - t3);
cudaFreeHelper(instName, tempOut);
return result[0];
}
/**
* Do a reduction by row. Data is reduced per row and the
* resulting vector is calculated.
* @param kernelFunction name of the kernel function to invoke
* @param in {@link Pointer} to input matrix in device memory (size - rows * columns)
* @param out {@link Pointer} to output matrix in device memory (size - rows * 1)
* @param rows number of rows in input matrix
* @param cols number of columns in input matrix
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
private static void reduceRow(String instName, String kernelFunction, Pointer in, Pointer out, int rows, int cols) throws DMLRuntimeException {
int[] tmp = getKernelParamsForReduceByRow(rows, cols);
int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2];
long t0=0;
if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
kernels.launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem),
in, out, rows, cols);
cudaDeviceSynchronize();
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_REDUCE_ROW_KERNEL, System.nanoTime() - t0);
}
/**
* Do a reduction by column. Data is reduced per column and the
* resulting vector is calculated.
* @param kernelFunction name of the kernel function to invoke
* @param in {@link Pointer} to input matrix in device memory (size - rows * columns)
* @param out {@link Pointer} to output matrix in device memory (size - 1 * cols)
* @param rows number of rows in input matrix
* @param cols number of columns in input matrix
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
private static void reduceCol(String instName, String kernelFunction, Pointer in, Pointer out, int rows, int cols) throws DMLRuntimeException {
int[] tmp = getKernelParamsForReduceByCol(rows, cols);
int blocks = tmp[0], threads = tmp[1], sharedMem = tmp[2];
long t0=0;
if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
kernels.launchKernel(kernelFunction, new ExecutionConfig(blocks, threads, sharedMem),
in, out, rows, cols);
cudaDeviceSynchronize();
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_REDUCE_COL_KERNEL, System.nanoTime() - t0);
}
/**
* Get threads, blocks and shared memory for a reduce all operation
* @param n size of input array
* @return integer array containing {blocks, threads, shared memory}
*/
private static int[] getKernelParamsForReduceAll(int n) throws DMLRuntimeException{
final int MAX_THREADS = getMaxThreads();
final int MAX_BLOCKS = getMaxBlocks();
final int WARP_SIZE = getWarpSize();
int threads = (n < MAX_THREADS *2) ? nextPow2((n + 1)/ 2) : MAX_THREADS;
int blocks = (n + (threads * 2 - 1)) / (threads * 2);
blocks = Math.min(MAX_BLOCKS, blocks);
int sharedMemSize = threads * Sizeof.DOUBLE;
if (threads <= WARP_SIZE){
sharedMemSize *= 2;
}
return new int[] {blocks, threads, sharedMemSize};
}
/**
* Get threads, blocks and shared memory for a reduce by row operation
* @param rows number of rows in input matrix
* @param cols number of columns in input matrix
* @return integer array containing {blocks, threads, shared memory}
*/
private static int[] getKernelParamsForReduceByRow(int rows, int cols) throws DMLRuntimeException {
final int WARP_SIZE = getWarpSize();
final int MAX_THREADS = getMaxThreads();
int threads = (cols < MAX_THREADS *2) ? nextPow2((cols + 1)/ 2) : MAX_THREADS;
int blocks = rows;
int sharedMemSize = threads * Sizeof.DOUBLE;
if (threads <= WARP_SIZE){
sharedMemSize *=2;
}
return new int[] {blocks, threads, sharedMemSize};
}
private static int[] getKernelParamsForReduceByCol(int rows, int cols) throws DMLRuntimeException {
final int MAX_THREADS = getMaxThreads();
final int MAX_BLOCKS = getMaxBlocks();
final int WARP_SIZE = getWarpSize();
int threads = Math.min(cols, MAX_THREADS);
int blocks = Math.min(cols/MAX_THREADS, MAX_BLOCKS);
if (cols % MAX_THREADS != 0) blocks++;
int sharedMemSize = threads * Sizeof.DOUBLE;
if (threads <= WARP_SIZE){
sharedMemSize *=2;
}
return new int[] {blocks, threads, sharedMemSize};
}
private static int nextPow2(int x)
{
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return ++x;
}
//********************************************************************/
//**************** END OF UNARY AGGREGATE Functions *****************/
//********************************************************************/
//********************************************************************/
//************ Matrix-Matrix & Matrix-Scalar Functions ***************/
//********************************************************************/
/**
* Entry point to perform elementwise matrix-scalar operation specified by op
*
* @param ec execution context
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param in input matrix
* @param outputName output matrix name
* @param isInputTransposed true if input transposed
* @param op scalar operator
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
public static void matrixScalarArithmetic(ExecutionContext ec, String instName, MatrixObject in, String outputName, boolean isInputTransposed, ScalarOperator op) throws DMLRuntimeException {
double constant = op.getConstant();
boolean isCUDALibAvailable = (op.fn instanceof Multiply
|| (op.fn instanceof Divide && op instanceof RightScalarOperator && constant != 0)) && !isSparseAndEmpty(in);
if(!isCUDALibAvailable) {
if(constant == 0) {
if(op.fn instanceof Plus || (op.fn instanceof Minus && op instanceof RightScalarOperator) || op.fn instanceof Or) {
deviceCopy(ec, instName, in, outputName, isInputTransposed);
}
else if(op.fn instanceof Multiply || op.fn instanceof And) {
setOutputToConstant(ec, instName, 0.0, outputName);
}
else if(op.fn instanceof Power) {
setOutputToConstant(ec, instName, 1.0, outputName);
}
else if(op.fn instanceof Divide && isSparseAndEmpty(in)) {
setOutputToConstant(ec, instName, Double.NaN, outputName);
}
else if(op.fn instanceof Divide) {
//For division, IEEE 754 defines x/0.0 as INFINITY and 0.0/0.0 as NaN.
compareAndSet(ec, instName, in, outputName, 0.0, 1e-6, Double.NaN, Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY);
}
else {
// TODO: Potential to optimize
matrixScalarOp(ec, instName, in, outputName, isInputTransposed, op);
}
}
else if(constant == 1.0 && op.fn instanceof Or) {
setOutputToConstant(ec, instName, 1.0, outputName);
}
else if(constant == 1.0 && (op.fn instanceof And || op.fn instanceof Power)) {
deviceCopy(ec, instName, in, outputName, isInputTransposed);
}
else {
matrixScalarOp(ec, instName, in, outputName, isInputTransposed, op);
}
}
else {
double alpha = 0;
if(op.fn instanceof Multiply) {
alpha = op.getConstant();
}
else if(op.fn instanceof Divide && op instanceof RightScalarOperator) {
alpha = Math.pow(op.getConstant(), -1.0);
}
else {
throw new DMLRuntimeException("Unsupported op");
}
// TODO: Performance optimization: Call cublasDaxpy if(in.getNumRows() == 1 || in.getNumColumns() == 1)
// C = alpha* op( A ) + beta* op ( B )
dgeam(ec, instName, in, in, outputName, isInputTransposed, isInputTransposed, alpha, 0.0);
}
}
/**
* Performs elementwise operation specified by op of two input matrices in1 and in2
*
* @param ec execution context
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param in1 input matrix 1
* @param in2 input matrix 2
* @param outputName output matrix name
* @param isLeftTransposed true if left-transposed
* @param isRightTransposed true if right-transposed
* @param op binary operator
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
public static void matrixScalarArithmetic(ExecutionContext ec, String instName, MatrixObject in1, MatrixObject in2,
String outputName, boolean isLeftTransposed, boolean isRightTransposed, BinaryOperator op) throws DMLRuntimeException {
boolean isCUDALibAvailable = (op.fn instanceof Plus || op.fn instanceof Minus) && !isSparseAndEmpty(in1) && !isSparseAndEmpty(in2) && !isVector(in1) && !isVector(in2);
if(!isCUDALibAvailable) {
matrixMatrixOp(ec, instName, in1, in2, outputName, isLeftTransposed, isRightTransposed, op);
}
else {
double alpha;
double beta;
if(op.fn instanceof Plus) {
alpha = 1.0;
beta = 1.0;
}
else if(op.fn instanceof Minus) {
alpha = 1.0;
beta = -1.0;
}
else {
throw new DMLRuntimeException("Unsupported op");
}
// C = alpha* op( A ) + beta* op ( B )
dgeam(ec, instName, in1, in2, outputName, isLeftTransposed, isRightTransposed, alpha, beta);
}
}
/**
* Utility to do matrix-scalar operation kernel
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param ec execution context
* @param in input matrix
* @param outputName output variable name
* @param isInputTransposed true if input is transposed
* @param op operator
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
private static void matrixScalarOp(ExecutionContext ec, String instName, MatrixObject in, String outputName, boolean isInputTransposed,
ScalarOperator op) throws DMLRuntimeException {
if(isInputTransposed)
throw new DMLRuntimeException("Transposing the input is not supported");
int rlenA = (int) in.getNumRows();
int clenA = (int) in.getNumColumns();
Pointer A = getDensePointer(in, instName); // TODO: FIXME: Implement sparse binCellSparseScalarOp kernel
double scalar = op.getConstant();
MatrixObject out = ec.getMatrixObject(outputName);
getDenseMatrixOutputForGPUInstruction(ec, instName, outputName); // Allocated the dense output matrix
Pointer C = getDensePointer(out, instName);
matrixScalarOp(instName, A, scalar, rlenA, clenA, C, op);
}
/**
* Helper method to launch binary scalar-matrix arithmetic operations CUDA kernel.
* This method is isolated to be taken advatage of from other operations
* as it accepts JCuda {@link Pointer} instances instead of {@link MatrixObject} instances.
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param a the dense input matrix (allocated on GPU)
* @param scalar the scalar value to do the op
* @param rlenA row length of matrix a
* @param clenA column lenght of matrix a
* @param c the dense output matrix
* @param op operation to perform
* @throws DMLRuntimeException throws runtime exception
*/
private static void matrixScalarOp(String instName, Pointer a, double scalar, int rlenA, int clenA, Pointer c, ScalarOperator op) throws DMLRuntimeException {
int isLeftScalar = (op instanceof LeftScalarOperator) ? 1 : 0;
int size = rlenA * clenA;
long t0=0;
if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
kernels.launchKernel("matrix_scalar_op",
ExecutionConfig.getConfigForSimpleVectorOperations(size),
a, scalar, c, size, getBinaryOp(op.fn), isLeftScalar);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MATRIX_SCALAR_OP_KERNEL, System.nanoTime() - t0);
}
/**
* Utility to launch binary cellwise matrix-matrix operations CUDA kernel
*
* @param ec execution context
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param in1 left input matrix
* @param in2 right input matrix
* @param outputName output variable name
* @param isLeftTransposed true if left matrix is transposed
* @param isRightTransposed true if right matrix is transposed
* @param op operator
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
private static void matrixMatrixOp(ExecutionContext ec, String instName, MatrixObject in1, MatrixObject in2,
String outputName, boolean isLeftTransposed, boolean isRightTransposed, BinaryOperator op) throws DMLRuntimeException {
boolean isEmpty1 = isSparseAndEmpty(in1);
boolean isEmpty2 = isSparseAndEmpty(in2);
int rlenA = (int) in1.getNumRows();
int rlenB = (int) in2.getNumRows();
int clenA = (int) in1.getNumColumns();
int clenB = (int) in2.getNumColumns();
int vecStatusA = getVectorStatus(rlenA, clenA).code();
int vecStatusB = getVectorStatus(rlenB, clenB).code();
if (isEmpty1 && isEmpty2){
MatrixObject out = ec.getMatrixObject(outputName);
ec.allocateGPUMatrixObject(outputName);
// When both inputs are empty, the output is empty too (except in the case of division)
if (op.fn instanceof Divide) {
((JCudaObject) out.getGPUObject()).allocateAndFillDense(Double.NaN);
} else {
((JCudaObject) out.getGPUObject()).allocateSparseAndEmpty();
}
}
// Check for M1 * M2 when M1 is empty; if M2 is a vector then fallback to general case
else if(isEmpty1 && clenB != 1 && rlenB != 1) {
// C = empty_in1 op in2 ==> becomes ==> C = 0.0 op in2
matrixScalarArithmetic(ec, instName, in2, outputName, isRightTransposed, new LeftScalarOperator(op.fn, 0.0));
}
// Check for M1 * M2 when M2 is empty; if M1 is a vector then fallback to general case
else if(isEmpty2 && clenA != 1 && rlenA != 1) {
// C = in1 op empty_in2 ==> becomes ==> C = in1 op 0.0
matrixScalarArithmetic(ec, instName, in1, outputName, isLeftTransposed, new RightScalarOperator(op.fn, 0.0));
}
else {
Pointer A = getDensePointer(in1, instName); // TODO: FIXME: Implement sparse binCellSparseOp kernel
Pointer B = getDensePointer(in2, instName); // TODO: FIXME: Implement sparse binCellSparseOp kernel
MatrixObject out = ec.getMatrixObject(outputName);
getDenseMatrixOutputForGPUInstruction(ec, instName, outputName); // Allocated the dense output matrix
Pointer C = getDensePointer(out, instName);
int maxRlen = Math.max(rlenA, rlenB);
int maxClen = Math.max(clenA, clenB);
matrixMatrixOp(instName, A, B, maxRlen, maxClen, vecStatusA, vecStatusB, C, op);
}
}
/**
* Do an elementwise matrix-matrix arithmetic operation on the GPU
* c = a op b
* Either rows and cols in A are the same as in B or
* one of them is a vector or both are.
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param a The input matrix a allocated on the GPU
* @param b The input matrix b allocated on the GPU
* @param maxRlen the maximum of the row lengths between a & b
* @param maxClen the maximum of the column lengths between a & b
* @param vecStatusA if matrix A is a vector
* @param vecStatusB if matrix B is a vector
* @param c output matrix of size (maxRlen, maxClen) allocated on GPU
* @param op the operation to perform
* @throws DMLRuntimeException
*/
private static void matrixMatrixOp(String instName, Pointer a, Pointer b, int maxRlen, int maxClen, int vecStatusA, int vecStatusB, Pointer c, BinaryOperator op) throws DMLRuntimeException {
long t0=0;
if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
kernels.launchKernel("matrix_matrix_cellwise_op",
ExecutionConfig.getConfigForSimpleMatrixOperations(maxRlen, maxClen),
a, b, c, maxRlen, maxClen, vecStatusA, vecStatusB, getBinaryOp(op.fn));
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_MATRIX_MATRIX_CELLWISE_OP_KERNEL, System.nanoTime() - t0);
}
/**
* This enum declares the different vector shapes
* as they recognized in the invoked CUDA kernel(s).
*/
enum VectorShape {
COLUMN (1),
ROW (2),
NONE (0);
private final int code;
VectorShape(int code) {
this.code = code;
}
int code() { return code; }
}
/**
* Given the number of rows and columns, returns
* whether this is a row vector, column vector or neither.
* @param rows
* @param cols
* @return 1 for column vector, 2 for row vector, 0 for neither
*/
private static VectorShape getVectorStatus(long rows, long cols) {
if(cols == 1)
return VectorShape.COLUMN;
else if(rows == 1)
return VectorShape.ROW;
else
return VectorShape.NONE;
}
private static boolean isVector(MatrixObject in) {
return in.getNumRows() == 1 || in.getNumColumns() == 1;
}
private static boolean isSparseAndEmpty(MatrixObject in1) {
boolean isSparse1 = isInSparseFormat(in1);
boolean isEmpty1 = isSparse1 && (((JCudaObject)in1.getGPUObject()).jcudaSparseMatrixPtr.nnz == 0);
return isEmpty1;
}
private static void deviceCopy(ExecutionContext ec, String instName, MatrixObject src, String outputName, boolean isInputTransposed) throws DMLRuntimeException {
if(!isInputTransposed)
deviceCopy(ec, instName, src, outputName);
else
transpose(ec, instName, src, outputName);
}
/**
* Performs a deep device copy of a matrix on the GPU
*
* @param ec execution context
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param src source matrix
* @param outputName destination variable name
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
private static void deviceCopy(ExecutionContext ec, String instName, MatrixObject src, String outputName) throws DMLRuntimeException {
Pointer srcPtr = getDensePointer(src, instName); // TODO: FIXME: Implement sparse kernel
MatrixObject out = ec.getMatrixObject(outputName);
getDenseMatrixOutputForGPUInstruction(ec, instName, outputName); // Allocated the dense output matrix
Pointer destPtr = getDensePointer(out, instName);
deviceCopy(instName, srcPtr, destPtr, (int)src.getNumRows(), (int)src.getNumColumns());
}
private static void compareAndSet(ExecutionContext ec, String instName, MatrixObject in, String outputName, double compareVal, double tolerance,
double ifEqualsVal, double ifLessThanVal, double ifGreaterThanVal) throws DMLRuntimeException {
Pointer A = getDensePointer(in, instName); // TODO: FIXME: Implement sparse kernel
MatrixObject out = ec.getMatrixObject(outputName);
getDenseMatrixOutputForGPUInstruction(ec, instName, outputName); // Allocated the dense output matrix
Pointer ret = getDensePointer(out, instName);
int rlen = (int) out.getNumRows();
int clen = (int) out.getNumColumns();
// out.getMatrixCharacteristics().setNonZeros(rlen*clen);
// compareAndSet(double* A, double* ret, int rlen, int clen, double compareVal, double ifEqualsVal, double ifNotEqualsVal)
long t0=0;
if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
kernels.launchKernel("compare_and_set",
ExecutionConfig.getConfigForSimpleMatrixOperations(rlen, clen),
A, ret, rlen, clen, compareVal, tolerance, ifEqualsVal, ifLessThanVal, ifGreaterThanVal);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_COMPARE_AND_SET_KERNEL, System.nanoTime() - t0);
}
/**
* Fills an an array on the GPU with a given scalar value
* @param ec currently active instance of the {@link ExecutionContext}
* @param instName name of the invoking instruction to record{@link Statistics}.
* @param constant scalar value with which to fill the matrix
* @param outputName (internal) name of the matrix that is to be filled
* @throws DMLRuntimeException if error
*/
private static void setOutputToConstant(ExecutionContext ec, String instName, double constant, String outputName) throws DMLRuntimeException {
if(constant == 0) {
// TODO: Create sparse empty block instead
}
MatrixObject out = ec.getMatrixObject(outputName);
getDenseMatrixOutputForGPUInstruction(ec, instName, outputName); // Allocated the dense output matrix
Pointer A = getDensePointer(out, instName);
int rlen = (int) out.getNumRows();
int clen = (int) out.getNumColumns();
long t0=0;
if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
int size = rlen * clen;
kernels.launchKernel("fill", ExecutionConfig.getConfigForSimpleVectorOperations(size),
A, constant, size);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_FILL_KERNEL, System.nanoTime() - t0);
}
/**
* Performs a deep copy of input device double pointer corresponding to matrix
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param src source matrix
* @param dest destination matrix
* @param rlen number of rows
* @param clen number of columns
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
private static void deviceCopy(String instName, Pointer src, Pointer dest, int rlen, int clen) throws DMLRuntimeException {
long t0=0;
if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
int size = rlen * clen * Sizeof.DOUBLE;
cudaMemcpy(dest, src, size, cudaMemcpyDeviceToDevice);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DEVICE_TO_DEVICE, System.nanoTime() - t0);
}
/**
* Helper function to get numeric value for binary op.
* This number is passed down to the CUDA kernel
* and the appropriate binary operation is performed on the GPU.
* op = {0=plus, 1=minus, 2=multiply, 3=divide, 4=power,
* 5=less, 6=lessequal, 7=greater, 8=greaterequal, 9=equal, 10=notequal,
* 11=min, 12=max, 13=and, 14=or, 15=log}
*/
private static int getBinaryOp(ValueFunction fn) throws DMLRuntimeException {
if(fn instanceof Plus) return 0;
else if(fn instanceof Minus) return 1;
else if(fn instanceof Multiply) return 2;
else if(fn instanceof Divide) return 3;
else if(fn instanceof Power) return 4;
else if(fn instanceof LessThan) return 5;
else if(fn instanceof LessThanEquals) return 6;
else if(fn instanceof GreaterThan) return 7;
else if(fn instanceof GreaterThanEquals) return 8;
else if(fn instanceof Equals) return 9;
else if(fn instanceof NotEquals) return 10;
else if(fn instanceof And) return 13;
else if(fn instanceof Or) return 14;
else if(fn instanceof Multiply2) return 2;
else if(fn instanceof Power2) return 4;
throw new DMLRuntimeException("The given value function is not supported:" + fn.getClass().getName());
}
/**
* Performs sparse and dense dgeam given two input matrices
* C = alpha* op( A ) + beta* op ( B )
* where op = transpose or not (specified by isLeftTransposed and isRightTransposed).
*
* @param ec execution context
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param in1 left input matrix
* @param in2 right input matrix
* @param outputName output variable name
* @param isLeftTransposed true if left matrix is transposed
* @param isRightTransposed true if right matrix is transposed
* @param alpha alpha
* @param beta beta
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
private static void dgeam(ExecutionContext ec, String instName, MatrixObject in1, MatrixObject in2, String outputName,
boolean isLeftTransposed, boolean isRightTransposed, double alpha, double beta) throws DMLRuntimeException {
Pointer alphaPtr = pointerTo(alpha);
Pointer betaPtr = pointerTo(beta);
int transa = isLeftTransposed ? CUBLAS_OP_T : CUBLAS_OP_N;
int transb = isRightTransposed ? CUBLAS_OP_T : CUBLAS_OP_N;
int m = (int) in1.getNumRows();
int n = (int) in1.getNumColumns();
if(!isLeftTransposed && isRightTransposed) {
m = (int) in1.getNumColumns();
n = (int) in1.getNumRows();
}
int lda = isLeftTransposed ? n : m;
int ldb = isRightTransposed ? n : m;
int ldc = m;
MatrixObject out = ec.getMatrixObject(outputName);
boolean isSparse1 = isInSparseFormat(in1);
// boolean isEmpty1 = isSparse1 && (((JCudaObject)in1.getGPUObject()).jcudaSparseMatrixPtr.nnz == 0);
boolean isSparse2 = isInSparseFormat(in2);
// boolean isEmpty2 = isSparse2 && (((JCudaObject)in2.getGPUObject()).jcudaSparseMatrixPtr.nnz == 0);
long t0=0,t1=0;
// TODO: Implement sparse-dense matrix cublasDgeam kernel
if(isSparse1 || isSparse2) {
// Invoke cuSparse when either are in sparse format
// Perform sparse-sparse dgeam
if(!isInSparseFormat(in1)) {
if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
((JCudaObject)in1.getGPUObject()).denseToSparse();
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t0);
}
CSRPointer A = ((JCudaObject)in1.getGPUObject()).jcudaSparseMatrixPtr;
if(!isInSparseFormat(in2)) {
if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
((JCudaObject)in2.getGPUObject()).denseToSparse();
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_TO_SPARSE, System.nanoTime() - t0);
}
CSRPointer B = ((JCudaObject)in2.getGPUObject()).jcudaSparseMatrixPtr;
ec.allocateGPUMatrixObject(outputName);
if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
CSRPointer C = CSRPointer.allocateForDgeam(cusparseHandle, A, B, m, n);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_ALLOCATE_LIB, System.nanoTime() - t1);
((JCudaObject)out.getGPUObject()).setSparseMatrixCudaPointer(C);
long sizeOfC = CSRPointer.estimateSize(C.nnz, out.getNumRows());
out.getGPUObject().setDeviceModify(sizeOfC);
if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
JCusparse.cusparseDcsrgeam(cusparseHandle, m, n, alphaPtr, A.descr, (int)A.nnz, A.val, A.rowPtr, A.colInd, betaPtr,
B.descr, (int)B.nnz, B.val, B.rowPtr, B.colInd,
C.descr, C.val, C.rowPtr, C.colInd);
cudaDeviceSynchronize();
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_SPARSE_DGEAM_LIB, System.nanoTime() - t0);
}
else {
// Dense-Dense dgeam
Pointer A = getDensePointer(in1, instName);
Pointer B = getDensePointer(in2, instName);
getDenseMatrixOutputForGPUInstruction(ec, instName, outputName); // Allocated the dense output matrix
Pointer C = getDensePointer(out, instName);
if (GPUStatistics.DISPLAY_STATISTICS) t0 = System.nanoTime();
JCublas2.cublasDgeam(cublasHandle, transa, transb, m, n, alphaPtr, A, lda, betaPtr, B, ldb, C, ldc);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DENSE_DGEAM_LIB, System.nanoTime() - t0);
}
}
//********************************************************************/
//****** End of Matrix-Matrix & Matrix-Scalar Functions **************/
//********************************************************************/
//********************************************************************/
//************************ Re-org Functions **************************/
//********************************************************************/
/**
* Transposes the input matrix using cublasDgeam
*
* @param ec execution context
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param in input matrix
* @param outputName output matrix name
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
public static void transpose(ExecutionContext ec, String instName, MatrixObject in, String outputName) throws DMLRuntimeException {
// C = alpha* op( A ) + beta* op ( B )
// = 1.0 * A^T + 0.0 * A^T
dgeam(ec, instName, in, in, outputName, true, true, 1.0, 0.0);
}
//********************************************************************/
//******************* End of Re-org Functions ************************/
//********************************************************************/
//********************************************************************/
//************************ Builtin Functions *************************/
//********************************************************************/
/**
* Performs an "exp" operation on a matrix on the GPU
* @param ec execution context
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param in1 input matrix
* @param outputName output matrix name
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
public static void exp(ExecutionContext ec, String instName, MatrixObject in1, String outputName) throws DMLRuntimeException {
JCudaObject in = ((JCudaObject)in1.getGPUObject());
boolean isSparseAndEmpty = in.isSparseAndEmpty();
long t1=0;
if (isSparseAndEmpty) {
// e^0 = 1, create a dense block full of 1s
MatrixObject out = ec.getMatrixObject(outputName);
ec.allocateGPUMatrixObject(outputName);
((JCudaObject)(out.getGPUObject())).allocateAndFillDense(1);
} else {
// Dense
MatrixObject out = getDenseMatrixOutputForGPUInstruction(ec, instName, outputName);
Pointer output = getDensePointer(out, instName);
// If the input is in sparse format, convert it to dense.
// The output will always be dense, because for all x, exp(x) > 0
Pointer input = getDensePointer(in1, instName);
int size = (int)(in1.getNumColumns() * in1.getNumRows());
if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
kernels.launchKernel("matrix_exp", ExecutionConfig.getConfigForSimpleVectorOperations(size),
input, output, size);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_EXP_KERNEL, System.nanoTime() - t1);
}
}
/**
* Performs daxpy operation
*
* @param ec execution context
* @param instName the invoking instruction's name for record {@link Statistics}.
* @param in1 input matrix 1
* @param in2 input matrix 2
* @param outputName output matrix name
* @param constant pointer constant
* @throws DMLRuntimeException if DMLRuntimeException occurs
*/
public static void axpy(ExecutionContext ec, String instName, MatrixObject in1, MatrixObject in2,
String outputName, double constant) throws DMLRuntimeException {
Pointer A = getDensePointer(in1, instName);
Pointer B = getDensePointer(in2, instName);
MatrixObject out = ec.getMatrixObject(outputName);
getDenseMatrixOutputForGPUInstruction(ec, instName, outputName); // Allocated the dense output matrix
Pointer C = getDensePointer(out, instName);
long t1=0, t2=0;
if(in1.getNumRows() == in2.getNumRows() && in1.getNumColumns() == in2.getNumColumns()) {
// Matrix-Matrix daxpy
long n = in1.getNumRows()*in2.getNumColumns(); // Since A is always a matrix
Pointer alphaPtr = pointerTo(constant);
// C <- A + alpha*B
// becomes
// C <- A
// C <- alpha*B + C
if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
cudaMemcpy(C, A, n*((long)jcuda.Sizeof.DOUBLE), cudaMemcpyDeviceToDevice);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DEVICE_TO_DEVICE, System.nanoTime() - t1);
if (GPUStatistics.DISPLAY_STATISTICS) t2 = System.nanoTime();
JCublas2.cublasDaxpy(cublasHandle, (int) n, alphaPtr, B, 1, C, 1);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DAXPY_LIB, System.nanoTime() - t2);
}
else {
// Matrix-Vector daxpy
// Note: Vector-Matrix operation is not supported
// daxpy_matrix_vector(double* A, double* B, double alpha, double* ret, int rlenA, int clenA, int rlenB, int clenB)
if (GPUStatistics.DISPLAY_STATISTICS) t1 = System.nanoTime();
int rlenA = (int) in1.getNumRows(); int clenA = (int) in1.getNumColumns();
int rlenB = (int) in2.getNumRows(); int clenB = (int) in2.getNumColumns();
kernels.launchKernel("daxpy_matrix_vector", ExecutionConfig.getConfigForSimpleMatrixOperations(rlenA, clenA),
A, B, constant, C, rlenA, clenA, rlenB, clenB);
if (GPUStatistics.DISPLAY_STATISTICS) GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_DAXPY_MV_KERNEL, System.nanoTime() - t1);
}
}
//********************************************************************/
//***************** END OF Builtin Functions ************************/
//********************************************************************/
/**
* Convenience method for debugging matrices on the GPU.
* @param in Pointer to a double array (matrix) on the GPU
* @param rlen row length
* @param clen column length
*/
@SuppressWarnings("unused")
private static void debugPrintMatrix(Pointer in, int rlen, int clen){
double[] data = new double[rlen * clen];
cudaMemcpy(Pointer.to(data), in, rlen*clen*Sizeof.DOUBLE, cudaMemcpyDeviceToHost);
int k=0;
for (int i=0; i mb = ec.getDenseMatrixOutputForGPUInstruction(name);
if (mb.getValue())
if (GPUStatistics.DISPLAY_STATISTICS)
GPUStatistics.maintainCPMiscTimes(instName, GPUInstruction.MISC_TIMER_ALLOCATE_DENSE_OUTPUT, System.nanoTime() - t0);
return mb.getKey();
}
}