org.apache.sysml.runtime.matrix.data.LibMatrixDNN — from the Apache SystemML artifact "systemml" (Maven / Gradle / Ivy): Declarative Machine Learning.
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.sysml.runtime.matrix.data;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.sysml.api.DMLScript;
import org.apache.sysml.hops.OptimizerUtils;
import org.apache.sysml.runtime.DMLRuntimeException;
/**
* This class allows users to invoke deep learning related operations
* (such as conv2d, conv2d_backward_data, conv2d_backward_filter, maxpooling, maxpooling_backward, bias_add)
* using multiple threads.
*
* The methods accept the input matrices as MatrixBlock and the parameters using ConvolutionParameters.
*
* To run in single thread, please set ConvolutionParameters.numThreads to 1.
*/
public class LibMatrixDNN {
protected static final Log LOG = LogFactory.getLog(LibMatrixDNN.class.getName());
// ------------------------------------------------------------------------------------------------
// Useful flags for performance testing:
// When true (together with DMLScript.STATISTICS), per-operation counters and timers below are
// collected and reported via appendStatistics(); disabled by default to avoid runtime overhead.
private static boolean DISPLAY_STATISTICS = false;
// Master switch for the multi-threaded execution path taken in runConvTask().
private static final boolean ALLOW_MULTI_THREADED_OPS = true;
// ------------------------------------------------------------------------------------------------
// Identifies the operation a convolution worker task should perform (see runConvTask).
enum TaskType {
MaxPooling_Forward, MaxPooling_Backward,
// Alternate approaches that we tried (direct, non-looped im2col) had unsatisfactory performance and are not included.
LoopedIm2ColConv2d, LoopedIm2ColConv2dBwdFilter, LoopedIm2ColConv2dBwdData,
BiasAdd, ReluBackward
}
// ------------------------------------------------------------------------------------------------
// Sparse/dense invocation counters and nanosecond timers. They are updated only when
// DMLScript.STATISTICS && DISPLAY_STATISTICS, and are reported by appendStatistics()
// and cleared by resetStatistics().
private static AtomicLong conv2dSparseCount = new AtomicLong(0);
private static AtomicLong conv2dDenseCount = new AtomicLong(0);
private static AtomicLong conv2dBwdFilterSparseCount = new AtomicLong(0);
private static AtomicLong conv2dBwdFilterDenseCount = new AtomicLong(0);
private static AtomicLong conv2dBwdDataSparseCount = new AtomicLong(0);
private static AtomicLong conv2dBwdDataDenseCount = new AtomicLong(0);
private static AtomicLong im2colSparseCount = new AtomicLong(0);
private static AtomicLong im2colDenseCount = new AtomicLong(0);
private static AtomicLong maxPoolBwdSparseCount = new AtomicLong(0);
private static AtomicLong maxPoolBwdDenseCount = new AtomicLong(0);
// Timers (nanoseconds) for the looped-im2col conv2d variants:
private static AtomicLong loopedConvMatMultTime = new AtomicLong(0);
private static AtomicLong loopedConvIm2ColTime = new AtomicLong(0);
private static AtomicLong loopedConvBwdFilterMatMultTime = new AtomicLong(0);
private static AtomicLong loopedConvBwdFilterIm2ColTime = new AtomicLong(0);
private static AtomicLong loopedConvBwdDataMatMultTime = new AtomicLong(0);
private static AtomicLong loopedConvBwdDataCol2ImTime = new AtomicLong(0);
/**
 * Appends the collected deep-learning operation statistics (sparse/dense
 * invocation counts and im2col/matmult/col2im timers) to the given builder.
 * Emits output only when DMLScript.STATISTICS and DISPLAY_STATISTICS are both
 * enabled and at least one conv2d invocation has been counted.
 *
 * @param sb builder that receives the formatted statistics lines
 */
public static void appendStatistics(StringBuilder sb) {
if(DMLScript.STATISTICS && DISPLAY_STATISTICS && (conv2dDenseCount.get() != 0 || conv2dSparseCount.get() != 0)) {
sb.append("LibMatrixDNN dense count (conv/bwdF/bwdD/im2col/maxBwd):\t"
+ conv2dDenseCount.get() + "/"
+ conv2dBwdFilterDenseCount.get() + "/"
+ conv2dBwdDataDenseCount.get() + "/"
+ im2colDenseCount.get() + "/"
+ maxPoolBwdDenseCount.get() + ".\n");
sb.append("LibMatrixDNN sparse count (conv/bwdF/bwdD/im2col/maxBwd):\t"
+ conv2dSparseCount.get() + "/"
+ conv2dBwdFilterSparseCount.get() + "/"
+ conv2dBwdDataSparseCount.get() + "/"
+ im2colSparseCount.get() + "/"
+ maxPoolBwdSparseCount.get() + ".\n");
// Timers are nanoseconds; *1e-9 converts to seconds with millisecond precision.
if(loopedConvMatMultTime.get() != 0 || loopedConvIm2ColTime.get() != 0) {
sb.append("LibMatrixDNN conv(im2col/matmult), bwdF (im2col/matmult), bwdD (col2im/matmult) time:\t" +
String.format("%.3f", loopedConvIm2ColTime.get()*1e-9) + "/" +
String.format("%.3f", loopedConvMatMultTime.get()*1e-9) + "/" +
String.format("%.3f", loopedConvBwdFilterIm2ColTime.get()*1e-9) + "/" +
String.format("%.3f", loopedConvBwdFilterMatMultTime.get()*1e-9) + "/" +
String.format("%.3f", loopedConvBwdDataCol2ImTime.get()*1e-9) + "/" +
String.format("%.3f", loopedConvBwdDataMatMultTime.get()*1e-9) + " sec.\n");
}
}
}
/**
 * Resets all DNN operation counters and timers back to zero.
 */
public static void resetStatistics() {
	AtomicLong[] counters = {
		conv2dDenseCount, conv2dBwdFilterDenseCount, conv2dBwdDataDenseCount,
		im2colDenseCount, maxPoolBwdDenseCount,
		conv2dSparseCount, conv2dBwdFilterSparseCount, conv2dBwdDataSparseCount,
		im2colSparseCount, maxPoolBwdSparseCount,
		loopedConvIm2ColTime, loopedConvMatMultTime,
		loopedConvBwdFilterMatMultTime, loopedConvBwdFilterIm2ColTime,
		loopedConvBwdDataMatMultTime, loopedConvBwdDataCol2ImTime
	};
	for(AtomicLong counter : counters)
		counter.set(0);
}
// ------------------------------------------------------------------------------------------------
/**
 * This method computes the backpropagation errors for the previous layer of the convolution operation
 *
 * @param filter filter used in conv2d, of shape [K, C*R*S]
 * @param dout errors from next layer, of shape [N, K*P*Q]
 * @param outputBlock output errors
 * @param params convolution parameters
 * @throws DMLRuntimeException if the input dimensions are inconsistent with params or strides are non-positive
 */
public static void conv2dBackwardData(MatrixBlock filter, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
	params.input1 = filter;
	params.input2 = dout;
	params.output = outputBlock;
	if(filter.getNumRows() != params.K || filter.getNumColumns() != params.C*params.R*params.S ||
		dout.getNumRows() != params.N || dout.getNumColumns() != params.K*params.P*params.Q) {
		// Fixed: message previously said "conv2d_backward_filter" (copy-paste error)
		throw new DMLRuntimeException("Incorrect input to conv2d_backward_data");
	}
	if(params.stride_h <= 0 || params.stride_w <= 0) {
		throw new DMLRuntimeException("Only positive strides supported");
	}
	if(DMLScript.STATISTICS && DISPLAY_STATISTICS) {
		// Track whether this invocation hit the sparse or the dense code path
		if(filter.isInSparseFormat() || dout.isInSparseFormat()) {
			conv2dBwdDataSparseCount.addAndGet(1);
		}
		else {
			conv2dBwdDataDenseCount.addAndGet(1);
		}
	}
	runConvTask(TaskType.LoopedIm2ColConv2dBwdData, params);
}
/**
 * This method computes the backpropogation errors for filter of convolution operation
 *
 * @param input input image
 * @param dout errors from next layer
 * @param outputBlock output errors
 * @param params convolution parameters
 * @throws DMLRuntimeException if DMLRuntimeException occurs
 */
public static void conv2dBackwardFilter(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
	params.input1 = input;
	params.input2 = dout;
	params.output = outputBlock;
	// Validate: input is N x (C*H*W), dout is N x (K*P*Q)
	boolean validDims = input.getNumRows() == params.N
		&& input.getNumColumns() == params.C*params.H*params.W
		&& dout.getNumRows() == params.N
		&& dout.getNumColumns() == params.K*params.P*params.Q;
	if(!validDims)
		throw new DMLRuntimeException("Incorrect input to conv2d_backward_filter");
	if(params.stride_h <= 0 || params.stride_w <= 0)
		throw new DMLRuntimeException("Only positive strides supported");
	if(DMLScript.STATISTICS && DISPLAY_STATISTICS) {
		// Count sparse vs dense invocations for the statistics report
		boolean anySparse = input.isInSparseFormat() || dout.isInSparseFormat();
		AtomicLong counter = anySparse ? conv2dBwdFilterSparseCount : conv2dBwdFilterDenseCount;
		counter.addAndGet(1);
	}
	runConvTask(TaskType.LoopedIm2ColConv2dBwdFilter, params);
}
/**
 * Performs the operation: ret += elem
 *
 * @param ret left and output matrix (must be in dense format)
 * @param elem right matrix (dense or sparse)
 * @throws DMLRuntimeException if the dimensions mismatch or ret is sparse
 */
private static void elementWiseInPlaceAddition(MatrixBlock ret, MatrixBlock elem) throws DMLRuntimeException {
	if(ret.getNumRows() != elem.getNumRows() || ret.getNumColumns() != elem.getNumColumns()) {
		throw new DMLRuntimeException("Incorrect dimensions");
	}
	if(!ret.isInSparseFormat() && !elem.isInSparseFormat()) {
		// Dense += dense: single pass over the flat arrays (bound hoisted out of the loop)
		final int limit = ret.getNumRows()*ret.getNumColumns();
		for(int i = 0; i < limit; i++) {
			ret.denseBlock[i] += elem.denseBlock[i];
		}
	}
	else if(!ret.isInSparseFormat() && elem.isInSparseFormat()) {
		// Dense += sparse: scatter only the stored entries of elem
		if(!elem.isEmptyBlock()) {
			// Fixed: restored Iterator<IJV> (generic parameter stripped by HTML extraction;
			// the raw Iterator does not compile with the unchecked IJV assignment below)
			Iterator<IJV> iter = elem.sparseBlock.getIterator();
			int numCol = ret.getNumColumns();
			while(iter.hasNext()) {
				IJV ijv = iter.next();
				ret.denseBlock[ijv.getI()*numCol + ijv.getJ()] += ijv.getV();
			}
		}
	}
	else {
		throw new DMLRuntimeException("Sparse return format not supported");
	}
}
/**
 * Performs the operation: ret += t(elem)
 *
 * @param ret left and output matrix (must be dense), of shape [cols(elem), rows(elem)]
 * @param elem right untransposed matrix (dense or sparse)
 * @throws DMLRuntimeException if the dimensions mismatch or ret is sparse
 */
private static void elementWiseInPlaceTransposedAddition(MatrixBlock ret, MatrixBlock elem) throws DMLRuntimeException {
	if(ret.getNumRows() != elem.getNumColumns() || ret.getNumColumns() != elem.getNumRows()) {
		throw new DMLRuntimeException("Incorrect dimensions");
	}
	// Row stride of the row-major dense ret block (was misleadingly named "numRow")
	int retCols = ret.getNumColumns();
	if(!ret.isInSparseFormat() && !elem.isInSparseFormat()) {
		int iter = 0;
		for(int i = 0; i < elem.getNumRows(); i++) {
			for(int j = 0; j < elem.getNumColumns(); j++, iter++) {
				// elem(i, j) is accumulated into ret(j, i)
				ret.denseBlock[j*retCols + i] += elem.denseBlock[iter];
			}
		}
	}
	else if(!ret.isInSparseFormat() && elem.isInSparseFormat()) {
		if(!elem.isEmptyBlock()) {
			// Fixed: restored Iterator<IJV> (generic parameter stripped by HTML extraction)
			Iterator<IJV> iter = elem.sparseBlock.getIterator();
			while(iter.hasNext()) {
				IJV ijv = iter.next();
				ret.denseBlock[ijv.getJ()*retCols + ijv.getI()] += ijv.getV();
			}
		}
	}
	else {
		throw new DMLRuntimeException("Sparse return format not supported");
	}
}
/**
 * Computes the conv2d_backward_data contribution of a single image n:
 * rotates dout[n] into a [P*Q, K] block, multiplies it with the filter
 * ([K, C*R*S]) and scatters the [P*Q, C*R*S] product back into the output
 * via col2im.
 *
 * @param n image (row) index
 * @param dout_reshaped pre-allocated dense scratch block of shape [P*Q, K]
 * @param params convolution parameters (input1 = filter, input2 = dout)
 * @throws DMLRuntimeException if DMLRuntimeException occurs
 */
private static void doLoopedIm2ColConv2dBwdData(int n, MatrixBlock dout_reshaped, ConvolutionParameters params) throws DMLRuntimeException {
MatrixBlock filter = params.input1;
MatrixBlock dout = params.input2;
// Rotate/reshape row n of dout into the dense scratch block, zeroing stale data first
doRotate180(n, 0, dout, dout_reshaped.denseBlock, params, true);
dout_reshaped.recomputeNonZeros();
MatrixBlock temp = new MatrixBlock(params.P*params.Q, params.C*params.R*params.S, false);
long t1 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0;
LibMatrixMult.matrixMult(dout_reshaped, filter, temp, false);
long t2 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0 ;
// Scatter the product back into the image-gradient layout of the output
doCol2imOverSingleImage(n, temp, params);
long t3 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0 ;
if(DMLScript.STATISTICS && DISPLAY_STATISTICS) {
loopedConvBwdDataMatMultTime.addAndGet(t2-t1);
loopedConvBwdDataCol2ImTime.addAndGet(t3-t2);
}
}
/**
 * Computes the filter-gradient contribution of a single image n:
 * im2col(input[n]) ([C*R*S, P*Q]) %*% rotate180(dout[n]) ([P*Q, K]) yields a
 * [C*R*S, K] block that is transposed-accumulated into the per-worker
 * partialRetBlock ([K, C*R*S]).
 *
 * @param n image (row) index
 * @param im2ColOutBlock pre-allocated dense im2col scratch block
 * @param dout_reshaped pre-allocated dense rotate180 scratch block
 * @param partialRetBlock per-worker accumulator for the filter gradient
 * @param params convolution parameters (input2 = dout)
 * @return the (updated) partialRetBlock
 * @throws DMLRuntimeException if DMLRuntimeException occurs
 */
private static MatrixBlock doLoopedIm2ColConv2dBwdFilter(int n,
MatrixBlock im2ColOutBlock, MatrixBlock dout_reshaped, MatrixBlock partialRetBlock, ConvolutionParameters params) throws DMLRuntimeException {
long t1 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0;
doIm2col(n, im2ColOutBlock, params);
im2ColOutBlock.recomputeNonZeros();
long t2 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0 ;
// Rotate/reshape row n of dout into the dense scratch block, zeroing stale data first
doRotate180(n, 0, params.input2, dout_reshaped.denseBlock, params, true);
dout_reshaped.recomputeNonZeros();
MatrixBlock temp = new MatrixBlock(params.C*params.R*params.S, params.K, false);
long t3 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0 ;
LibMatrixMult.matrixMult(im2ColOutBlock, dout_reshaped, temp, false);
long t4 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0 ;
if(DMLScript.STATISTICS && DISPLAY_STATISTICS) {
loopedConvBwdFilterMatMultTime.addAndGet(t4-t3);
loopedConvBwdFilterIm2ColTime.addAndGet(t2-t1);
}
// Accumulate t(temp) into the partial result (skipped if the product is all zeros)
if(!temp.isEmptyBlock())
elementWiseInPlaceTransposedAddition(partialRetBlock, temp);
return partialRetBlock;
}
/**
 * Decomposes a flat column index j over a row-major [C, H, W] tensor into its
 * [channel, row, column] coordinates, stored into ret[0..2].
 *
 * @param j flat column index (non-negative)
 * @param ret output array of length >= 3
 * @param H tensor height
 * @param W tensor width
 * @throws DMLRuntimeException if DMLRuntimeException occurs
 */
private static void computeTensorIndexes(int j, int [] ret, int H, int W) throws DMLRuntimeException {
	final int HW = H*W;
	ret[0] = j / HW;
	ret[1] = (j % HW) / W;
	ret[2] = j % W;
}
/**
 * Performs a 2D convolution of the input images with the given filter using
 * the looped-im2col approach (executed via runConvTask).
 *
 * @param input input images, of shape [N, C*H*W]
 * @param filter filter, of shape [K, C*R*S]
 * @param outputBlock output, of shape [N, K*P*Q]
 * @param params convolution parameters
 * @throws DMLRuntimeException if DMLRuntimeException occurs
 */
public static void conv2d(MatrixBlock input, MatrixBlock filter, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
	params.input1 = input;
	params.input2 = filter;
	params.output = outputBlock;
	boolean dimsOk = input.getNumRows() == params.N
		&& input.getNumColumns() == params.C*params.H*params.W
		&& filter.getNumRows() == params.K
		&& filter.getNumColumns() == params.C*params.R*params.S;
	if(!dimsOk)
		throw new DMLRuntimeException("Incorrect input to conv2d");
	if(DMLScript.STATISTICS && DISPLAY_STATISTICS) {
		boolean anySparse = input.isInSparseFormat() || filter.isInSparseFormat();
		(anySparse ? conv2dSparseCount : conv2dDenseCount).addAndGet(1);
	}
	// Test hooks: force sparse representations when the corresponding flags are set
	if(TEST_SPARSE_INPUT && !input.isInSparseFormat())
		input.denseToSparse();
	if(TEST_SPARSE_FILTER && !filter.isInSparseFormat())
		filter.denseToSparse();
	runConvTask(TaskType.LoopedIm2ColConv2d, params);
}
/**
 * Computes conv2d for a single image n via the looped im2col approach:
 * im2col(input[n]) -> [C*R*S, P*Q], then filter ([K, C*R*S]) %*% im2col
 * -> [K, P*Q], which is copied into row n of the dense output.
 *
 * @param n image (row) index
 * @param im2ColOutBlock pre-allocated dense scratch block of shape [C*R*S, P*Q]
 * @param params convolution parameters (input2 = filter)
 * @throws DMLRuntimeException if DMLRuntimeException occurs
 */
private static void doLoopedIm2ColConv2d(int n, MatrixBlock im2ColOutBlock, ConvolutionParameters params) throws DMLRuntimeException {
	long t1 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0;
	doIm2col(n, im2ColOutBlock, params);
	im2ColOutBlock.recomputeNonZeros();
	long t2 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0;
	MatrixBlock matMultOutBlock = new MatrixBlock(params.K, params.P*params.Q, false);
	LibMatrixMult.matrixMult(params.input2, im2ColOutBlock, matMultOutBlock, false);
	long t3 = DMLScript.STATISTICS && DISPLAY_STATISTICS ? System.nanoTime() : 0;
	if(DMLScript.STATISTICS && DISPLAY_STATISTICS) {
		loopedConvIm2ColTime.addAndGet(t2 - t1);
		loopedConvMatMultTime.addAndGet(t3 - t2);
	}
	// -----------------------------------------------------------------------------
	// Copying is required as LibMatrixMult.matrixMult (and/or Java) is not pointer aware.
	// This is not required in Native implementation
	int destPos = n*params.K*params.P*params.Q;
	int length = params.K*params.P*params.Q;
	if(!matMultOutBlock.isEmptyBlock()) {
		if(matMultOutBlock.isInSparseFormat()) {
			// NOTE: Potential bottleneck to copy sparse matmult result back to dense output
			// Fixed: restored Iterator<IJV> (generic parameter stripped by HTML extraction)
			Iterator<IJV> iter = matMultOutBlock.sparseBlock.getIterator();
			final int outOffset = n*params.K*params.P*params.Q;
			while(iter.hasNext()) {
				IJV ijv = iter.next();
				int k = ijv.getI();
				int p = ijv.getJ() / params.Q;
				int q = ijv.getJ() % params.Q;
				params.output.denseBlock[outOffset + k*params.P*params.Q + p*params.Q + q] = ijv.getV();
			}
		}
		else
			System.arraycopy(matMultOutBlock.denseBlock, 0, params.output.denseBlock, destPos, length);
	}
	// -----------------------------------------------------------------------------
}
/**
 * This method computes the backpropagation errors for the previous layer of the maxpooling operation
 *
 * @param input input matrix, of shape [N, C*H*W]
 * @param dout dout matrix, of shape [N, C*P*Q]
 * @param outputBlock output matrix
 * @param params convolution parameters
 * @throws DMLRuntimeException if the dimensions are invalid or the output is sparse
 */
public static void maxpoolingBackward(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
	params.input1 = input;
	params.input2 = dout;
	params.output = outputBlock;
	if(input.getNumColumns() != params.C*params.H*params.W || input.getNumRows() != params.N) {
		// Fixed: report the expected number of columns actually checked (C*H*W, not K*P*Q)
		throw new DMLRuntimeException("Incorrect input dimensions in maxpooling_backward:" + input.getNumRows() + " " + input.getNumColumns() + " " + params.N + " " + params.C*params.H*params.W);
	}
	if(dout.getNumColumns() != params.C*params.P*params.Q || dout.getNumRows() != params.N) {
		// Fixed: report dout's dimensions (previously printed input's) and the expected C*P*Q
		throw new DMLRuntimeException("Incorrect dout dimensions in maxpooling_backward:" + dout.getNumRows() + " " + dout.getNumColumns() + " " + params.N + " " + params.C*params.P*params.Q);
	}
	if(DMLScript.STATISTICS && DISPLAY_STATISTICS) {
		if(input.isInSparseFormat() || dout.isInSparseFormat()) {
			maxPoolBwdSparseCount.addAndGet(1);
		}
		else {
			maxPoolBwdDenseCount.addAndGet(1);
		}
	}
	if (params.output.isInSparseFormat())
		throw new DMLRuntimeException("Sparse maxpooling_backward is not supported");
	// Precompute the [start, end) window bounds per output row/column
	fillIndexesArray(params);
	runConvTask(TaskType.MaxPooling_Backward, params);
}
/**
 * Precomputes, for every output position, the [start, end) bounds of the
 * pooling/filter window along height (per p) and width (per q), clamped to
 * the image borders.
 *
 * @param params convolution parameters; start/end index arrays are (re)allocated and filled
 */
private static void fillIndexesArray(ConvolutionParameters params) {
	params.start_indexes_h = new int[params.P];
	params.end_indexes_h = new int[params.P];
	params.start_indexes_w = new int[params.Q];
	params.end_indexes_w = new int[params.Q];
	for (int p = 0; p < params.P; p++) {
		int start_index_h = p * params.stride_h - params.pad_h;
		final int end_index_h = Math.min(start_index_h + params.R, params.H);
		params.start_indexes_h[p] = Math.max(start_index_h, 0);
		params.end_indexes_h[p] = end_index_h;
	}
	for (int q = 0; q < params.Q; q++) {
		// Fixed: the end index must be derived from the UNCLAMPED start (mirroring the
		// height loop above). Previously the start was clamped to 0 before computing the
		// end, which shifted/extended windows that overlap the left padding.
		int start_index_w = q * params.stride_w - params.pad_w;
		final int end_index_w = Math.min(start_index_w + params.S, params.W);
		params.start_indexes_w[q] = Math.max(start_index_w, 0);
		params.end_indexes_w[q] = end_index_w;
	}
}
/**
 * Dispatches maxpooling_backward for image n to the specialized implementation
 * matching the dense/sparse formats of the input and of dout.
 *
 * @param n image (row) index
 * @param params parameters carrying input1 = input, input2 = dout, output
 * @throws DMLRuntimeException if the output is in sparse format
 */
private static void doPoolingBackward(int n, ConvolutionParameters params) throws DMLRuntimeException {
	if (params.output.isInSparseFormat())
		throw new DMLRuntimeException("Only dense output supported for pooling_backward");
	double [] outputArray = params.output.getDenseBlock();
	final boolean inputDense = !params.input1.isInSparseFormat();
	final boolean doutDense = !params.input2.isInSparseFormat();
	if(inputDense && doutDense)
		doPoolingBackwardDenseDense(n, params.input1.getDenseBlock(), params.input2.getDenseBlock(), outputArray, params);
	else if(inputDense)
		doPoolingBackwardDenseSparse(n, params.input1.getDenseBlock(), params.input2, outputArray, params);
	else if(doutDense)
		doPoolingBackwardSparseDense(n, params.input2.getDenseBlock(), outputArray, params);
	else
		doPoolingBackwardSparseSparse(n, outputArray, params);
}
/**
 * maxpooling_backward for image n with sparse input and dense dout: every
 * non-zero gradient is routed to the position of the maximum input value
 * inside its pooling window.
 *
 * @param n image (row) index
 * @param doutArray dense dout data
 * @param outputArray dense output data (accumulated into)
 * @param params convolution parameters
 * @throws DMLRuntimeException if the input is not sparse
 */
private static void doPoolingBackwardSparseDense(int n, double [] doutArray, double [] outputArray, ConvolutionParameters params) throws DMLRuntimeException {
	if (!params.input1.isInSparseFormat())
		throw new DMLRuntimeException("Incorrect usage: Call optimized versions");
	final int PQ = params.P*params.Q;
	final int HW = params.H*params.W;
	for (int c = 0; c < params.C; c++) {
		final int doutOffset = n*params.C*PQ + c*PQ;
		final int inputOffset = n*params.C*HW + c*HW;
		for (int p = 0; p < params.P; p++) {
			for (int q = 0; q < params.Q; q++) {
				double gradVal = doutArray[doutOffset + p*params.Q + q];
				if(gradVal != 0) {
					int maxIndex = getMaxIndexSparse(p, q, inputOffset, n, c, params.input1, params);
					outputArray[maxIndex] += gradVal;
				}
			}
		}
	}
}
/**
 * maxpooling_backward for image n when both the input and dout are sparse:
 * every stored gradient in dout row n is routed to the position of the
 * maximum input value inside the corresponding pooling window.
 * params.input2.isEmptyBlock() check is done by the caller.
 *
 * @param n image (row) index
 * @param outputArray dense output data (accumulated into)
 * @param params convolution parameters
 * @throws DMLRuntimeException if the input is not sparse
 */
private static void doPoolingBackwardSparseSparse(int n, double [] outputArray, ConvolutionParameters params) throws DMLRuntimeException {
	if (!params.input1.isInSparseFormat())
		throw new DMLRuntimeException("Incorrect usage: Call optimized versions");
	// Fixed: restored Iterator<IJV> (generic parameter stripped by HTML extraction)
	Iterator<IJV> iter = params.input2.sparseBlock.getIterator(n, n+1);
	int [] tensorIndexes = new int[3];
	while(iter.hasNext()) {
		IJV ijv = iter.next();
		// Map the flat dout column index to (channel c, output row p, output col q)
		computeTensorIndexes(ijv.getJ(), tensorIndexes, params.P, params.Q);
		int c = tensorIndexes[0];
		int p = tensorIndexes[1];
		int q = tensorIndexes[2];
		final int inputOffset = n*params.C*params.H*params.W + c*params.H*params.W;
		int maxIndex = getMaxIndexSparse(p, q, inputOffset, n, c, params.input1, params);
		outputArray[maxIndex] += ijv.getV();
	}
}
/**
 * maxpooling_backward for image n with dense input and sparse dout: every
 * stored gradient in dout row n is routed to the position of the maximum
 * input value inside the corresponding pooling window.
 * dout.isEmptyBlock() check is done by the caller.
 *
 * @param n image (row) index
 * @param inputArray dense input data
 * @param dout sparse dout matrix
 * @param outputArray dense output data (accumulated into)
 * @param params convolution parameters
 * @throws DMLRuntimeException if DMLRuntimeException occurs
 */
private static void doPoolingBackwardDenseSparse(int n, double [] inputArray,
		MatrixBlock dout, double [] outputArray, ConvolutionParameters params) throws DMLRuntimeException {
	// Fixed: restored Iterator<IJV> (generic parameter stripped by HTML extraction)
	Iterator<IJV> iter = dout.sparseBlock.getIterator(n, n+1);
	int [] tensorIndexes = new int[3];
	while(iter.hasNext()) {
		IJV ijv = iter.next();
		// Map the flat dout column index to (channel c, output row p, output col q)
		computeTensorIndexes(ijv.getJ(), tensorIndexes, params.P, params.Q);
		int c = tensorIndexes[0];
		int p = tensorIndexes[1];
		int q = tensorIndexes[2];
		final int inputOffset = n*params.C*params.H*params.W + c*params.H*params.W;
		int maxIndex = getMaxIndex(p, q, inputOffset, inputArray, params);
		outputArray[maxIndex] += ijv.getV();
	}
}
/**
 * maxpooling_backward for image n when both the input and dout are dense:
 * each gradient cell is accumulated at the position of the maximum input
 * value inside its pooling window.
 *
 * @param n image (row) index
 * @param inputArray dense input data
 * @param doutArray dense dout data
 * @param outputArray dense output data (accumulated into)
 * @param params convolution parameters
 */
private static void doPoolingBackwardDenseDense(int n, double [] inputArray, double [] doutArray,
		double [] outputArray, ConvolutionParameters params) {
	final int HW = params.H*params.W;
	final int PQ = params.P*params.Q;
	for (int c = 0; c < params.C; c++) {
		final int inOff = n*params.C*HW + c*HW;
		// doutIndex walks row-major over the (p, q) grid of channel c
		int doutIndex = n*params.C*PQ + c*PQ;
		for (int p = 0; p < params.P; p++) {
			for (int q = 0; q < params.Q; q++, doutIndex++) {
				outputArray[getMaxIndex(p, q, inOff, inputArray, params)] += doutArray[doutIndex];
			}
		}
	}
}
/**
 * Finds the flat index (into the dense input layout) of the maximum value in
 * the pooling window (p, q) for channel c of sparse image n.
 * NOTE(review): only stored (non-zero) cells are examined, so implicit zeros
 * inside the window are never selected — if all stored values in the window
 * are negative this can disagree with the dense getMaxIndex(), which does
 * consider zero cells. Confirm this asymmetry is intended.
 *
 * @param p output row index of the pooling window
 * @param q output column index of the pooling window
 * @param inputOffset flat offset of (n, c, 0, 0) in the dense layout
 * @param n image (row) index
 * @param c channel index
 * @param input sparse input matrix
 * @param params convolution parameters (precomputed window bounds)
 * @return flat index of the window maximum
 * @throws DMLRuntimeException if input is not in sparse format
 */
private static int getMaxIndexSparse(int p, int q, int inputOffset, int n, int c, MatrixBlock input, ConvolutionParameters params) throws DMLRuntimeException {
	if(!input.isInSparseFormat())
		throw new DMLRuntimeException("Incorrect usage: Only sparse format supported");
	// input.isEmptyBlock() check is done by the caller
	// Fixed: restored Iterator<IJV> (generic parameter stripped by HTML extraction)
	Iterator<IJV> iter = input.sparseBlock.getIterator(n, n+1);
	int [] tensorIndexes = new int[3];
	int start_index_h = params.start_indexes_h[p];
	int end_index_h = params.end_indexes_h[p];
	int start_index_w = params.start_indexes_w[q];
	int end_index_w = params.end_indexes_w[q];
	// Default to the window's top-left corner if no stored value falls inside
	int maxIndex = inputOffset + start_index_h*params.W + start_index_w;
	double maxVal = -Double.MAX_VALUE;
	while(iter.hasNext()) {
		IJV ijv = iter.next();
		computeTensorIndexes(ijv.getJ(), tensorIndexes, params.H, params.W);
		if(c != tensorIndexes[0])
			continue; // entry belongs to a different channel
		int h = tensorIndexes[1];
		int w = tensorIndexes[2];
		if(h >= start_index_h && h < end_index_h && w >= start_index_w && w < end_index_w) {
			double val = ijv.getV();
			if(maxVal < val) {
				maxIndex = inputOffset + h*params.W + w;
				maxVal = val;
			}
		}
	}
	return maxIndex;
}
/**
 * Finds the flat index of the maximum value in the pooling window (p, q) of a
 * dense input image, using the precomputed window bounds in params.
 *
 * @param p output row index of the pooling window
 * @param q output column index of the pooling window
 * @param inputOffset flat offset of (n, c, 0, 0) in the dense layout
 * @param inputArray dense input data
 * @param params convolution parameters (precomputed window bounds)
 * @return flat index of the window maximum
 */
private static int getMaxIndex(int p, int q, int inputOffset, double [] inputArray, ConvolutionParameters params) {
	final int hStart = params.start_indexes_h[p];
	final int hEnd = params.end_indexes_h[p];
	final int wStart = params.start_indexes_w[q];
	final int wEnd = params.end_indexes_w[q];
	// Default to the window's top-left corner
	int bestIndex = inputOffset + hStart*params.W + wStart;
	double bestVal = -Double.MAX_VALUE;
	for (int h = hStart; h < hEnd; h++) {
		final int rowBase = inputOffset + h*params.W;
		for (int w = wStart; w < wEnd; w++) {
			final double val = inputArray[rowBase + w];
			if(bestVal < val) {
				bestIndex = rowBase + w;
				bestVal = val;
			}
		}
	}
	return bestIndex;
}
/**
 * This method computes the backpropagation errors for previous layer of relu operation
 *
 * @param input input matrix
 * @param dout errors from next layer
 * @param outputBlock output matrix
 * @param numThreads number of threads
 * @throws DMLRuntimeException if DMLRuntimeException occurs
 */
public static void reluBackward(MatrixBlock input, MatrixBlock dout, MatrixBlock outputBlock, int numThreads) throws DMLRuntimeException {
	final int N = input.getNumRows();
	// Only N and numThreads matter for this task; remaining dims are unused (-1)
	ConvolutionParameters params = new ConvolutionParameters(N, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, numThreads);
	params.input1 = input;
	params.input2 = dout;
	params.output = outputBlock;
	boolean sameDims = input.getNumRows() == dout.getNumRows()
		&& input.getNumColumns() == dout.getNumColumns();
	if(!sameDims) {
		throw new DMLRuntimeException("Incorrect dimensions for relu_backward:" +
			input.getNumRows() + " != " + dout.getNumRows() + " || " + input.getNumColumns() + " != " + dout.getNumColumns());
	}
	runConvTask(TaskType.ReluBackward, params);
}
/**
 * Computes relu_backward for row (image) n: output = (X > 0) * dout.
 * The output must be dense; X (input1) and dout (input2) may each be dense or sparse.
 *
 * @param n row index to process
 * @param params parameters carrying input1 = X, input2 = dout, output
 * @throws DMLRuntimeException if DMLRuntimeException occurs
 */
private static void doReluBackward(int n, ConvolutionParameters params) throws DMLRuntimeException {
	// (X > 0) * dout
	double [] outputArray = params.output.getDenseBlock();
	int numOutCols = params.input1.getNumColumns();
	if(!params.input1.isInSparseFormat() && !params.input2.isInSparseFormat()) {
		// Fast path: both dense, fused into a single pass over row n
		double [] inputArr = params.input1.getDenseBlock();
		double [] doutArr = params.input2.getDenseBlock();
		for(int i = n*numOutCols; i < (n+1)*numOutCols; i++) {
			outputArray[i] = inputArr[i] > 0 ? doutArr[i] : 0;
		}
	}
	else {
		// Perform (X > 0): materialize the 0/1 mask into the output row
		if(params.input1.isInSparseFormat()) {
			// Fixed: restored Iterator<IJV> (generic parameter stripped by HTML extraction)
			Iterator<IJV> iter = params.input1.sparseBlock.getIterator(n, n+1);
			while(iter.hasNext()) {
				IJV ijv = iter.next();
				int i = ijv.getI();
				int j = ijv.getJ();
				outputArray[i*numOutCols + j] = ijv.getV() > 0 ? 1 : 0;
			}
		}
		else {
			double [] inputArr = params.input1.getDenseBlock();
			for(int i = n*numOutCols; i < (n+1)*numOutCols; i++) {
				outputArray[i] = inputArr[i] > 0 ? 1 : 0;
			}
		}
		// Then perform (X > 0) * dout: scale the mask by the incoming gradient.
		// NOTE(review): when dout is sparse, mask cells whose dout entry is an
		// implicit zero are never multiplied and retain their mask value of 1 —
		// confirm this is compensated elsewhere or intended.
		if(params.input2.isInSparseFormat()) {
			Iterator<IJV> iter = params.input2.sparseBlock.getIterator(n, n+1);
			while(iter.hasNext()) {
				IJV ijv = iter.next();
				int i = ijv.getI();
				int j = ijv.getJ();
				outputArray[i*numOutCols + j] *= ijv.getV();
			}
		}
		else {
			double [] doutArr = params.input2.getDenseBlock();
			for(int i = n*numOutCols; i < (n+1)*numOutCols; i++) {
				outputArray[i] *= doutArr[i];
			}
		}
	}
}
/**
 * Performs the operation corresponding to the DML script:
 * ones = matrix(1, rows=1, cols=Hout*Wout)
 * output = input + matrix(bias %*% ones, rows=1, cols=F*Hout*Wout)
 * This operation is often followed by conv2d and hence we have introduced bias_add(input, bias) built-in function
 *
 * @param input input matrix
 * @param bias bias matrix
 * @param outputBlock output matrix
 * @param numThreads number of threads
 * @throws DMLRuntimeException if DMLRuntimeException occurs
 */
public static void biasAdd(MatrixBlock input, MatrixBlock bias, MatrixBlock outputBlock, int numThreads) throws DMLRuntimeException {
	final int N = input.getNumRows();
	final int K = bias.getNumRows();
	final int PQ = input.getNumColumns() / K;
	// PQ is carried through the C field; K through the K field
	ConvolutionParameters params = new ConvolutionParameters(N, PQ, -1, -1, K, -1, -1, -1, -1, -1, -1, numThreads);
	params.input1 = input;
	params.input2 = bias;
	params.output = outputBlock;
	// Test hooks: force sparse representations when the corresponding flags are set
	if(TEST_SPARSE_INPUT && !input.isInSparseFormat())
		input.denseToSparse();
	if(TEST_SPARSE_FILTER && !bias.isInSparseFormat())
		bias.denseToSparse();
	if(bias.getNumColumns() != 1 || input.getNumColumns() % K != 0) {
		throw new DMLRuntimeException("Incorrect inputs for bias_add: input[" + N + " X " + input.getNumColumns() + "] and bias[" + K + " X " + bias.getNumColumns() + "]");
	}
	if(input.isEmptyBlock()) {
		// Empty input: the result is just the broadcast bias pattern
		double [] outputArray = outputBlock.getDenseBlock();
		for(int n = 0; n < N; n++)
			fillBias(bias, outputArray, n, n+1, N, K, PQ);
	}
	else {
		runConvTask(TaskType.BiasAdd, params);
	}
}
/**
 * Computes output = input + bias (broadcast per channel) for rows [n1, n2).
 * Each row consists of K channels of PQ cells; bias is a K x 1 vector.
 * Note: biasAdd() passes PQ through the C field of ConvolutionParameters.
 *
 * @param n1 first row (inclusive)
 * @param n2 last row (exclusive)
 * @param params parameters carrying input1 = input, input2 = bias, output
 * @throws DMLRuntimeException if DMLRuntimeException occurs
 */
private static void doBiasAdd(int n1, int n2, ConvolutionParameters params) throws DMLRuntimeException {
	double [] outputArray = params.output.getDenseBlock();
	int PQ = params.C; // PQ is stashed in params.C by biasAdd()
	int numOutCols = params.input1.getNumColumns();
	if(!params.input1.isInSparseFormat() && !params.input2.isInSparseFormat()) {
		// Fast path: both dense — single fused pass over rows [n1, n2)
		double [] inputArr = params.input1.getDenseBlock();
		double [] biasArr = params.input2.getDenseBlock();
		int K = params.K;
		int index = n1*K*PQ;
		for(int n = n1; n < n2; n++) {
			for(int k = 0; k < K; k++) {
				for(int pq = 0; pq < PQ; pq++, index++) {
					outputArray[index] = inputArr[index] + biasArr[k];
				}
			}
		}
	}
	else {
		// General path: materialize the bias pattern first, then add the input on top
		fillBias(params.input2, outputArray, n1, n2, params.N, params.K, PQ);
		if(params.input1.isInSparseFormat()) {
			// Fixed: restored Iterator<IJV> (generic parameter stripped by HTML extraction)
			Iterator<IJV> iter = params.input1.sparseBlock.getIterator(n1, n2);
			while(iter.hasNext()) {
				IJV ijv = iter.next();
				int i = ijv.getI();
				int j = ijv.getJ();
				outputArray[i*numOutCols + j] += ijv.getV();
			}
		}
		else {
			double [] inputArr = params.input1.getDenseBlock();
			for(int i = n1*numOutCols; i < n2*numOutCols; i++) {
				outputArray[i] += inputArr[i];
			}
		}
	}
}
/**
 * Fills rows [n1, n2) of the dense output with the broadcast bias pattern:
 * for each image n and channel k, cells [n*K*PQ + k*PQ, +PQ) are set to bias[k].
 * For a sparse bias only the stored (non-zero) entries are written; the
 * remaining cells are left untouched.
 *
 * @param bias K x 1 bias vector (dense or sparse)
 * @param outputArray dense destination array
 * @param n1 first row (inclusive)
 * @param n2 last row (exclusive)
 * @param N total number of rows (unused here, kept for the caller's context)
 * @param K number of channels
 * @param PQ cells per channel
 */
private static void fillBias(MatrixBlock bias, double [] outputArray, int n1, int n2, int N, int K, int PQ) {
	if(bias.isInSparseFormat()) {
		// Fixed: restored Iterator<IJV> (generic parameter stripped by HTML extraction)
		Iterator<IJV> iter = bias.sparseBlock.getIterator();
		while(iter.hasNext()) {
			IJV ijv = iter.next();
			int k = ijv.getI();
			double val = ijv.getV();
			for(int n = n1; n < n2; n++) {
				int fromIndex = n*K*PQ + k*PQ;
				Arrays.fill(outputArray, fromIndex, fromIndex + PQ, val);
			}
		}
	}
	else {
		double [] biasArr = bias.getDenseBlock();
		for(int n = n1; n < n2; n++) {
			for(int k = 0; k < K; k++) {
				int fromIndex = n*K*PQ + k*PQ;
				double val = biasArr[k];
				Arrays.fill(outputArray, fromIndex, fromIndex + PQ, val);
			}
		}
	}
}
/**
 * Performs the maxpooling forward operation on the input images.
 *
 * @param input input matrix, of shape [N, C*H*W]
 * @param outputBlock output matrix
 * @param params convolution parameters
 * @throws DMLRuntimeException if the input dimensions are invalid
 */
public static void maxpooling(MatrixBlock input, MatrixBlock outputBlock, ConvolutionParameters params) throws DMLRuntimeException {
	params.input1 = input;
	params.output = outputBlock;
	if(input.getNumColumns() != params.C*params.H*params.W || input.getNumRows() != params.N) {
		// Fixed: report the expected number of columns actually checked (C*H*W, not K*P*Q)
		throw new DMLRuntimeException("Incorrect input dimensions in maxpooling:" + input.getNumRows() + " " + input.getNumColumns() + " " + params.N + " " + params.C*params.H*params.W);
	}
	// Precompute the [start, end) window bounds per output row/column
	fillIndexesArray(params);
	runConvTask(TaskType.MaxPooling_Forward, params);
}
/**
 * Computes the max-pooling forward pass for a single image n, writing the
 * window maxima into the dense output block.
 * NOTE(review): each output cell is combined via Math.max with its existing
 * value, so the output is assumed to be pre-initialized appropriately by the
 * caller (an all-negative window would otherwise be clamped at that initial
 * value) — confirm the initialization contract.
 *
 * @param n image (row) index
 * @param params parameters carrying input1 = images, output = pooled result
 * @throws DMLRuntimeException if the output is in sparse format
 */
private static void doPooling(int n, ConvolutionParameters params) throws DMLRuntimeException {
double [] inputArray = null;
if (!params.input1.isInSparseFormat())
inputArray = params.input1.getDenseBlock();
double [] outputArray = null;
if (!params.output.isInSparseFormat())
outputArray = params.output.getDenseBlock();
else
throw new DMLRuntimeException("Expected the output to be allocated in dense format");
final int inOffset = n*params.C*params.H*params.W;
int out_index = n*params.C*params.P*params.Q;
final int HW = params.H*params.W;
if(inputArray != null) {
// Dense input: direct array accesses over each precomputed pooling window
for (int c = 0; c < params.C; c++) {
final int inOffset1 = inOffset + c*HW;
for (int p = 0; p < params.P; p++) {
for (int q = 0; q < params.Q; q++, out_index++) {
for (int h = params.start_indexes_h[p]; h < params.end_indexes_h[p]; h++) {
for (int w = params.start_indexes_w[q]; w < params.end_indexes_w[q]; w++) {
outputArray[out_index] = Math.max(outputArray[out_index], inputArray[inOffset1 + h*params.W + w]);
}
}
}
}
}
}
else {
// TODO: Optimize sparse maxpooling
// Low priority after adding fused relu_maxpooling operator as output of conv2d expected to be dense
for (int c = 0; c < params.C; c++) {
for (int p = 0; p < params.P; p++) {
for (int q = 0; q < params.Q; q++, out_index++) {
for (int h = params.start_indexes_h[p]; h < params.end_indexes_h[p]; h++) {
for (int w = params.start_indexes_w[q]; w < params.end_indexes_w[q]; w++) {
double inVal = params.input1.quickGetValue(n, c*HW + h*params.W + w);
outputArray[out_index] = Math.max(outputArray[out_index], inVal);
}
}
}
}
}
}
}
/**
 * Writes rotate180(input[inputN]) into outputArray at image offset outputN:
 * the input row, laid out as [K, P, Q], is transposed into [P*Q, K] layout,
 * i.e. out[p*Q*K + q*K + k] = in[k*P*Q + p*Q + q].
 *
 * @param inputN input image (row) index
 * @param outputN output image slot (scaled by K*P*Q)
 * @param input input matrix, dense or sparse
 * @param outputArray dense destination array (required)
 * @param params convolution parameters (K, P, Q)
 * @param zeroOutSparseOutput if true, zero the destination before scattering a sparse input
 * @throws DMLRuntimeException if outputArray is null
 */
private static void doRotate180(int inputN, int outputN, MatrixBlock input,
		double [] outputArray, ConvolutionParameters params, boolean zeroOutSparseOutput) throws DMLRuntimeException {
	double [] inputArray = null;
	if (!input.isInSparseFormat())
		inputArray = input.getDenseBlock();
	if(outputArray == null)
		throw new DMLRuntimeException("Sparse output is not supported for rotate180");
	int outputOffset = outputN*params.K*params.P*params.Q;
	if(inputArray != null) {
		// Dense input: every destination cell is overwritten
		for (int k = 0; k < params.K; k++) {
			for (int p = 0; p < params.P; p++) {
				for (int q = 0; q < params.Q; q++) {
					outputArray[outputOffset + p*params.Q*params.K + q*params.K + k] = inputArray[inputN*params.K*params.P*params.Q + k*params.P*params.Q + p*params.Q + q];
				}
			}
		}
	}
	else {
		// Sparse input: only stored entries are scattered, so stale data must be
		// cleared first when requested by the caller
		if(zeroOutSparseOutput)
			Arrays.fill(outputArray, 0);
		if(!input.isEmptyBlock()) {
			// Fixed: restored Iterator<IJV> (generic parameter stripped by HTML extraction)
			Iterator<IJV> iter = input.sparseBlock.getIterator(inputN, inputN+1);
			int [] tensorIndexes = new int[3];
			while(iter.hasNext()) {
				IJV ijv = iter.next();
				computeTensorIndexes(ijv.getJ(), tensorIndexes, params.P, params.Q);
				int k = tensorIndexes[0];
				int p = tensorIndexes[1];
				int q = tensorIndexes[2];
				outputArray[outputOffset + p*params.Q*params.K + q*params.K + k] = ijv.getV();
			}
		}
	}
}
// ----------------------------------------------------------------------------------------------------------------
/**
 * Pre-allocates the per-worker scratch MatrixBlocks required by the given task
 * type and adds poolSize of each kind to the corresponding queue:
 * im2col buffers ([C*R*S, P*Q]) for conv2d and its filter gradient, rotated
 * dout buffers ([P*Q, K]) for both backward tasks, and per-worker partial
 * result blocks ([K, C*R*S]) for the filter gradient.
 *
 * @param poolSize number of worker threads to provision for
 * @param type the convolution task type being executed
 * @param params convolution parameters (block dimensions)
 * @param im2ColOutBlocks queue receiving the im2col scratch blocks
 * @param doutReshapedBlocks queue receiving the rotate180 scratch blocks
 * @param partialRetBlocks queue receiving the partial-result accumulators
 */
private static void addMatrixBlocks(int poolSize, TaskType type, ConvolutionParameters params,
		ConcurrentLinkedQueue<MatrixBlock> im2ColOutBlocks, ConcurrentLinkedQueue<MatrixBlock> doutReshapedBlocks,
		ConcurrentLinkedQueue<MatrixBlock> partialRetBlocks) {
	// Fixed: restored ConcurrentLinkedQueue<MatrixBlock> parameter types
	// (generic parameters stripped by HTML extraction)
	for(int i = 0; i < poolSize; i++) {
		if(type == TaskType.LoopedIm2ColConv2d || type == TaskType.LoopedIm2ColConv2dBwdFilter) {
			MatrixBlock im2ColOutBlock = new MatrixBlock(params.C*params.R*params.S, params.P*params.Q, false);
			im2ColOutBlock.allocateDenseBlock(true);
			im2ColOutBlocks.add(im2ColOutBlock);
		}
		if(type == TaskType.LoopedIm2ColConv2dBwdFilter) {
			MatrixBlock partialRetBlock = new MatrixBlock(params.K, params.C*params.R*params.S, false);
			partialRetBlock.allocateDenseBlock(true);
			partialRetBlocks.add(partialRetBlock);
		}
		if(type == TaskType.LoopedIm2ColConv2dBwdData || type == TaskType.LoopedIm2ColConv2dBwdFilter) {
			MatrixBlock doutReshapedBlock = new MatrixBlock(params.P*params.Q, params.K, false);
			doutReshapedBlock.allocateDenseBlock(true);
			doutReshapedBlocks.add(doutReshapedBlock);
		}
	}
}
// Methods to execute convolution-related tasks using multiple threads.
private static void runConvTask(TaskType type, ConvolutionParameters params) throws DMLRuntimeException {
int constrainedNumThreads = OptimizerUtils.getConstrainedNumThreads(params.numThreads);
ConcurrentLinkedQueue im2ColOutBlocks = new ConcurrentLinkedQueue();
ConcurrentLinkedQueue doutReshapedBlocks = new ConcurrentLinkedQueue();
ConcurrentLinkedQueue partialRetBlocks = new ConcurrentLinkedQueue();
if (ALLOW_MULTI_THREADED_OPS && params.isOutputThreadSafe() && constrainedNumThreads > 1) {
int poolSize = Math.min(constrainedNumThreads, params.N);
addMatrixBlocks(poolSize, type, params, im2ColOutBlocks, doutReshapedBlocks, partialRetBlocks);
ArrayList tasks = new ArrayList();
int NSize = params.N - poolSize;
if(NSize >= constrainedNumThreads) {
for(int n = 0; n < params.N; n++)
tasks.add(new ConvTask(n, n+1, type, params, im2ColOutBlocks, doutReshapedBlocks, partialRetBlocks));
}
else {
int numNTasks = (int) Math.ceil(((double) NSize) / constrainedNumThreads);
for (int n = 0; n < NSize; n += numNTasks) {
tasks.add(new ConvTask(n, Math.min(NSize, n+numNTasks), type, params, im2ColOutBlocks, doutReshapedBlocks, partialRetBlocks));
}
for (int n = NSize; n < params.N; n++)
tasks.add(new ConvTask(n, n+1, type, params, im2ColOutBlocks, doutReshapedBlocks, partialRetBlocks));
}
ExecutorService pool = Executors.newFixedThreadPool( poolSize );
List> taskret;
try {
taskret = pool.invokeAll(tasks);
pool.shutdown();
for( Future