#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
/*
* Gradient checks for various architectures.
*/
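#
# Each check below perturbs one parameter entry at a time and compares the
# analytical gradient against the centered finite-difference estimate
#   d_num = (loss(theta+h) - loss(theta-h)) / (2*h),  typically with h = 1e-5,
# where the relative error between the two is computed and flagged by
# test_util::check_rel_grad_error (see nn/test/util.dml for the thresholds).
#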
source("nn/layers/affine.dml") as affine
source("nn/layers/low_rank_affine.dml") as low_rank_affine
source("nn/layers/batch_norm1d.dml") as batch_norm1d
source("nn/layers/batch_norm2d.dml") as batch_norm2d
source("nn/layers/conv2d.dml") as conv2d
source("nn/layers/conv2d_builtin.dml") as conv2d_builtin
source("nn/layers/conv2d_depthwise.dml") as conv2d_depthwise
source("nn/layers/conv2d_transpose.dml") as conv2d_transpose
source("nn/layers/conv2d_transpose_depthwise.dml") as conv2d_transpose_depthwise
source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
source("nn/layers/cross_entropy_loss2d.dml") as cross_entropy_loss2d
source("nn/layers/dropout.dml") as dropout
source("nn/layers/fm.dml") as fm
source("nn/layers/l1_loss.dml") as l1_loss
source("nn/layers/l1_reg.dml") as l1_reg
source("nn/layers/l2_loss.dml") as l2_loss
source("nn/layers/l2_reg.dml") as l2_reg
source("nn/layers/log_loss.dml") as log_loss
source("nn/layers/lstm.dml") as lstm
source("nn/layers/max_pool2d.dml") as max_pool2d
source("nn/layers/max_pool2d_builtin.dml") as max_pool2d_builtin
source("nn/layers/avg_pool2d_builtin.dml") as avg_pool2d_builtin
source("nn/layers/upsample2d.dml") as upsample2d
source("nn/layers/relu.dml") as relu
source("nn/layers/rnn.dml") as rnn
source("nn/layers/scale_shift1d.dml") as scale_shift1d
source("nn/layers/scale_shift2d.dml") as scale_shift2d
source("nn/layers/sigmoid.dml") as sigmoid
source("nn/layers/softmax.dml") as softmax
source("nn/layers/softmax2d.dml") as softmax2d
source("nn/layers/tanh.dml") as tanh
source("nn/test/conv2d_simple.dml") as conv2d_simple
source("nn/test/max_pool2d_simple.dml") as max_pool2d_simple
source("nn/test/util.dml") as test_util
source("nn/util.dml") as util
source("nn/layers/elu.dml") as elu
affine = function() {
/*
* Gradient check for the affine layer.
*/
print("Grad checking the affine layer with L2 loss.")
# Generate data
N = 3 # num examples
D = 100 # num features
M = 10 # num neurons
X = rand(rows=N, cols=D)
y = rand(rows=N, cols=M)
[W, b] = affine::init(D, M)
# Compute analytical gradients of loss wrt parameters
out = affine::forward(X, W, b)
dout = l2_loss::backward(out, y)
[dX, dW, db] = affine::backward(dout, X, W, b)
# Grad check
h = 1e-5
print(" - Grad checking X.")
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
outmh = affine::forward(X, W, b)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
outph = affine::forward(X, W, b)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
print(" - Grad checking W.")
for (i in 1:nrow(W)) {
for (j in 1:ncol(W)) {
# Compute numerical derivative
old = as.scalar(W[i,j])
W[i,j] = old - h
outmh = affine::forward(X, W, b)
lossmh = l2_loss::forward(outmh, y)
W[i,j] = old + h
outph = affine::forward(X, W, b)
lossph = l2_loss::forward(outph, y)
W[i,j] = old # reset
dW_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
}
}
print(" - Grad checking b.")
for (i in 1:nrow(b)) {
for (j in 1:ncol(b)) {
# Compute numerical derivative
old = as.scalar(b[i,j])
b[i,j] = old - h
outmh = affine::forward(X, W, b)
lossmh = l2_loss::forward(outmh, y)
b[i,j] = old + h
outph = affine::forward(X, W, b)
lossph = l2_loss::forward(outph, y)
b[i,j] = old # reset
db_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
}
}
}
low_rank_affine = function() {
/*
* Gradient check for the low rank affine layer.
*/
print("Grad checking the low rank affine layer with L2 loss.")
# Generate data
N = 3 # num examples
D = 100 # num features
M = 10 # num neurons
R = 2 # rank
X = rand(rows=N, cols=D)
y = rand(rows=N, cols=M)
[U, V, b] = low_rank_affine::init(D, M, R)
# Compute analytical gradients of loss wrt parameters
out = low_rank_affine::forward(X, U, V, b)
dout = l2_loss::backward(out, y)
[dX, dU, dV, db] = low_rank_affine::backward(dout, X, U, V, b)
# Grad check
h = 1e-5
print(" - Grad checking X.")
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
outmh = low_rank_affine::forward(X, U, V, b)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
outph = low_rank_affine::forward(X, U, V, b)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
print(" - Grad checking U.")
for (i in 1:nrow(U)) {
for (j in 1:ncol(U)) {
# Compute numerical derivative
old = as.scalar(U[i,j])
U[i,j] = old - h
outmh = low_rank_affine::forward(X, U, V, b)
lossmh = l2_loss::forward(outmh, y)
U[i,j] = old + h
outph = low_rank_affine::forward(X, U, V, b)
lossph = l2_loss::forward(outph, y)
U[i,j] = old # reset
dU_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dU[i,j]), dU_num, lossph, lossmh)
}
}
print(" - Grad checking V.")
for (i in 1:nrow(V)) {
for (j in 1:ncol(V)) {
# Compute numerical derivative
old = as.scalar(V[i,j])
V[i,j] = old - h
outmh = low_rank_affine::forward(X, U, V, b)
lossmh = l2_loss::forward(outmh, y)
V[i,j] = old + h
outph = low_rank_affine::forward(X, U, V, b)
lossph = l2_loss::forward(outph, y)
V[i,j] = old # reset
dV_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dV[i,j]), dV_num, lossph, lossmh)
}
}
print(" - Grad checking b.")
for (i in 1:nrow(b)) {
for (j in 1:ncol(b)) {
# Compute numerical derivative
old = as.scalar(b[i,j])
b[i,j] = old - h
outmh = low_rank_affine::forward(X, U, V, b)
lossmh = l2_loss::forward(outmh, y)
b[i,j] = old + h
outph = low_rank_affine::forward(X, U, V, b)
lossph = l2_loss::forward(outph, y)
b[i,j] = old # reset
db_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
}
}
}
batch_norm1d = function() {
/*
* Gradient check for the 1D batch normalization layer.
*/
print("Grad checking the 1D batch normalization layer with L2 loss.")
# Generate data
N = 3 # num examples
D = 100 # num features
mu = 0.9 # momentum
eps = 1e-5 # epsilon
X = rand(rows=N, cols=D)
y = rand(rows=N, cols=D)
gamma = rand(rows=1, cols=D)
beta = rand(rows=1, cols=D)
ema_mean = rand(rows=1, cols=D)
ema_var = rand(rows=1, cols=D)
#[dummy, dummy, ema_mean, ema_var] = batch_norm1d::init(D)
# Check training & testing modes
for (i in 1:2) {
if (i == 1)
mode = 'train'
else
mode = 'test'
print(" - Grad checking the '"+mode+"' mode.")
# Compute analytical gradients of loss wrt parameters
[out, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
dout = l2_loss::backward(out, y)
[dX, dgamma, dbeta] = batch_norm1d::backward(dout, out, ema_mean_upd, ema_var_upd,
cache_mean, cache_var, cache_norm,
X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
# Grad check
h = 1e-5
print(" - Grad checking X.")
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
[outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
print(" - Grad checking gamma.")
for (i in 1:nrow(gamma)) {
for (j in 1:ncol(gamma)) {
# Compute numerical derivative
old = as.scalar(gamma[i,j])
gamma[i,j] = old - h
[outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
lossmh = l2_loss::forward(outmh, y)
gamma[i,j] = old + h
[outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
lossph = l2_loss::forward(outph, y)
gamma[i,j] = old # reset
dgamma_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
lossph, lossmh)
}
}
print(" - Grad checking beta.")
for (i in 1:nrow(beta)) {
for (j in 1:ncol(beta)) {
# Compute numerical derivative
old = as.scalar(beta[i,j])
beta[i,j] = old - h
[outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
lossmh = l2_loss::forward(outmh, y)
beta[i,j] = old + h
[outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
lossph = l2_loss::forward(outph, y)
beta[i,j] = old # reset
dbeta_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
lossph, lossmh)
}
}
}
}
batch_norm2d = function() {
/*
* Gradient check for the 2D (spatial) batch normalization layer.
*/
print("Grad checking the 2D (spatial) batch normalization layer with L2 loss.")
# Generate data
N = 3 # num examples
C = 2 # num channels
Hin = 5 # input height
Win = 5 # input width
mu = 0.9 # momentum
eps = 1e-5 # epsilon
X = rand(rows=N, cols=C*Hin*Win)
y = rand(rows=N, cols=C*Hin*Win)
gamma = rand(rows=C, cols=1)
beta = rand(rows=C, cols=1)
ema_mean = rand(rows=C, cols=1)
ema_var = rand(rows=C, cols=1)
#[dummy, dummy, ema_mean, ema_var] = batch_norm2d::init(C)
# Check training & testing modes
for (i in 1:2) {
if (i == 1)
mode = 'train'
else
mode = 'test'
print(" - Grad checking the '"+mode+"' mode.")
# Compute analytical gradients of loss wrt parameters
[out, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
dout = l2_loss::backward(out, y)
[dX, dgamma, dbeta] = batch_norm2d::backward(dout, out, ema_mean_upd, ema_var_upd,
cache_mean, cache_var, cache_norm,
X, gamma, beta, C, Hin, Win, mode,
ema_mean, ema_var, mu, eps)
# Grad check
h = 1e-5
print(" - Grad checking X.")
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
[outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
print(" - Grad checking gamma.")
for (i in 1:nrow(gamma)) {
for (j in 1:ncol(gamma)) {
# Compute numerical derivative
old = as.scalar(gamma[i,j])
gamma[i,j] = old - h
[outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
lossmh = l2_loss::forward(outmh, y)
gamma[i,j] = old + h
[outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
lossph = l2_loss::forward(outph, y)
gamma[i,j] = old # reset
dgamma_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
lossph, lossmh)
}
}
print(" - Grad checking beta.")
for (i in 1:nrow(beta)) {
for (j in 1:ncol(beta)) {
# Compute numerical derivative
old = as.scalar(beta[i,j])
beta[i,j] = old - h
[outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
lossmh = l2_loss::forward(outmh, y)
beta[i,j] = old + h
[outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
lossph = l2_loss::forward(outph, y)
beta[i,j] = old # reset
dbeta_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
lossph, lossmh)
}
}
}
}
conv2d = function() {
/*
* Gradient check for the 2D convolutional layer using `im2col`.
*/
print("Grad checking the `im2col` 2D convolutional layer with L2 loss.")
# Generate data
N = 2 # num examples
C = 3 # num channels
Hin = 5 # input height
Win = 5 # input width
F = 4 # num filters
Hf = 3 # filter height
Wf = 3 # filter width
stride = 1
pad = 1
X = rand(rows=N, cols=C*Hin*Win)
y = rand(rows=N, cols=F*Hin*Win)
# Create layers
[W, b] = conv2d::init(F, C, Hf, Wf)
# Compute analytical gradients of loss wrt parameters
[out, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
dout = l2_loss::backward(out, y)
[dX, dW, db] = conv2d::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
# Grad check
h = 1e-5
print(" - Grad checking X.")
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
[outph, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
print(" - Grad checking W.")
for (i in 1:nrow(W)) {
for (j in 1:ncol(W)) {
# Compute numerical derivative
old = as.scalar(W[i,j])
W[i,j] = old - h
[outmh, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
lossmh = l2_loss::forward(outmh, y)
W[i,j] = old + h
[outph, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
lossph = l2_loss::forward(outph, y)
W[i,j] = old # reset
dW_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
}
}
print(" - Grad checking b.")
for (i in 1:nrow(b)) {
for (j in 1:ncol(b)) {
# Compute numerical derivative
old = as.scalar(b[i,j])
b[i,j] = old - h
[outmh, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
lossmh = l2_loss::forward(outmh, y)
b[i,j] = old + h
[outph, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
lossph = l2_loss::forward(outph, y)
b[i,j] = old # reset
db_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
}
}
}
conv2d_builtin = function() {
/*
* Gradient check for the 2D convolutional layer using built-in
* functions.
*/
print("Grad checking the built-in 2D convolutional layer with L2 loss.")
# Generate data
N = 2 # num examples
C = 3 # num channels
Hin = 5 # input height
Win = 5 # input width
F = 4 # num filters
Hf = 3 # filter height
Wf = 3 # filter width
stride = 1
pad = 1
X = rand(rows=N, cols=C*Hin*Win)
y = rand(rows=N, cols=F*Hin*Win)
# Create layers
[W, b] = conv2d_builtin::init(F, C, Hf, Wf)
# Compute analytical gradients of loss wrt parameters
[out, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
dout = l2_loss::backward(out, y)
[dX, dW, db] = conv2d_builtin::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf,
stride, stride, pad, pad)
# Grad check
h = 1e-5
print(" - Grad checking X.")
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
[outph, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
print(" - Grad checking W.")
for (i in 1:nrow(W)) {
for (j in 1:ncol(W)) {
# Compute numerical derivative
old = as.scalar(W[i,j])
W[i,j] = old - h
[outmh, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossmh = l2_loss::forward(outmh, y)
W[i,j] = old + h
[outph, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossph = l2_loss::forward(outph, y)
W[i,j] = old # reset
dW_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
}
}
print(" - Grad checking b.")
for (i in 1:nrow(b)) {
for (j in 1:ncol(b)) {
# Compute numerical derivative
old = as.scalar(b[i,j])
b[i,j] = old - h
[outmh, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossmh = l2_loss::forward(outmh, y)
b[i,j] = old + h
[outph, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossph = l2_loss::forward(outph, y)
b[i,j] = old # reset
db_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
}
}
}
conv2d_simple = function() {
/*
* Gradient check for the simple reference 2D convolutional layer.
*/
print("Grad checking the simple reference 2D convolutional layer with L2 loss.")
# Generate data
N = 2 # num examples
C = 3 # num channels
Hin = 5 # input height
Win = 5 # input width
F = 4 # num filters
Hf = 3 # filter height
Wf = 3 # filter width
stride = 1
pad = 1
X = rand(rows=N, cols=C*Hin*Win)
y = rand(rows=N, cols=F*Hin*Win)
# Create layers
[W, b] = conv2d_simple::init(F, C, Hf, Wf)
# Compute analytical gradients of loss wrt parameters
[out, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
dout = l2_loss::backward(out, y)
[dX, dW, db] = conv2d_simple::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf,
stride, stride, pad, pad)
# Grad check
h = 1e-5
print(" - Grad checking X.")
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
[outph, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
print(" - Grad checking W.")
for (i in 1:nrow(W)) {
for (j in 1:ncol(W)) {
# Compute numerical derivative
old = as.scalar(W[i,j])
W[i,j] = old - h
[outmh, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossmh = l2_loss::forward(outmh, y)
W[i,j] = old + h
[outph, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossph = l2_loss::forward(outph, y)
W[i,j] = old # reset
dW_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
}
}
print(" - Grad checking b.")
for (i in 1:nrow(b)) {
for (j in 1:ncol(b)) {
# Compute numerical derivative
old = as.scalar(b[i,j])
b[i,j] = old - h
[outmh, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossmh = l2_loss::forward(outmh, y)
b[i,j] = old + h
[outph, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossph = l2_loss::forward(outph, y)
b[i,j] = old # reset
db_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
}
}
}
conv2d_depthwise = function() {
/*
* Gradient check for the 2D depthwise convolutional layer.
*/
print("Grad checking the 2D depthwise convolutional layer with L2 loss.")
# Generate data
N = 2 # num examples
C = 3 # num channels
Hin = 5 # input height
Win = 5 # input width
M = 4 # depth multiplier
Hf = 3 # filter height
Wf = 3 # filter width
stride = 1
pad = 1
X = rand(rows=N, cols=C*Hin*Win)
y = rand(rows=N, cols=C*M*Hin*Win)
# Create layers
[W, b] = conv2d_depthwise::init(C, M, Hf, Wf)
# Compute analytical gradients of loss wrt parameters
[out, Hout, Wout] = conv2d_depthwise::forward(X, W, b, Hin, Win, M, Hf, Wf, stride, stride,
pad, pad)
dout = l2_loss::backward(out, y)
[dX, dW, db] = conv2d_depthwise::backward(dout, Hout, Wout, X, W, b, Hin, Win, M, Hf, Wf,
stride, stride, pad, pad)
# Grad check
h = 1e-5
print(" - Grad checking X.")
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, Hout, Wout] = conv2d_depthwise::forward(X, W, b, Hin, Win, M, Hf, Wf, stride, stride,
pad, pad)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
[outph, Hout, Wout] = conv2d_depthwise::forward(X, W, b, Hin, Win, M, Hf, Wf, stride, stride,
pad, pad)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
print(" - Grad checking W.")
for (i in 1:nrow(W)) {
for (j in 1:ncol(W)) {
# Compute numerical derivative
old = as.scalar(W[i,j])
W[i,j] = old - h
[outmh, Hout, Wout] = conv2d_depthwise::forward(X, W, b, Hin, Win, M, Hf, Wf, stride, stride,
pad, pad)
lossmh = l2_loss::forward(outmh, y)
W[i,j] = old + h
[outph, Hout, Wout] = conv2d_depthwise::forward(X, W, b, Hin, Win, M, Hf, Wf, stride, stride,
pad, pad)
lossph = l2_loss::forward(outph, y)
W[i,j] = old # reset
dW_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
}
}
print(" - Grad checking b.")
for (i in 1:nrow(b)) {
for (j in 1:ncol(b)) {
# Compute numerical derivative
old = as.scalar(b[i,j])
b[i,j] = old - h
[outmh, Hout, Wout] = conv2d_depthwise::forward(X, W, b, Hin, Win, M, Hf, Wf, stride, stride,
pad, pad)
lossmh = l2_loss::forward(outmh, y)
b[i,j] = old + h
[outph, Hout, Wout] = conv2d_depthwise::forward(X, W, b, Hin, Win, M, Hf, Wf, stride, stride,
pad, pad)
lossph = l2_loss::forward(outph, y)
b[i,j] = old # reset
db_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
}
}
}
conv2d_transpose = function() {
/*
* Gradient check for the 2D transpose convolutional layer.
*/
print("Grad checking the 2D transpose convolutional layer with L2 loss.")
# Generate data
N = 2 # num examples
C = 2 # num channels
Hin = 3 # input height
Win = 3 # input width
F = 2 # num filters
Hf = 3 # filter height
Wf = 3 # filter width
stride = 2
pad = 1
out_pad = 1
X = rand(rows=N, cols=C*Hin*Win)
# Create layers
[W, b] = conv2d_transpose::init(F, C, Hf, Wf)
# Compute analytical gradients of loss wrt parameters
[out, Hout, Wout] = conv2d_transpose::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad, out_pad, out_pad)
y = rand(rows=N, cols=F*Hout*Wout)
dout = l2_loss::backward(out,y)
[dX, dW, db] = conv2d_transpose::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf,
stride, stride, pad, pad)
# Grad check
h = 1e-5
print(" - Grad checking X.")
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, Hout, Wout] = conv2d_transpose::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad, out_pad, out_pad)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
[outph, Hout, Wout] = conv2d_transpose::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad, out_pad, out_pad)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
print(" - Grad checking W.")
for (i in 1:nrow(W)) {
for (j in 1:ncol(W)) {
# Compute numerical derivative
old = as.scalar(W[i,j])
W[i,j] = old - h
[outmh, Hout, Wout] = conv2d_transpose::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad, out_pad, out_pad)
lossmh = l2_loss::forward(outmh, y)
W[i,j] = old + h
[outph, Hout, Wout] = conv2d_transpose::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad, out_pad, out_pad)
lossph = l2_loss::forward(outph, y)
W[i,j] = old # reset
dW_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
}
}
print(" - Grad checking b.")
for (i in 1:nrow(b)) {
for (j in 1:ncol(b)) {
# Compute numerical derivative
old = as.scalar(b[i,j])
b[i,j] = old - h
[outmh, Hout, Wout] = conv2d_transpose::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad, out_pad, out_pad)
lossmh = l2_loss::forward(outmh, y)
b[i,j] = old + h
[outph, Hout, Wout] = conv2d_transpose::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad, out_pad, out_pad)
lossph = l2_loss::forward(outph, y)
b[i,j] = old # reset
db_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
}
}
}
conv2d_transpose_depthwise = function() {
/*
* Gradient check for the 2D depthwise transpose convolutional layer.
*/
print("Grad checking the 2D depthwise transpose convolutional layer with L2 loss.")
# Generate data
N = 2 # num examples
C = 8 # num channels
Hin = 3 # input height
Win = 3 # input width
M = 4 # depth of filters
Hf = 3 # filter height
Wf = 3 # filter width
stride = 2
pad = 1
out_pad = 1
X = rand(rows=N, cols=C*Hin*Win)
# Create layers
[W, b] = conv2d_transpose_depthwise::init(C, M, Hf, Wf)
# Compute analytical gradients of loss wrt parameters
[out, Hout, Wout] = conv2d_transpose_depthwise::forward(X, W, b, C, Hin, Win, M, Hf, Wf,
stride, stride, pad, pad,
out_pad, out_pad)
y = rand(rows=N, cols=C/M*Hout*Wout)
dout = l2_loss::backward(out,y)
[dX, dW, db] = conv2d_transpose_depthwise::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, M,
Hf, Wf, stride, stride, pad, pad)
# Grad check
h = 1e-5
print(" - Grad checking X.")
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, Hout, Wout] = conv2d_transpose_depthwise::forward(X, W, b, C, Hin, Win, M, Hf, Wf,
stride, stride, pad, pad,
out_pad, out_pad)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
[outph, Hout, Wout] = conv2d_transpose_depthwise::forward(X, W, b, C, Hin, Win, M, Hf, Wf,
stride, stride, pad, pad,
out_pad, out_pad)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
print(" - Grad checking W.")
for (i in 1:nrow(W)) {
for (j in 1:ncol(W)) {
# Compute numerical derivative
old = as.scalar(W[i,j])
W[i,j] = old - h
[outmh, Hout, Wout] = conv2d_transpose_depthwise::forward(X, W, b, C, Hin, Win, M, Hf, Wf,
stride, stride, pad, pad,
out_pad, out_pad)
lossmh = l2_loss::forward(outmh, y)
W[i,j] = old + h
[outph, Hout, Wout] = conv2d_transpose_depthwise::forward(X, W, b, C, Hin, Win, M, Hf, Wf,
stride, stride, pad, pad,
out_pad, out_pad)
lossph = l2_loss::forward(outph, y)
W[i,j] = old # reset
dW_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
}
}
print(" - Grad checking b.")
for (i in 1:nrow(b)) {
for (j in 1:ncol(b)) {
# Compute numerical derivative
old = as.scalar(b[i,j])
b[i,j] = old - h
[outmh, Hout, Wout] = conv2d_transpose_depthwise::forward(X, W, b, C, Hin, Win, M, Hf, Wf,
stride, stride, pad, pad,
out_pad, out_pad)
lossmh = l2_loss::forward(outmh, y)
b[i,j] = old + h
[outph, Hout, Wout] = conv2d_transpose_depthwise::forward(X, W, b, C, Hin, Win, M, Hf, Wf,
stride, stride, pad, pad,
out_pad, out_pad)
lossph = l2_loss::forward(outph, y)
b[i,j] = old # reset
db_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
}
}
}
cross_entropy_loss = function() {
/*
* Gradient check for the cross-entropy loss function.
*/
print("Grad checking the cross-entropy loss function.")
# Generate data
N = 3 # num examples
K = 10 # num targets
pred = rand(rows=N, cols=K, min=0, max=1, pdf="uniform")
pred = softmax::forward(pred) # normalized probs
y = rand(rows=N, cols=K, min=0, max=1, pdf="uniform")
y = softmax::forward(y) # normalized probs
# Compute analytical gradient
dpred = cross_entropy_loss::backward(pred, y)
# Grad check
h = 1e-5
for (i in 1:nrow(pred)) {
for (j in 1:ncol(pred)) {
# Compute numerical derivative
old = as.scalar(pred[i,j])
pred[i,j] = old - h
lossmh = cross_entropy_loss::forward(pred, y)
pred[i,j] = old + h
lossph = cross_entropy_loss::forward(pred, y)
pred[i,j] = old # reset
dpred_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
}
}
}
cross_entropy_loss2d = function() {
/*
* Gradient check for the 2D cross-entropy loss function.
*/
print("Grad checking the 2D cross-entropy loss function.")
# Generate data
N = 3 # num examples
C = 10 # num targets
Hin = 5 # example height
Win = 5 # example width
pred = rand(rows=N, cols=C*Hin*Win, min=0, max=1, pdf="uniform")
pred = softmax2d::forward(pred, C) # normalized probs
y = rand(rows=N, cols=C*Hin*Win, min=0, max=1, pdf="uniform")
y = softmax2d::forward(y, C) # normalized probs
# Compute analytical gradient
dpred = cross_entropy_loss2d::backward(pred, y, C)
# Grad check
h = 1e-6
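# Note: a smaller step than the h = 1e-5 used elsewhere, presumably to keep the
# perturbed predictions well-behaved under the log in the 2D cross-entropy loss.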
for (i in 1:nrow(pred)) {
for (j in 1:ncol(pred)) {
# Compute numerical derivative
old = as.scalar(pred[i,j])
pred[i,j] = old - h
lossmh = cross_entropy_loss2d::forward(pred, y, C)
pred[i,j] = old + h
lossph = cross_entropy_loss2d::forward(pred, y, C)
pred[i,j] = old # reset
dpred_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
}
}
}
dropout = function() {
/*
* Gradient check for the (inverted) dropout layer.
*/
print("Grad checking the (inverted) dropout layer with L2 loss.")
# Generate data
N = 3 # num examples
M = 100 # num neurons
p = 0.5 # probability of dropping neuron output
seed = as.integer(floor(as.scalar(rand(rows=1, cols=1, min=1, max=100000)))) # random seed
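# Note: the same seed is reused for every perturbed forward pass below, so the
# dropout mask is identical when evaluating the loss at (X[i,j]-h) and (X[i,j]+h).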
X = rand(rows=N, cols=M)
y = rand(rows=N, cols=M)
# Compute analytical gradients of loss wrt parameters
[out, mask] = dropout::forward(X, p, seed)
dout = l2_loss::backward(out, y)
dX = dropout::backward(dout, X, p, mask)
# Grad check
h = 1e-5
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, mask] = dropout::forward(X, p, seed)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
[outph, mask] = dropout::forward(X, p, seed)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
}
fm = function() {
/*
* Gradient check for the factorization machines.
*/
print("Grad checking the factorization machines with L2 loss.")
# Generate data
n = 5 # num examples
d = 100 # num features
k = 2 # factorization dimensionality
X = rand(rows=n, cols=d)
y = rand(rows=n, cols=1)
[w0, W, V] = fm::init(n, d, k)
# Compute analytical gradients of loss wrt parameters
out = fm::forward(X, w0, W, V)
dout = l2_loss::backward(out, y)
[dw0, dW, dV] = fm::backward(dout, X, w0, W, V)
# Grad check
h = 1e-5
print(" - Grad checking w0.")
for (i in 1:nrow(w0)) {
for (j in 1:ncol(w0)) {
# Compute numerical derivative
old = as.scalar(w0[i,j])
w0[i,j] = old - h # h = 1e-5
outmh = fm::forward(X, w0, W, V)
lossmh = l2_loss::forward(outmh, y)
w0[i,j] = old + h # h = 1e-5
outph = fm::forward(X, w0, W, V)
lossph = l2_loss::forward(outph, y)
w0[i,j] = old # reset
dw0_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dw0[i,j]), dw0_num, lossph, lossmh)
}
}
print(" - Grad checking W.")
for (i in 1:nrow(W)) {
for (j in 1:ncol(W)) {
# Compute numerical derivative
old = as.scalar(W[i,j])
W[i,j] = old - h # h = 1e-5
outmh = fm::forward(X, w0, W, V)
lossmh = l2_loss::forward(outmh, y)
W[i,j] = old + h # h = 1e-5
outph = fm::forward(X, w0, W, V)
lossph = l2_loss::forward(outph, y)
W[i,j] = old # reset
dW_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
}
}
print(" - Grad checking V.")
for (i in 1:nrow(V)) {
for (j in 1:ncol(V)) {
# Compute numerical derivative
old = as.scalar(V[i,j])
V[i,j] = old - h # h = 1e-5
outmh = fm::forward(X, w0, W, V)
lossmh = l2_loss::forward(outmh, y)
V[i,j] = old + h # h= 1e-5
outph = fm::forward(X, w0, W, V)
lossph = l2_loss::forward(outph, y)
V[i,j] = old # reset
dV_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dV[i,j]), dV_num, lossph, lossmh)
}
}
}
l1_loss = function() {
/*
* Gradient check for the L1 loss function.
*/
print("Grad checking the L1 loss function.")
# Generate data
N = 3 # num examples
D = 2 # num targets
pred = rand(rows=N, cols=D)
y = rand(rows=N, cols=D)
# Compute analytical gradient
dpred = l1_loss::backward(pred, y)
# Grad check
h = 1e-5
for (i in 1:nrow(pred)) {
for (j in 1:ncol(pred)) {
# Compute numerical derivative
old = as.scalar(pred[i,j])
pred[i,j] = old - h
lossmh = l1_loss::forward(pred, y)
pred[i,j] = old + h
lossph = l1_loss::forward(pred, y)
pred[i,j] = old # reset
dpred_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
}
}
}
l1_reg = function() {
/*
* Gradient check for the L1 regularization function.
*/
print("Grad checking the L1 regularization function.")
# Generate data
D = 5 # num features
M = 3 # num neurons
lambda = 0.01
W = rand(rows=D, cols=M)
# Compute analytical gradient
dW = l1_reg::backward(W, lambda)
# Grad check
h = 1e-5
for (i in 1:nrow(W)) {
for (j in 1:ncol(W)) {
# Compute numerical derivative
old = as.scalar(W[i,j])
W[i,j] = old - h
reg_lossmh = l1_reg::forward(W, lambda)
W[i,j] = old + h
reg_lossph = l1_reg::forward(W, lambda)
W[i,j] = old # reset W[i,j]
dW_num = (reg_lossph-reg_lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num,
reg_lossph, reg_lossmh)
}
}
}
l2_loss = function() {
/*
* Gradient check for the L2 loss function.
*/
print("Grad checking the L2 loss function.")
# Generate data
N = 3 # num examples
D = 2 # num targets
pred = rand(rows=N, cols=D)
y = rand(rows=N, cols=D)
# Compute analytical gradient
dpred = l2_loss::backward(pred, y)
# Grad check
h = 1e-5
for (i in 1:nrow(pred)) {
for (j in 1:ncol(pred)) {
# Compute numerical derivative
old = as.scalar(pred[i,j])
pred[i,j] = old - h
lossmh = l2_loss::forward(pred, y)
pred[i,j] = old + h
lossph = l2_loss::forward(pred, y)
pred[i,j] = old # reset
dpred_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
}
}
}
l2_reg = function() {
/*
* Gradient check for the L2 regularization function.
*/
print("Grad checking the L2 regularization function.")
# Generate data
D = 5 # num features
M = 3 # num neurons
lambda = 0.01
W = rand(rows=D, cols=M)
# Compute analytical gradient
dW = l2_reg::backward(W, lambda)
# Grad check
h = 1e-5
for (i in 1:nrow(W)) {
for (j in 1:ncol(W)) {
# Compute numerical derivative
old = as.scalar(W[i,j])
W[i,j] = old - h
reg_lossmh = l2_reg::forward(W, lambda)
W[i,j] = old + h
reg_lossph = l2_reg::forward(W, lambda)
W[i,j] = old # reset W[i,j]
dW_num = (reg_lossph-reg_lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num,
reg_lossph, reg_lossmh)
}
}
}
log_loss = function() {
/*
* Gradient check for the log loss function.
*/
print("Grad checking the log loss function.")
# Generate data
N = 20 # num examples
D = 1 # num targets
pred = rand(rows=N, cols=D, min=0, max=1, pdf="uniform")
y = round(rand(rows=N, cols=D, min=0, max=1, pdf="uniform"))
# Compute analytical gradient
dpred = log_loss::backward(pred, y)
# Grad check
h = 1e-5
for (i in 1:nrow(pred)) {
for (j in 1:ncol(pred)) {
# Compute numerical derivative
old = as.scalar(pred[i,j])
pred[i,j] = old - h
lossmh = log_loss::forward(pred, y)
pred[i,j] = old + h
lossph = log_loss::forward(pred, y)
pred[i,j] = old # reset
dpred_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
}
}
}
lstm = function() {
/*
* Gradient check for the LSTM layer.
*/
print("Grad checking the LSTM layer with L2 loss.")
# Generate data
N = 3 # num examples
D = 5 # num features
T = 15 # num timesteps (sequence length)
M = 10 # num neurons
X = rand(rows=N, cols=T*D)
yc = rand(rows=N, cols=M)
out0 = rand(rows=N, cols=M)
c0 = rand(rows=N, cols=M)
[W, b, dummy, dummy2] = lstm::init(N, D, M)
# test with (1) outputs from all timesteps, and (2) output from the final timestep
for (i in 1:2) {
if (i == 1) {
return_seq = TRUE
y = rand(rows=N, cols=T*M)
}
else {
return_seq = FALSE
y = rand(rows=N, cols=M)
}
print(" - Grad checking with return_seq = " + return_seq)
# Compute analytical gradients of loss wrt parameters
[out, c, cache_out, cache_c, cache_ifog] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
dout = l2_loss::backward(out, y)
dc = l2_loss::backward(c, yc)
[dX, dW, db, dout0, dc0] = lstm::backward(dout, dc, X, W, b, T, D, return_seq, out0, c0,
cache_out, cache_c, cache_ifog)
# Grad check
h = 1e-5
print(" - Grad checking X.")
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
loss_outmh = l2_loss::forward(outmh, y)
loss_cmh = l2_loss::forward(cmh, yc)
lossmh = loss_outmh + loss_cmh
X[i,j] = old + h
[outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
loss_outph = l2_loss::forward(outph, y)
loss_cph = l2_loss::forward(cph, yc)
lossph = loss_outph + loss_cph
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
print(" - Grad checking W.")
for (i in 1:nrow(W)) {
for (j in 1:ncol(W)) {
# Compute numerical derivative
old = as.scalar(W[i,j])
W[i,j] = old - h
[outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
loss_outmh = l2_loss::forward(outmh, y)
loss_cmh = l2_loss::forward(cmh, yc)
lossmh = loss_outmh + loss_cmh
W[i,j] = old + h
[outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
loss_outph = l2_loss::forward(outph, y)
loss_cph = l2_loss::forward(cph, yc)
lossph = loss_outph + loss_cph
W[i,j] = old # reset
dW_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
}
}
print(" - Grad checking b.")
for (i in 1:nrow(b)) {
for (j in 1:ncol(b)) {
# Compute numerical derivative
old = as.scalar(b[i,j])
b[i,j] = old - h
[outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
loss_outmh = l2_loss::forward(outmh, y)
loss_cmh = l2_loss::forward(cmh, yc)
lossmh = loss_outmh + loss_cmh
b[i,j] = old + h
[outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
loss_outph = l2_loss::forward(outph, y)
loss_cph = l2_loss::forward(cph, yc)
lossph = loss_outph + loss_cph
b[i,j] = old # reset
db_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
}
}
print(" - Grad checking out0.")
for (i in 1:nrow(out0)) {
for (j in 1:ncol(out0)) {
# Compute numerical derivative
old = as.scalar(out0[i,j])
out0[i,j] = old - h
[outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
loss_outmh = l2_loss::forward(outmh, y)
loss_cmh = l2_loss::forward(cmh, yc)
lossmh = loss_outmh + loss_cmh
out0[i,j] = old + h
[outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
loss_outph = l2_loss::forward(outph, y)
loss_cph = l2_loss::forward(cph, yc)
lossph = loss_outph + loss_cph
out0[i,j] = old # reset
dout0_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dout0[i,j]), dout0_num, lossph, lossmh)
}
}
print(" - Grad checking c0.")
for (i in 1:nrow(c0)) {
for (j in 1:ncol(c0)) {
# Compute numerical derivative
old = as.scalar(c0[i,j])
c0[i,j] = old - h
[outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
loss_outmh = l2_loss::forward(outmh, y)
loss_cmh = l2_loss::forward(cmh, yc)
lossmh = loss_outmh + loss_cmh
c0[i,j] = old + h
[outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
loss_outph = l2_loss::forward(outph, y)
loss_cph = l2_loss::forward(cph, yc)
lossph = loss_outph + loss_cph
c0[i,j] = old # reset
dc0_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dc0[i,j]), dc0_num, lossph, lossmh)
}
}
}
}
max_pool2d = function() {
/*
* Gradient check for the 2D max pooling layer.
*/
print("Grad checking the 2D max pooling layer with L2 loss.")
# Generate data
N = 2 # num examples
C = 2 # num channels
Hin = 4 # input height
Win = 4 # input width
Hf = 2 # pool filter height
Wf = 2 # pool filter width
stride = 2
X = rand(rows=N, cols=C*Hin*Win)
for (pad in 0:1) {
print(" - Grad checking w/ pad="+pad+".")
Hout = as.integer(floor((Hin + 2*pad - Hf)/stride + 1))
Wout = as.integer(floor((Win + 2*pad - Wf)/stride + 1))
y = rand(rows=N, cols=C*Hout*Wout)
# Compute analytical gradients of loss wrt parameters
[out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
dout = l2_loss::backward(out, y)
dX = max_pool2d::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
# Grad check
h = 1e-5
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
[outph, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
}
}
max_pool2d_builtin = function() {
/*
* Gradient check for the 2D max pooling layer.
*/
print("Grad checking the built-in 2D max pooling layer with L2 loss.")
# Generate data
N = 2 # num examples
C = 2 # num channels
Hin = 4 # input height
Win = 4 # input width
Hf = 2 # pool filter height
Wf = 2 # pool filter width
stride = 2
X = rand(rows=N, cols=C*Hin*Win)
for (pad in 0:1) {
print(" - Grad checking w/ pad="+pad+".")
Hout = as.integer(floor((Hin + 2 * pad - Hf) / stride + 1))
Wout = as.integer(floor((Win + 2 * pad - Wf) / stride + 1))
y = rand(rows=N, cols=C*Hout*Wout)
# Compute analytical gradients of loss wrt parameters
[out, Hout, Wout] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
dout = l2_loss::backward(out, y)
dX = max_pool2d_builtin::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
# Grad check
h = 1e-5
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, Hout, Wout] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
[outph, Hout, Wout] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
}
}
avg_pool2d_builtin = function() {
/*
* Gradient check for the 2D avg pooling layer.
*/
print("Grad checking the built-in 2D avg pooling layer with L2 loss.")
# Generate data
N = 2 # num examples
C = 2 # num channels
Hin = 4 # input height
Win = 4 # input width
Hf = 2 # pool filter height
Wf = 2 # pool filter width
stride = 2
X = rand(rows=N, cols=C*Hin*Win)
for (pad in 0:1) {
print(" - Grad checking w/ pad="+pad+".")
Hout = as.integer(floor((Hin + 2 * pad - Hf) / stride + 1))
Wout = as.integer(floor((Win + 2 * pad - Wf) / stride + 1))
y = rand(rows=N, cols=C*Hout*Wout)
# Compute analytical gradients of loss wrt parameters
[out, Hout, Wout] = avg_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
dout = l2_loss::backward(out, y)
dX = avg_pool2d_builtin::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
# Grad check
h = 1e-5
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, Hout, Wout] = avg_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
[outph, Hout, Wout] = avg_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
}
}
max_pool2d_simple = function() {
/*
* Gradient check for the simple reference 2D max pooling layer.
*/
print("Grad checking the simple reference 2D max pooling layer with L2 loss.")
# Generate data
N = 2 # num examples
C = 2 # num channels
Hin = 4 # input height
Win = 4 # input width
Hf = 2 # pool filter height
Wf = 2 # pool filter width
stride = 2
X = rand(rows=N, cols=C*Hin*Win)
for (pad in 0:1) {
print(" - Grad checking w/ pad="+pad+".")
Hout = as.integer(floor((Hin + 2*pad - Hf)/stride + 1))
Wout = as.integer(floor((Win + 2*pad - Wf)/stride + 1))
y = rand(rows=N, cols=C*Hout*Wout)
# Compute analytical gradients of loss wrt parameters
[out, Hout, Wout] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
dout = l2_loss::backward(out, y)
dX = max_pool2d_simple::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
# Grad check
h = 1e-5
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, Hout, Wout] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
[outph, Hout, Wout] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
}
}
upsample2d = function() {
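/*
* Gradient check for the 2D upsampling layer.
*/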
print("Grad checking the upsample2d layer with L2 loss.")
C=2; Hin=3; Win=3; size_h=2; size_w=2
# Generate data
N = 3 # num examples
M = C*Hin*Win # num neurons
X = rand(rows=N, cols=M, min=-5, max=5)
y = rand(rows=N, cols=M*size_h*size_w)
# Compute analytical gradients of loss wrt parameters
out = upsample2d::forward(X, C, Hin, Win, size_h, size_w)
dout = l2_loss::backward(out, y)
dX = upsample2d::backward(dout, C, Hin, Win, size_h, size_w)
# Grad check
h = 1e-5
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
outmh = upsample2d::forward(X, C, Hin, Win, size_h, size_w)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
outph = upsample2d::forward(X, C, Hin, Win, size_h, size_w)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
}
relu = function() {
/*
* Gradient check for the ReLU nonlinearity layer.
*
* NOTE: This could result in a false-negative in which the test
* fails due to a kink being crossed in the nonlinearity. This
* occurs when the tests, f(x-h) and f(x+h), end up on opposite
* sides of the zero threshold of max(0, fx). For now, just run
* the tests again. In the future, we can explicitly check for
* this and rerun the test automatically.
*/
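# For example, with h = 1e-5 an entry X[i,j] = 1e-6 gives X[i,j]-h < 0 but
# X[i,j]+h > 0, so the centered difference straddles the kink of max(0, x) and
# can disagree with the analytical gradient even though both are correct.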
print("Grad checking the ReLU nonlinearity layer with L2 loss.")
# Generate data
N = 3 # num examples
M = 10 # num neurons
X = rand(rows=N, cols=M, min=-5, max=5)
y = rand(rows=N, cols=M)
# Compute analytical gradients of loss wrt parameters
out = relu::forward(X)
dout = l2_loss::backward(out, y)
dX = relu::backward(dout, X)
# Grad check
h = 1e-5
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
outmh = relu::forward(X)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
outph = relu::forward(X)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
}
rnn = function() {
/*
* Gradient check for the simple RNN layer.
*/
print("Grad checking the simple RNN layer with L2 loss.")
# Generate data
N = 3 # num examples
D = 5 # num features
T = 15 # num timesteps (sequence length)
M = 10 # num neurons
X = rand(rows=N, cols=T*D)
out0 = rand(rows=N, cols=M)
[W, b, dummy] = rnn::init(N, D, M)
# test with (1) outputs from all timesteps, and (2) output from the final timestep
for (i in 1:2) {
if (i == 1) {
return_seq = TRUE
y = rand(rows=N, cols=T*M)
}
else {
return_seq = FALSE
y = rand(rows=N, cols=M)
}
print(" - Grad checking with return_seq = " + return_seq)
# Compute analytical gradients of loss wrt parameters
[out, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
dout = l2_loss::backward(out, y)
[dX, dW, db, dout0] = rnn::backward(dout, X, W, b, T, D, return_seq, out0, cache_out)
# Grad check
h = 1e-5
print(" - Grad checking X.")
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
[outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
[outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
print(" - Grad checking W.")
for (i in 1:nrow(W)) {
for (j in 1:ncol(W)) {
# Compute numerical derivative
old = as.scalar(W[i,j])
W[i,j] = old - h
[outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
lossmh = l2_loss::forward(outmh, y)
W[i,j] = old + h
[outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
lossph = l2_loss::forward(outph, y)
W[i,j] = old # reset
dW_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
}
}
print(" - Grad checking b.")
for (i in 1:nrow(b)) {
for (j in 1:ncol(b)) {
# Compute numerical derivative
old = as.scalar(b[i,j])
b[i,j] = old - h
[outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
lossmh = l2_loss::forward(outmh, y)
b[i,j] = old + h
[outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
lossph = l2_loss::forward(outph, y)
b[i,j] = old # reset
db_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
}
}
print(" - Grad checking out0.")
for (i in 1:nrow(out0)) {
for (j in 1:ncol(out0)) {
# Compute numerical derivative
old = as.scalar(out0[i,j])
out0[i,j] = old - h
[outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
lossmh = l2_loss::forward(outmh, y)
out0[i,j] = old + h
[outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
lossph = l2_loss::forward(outph, y)
out0[i,j] = old # reset
dout0_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dout0[i,j]), dout0_num, lossph, lossmh)
}
}
}
}
scale_shift1d = function() {
/*
* Gradient check for the 1D scale & shift layer.
*/
print("Grad checking the 1D scale & shift layer with L2 loss.")
# Generate data
N = 3 # num examples
D = 100 # num features
X = rand(rows=N, cols=D)
y = rand(rows=N, cols=D)
[gamma, beta] = scale_shift1d::init(D)
# Compute analytical gradients of loss wrt parameters
out = scale_shift1d::forward(X, gamma, beta)
dout = l2_loss::backward(out, y)
[dX, dgamma, dbeta] = scale_shift1d::backward(dout, out, X, gamma, beta)
# Grad check
h = 1e-5
print(" - Grad checking X.")
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
outmh = scale_shift1d::forward(X, gamma, beta)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
outph = scale_shift1d::forward(X, gamma, beta)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
print(" - Grad checking gamma.")
for (i in 1:nrow(gamma)) {
for (j in 1:ncol(gamma)) {
# Compute numerical derivative
old = as.scalar(gamma[i,j])
gamma[i,j] = old - h
outmh = scale_shift1d::forward(X, gamma, beta)
lossmh = l2_loss::forward(outmh, y)
gamma[i,j] = old + h
outph = scale_shift1d::forward(X, gamma, beta)
lossph = l2_loss::forward(outph, y)
gamma[i,j] = old # reset
dgamma_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
lossph, lossmh)
}
}
print(" - Grad checking beta.")
for (i in 1:nrow(beta)) {
for (j in 1:ncol(beta)) {
# Compute numerical derivative
old = as.scalar(beta[i,j])
beta[i,j] = old - h
outmh = scale_shift1d::forward(X, gamma, beta)
lossmh = l2_loss::forward(outmh, y)
beta[i,j] = old + h
outph = scale_shift1d::forward(X, gamma, beta)
lossph = l2_loss::forward(outph, y)
beta[i,j] = old # reset
dbeta_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
lossph, lossmh)
}
}
}
scale_shift2d = function() {
/*
* Gradient check for the 2D scale & shift layer.
*/
print("Grad checking the 2D scale & shift layer with L2 loss.")
# Generate data
N = 3 # num examples
C = 2 # num channels
Hin = 5 # input height
Win = 5 # input width
X = rand(rows=N, cols=C*Hin*Win)
y = rand(rows=N, cols=C*Hin*Win)
[gamma, beta] = scale_shift2d::init(C)
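# gamma and beta contain one entry per channel and are broadcast across all
# spatial positions of that channel.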
# Compute analytical gradients of loss wrt parameters
out = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
dout = l2_loss::backward(out, y)
[dX, dgamma, dbeta] = scale_shift2d::backward(dout, out, X, gamma, beta, C, Hin, Win)
# Grad check
h = 1e-5
print(" - Grad checking X.")
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
outmh = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
outph = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
print(" - Grad checking gamma.")
for (i in 1:nrow(gamma)) {
for (j in 1:ncol(gamma)) {
# Compute numerical derivative
old = as.scalar(gamma[i,j])
gamma[i,j] = old - h
outmh = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
lossmh = l2_loss::forward(outmh, y)
gamma[i,j] = old + h
outph = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
lossph = l2_loss::forward(outph, y)
gamma[i,j] = old # reset
dgamma_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
lossph, lossmh)
}
}
print(" - Grad checking beta.")
for (i in 1:nrow(beta)) {
for (j in 1:ncol(beta)) {
# Compute numerical derivative
old = as.scalar(beta[i,j])
beta[i,j] = old - h
outmh = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
lossmh = l2_loss::forward(outmh, y)
beta[i,j] = old + h
outph = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
lossph = l2_loss::forward(outph, y)
beta[i,j] = old # reset
dbeta_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
lossph, lossmh)
}
}
}
sigmoid = function() {
/*
* Gradient check for the sigmoid nonlinearity layer.
*/
print("Grad checking the sigmoid nonlinearity layer with L2 loss.")
# Generate data
N = 3 # num examples
M = 10 # num neurons
X = rand(rows=N, cols=M)
y = rand(rows=N, cols=M)
# Compute analytical gradients of loss wrt parameters
out = sigmoid::forward(X)
dout = l2_loss::backward(out, y)
dX = sigmoid::backward(dout, X)
# Grad check
h = 1e-5
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
outmh = sigmoid::forward(X)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
outph = sigmoid::forward(X)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
}
softmax = function() {
/*
* Gradient check for the softmax layer.
*/
print("Grad checking the softmax layer with L2 loss.")
# Generate data
N = 3 # num examples
D = 10 # num classes
X = rand(rows=N, cols=D)
y = rand(rows=N, cols=D, min=0, max=1, pdf="uniform")
y = y / rowSums(y)
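# Normalizing each row of y to sum to 1 yields valid probability targets to
# compare against the softmax output.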
# Compute analytical gradients of loss wrt parameters
out = softmax::forward(X)
dout = l2_loss::backward(out, y)
dX = softmax::backward(dout, X)
# Grad check
h = 1e-5
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
outmh = softmax::forward(X)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
outph = softmax::forward(X)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
}
softmax2d = function() {
/*
* Gradient check for the 2D softmax layer.
*/
print("Grad checking the 2D softmax layer with L2 loss.")
# Generate data
N = 3 # num examples
C = 10 # num classes
Hin = 5 # example height
Win = 5 # example width
X = rand(rows=N, cols=C*Hin*Win)
y = rand(rows=N, cols=C*Hin*Win, min=0, max=1, pdf="uniform")
y_C_NHW = util::transpose_NCHW_to_CNHW(y, C)
y_NHW_C = t(y_C_NHW)
y_NHW_C = y_NHW_C / rowSums(y_NHW_C)
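# The 2D softmax normalizes over channels at each spatial position, so the targets
# are rearranged from (N, C*Hin*Win) into (N*Hin*Win, C) rows and normalized to sum
# to 1 per pixel before computing the loss.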
# Compute analytical gradients of loss wrt parameters
out = softmax2d::forward(X, C)
out_C_NHW = util::transpose_NCHW_to_CNHW(out, C)
out_NHW_C = t(out_C_NHW)
dout_NHW_C = l2_loss::backward(out_NHW_C, y_NHW_C)
dout_C_NHW = t(dout_NHW_C)
dout = util::transpose_NCHW_to_CNHW(dout_C_NHW, N)
dX = softmax2d::backward(dout, X, C)
# Grad check
h = 1e-5
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
outmh = softmax2d::forward(X, C)
outmh_C_NHW = util::transpose_NCHW_to_CNHW(outmh, C)
outmh_NHW_C = t(outmh_C_NHW)
lossmh = l2_loss::forward(outmh_NHW_C, y_NHW_C)
X[i,j] = old + h
outph = softmax2d::forward(X, C)
outph_C_NHW = util::transpose_NCHW_to_CNHW(outph, C)
outph_NHW_C = t(outph_C_NHW)
lossph = l2_loss::forward(outph_NHW_C, y_NHW_C)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
}
tanh = function() {
/*
* Gradient check for the hyperbolic tangent (tanh) nonlinearity
* layer.
*/
print("Grad checking the tanh nonlinearity layer with L2 loss.")
# Generate data
N = 3 # num examples
M = 10 # num neurons
X = rand(rows=N, cols=M)
y = rand(rows=N, cols=M)
# Compute analytical gradients of loss wrt parameters
out = tanh::forward(X)
dout = l2_loss::backward(out, y)
dX = tanh::backward(dout, X)
# Grad check
h = 1e-5
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
outmh = tanh::forward(X)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
outph = tanh::forward(X)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
}
two_layer_affine_l2_net = function() {
/*
* Gradient check for a two-layer, fully-connected, feed-forward
* network with ReLU nonlinearity and L2 loss.
*
* NOTE: This check can produce a spurious failure when a kink in
* the ReLU nonlinearity is crossed, i.e. when the two evaluation
* points f(x-h) and f(x+h) land on opposite sides of the zero
* threshold of max(0, x). For now, simply rerun the test. In the
* future, we could detect this explicitly and rerun the affected
* check automatically (a sketch of such a detection helper follows
* the network helper functions below).
*/
print("Grad checking a two-layer, fully-connected, feed-forward network with a ReLU " +
"nonlinearity, and an L2 loss function.")
# Generate input data
N = 1000 # num examples
D = 100 # num features
yD = 5 # num targets
X = rand(rows=N, cols=D, pdf="normal")
y = rand(rows=N, cols=yD)
# Create 2-layer, fully-connected network
M = 10 # number of hidden neurons
[W1, b1] = affine::init(D, M)
[W2, b2] = affine::init(M, yD)
W2 = W2 / sqrt(2) # rescale the initialization, since this layer feeds the L2 loss instead of a ReLU
# Optimize for short "burn-in" time to move to characteristic
# mode of operation and unmask any real issues.
print(" - Burn-in:")
lr = 0.01
decay = 0.99
for(i in 1:5) {
# Compute forward and backward passes of net
[pred, loss, dX, dW1, db1, dW2, db2] = two_layer_affine_l2_net_run(X, y, W1, b1, W2, b2)
print(" - L2 loss: " + loss)
# Optimize with basic SGD
W1 = W1 - lr * dW1
b1 = b1 - lr * db1
W2 = W2 - lr * dW2
b2 = b2 - lr * db2
lr = lr * decay
}
# Compute analytical gradients
[pred, loss, dX, dW1, db1, dW2, db2] = two_layer_affine_l2_net_run(X, y, W1, b1, W2, b2)
# Grad check
h = 1e-6
print(" - Grad checking X.")
for (i in 1:2) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old_x = as.scalar(X[i,j])
X[i,j] = old_x - h
[lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
X[i,j] = old_x + h
[lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
X[i,j] = old_x # reset X[i,j]
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
print(" - Grad checking W1.")
for (i in 1:nrow(W1)) {
for (j in 1:ncol(W1)) {
# Compute numerical derivative
old_w = as.scalar(W1[i,j])
W1[i,j] = old_w - h
[lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
W1[i,j] = old_w + h
[lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
W1[i,j] = old_w # reset W1[i,j]
dWij_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dW1[i,j]), dWij_num, lossph, lossmh)
}
}
print(" - Grad checking W2.")
for (i in 1:nrow(W2)) {
for (j in 1:ncol(W2)) {
# Compute numerical derivative
old_w = as.scalar(W2[i,j])
W2[i,j] = old_w - h
[lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
W2[i,j] = old_w + h
[lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
W2[i,j] = old_w # reset W2[i,j]
dWij_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dW2[i,j]), dWij_num, lossph, lossmh)
}
}
print(" - Grad checking b1.")
for (i in 1:nrow(b1)) {
for (j in 1:ncol(b1)) {
# Compute numerical derivative
old_b = as.scalar(b1[i,j])
b1[i,j] = old_b - h
[lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
b1[i,j] = old_b + h
[lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
b1[i,j] = old_b # reset b1[i,j]
dbij_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(db1[i,j]), dbij_num, lossph, lossmh)
}
}
print(" - Grad checking b2.")
for (i in 1:nrow(b2)) {
for (j in 1:ncol(b2)) {
# Compute numerical derivative
old_b = as.scalar(b2[i,j])
b2[i,j] = old_b - h
[lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
b2[i,j] = old_b + h
[lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
b2[i,j] = old_b # reset b2[i,j]
dbij_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(db2[i,j]), dbij_num, lossph, lossmh)
}
}
}
/*
* Test network with forward/backward functions.
*/
two_layer_affine_l2_net_run = function(matrix[double] X, matrix[double] y,
matrix[double] W1, matrix[double] b1,
matrix[double] W2, matrix[double] b2)
return (matrix[double] pred, double loss,
matrix[double] dX,
matrix[double] dW1, matrix[double] db1,
matrix[double] dW2, matrix[double] db2) {
# Compute forward pass
[loss, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
# Compute backward pass
[dX, dpred, daout, dhout, dW1, db1, dW2, db2] =
two_layer_affine_l2_net_backward(X, y, pred, aout, hout, W1, b1, W2, b2)
}
two_layer_affine_l2_net_forward = function(matrix[double] X, matrix[double] y,
matrix[double] W1, matrix[double] b1,
matrix[double] W2, matrix[double] b2)
return (double loss, matrix[double] pred, matrix[double] aout, matrix[double] hout) {
# Compute forward pass
hout = affine::forward(X, W1, b1)
aout = relu::forward(hout)
pred = affine::forward(aout, W2, b2)
# Compute loss
loss = l2_loss::forward(pred, y)
}
two_layer_affine_l2_net_backward = function(matrix[double] X, matrix[double] y, matrix[double] pred,
matrix[double] aout, matrix[double] hout,
matrix[double] W1, matrix[double] b1,
matrix[double] W2, matrix[double] b2)
return (matrix[double] dX, matrix[double] dpred,
matrix[double] daout, matrix[double] dhout,
matrix[double] dW1, matrix[double] db1, matrix[double] dW2, matrix[double] db2) {
# Compute backward pass
dpred = l2_loss::backward(pred, y)
[daout, dW2, db2] = affine::backward(dpred, aout, W2, b2)
dhout = relu::backward(daout, hout)
[dX, dW1, db1] = affine::backward(dhout, X, W1, b1)
}
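/*
 * Sketch only (not part of the original tests): the ReLU-kink issue noted in
 * two_layer_affine_l2_net could be detected explicitly by comparing the signs of
 * the hidden pre-activations (hout) returned by two_layer_affine_l2_net_forward
 * at the two perturbed points; a sign change in any unit means the centered
 * difference straddled the max(0, x) kink. The helper name below is hypothetical.
 */
relu_kink_crossed = function(matrix[double] hout_mh, matrix[double] hout_ph)
return (boolean crossed) {
# A unit crosses the kink when its pre-activation changes sign between the
# f(x-h) and f(x+h) evaluations.
crossed = sum(sign(hout_mh) != sign(hout_ph)) > 0
}
# Usage sketch: inside a grad-check loop, capture hout from both perturbed forward
# passes and skip (or rerun) the coordinate when relu_kink_crossed() returns TRUE.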
elu = function() {
/*
* Gradient check for the exponential linear unit (ELU)
* nonlinearity layer.
*/
print("Grad checking ELU nonlinearity layer with L2 loss.")
# Generate data
N = 3 # num examples
M = 10 # num neurons
X = rand(rows=N, cols=M, min=-5, max=5) # include negative values to exercise both ELU regions
y = rand(rows=N, cols=M)
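# Compute analytical gradients of loss wrt parameters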
out = elu::forward(X, 1)
dout = l2_loss::backward(out, y)
dX = elu::backward(dout, X, 1)
# Grad check
h = 1e-5
print(" - Grad checking X.")
for (i in 1:nrow(X)) {
for (j in 1:ncol(X)) {
# Compute numerical derivative
old = as.scalar(X[i,j])
X[i,j] = old - h
outmh = elu::forward(X, 1)
lossmh = l2_loss::forward(outmh, y)
X[i,j] = old + h
outph = elu::forward(X, 1)
lossph = l2_loss::forward(outph, y)
X[i,j] = old # reset
dX_num = (lossph-lossmh) / (2*h) # numerical derivative
# Check error
rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
}
}
}