#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------

/*
 * Gradient checks for various layers and loss functions.
 */
source("nn/layers/affine.dml") as affine
source("nn/layers/low_rank_affine.dml") as low_rank_affine
source("nn/layers/batch_norm1d.dml") as batch_norm1d
source("nn/layers/batch_norm2d.dml") as batch_norm2d
source("nn/layers/conv2d.dml") as conv2d
source("nn/layers/conv2d_builtin.dml") as conv2d_builtin
source("nn/layers/conv2d_depthwise.dml") as conv2d_depthwise
source("nn/layers/conv2d_transpose.dml") as conv2d_transpose
source("nn/layers/conv2d_transpose_depthwise.dml") as conv2d_transpose_depthwise
source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
source("nn/layers/cross_entropy_loss2d.dml") as cross_entropy_loss2d
source("nn/layers/dropout.dml") as dropout
source("nn/layers/fm.dml") as fm
source("nn/layers/l1_loss.dml") as l1_loss
source("nn/layers/l1_reg.dml") as l1_reg
source("nn/layers/l2_loss.dml") as l2_loss
source("nn/layers/l2_reg.dml") as l2_reg
source("nn/layers/log_loss.dml") as log_loss
source("nn/layers/lstm.dml") as lstm
source("nn/layers/max_pool2d.dml") as max_pool2d
source("nn/layers/max_pool2d_builtin.dml") as max_pool2d_builtin
source("nn/layers/avg_pool2d_builtin.dml") as avg_pool2d_builtin
source("nn/layers/upsample2d.dml") as upsample2d
source("nn/layers/relu.dml") as relu
source("nn/layers/rnn.dml") as rnn
source("nn/layers/scale_shift1d.dml") as scale_shift1d
source("nn/layers/scale_shift2d.dml") as scale_shift2d
source("nn/layers/sigmoid.dml") as sigmoid
source("nn/layers/softmax.dml") as softmax
source("nn/layers/softmax2d.dml") as softmax2d
source("nn/layers/tanh.dml") as tanh
source("nn/test/conv2d_simple.dml") as conv2d_simple
source("nn/test/max_pool2d_simple.dml") as max_pool2d_simple
source("nn/test/util.dml") as test_util
source("nn/util.dml") as util
source("nn/layers/elu.dml") as elu

affine = function() {
  /*
   * Gradient check for the affine layer.
   */
  print("Grad checking the affine layer with L2 loss.")

  # Generate data
  N = 3 # num examples
  D = 100 # num features
  M = 10 # num neurons
  X = rand(rows=N, cols=D)
  y = rand(rows=N, cols=M)
  [W, b] = affine::init(D, M)

  # Compute analytical gradients of loss wrt parameters
  out = affine::forward(X, W, b)
  dout = l2_loss::backward(out, y)
  [dX, dW, db] = affine::backward(dout, X, W, b)

  # Grad check
  h = 1e-5
  print(" - Grad checking X.")
  for (i in 1:nrow(X)) {
    for (j in 1:ncol(X)) {
      # Compute numerical derivative
      old = as.scalar(X[i,j])
      X[i,j] = old - h
      outmh = affine::forward(X, W, b)
      lossmh = l2_loss::forward(outmh, y)
      X[i,j] = old + h
      outph = affine::forward(X, W, b)
      lossph = l2_loss::forward(outph, y)
      X[i,j] = old  # reset
      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
    }
  }

  print(" - Grad checking W.")
  for (i in 1:nrow(W)) {
    for (j in 1:ncol(W)) {
      # Compute numerical derivative
      old = as.scalar(W[i,j])
      W[i,j] = old - h
      outmh = affine::forward(X, W, b)
      lossmh = l2_loss::forward(outmh, y)
      W[i,j] = old + h
      outph = affine::forward(X, W, b)
      lossph = l2_loss::forward(outph, y)
      W[i,j] = old  # reset
      dW_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
    }
  }

  print(" - Grad checking b.")
  for (i in 1:nrow(b)) {
    for (j in 1:ncol(b)) {
      # Compute numerical derivative
      old = as.scalar(b[i,j])
      b[i,j] = old - h
      outmh = affine::forward(X, W, b)
      lossmh = l2_loss::forward(outmh, y)
      b[i,j] = old + h
      outph = affine::forward(X, W, b)
      lossph = l2_loss::forward(outph, y)
      b[i,j] = old  # reset
      db_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
    }
  }
}

low_rank_affine = function() {
  /*
   * Gradient check for the low rank affine layer.
   */
  print("Grad checking the low rank affine layer with L2 loss.")

  # Generate data
  N = 3 # num examples
  D = 100 # num features
  M = 10 # num neurons
  R = 2 # rank
  X = rand(rows=N, cols=D)
  y = rand(rows=N, cols=M)
  [U, V, b] = low_rank_affine::init(D, M, R)

  # Compute analytical gradients of loss wrt parameters
  out = low_rank_affine::forward(X, U, V, b)
  dout = l2_loss::backward(out, y)
  [dX, dU, dV, db] = low_rank_affine::backward(dout, X, U, V, b)

  # Grad check
  h = 1e-5
  print(" - Grad checking X.")
  for (i in 1:nrow(X)) {
    for (j in 1:ncol(X)) {
      # Compute numerical derivative
      old = as.scalar(X[i,j])
      X[i,j] = old - h
      outmh = low_rank_affine::forward(X, U, V, b)
      lossmh = l2_loss::forward(outmh, y)
      X[i,j] = old + h
      outph = low_rank_affine::forward(X, U, V, b)
      lossph = l2_loss::forward(outph, y)
      X[i,j] = old  # reset
      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
    }
  }

  print(" - Grad checking U.")
  for (i in 1:nrow(U)) {
    for (j in 1:ncol(U)) {
      # Compute numerical derivative
      old = as.scalar(U[i,j])
      U[i,j] = old - h
      outmh = low_rank_affine::forward(X, U, V, b)
      lossmh = l2_loss::forward(outmh, y)
      U[i,j] = old + h
      outph = low_rank_affine::forward(X, U, V, b)
      lossph = l2_loss::forward(outph, y)
      U[i,j] = old  # reset
      dU_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(dU[i,j]), dU_num, lossph, lossmh)
    }
  }

  print(" - Grad checking V.")
  for (i in 1:nrow(V)) {
    for (j in 1:ncol(V)) {
      # Compute numerical derivative
      old = as.scalar(V[i,j])
      V[i,j] = old - h
      outmh = low_rank_affine::forward(X, U, V, b)
      lossmh = l2_loss::forward(outmh, y)
      V[i,j] = old + h
      outph = low_rank_affine::forward(X, U, V, b)
      lossph = l2_loss::forward(outph, y)
      V[i,j] = old  # reset
      dV_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(dV[i,j]), dV_num, lossph, lossmh)
    }
  }

  print(" - Grad checking b.")
  for (i in 1:nrow(b)) {
    for (j in 1:ncol(b)) {
      # Compute numerical derivative
      old = as.scalar(b[i,j])
      b[i,j] = old - h
      outmh = low_rank_affine::forward(X, U, V, b)
      lossmh = l2_loss::forward(outmh, y)
      b[i,j] = old + h
      outph = low_rank_affine::forward(X, U, V, b)
      lossph = l2_loss::forward(outph, y)
      b[i,j] = old  # reset
      db_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
    }
  }
}

batch_norm1d = function() {
  /*
   * Gradient check for the 1D batch normalization layer.
   */
  print("Grad checking the 1D batch normalization layer with L2 loss.")

  # Generate data
  N = 3 # num examples
  D = 100 # num features
  mu = 0.9  # momentum
  eps = 1e-5  # epsilon
  X = rand(rows=N, cols=D)
  y = rand(rows=N, cols=D)
  gamma = rand(rows=1, cols=D)
  beta = rand(rows=1, cols=D)
  ema_mean = rand(rows=1, cols=D)
  ema_var = rand(rows=1, cols=D)
  #[dummy, dummy, ema_mean, ema_var] = batch_norm1d::init(D)

  # Check training & testing modes
  for (i in 1:2) {
    if (i == 1)
      mode = 'train'
    else
      mode = 'test'
    print(" - Grad checking the '"+mode+"' mode.")

    # Compute analytical gradients of loss wrt parameters
    [out, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
        batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
    dout = l2_loss::backward(out, y)
    [dX, dgamma, dbeta] = batch_norm1d::backward(dout, out, ema_mean_upd, ema_var_upd,
                                                 cache_mean, cache_var, cache_norm,
                                                 X, gamma, beta, mode, ema_mean, ema_var, mu, eps)

    # Grad check
    h = 1e-5
    print("   - Grad checking X.")
    for (i in 1:nrow(X)) {
      for (j in 1:ncol(X)) {
        # Compute numerical derivative
        old = as.scalar(X[i,j])
        X[i,j] = old - h
        [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
            batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
        lossmh = l2_loss::forward(outmh, y)
        X[i,j] = old + h
        [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
            batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
        lossph = l2_loss::forward(outph, y)
        X[i,j] = old  # reset
        dX_num = (lossph-lossmh) / (2*h)  # numerical derivative

        # Check error
        rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
      }
    }

    print("   - Grad checking gamma.")
    for (i in 1:nrow(gamma)) {
      for (j in 1:ncol(gamma)) {
        # Compute numerical derivative
        old = as.scalar(gamma[i,j])
        gamma[i,j] = old - h
        [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
            batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
        lossmh = l2_loss::forward(outmh, y)
        gamma[i,j] = old + h
        [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
            batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
        lossph = l2_loss::forward(outph, y)
        gamma[i,j] = old  # reset
        dgamma_num = (lossph-lossmh) / (2*h)  # numerical derivative

        # Check error
        rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
                                                    lossph, lossmh)
      }
    }

    print("   - Grad checking beta.")
    for (i in 1:nrow(beta)) {
      for (j in 1:ncol(beta)) {
        # Compute numerical derivative
        old = as.scalar(beta[i,j])
        beta[i,j] = old - h
        [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
            batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
        lossmh = l2_loss::forward(outmh, y)
        beta[i,j] = old + h
        [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
            batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps)
        lossph = l2_loss::forward(outph, y)
        beta[i,j] = old  # reset
        dbeta_num = (lossph-lossmh) / (2*h)  # numerical derivative

        # Check error
        rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
                                                    lossph, lossmh)
      }
    }
  }
}

batch_norm2d = function() {
  /*
   * Gradient check for the 2D (spatial) batch normalization layer.
   */
  print("Grad checking the 2D (spatial) batch normalization layer with L2 loss.")

  # Generate data
  N = 3 # num examples
  C = 2  # num channels
  Hin = 5  # input height
  Win = 5  # input width
  mu = 0.9  # momentum
  eps = 1e-5  # epsilon
  X = rand(rows=N, cols=C*Hin*Win)
  y = rand(rows=N, cols=C*Hin*Win)
  gamma = rand(rows=C, cols=1)
  beta = rand(rows=C, cols=1)
  ema_mean = rand(rows=C, cols=1)
  ema_var = rand(rows=C, cols=1)
  #[dummy, dummy, ema_mean, ema_var] = batch_norm2d::init(C)

  # Check training & testing modes
  for (i in 1:2) {
    if (i == 1)
      mode = 'train'
    else
      mode = 'test'
    print(" - Grad checking the '"+mode+"' mode.")

    # Compute analytical gradients of loss wrt parameters
    [out, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
        batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
    dout = l2_loss::backward(out, y)
    [dX, dgamma, dbeta] = batch_norm2d::backward(dout, out, ema_mean_upd, ema_var_upd,
                                                 cache_mean, cache_var, cache_norm,
                                                 X, gamma, beta, C, Hin, Win, mode,
                                                 ema_mean, ema_var, mu, eps)

    # Grad check
    h = 1e-5
    print("   - Grad checking X.")
    for (i in 1:nrow(X)) {
      for (j in 1:ncol(X)) {
        # Compute numerical derivative
        old = as.scalar(X[i,j])
        X[i,j] = old - h
        [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
            batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
        lossmh = l2_loss::forward(outmh, y)
        X[i,j] = old + h
        [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
            batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
        lossph = l2_loss::forward(outph, y)
        X[i,j] = old  # reset
        dX_num = (lossph-lossmh) / (2*h)  # numerical derivative

        # Check error
        rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
      }
    }

    print("   - Grad checking gamma.")
    for (i in 1:nrow(gamma)) {
      for (j in 1:ncol(gamma)) {
        # Compute numerical derivative
        old = as.scalar(gamma[i,j])
        gamma[i,j] = old - h
        [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
            batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
        lossmh = l2_loss::forward(outmh, y)
        gamma[i,j] = old + h
        [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
            batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
        lossph = l2_loss::forward(outph, y)
        gamma[i,j] = old  # reset
        dgamma_num = (lossph-lossmh) / (2*h)  # numerical derivative

        # Check error
        rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
                                                    lossph, lossmh)
      }
    }

    print("   - Grad checking beta.")
    for (i in 1:nrow(beta)) {
      for (j in 1:ncol(beta)) {
        # Compute numerical derivative
        old = as.scalar(beta[i,j])
        beta[i,j] = old - h
        [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
            batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
        lossmh = l2_loss::forward(outmh, y)
        beta[i,j] = old + h
        [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] =
            batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps)
        lossph = l2_loss::forward(outph, y)
        beta[i,j] = old  # reset
        dbeta_num = (lossph-lossmh) / (2*h)  # numerical derivative

        # Check error
        rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
                                                    lossph, lossmh)
      }
    }
  }
}

conv2d = function() {
  /*
   * Gradient check for the 2D convolutional layer using `im2col`.
   */
  print("Grad checking the `im2col` 2D convolutional layer with L2 loss.")

  # Generate data
  N = 2  # num examples
  C = 3  # num channels
  Hin = 5  # input height
  Win = 5  # input width
  F = 4  # num filters
  Hf = 3  # filter height
  Wf = 3  # filter width
  stride = 1
  pad = 1
  X = rand(rows=N, cols=C*Hin*Win)
  y = rand(rows=N, cols=F*Hin*Win)

  # Create layers
  [W, b] = conv2d::init(F, C, Hf, Wf)

  # Compute analytical gradients of loss wrt parameters
  [out, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
  dout = l2_loss::backward(out, y)
  [dX, dW, db] = conv2d::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
                                  pad, pad)

  # Grad check
  h = 1e-5
  print(" - Grad checking X.")
  for (i in 1:nrow(X)) {
    for (j in 1:ncol(X)) {
      # Compute numerical derivative
      old = as.scalar(X[i,j])
      X[i,j] = old - h
      [outmh, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
      lossmh = l2_loss::forward(outmh, y)
      X[i,j] = old + h
      [outph, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
      lossph = l2_loss::forward(outph, y)
      X[i,j] = old  # reset
      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
    }
  }

  print(" - Grad checking W.")
  for (i in 1:nrow(W)) {
    for (j in 1:ncol(W)) {
      # Compute numerical derivative
      old = as.scalar(W[i,j])
      W[i,j] = old - h
      [outmh, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
      lossmh = l2_loss::forward(outmh, y)
      W[i,j] = old + h
      [outph, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
      lossph = l2_loss::forward(outph, y)
      W[i,j] = old  # reset
      dW_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
    }
  }

  print(" - Grad checking b.")
  for (i in 1:nrow(b)) {
    for (j in 1:ncol(b)) {
      # Compute numerical derivative
      old = as.scalar(b[i,j])
      b[i,j] = old - h
      [outmh, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
      lossmh = l2_loss::forward(outmh, y)
      b[i,j] = old + h
      [outph, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
      lossph = l2_loss::forward(outph, y)
      b[i,j] = old  # reset
      db_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
    }
  }
}

conv2d_builtin = function() {
  /*
   * Gradient check for the 2D convolutional layer using built-in
   * functions.
   */
  print("Grad checking the built-in 2D convolutional layer with L2 loss.")

  # Generate data
  N = 2  # num examples
  C = 3  # num channels
  Hin = 5  # input height
  Win = 5  # input width
  F = 4  # num filters
  Hf = 3  # filter height
  Wf = 3  # filter width
  stride = 1
  pad = 1
  X = rand(rows=N, cols=C*Hin*Win)
  y = rand(rows=N, cols=F*Hin*Win)

  # Create layers
  [W, b] = conv2d_builtin::init(F, C, Hf, Wf)

  # Compute analytical gradients of loss wrt parameters
  [out, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
                                              pad, pad)
  dout = l2_loss::backward(out, y)
  [dX, dW, db] = conv2d_builtin::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf,
                                          stride, stride, pad, pad)

  # Grad check
  h = 1e-5
  print(" - Grad checking X.")
  for (i in 1:nrow(X)) {
    for (j in 1:ncol(X)) {
      # Compute numerical derivative
      old = as.scalar(X[i,j])
      X[i,j] = old - h
      [outmh, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
                                                    pad, pad)
      lossmh = l2_loss::forward(outmh, y)
      X[i,j] = old + h
      [outph, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
                                                    pad, pad)
      lossph = l2_loss::forward(outph, y)
      X[i,j] = old  # reset
      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
    }
  }

  print(" - Grad checking W.")
  for (i in 1:nrow(W)) {
    for (j in 1:ncol(W)) {
      # Compute numerical derivative
      old = as.scalar(W[i,j])
      W[i,j] = old - h
      [outmh, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
                                                    pad, pad)
      lossmh = l2_loss::forward(outmh, y)
      W[i,j] = old + h
      [outph, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
                                                    pad, pad)
      lossph = l2_loss::forward(outph, y)
      W[i,j] = old  # reset
      dW_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
    }
  }

  print(" - Grad checking b.")
  for (i in 1:nrow(b)) {
    for (j in 1:ncol(b)) {
      # Compute numerical derivative
      old = as.scalar(b[i,j])
      b[i,j] = old - h
      [outmh, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
                                                    pad, pad)
      lossmh = l2_loss::forward(outmh, y)
      b[i,j] = old + h
      [outph, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
                                                    pad, pad)
      lossph = l2_loss::forward(outph, y)
      b[i,j] = old  # reset
      db_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
    }
  }
}

conv2d_simple = function() {
  /*
   * Gradient check for the simple reference 2D convolutional layer.
   */
  print("Grad checking the simple reference 2D convolutional layer with L2 loss.")

  # Generate data
  N = 2  # num examples
  C = 3  # num channels
  Hin = 5  # input height
  Win = 5  # input width
  F = 4  # num filters
  Hf = 3  # filter height
  Wf = 3  # filter width
  stride = 1
  pad = 1
  X = rand(rows=N, cols=C*Hin*Win)
  y = rand(rows=N, cols=F*Hin*Win)

  # Create layers
  [W, b] = conv2d_simple::init(F, C, Hf, Wf)

  # Compute analytical gradients of loss wrt parameters
  [out, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
  dout = l2_loss::backward(out, y)
  [dX, dW, db] = conv2d_simple::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf,
                                         stride, stride, pad, pad)

  # Grad check
  h = 1e-5
  print(" - Grad checking X.")
  for (i in 1:nrow(X)) {
    for (j in 1:ncol(X)) {
      # Compute numerical derivative
      old = as.scalar(X[i,j])
      X[i,j] = old - h
      [outmh, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
                                                   pad, pad)
      lossmh = l2_loss::forward(outmh, y)
      X[i,j] = old + h
      [outph, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
                                                   pad, pad)
      lossph = l2_loss::forward(outph, y)
      X[i,j] = old  # reset
      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
    }
  }

  print(" - Grad checking W.")
  for (i in 1:nrow(W)) {
    for (j in 1:ncol(W)) {
      # Compute numerical derivative
      old = as.scalar(W[i,j])
      W[i,j] = old - h
      [outmh, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
                                                   pad, pad)
      lossmh = l2_loss::forward(outmh, y)
      W[i,j] = old + h
      [outph, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
                                                   pad, pad)
      lossph = l2_loss::forward(outph, y)
      W[i,j] = old  # reset
      dW_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
    }
  }

  print(" - Grad checking b.")
  for (i in 1:nrow(b)) {
    for (j in 1:ncol(b)) {
      # Compute numerical derivative
      old = as.scalar(b[i,j])
      b[i,j] = old - h
      [outmh, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
                                                   pad, pad)
      lossmh = l2_loss::forward(outmh, y)
      b[i,j] = old + h
      [outph, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
                                                   pad, pad)
      lossph = l2_loss::forward(outph, y)
      b[i,j] = old  # reset
      db_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
    }
  }
}

conv2d_depthwise = function() {
  /*
   * Gradient check for the 2D depthwise convolutional layer.
   */
  print("Grad checking the 2D depthwise convolutional layer with L2 loss.")

  # Generate data
  N = 2  # num examples
  C = 3  # num channels
  Hin = 5  # input height
  Win = 5  # input width
  M = 4  # depth multiplier
  Hf = 3  # filter height
  Wf = 3  # filter width
  stride = 1
  pad = 1
  X = rand(rows=N, cols=C*Hin*Win)
  y = rand(rows=N, cols=C*M*Hin*Win)

  # Create layers
  [W, b] = conv2d_depthwise::init(C, M, Hf, Wf)

  # Compute analytical gradients of loss wrt parameters
  [out, Hout, Wout] = conv2d_depthwise::forward(X, W, b, Hin, Win, M, Hf, Wf, stride, stride,
                                                pad, pad)
  dout = l2_loss::backward(out, y)
  [dX, dW, db] = conv2d_depthwise::backward(dout, Hout, Wout, X, W, b, Hin, Win, M, Hf, Wf,
                                            stride, stride, pad, pad)

  # Grad check
  h = 1e-5
  print(" - Grad checking X.")
  for (i in 1:nrow(X)) {
    for (j in 1:ncol(X)) {
      # Compute numerical derivative
      old = as.scalar(X[i,j])
      X[i,j] = old - h
      [outmh, Hout, Wout] = conv2d_depthwise::forward(X, W, b, Hin, Win, M, Hf, Wf, stride, stride,
                                                      pad, pad)
      lossmh = l2_loss::forward(outmh, y)
      X[i,j] = old + h
      [outph, Hout, Wout] = conv2d_depthwise::forward(X, W, b, Hin, Win, M, Hf, Wf, stride, stride,
                                                      pad, pad)
      lossph = l2_loss::forward(outph, y)
      X[i,j] = old  # reset
      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
    }
  }

  print(" - Grad checking W.")
  for (i in 1:nrow(W)) {
    for (j in 1:ncol(W)) {
      # Compute numerical derivative
      old = as.scalar(W[i,j])
      W[i,j] = old - h
      [outmh, Hout, Wout] = conv2d_depthwise::forward(X, W, b, Hin, Win, M, Hf, Wf, stride, stride,
                                                      pad, pad)
      lossmh = l2_loss::forward(outmh, y)
      W[i,j] = old + h
      [outph, Hout, Wout] = conv2d_depthwise::forward(X, W, b, Hin, Win, M, Hf, Wf, stride, stride,
                                                      pad, pad)
      lossph = l2_loss::forward(outph, y)
      W[i,j] = old  # reset
      dW_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
    }
  }

  print(" - Grad checking b.")
  for (i in 1:nrow(b)) {
    for (j in 1:ncol(b)) {
      # Compute numerical derivative
      old = as.scalar(b[i,j])
      b[i,j] = old - h
      [outmh, Hout, Wout] = conv2d_depthwise::forward(X, W, b, Hin, Win, M, Hf, Wf, stride, stride,
                                                      pad, pad)
      lossmh = l2_loss::forward(outmh, y)
      b[i,j] = old + h
      [outph, Hout, Wout] = conv2d_depthwise::forward(X, W, b, Hin, Win, M, Hf, Wf, stride, stride,
                                                      pad, pad)
      lossph = l2_loss::forward(outph, y)
      b[i,j] = old  # reset
      db_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
    }
  }
}

conv2d_transpose = function() {
  /*
   * Gradient check for the 2D transpose convolutional layer.
   */
  print("Grad checking the 2D transpose convolutional layer with L2 loss.")

  # Generate data
  N = 2  # num examples
  C = 2  # num channels
  Hin = 3  # input height
  Win = 3  # input width
  F = 2  # num filters
  Hf = 3  # filter height
  Wf = 3  # filter width
  stride = 2
  pad = 1
  out_pad = 1
  X = rand(rows=N, cols=C*Hin*Win)

  # Create layers
  [W, b] = conv2d_transpose::init(F, C, Hf, Wf)

  # Compute analytical gradients of loss wrt parameters
  [out, Hout, Wout] = conv2d_transpose::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
                                                pad, pad, out_pad, out_pad)
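  # Note: the target y is generated after the forward pass because the output spatial
  # dimensions (Hout, Wout) of the transpose convolution are only known at that point.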
  y = rand(rows=N, cols=F*Hout*Wout)
  dout = l2_loss::backward(out, y)
  [dX, dW, db] = conv2d_transpose::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf,
                                            stride, stride, pad, pad)

  # Grad check
  h = 1e-5
  print(" - Grad checking X.")
  for (i in 1:nrow(X)) {
    for (j in 1:ncol(X)) {
      # Compute numerical derivative
      old = as.scalar(X[i,j])
      X[i,j] = old - h
      [outmh, Hout, Wout] = conv2d_transpose::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
                                                      pad, pad, out_pad, out_pad)
      lossmh = l2_loss::forward(outmh, y)
      X[i,j] = old + h
      [outph, Hout, Wout] = conv2d_transpose::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
                                                      pad, pad, out_pad, out_pad)
      lossph = l2_loss::forward(outph, y)
      X[i,j] = old  # reset
      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
    }
  }

  print(" - Grad checking W.")
  for (i in 1:nrow(W)) {
    for (j in 1:ncol(W)) {
      # Compute numerical derivative
      old = as.scalar(W[i,j])
      W[i,j] = old - h
      [outmh, Hout, Wout] = conv2d_transpose::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
                                                      pad, pad, out_pad, out_pad)
      lossmh = l2_loss::forward(outmh, y)
      W[i,j] = old + h
      [outph, Hout, Wout] = conv2d_transpose::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
                                                      pad, pad, out_pad, out_pad)
      lossph = l2_loss::forward(outph, y)
      W[i,j] = old  # reset
      dW_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
    }
  }

  print(" - Grad checking b.")
  for (i in 1:nrow(b)) {
    for (j in 1:ncol(b)) {
      # Compute numerical derivative
      old = as.scalar(b[i,j])
      b[i,j] = old - h
      [outmh, Hout, Wout] = conv2d_transpose::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
                                                      pad, pad, out_pad, out_pad)
      lossmh = l2_loss::forward(outmh, y)
      b[i,j] = old + h
      [outph, Hout, Wout] = conv2d_transpose::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride,
                                                      pad, pad, out_pad, out_pad)
      lossph = l2_loss::forward(outph, y)
      b[i,j] = old  # reset
      db_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
    }
  }
}

conv2d_transpose_depthwise = function() {
  /*
   * Gradient check for the 2D depthwise transpose convolutional layer.
   */
  print("Grad checking the 2D depthwise transpose convolutional layer with L2 loss.")

  # Generate data
  N = 2  # num examples
  C = 8  # num channels
  Hin = 3  # input height
  Win = 3  # input width
  M = 4  # depth of filters
  Hf = 3  # filter height
  Wf = 3  # filter width
  stride = 2
  pad = 1
  out_pad = 1
  X = rand(rows=N, cols=C*Hin*Win)

  # Create layers
  [W, b] = conv2d_transpose_depthwise::init(C, M, Hf, Wf)

  # Compute analytical gradients of loss wrt parameters
  [out, Hout, Wout] = conv2d_transpose_depthwise::forward(X, W, b, C, Hin, Win, M, Hf, Wf,
                                                          stride, stride, pad, pad,
                                                          out_pad, out_pad)
  y = rand(rows=N, cols=C/M*Hout*Wout)
  dout = l2_loss::backward(out, y)
  [dX, dW, db] = conv2d_transpose_depthwise::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, M,
                                                      Hf, Wf, stride, stride, pad, pad)

  # Grad check
  h = 1e-5
  print(" - Grad checking X.")
  for (i in 1:nrow(X)) {
    for (j in 1:ncol(X)) {
      # Compute numerical derivative
      old = as.scalar(X[i,j])
      X[i,j] = old - h
      [outmh, Hout, Wout] = conv2d_transpose_depthwise::forward(X, W, b, C, Hin, Win, M, Hf, Wf,
                                                                stride, stride, pad, pad,
                                                                out_pad, out_pad)
      lossmh = l2_loss::forward(outmh, y)
      X[i,j] = old + h
      [outph, Hout, Wout] = conv2d_transpose_depthwise::forward(X, W, b, C, Hin, Win, M, Hf, Wf,
                                                                stride, stride, pad, pad,
                                                                out_pad, out_pad)
      lossph = l2_loss::forward(outph, y)
      X[i,j] = old  # reset
      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
    }
  }

  print(" - Grad checking W.")
  for (i in 1:nrow(W)) {
    for (j in 1:ncol(W)) {
      # Compute numerical derivative
      old = as.scalar(W[i,j])
      W[i,j] = old - h
      [outmh, Hout, Wout] = conv2d_transpose_depthwise::forward(X, W, b, C, Hin, Win, M, Hf, Wf,
                                                                stride, stride, pad, pad,
                                                                out_pad, out_pad)
      lossmh = l2_loss::forward(outmh, y)
      W[i,j] = old + h
      [outph, Hout, Wout] = conv2d_transpose_depthwise::forward(X, W, b, C, Hin, Win, M, Hf, Wf,
                                                                stride, stride, pad, pad,
                                                                out_pad, out_pad)
      lossph = l2_loss::forward(outph, y)
      W[i,j] = old  # reset
      dW_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
    }
  }

  print(" - Grad checking b.")
  for (i in 1:nrow(b)) {
    for (j in 1:ncol(b)) {
      # Compute numerical derivative
      old = as.scalar(b[i,j])
      b[i,j] = old - h
      [outmh, Hout, Wout] = conv2d_transpose_depthwise::forward(X, W, b, C, Hin, Win, M, Hf, Wf,
                                                                stride, stride, pad, pad,
                                                                out_pad, out_pad)
      lossmh = l2_loss::forward(outmh, y)
      b[i,j] = old + h
      [outph, Hout, Wout] = conv2d_transpose_depthwise::forward(X, W, b, C, Hin, Win, M, Hf, Wf,
                                                                stride, stride, pad, pad,
                                                                out_pad, out_pad)
      lossph = l2_loss::forward(outph, y)
      b[i,j] = old  # reset
      db_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
    }
  }
}

cross_entropy_loss = function() {
  /*
   * Gradient check for the cross-entropy loss function.
   */
  print("Grad checking the cross-entropy loss function.")

  # Generate data
  N = 3  # num examples
  K = 10  # num targets
  pred = rand(rows=N, cols=K, min=0, max=1, pdf="uniform")
  pred = softmax::forward(pred)  # normalized probs
  y = rand(rows=N, cols=K, min=0, max=1, pdf="uniform")
  y = softmax::forward(y)  # normalized probs
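  # Both pred and y are passed through softmax so that each row is a valid probability
  # distribution, as expected by the cross-entropy forward and backward functions.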

  # Compute analytical gradient
  dpred = cross_entropy_loss::backward(pred, y)

  # Grad check
  h = 1e-5
  for (i in 1:nrow(pred)) {
    for (j in 1:ncol(pred)) {
      # Compute numerical derivative
      old = as.scalar(pred[i,j])
      pred[i,j] = old - h
      lossmh = cross_entropy_loss::forward(pred, y)
      pred[i,j] = old + h
      lossph = cross_entropy_loss::forward(pred, y)
      pred[i,j] = old  # reset
      dpred_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
    }
  }
}

cross_entropy_loss2d = function() {
  /*
   * Gradient check for the 2D cross-entropy loss function.
   */
  print("Grad checking the 2D cross-entropy loss function.")

  # Generate data
  N = 3  # num examples
  C = 10  # num targets
  Hin = 5  # example height
  Win = 5  # example width
  pred = rand(rows=N, cols=C*Hin*Win, min=0, max=1, pdf="uniform")
  pred = softmax2d::forward(pred, C)  # normalized probs

  y = rand(rows=N, cols=C*Hin*Win, min=0, max=1, pdf="uniform")
  y = softmax2d::forward(y, C)  # normalized probs

  # Compute analytical gradient
  dpred = cross_entropy_loss2d::backward(pred, y, C)

  # Grad check
  h = 1e-6
  for (i in 1:nrow(pred)) {
    for (j in 1:ncol(pred)) {
      # Compute numerical derivative
      old = as.scalar(pred[i,j])
      pred[i,j] = old - h
      lossmh = cross_entropy_loss2d::forward(pred, y, C)
      pred[i,j] = old + h
      lossph = cross_entropy_loss2d::forward(pred, y, C)
      pred[i,j] = old  # reset
      dpred_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
    }
  }
}

dropout = function() {
  /*
   * Gradient check for the (inverted) dropout layer.
   */
  print("Grad checking the (inverted) dropout layer with L2 loss.")

  # Generate data
  N = 3  # num examples
  M = 100  # num neurons
  p = 0.5  # probability of dropping neuron output
  seed = as.integer(floor(as.scalar(rand(rows=1, cols=1, min=1, max=100000))))  # random seed
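  # Note: a single fixed seed is drawn up front so that every forward pass below (the
  # analytical pass and both perturbed numerical passes) uses the same dropout mask,
  # keeping the loss a deterministic function of X during the check.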
  X = rand(rows=N, cols=M)
  y = rand(rows=N, cols=M)

  # Compute analytical gradients of loss wrt parameters
  [out, mask] = dropout::forward(X, p, seed)
  dout = l2_loss::backward(out, y)
  dX = dropout::backward(dout, X, p, mask)

  # Grad check
  h = 1e-5
  for (i in 1:nrow(X)) {
    for (j in 1:ncol(X)) {
      # Compute numerical derivative
      old = as.scalar(X[i,j])
      X[i,j] = old - h
      [outmh, mask] = dropout::forward(X, p, seed)
      lossmh = l2_loss::forward(outmh, y)
      X[i,j] = old + h
      [outph, mask] = dropout::forward(X, p, seed)
      lossph = l2_loss::forward(outph, y)
      X[i,j] = old  # reset
      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
    }
  }
}

fm = function() {
  /*
   * Gradient check for the factorization machines.
   */
  print("Grad checking the factorization machines with L2 loss.")

  # Generate data
  n = 5 # num examples
  d = 100 # num features
  k = 2 # factorization dimensionality
  X = rand(rows=n, cols=d)
  y = rand(rows=n, cols=1)
  [w0, W, V] = fm::init(n, d, k)

  # Compute analytical gradients of loss wrt parameters
  out = fm::forward(X, w0, W, V)
  dout = l2_loss::backward(out, y)
  [dw0, dW, dV] = fm::backward(dout, X, w0, W, V)

  # Grad check
  h = 1e-5

  print(" - Grad checking w0.")
  for (i in 1:nrow(w0)) {
    for (j in 1:ncol(w0)) {
      # Compute numerical derivative
      old = as.scalar(w0[i,j])
      w0[i,j] = old - h  # h = 1e-5
      outmh = fm::forward(X, w0, W, V)
      lossmh = l2_loss::forward(outmh, y)
      w0[i,j] = old + h  # h = 1e-5
      outph = fm::forward(X, w0, W, V)
      lossph = l2_loss::forward(outph, y)
      w0[i,j] = old  # reset
      dw0_num = (lossph-lossmh) / (2*h) # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(dw0[i,j]), dw0_num, lossph, lossmh)
    }
  }

  print(" - Grad checking W.")
  for (i in 1:nrow(W)) {
    for (j in 1:ncol(W)) {
      # Compute numerical derivative
      old = as.scalar(W[i,j])
      W[i,j] = old - h  # h = 1e-5
      outmh = fm::forward(X, w0, W, V)
      lossmh = l2_loss::forward(outmh, y)
      W[i,j] = old + h  # h = 1e-5
      outph = fm::forward(X, w0, W, V)
      lossph = l2_loss::forward(outph, y)
      W[i,j] = old  # reset
      dW_num = (lossph-lossmh) / (2*h) # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
    }
  }

  print(" - Grad checking V.")
  for (i in 1:nrow(V)) {
    for (j in 1:ncol(V)) {
      # Compute numerical derivative
      old = as.scalar(V[i,j])
      V[i,j] = old - h  # h = 1e-5
      outmh = fm::forward(X, w0, W, V)
      lossmh = l2_loss::forward(outmh, y)
      V[i,j] = old + h  # h = 1e-5
      outph = fm::forward(X, w0, W, V)
      lossph = l2_loss::forward(outph, y)
      V[i,j] = old  # reset
      dV_num = (lossph-lossmh) / (2*h) # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(dV[i,j]), dV_num, lossph, lossmh)
    }
  }
}

l1_loss = function() {
  /*
   * Gradient check for the L1 loss function.
   */
  print("Grad checking the L1 loss function.")

  # Generate data
  N = 3 # num examples
  D = 2 # num targets
  pred = rand(rows=N, cols=D)
  y = rand(rows=N, cols=D)

  # Compute analytical gradient
  dpred = l1_loss::backward(pred, y)

  # Grad check
  h = 1e-5
  for (i in 1:nrow(pred)) {
    for (j in 1:ncol(pred)) {
      # Compute numerical derivative
      old = as.scalar(pred[i,j])
      pred[i,j] = old - h
      lossmh = l1_loss::forward(pred, y)
      pred[i,j] = old + h
      lossph = l1_loss::forward(pred, y)
      pred[i,j] = old  # reset
      dpred_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
    }
  }
}

l1_reg = function() {
  /*
   * Gradient check for the L1 regularization function.
   */
  print("Grad checking the L1 regularization function.")

  # Generate data
  D = 5 # num features
  M = 3 # num neurons
  lambda = 0.01
  W = rand(rows=D, cols=M)

  # Compute analytical gradient
  dW = l1_reg::backward(W, lambda)

  # Grad check
  h = 1e-5
  for (i in 1:nrow(W)) {
    for (j in 1:ncol(W)) {
      # Compute numerical derivative
      old = as.scalar(W[i,j])
      W[i,j] = old - h
      reg_lossmh = l1_reg::forward(W, lambda)
      W[i,j] = old + h
      reg_lossph = l1_reg::forward(W, lambda)
      W[i,j] = old  # reset W[i,j]
      dW_num = (reg_lossph-reg_lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num,
                                                  reg_lossph, reg_lossmh)
    }
  }
}

l2_loss = function() {
  /*
   * Gradient check for the L2 loss function.
   */
  print("Grad checking the L2 loss function.")

  # Generate data
  N = 3 # num examples
  D = 2 # num targets
  pred = rand(rows=N, cols=D)
  y = rand(rows=N, cols=D)

  # Compute analytical gradient
  dpred = l2_loss::backward(pred, y)

  # Grad check
  h = 1e-5
  for (i in 1:nrow(pred)) {
    for (j in 1:ncol(pred)) {
      # Compute numerical derivative
      old = as.scalar(pred[i,j])
      pred[i,j] = old - h
      lossmh = l2_loss::forward(pred, y)
      pred[i,j] = old + h
      lossph = l2_loss::forward(pred, y)
      pred[i,j] = old  # reset
      dpred_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
    }
  }
}

l2_reg = function() {
  /*
   * Gradient check for the L2 regularization function.
   */
  print("Grad checking the L2 regularization function.")

  # Generate data
  D = 5 # num features
  M = 3 # num neurons
  lambda = 0.01
  W = rand(rows=D, cols=M)

  # Compute analytical gradient
  dW = l2_reg::backward(W, lambda)

  # Grad check
  h = 1e-5
  for (i in 1:nrow(W)) {
    for (j in 1:ncol(W)) {
      # Compute numerical derivative
      old = as.scalar(W[i,j])
      W[i,j] = old - h
      reg_lossmh = l2_reg::forward(W, lambda)
      W[i,j] = old + h
      reg_lossph = l2_reg::forward(W, lambda)
      W[i,j] = old  # reset W[i,j]
      dW_num = (reg_lossph-reg_lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num,
                                                  reg_lossph, reg_lossmh)
    }
  }
}

log_loss = function() {
  /*
   * Gradient check for the log loss function.
   */
  print("Grad checking the log loss function.")

  # Generate data
  N = 20 # num examples
  D = 1 # num targets
  pred = rand(rows=N, cols=D, min=0, max=1, pdf="uniform")
  y = round(rand(rows=N, cols=D, min=0, max=1, pdf="uniform"))

  # Compute analytical gradient
  dpred = log_loss::backward(pred, y)

  # Grad check
  h = 1e-5
  for (i in 1:nrow(pred)) {
    for (j in 1:ncol(pred)) {
      # Compute numerical derivative
      old = as.scalar(pred[i,j])
      pred[i,j] = old - h
      lossmh = log_loss::forward(pred, y)
      pred[i,j] = old + h
      lossph = log_loss::forward(pred, y)
      pred[i,j] = old  # reset
      dpred_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
    }
  }
}

lstm = function() {
  /*
   * Gradient check for the LSTM layer.
   */
  print("Grad checking the LSTM layer with L2 loss.")

  # Generate data
  N = 3  # num examples
  D = 5  # num features
  T = 15 # num timesteps (sequence length)
  M = 10 # num neurons
  X = rand(rows=N, cols=T*D)
  yc = rand(rows=N, cols=M)
  out0 = rand(rows=N, cols=M)
  c0 = rand(rows=N, cols=M)
  [W, b, dummy, dummy2] = lstm::init(N, D, M)

  # Test with (1) outputs from all timesteps, and (2) the output from the final timestep
  for (i in 1:2) {
    if (i == 1) {
      return_seq = TRUE
      y = rand(rows=N, cols=T*M)
    }
    else {
      return_seq = FALSE
      y = rand(rows=N, cols=M)
    }

    print(" - Grad checking with return_seq = " + return_seq)

    # Compute analytical gradients of loss wrt parameters
    [out, c, cache_out, cache_c, cache_ifog] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
    dout = l2_loss::backward(out, y)
    dc = l2_loss::backward(c, yc)
    [dX, dW, db, dout0, dc0] = lstm::backward(dout, dc, X, W, b, T, D, return_seq, out0, c0,
                                              cache_out, cache_c, cache_ifog)
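
    # Both the output sequence and the cell state are checked: the analytical backward pass
    # combines dout and dc, so the numerical loss below is the sum of the L2 losses on out
    # (vs. y) and on c (vs. yc).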

    # Grad check
    h = 1e-5
    print("   - Grad checking X.")
    for (i in 1:nrow(X)) {
      for (j in 1:ncol(X)) {
        # Compute numerical derivative
        old = as.scalar(X[i,j])
        X[i,j] = old - h
        [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
        loss_outmh = l2_loss::forward(outmh, y)
        loss_cmh = l2_loss::forward(cmh, yc)
        lossmh = loss_outmh + loss_cmh
        X[i,j] = old + h
        [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
        loss_outph = l2_loss::forward(outph, y)
        loss_cph = l2_loss::forward(cph, yc)
        lossph = loss_outph + loss_cph
        X[i,j] = old  # reset
        dX_num = (lossph-lossmh) / (2*h)  # numerical derivative

        # Check error
        rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
      }
    }

    print("   - Grad checking W.")
    for (i in 1:nrow(W)) {
      for (j in 1:ncol(W)) {
        # Compute numerical derivative
        old = as.scalar(W[i,j])
        W[i,j] = old - h
        [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
        loss_outmh = l2_loss::forward(outmh, y)
        loss_cmh = l2_loss::forward(cmh, yc)
        lossmh = loss_outmh + loss_cmh
        W[i,j] = old + h
        [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
        loss_outph = l2_loss::forward(outph, y)
        loss_cph = l2_loss::forward(cph, yc)
        lossph = loss_outph + loss_cph
        W[i,j] = old  # reset
        dW_num = (lossph-lossmh) / (2*h)  # numerical derivative

        # Check error
        rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
      }
    }

    print("   - Grad checking b.")
    for (i in 1:nrow(b)) {
      for (j in 1:ncol(b)) {
        # Compute numerical derivative
        old = as.scalar(b[i,j])
        b[i,j] = old - h
        [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
        loss_outmh = l2_loss::forward(outmh, y)
        loss_cmh = l2_loss::forward(cmh, yc)
        lossmh = loss_outmh + loss_cmh
        b[i,j] = old + h
        [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
        loss_outph = l2_loss::forward(outph, y)
        loss_cph = l2_loss::forward(cph, yc)
        lossph = loss_outph + loss_cph
        b[i,j] = old  # reset
        db_num = (lossph-lossmh) / (2*h)  # numerical derivative

        # Check error
        rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
      }
    }

    print("   - Grad checking out0.")
    for (i in 1:nrow(out0)) {
      for (j in 1:ncol(out0)) {
        # Compute numerical derivative
        old = as.scalar(out0[i,j])
        out0[i,j] = old - h
        [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
        loss_outmh = l2_loss::forward(outmh, y)
        loss_cmh = l2_loss::forward(cmh, yc)
        lossmh = loss_outmh + loss_cmh
        out0[i,j] = old + h
        [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
        loss_outph = l2_loss::forward(outph, y)
        loss_cph = l2_loss::forward(cph, yc)
        lossph = loss_outph + loss_cph
        out0[i,j] = old  # reset
        dout0_num = (lossph-lossmh) / (2*h)  # numerical derivative

        # Check error
        rel_error = test_util::check_rel_grad_error(as.scalar(dout0[i,j]), dout0_num, lossph, lossmh)
      }
    }

    print("   - Grad checking c0.")
    for (i in 1:nrow(c0)) {
      for (j in 1:ncol(c0)) {
        # Compute numerical derivative
        old = as.scalar(c0[i,j])
        c0[i,j] = old - h
        [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
        loss_outmh = l2_loss::forward(outmh, y)
        loss_cmh = l2_loss::forward(cmh, yc)
        lossmh = loss_outmh + loss_cmh
        c0[i,j] = old + h
        [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0)
        loss_outph = l2_loss::forward(outph, y)
        loss_cph = l2_loss::forward(cph, yc)
        lossph = loss_outph + loss_cph
        c0[i,j] = old  # reset
        dc0_num = (lossph-lossmh) / (2*h)  # numerical derivative

        # Check error
        rel_error = test_util::check_rel_grad_error(as.scalar(dc0[i,j]), dc0_num, lossph, lossmh)
      }
    }
  }
}

max_pool2d = function() {
  /*
   * Gradient check for the 2D max pooling layer.
   */
  print("Grad checking the 2D max pooling layer with L2 loss.")

  # Generate data
  N = 2  # num examples
  C = 2  # num channels
  Hin = 4  # input height
  Win = 4  # input width
  Hf = 2  # pool filter height
  Wf = 2  # pool filter width
  stride = 2
  X = rand(rows=N, cols=C*Hin*Win)

  for (pad in 0:1) {
    print(" - Grad checking w/ pad="+pad+".")
    Hout = as.integer(floor((Hin + 2*pad - Hf)/stride + 1))
    Wout = as.integer(floor((Win + 2*pad - Wf)/stride + 1))
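    # Pre-compute the pooling output size (same formula as the forward pass) so that the
    # target y can be sized before calling forward.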
    y = rand(rows=N, cols=C*Hout*Wout)

    # Compute analytical gradients of loss wrt parameters
    [out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
    dout = l2_loss::backward(out, y)
    dX = max_pool2d::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)

    # Grad check
    h = 1e-5
    for (i in 1:nrow(X)) {
      for (j in 1:ncol(X)) {
        # Compute numerical derivative
        old = as.scalar(X[i,j])
        X[i,j] = old - h
        [outmh, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
        lossmh = l2_loss::forward(outmh, y)
        X[i,j] = old + h
        [outph, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
        lossph = l2_loss::forward(outph, y)
        X[i,j] = old  # reset
        dX_num = (lossph-lossmh) / (2*h)  # numerical derivative

        # Check error
        rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
      }
    }
  }
}

max_pool2d_builtin = function() {
  /*
   * Gradient check for the built-in 2D max pooling layer.
   */
  print("Grad checking the built-in 2D max pooling layer with L2 loss.")

  # Generate data
  N = 2  # num examples
  C = 2  # num channels
  Hin = 4  # input height
  Win = 4  # input width
  Hf = 2  # pool filter height
  Wf = 2  # pool filter width
  stride = 2
  X = rand(rows=N, cols=C*Hin*Win)

  for (pad in 0:1) {
    print(" - Grad checking w/ pad="+pad+".")
    Hout = as.integer(floor((Hin + 2 * pad - Hf) / stride + 1))
    Wout = as.integer(floor((Win + 2 * pad - Wf) / stride + 1))
    y = rand(rows=N, cols=C*Hout*Wout)

    # Compute analytical gradients of loss wrt parameters
    [out, Hout, Wout] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
                                                    pad, pad)
    dout = l2_loss::backward(out, y)
    dX = max_pool2d_builtin::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride,
                                      pad, pad)

    # Grad check
    h = 1e-5
    for (i in 1:nrow(X)) {
      for (j in 1:ncol(X)) {
        # Compute numerical derivative
        old = as.scalar(X[i,j])
        X[i,j] = old - h
        [outmh, Hout, Wout] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
                                                          pad, pad)
        lossmh = l2_loss::forward(outmh, y)
        X[i,j] = old + h
        [outph, Hout, Wout] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
                                                          pad, pad)
        lossph = l2_loss::forward(outph, y)
        X[i,j] = old  # reset
        dX_num = (lossph-lossmh) / (2*h)  # numerical derivative

        # Check error
        rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
      }
    }
  }
}

avg_pool2d_builtin = function() {
  /*
   * Gradient check for the 2D avg pooling layer.
   */
  print("Grad checking the built-in 2D avg pooling layer with L2 loss.")

  # Generate data
  N = 2  # num examples
  C = 2  # num channels
  Hin = 4  # input height
  Win = 4  # input width
  Hf = 2  # pool filter height
  Wf = 2  # pool filter width
  stride = 2
  X = rand(rows=N, cols=C*Hin*Win)

  for (pad in 0:1) {
    print(" - Grad checking w/ pad="+pad+".")
    Hout = as.integer(floor((Hin + 2 * pad - Hf) / stride + 1))
    Wout = as.integer(floor((Win + 2 * pad - Wf) / stride + 1))
    y = rand(rows=N, cols=C*Hout*Wout)

    # Compute analytical gradients of loss wrt parameters
    [out, Hout, Wout] = avg_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
                                                    pad, pad)
    dout = l2_loss::backward(out, y)
    dX = avg_pool2d_builtin::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride,
                                      pad, pad)

    # Grad check
    h = 1e-5
    for (i in 1:nrow(X)) {
      for (j in 1:ncol(X)) {
        # Compute numerical derivative
        old = as.scalar(X[i,j])
        X[i,j] = old - h
        [outmh, Hout, Wout] = avg_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
                                                          pad, pad)
        lossmh = l2_loss::forward(outmh, y)
        X[i,j] = old + h
        [outph, Hout, Wout] = avg_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
                                                          pad, pad)
        lossph = l2_loss::forward(outph, y)
        X[i,j] = old  # reset
        dX_num = (lossph-lossmh) / (2*h)  # numerical derivative

        # Check error
        rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
      }
    }
  }
}

max_pool2d_simple = function() {
  /*
   * Gradient check for the simple reference 2D max pooling layer.
   */
  print("Grad checking the simple reference 2D max pooling layer with L2 loss.")

  # Generate data
  N = 2  # num examples
  C = 2  # num channels
  Hin = 4  # input height
  Win = 4  # input width
  Hf = 2  # pool filter height
  Wf = 2  # pool filter width
  stride = 2
  X = rand(rows=N, cols=C*Hin*Win)

  for (pad in 0:1) {
    print(" - Grad checking w/ pad="+pad+".")
    Hout = as.integer(floor((Hin + 2*pad - Hf)/stride + 1))
    Wout = as.integer(floor((Win + 2*pad - Wf)/stride + 1))
    y = rand(rows=N, cols=C*Hout*Wout)

    # Compute analytical gradients of loss wrt parameters
    [out, Hout, Wout] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
    dout = l2_loss::backward(out, y)
    dX = max_pool2d_simple::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride,
                                     pad, pad)

    # Grad check
    h = 1e-5
    for (i in 1:nrow(X)) {
      for (j in 1:ncol(X)) {
        # Compute numerical derivative
        old = as.scalar(X[i,j])
        X[i,j] = old - h
        [outmh, Hout, Wout] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
                                                         pad, pad)
        lossmh = l2_loss::forward(outmh, y)
        X[i,j] = old + h
        [outph, Hout, Wout] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride,
                                                         pad, pad)
        lossph = l2_loss::forward(outph, y)
        X[i,j] = old  # reset
        dX_num = (lossph-lossmh) / (2*h)  # numerical derivative

        # Check error
        rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
      }
    }
  }
}

upsample2d = function() {
  /*
   * Gradient check for the 2D upsampling layer.
   */
  print("Grad checking the upsample2d layer with L2 loss.")

  # Generate data
  N = 3  # num examples
  C = 2  # num channels
  Hin = 3  # input height
  Win = 3  # input width
  size_h = 2  # upsampling factor, height
  size_w = 2  # upsampling factor, width
  M = C*Hin*Win  # num neurons
  X = rand(rows=N, cols=M, min=-5, max=5)
  y = rand(rows=N, cols=M*size_h*size_w)

  # Compute analytical gradients of loss wrt parameters
  out = upsample2d::forward(X, C, Hin, Win, size_h, size_w)
  dout = l2_loss::backward(out, y)
  dX = upsample2d::backward(dout, C, Hin, Win, size_h, size_w)

  # Grad check
  h = 1e-5
  for (i in 1:nrow(X)) {
    for (j in 1:ncol(X)) {
      # Compute numerical derivative
      old = as.scalar(X[i,j])
      X[i,j] = old - h
      outmh = upsample2d::forward(X, C, Hin, Win, size_h, size_w)
      lossmh = l2_loss::forward(outmh, y)
      X[i,j] = old + h
      outph = upsample2d::forward(X, C, Hin, Win, size_h, size_w)
      lossph = l2_loss::forward(outph, y)
      X[i,j] = old  # reset
      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
    }
  }
}

relu = function() {
  /*
   * Gradient check for the ReLU nonlinearity layer.
   *
   * NOTE: This could result in a false-negative in which the test
   * fails due to a kink being crossed in the nonlinearity.  This
   * occurs when the tests, f(x-h) and f(x+h), end up on opposite
   * sides of the zero threshold of max(0, fx).  For now, just run
   * the tests again.  In the future, we can explicitly check for
   * this and rerun the test automatically.
   */
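  #
  # A hypothetical, explicit version of that check (a sketch, not part of this
  # test) could flag elements whose perturbations straddle the kink at zero,
  # since the centered difference is only valid when relu is smooth on [x-h, x+h]:
  #
  #   kink_crossed = abs(as.scalar(X[i,j])) < h  # TRUE iff x-h and x+h straddle 0
  #   # if kink_crossed, skip or re-randomize X[i,j] before comparing gradients
  #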
  print("Grad checking the ReLU nonlinearity layer with L2 loss.")

  # Generate data
  N = 3 # num examples
  M = 10 # num neurons
  X = rand(rows=N, cols=M, min=-5, max=5)
  y = rand(rows=N, cols=M)

  # Compute analytical gradients of loss wrt parameters
  out = relu::forward(X)
  dout = l2_loss::backward(out, y)
  dX = relu::backward(dout, X)

  # Grad check
  h = 1e-5
  for (i in 1:nrow(X)) {
    for (j in 1:ncol(X)) {
      # Compute numerical derivative
      old = as.scalar(X[i,j])
      X[i,j] = old - h
      outmh = relu::forward(X)
      lossmh = l2_loss::forward(outmh, y)
      X[i,j] = old + h
      outph = relu::forward(X)
      lossph = l2_loss::forward(outph, y)
      X[i,j] = old  # reset
      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
    }
  }
}

rnn = function() {
  /*
   * Gradient check for the simple RNN layer.
   */
  print("Grad checking the simple RNN layer with L2 loss.")

  # Generate data
  N = 3  # num examples
  D = 5  # num features
  T = 15 # num timesteps (sequence length)
  M = 10 # num neurons
  X = rand(rows=N, cols=T*D)
  out0 = rand(rows=N, cols=M)
  [W, b, dummy] = rnn::init(N, D, M)

  # Test with (1) outputs from all timesteps, and (2) the output from the final timestep
  for (i in 1:2) {
    if (i == 1) {
      return_seq = TRUE
      y = rand(rows=N, cols=T*M)
    }
    else {
      return_seq = FALSE
      y = rand(rows=N, cols=M)
    }

    print(" - Grad checking with return_seq = " + return_seq)

    # Compute analytical gradients of loss wrt parameters
    [out, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
    dout = l2_loss::backward(out, y)
    [dX, dW, db, dout0] = rnn::backward(dout, X, W, b, T, D, return_seq, out0, cache_out)

    # Grad check
    h = 1e-5
    print("   - Grad checking X.")
    for (i in 1:nrow(X)) {
      for (j in 1:ncol(X)) {
        # Compute numerical derivative
        old = as.scalar(X[i,j])
        X[i,j] = old - h
        [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
        lossmh = l2_loss::forward(outmh, y)
        X[i,j] = old + h
        [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
        lossph = l2_loss::forward(outph, y)
        X[i,j] = old  # reset
        dX_num = (lossph-lossmh) / (2*h)  # numerical derivative

        # Check error
        rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
      }
    }

    print("   - Grad checking W.")
    for (i in 1:nrow(W)) {
      for (j in 1:ncol(W)) {
        # Compute numerical derivative
        old = as.scalar(W[i,j])
        W[i,j] = old - h
        [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
        lossmh = l2_loss::forward(outmh, y)
        W[i,j] = old + h
        [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
        lossph = l2_loss::forward(outph, y)
        W[i,j] = old  # reset
        dW_num = (lossph-lossmh) / (2*h)  # numerical derivative

        # Check error
        rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
      }
    }

    print("   - Grad checking b.")
    for (i in 1:nrow(b)) {
      for (j in 1:ncol(b)) {
        # Compute numerical derivative
        old = as.scalar(b[i,j])
        b[i,j] = old - h
        [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
        lossmh = l2_loss::forward(outmh, y)
        b[i,j] = old + h
        [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
        lossph = l2_loss::forward(outph, y)
        b[i,j] = old  # reset
        db_num = (lossph-lossmh) / (2*h)  # numerical derivative

        # Check error
        rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
      }
    }

    print("   - Grad checking out0.")
    for (i in 1:nrow(out0)) {
      for (j in 1:ncol(out0)) {
        # Compute numerical derivative
        old = as.scalar(out0[i,j])
        out0[i,j] = old - h
        [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
        lossmh = l2_loss::forward(outmh, y)
        out0[i,j] = old + h
        [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0)
        lossph = l2_loss::forward(outph, y)
        out0[i,j] = old  # reset
        dout0_num = (lossph-lossmh) / (2*h)  # numerical derivative

        # Check error
        rel_error = test_util::check_rel_grad_error(as.scalar(dout0[i,j]), dout0_num,
                                                    lossph, lossmh)
      }
    }
  }
}

scale_shift1d = function() {
  /*
   * Gradient check for the 1D scale & shift layer.
   */
  print("Grad checking the 1D scale & shift layer with L2 loss.")

  # Generate data
  N = 3 # num examples
  D = 100 # num features
  X = rand(rows=N, cols=D)
  y = rand(rows=N, cols=D)
  [gamma, beta] = scale_shift1d::init(D)

  # Compute analytical gradients of loss wrt parameters
  out = scale_shift1d::forward(X, gamma, beta)
  dout = l2_loss::backward(out, y)
  [dX, dgamma, dbeta] = scale_shift1d::backward(dout, out, X, gamma, beta)

  # Grad check
  h = 1e-5
  print(" - Grad checking X.")
  for (i in 1:nrow(X)) {
    for (j in 1:ncol(X)) {
      # Compute numerical derivative
      old = as.scalar(X[i,j])
      X[i,j] = old - h
      outmh = scale_shift1d::forward(X, gamma, beta)
      lossmh = l2_loss::forward(outmh, y)
      X[i,j] = old + h
      outph = scale_shift1d::forward(X, gamma, beta)
      lossph = l2_loss::forward(outph, y)
      X[i,j] = old  # reset
      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
    }
  }

  print(" - Grad checking gamma.")
  for (i in 1:nrow(gamma)) {
    for (j in 1:ncol(gamma)) {
      # Compute numerical derivative
      old = as.scalar(gamma[i,j])
      gamma[i,j] = old - h
      outmh = scale_shift1d::forward(X, gamma, beta)
      lossmh = l2_loss::forward(outmh, y)
      gamma[i,j] = old + h
      outph = scale_shift1d::forward(X, gamma, beta)
      lossph = l2_loss::forward(outph, y)
      gamma[i,j] = old  # reset
      dgamma_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
                                                  lossph, lossmh)
    }
  }

  print(" - Grad checking beta.")
  for (i in 1:nrow(beta)) {
    for (j in 1:ncol(beta)) {
      # Compute numerical derivative
      old = as.scalar(beta[i,j])
      beta[i,j] = old - h
      outmh = scale_shift1d::forward(X, gamma, beta)
      lossmh = l2_loss::forward(outmh, y)
      beta[i,j] = old + h
      outph = scale_shift1d::forward(X, gamma, beta)
      lossph = l2_loss::forward(outph, y)
      beta[i,j] = old  # reset
      dbeta_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
                                                  lossph, lossmh)
    }
  }
}

scale_shift2d = function() {
  /*
   * Gradient check for the 2D scale & shift layer.
   */
  print("Grad checking the 2D scale & shift layer with L2 loss.")

  # Generate data
  N = 3 # num examples
  C = 2  # num channels
  Hin = 5  # input height
  Win = 5  # input width
  X = rand(rows=N, cols=C*Hin*Win)
  y = rand(rows=N, cols=C*Hin*Win)
  [gamma, beta] = scale_shift2d::init(C)

  # Compute analytical gradients of loss wrt parameters
  out = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
  dout = l2_loss::backward(out, y)
  [dX, dgamma, dbeta] = scale_shift2d::backward(dout, out, X, gamma, beta, C, Hin, Win)

  # Grad check
  h = 1e-5
  print(" - Grad checking X.")
  for (i in 1:nrow(X)) {
    for (j in 1:ncol(X)) {
      # Compute numerical derivative
      old = as.scalar(X[i,j])
      X[i,j] = old - h
      outmh = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
      lossmh = l2_loss::forward(outmh, y)
      X[i,j] = old + h
      outph = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
      lossph = l2_loss::forward(outph, y)
      X[i,j] = old  # reset
      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
    }
  }

  print(" - Grad checking gamma.")
  for (i in 1:nrow(gamma)) {
    for (j in 1:ncol(gamma)) {
      # Compute numerical derivative
      old = as.scalar(gamma[i,j])
      gamma[i,j] = old - h
      outmh = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
      lossmh = l2_loss::forward(outmh, y)
      gamma[i,j] = old + h
      outph = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
      lossph = l2_loss::forward(outph, y)
      gamma[i,j] = old  # reset
      dgamma_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num,
                                                  lossph, lossmh)
    }
  }

  print(" - Grad checking beta.")
  for (i in 1:nrow(beta)) {
    for (j in 1:ncol(beta)) {
      # Compute numerical derivative
      old = as.scalar(beta[i,j])
      beta[i,j] = old - h
      outmh = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
      lossmh = l2_loss::forward(outmh, y)
      beta[i,j] = old + h
      outph = scale_shift2d::forward(X, gamma, beta, C, Hin, Win)
      lossph = l2_loss::forward(outph, y)
      beta[i,j] = old  # reset
      dbeta_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num,
                                                  lossph, lossmh)
    }
  }
}

sigmoid = function() {
  /*
   * Gradient check for the sigmoid nonlinearity layer.
   */
  print("Grad checking the sigmoid nonlinearity layer with L2 loss.")

  # Generate data
  N = 3 # num examples
  M = 10 # num neurons
  X = rand(rows=N, cols=M)
  y = rand(rows=N, cols=M)

  # Compute analytical gradients of loss wrt parameters
  out = sigmoid::forward(X)
  dout = l2_loss::backward(out, y)
  dX = sigmoid::backward(dout, X)

  # Grad check
  h = 1e-5
  for (i in 1:nrow(X)) {
    for (j in 1:ncol(X)) {
      # Compute numerical derivative
      old = as.scalar(X[i,j])
      X[i,j] = old - h
      outmh = sigmoid::forward(X)
      lossmh = l2_loss::forward(outmh, y)
      X[i,j] = old + h
      outph = sigmoid::forward(X)
      lossph = l2_loss::forward(outph, y)
      X[i,j] = old  # reset
      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
    }
  }
}

softmax = function() {
  /*
   * Gradient check for the softmax layer.
   */
  print("Grad checking the softmax layer with L2 loss.")

  # Generate data
  N = 3 # num examples
  D = 10 # num classes
  X = rand(rows=N, cols=D)
  y = rand(rows=N, cols=D, min=0, max=1, pdf="uniform")
  y = y / rowSums(y)

  # Compute analytical gradients of loss wrt parameters
  out = softmax::forward(X)
  dout = l2_loss::backward(out, y)
  dX = softmax::backward(dout, X)

  # Grad check
  h = 1e-5
  for (i in 1:nrow(X)) {
    for (j in 1:ncol(X)) {
      # Compute numerical derivative
      old = as.scalar(X[i,j])
      X[i,j] = old - h
      outmh = softmax::forward(X)
      lossmh = l2_loss::forward(outmh, y)
      X[i,j] = old + h
      outph = softmax::forward(X)
      lossph = l2_loss::forward(outph, y)
      X[i,j] = old  # reset
      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
    }
  }
}

softmax2d = function() {
  /*
   * Gradient check for the 2D softmax layer.
   */
  print("Grad checking the 2D softmax layer with L2 loss.")

  # Generate data
  N = 3  # num examples
  C = 10  # num classes
  Hin = 5  # example height
  Win = 5  # example width

  X = rand(rows=N, cols=C*Hin*Win)
  y = rand(rows=N, cols=C*Hin*Win, min=0, max=1, pdf="uniform")
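  # The 2D softmax normalizes over the C classes at each spatial position, so the
  # N x (C*Hin*Win) matrices are reshaped below into (N*Hin*Win) x C form, where each
  # row holds the C class values of one position; the targets can then be normalized
  # per position and compared with the L2 loss.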
  y_C_NHW = util::transpose_NCHW_to_CNHW(y, C)
  y_NHW_C = t(y_C_NHW)
  y_NHW_C = y_NHW_C / rowSums(y_NHW_C)

  # Compute analytical gradients of loss wrt parameters
  out = softmax2d::forward(X, C)
  out_C_NHW = util::transpose_NCHW_to_CNHW(out, C)
  out_NHW_C = t(out_C_NHW)

  dout_NHW_C = l2_loss::backward(out_NHW_C, y_NHW_C)
  dout_C_NHW = t(dout_NHW_C)
  dout = util::transpose_NCHW_to_CNHW(dout_C_NHW, N)
  dX = softmax2d::backward(dout, X, C)

  # Grad check
  h = 1e-5
  for (i in 1:nrow(X)) {
    for (j in 1:ncol(X)) {
      # Compute numerical derivative
      old = as.scalar(X[i,j])
      X[i,j] = old - h
      outmh = softmax2d::forward(X, C)
      outmh_C_NHW = util::transpose_NCHW_to_CNHW(outmh, C)
      outmh_NHW_C = t(outmh_C_NHW)
      lossmh = l2_loss::forward(outmh_NHW_C, y_NHW_C)

      X[i,j] = old + h
      outph = softmax2d::forward(X, C)
      outph_C_NHW = util::transpose_NCHW_to_CNHW(outph, C)
      outph_NHW_C = t(outph_C_NHW)
      lossph = l2_loss::forward(outph_NHW_C, y_NHW_C)
      X[i,j] = old  # reset
      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
    }
  }
}

tanh = function() {
  /*
   * Gradient check for the hyperbolic tangent (tanh) nonlinearity
   * layer.
   */
  print("Grad checking the tanh nonlinearity layer with L2 loss.")

  # Generate data
  N = 3 # num examples
  M = 10 # num neurons
  X = rand(rows=N, cols=M)
  y = rand(rows=N, cols=M)

  # Compute analytical gradients of loss wrt parameters
  out = tanh::forward(X)
  dout = l2_loss::backward(out, y)
  dX = tanh::backward(dout, X)

  # Grad check
  h = 1e-5
  for (i in 1:nrow(X)) {
    for (j in 1:ncol(X)) {
      # Compute numerical derivative
      old = as.scalar(X[i,j])
      X[i,j] = old - h
      outmh = tanh::forward(X)
      lossmh = l2_loss::forward(outmh, y)
      X[i,j] = old + h
      outph = tanh::forward(X)
      lossph = l2_loss::forward(outph, y)
      X[i,j] = old  # reset
      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
    }
  }
}

two_layer_affine_l2_net = function() {
  /*
   * Gradient check for a two-layer, fully-connected, feed-forward
   * network with ReLU nonlinearity and L2 loss.
   *
   * NOTE: This could result in a false-negative in which the test
   * fails due to a kink being crossed in the ReLU nonlinearity.  This
   * occurs when the tests, f(x-h) and f(x+h), end up on opposite
   * sides of the zero threshold of max(0, fx).  For now, just run
   * the tests again.  In the future, we can explicitly check for
   * this and rerun the test automatically.
   */
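  #
  # Here the kink sits at the hidden pre-activations (hout) rather than at X itself,
  # so a hypothetical explicit check would need to inspect the forward pass and flag
  # elements of hout that land within roughly h of zero.
  #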
  print("Grad checking a two-layer, fully-connected, feed-forward network with a ReLU " +
        "nonlinearity, and an L2 loss function.")

  # Generate input data
  N = 1000 # num examples
  D = 100 # num features
  yD = 5 # num targets
  X = rand(rows=N, cols=D, pdf="normal")
  y = rand(rows=N, cols=yD)

  # Create 2-layer, fully-connected network
  M = 10 # number of hidden neurons
  [W1, b1] = affine::init(D, M)
  [W2, b2] = affine::init(M, yD)
  # Different initialization for W2, since this layer feeds into the L2 loss instead of a ReLU.
  W2 = W2 / sqrt(2)

  # Optimize for a short "burn-in" period to move the parameters into their
  # characteristic mode of operation and unmask any real issues.
  print(" - Burn-in:")
  lr = 0.01
  decay = 0.99
  for(i in 1:5) {
    # Compute forward and backward passes of net
    [pred, loss, dX, dW1, db1, dW2, db2] = two_layer_affine_l2_net_run(X, y, W1, b1, W2, b2)
    print("   - L2 loss: " + loss)

    # Optimize with basic SGD
    W1 = W1 - lr * dW1
    b1 = b1 - lr * db1
    W2 = W2 - lr * dW2
    b2 = b2 - lr * db2
    lr = lr * decay
  }

  # Compute analytical gradients
  [pred, loss, dX, dW1, db1, dW2, db2] = two_layer_affine_l2_net_run(X, y, W1, b1, W2, b2)

  # Grad check
  h = 1e-6
  print(" - Grad checking X.")
  for (i in 1:2) {
    for (j in 1:ncol(X)) {
      # Compute numerical derivative
      old_x = as.scalar(X[i,j])
      X[i,j] = old_x - h
      [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
      X[i,j] = old_x + h
      [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
      X[i,j] = old_x  # reset X[i,j]
      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
    }
  }

  print(" - Grad checking W1.")
  for (i in 1:nrow(W1)) {
    for (j in 1:ncol(W1)) {
      # Compute numerical derivative
      old_w = as.scalar(W1[i,j])
      W1[i,j] = old_w - h
      [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
      W1[i,j] = old_w + h
      [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
      W1[i,j] = old_w  # reset W1[i,j]
      dWij_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(dW1[i,j]), dWij_num, lossph, lossmh)
    }
  }

  print(" - Grad checking W2.")
  for (i in 1:nrow(W2)) {
    for (j in 1:ncol(W2)) {
      # Compute numerical derivative
      old_w = as.scalar(W2[i,j])
      W2[i,j] = old_w - h
      [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
      W2[i,j] = old_w + h
      [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
      W2[i,j] = old_w  # reset W2[i,j]
      dWij_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(dW2[i,j]), dWij_num, lossph, lossmh)
    }
  }

  print(" - Grad checking b1.")
  for (i in 1:nrow(b1)) {
    for (j in 1:ncol(b1)) {
      # Compute numerical derivative
      old_b = as.scalar(b1[i,j])
      b1[i,j] = old_b - h
      [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
      b1[i,j] = old_b + h
      [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
      b1[i,j] = old_b  # reset b1[i,j]
      dbij_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(db1[i,j]), dbij_num, lossph, lossmh)
    }
  }

  print(" - Grad checking b2.")
  for (i in 1:nrow(b2)) {
    for (j in 1:ncol(b2)) {
      # Compute numerical derivative
      old_b = as.scalar(b2[i,j])
      b2[i,j] = old_b - h
      [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
      b2[i,j] = old_b + h
      [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
      b2[i,j] = old_b  # reset b2[i,j]
      dbij_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(db2[i,j]), dbij_num, lossph, lossmh)
    }
  }
}

/*
 * Test network with forward/backward functions.
 */
two_layer_affine_l2_net_run = function(matrix[double] X, matrix[double] y,
                                       matrix[double] W1, matrix[double] b1,
                                       matrix[double] W2, matrix[double] b2)
    return (matrix[double] pred, double loss,
            matrix[double] dX,
            matrix[double] dW1, matrix[double] db1,
            matrix[double] dW2, matrix[double] db2) {
  # Compute forward pass
  [loss, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)

  # Compute backward pass
  [dX, dpred, daout, dhout, dW1, db1, dW2, db2] =
      two_layer_affine_l2_net_backward(X, y, pred, aout, hout, W1, b1, W2, b2)
}

two_layer_affine_l2_net_forward = function(matrix[double] X, matrix[double] y,
                                           matrix[double] W1, matrix[double] b1,
                                           matrix[double] W2, matrix[double] b2)
    return (double loss, matrix[double] pred, matrix[double] aout, matrix[double] hout) {
  # Compute forward pass
  hout = affine::forward(X, W1, b1)
  aout = relu::forward(hout)
  pred = affine::forward(aout, W2, b2)

  # Compute loss
  loss = l2_loss::forward(pred, y)
}

two_layer_affine_l2_net_backward = function(matrix[double] X, matrix[double] y, matrix[double] pred,
                                            matrix[double] aout, matrix[double] hout,
                                            matrix[double] W1, matrix[double] b1,
                                            matrix[double] W2, matrix[double] b2)
    return (matrix[double] dX, matrix[double] dpred,
            matrix[double] daout, matrix[double] dhout,
            matrix[double] dW1, matrix[double] db1, matrix[double] dW2, matrix[double] db2) {
  # Compute backward pass
  dpred = l2_loss::backward(pred, y)
  [daout, dW2, db2] = affine::backward(dpred, aout, W2, b2)
  dhout = relu::backward(daout, hout)
  [dX, dW1, db1] = affine::backward(dhout, X, W1, b1)
}

elu = function() {
  /*
   * Gradient check for the ELU nonlinearity layer.
   */
  print("Grad checking the ELU nonlinearity layer with L2 loss.")

  # Generate data
  N = 3 # num examples
  M = 10 # num neurons
  X = rand(rows=N, cols=M, min=-5, max=5)
  y = rand(rows=N, cols=M)

  # Compute analytical gradients of loss wrt parameters
  out = elu::forward(X, 1)
  dout = l2_loss::backward(out, y)
  dX = elu::backward(dout, X, 1)

  # Grad check
  h = 1e-5
  print(" - Grad checking X.")
  for (i in 1:nrow(X)) {
    for (j in 1:ncol(X)) {
      # Compute numerical derivative
      old = as.scalar(X[i,j])
      X[i,j] = old - h
      outmh = elu::forward(X, 1)
      lossmh = l2_loss::forward(outmh, y)
      X[i,j] = old + h
      outph = elu::forward(X, 1)
      lossph = l2_loss::forward(outph, y)
      X[i,j] = old  # reset
      dX_num = (lossph-lossmh) / (2*h)  # numerical derivative

      # Check error
      rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
    }
  }
}