# scripts/nn/examples/mnist_lenet.dml
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
/*
* MNIST LeNet Example
*/
# Imports
source("nn/layers/affine.dml") as affine
source("nn/layers/conv2d_builtin.dml") as conv2d
source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
source("nn/layers/dropout.dml") as dropout
source("nn/layers/l2_reg.dml") as l2_reg
source("nn/layers/max_pool2d_builtin.dml") as max_pool2d
source("nn/layers/relu.dml") as relu
source("nn/layers/softmax.dml") as softmax
source("nn/optim/sgd_nesterov.dml") as sgd_nesterov
train = function(matrix[double] X, matrix[double] Y,
matrix[double] X_val, matrix[double] Y_val,
int C, int Hin, int Win, int epochs)
return (matrix[double] W1, matrix[double] b1,
matrix[double] W2, matrix[double] b2,
matrix[double] W3, matrix[double] b3,
matrix[double] W4, matrix[double] b4) {
/*
* Trains a convolutional net using the "LeNet" architecture.
*
* The input matrix, X, has N examples, each represented as a 3D
* volume unrolled into a single vector. The targets, Y, have K
* classes, and are one-hot encoded.
*
* Inputs:
* - X: Input data matrix, of shape (N, C*Hin*Win).
* - Y: Target matrix, of shape (N, K).
* - X_val: Input validation data matrix, of shape (N_val, C*Hin*Win).
* - Y_val: Target validation matrix, of shape (N_val, K).
* - C: Number of input channels (dimensionality of input depth).
* - Hin: Input height.
* - Win: Input width.
* - epochs: Total number of full training passes over the data set.
*
* Outputs:
* - W1: 1st layer weights (parameters) matrix, of shape (F1, C*Hf*Wf).
* - b1: 1st layer biases vector, of shape (F1, 1).
* - W2: 2nd layer weights (parameters) matrix, of shape (F2, F1*Hf*Wf).
* - b2: 2nd layer biases vector, of shape (F2, 1).
* - W3: 3rd layer weights (parameters) matrix, of shape (F2*(Hin/4)*(Win/4), N3).
* - b3: 3rd layer biases vector, of shape (1, N3).
* - W4: 4th layer weights (parameters) matrix, of shape (N3, K).
* - b4: 4th layer biases vector, of shape (1, K).
*/
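# Example call (a minimal sketch; single-channel 28x28 MNIST-sized inputs and
# an illustrative epoch count):
#   [W1, b1, W2, b2, W3, b3, W4, b4] = train(X, Y, X_val, Y_val, 1, 28, 28, 3)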
N = nrow(X)
K = ncol(Y)
# Create network:
# conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> affine3 -> relu3 -> affine4 -> softmax
Hf = 5 # filter height
Wf = 5 # filter width
stride = 1
pad = 2 # For same dimensions, (Hf - stride) / 2
F1 = 32 # num conv filters in conv1
F2 = 64 # num conv filters in conv2
N3 = 512 # num nodes in affine3
# Note: affine4 has K nodes, which is equal to the number of target dimensions (num classes)
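# Shape check: with "same" padding, Hout = (Hin + 2*pad - Hf)/stride + 1
#                                        = (Hin + 4 - 5)/1 + 1 = Hin,
# and each 2x2 max pool halves the spatial dims, so the volume entering
# affine3 is F2*(Hin/2/2)*(Win/2/2) = F2*(Hin/4)*(Win/4), as used below.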
[W1, b1] = conv2d::init(F1, C, Hf, Wf) # inputs: (N, C*Hin*Win)
[W2, b2] = conv2d::init(F2, F1, Hf, Wf) # inputs: (N, F1*(Hin/2)*(Win/2))
[W3, b3] = affine::init(F2*(Hin/2/2)*(Win/2/2), N3) # inputs: (N, F2*(Hin/2/2)*(Win/2/2))
[W4, b4] = affine::init(N3, K) # inputs: (N, N3)
W4 = W4 / sqrt(2)  # different initialization, since this layer feeds into softmax instead of relu
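# (Assuming the nn init routines are He-style, i.e. N(0,1) draws scaled by
# sqrt(2/fan-in) to suit relu layers, dividing by sqrt(2) leaves W4 with
# Xavier-style scaling, which is better matched to a softmax output.)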
# Initialize SGD w/ Nesterov momentum optimizer
lr = 0.01 # learning rate
mu = 0.9  # momentum
decay = 0.95 # learning rate decay constant
vW1 = sgd_nesterov::init(W1); vb1 = sgd_nesterov::init(b1)
vW2 = sgd_nesterov::init(W2); vb2 = sgd_nesterov::init(b2)
vW3 = sgd_nesterov::init(W3); vb3 = sgd_nesterov::init(b3)
vW4 = sgd_nesterov::init(W4); vb4 = sgd_nesterov::init(b4)
# Regularization
lambda = 5e-04
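# (Only the weight matrices are regularized below, not the biases; the l2_reg
# layer contributes (lambda/2) * sum(W^2) to the loss for each weight matrix.)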
# Optimize
print("Starting optimization")
batch_size = 64
iters = ceil(N / batch_size)
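# Note: each epoch makes one pass over the data; the %% N in the batch
# indexing below is only a safeguard, and the last batch of an epoch may
# contain fewer than batch_size examples when N is not a multiple of it.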
for (e in 1:epochs) {
for(i in 1:iters) {
# Get next batch
beg = ((i-1) * batch_size) %% N + 1
end = min(N, beg + batch_size - 1)
X_batch = X[beg:end,]
y_batch = Y[beg:end,]
# Compute forward pass
## layer 1: conv1 -> relu1 -> pool1
[outc1, Houtc1, Woutc1] = conv2d::forward(X_batch, W1, b1, C, Hin, Win, Hf, Wf,
stride, stride, pad, pad)
outr1 = relu::forward(outc1)
[outp1, Houtp1, Woutp1] = max_pool2d::forward(outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2,
strideh=2, stridew=2, padh=0, padw=0)
## layer 2: conv2 -> relu2 -> pool2
[outc2, Houtc2, Woutc2] = conv2d::forward(outp1, W2, b2, F1, Houtp1, Woutp1, Hf, Wf,
stride, stride, pad, pad)
outr2 = relu::forward(outc2)
[outp2, Houtp2, Woutp2] = max_pool2d::forward(outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2,
strideh=2, stridew=2, padh=0, padw=0)
## layer 3: affine3 -> relu3 -> dropout
outa3 = affine::forward(outp2, W3, b3)
outr3 = relu::forward(outa3)
[outd3, maskd3] = dropout::forward(outr3, 0.5, -1)
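# (0.5 is the keep probability; a seed of -1 lets the dropout layer draw its own seed)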
## layer 4: affine4 -> softmax
outa4 = affine::forward(outd3, W4, b4)
probs = softmax::forward(outa4)
# Compute loss & accuracy for training & validation data every 100 iterations.
if (i %% 100 == 0) {
# Compute training loss & accuracy
loss_data = cross_entropy_loss::forward(probs, y_batch)
loss_reg_W1 = l2_reg::forward(W1, lambda)
loss_reg_W2 = l2_reg::forward(W2, lambda)
loss_reg_W3 = l2_reg::forward(W3, lambda)
loss_reg_W4 = l2_reg::forward(W4, lambda)
loss = loss_data + loss_reg_W1 + loss_reg_W2 + loss_reg_W3 + loss_reg_W4
accuracy = mean(rowIndexMax(probs) == rowIndexMax(y_batch))
# Compute validation loss & accuracy
probs_val = predict(X_val, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)
loss_val = cross_entropy_loss::forward(probs_val, Y_val)
accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(Y_val))
# Output results
print("Epoch: " + e + ", Iter: " + i + ", Train Loss: " + loss + ", Train Accuracy: "
+ accuracy + ", Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val)
}
# Compute data backward pass
## loss:
dprobs = cross_entropy_loss::backward(probs, y_batch)
## layer 4: affine4 -> softmax
douta4 = softmax::backward(dprobs, outa4)
[doutd3, dW4, db4] = affine::backward(douta4, outr3, W4, b4)
## layer 3: affine3 -> relu3 -> dropout
doutr3 = dropout::backward(doutd3, outr3, 0.5, maskd3)
douta3 = relu::backward(doutr3, outa3)
[doutp2, dW3, db3] = affine::backward(douta3, outp2, W3, b3)
## layer 2: conv2 -> relu2 -> pool2
doutr2 = max_pool2d::backward(doutp2, Houtp2, Woutp2, outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2,
strideh=2, stridew=2, padh=0, padw=0)
doutc2 = relu::backward(doutr2, outc2)
[doutp1, dW2, db2] = conv2d::backward(doutc2, Houtc2, Woutc2, outp1, W2, b2, F1,
Houtp1, Woutp1, Hf, Wf, stride, stride, pad, pad)
## layer 1: conv1 -> relu1 -> pool1
doutr1 = max_pool2d::backward(doutp1, Houtp1, Woutp1, outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2,
strideh=2, stridew=2, padh=0, padw=0)
doutc1 = relu::backward(doutr1, outc1)
[dX_batch, dW1, db1] = conv2d::backward(doutc1, Houtc1, Woutc1, X_batch, W1, b1, C, Hin, Win,
Hf, Wf, stride, stride, pad, pad)
# Compute regularization backward pass
dW1_reg = l2_reg::backward(W1, lambda)
dW2_reg = l2_reg::backward(W2, lambda)
dW3_reg = l2_reg::backward(W3, lambda)
dW4_reg = l2_reg::backward(W4, lambda)
dW1 = dW1 + dW1_reg
dW2 = dW2 + dW2_reg
dW3 = dW3 + dW3_reg
dW4 = dW4 + dW4_reg
# Optimize with SGD w/ Nesterov momentum
[W1, vW1] = sgd_nesterov::update(W1, dW1, lr, mu, vW1)
[b1, vb1] = sgd_nesterov::update(b1, db1, lr, mu, vb1)
[W2, vW2] = sgd_nesterov::update(W2, dW2, lr, mu, vW2)
[b2, vb2] = sgd_nesterov::update(b2, db2, lr, mu, vb2)
[W3, vW3] = sgd_nesterov::update(W3, dW3, lr, mu, vW3)
[b3, vb3] = sgd_nesterov::update(b3, db3, lr, mu, vb3)
[W4, vW4] = sgd_nesterov::update(W4, dW4, lr, mu, vW4)
[b4, vb4] = sgd_nesterov::update(b4, db4, lr, mu, vb4)
}
# Anneal momentum towards 0.999
#mu = mu + (0.999 - mu)/(1+epochs-e)
# Decay learning rate
lr = lr * decay
}
}
predict = function(matrix[double] X, int C, int Hin, int Win,
matrix[double] W1, matrix[double] b1,
matrix[double] W2, matrix[double] b2,
matrix[double] W3, matrix[double] b3,
matrix[double] W4, matrix[double] b4)
return (matrix[double] probs) {
/*
* Computes the class probability predictions of a convolutional
* net using the "LeNet" architecture.
*
* The input matrix, X, has N examples, each represented as a 3D
* volume unrolled into a single vector.
*
* Inputs:
* - X: Input data matrix, of shape (N, C*Hin*Win).
* - C: Number of input channels (dimensionality of input depth).
* - Hin: Input height.
* - Win: Input width.
* - W1: 1st layer weights (parameters) matrix, of shape (F1, C*Hf*Wf).
* - b1: 1st layer biases vector, of shape (F1, 1).
* - W2: 2nd layer weights (parameters) matrix, of shape (F2, F1*Hf*Wf).
* - b2: 2nd layer biases vector, of shape (F2, 1).
* - W3: 3rd layer weights (parameters) matrix, of shape (F2*(Hin/4)*(Win/4), N3).
* - b3: 3rd layer biases vector, of shape (1, N3).
* - W4: 4th layer weights (parameters) matrix, of shape (N3, K).
* - b4: 4th layer biases vector, of shape (1, K).
*
* Outputs:
* - probs: Class probabilities, of shape (N, K).
*/
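# Example call (a sketch, reusing parameters returned by train()):
#   probs = predict(X, 1, 28, 28, W1, b1, W2, b2, W3, b3, W4, b4)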
N = nrow(X)
# Network:
# conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> affine3 -> relu3 -> affine4 -> softmax
Hf = 5 # filter height
Wf = 5 # filter width
stride = 1
pad = 2 # For same dimensions, (Hf - stride) / 2
F1 = nrow(W1) # num conv filters in conv1
F2 = nrow(W2) # num conv filters in conv2
N3 = ncol(W3) # num nodes in affine3
K = ncol(W4) # num nodes in affine4, equal to number of target dimensions (num classes)
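# Note: Hf, Wf, stride, and pad above must match the values used in train();
# F1, F2, N3, and K are recovered from the shapes of the trained weights.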
# Compute predictions over mini-batches
probs = matrix(0, rows=N, cols=K)
batch_size = 64
iters = ceil(N / batch_size)
for(i in 1:iters) {
# Get next batch
beg = ((i-1) * batch_size) %% N + 1
end = min(N, beg + batch_size - 1)
X_batch = X[beg:end,]
# Compute forward pass
## layer 1: conv1 -> relu1 -> pool1
[outc1, Houtc1, Woutc1] = conv2d::forward(X_batch, W1, b1, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
outr1 = relu::forward(outc1)
[outp1, Houtp1, Woutp1] = max_pool2d::forward(outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2,
strideh=2, stridew=2, padh=0, padw=0)
## layer 2: conv2 -> relu2 -> pool2
[outc2, Houtc2, Woutc2] = conv2d::forward(outp1, W2, b2, F1, Houtp1, Woutp1, Hf, Wf,
stride, stride, pad, pad)
outr2 = relu::forward(outc2)
[outp2, Houtp2, Woutp2] = max_pool2d::forward(outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2,
strideh=2, stridew=2, padh=0, padw=0)
## layer 3: affine3 -> relu3
outa3 = affine::forward(outp2, W3, b3)
outr3 = relu::forward(outa3)
## layer 4: affine4 -> softmax
outa4 = affine::forward(outr3, W4, b4)
probs_batch = softmax::forward(outa4)
# Store predictions
probs[beg:end,] = probs_batch
}
}
eval = function(matrix[double] probs, matrix[double] Y)
return (double loss, double accuracy) {
/*
* Evaluates a convolutional net using the "LeNet" architecture.
*
* The probs matrix contains the class probability predictions
* of K classes over N examples. The targets, Y, have K classes,
* and are one-hot encoded.
*
* Inputs:
* - probs: Class probabilities, of shape (N, K).
* - Y: Target matrix, of shape (N, K).
*
* Outputs:
* - loss: Scalar loss, of shape (1).
* - accuracy: Scalar accuracy, of shape (1).
*/
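# Example call (a sketch): [loss, accuracy] = eval(probs, Y)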
# Compute loss & accuracy
loss = cross_entropy_loss::forward(probs, Y)
correct_pred = rowIndexMax(probs) == rowIndexMax(Y)
accuracy = mean(correct_pred)
}
generate_dummy_data = function()
return (matrix[double] X, matrix[double] Y, int C, int Hin, int Win) {
/*
* Generate a dummy dataset similar to the MNIST dataset.
*
* Outputs:
* - X: Input data matrix, of shape (N, D).
* - Y: Target matrix, of shape (N, K).
* - C: Number of input channels (dimensionality of input depth).
* - Hin: Input height.
* - Win: Input width.
*/
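# Example call: [X, Y, C, Hin, Win] = generate_dummy_data()  # 1024 MNIST-sized examples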
# Generate dummy input data
N = 1024 # num examples
C = 1 # num input channels
Hin = 28 # input height
Win = 28 # input width
K = 10 # num target classes
X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
classes = round(rand(rows=N, cols=1, min=1, max=K, pdf="uniform"))
Y = table(seq(1, N), classes) # one-hot encoding
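# (Caveat: if some label in 1..K is never drawn, table() yields fewer than K
# columns; table(seq(1, N), classes, N, K) would pin the output shape.)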
}
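
# End-to-end sketch (hypothetical driver script; assumes execution with the
# scripts/nn parent directory on the source path, as in the other nn examples):
#
#   source("nn/examples/mnist_lenet.dml") as mnist_lenet
#   [X, Y, C, Hin, Win] = mnist_lenet::generate_dummy_data()
#   [X_val, Y_val, C, Hin, Win] = mnist_lenet::generate_dummy_data()
#   [W1, b1, W2, b2, W3, b3, W4, b4] = mnist_lenet::train(X, Y, X_val, Y_val,
#                                                         C, Hin, Win, 1)
#   probs = mnist_lenet::predict(X, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)
#   [loss, accuracy] = mnist_lenet::eval(probs, Y)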