# scripts/nn/examples/mnist_lenet.dml
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
/*
* MNIST LeNet Example
*/
# Imports
source("nn/layers/affine.dml") as affine
source("nn/layers/conv2d_builtin.dml") as conv2d
source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
source("nn/layers/dropout.dml") as dropout
source("nn/layers/l2_reg.dml") as l2_reg
source("nn/layers/max_pool2d_builtin.dml") as max_pool2d
source("nn/layers/relu.dml") as relu
source("nn/layers/softmax.dml") as softmax
source("nn/optim/sgd_nesterov.dml") as sgd_nesterov
train = function(matrix[double] X, matrix[double] Y,
matrix[double] X_val, matrix[double] Y_val,
int C, int Hin, int Win, int epochs)
return (matrix[double] W1, matrix[double] b1,
matrix[double] W2, matrix[double] b2,
matrix[double] W3, matrix[double] b3,
matrix[double] W4, matrix[double] b4) {
/*
* Trains a convolutional net using the "LeNet" architecture.
*
* The input matrix, X, has N examples, each represented as a 3D
* volume unrolled into a single vector. The targets, Y, have K
* classes, and are one-hot encoded.
*
* Inputs:
* - X: Input data matrix, of shape (N, C*Hin*Win).
* - Y: Target matrix, of shape (N, K).
* - X_val: Input validation data matrix, of shape (N_val, C*Hin*Win).
* - Y_val: Target validation matrix, of shape (N_val, K).
* - C: Number of input channels (dimensionality of input depth).
* - Hin: Input height.
* - Win: Input width.
* - epochs: Total number of full training passes over the data set.
*
* Outputs:
* - W1: 1st layer weights (parameters) matrix, of shape (F1, C*Hf*Wf).
* - b1: 1st layer biases vector, of shape (F1, 1).
* - W2: 2nd layer weights (parameters) matrix, of shape (F2, F1*Hf*Wf).
* - b2: 2nd layer biases vector, of shape (F2, 1).
* - W3: 3rd layer weights (parameters) matrix, of shape (F2*(Hin/4)*(Win/4), N3).
* - b3: 3rd layer biases vector, of shape (1, N3).
* - W4: 4th layer weights (parameters) matrix, of shape (N3, K).
* - b4: 4th layer biases vector, of shape (1, K).
*/
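# Example call (a minimal sketch; single-channel 28x28 MNIST-sized inputs and
# an illustrative epoch count):
#   [W1, b1, W2, b2, W3, b3, W4, b4] = train(X, Y, X_val, Y_val, 1, 28, 28, 3)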
N = nrow(X)
K = ncol(Y)
# Create network:
# conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> affine3 -> relu3 -> affine4 -> softmax
Hf = 5 # filter height
Wf = 5 # filter width
stride = 1
pad = 2 # For same dimensions, (Hf - stride) / 2
F1 = 32 # num conv filters in conv1
F2 = 64 # num conv filters in conv2
N3 = 512 # num nodes in affine3
# Note: affine4 has K nodes, which is equal to the number of target dimensions (num classes)
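# Shape check: with "same" padding, Hout = (Hin + 2*pad - Hf)/stride + 1
#                                        = (Hin + 4 - 5)/1 + 1 = Hin,
# and each 2x2 max pool halves the spatial dims, so the volume entering
# affine3 is F2*(Hin/2/2)*(Win/2/2) = F2*(Hin/4)*(Win/4), as used below.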
[W1, b1] = conv2d::init(F1, C, Hf, Wf) # inputs: (N, C*Hin*Win)
[W2, b2] = conv2d::init(F2, F1, Hf, Wf) # inputs: (N, F1*(Hin/2)*(Win/2))
[W3, b3] = affine::init(F2*(Hin/2/2)*(Win/2/2), N3) # inputs: (N, F2*(Hin/2/2)*(Win/2/2))
[W4, b4] = affine::init(N3, K) # inputs: (N, N3)
W4 = W4 / sqrt(2)  # different initialization, since this layer feeds into softmax instead of relu
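# (Assuming the nn init routines are He-style, i.e. N(0,1) draws scaled by
# sqrt(2/fan-in) to suit relu layers, dividing by sqrt(2) leaves W4 with
# Xavier-style scaling, which is better matched to a softmax output.)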
# Initialize SGD w/ Nesterov momentum optimizer
lr = 0.01 # learning rate
mu = 0.9  # momentum
decay = 0.95 # learning rate decay constant
vW1 = sgd_nesterov::init(W1); vb1 = sgd_nesterov::init(b1)
vW2 = sgd_nesterov::init(W2); vb2 = sgd_nesterov::init(b2)
vW3 = sgd_nesterov::init(W3); vb3 = sgd_nesterov::init(b3)
vW4 = sgd_nesterov::init(W4); vb4 = sgd_nesterov::init(b4)
# Regularization
lambda = 5e-04
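# (Only the weight matrices are regularized below, not the biases; the l2_reg
# layer contributes (lambda/2) * sum(W^2) to the loss for each weight matrix.)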
# Optimize
print("Starting optimization")
batch_size = 64
iters = ceil(N / batch_size)
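# Note: each epoch makes one pass over the data; the %% N in the batch
# indexing below is only a safeguard, and the last batch of an epoch may
# contain fewer than batch_size examples when N is not a multiple of it.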
for (e in 1:epochs) {
for(i in 1:iters) {
# Get next batch
beg = ((i-1) * batch_size) %% N + 1
end = min(N, beg + batch_size - 1)
X_batch = X[beg:end,]
y_batch = Y[beg:end,]
# Compute forward pass
## layer 1: conv1 -> relu1 -> pool1
[outc1, Houtc1, Woutc1] = conv2d::forward(X_batch, W1, b1, C, Hin, Win, Hf, Wf,
stride, stride, pad, pad)
outr1 = relu::forward(outc1)
[outp1, Houtp1, Woutp1] = max_pool2d::forward(outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2,
strideh=2, stridew=2, padh=0, padw=0)
## layer 2: conv2 -> relu2 -> pool2
[outc2, Houtc2, Woutc2] = conv2d::forward(outp1, W2, b2, F1, Houtp1, Woutp1, Hf, Wf,
stride, stride, pad, pad)
outr2 = relu::forward(outc2)
[outp2, Houtp2, Woutp2] = max_pool2d::forward(outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2,
strideh=2, stridew=2, padh=0, padw=0)
## layer 3: affine3 -> relu3 -> dropout
outa3 = affine::forward(outp2, W3, b3)
outr3 = relu::forward(outa3)
[outd3, maskd3] = dropout::forward(outr3, 0.5, -1)
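# (0.5 is the keep probability; a seed of -1 lets the dropout layer draw its own seed)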
## layer 4: affine4 -> softmax
outa4 = affine::forward(outd3, W4, b4)
probs = softmax::forward(outa4)
# Compute loss & accuracy for training & validation data every 100 iterations.
if (i %% 100 == 0) {
# Compute training loss & accuracy
loss_data = cross_entropy_loss::forward(probs, y_batch)
loss_reg_W1 = l2_reg::forward(W1, lambda)
loss_reg_W2 = l2_reg::forward(W2, lambda)
loss_reg_W3 = l2_reg::forward(W3, lambda)
loss_reg_W4 = l2_reg::forward(W4, lambda)
loss = loss_data + loss_reg_W1 + loss_reg_W2 + loss_reg_W3 + loss_reg_W4
accuracy = mean(rowIndexMax(probs) == rowIndexMax(y_batch))
# Compute validation loss & accuracy
probs_val = predict(X_val, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)
loss_val = cross_entropy_loss::forward(probs_val, Y_val)
accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(Y_val))
# Output results
print("Epoch: " + e + ", Iter: " + i + ", Train Loss: " + loss + ", Train Accuracy: "
+ accuracy + ", Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val)
}
# Compute data backward pass
## loss:
dprobs = cross_entropy_loss::backward(probs, y_batch)
## layer 4: affine4 -> softmax
douta4 = softmax::backward(dprobs, outa4)
[doutd3, dW4, db4] = affine::backward(douta4, outr3, W4, b4)
## layer 3: affine3 -> relu3 -> dropout
doutr3 = dropout::backward(doutd3, outr3, 0.5, maskd3)
douta3 = relu::backward(doutr3, outa3)
[doutp2, dW3, db3] = affine::backward(douta3, outp2, W3, b3)
## layer 2: conv2 -> relu2 -> pool2
doutr2 = max_pool2d::backward(doutp2, Houtp2, Woutp2, outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2,
strideh=2, stridew=2, padh=0, padw=0)
doutc2 = relu::backward(doutr2, outc2)
[doutp1, dW2, db2] = conv2d::backward(doutc2, Houtc2, Woutc2, outp1, W2, b2, F1,
Houtp1, Woutp1, Hf, Wf, stride, stride, pad, pad)
## layer 1: conv1 -> relu1 -> pool1
doutr1 = max_pool2d::backward(doutp1, Houtp1, Woutp1, outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2,
strideh=2, stridew=2, padh=0, padw=0)
doutc1 = relu::backward(doutr1, outc1)
[dX_batch, dW1, db1] = conv2d::backward(doutc1, Houtc1, Woutc1, X_batch, W1, b1, C, Hin, Win,
Hf, Wf, stride, stride, pad, pad)
# Compute regularization backward pass
dW1_reg = l2_reg::backward(W1, lambda)
dW2_reg = l2_reg::backward(W2, lambda)
dW3_reg = l2_reg::backward(W3, lambda)
dW4_reg = l2_reg::backward(W4, lambda)
dW1 = dW1 + dW1_reg
dW2 = dW2 + dW2_reg
dW3 = dW3 + dW3_reg
dW4 = dW4 + dW4_reg
# Optimize with SGD w/ Nesterov momentum
[W1, vW1] = sgd_nesterov::update(W1, dW1, lr, mu, vW1)
[b1, vb1] = sgd_nesterov::update(b1, db1, lr, mu, vb1)
[W2, vW2] = sgd_nesterov::update(W2, dW2, lr, mu, vW2)
[b2, vb2] = sgd_nesterov::update(b2, db2, lr, mu, vb2)
[W3, vW3] = sgd_nesterov::update(W3, dW3, lr, mu, vW3)
[b3, vb3] = sgd_nesterov::update(b3, db3, lr, mu, vb3)
[W4, vW4] = sgd_nesterov::update(W4, dW4, lr, mu, vW4)
[b4, vb4] = sgd_nesterov::update(b4, db4, lr, mu, vb4)
}
# Anneal momentum towards 0.999
#mu = mu + (0.999 - mu)/(1+epochs-e)
# Decay learning rate
lr = lr * decay
}
}
predict = function(matrix[double] X, int C, int Hin, int Win,
matrix[double] W1, matrix[double] b1,
matrix[double] W2, matrix[double] b2,
matrix[double] W3, matrix[double] b3,
matrix[double] W4, matrix[double] b4)
return (matrix[double] probs) {
/*
* Computes the class probability predictions of a convolutional
* net using the "LeNet" architecture.
*
* The input matrix, X, has N examples, each represented as a 3D
* volume unrolled into a single vector.
*
* Inputs:
* - X: Input data matrix, of shape (N, C*Hin*Win).
* - C: Number of input channels (dimensionality of input depth).
* - Hin: Input height.
* - Win: Input width.
* - W1: 1st layer weights (parameters) matrix, of shape (F1, C*Hf*Wf).
* - b1: 1st layer biases vector, of shape (F1, 1).
* - W2: 2nd layer weights (parameters) matrix, of shape (F2, F1*Hf*Wf).
* - b2: 2nd layer biases vector, of shape (F2, 1).
* - W3: 3rd layer weights (parameters) matrix, of shape (F2*(Hin/4)*(Win/4), N3).
* - b3: 3rd layer biases vector, of shape (1, N3).
* - W4: 4th layer weights (parameters) matrix, of shape (N3, K).
* - b4: 4th layer biases vector, of shape (1, K).
*
* Outputs:
* - probs: Class probabilities, of shape (N, K).
*/
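# Example call (a sketch, reusing parameters returned by train()):
#   probs = predict(X, 1, 28, 28, W1, b1, W2, b2, W3, b3, W4, b4)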
N = nrow(X)
# Network:
# conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> affine3 -> relu3 -> affine4 -> softmax
Hf = 5 # filter height
Wf = 5 # filter width
stride = 1
pad = 2 # For same dimensions, (Hf - stride) / 2
F1 = nrow(W1) # num conv filters in conv1
F2 = nrow(W2) # num conv filters in conv2
N3 = ncol(W3) # num nodes in affine3
K = ncol(W4) # num nodes in affine4, equal to number of target dimensions (num classes)
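# Note: Hf, Wf, stride, and pad above must match the values used in train();
# F1, F2, N3, and K are recovered from the shapes of the trained weights.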
# Compute predictions over mini-batches
probs = matrix(0, rows=N, cols=K)
batch_size = 64
iters = ceil(N / batch_size)
for(i in 1:iters) {
# Get next batch
beg = ((i-1) * batch_size) %% N + 1
end = min(N, beg + batch_size - 1)
X_batch = X[beg:end,]
# Compute forward pass
## layer 1: conv1 -> relu1 -> pool1
[outc1, Houtc1, Woutc1] = conv2d::forward(X_batch, W1, b1, C, Hin, Win, Hf, Wf, stride, stride,
pad, pad)
outr1 = relu::forward(outc1)
[outp1, Houtp1, Woutp1] = max_pool2d::forward(outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2,
strideh=2, stridew=2, padh=0, padw=0)
## layer 2: conv2 -> relu2 -> pool2
[outc2, Houtc2, Woutc2] = conv2d::forward(outp1, W2, b2, F1, Houtp1, Woutp1, Hf, Wf,
stride, stride, pad, pad)
outr2 = relu::forward(outc2)
[outp2, Houtp2, Woutp2] = max_pool2d::forward(outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2,
strideh=2, stridew=2, padh=0, padw=0)
## layer 3: affine3 -> relu3
outa3 = affine::forward(outp2, W3, b3)
outr3 = relu::forward(outa3)
## layer 4: affine4 -> softmax
outa4 = affine::forward(outr3, W4, b4)
probs_batch = softmax::forward(outa4)
# Store predictions
probs[beg:end,] = probs_batch
}
}
eval = function(matrix[double] probs, matrix[double] Y)
return (double loss, double accuracy) {
/*
* Evaluates a convolutional net using the "LeNet" architecture.
*
* The probs matrix contains the class probability predictions
* of K classes over N examples. The targets, Y, have K classes,
* and are one-hot encoded.
*
* Inputs:
* - probs: Class probabilities, of shape (N, K).
* - Y: Target matrix, of shape (N, K).
*
* Outputs:
* - loss: Scalar loss, of shape (1).
* - accuracy: Scalar accuracy, of shape (1).
*/
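# Example call (a sketch): [loss, accuracy] = eval(probs, Y)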
# Compute loss & accuracy
loss = cross_entropy_loss::forward(probs, Y)
correct_pred = rowIndexMax(probs) == rowIndexMax(Y)
accuracy = mean(correct_pred)
}
generate_dummy_data = function()
return (matrix[double] X, matrix[double] Y, int C, int Hin, int Win) {
/*
* Generate a dummy dataset similar to the MNIST dataset.
*
* Outputs:
* - X: Input data matrix, of shape (N, D).
* - Y: Target matrix, of shape (N, K).
* - C: Number of input channels (dimensionality of input depth).
* - Hin: Input height.
* - Win: Input width.
*/
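# Example call: [X, Y, C, Hin, Win] = generate_dummy_data()  # 1024 MNIST-sized examples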
# Generate dummy input data
N = 1024 # num examples
C = 1 # num input channels
Hin = 28 # input height
Win = 28 # input width
K = 10 # num target classes
X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
classes = round(rand(rows=N, cols=1, min=1, max=K, pdf="uniform"))
Y = table(seq(1, N), classes) # one-hot encoding
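# (Caveat: if some label in 1..K is never drawn, table() yields fewer than K
# columns; table(seq(1, N), classes, N, K) would pin the output shape.)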
}
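
# End-to-end sketch (hypothetical driver script; assumes execution with the
# scripts/nn parent directory on the source path, as in the other nn examples):
#
#   source("nn/examples/mnist_lenet.dml") as mnist_lenet
#   [X, Y, C, Hin, Win] = mnist_lenet::generate_dummy_data()
#   [X_val, Y_val, C, Hin, Win] = mnist_lenet::generate_dummy_data()
#   [W1, b1, W2, b2, W3, b3, W4, b4] = mnist_lenet::train(X, Y, X_val, Y_val,
#                                                         C, Hin, Win, 1)
#   probs = mnist_lenet::predict(X, C, Hin, Win, W1, b1, W2, b2, W3, b3, W4, b4)
#   [loss, accuracy] = mnist_lenet::eval(probs, Y)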