http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/test/grad_check.dml ---------------------------------------------------------------------- diff --git a/scripts/nn/test/grad_check.dml b/scripts/nn/test/grad_check.dml new file mode 100644 index 0000000..f3bc9a7 --- /dev/null +++ b/scripts/nn/test/grad_check.dml @@ -0,0 +1,1769 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +/* + * Gradient checks for various architectures. + */ +source("nn/layers/affine.dml") as affine +source("nn/layers/batch_norm1d.dml") as batch_norm1d +source("nn/layers/batch_norm2d.dml") as batch_norm2d +source("nn/layers/conv2d.dml") as conv2d +source("nn/layers/conv2d_builtin.dml") as conv2d_builtin +source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss +source("nn/layers/dropout.dml") as dropout +source("nn/layers/l1_loss.dml") as l1_loss +source("nn/layers/l1_reg.dml") as l1_reg +source("nn/layers/l2_loss.dml") as l2_loss +source("nn/layers/l2_reg.dml") as l2_reg +source("nn/layers/log_loss.dml") as log_loss +source("nn/layers/lstm.dml") as lstm +source("nn/layers/max_pool2d.dml") as max_pool2d +source("nn/layers/max_pool2d_builtin.dml") as max_pool2d_builtin +source("nn/layers/relu.dml") as relu +source("nn/layers/rnn.dml") as rnn +source("nn/layers/scale_shift1d.dml") as scale_shift1d +source("nn/layers/scale_shift2d.dml") as scale_shift2d +source("nn/layers/sigmoid.dml") as sigmoid +source("nn/layers/softmax.dml") as softmax +source("nn/layers/tanh.dml") as tanh +source("nn/test/conv2d_simple.dml") as conv2d_simple +source("nn/test/max_pool2d_simple.dml") as max_pool2d_simple +source("nn/test/util.dml") as test_util + +affine = function() { + /* + * Gradient check for the affine layer. 
+ */ + print("Grad checking the affine layer with L2 loss.") + + # Generate data + N = 3 # num examples + D = 100 # num features + M = 10 # num neurons + X = rand(rows=N, cols=D) + y = rand(rows=N, cols=M) + [W, b] = affine::init(D, M) + + # Compute analytical gradients of loss wrt parameters + out = affine::forward(X, W, b) + dout = l2_loss::backward(out, y) + [dX, dW, db] = affine::backward(dout, X, W, b) + + # Grad check + h = 1e-5 + print(" - Grad checking X.") + for (i in 1:nrow(X)) { + for (j in 1:ncol(X)) { + # Compute numerical derivative + old = as.scalar(X[i,j]) + X[i,j] = old - h + outmh = affine::forward(X, W, b) + lossmh = l2_loss::forward(outmh, y) + X[i,j] = old + h + outph = affine::forward(X, W, b) + lossph = l2_loss::forward(outph, y) + X[i,j] = old # reset + dX_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) + } + } + + print(" - Grad checking W.") + for (i in 1:nrow(W)) { + for (j in 1:ncol(W)) { + # Compute numerical derivative + old = as.scalar(W[i,j]) + W[i,j] = old - h + outmh = affine::forward(X, W, b) + lossmh = l2_loss::forward(outmh, y) + W[i,j] = old + h + outph = affine::forward(X, W, b) + lossph = l2_loss::forward(outph, y) + W[i,j] = old # reset + dW_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh) + } + } + + print(" - Grad checking b.") + for (i in 1:nrow(b)) { + for (j in 1:ncol(b)) { + # Compute numerical derivative + old = as.scalar(b[i,j]) + b[i,j] = old - h + outmh = affine::forward(X, W, b) + lossmh = l2_loss::forward(outmh, y) + b[i,j] = old + h + outph = affine::forward(X, W, b) + lossph = l2_loss::forward(outph, y) + b[i,j] = old # reset + db_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh) + } + } +} + +batch_norm1d = function() { + /* + * Gradient check for the 1D batch normalization layer. 
+ */ + print("Grad checking the 1D batch normalization layer with L2 loss.") + + # Generate data + N = 3 # num examples + D = 100 # num features + mu = 0.9 # momentum + eps = 1e-5 # epsilon + X = rand(rows=N, cols=D) + y = rand(rows=N, cols=D) + gamma = rand(rows=1, cols=D) + beta = rand(rows=1, cols=D) + ema_mean = rand(rows=1, cols=D) + ema_var = rand(rows=1, cols=D) + #[dummy, dummy, ema_mean, ema_var] = batch_norm1d::init(D) + + # Check training & testing modes + for (i in 1:2) { + if (i == 1) + mode = 'train' + else + mode = 'test' + print(" - Grad checking the '"+mode+"' mode.") + + # Compute analytical gradients of loss wrt parameters + [out, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] = + batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps) + dout = l2_loss::backward(out, y) + [dX, dgamma, dbeta] = batch_norm1d::backward(dout, out, ema_mean_upd, ema_var_upd, + cache_mean, cache_var, cache_norm, + X, gamma, beta, mode, ema_mean, ema_var, mu, eps) + + # Grad check + h = 1e-5 + print(" - Grad checking X.") + for (i in 1:nrow(X)) { + for (j in 1:ncol(X)) { + # Compute numerical derivative + old = as.scalar(X[i,j]) + X[i,j] = old - h + [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] = + batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps) + lossmh = l2_loss::forward(outmh, y) + X[i,j] = old + h + [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] = + batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps) + lossph = l2_loss::forward(outph, y) + X[i,j] = old # reset + dX_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) + } + } + + print(" - Grad checking gamma.") + for (i in 1:nrow(gamma)) { + for (j in 1:ncol(gamma)) { + # Compute numerical derivative + old = as.scalar(gamma[i,j]) + gamma[i,j] = old - h + [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] = + batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps) + lossmh = l2_loss::forward(outmh, y) + gamma[i,j] = old + h + [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] = + batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps) + lossph = l2_loss::forward(outph, y) + gamma[i,j] = old # reset + dgamma_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num, + lossph, lossmh) + } + } + + print(" - Grad checking beta.") + for (i in 1:nrow(beta)) { + for (j in 1:ncol(beta)) { + # Compute numerical derivative + old = as.scalar(beta[i,j]) + beta[i,j] = old - h + [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] = + batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps) + lossmh = l2_loss::forward(outmh, y) + beta[i,j] = old + h + [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] = + batch_norm1d::forward(X, gamma, beta, mode, ema_mean, ema_var, mu, eps) + lossph = l2_loss::forward(outph, y) + beta[i,j] = old # reset + dbeta_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num, + lossph, lossmh) + } + } + } +} + +batch_norm2d = function() { + /* + * Gradient check for the 2D (spatial) batch normalization layer. 
+ */ + print("Grad checking the 2D (spatial) batch normalization layer with L2 loss.") + + # Generate data + N = 3 # num examples + C = 2 # num channels + Hin = 5 # input height + Win = 5 # input width + mu = 0.9 # momentum + eps = 1e-5 # epsilon + X = rand(rows=N, cols=C*Hin*Win) + y = rand(rows=N, cols=C*Hin*Win) + gamma = rand(rows=C, cols=1) + beta = rand(rows=C, cols=1) + ema_mean = rand(rows=C, cols=1) + ema_var = rand(rows=C, cols=1) + #[dummy, dummy, ema_mean, ema_var] = batch_norm2d::init(C) + + # Check training & testing modes + for (i in 1:2) { + if (i == 1) + mode = 'train' + else + mode = 'test' + print(" - Grad checking the '"+mode+"' mode.") + + # Compute analytical gradients of loss wrt parameters + [out, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] = + batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps) + dout = l2_loss::backward(out, y) + [dX, dgamma, dbeta] = batch_norm2d::backward(dout, out, ema_mean_upd, ema_var_upd, + cache_mean, cache_var, cache_norm, + X, gamma, beta, C, Hin, Win, mode, + ema_mean, ema_var, mu, eps) + + # Grad check + h = 1e-5 + print(" - Grad checking X.") + for (i in 1:nrow(X)) { + for (j in 1:ncol(X)) { + # Compute numerical derivative + old = as.scalar(X[i,j]) + X[i,j] = old - h + [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] = + batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps) + lossmh = l2_loss::forward(outmh, y) + X[i,j] = old + h + [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] = + batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps) + lossph = l2_loss::forward(outph, y) + X[i,j] = old # reset + dX_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) + } + } + + print(" - Grad checking gamma.") + for (i in 1:nrow(gamma)) { + for (j in 1:ncol(gamma)) { + # Compute numerical derivative + old = as.scalar(gamma[i,j]) + gamma[i,j] = old - h + [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] = + batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps) + lossmh = l2_loss::forward(outmh, y) + gamma[i,j] = old + h + [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] = + batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps) + lossph = l2_loss::forward(outph, y) + gamma[i,j] = old # reset + dgamma_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num, + lossph, lossmh) + } + } + + print(" - Grad checking beta.") + for (i in 1:nrow(beta)) { + for (j in 1:ncol(beta)) { + # Compute numerical derivative + old = as.scalar(beta[i,j]) + beta[i,j] = old - h + [outmh, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] = + batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps) + lossmh = l2_loss::forward(outmh, y) + beta[i,j] = old + h + [outph, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] = + batch_norm2d::forward(X, gamma, beta, C, Hin, Win, mode, ema_mean, ema_var, mu, eps) + lossph = l2_loss::forward(outph, y) + beta[i,j] = old # reset + dbeta_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num, + lossph, lossmh) + } + } + } +} + +conv2d = 
function() { + /* + * Gradient check for the 2D convolutional layer using `im2col`. + */ + print("Grad checking the `im2col` 2D convolutional layer with L2 loss.") + + # Generate data + N = 2 # num examples + C = 2 # num channels + Hin = 5 # input height + Win = 5 # input width + F = 2 # num filters + Hf = 3 # filter height + Wf = 3 # filter width + stride = 1 + pad = 1 + X = rand(rows=N, cols=C*Hin*Win) + y = rand(rows=N, cols=F*Hin*Win) + + # Create layers + [W, b] = conv2d::init(F, C, Hf, Wf) + + # Compute analytical gradients of loss wrt parameters + [out, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad) + dout = l2_loss::backward(out, y) + [dX, dW, db] = conv2d::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf, stride, stride, + pad, pad) + + # Grad check + h = 1e-5 + print(" - Grad checking X.") + for (i in 1:nrow(X)) { + for (j in 1:ncol(X)) { + # Compute numerical derivative + old = as.scalar(X[i,j]) + X[i,j] = old - h + [outmh, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad) + lossmh = l2_loss::forward(outmh, y) + X[i,j] = old + h + [outph, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad) + lossph = l2_loss::forward(outph, y) + X[i,j] = old # reset + dX_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) + } + } + + print(" - Grad checking W.") + for (i in 1:nrow(W)) { + for (j in 1:ncol(W)) { + # Compute numerical derivative + old = as.scalar(W[i,j]) + W[i,j] = old - h + [outmh, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad) + lossmh = l2_loss::forward(outmh, y) + W[i,j] = old + h + [outph, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad) + lossph = l2_loss::forward(outph, y) + W[i,j] = old # reset + dW_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh) + } + } + + print(" - Grad checking b.") + for (i in 1:nrow(b)) { + for (j in 1:ncol(b)) { + # Compute numerical derivative + old = as.scalar(b[i,j]) + b[i,j] = old - h + [outmh, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad) + lossmh = l2_loss::forward(outmh, y) + b[i,j] = old + h + [outph, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad) + lossph = l2_loss::forward(outph, y) + b[i,j] = old # reset + db_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh) + } + } +} + +conv2d_builtin = function() { + /* + * Gradient check for the 2D convolutional layer using built-in + * functions. 
+ */ + print("Grad checking the built-in 2D convolutional layer with L2 loss.") + + # Generate data + N = 2 # num examples + C = 2 # num channels + Hin = 5 # input height + Win = 5 # input width + F = 2 # num filters + Hf = 3 # filter height + Wf = 3 # filter width + stride = 1 + pad = 1 + X = rand(rows=N, cols=C*Hin*Win) + y = rand(rows=N, cols=F*Hin*Win) + + # Create layers + [W, b] = conv2d_builtin::init(F, C, Hf, Wf) + + # Compute analytical gradients of loss wrt parameters + [out, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, + pad, pad) + dout = l2_loss::backward(out, y) + [dX, dW, db] = conv2d_builtin::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf, + stride, stride, pad, pad) + + # Grad check + h = 1e-5 + print(" - Grad checking X.") + for (i in 1:nrow(X)) { + for (j in 1:ncol(X)) { + # Compute numerical derivative + old = as.scalar(X[i,j]) + X[i,j] = old - h + [outmh, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, + pad, pad) + lossmh = l2_loss::forward(outmh, y) + X[i,j] = old + h + [outph, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, + pad, pad) + lossph = l2_loss::forward(outph, y) + X[i,j] = old # reset + dX_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) + } + } + + print(" - Grad checking W.") + for (i in 1:nrow(W)) { + for (j in 1:ncol(W)) { + # Compute numerical derivative + old = as.scalar(W[i,j]) + W[i,j] = old - h + [outmh, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, + pad, pad) + lossmh = l2_loss::forward(outmh, y) + W[i,j] = old + h + [outph, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, + pad, pad) + lossph = l2_loss::forward(outph, y) + W[i,j] = old # reset + dW_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh) + } + } + + print(" - Grad checking b.") + for (i in 1:nrow(b)) { + for (j in 1:ncol(b)) { + # Compute numerical derivative + old = as.scalar(b[i,j]) + b[i,j] = old - h + [outmh, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, + pad, pad) + lossmh = l2_loss::forward(outmh, y) + b[i,j] = old + h + [outph, Hout, Wout] = conv2d_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, + pad, pad) + lossph = l2_loss::forward(outph, y) + b[i,j] = old # reset + db_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh) + } + } +} + +conv2d_simple = function() { + /* + * Gradient check for the simple reference 2D convolutional layer. 
+ */ + print("Grad checking the simple reference 2D convolutional layer with L2 loss.") + + # Generate data + N = 2 # num examples + C = 2 # num channels + Hin = 5 # input height + Win = 5 # input width + F = 2 # num filters + Hf = 3 # filter height + Wf = 3 # filter width + stride = 1 + pad = 1 + X = rand(rows=N, cols=C*Hin*Win) + y = rand(rows=N, cols=F*Hin*Win) + + # Create layers + [W, b] = conv2d_simple::init(F, C, Hf, Wf) + + # Compute analytical gradients of loss wrt parameters + [out, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad) + dout = l2_loss::backward(out, y) + [dX, dW, db] = conv2d_simple::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf, + stride, stride, pad, pad) + + # Grad check + h = 1e-5 + print(" - Grad checking X.") + for (i in 1:nrow(X)) { + for (j in 1:ncol(X)) { + # Compute numerical derivative + old = as.scalar(X[i,j]) + X[i,j] = old - h + [outmh, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, + pad, pad) + lossmh = l2_loss::forward(outmh, y) + X[i,j] = old + h + [outph, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, + pad, pad) + lossph = l2_loss::forward(outph, y) + X[i,j] = old # reset + dX_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) + } + } + + print(" - Grad checking W.") + for (i in 1:nrow(W)) { + for (j in 1:ncol(W)) { + # Compute numerical derivative + old = as.scalar(W[i,j]) + W[i,j] = old - h + [outmh, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, + pad, pad) + lossmh = l2_loss::forward(outmh, y) + W[i,j] = old + h + [outph, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, + pad, pad) + lossph = l2_loss::forward(outph, y) + W[i,j] = old # reset + dW_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh) + } + } + + print(" - Grad checking b.") + for (i in 1:nrow(b)) { + for (j in 1:ncol(b)) { + # Compute numerical derivative + old = as.scalar(b[i,j]) + b[i,j] = old - h + [outmh, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, + pad, pad) + lossmh = l2_loss::forward(outmh, y) + b[i,j] = old + h + [outph, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, + pad, pad) + lossph = l2_loss::forward(outph, y) + b[i,j] = old # reset + db_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh) + } + } +} + +cross_entropy_loss = function() { + /* + * Gradient check for the cross-entropy loss function. 
+ */ + print("Grad checking the cross-entropy loss function.") + + # Generate data + N = 3 # num examples + K = 10 # num targets + pred = rand(rows=N, cols=K, min=0, max=1, pdf="uniform") + pred = pred / rowSums(pred) # normalized probs + y = rand(rows=N, cols=K, min=0, max=1, pdf="uniform") + y = y / rowSums(y) # normalized probs + + # Compute analytical gradient + dpred = cross_entropy_loss::backward(pred, y) + + # Grad check + h = 1e-5 + for (i in 1:nrow(pred)) { + for (j in 1:ncol(pred)) { + # Compute numerical derivative + old = as.scalar(pred[i,j]) + pred[i,j] = old - h + lossmh = cross_entropy_loss::forward(pred, y) + pred[i,j] = old + h + lossph = cross_entropy_loss::forward(pred, y) + pred[i,j] = old # reset W[i,j] + dpred_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh) + } + } +} + +dropout = function() { + /* + * Gradient check for the (inverted) dropout layer. + */ + print("Grad checking the (inverted) dropout layer with L2 loss.") + + # Generate data + N = 3 # num examples + M = 100 # num neurons + p = 0.5 # probability of dropping neuron output + seed = as.integer(floor(as.scalar(rand(rows=1, cols=1, min=1, max=100000)))) # random seed + X = rand(rows=N, cols=M) + y = rand(rows=N, cols=M) + + # Compute analytical gradients of loss wrt parameters + [out, mask] = dropout::forward(X, p, seed) + dout = l2_loss::backward(out, y) + dX = dropout::backward(dout, X, p, mask) + + # Grad check + h = 1e-5 + for (i in 1:nrow(X)) { + for (j in 1:ncol(X)) { + # Compute numerical derivative + old = as.scalar(X[i,j]) + X[i,j] = old - h + [outmh, mask] = dropout::forward(X, p, seed) + lossmh = l2_loss::forward(outmh, y) + X[i,j] = old + h + [outph, mask] = dropout::forward(X, p, seed) + lossph = l2_loss::forward(outph, y) + X[i,j] = old # reset + dX_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) + } + } +} + +l1_loss = function() { + /* + * Gradient check for the L1 loss function. + */ + print("Grad checking the L1 loss function.") + + # Generate data + N = 3 # num examples + D = 2 # num targets + pred = rand(rows=N, cols=D) + y = rand(rows=N, cols=D) + + # Compute analytical gradient + dpred = l1_loss::backward(pred, y) + + # Grad check + h = 1e-5 + for (i in 1:nrow(pred)) { + for (j in 1:ncol(pred)) { + # Compute numerical derivative + old = as.scalar(pred[i,j]) + pred[i,j] = old - h + lossmh = l1_loss::forward(pred, y) + pred[i,j] = old + h + lossph = l1_loss::forward(pred, y) + pred[i,j] = old # reset W[i,j] + dpred_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh) + } + } +} + +l1_reg = function() { + /* + * Gradient check for the L1 regularization function. 
+ */ + print("Grad checking the L1 regularization function.") + + # Generate data + D = 5 # num features + M = 3 # num neurons + lambda = 0.01 + W = rand(rows=D, cols=M) + + # Compute analytical gradient + dW = l1_reg::backward(W, lambda) + + # Grad check + h = 1e-5 + for (i in 1:nrow(W)) { + for (j in 1:ncol(W)) { + # Compute numerical derivative + old = as.scalar(W[i,j]) + W[i,j] = old - h + reg_lossmh = l1_reg::forward(W, lambda) + W[i,j] = old + h + reg_lossph = l1_reg::forward(W, lambda) + W[i,j] = old # reset W[i,j] + dW_num = (reg_lossph-reg_lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, + reg_lossph, reg_lossmh) + } + } +} + +l2_loss = function() { + /* + * Gradient check for the L2 loss function. + */ + print("Grad checking the L2 loss function.") + + # Generate data + N = 3 # num examples + D = 2 # num targets + pred = rand(rows=N, cols=D) + y = rand(rows=N, cols=D) + + # Compute analytical gradient + dpred = l2_loss::backward(pred, y) + + # Grad check + h = 1e-5 + for (i in 1:nrow(pred)) { + for (j in 1:ncol(pred)) { + # Compute numerical derivative + old = as.scalar(pred[i,j]) + pred[i,j] = old - h + lossmh = l2_loss::forward(pred, y) + pred[i,j] = old + h + lossph = l2_loss::forward(pred, y) + pred[i,j] = old # reset W[i,j] + dpred_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh) + } + } +} + +l2_reg = function() { + /* + * Gradient check for the L2 regularization function. + */ + print("Grad checking the L2 regularization function.") + + # Generate data + D = 5 # num features + M = 3 # num neurons + lambda = 0.01 + W = rand(rows=D, cols=M) + + # Compute analytical gradient + dW = l2_reg::backward(W, lambda) + + # Grad check + h = 1e-5 + for (i in 1:nrow(W)) { + for (j in 1:ncol(W)) { + # Compute numerical derivative + old = as.scalar(W[i,j]) + W[i,j] = old - h + reg_lossmh = l2_reg::forward(W, lambda) + W[i,j] = old + h + reg_lossph = l2_reg::forward(W, lambda) + W[i,j] = old # reset W[i,j] + dW_num = (reg_lossph-reg_lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, + reg_lossph, reg_lossmh) + } + } +} + +log_loss = function() { + /* + * Gradient check for the log loss function. + */ + print("Grad checking the log loss function.") + + # Generate data + N = 20 # num examples + D = 1 # num targets + pred = rand(rows=N, cols=D, min=0, max=1, pdf="uniform") + y = round(rand(rows=N, cols=D, min=0, max=1, pdf="uniform")) + + # Compute analytical gradient + dpred = log_loss::backward(pred, y) + + # Grad check + h = 1e-5 + for (i in 1:nrow(pred)) { + for (j in 1:ncol(pred)) { + # Compute numerical derivative + old = as.scalar(pred[i,j]) + pred[i,j] = old - h + lossmh = log_loss::forward(pred, y) + pred[i,j] = old + h + lossph = log_loss::forward(pred, y) + pred[i,j] = old # reset W[i,j] + dpred_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh) + } + } +} + +lstm = function() { + /* + * Gradient check for the LSTM layer. 
+ */ + print("Grad checking the LSTM layer with L2 loss.") + + # Generate data + N = 3 # num examples + D = 10 # num features + T = 15 # num timesteps (sequence length) + M = 5 # num neurons + return_seq = TRUE + X = rand(rows=N, cols=T*D) + y = rand(rows=N, cols=T*M) + yc = rand(rows=N, cols=M) + out0 = rand(rows=N, cols=M) + c0 = rand(rows=N, cols=M) + [W, b, dummy, dummy2] = lstm::init(N, D, M) + + # Compute analytical gradients of loss wrt parameters + [out, c, cache_out, cache_c, cache_ifog] = lstm::forward(X, W, b, T, D, return_seq, out0, c0) + dout = l2_loss::backward(out, y) + dc = l2_loss::backward(c, yc) + [dX, dW, db, dout0, dc0] = lstm::backward(dout, dc, X, W, b, T, D, return_seq, out0, c0, + cache_out, cache_c, cache_ifog) + + # Grad check + h = 1e-5 + print(" - Grad checking X.") + for (i in 1:nrow(X)) { + for (j in 1:ncol(X)) { + # Compute numerical derivative + old = as.scalar(X[i,j]) + X[i,j] = old - h + [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0) + loss_outmh = l2_loss::forward(outmh, y) + loss_cmh = l2_loss::forward(cmh, yc) + lossmh = loss_outmh + loss_cmh + X[i,j] = old + h + [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0) + loss_outph = l2_loss::forward(outph, y) + loss_cph = l2_loss::forward(cph, yc) + lossph = loss_outph + loss_cph + X[i,j] = old # reset + dX_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) + } + } + + print(" - Grad checking W.") + for (i in 1:nrow(W)) { + for (j in 1:ncol(W)) { + # Compute numerical derivative + old = as.scalar(W[i,j]) + W[i,j] = old - h + [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0) + loss_outmh = l2_loss::forward(outmh, y) + loss_cmh = l2_loss::forward(cmh, yc) + lossmh = loss_outmh + loss_cmh + W[i,j] = old + h + [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0) + loss_outph = l2_loss::forward(outph, y) + loss_cph = l2_loss::forward(cph, yc) + lossph = loss_outph + loss_cph + W[i,j] = old # reset + dW_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh) + } + } + + print(" - Grad checking b.") + for (i in 1:nrow(b)) { + for (j in 1:ncol(b)) { + # Compute numerical derivative + old = as.scalar(b[i,j]) + b[i,j] = old - h + [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0) + loss_outmh = l2_loss::forward(outmh, y) + loss_cmh = l2_loss::forward(cmh, yc) + lossmh = loss_outmh + loss_cmh + b[i,j] = old + h + [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0) + loss_outph = l2_loss::forward(outph, y) + loss_cph = l2_loss::forward(cph, yc) + lossph = loss_outph + loss_cph + b[i,j] = old # reset + db_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh) + } + } + + print(" - Grad checking out0.") + for (i in 1:nrow(out0)) { + for (j in 1:ncol(out0)) { + # Compute numerical derivative + old = as.scalar(out0[i,j]) + out0[i,j] = old - h + [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0) + loss_outmh = l2_loss::forward(outmh, y) + loss_cmh = l2_loss::forward(cmh, yc) + lossmh = loss_outmh + loss_cmh + out0[i,j] = old + h + [outph, cph, cache, cache, 
cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0) + loss_outph = l2_loss::forward(outph, y) + loss_cph = l2_loss::forward(cph, yc) + lossph = loss_outph + loss_cph + out0[i,j] = old # reset + dout0_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dout0[i,j]), dout0_num, lossph, lossmh) + } + } + + print(" - Grad checking c0.") + for (i in 1:nrow(c0)) { + for (j in 1:ncol(c0)) { + # Compute numerical derivative + old = as.scalar(c0[i,j]) + c0[i,j] = old - h + [outmh, cmh, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0) + loss_outmh = l2_loss::forward(outmh, y) + loss_cmh = l2_loss::forward(cmh, yc) + lossmh = loss_outmh + loss_cmh + c0[i,j] = old + h + [outph, cph, cache, cache, cache] = lstm::forward(X, W, b, T, D, return_seq, out0, c0) + loss_outph = l2_loss::forward(outph, y) + loss_cph = l2_loss::forward(cph, yc) + lossph = loss_outph + loss_cph + c0[i,j] = old # reset + dc0_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dc0[i,j]), dc0_num, lossph, lossmh) + } + } +} + +max_pool2d = function() { + /* + * Gradient check for the 2D max pooling layer. + */ + print("Grad checking the 2D max pooling layer with L2 loss.") + + # Generate data + N = 2 # num examples + C = 2 # num channels + Hin = 4 # input height + Win = 4 # input width + Hf = 2 # pool filter height + Wf = 2 # pool filter width + stride = 2 + X = rand(rows=N, cols=C*Hin*Win) + + for (pad in 0:1) { + print(" - Grad checking w/ pad="+pad+".") + Hout = as.integer(floor((Hin + 2*pad - Hf)/stride + 1)) + Wout = as.integer(floor((Win + 2*pad - Wf)/stride + 1)) + y = rand(rows=N, cols=C*Hout*Wout) + + # Compute analytical gradients of loss wrt parameters + [out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad) + dout = l2_loss::backward(out, y) + dX = max_pool2d::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad) + + # Grad check + h = 1e-5 + for (i in 1:nrow(X)) { + for (j in 1:ncol(X)) { + # Compute numerical derivative + old = as.scalar(X[i,j]) + X[i,j] = old - h + [outmh, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad) + lossmh = l2_loss::forward(outmh, y) + X[i,j] = old + h + [outph, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad) + lossph = l2_loss::forward(outph, y) + X[i,j] = old # reset + dX_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) + } + } + } +} + +max_pool2d_builtin = function() { + /* + * Gradient check for the 2D max pooling layer. 
+ */ + print("Grad checking the built-in 2D max pooling layer with L2 loss.") + + # Generate data + N = 2 # num examples + C = 2 # num channels + Hin = 4 # input height + Win = 4 # input width + Hf = 2 # pool filter height + Wf = 2 # pool filter width + stride = 2 + X = rand(rows=N, cols=C*Hin*Win) + + for (pad in 0:1) { + print(" - Grad checking w/ pad="+pad+".") + Hout = as.integer(floor((Hin + 2 * pad - Hf) / stride + 1)) + Wout = as.integer(floor((Win + 2 * pad - Wf) / stride + 1)) + y = rand(rows=N, cols=C*Hout*Wout) + + # Compute analytical gradients of loss wrt parameters + [out, Hout, Wout] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride, + pad, pad) + dout = l2_loss::backward(out, y) + dX = max_pool2d_builtin::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride, + pad, pad) + + # Grad check + h = 1e-5 + for (i in 1:nrow(X)) { + for (j in 1:ncol(X)) { + # Compute numerical derivative + old = as.scalar(X[i,j]) + X[i,j] = old - h + [outmh, Hout, Wout] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride, + pad, pad) + lossmh = l2_loss::forward(outmh, y) + X[i,j] = old + h + [outph, Hout, Wout] = max_pool2d_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride, + pad, pad) + lossph = l2_loss::forward(outph, y) + X[i,j] = old # reset + dX_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) + } + } + } +} + +max_pool2d_simple = function() { + /* + * Gradient check for the simple reference 2D max pooling layer. + */ + print("Grad checking the simple reference 2D max pooling layer with L2 loss.") + + # Generate data + N = 2 # num examples + C = 2 # num channels + Hin = 4 # input height + Win = 4 # input width + Hf = 2 # pool filter height + Wf = 2 # pool filter width + stride = 2 + X = rand(rows=N, cols=C*Hin*Win) + + for (pad in 0:1) { + print(" - Grad checking w/ pad="+pad+".") + Hout = as.integer(floor((Hin + 2*pad - Hf)/stride + 1)) + Wout = as.integer(floor((Win + 2*pad - Wf)/stride + 1)) + y = rand(rows=N, cols=C*Hout*Wout) + + # Compute analytical gradients of loss wrt parameters + [out, Hout, Wout] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride, pad, pad) + dout = l2_loss::backward(out, y) + dX = max_pool2d_simple::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride, + pad, pad) + + # Grad check + h = 1e-5 + for (i in 1:nrow(X)) { + for (j in 1:ncol(X)) { + # Compute numerical derivative + old = as.scalar(X[i,j]) + X[i,j] = old - h + [outmh, Hout, Wout] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride, + pad, pad) + lossmh = l2_loss::forward(outmh, y) + X[i,j] = old + h + [outph, Hout, Wout] = max_pool2d_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride, + pad, pad) + lossph = l2_loss::forward(outph, y) + X[i,j] = old # reset + dX_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) + } + } + } +} + +relu = function() { + /* + * Gradient check for the ReLU nonlinearity layer. + * + * NOTE: This could result in a false-negative in which the test + * fails due to a kink being crossed in the nonlinearity. This + * occurs when the tests, f(x-h) and f(x+h), end up on opposite + * sides of the zero threshold of max(0, fx). For now, just run + * the tests again. In the future, we can explicitly check for + * this and rerun the test automatically. 
+ */ + print("Grad checking the ReLU nonlinearity layer with L2 loss.") + + # Generate data + N = 3 # num examples + M = 10 # num neurons + X = rand(rows=N, cols=M, min=-5, max=5) + y = rand(rows=N, cols=M) + + # Compute analytical gradients of loss wrt parameters + out = relu::forward(X) + dout = l2_loss::backward(out, y) + dX = relu::backward(dout, X) + + # Grad check + h = 1e-5 + for (i in 1:nrow(X)) { + for (j in 1:ncol(X)) { + # Compute numerical derivative + old = as.scalar(X[i,j]) + X[i,j] = old - h + outmh = relu::forward(X) + lossmh = l2_loss::forward(outmh, y) + X[i,j] = old + h + outph = relu::forward(X) + lossph = l2_loss::forward(outph, y) + X[i,j] = old # reset + dX_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) + } + } +} + +rnn = function() { + /* + * Gradient check for the simple RNN layer. + */ + print("Grad checking the simple RNN layer with L2 loss.") + + # Generate data + N = 3 # num examples + D = 10 # num features + T = 15 # num timesteps (sequence length) + M = 5 # num neurons + return_seq = TRUE + X = rand(rows=N, cols=T*D) + y = rand(rows=N, cols=T*M) + out0 = rand(rows=N, cols=M) + [W, b, dummy] = rnn::init(N, D, M) + + # Compute analytical gradients of loss wrt parameters + [out, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0) + dout = l2_loss::backward(out, y) + [dX, dW, db, dout0] = rnn::backward(dout, X, W, b, T, D, return_seq, out0, cache_out) + + # Grad check + h = 1e-5 + print(" - Grad checking X.") + for (i in 1:nrow(X)) { + for (j in 1:ncol(X)) { + # Compute numerical derivative + old = as.scalar(X[i,j]) + X[i,j] = old - h + [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0) + lossmh = l2_loss::forward(outmh, y) + X[i,j] = old + h + [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0) + lossph = l2_loss::forward(outph, y) + X[i,j] = old # reset + dX_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) + } + } + + print(" - Grad checking W.") + for (i in 1:nrow(W)) { + for (j in 1:ncol(W)) { + # Compute numerical derivative + old = as.scalar(W[i,j]) + W[i,j] = old - h + [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0) + lossmh = l2_loss::forward(outmh, y) + W[i,j] = old + h + [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0) + lossph = l2_loss::forward(outph, y) + W[i,j] = old # reset + dW_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh) + } + } + + print(" - Grad checking b.") + for (i in 1:nrow(b)) { + for (j in 1:ncol(b)) { + # Compute numerical derivative + old = as.scalar(b[i,j]) + b[i,j] = old - h + [outmh, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0) + lossmh = l2_loss::forward(outmh, y) + b[i,j] = old + h + [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0) + lossph = l2_loss::forward(outph, y) + b[i,j] = old # reset + db_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(db[i,j]), db_num, lossph, lossmh) + } + } + + print(" - Grad checking out0.") + for (i in 1:nrow(out0)) { + for (j in 1:ncol(out0)) { + # Compute numerical derivative + old = as.scalar(out0[i,j]) + out0[i,j] = old - h + [outmh, cache_out] = rnn::forward(X, W, b, T, D, 
return_seq, out0) + lossmh = l2_loss::forward(outmh, y) + out0[i,j] = old + h + [outph, cache_out] = rnn::forward(X, W, b, T, D, return_seq, out0) + lossph = l2_loss::forward(outph, y) + out0[i,j] = old # reset + dout0_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dout0[i,j]), dout0_num, lossph, lossmh) + } + } +} + +scale_shift1d = function() { + /* + * Gradient check for the 1D scale & shift layer. + */ + print("Grad checking the 1D scale & shift layer with L2 loss.") + + # Generate data + N = 3 # num examples + D = 100 # num features + X = rand(rows=N, cols=D) + y = rand(rows=N, cols=D) + [gamma, beta] = scale_shift1d::init(D) + + # Compute analytical gradients of loss wrt parameters + out = scale_shift1d::forward(X, gamma, beta) + dout = l2_loss::backward(out, y) + [dX, dgamma, dbeta] = scale_shift1d::backward(dout, out, X, gamma, beta) + + # Grad check + h = 1e-5 + print(" - Grad checking X.") + for (i in 1:nrow(X)) { + for (j in 1:ncol(X)) { + # Compute numerical derivative + old = as.scalar(X[i,j]) + X[i,j] = old - h + outmh = scale_shift1d::forward(X, gamma, beta) + lossmh = l2_loss::forward(outmh, y) + X[i,j] = old + h + outph = scale_shift1d::forward(X, gamma, beta) + lossph = l2_loss::forward(outph, y) + X[i,j] = old # reset + dX_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) + } + } + + print(" - Grad checking gamma.") + for (i in 1:nrow(gamma)) { + for (j in 1:ncol(gamma)) { + # Compute numerical derivative + old = as.scalar(gamma[i,j]) + gamma[i,j] = old - h + outmh = scale_shift1d::forward(X, gamma, beta) + lossmh = l2_loss::forward(outmh, y) + gamma[i,j] = old + h + outph = scale_shift1d::forward(X, gamma, beta) + lossph = l2_loss::forward(outph, y) + gamma[i,j] = old # reset + dgamma_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num, + lossph, lossmh) + } + } + + print(" - Grad checking beta.") + for (i in 1:nrow(beta)) { + for (j in 1:ncol(beta)) { + # Compute numerical derivative + old = as.scalar(beta[i,j]) + beta[i,j] = old - h + outmh = scale_shift1d::forward(X, gamma, beta) + lossmh = l2_loss::forward(outmh, y) + beta[i,j] = old + h + outph = scale_shift1d::forward(X, gamma, beta) + lossph = l2_loss::forward(outph, y) + beta[i,j] = old # reset + dbeta_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num, + lossph, lossmh) + } + } +} + +scale_shift2d = function() { + /* + * Gradient check for the 2D scale & shift layer. 
+ */ + print("Grad checking the 2D scale & shift layer with L2 loss.") + + # Generate data + N = 3 # num examples + C = 2 # num channels + Hin = 5 # input height + Win = 5 # input width + X = rand(rows=N, cols=C*Hin*Win) + y = rand(rows=N, cols=C*Hin*Win) + [gamma, beta] = scale_shift2d::init(C) + + # Compute analytical gradients of loss wrt parameters + out = scale_shift2d::forward(X, gamma, beta, C, Hin, Win) + dout = l2_loss::backward(out, y) + [dX, dgamma, dbeta] = scale_shift2d::backward(dout, out, X, gamma, beta, C, Hin, Win) + + # Grad check + h = 1e-5 + print(" - Grad checking X.") + for (i in 1:nrow(X)) { + for (j in 1:ncol(X)) { + # Compute numerical derivative + old = as.scalar(X[i,j]) + X[i,j] = old - h + outmh = scale_shift2d::forward(X, gamma, beta, C, Hin, Win) + lossmh = l2_loss::forward(outmh, y) + X[i,j] = old + h + outph = scale_shift2d::forward(X, gamma, beta, C, Hin, Win) + lossph = l2_loss::forward(outph, y) + X[i,j] = old # reset + dX_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) + } + } + + print(" - Grad checking gamma.") + for (i in 1:nrow(gamma)) { + for (j in 1:ncol(gamma)) { + # Compute numerical derivative + old = as.scalar(gamma[i,j]) + gamma[i,j] = old - h + outmh = scale_shift2d::forward(X, gamma, beta, C, Hin, Win) + lossmh = l2_loss::forward(outmh, y) + gamma[i,j] = old + h + outph = scale_shift2d::forward(X, gamma, beta, C, Hin, Win) + lossph = l2_loss::forward(outph, y) + gamma[i,j] = old # reset + dgamma_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dgamma[i,j]), dgamma_num, + lossph, lossmh) + } + } + + print(" - Grad checking beta.") + for (i in 1:nrow(beta)) { + for (j in 1:ncol(beta)) { + # Compute numerical derivative + old = as.scalar(beta[i,j]) + beta[i,j] = old - h + outmh = scale_shift2d::forward(X, gamma, beta, C, Hin, Win) + lossmh = l2_loss::forward(outmh, y) + beta[i,j] = old + h + outph = scale_shift2d::forward(X, gamma, beta, C, Hin, Win) + lossph = l2_loss::forward(outph, y) + beta[i,j] = old # reset + dbeta_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dbeta[i,j]), dbeta_num, + lossph, lossmh) + } + } +} + +sigmoid = function() { + /* + * Gradient check for the sigmoid nonlinearity layer. + */ + print("Grad checking the sigmoid nonlinearity layer with L2 loss.") + + # Generate data + N = 3 # num examples + M = 10 # num neurons + X = rand(rows=N, cols=M) + y = rand(rows=N, cols=M) + + # Compute analytical gradients of loss wrt parameters + out = sigmoid::forward(X) + dout = l2_loss::backward(out, y) + dX = sigmoid::backward(dout, X) + + # Grad check + h = 1e-5 + for (i in 1:nrow(X)) { + for (j in 1:ncol(X)) { + # Compute numerical derivative + old = as.scalar(X[i,j]) + X[i,j] = old - h + outmh = sigmoid::forward(X) + lossmh = l2_loss::forward(outmh, y) + X[i,j] = old + h + outph = sigmoid::forward(X) + lossph = l2_loss::forward(outph, y) + X[i,j] = old # reset + dX_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) + } + } +} + +softmax = function() { + /* + * Gradient check for the softmax layer. 
+ */ + print("Grad checking the softmax layer with L2 loss.") + + # Generate data + N = 3 # num examples + D = 10 # num classes + X = rand(rows=N, cols=D) + y = rand(rows=N, cols=D, min=0, max=1, pdf="uniform") + y = y / rowSums(y) + + # Compute analytical gradients of loss wrt parameters + out = softmax::forward(X) + dout = l2_loss::backward(out, y) + dX = softmax::backward(dout, X) + + # Grad check + h = 1e-5 + for (i in 1:nrow(X)) { + for (j in 1:ncol(X)) { + # Compute numerical derivative + old = as.scalar(X[i,j]) + X[i,j] = old - h + outmh = softmax::forward(X) + lossmh = l2_loss::forward(outmh, y) + X[i,j] = old + h + outph = softmax::forward(X) + lossph = l2_loss::forward(outph, y) + X[i,j] = old # reset + dX_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) + } + } +} + +tanh = function() { + /* + * Gradient check for the hyperbolic tangent (tanh) nonlinearity + * layer. + */ + print("Grad checking the tanh nonlinearity layer with L2 loss.") + + # Generate data + N = 3 # num examples + M = 10 # num neurons + X = rand(rows=N, cols=M) + y = rand(rows=N, cols=M) + + # Compute analytical gradients of loss wrt parameters + out = tanh::forward(X) + dout = l2_loss::backward(out, y) + dX = tanh::backward(dout, X) + + # Grad check + h = 1e-5 + for (i in 1:nrow(X)) { + for (j in 1:ncol(X)) { + # Compute numerical derivative + old = as.scalar(X[i,j]) + X[i,j] = old - h + outmh = tanh::forward(X) + lossmh = l2_loss::forward(outmh, y) + X[i,j] = old + h + outph = tanh::forward(X) + lossph = l2_loss::forward(outph, y) + X[i,j] = old # reset + dX_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) + } + } +} + +two_layer_affine_l2_net = function() { + /* + * Gradient check for a two-layer, fully-connected, feed-forward + * network with ReLU nonlinearity and L2 loss. + * + * NOTE: This could result in a false-negative in which the test + * fails due to a kink being crossed in the ReLU nonlinearity. This + * occurs when the tests, f(x-h) and f(x+h), end up on opposite + * sides of the zero threshold of max(0, fx). For now, just run + * the tests again. In the future, we can explicitly check for + * this and rerun the test automatically. + */ + print("Grad checking a two-layer, fully-connected, feed-forward network with a ReLU " + + "nonlinearity, and an L2 loss function.") + + # Generate input data + N = 1000 # num examples + D = 100 # num features + yD = 5 # num targets + X = rand(rows=N, cols=D, pdf="normal") + y = rand(rows=N, cols=yD) + + # Create 2-layer, fully-connected network + M = 10 # number of hidden neurons + [W1, b1] = affine::init(D, M) + [W2, b2] = affine::init(M, yD) + + # Optimize for short "burn-in" time to move to characteristic + # mode of operation and unmask any real issues. 
+ print(" - Burn-in:") + lr = 0.0001 + decay = 0.99 + for(i in 1:5) { + # Compute forward and backward passes of net + [pred, loss, dX, dW1, db1, dW2, db2] = two_layer_affine_l2_net_run(X, y, W1, b1, W2, b2) + print(" - L2 loss: " + loss) + + # Optimize with basic SGD + W1 = W1 - lr * dW1 + b1 = b1 - lr * db1 + W2 = W2 - lr * dW2 + b2 = b2 - lr * db2 + lr = lr * decay + } + + # Compute analytical gradients + [pred, loss, dX, dW1, db1, dW2, db2] = two_layer_affine_l2_net_run(X, y, W1, b1, W2, b2) + + # Grad check + h = 1e-5 + print(" - Grad checking X.") + for (i in 1:2) { + for (j in 1:ncol(X)) { + # Compute numerical derivative + old_x = as.scalar(X[i,j]) + X[i,j] = old_x - h + [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2) + X[i,j] = old_x + h + [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2) + X[i,j] = old_x # reset X[i,j] + dX_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh) + } + } + + print(" - Grad checking W1.") + for (i in 1:nrow(W1)) { + for (j in 1:ncol(W1)) { + # Compute numerical derivative + old_w = as.scalar(W1[i,j]) + W1[i,j] = old_w - h + [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2) + W1[i,j] = old_w + h + [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2) + W1[i,j] = old_w # reset W[i,j] + dWij_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dW1[i,j]), dWij_num, lossph, lossmh) + } + } + + print(" - Grad checking W2.") + for (i in 1:nrow(W2)) { + for (j in 1:ncol(W2)) { + # Compute numerical derivative + old_w = as.scalar(W2[i,j]) + W2[i,j] = old_w - h + [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2) + W2[i,j] = old_w + h + [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2) + W2[i,j] = old_w # reset W[i,j] + dWij_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(dW2[i,j]), dWij_num, lossph, lossmh) + } + } + + print(" - Grad checking b1.") + for (i in 1:nrow(b1)) { + for (j in 1:ncol(b1)) { + # Compute numerical derivative + old_b = as.scalar(b1[i,j]) + b1[i,j] = old_b - h + [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2) + b1[i,j] = old_b + h + [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2) + b1[i,j] = old_b # reset b[1,j] + dbij_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(db1[i,j]), dbij_num, lossph, lossmh) + } + } + + print(" - Grad checking b2.") + for (i in 1:nrow(b2)) { + for (j in 1:ncol(b2)) { + # Compute numerical derivative + old_b = as.scalar(b2[i,j]) + b2[i,j] = old_b - h + [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2) + b2[i,j] = old_b + h + [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2) + b2[i,j] = old_b # reset b[1,j] + dbij_num = (lossph-lossmh) / (2*h) # numerical derivative + + # Check error + rel_error = test_util::check_rel_grad_error(as.scalar(db2[i,j]), dbij_num, lossph, lossmh) + } + } +} + +/* + * Test network with forward/backward functions. 
+ */ +two_layer_affine_l2_net_run = function(matrix[double] X, matrix[double] y, + matrix[double] W1, matrix[double] b1, + matrix[double] W2, matrix[double] b2) + return (matrix[double] pred, double loss, + matrix[double] dX, + matrix[double] dW1, matrix[double] db1, + matrix[double] dW2, matrix[double] db2) { + # Compute forward pass + [loss, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2) + + # Compute backward pass + [dX, dpred, daout, dhout, dW1, db1, dW2, db2] = + two_layer_affine_l2_net_backward(X, y, pred, aout, hout, W1, b1, W2, b2) +} + +two_layer_affine_l2_net_forward = function(matrix[double] X, matrix[double] y, + matrix[double] W1, matrix[double] b1, + matrix[double] W2, matrix[double] b2) + return (double loss, matrix[double] pred, matrix[double] aout, matrix[double] hout) { + # Compute forward pass + hout = affine::forward(X, W1, b1) + aout = relu::forward(hout) + pred = affine::forward(aout, W2, b2) + + # Compute loss + loss = l2_loss::forward(pred, y) +} + +two_layer_affine_l2_net_backward = function(matrix[double] X, matrix[double] y, matrix[double] pred, + matrix[double] aout, matrix[double] hout, + matrix[double] W1, matrix[double] b1, + matrix[double] W2, matrix[double] b2) + return (matrix[double] dX, matrix[double] dpred, + matrix[double] daout, matrix[double] dhout, + matrix[double] dW1, matrix[double] db1, matrix[double] dW2, matrix[double] db2) { + # Compute backward pass + dpred = l2_loss::backward(pred, y) + [daout, dW2, db2] = affine::backward(dpred, aout, W2, b2) + dhout = relu::backward(daout, hout) + [dX, dW1, db1] = affine::backward(dhout, X, W1, b1) +} +
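A note on the recipe shared by every check in grad_check.dml above: each one perturbs a single entry x of an input or parameter by +/- h (with h = 1e-5), re-runs the forward pass and loss, and forms the centered-difference estimate

    dx_num = (loss(x+h) - loss(x-h)) / (2*h)

which test_util::check_rel_grad_error then compares against the analytical gradient dx using a relative error of the form |dx - dx_num| / max(|dx|, |dx_num|); the two losses are passed along so that the warning/error thresholds (defined in nn/test/util.dml) can take their scale into account.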
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/test/max_pool2d_simple.dml ---------------------------------------------------------------------- diff --git a/scripts/nn/test/max_pool2d_simple.dml b/scripts/nn/test/max_pool2d_simple.dml new file mode 100644 index 0000000..188bd6e --- /dev/null +++ b/scripts/nn/test/max_pool2d_simple.dml @@ -0,0 +1,172 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +/* + * Max Pooling layer. + * + * This implementation is intended to be a simple, reference version. + */ + +forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf, + int strideh, int stridew, int padh, int padw) + return (matrix[double] out, int Hout, int Wout) { + /* + * Computes the forward pass for a 2D spatial max pooling layer. + * The input data has N examples, each represented as a 3D volume + * unrolled into a single vector. + * + * This implementation is intended to be a simple, reference version. + * + * Inputs: + * - X: Inputs, of shape (N, C*Hin*Win). + * - C: Number of input channels (dimensionality of input depth). + * - Hin: Input height. + * - Win: Input width. + * - Hf: Filter height. + * - Wf: Filter width. + * - strideh: Stride over height. + * - stridew: Stride over width. + * - padh: Padding for top and bottom sides. + * A typical value is 0. + * - padw: Padding for left and right sides. + * A typical value is 0. + * + * Outputs: + * - out: Outputs, of shape (N, C*Hout*Wout). + * - Hout: Output height. + * - Wout: Output width. 
+ */ + N = nrow(X) + Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1)) + Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1)) + + # Create output volume + out = matrix(0, rows=N, cols=C*Hout*Wout) + + # Max pooling + parfor (n in 1:N, check=0) { # all examples + Xn = matrix(X[n,], rows=C, cols=Hin*Win) + + # Pad image + pad_value = -1/0 + Xn_padded = matrix(pad_value, rows=C, cols=(Hin+2*padh)*(Win+2*padw)) # zeros + parfor (c in 1:C) { + Xn_slice = matrix(Xn[c,], rows=Hin, cols=Win) # depth slice C reshaped + Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw) + Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice + Xn_padded[c,] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw)) # reshape + } + img = Xn_padded # shape (C, (Hin+2*padh)*(Win+2*padw)) + + parfor (c in 1:C, check=0) { # all channels + img_slice = matrix(img[c,], rows=Hin+2*padh, cols=Win+2*padw) + parfor (hout in 1:Hout, check=0) { # all output rows + hin = (hout-1) * strideh + 1 + parfor (wout in 1:Wout, check=0) { # all output columns + win = (wout-1) * stridew + 1 + out[n, (c-1)*Hout*Wout + (hout-1)*Wout + wout] = max(img_slice[hin:hin+Hf-1, + win:win+Wf-1]) + } + } + } + } +} + +backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X, + int C, int Hin, int Win, int Hf, int Wf, + int strideh, int stridew, int padh, int padw) + return (matrix[double] dX) { + /* + * Computes the backward pass for a 2D spatial max pooling layer. + * The input data has N examples, each represented as a 3D volume + * unrolled into a single vector. + * + * Inputs: + * - dout: Gradient wrt `out` from upstream, of + * shape (N, C*Hout*Wout). + * - Hout: Output height. + * - Wout: Output width. + * - X: Inputs, of shape (N, C*Hin*Win). + * - C: Number of input channels (dimensionality of input depth). + * - Hin: Input height. + * - Win: Input width. + * - Hf: Filter height. + * - Wf: Filter width. + * - strideh: Stride over height. + * - stridew: Stride over width. + * - padh: Padding for top and bottom sides. + * A typical value is 0. + * - padw: Padding for left and right sides. + * A typical value is 0. + * + * Outputs: + * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win). 
+ */ + N = nrow(X) + + # Create gradient volume + dX = matrix(0, rows=N, cols=C*Hin*Win) + + # Gradient of max pooling + for (n in 1:N) { # all examples + Xn = matrix(X[n,], rows=C, cols=Hin*Win) + + # Pad image + pad_value = -1/0 + Xn_padded = matrix(pad_value, rows=C, cols=(Hin+2*padh)*(Win+2*padw)) # zeros + parfor (c in 1:C) { + Xn_slice = matrix(Xn[c,], rows=Hin, cols=Win) # depth slice C reshaped + Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw) + Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice + Xn_padded[c,] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw)) # reshape + } + img = Xn_padded + + dimg = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw)) + for (c in 1:C) { # all channels + img_slice = matrix(img[c,], rows=Hin+2*padh, cols=Win+2*padw) + dimg_slice = matrix(0, rows=Hin+2*padh, cols=Win+2*padw) + for (hout in 1:Hout, check=0) { # all output rows + hin = (hout-1) * strideh + 1 + for (wout in 1:Wout) { # all output columns + win = (wout-1) * stridew + 1 + img_slice_patch = img_slice[hin:hin+Hf-1, win:win+Wf-1] + max_val_ind = img_slice_patch == max(img_slice_patch) # max value indicator matrix + # gradient passes through only for the max value(s) in this patch + dimg_slice_patch = max_val_ind * dout[n, (c-1)*Hout*Wout + (hout-1)*Wout + wout] + dimg_slice[hin:hin+Hf-1, win:win+Wf-1] = dimg_slice[hin:hin+Hf-1, win:win+Wf-1] + + dimg_slice_patch + } + } + dimg[c,] = matrix(dimg_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw)) + } + + # Unpad derivs on input + dXn = matrix(0, rows=C, cols=Hin*Win) + parfor (c in 1:C, check=0) { + dXn_padded_slice = matrix(dimg[c,], rows=(Hin+2*padh), cols=(Win+2*padw)) + dXn_slice = dXn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] + dXn[c,] = matrix(dXn_slice, rows=1, cols=Hin*Win) + } + dX[n,] = matrix(dXn, rows=1, cols=C*Hin*Win) + } +} + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/test/run_tests.dml ---------------------------------------------------------------------- diff --git a/scripts/nn/test/run_tests.dml b/scripts/nn/test/run_tests.dml new file mode 100644 index 0000000..d8173a9 --- /dev/null +++ b/scripts/nn/test/run_tests.dml @@ -0,0 +1,90 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +/* + * Script to run tests. 
+ */ +source("nn/test/grad_check.dml") as grad_check +source("nn/test/test.dml") as test + +print("") +print("Starting grad checks.") +print("---") + +# Loss & loss-related functions +grad_check::cross_entropy_loss() +grad_check::l1_loss() +grad_check::l1_reg() +grad_check::l2_loss() +grad_check::l2_reg() +grad_check::log_loss() +print("") + +# Core layers +grad_check::affine() +grad_check::batch_norm1d() +grad_check::batch_norm2d() +grad_check::conv2d() +grad_check::conv2d_builtin() +grad_check::conv2d_simple() +grad_check::dropout() +grad_check::lstm() +grad_check::max_pool2d() +grad_check::max_pool2d_builtin() +grad_check::max_pool2d_simple() +grad_check::relu() +grad_check::rnn() +grad_check::scale_shift1d() +grad_check::scale_shift2d() +grad_check::sigmoid() +grad_check::softmax() +grad_check::tanh() +print("") + +# Example model +grad_check::two_layer_affine_l2_net() +print("") + +print("---") +print("Grad checks complete -- look for any ERRORs or WARNINGs.") +print("If any tests involving ReLUs failed, try a few times " + + "to ensure that they were not false negatives due to " + + "kinks being crossed.") +print("") + +print("") +print("Starting other tests.") +print("---") + +test::batch_norm1d() +test::batch_norm2d() +test::conv2d() +test::cross_entropy_loss() +test::im2col() +test::max_pool2d() +test::padding() +test::tanh() + +print("---") +print("Other tests complete -- look for any ERRORs or WARNINGs.") +print("") +print("") +