http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/softmax.dml ---------------------------------------------------------------------- diff --git a/scripts/nn/layers/softmax.dml b/scripts/nn/layers/softmax.dml new file mode 100644 index 0000000..68a7bc7 --- /dev/null +++ b/scripts/nn/layers/softmax.dml @@ -0,0 +1,87 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +/* + * Softmax classifier layer. + */ + +forward = function(matrix[double] scores) + return (matrix[double] probs) { + /* + * Computes the forward pass for a softmax classifier. The inputs + * are interpreted as unnormalized log-probabilities for each of + * N examples, and the softmax function transforms them to normalized + * probabilities. + * + * This can be interpreted as a generalization of the sigmoid + * function to multiple classes. + * + * `probs_ij = e^scores_ij / sum(e^scores_i)` + * + * Inputs: + * - scores: Inputs, of shape (N, D). + * + * Outputs: + * - probs: Outputs, of shape (N, D). + */ + # For numerical stability, we subtract the max score of an example from all scores for that + # example. This is equivalent to the original formulation: + # e^scores_i / sum(e^scores_i) == C*e^scores_i / C*sum(e^scores_i) + # == e^(scores_i+log(C)) / sum(e^(scores_i+log(C))) + # set log(C) = -max(scores_i): + # == e^(scores_i-max(scores_i)) / sum(e^(scores_i-max(scores_i))) + scores = scores - rowMaxs(scores) # numerical stability + unnorm_probs = exp(scores) # unnormalized probabilities + probs = unnorm_probs / rowSums(unnorm_probs) # normalized probabilities +} + +backward = function(matrix[double] dprobs, matrix[double] scores) + return (matrix[double] dscores) { + /* + * Computes the backward pass for a softmax classifier. + * + * Note that dscores_ij has multiple source branches: + * + * ``` + * dprobs_ij/dscores_ij = probs_ij * (1 - probs_ij) + * dprobs_ik/dscores_ij = -probs_ik * probs_ij, for all k != j + * + * dloss/dscores_ij = + * (dloss/dprobs_ij * dprobs_ij/dscores_ij) + * + sum_{k!=j}(dloss/dprobs_ik * dprobs_ik/dscores_ij) + * ``` + * + * Inputs: + * - dprobs: Gradient wrt `probs` from upstream, of shape (N, D). + * - scores: Inputs, of shape (N, D). + * + * Outputs: + * - dscores: Gradient wrt `scores`, of shape (N, D). + */ + scores = scores - rowMaxs(scores) # numerical stability + unnorm_probs = exp(scores) # unnormalized probabilities + probs = unnorm_probs / rowSums(unnorm_probs) # normalized probabilities + # After some cancellation: + # dscores = dprobs*probs - probs*rowSums(dprobs*probs) + dtemp = dprobs * probs + dscores = dtemp - probs*rowSums(dtemp) +} +
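For readers new to the library's calling conventions, the following is a minimal, illustrative DML sketch (not part of the patch above) of how the softmax layer might be exercised from a driver script. It assumes the script is run with the `nn` directory on the path, as described in the test README further below; the toy dimensions and variable names are arbitrary.

```
# Hypothetical usage sketch: exercise softmax::forward/backward on random data.
source("nn/layers/softmax.dml") as softmax

N = 4  # number of examples (arbitrary)
D = 3  # number of classes (arbitrary)
scores = rand(rows=N, cols=D, pdf="normal")   # unnormalized log-probabilities

probs = softmax::forward(scores)              # shape (N, D); each row sums to 1
print("row sums of probs (expected to be 1): " + toString(rowSums(probs)))

dprobs = rand(rows=N, cols=D, pdf="normal")   # stand-in for an upstream gradient
dscores = softmax::backward(dprobs, scores)   # gradient wrt scores, shape (N, D)
```

In practice, `dprobs` would come from a loss function such as cross-entropy rather than from random data.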
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/tanh.dml ---------------------------------------------------------------------- diff --git a/scripts/nn/layers/tanh.dml b/scripts/nn/layers/tanh.dml new file mode 100644 index 0000000..d849d70 --- /dev/null +++ b/scripts/nn/layers/tanh.dml @@ -0,0 +1,65 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +/* + * Tanh nonlinearity layer. + */ +source("nn/layers/sigmoid.dml") as sigmoid + +forward = function(matrix[double] X) + return (matrix[double] out) { + /* + * Computes the forward pass for a tanh nonlinearity layer. + * + * ``` + * tanh(x) = (e^x - e^-x) / (e^x + e^-x) + * = 2 * sigmoid(2x) - 1 + * ``` + * + * Inputs: + * - X: Inputs, of shape (any, any). + * + * Outputs: + * - out: Outputs, of same shape as `X`. + */ + # out = (exp(X) - exp(-X)) / (exp(X) + exp(-X)) + # Simplification of the above formulation to use the sigmoid function: + sigma2X = sigmoid::forward(2*X) + out = 2*sigma2X - 1 +} + +backward = function(matrix[double] dout, matrix[double] X) + return (matrix[double] dX) { + /* + * Computes the backward pass for a tanh nonlinearity layer. + * + * Inputs: + * - dout: Gradient wrt `out` from upstream, of same shape as `X`. + * - X: Inputs, of shape (any, any). + * + * Outputs: + * - dX: Gradient wrt `X`, of same shape as `X`. + */ + sigma2X = sigmoid::forward(2*X) + out = 2*sigma2X - 1 + dX = (1-out^2) * dout +} + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/optim/adagrad.dml ---------------------------------------------------------------------- diff --git a/scripts/nn/optim/adagrad.dml b/scripts/nn/optim/adagrad.dml new file mode 100644 index 0000000..85b1c41 --- /dev/null +++ b/scripts/nn/optim/adagrad.dml @@ -0,0 +1,77 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +/* + * Adagrad optimizer. + */ + +update = function(matrix[double] X, matrix[double] dX, double lr, double epsilon, + matrix[double] cache) + return (matrix[double] X, matrix[double] cache) { + /* + * Performs an Adagrad update. + * + * This is an adaptive learning rate optimizer that maintains the + * sum of squared gradients to automatically adjust the effective + * learning rate. + * + * Reference: + * - Adaptive Subgradient Methods for Online Learning and Stochastic + * Optimization, Duchi et al. + * - http://jmlr.org/papers/v12/duchi11a.html + * + * Inputs: + * - X: Parameters to update, of shape (any, any). + * - dX: Gradient wrt `X` of a loss function being optimized, of + * same shape as `X`. + * - lr: Learning rate. + * - epsilon: Smoothing term to avoid divide by zero errors. + * Typical values are in the range of [1e-8, 1e-4]. + * - cache: State that maintains per-parameter sum of squared + * gradients, of same shape as `X`. + * + * Outputs: + * - X: Updated parameters `X`, of same shape as input `X`. + * - cache: State that maintains per-parameter sum of squared + * gradients, of same shape as `X`. + */ + cache = cache + dX^2 + X = X - (lr * dX / (sqrt(cache)+epsilon)) +} + +init = function(matrix[double] X) + return (matrix[double] cache) { + /* + * Initialize the state for this optimizer. + * + * Note: This is just a convenience function, and state + * may be initialized manually if needed. + * + * Inputs: + * - X: Parameters to update, of shape (any, any). + * + * Outputs: + * - cache: State that maintains per-parameter sum of squared + * gradients, of same shape as `X`. + */ + cache = matrix(0, rows=nrow(X), cols=ncol(X)) +} + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/optim/adam.dml ---------------------------------------------------------------------- diff --git a/scripts/nn/optim/adam.dml b/scripts/nn/optim/adam.dml new file mode 100644 index 0000000..4b6fa2a --- /dev/null +++ b/scripts/nn/optim/adam.dml @@ -0,0 +1,97 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +/* + * Adam optimizer. + */ + +update = function(matrix[double] X, matrix[double] dX, double lr, double beta1, double beta2, + double epsilon, int t, matrix[double] m, matrix[double] v) + return (matrix[double] X, matrix[double] m, matrix[double] v) { + /* + * Performs an Adam update. + * + * Reference: + * - Adam: A Method for Stochastic Optimization, Kingma, Ba. + * - http://arxiv.org/abs/1412.6980 + * + * Inputs: + * - X: Parameters to update, of shape (any, any). 
+ * - dX: Gradient wrt `X` of a loss function being optimized, of + * same shape as `X`. + * - lr: Learning rate. Recommended value is 0.001. + * - beta1: Exponential decay rate for the 1st moment estimates. + * Recommended value is 0.9. + * - beta2: Exponential decay rate for the 2nd moment estimates. + * Recommended value is 0.999. + * - epsilon: Smoothing term to avoid divide by zero errors. + * Recommended value is 1e-8. + * - t: Timestep, starting at 0. + * - m: State containing the 1st moment (mean) estimate by + * maintaining exponential moving averages of the gradients, of + * same shape as `X`. + * - v: State containing the 2nd raw moment (uncentered variance) + * estimate by maintaining exponential moving averages of the + * squared gradients, of same shape as `X`. + * + * Outputs: + * - X: Updated parameters `X`, of same shape as input `X`. + * - m: Updated state containing the 1st moment (mean) estimate by + * maintaining exponential moving averages of the gradients, of + * same shape as `X`. + * - v: Updated state containing the 2nd raw moment (uncentered + * variance) estimate by maintaining exponential moving averages + * of the squared gradients, of same shape as `X`. + */ + t = t + 1 + m = beta1*m + (1-beta1)*dX # update biased 1st moment estimate + v = beta2*v + (1-beta2)*dX^2 # update biased 2nd raw moment estimate + # m = m / (1-beta1^t) # compute bias-corrected 1st moment estimate + # v = v / (1-beta2^t) # compute bias-corrected 2nd raw moment estimate + # X = X - (lr * m / (sqrt(v)+epsilon)) # param update + # Simplified for computational efficiency: + lr = lr * sqrt(1-beta2^t) / (1-beta1^t) + X = X - (lr * m / (sqrt(v)+epsilon)) +} + +init = function(matrix[double] X) + return (matrix[double] m, matrix[double] v) { + /* + * Initialize the state for this optimizer. + * + * Note: This is just a convenience function, and state + * may be initialized manually if needed. + * + * Inputs: + * - X: Parameters to update, of shape (any, any). + * + * Outputs: + * - m: Initial state containing the 1st moment (mean) estimate by + * maintaining exponential moving averages of the gradients, of + * same shape as `X`. + * - v: Initial state containing the 2nd raw moment (uncentered + * variance) estimate by maintaining exponential moving averages + * of the squared gradients, of same shape as `X`. + */ + m = matrix(0, rows=nrow(X), cols=ncol(X)) + v = matrix(0, rows=nrow(X), cols=ncol(X)) +} + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/optim/rmsprop.dml ---------------------------------------------------------------------- diff --git a/scripts/nn/optim/rmsprop.dml b/scripts/nn/optim/rmsprop.dml new file mode 100644 index 0000000..1feccaf --- /dev/null +++ b/scripts/nn/optim/rmsprop.dml @@ -0,0 +1,79 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +/* + * RMSprop optimizer. + */ + +update = function(matrix[double] X, matrix[double] dX, double lr, double decay_rate, + double epsilon, matrix[double] cache) + return (matrix[double] X, matrix[double] cache) { + /* + * Performs an RMSprop update. + * + * This is an adaptive learning rate optimizer that can be viewed + * as an adjustment of the Adagrad method to use a moving average + * of the sum of squared gradients in order to improve convergence. + * + * Reference: + * - Neural Networks for Machine Learning, Lecture 6a, Hinton, + * slide 29. + * - http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf + * + * Inputs: + * - X: Parameters to update, of shape (any, any). + * - dX: Gradient wrt `X` of a loss function being optimized, of + * same shape as `X`. + * - lr: Learning rate. + * - decay_rate: Term controlling the rate of the moving average. + * Typical values are in the range of [0.9, 0.999]. + * - epsilon: Smoothing term to avoid divide by zero errors. + * Typical values are in the range of [1e-8, 1e-4]. + * - cache: State that maintains the moving average of the squared + * gradients, of same shape as `X`. + * + * Outputs: + * - X: Updated parameters `X`, of same shape as input `X`. + * - cache: Updated state that maintains the moving average of the + * squared gradients, of same shape as `X`. + */ + cache = decay_rate*cache + (1-decay_rate)*dX^2 + X = X - (lr * dX / (sqrt(cache)+epsilon)) +} + +init = function(matrix[double] X) + return (matrix[double] cache) { + /* + * Initialize the state for this optimizer. + * + * Note: This is just a convenience function, and state + * may be initialized manually if needed. + * + * Inputs: + * - X: Parameters to update, of shape (any, any). + * + * Outputs: + * - cache: State that maintains the moving average of the squared + * gradients, of same shape as `X`. + */ + cache = matrix(0, rows=nrow(X), cols=ncol(X)) +} + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/optim/sgd.dml ---------------------------------------------------------------------- diff --git a/scripts/nn/optim/sgd.dml b/scripts/nn/optim/sgd.dml new file mode 100644 index 0000000..3ba7eba --- /dev/null +++ b/scripts/nn/optim/sgd.dml @@ -0,0 +1,42 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +/* + * Stochastic Gradient Descent (SGD) optimizer. + */ + +update = function(matrix[double] X, matrix[double] dX, double lr) + return (matrix[double] X) { + /* + * Performs a vanilla SGD update. + * + * Inputs: + * - X: Parameters to update, of shape (any, any). + * - dX: Gradient wrt `X` of a loss function being optimized, of + * same shape as `X`. + * - lr: Learning rate. + * + * Outputs: + * - X: Updated parameters `X`, of same shape as input `X`. + */ + X = X - lr*dX +} + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/optim/sgd_momentum.dml ---------------------------------------------------------------------- diff --git a/scripts/nn/optim/sgd_momentum.dml b/scripts/nn/optim/sgd_momentum.dml new file mode 100644 index 0000000..85922da --- /dev/null +++ b/scripts/nn/optim/sgd_momentum.dml @@ -0,0 +1,71 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +/* + * Stochastic Gradient Descent with momentum (SGD-momentum) optimizer. + */ + +update = function(matrix[double] X, matrix[double] dX, double lr, double mu, matrix[double] v) + return (matrix[double] X, matrix[double] v) { + /* + * Performs an SGD update with momentum. + * + * In SGD with momentum, we assume that the parameters have a velocity + * that continues with some momentum, and that is influenced by the + * gradient. + * + * Inputs: + * - X: Parameters to update, of shape (any, any). + * - dX: Gradient wrt `X` of a loss function being optimized, of + * same shape as `X`. + * - lr: Learning rate. + * - mu: Momentum value. + * Typical values are in the range of [0.5, 0.99], usually + * started at the lower end and annealed towards the higher end. + * - v: State maintaining the velocity of the parameters `X`, of same + * shape as `X`. + * + * Outputs: + * - X: Updated parameters `X`, of same shape as input `X`. + * - v: Updated velocity of the parameters `X`, of same shape as + * input `X`. + */ + v = mu*v - lr*dX # update velocity + X = X + v # update position +} + +init = function(matrix[double] X) + return (matrix[double] v) { + /* + * Initialize the state for this optimizer. + * + * Note: This is just a convenience function, and state + * may be initialized manually if needed. + * + * Inputs: + * - X: Parameters to update, of shape (any, any). + * + * Outputs: + * - v: Initial velocity of the parameters `X`. 
+ */ + v = matrix(0, rows=nrow(X), cols=ncol(X)) +} + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/optim/sgd_nesterov.dml ---------------------------------------------------------------------- diff --git a/scripts/nn/optim/sgd_nesterov.dml b/scripts/nn/optim/sgd_nesterov.dml new file mode 100644 index 0000000..3b62c6e --- /dev/null +++ b/scripts/nn/optim/sgd_nesterov.dml @@ -0,0 +1,81 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +/* + * Stochastic Gradient Descent with Nesterov momentum (SGD-Nesterov) optimizer. + */ + +update = function(matrix[double] X, matrix[double] dX, double lr, double mu, matrix[double] v) + return (matrix[double] X, matrix[double] v) { + /* + * Performs an SGD update with Nesterov momentum. + * + * As with regular SGD with momentum, in SGD with Nesterov momentum, + * we assume that the parameters have a velocity that continues + * with some momentum, and that is influenced by the gradient. + * In this view specifically, we perform the position update from the + * position that the momentum is about to carry the parameters to, + * rather than from the previous position. Additionally, we always + * store the parameters in their position after momentum. + * + * Reference: + * - Advances in optimizing Recurrent Networks, Bengio et al., + * section 3.5. + * - http://arxiv.org/abs/1212.0901 + * + * Inputs: + * - X: Parameters to update, of shape (any, any). + * - dX: Gradient wrt `X` of a loss function being optimized, of + * same shape as `X`. + * - lr: Learning rate. + * - mu: Momentum value. + * Typical values are in the range of [0.5, 0.99], usually + * started at the lower end and annealed towards the higher end. + * - v: State maintaining the velocity of the parameters `X`, of same + * shape as `X`. + * + * Outputs: + * - X: Updated parameters X, of same shape as input X. + * - v: Updated velocity of the parameters X, of same shape as + * input v. + */ + v_prev = v + v = mu*v - lr*dX # update velocity + X = X - mu*v_prev + (1+mu)*v # update position, including momentum +} + +init = function(matrix[double] X) + return (matrix[double] v) { + /* + * Initialize the state for this optimizer. + * + * Note: This is just a convenience function, and state + * may be initialized manually if needed. + * + * Inputs: + * - X: Parameters to update, of shape (any, any). + * + * Outputs: + * - v: Initial velocity of the parameters `X`. 
+ */ + v = matrix(0, rows=nrow(X), cols=ncol(X)) +} + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/test/README.md ---------------------------------------------------------------------- diff --git a/scripts/nn/test/README.md b/scripts/nn/test/README.md new file mode 100644 index 0000000..b714d50 --- /dev/null +++ b/scripts/nn/test/README.md @@ -0,0 +1,32 @@ +<!-- +{% comment %} +Licensed to the Apache Software Foundation (ASF) under one or more +contributor license agreements. See the NOTICE file distributed with +this work for additional information regarding copyright ownership. +The ASF licenses this file to you under the Apache License, Version 2.0 +(the "License"); you may not use this file except in compliance with +the License. You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +{% endcomment %} +--> + +# SystemML-NN Tests + +#### This folder contains tests for the *SystemML-NN* (`nn`) deep learning library. + +--- +## Tests +#### All layers are tested for correct derivatives ("gradient-checking"), and many layers also have correctness tests against simpler reference implementations. +* `grad_check.dml` - Contains gradient-checks for all layers as individual DML functions. +* `test.dml` - Contains correctness tests for several of the more complicated layers by checking against simple reference implementations, such as `conv_simple.dml`. All tests are formulated as individual DML functions. +* `tests.dml` - A DML script that runs all of the tests in `grad_check.dml` and `test.dml`. + +## Execution +* `spark-submit SystemML.jar -f nn/test/tests.dml` from the base of the project. http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/test/conv2d_simple.dml ---------------------------------------------------------------------- diff --git a/scripts/nn/test/conv2d_simple.dml b/scripts/nn/test/conv2d_simple.dml new file mode 100644 index 0000000..9f126d0 --- /dev/null +++ b/scripts/nn/test/conv2d_simple.dml @@ -0,0 +1,213 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +/* + * 2D Convolutional layer. + * + * This implementation is intended to be a simple, reference version. 
+ */ + +forward = function(matrix[double] X, matrix[double] W, matrix[double] b, + int C, int Hin, int Win, int Hf, int Wf, + int strideh, int stridew, int padh, int padw) + return (matrix[double] out, int Hout, int Wout) { + /* + * Computes the forward pass for a 2D spatial convolutional layer with + * F filters. The input data has N examples, each represented as a 3D + * volume unrolled into a single vector. + * + * This implementation is intended to be a simple, reference version. + * + * Inputs: + * - X: Inputs, of shape (N, C*Hin*Win). + * - W: Weights, of shape (F, C*Hf*Wf). + * - b: Biases, of shape (F, 1). + * - C: Number of input channels (dimensionality of input depth). + * - Hin: Input height. + * - Win: Input width. + * - Hf: Filter height. + * - Wf: Filter width. + * - strideh: Stride over height. + * - stridew: Stride over width. + * - padh: Padding for top and bottom sides. + * - padw: Padding for left and right sides. + * + * Outputs: + * - out: Outputs, of shape (N, F*Hout*Wout). + * - Hout: Output height. + * - Wout: Output width. + */ + N = nrow(X) + F = nrow(W) + Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1)) + Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1)) + + # Create output volume + out = matrix(0, rows=N, cols=F*Hout*Wout) + + # Convolution - Simple reference implementation + parfor (n in 1:N) { # all examples + Xn = matrix(X[n,], rows=C, cols=Hin*Win) + # Pad image + Xn_padded = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw)) # zeros + parfor (c in 1:C) { + Xn_slice = matrix(Xn[c,], rows=Hin, cols=Win) # depth slice C reshaped + Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw) + Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice + Xn_padded[c,] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw)) # reshape + } + # Convolve image with filters + parfor (f in 1:F, check=0) { # all filters + parfor (hout in 1:Hout, check=0) { # all output rows + h0 = (hout-1)*strideh + 1 + parfor (wout in 1:Wout, check=0) { # all output columns + w0 = (wout-1)*stridew + 1 + # Create a patch of the input example corresponding spatially to the filter sizes + Xn_padded_patch = matrix(0, rows=C, cols=Hf*Wf) # zeros + parfor (c in 1:C, check=0) { + Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw) # reshape + Xn_padded_patch[c,] = matrix(Xn_padded_slice[h0:h0-1+Hf, w0:w0-1+Wf], rows=1, + cols=Hf*Wf) # reshape + } + out[n, (f-1)*Hout*Wout + (hout-1)*Wout + wout] = + W[f,] %*% matrix(Xn_padded_patch, rows=C*Hf*Wf, cols=1) + b[f,] + } + } + } + } +} + +backward = function(matrix[double] dout, int Hout, int Wout, + matrix[double] X, matrix[double] W, matrix[double] b, + int C, int Hin, int Win, int Hf, int Wf, + int strideh, int stridew, int padh, int padw) + return (matrix[double] dX, matrix[double] dW, matrix[double] db) { + /* + * Computes the backward pass for a 2D spatial convolutional layer + * with F filters. + * + * This implementation is intended to be a simple, reference version. + * + * Inputs: + * - dout: Gradient wrt `out` from upstream, of + * shape (N, F*Hout*Wout). + * - Hout: Output height. + * - Wout: Output width. + * - X: Inputs, of shape (N, C*Hin*Win). + * - W: Weights, of shape (F, C*Hf*Wf). + * - b: Biases, of shape (F, 1). + * - C: Number of input channels (dimensionality of input depth). + * - Hin: Input height. + * - Win: Input width. + * - Hf: Filter height. + * - Wf: Filter width. + * - strideh: Stride over height. + * - stridew: Stride over width. 
+ * - padh: Padding for top and bottom sides. + * - padw: Padding for left and right sides. + * + * Outputs: + * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win). + * - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf). + * - db: Gradient wrt `b`, of shape (F, 1). + */ + N = nrow(X) + F = nrow(W) + + # Create gradient volumes + dX = matrix(0, rows=N, cols=C*Hin*Win) + dW = matrix(0, rows=F, cols=C*Hf*Wf) + db = matrix(0, rows=F, cols=1) + + # Partial derivatives for convolution - Simple reference implementation + for (n in 1:N) { # all examples + Xn = matrix(X[n,], rows=C, cols=Hin*Win) + # Pad image + Xn_padded = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw)) # zeros + parfor (c in 1:C) { + Xn_slice = matrix(Xn[c,], rows=Hin, cols=Win) # depth slice C reshaped + Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw) + Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice + Xn_padded[c,] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw)) # reshape + } + dXn_padded = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw)) + for (f in 1:F) { # all filters + for (hout in 1:Hout) { # all output rows + h0 = (hout-1) * strideh + 1 + for (wout in 1:Wout) { # all output columns + w0 = (wout-1) * stridew + 1 + # Create a patch of the input example corresponding spatially to the filter sizes + Xn_padded_patch = matrix(0, rows=C, cols=Hf*Wf) # zeros + dXn_padded_patch = matrix(W[f,] * dout[n, (f-1)*Hout*Wout + (hout-1)*Wout + wout], + rows=C, cols=Hf*Wf) # reshape + for (c in 1:C) { + Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw) # reshape + Xn_padded_patch[c,] = matrix(Xn_padded_slice[h0:h0-1+Hf, w0:w0-1+Wf], + rows=1, cols=Hf*Wf) # reshape + dXn_padded_slice = matrix(0, rows=Hin+2*padh, cols=Win+2*padw) + dXn_padded_slice[h0:h0-1+Hf, w0:w0-1+Wf] = matrix(dXn_padded_patch[c,], + rows=Hf, cols=Wf) # reshape + dXn_padded[c,] = dXn_padded[c,] + matrix(dXn_padded_slice, + rows=1, cols=(Hin+2*padh)*(Win+2*padw)) + } + dW[f,] = dW[f,] + + matrix(Xn_padded_patch, rows=1, cols=C*Hf*Wf) + * dout[n, (f-1)*Hout*Wout + (hout-1)*Wout + wout] + db[f,] = db[f,] + dout[n, (f-1)*Hout*Wout + (hout-1)*Wout + wout] + } + } + } + # Unpad derivs on input + dXn = matrix(0, rows=C, cols=Hin*Win) + parfor (c in 1:C, check=0) { + dXn_padded_slice = matrix(dXn_padded[c,], rows=(Hin+2*padh), cols=(Win+2*padw)) + dXn_slice = dXn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] + dXn[c,] = matrix(dXn_slice, rows=1, cols=Hin*Win) + } + dX[n,] = matrix(dXn, rows=1, cols=C*Hin*Win) + } +} + +init = function(int F, int C, int Hf, int Wf) + return (matrix[double] W, matrix[double] b) { + /* + * Initialize the parameters of this layer. + * + * We use the heuristic by He et al., which limits the magnification + * of inputs/gradients during forward/backward passes by scaling + * unit-Gaussian weights by a factor of sqrt(2/n), under the + * assumption of relu neurons. + * - http://arxiv.org/abs/1502.01852 + * + * Inputs: + * - F: Number of filters. + * - C: Number of input channels (dimensionality of depth). + * - Hf: Filter height. + * - Wf: Filter width. + * + * Outputs: + * - W: Weights, of shape (F, C*Hf*Wf). + * - b: Biases, of shape (F, 1). + */ + W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf)) + b = matrix(0, rows=F, cols=1) +} +
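To show how these pieces compose, here is a minimal, illustrative DML sketch (again, not part of the patch) that wires the simple reference conv layer together with the Adam optimizer for a few updates on a dummy L2 objective. The toy sizes and the objective are choices made purely for illustration, and the sketch assumes the script runs with the `nn` directory on the path, as in the test README; the hyperparameter values follow the recommendations in the Adam docstring above.

```
# Hypothetical training-loop sketch: conv2d_simple + adam on a dummy objective.
source("nn/test/conv2d_simple.dml") as conv2d_simple
source("nn/optim/adam.dml") as adam

# Toy problem sizes (arbitrary).
N = 2    # examples
C = 3    # input channels
Hin = 5  # input height
Win = 5  # input width
F = 4    # filters
Hf = 3   # filter height
Wf = 3   # filter width

X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
[W, b] = conv2d_simple::init(F, C, Hf, Wf)

# Adam hyperparameters (recommended values from the docstring) and state.
lr = 0.001
beta1 = 0.9
beta2 = 0.999
epsilon = 1e-8
[mW, vW] = adam::init(W)
[mb, vb] = adam::init(b)

for (t in 0:4) {  # timestep starts at 0, as documented in adam::update
  # Forward pass with stride 1 and padding 1.
  [out, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, 1, 1, 1, 1)
  # Dummy objective: loss = 0.5*sum(out^2), so dloss/dout = out.
  dout = out
  [dX, dW, db] = conv2d_simple::backward(dout, Hout, Wout, X, W, b, C, Hin, Win,
                                         Hf, Wf, 1, 1, 1, 1)
  # Apply Adam updates to the weights and biases.
  [W, mW, vW] = adam::update(W, dW, lr, beta1, beta2, epsilon, t, mW, vW)
  [b, mb, vb] = adam::update(b, db, lr, beta1, beta2, epsilon, t, mb, vb)
  print("iteration " + t + ": loss = " + (0.5 * sum(out^2)))
}
```

In a real pipeline the gradient `dout` would come from downstream layers and a loss function, and the timestep `t` would be tracked across the whole training run rather than a short loop; the sketch only demonstrates the calling sequence of the layer and optimizer functions.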