http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/examples/mnist_softmax.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/examples/mnist_softmax.dml b/scripts/staging/SystemML-NN/nn/examples/mnist_softmax.dml deleted file mode 100644 index a529a12..0000000 --- a/scripts/staging/SystemML-NN/nn/examples/mnist_softmax.dml +++ /dev/null @@ -1,178 +0,0 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -/* - * MNIST Softmax Example - */ -# Imports -source("nn/layers/affine.dml") as affine -source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss -source("nn/layers/softmax.dml") as softmax -source("nn/optim/sgd_nesterov.dml") as sgd_nesterov - -train = function(matrix[double] X, matrix[double] y, - matrix[double] X_val, matrix[double] y_val, - int epochs) - return (matrix[double] W, matrix[double] b) { - /* - * Trains a softmax classifier. - * - * The input matrix, X, has N examples, each with D features. - * The targets, y, have K classes, and are one-hot encoded. - * - * Inputs: - * - X: Input data matrix, of shape (N, D). - * - y: Target matrix, of shape (N, K). - * - X_val: Input validation data matrix, of shape (N, C*Hin*Win). - * - y_val: Target validation matrix, of shape (N, K). - * - epochs: Total number of full training loops over the full data set. - * - * Outputs: - * - W: Weights (parameters) matrix, of shape (D, M). - * - b: Biases vector, of shape (1, M). 
- */ - N = nrow(X) # num examples - D = ncol(X) # num features - K = ncol(y) # num classes - - # Create softmax classifier: - # affine -> softmax - [W, b] = affine::init(D, K) - W = W / sqrt(2.0/(D)) * sqrt(1/(D)) - - # Initialize SGD w/ Nesterov momentum optimizer - lr = 0.2 # learning rate - mu = 0 # momentum - decay = 0.99 # learning rate decay constant - vW = sgd_nesterov::init(W) # optimizer momentum state for W - vb = sgd_nesterov::init(b) # optimizer momentum state for b - - # Optimize - print("Starting optimization") - batch_size = 50 - iters = 1000 #ceil(N / batch_size) - for (e in 1:epochs) { - for(i in 1:iters) { - # Get next batch - beg = ((i-1) * batch_size) %% N + 1 - end = min(N, beg + batch_size - 1) - X_batch = X[beg:end,] - y_batch = y[beg:end,] - - # Compute forward pass - ## affine & softmax: - out = affine::forward(X_batch, W, b) - probs = softmax::forward(out) - - # Compute loss & accuracy for training & validation data - loss = cross_entropy_loss::forward(probs, y_batch) - accuracy = mean(rowIndexMax(probs) == rowIndexMax(y_batch)) - probs_val = predict(X_val, W, b) - loss_val = cross_entropy_loss::forward(probs_val, y_val) - accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(y_val)) - print("Epoch: " + e + ", Iter: " + i + ", Train Loss: " + loss + ", Train Accuracy: " + - accuracy + ", Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val) - - # Compute backward pass - ## loss: - dprobs = cross_entropy_loss::backward(probs, y_batch) - ## affine & softmax: - dout = softmax::backward(dprobs, out) - [dX_batch, dW, db] = affine::backward(dout, X_batch, W, b) - - # Optimize with SGD w/ Nesterov momentum - [W, vW] = sgd_nesterov::update(W, dW, lr, mu, vW) - [b, vb] = sgd_nesterov::update(b, db, lr, mu, vb) - } - # Anneal momentum towards 0.999 - mu = mu + (0.999 - mu)/(1+epochs-e) - # Decay learning rate - lr = lr * decay - } -} - -predict = function(matrix[double] X, matrix[double] W, matrix[double] b) - return (matrix[double] probs) { - /* - * Computes the class probability predictions of a softmax classifier. - * - * The input matrix, X, has N examples, each with D features. - * - * Inputs: - * - X: Input data matrix, of shape (N, D). - * - W: Weights (parameters) matrix, of shape (D, M). - * - b: Biases vector, of shape (1, M). - * - * Outputs: - * - probs: Class probabilities, of shape (N, K). - */ - # Compute forward pass - ## affine & softmax: - out = affine::forward(X, W, b) - probs = softmax::forward(out) -} - -eval = function(matrix[double] probs, matrix[double] y) - return (double loss, double accuracy) { - /* - * Evaluates a softmax classifier. - * - * The probs matrix contains the class probability predictions - * of K classes over N examples. The targets, y, have K classes, - * and are one-hot encoded. - * - * Inputs: - * - probs: Class probabilities, of shape (N, K). - * - y: Target matrix, of shape (N, K). - * - * Outputs: - * - loss: Scalar loss, of shape (1). - * - accuracy: Scalar accuracy, of shape (1). - */ - # Compute loss & accuracy - loss = cross_entropy_loss::forward(probs, y) - correct_pred = rowIndexMax(probs) == rowIndexMax(y) - accuracy = mean(correct_pred) -} - -generate_dummy_data = function() - return (matrix[double] X, matrix[double] y, int C, int Hin, int Win) { - /* - * Generate a dummy dataset similar to the MNIST dataset. - * - * Outputs: - * - X: Input data matrix, of shape (N, D). - * - y: Target matrix, of shape (N, K). - * - C: Number of input channels (dimensionality of input depth). - * - Hin: Input height. 
- * - Win: Input width. - */ - # Generate dummy input data - N = 1024 # num examples - C = 1 # num input channels - Hin = 28 # input height - Win = 28 # input width - T = 10 # num targets - X = rand(rows=N, cols=C*Hin*Win, pdf="normal") - classes = round(rand(rows=N, cols=1, min=1, max=T, pdf="uniform")) - y = table(seq(1, N), classes) # one-hot encoding -} -
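The deleted mnist_softmax.dml above exposes four functions -- generate_dummy_data, train, predict, and eval -- that together form a complete softmax-regression workflow. Below is a minimal DML sketch of how those functions might be driven from a separate script; the source() path, the namespace alias, and the validation hold-out size are illustrative assumptions and not part of the original example.

source("nn/examples/mnist_softmax.dml") as mnist_softmax

# Generate a synthetic MNIST-like dataset and hold out part of it for validation
[X, y, C, Hin, Win] = mnist_softmax::generate_dummy_data()
N = nrow(X)
n_val = 200  # hypothetical hold-out size (~20% of the 1024 dummy examples)
X_val = X[1:n_val,]
y_val = y[1:n_val,]
tr_beg = n_val + 1
X_train = X[tr_beg:N,]
y_train = y[tr_beg:N,]

# Train the softmax classifier (affine -> softmax) for one epoch
[W, b] = mnist_softmax::train(X_train, y_train, X_val, y_val, 1)

# Score the held-out data
probs = mnist_softmax::predict(X_val, W, b)
[loss, accuracy] = mnist_softmax::eval(probs, y_val)
print("Validation loss: " + loss + ", validation accuracy: " + accuracy)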
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/affine.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/layers/affine.dml b/scripts/staging/SystemML-NN/nn/layers/affine.dml deleted file mode 100644 index c9a740b..0000000 --- a/scripts/staging/SystemML-NN/nn/layers/affine.dml +++ /dev/null @@ -1,92 +0,0 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -/* - * Affine (fully-connected) layer. - */ - -forward = function(matrix[double] X, matrix[double] W, matrix[double] b) - return (matrix[double] out) { - /* - * Computes the forward pass for an affine (fully-connected) layer - * with M neurons. The input data has N examples, each with D - * features. - * - * Inputs: - * - X: Inputs, of shape (N, D). - * - W: Weights, of shape (D, M). - * - b: Biases, of shape (1, M). - * - * Outputs: - * - out: Outputs, of shape (N, M). - */ - out = X %*% W + b -} - -backward = function(matrix[double] dout, matrix[double] X, - matrix[double] W, matrix[double] b) - return (matrix[double] dX, matrix[double] dW, matrix[double] db) { - /* - * Computes the backward pass for a fully-connected (affine) layer - * with M neurons. - * - * Inputs: - * - dout: Gradient wrt `out` from upstream, of shape (N, M). - * - X: Inputs, of shape (N, D). - * - W: Weights, of shape (D, M). - * - b: Biases, of shape (1, M). - * - * Outputs: - * - dX: Gradient wrt `X`, of shape (N, D). - * - dW: Gradient wrt `W`, of shape (D, M). - * - db: Gradient wrt `b`, of shape (1, M). - */ - dX = dout %*% t(W) - dW = t(X) %*% dout - db = colSums(dout) -} - -init = function(int D, int M) - return (matrix[double] W, matrix[double] b) { - /* - * Initialize the parameters of this layer. - * - * Note: This is just a convenience function, and parameters - * may be initialized manually if needed. - * - * We use the heuristic by He et al., which limits the magnification - * of inputs/gradients during forward/backward passes by scaling - * unit-Gaussian weights by a factor of sqrt(2/n), under the - * assumption of relu neurons. - * - http://arxiv.org/abs/1502.01852 - * - * Inputs: - * - D: Dimensionality of the input features (number of features). - * - M: Number of neurons in this layer. - * - * Outputs: - * - W: Weights, of shape (D, M). - * - b: Biases, of shape (1, M). 
- */ - W = rand(rows=D, cols=M, pdf="normal") * sqrt(2.0/D) - b = matrix(0, rows=1, cols=M) -} - http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/batch_norm1d.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/layers/batch_norm1d.dml b/scripts/staging/SystemML-NN/nn/layers/batch_norm1d.dml deleted file mode 100644 index 2ccffdb..0000000 --- a/scripts/staging/SystemML-NN/nn/layers/batch_norm1d.dml +++ /dev/null @@ -1,210 +0,0 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -/* - * 1D Batch Normalization layer. - */ - -forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta, - string mode, matrix[double] ema_mean, matrix[double] ema_var, - double mu, double epsilon) - return (matrix[double] out, matrix[double] ema_mean_upd, matrix[double] ema_var_upd, - matrix[double] cache_mean, matrix[double] cache_var, matrix[double] cache_norm) { - /* - * Computes the forward pass for a 1D batch normalization layer. - * The input data has N examples, each with D features. - * - * A batch normalization layer uses the per-feature sample mean and - * per-feature uncorrected sample variance during training to - * normalize each feature of the input data. Additionally, it - * introduces learnable parameters (gamma, beta) to control the - * amount of normalization. - * - * `y = ((x-mean) / sqrt(var+eps)) * gamma + beta` - * - * This implementation maintains exponential moving averages of the - * mean and variance during training for use during testing. - * - * Reference: - * - Batch Normalization: Accelerating Deep Network Training by - * Reducing Internal Covariate Shift, S. Ioffe & C. Szegedy, 2015 - * - https://arxiv.org/abs/1502.03167 - * - * Inputs: - * - X: Inputs, of shape (N, D). - * - gamma: Scale parameters, of shape (1, D). - * - beta: Shift parameters, of shape (1, D). - * - mode: 'train' or 'test' to indicate if the model is currently - * being trained or tested. During training, the current batch - * mean and variance will be used to normalize the inputs, while - * during testing, the exponential average of the mean and - * variance over all previous batches will be used. - * - ema_mean: Exponential moving average of the mean, of - * shape (1, D). - * - ema_var: Exponential moving average of the variance, of - * shape (1, D). - * - mu: Momentum value for moving averages. - * Typical values are in the range of [0.9, 0.999]. - * - epsilon: Smoothing term to avoid divide by zero errors. - * Typical values are in the range of [1e-5, 1e-3]. 
- * - * Outputs: - * - out: Outputs, of shape (N, D). - * - ema_mean_upd: Updated exponential moving average of the mean, - * of shape (1, D). - * - ema_var_upd: Updated exponential moving average of the variance, - * of shape (1, D). - * - cache_mean: Cache of the batch mean, of shape (1, D). - * Note: This is used for performance during training. - * - cache_var: Cache of the batch variance, of shape (1, D). - * Note: This is used for performance during training. - * - cache_norm: Cache of the normalized inputs, of shape (N, D). - * Note: This is used for performance during training. - */ - N = nrow(X) - - if (mode == 'train') { - # Compute feature-wise mean and variance - mean = colMeans(X) # shape (1, D) - # var = (1/N) * colSums((X-mean)^2) - var = colVars(X) * ((N-1)/N) # compute uncorrected variance, of shape (1, D) - # Update moving averages - ema_mean_upd = mu*ema_mean + (1-mu)*mean - ema_var_upd = mu*ema_var + (1-mu)*var - } - else { - # Use moving averages of mean and variance during testing - mean = ema_mean - var = ema_var - ema_mean_upd = ema_mean - ema_var_upd = ema_var - } - - # Normalize, shift, and scale - # norm = (X-mean)*(var+epsilon)^(-1/2) - norm = (X-mean) / sqrt(var+epsilon) # shape (N, D) - out = norm*gamma + beta # shape (N, D) - - # Save variable for backward pass - cache_mean = mean - cache_var = var - cache_norm = norm -} - -backward = function(matrix[double] dout, matrix[double] out, - matrix[double] ema_mean_upd, matrix[double] ema_var_upd, - matrix[double] cache_mean, matrix[double] cache_var, matrix[double] cache_norm, - matrix[double] X, matrix[double] gamma, matrix[double] beta, - string mode, matrix[double] ema_mean, matrix[double] ema_var, - double mu, double epsilon) - return (matrix[double] dX, matrix[double] dgamma, matrix[double] dbeta) { - /* - * Computes the backward pass for a 1D batch normalization layer. - * - * Inputs: - * - dout: Gradient wrt `out` from upstream, of shape (N, D). - * - out: Outputs from the forward pass, of shape (N, D). - * - ema_mean_upd: Updated exponential moving average of the mean - * from the forward pass, of shape (1, D). - * - ema_var_upd: Updated exponential moving average of the variance - * from the forward pass, of shape (1, D). - * - cache_mean: Cache of the batch mean from the forward pass, of - * shape (1, D). Note: This is used for performance during - * training. - * - cache_var: Cache of the batch variance from the forward pass, - * of shape (1, D). Note: This is used for performance during - * training. - * - cache_norm: Cache of the normalized inputs from the forward - * pass, of shape (N, D). Note: This is used for performance - * during training. - * - X: Inputs, of shape (N, D). - * - gamma: Scale parameters, of shape (1, D). - * - beta: Shift parameters, of shape (1, D). - * - mode: 'train' or 'test' to indicate if the model is currently - * being trained or tested. During training, the current batch - * mean and variance will be used to normalize the inputs, while - * during testing, the exponential average of the mean and - * variance over all previous batches will be used. - * - ema_mean: Exponential moving average of the mean, of - * shape (1, D). - * - ema_var: Exponential moving average of the variance, of - * shape (1, D). - * - mu: Momentum value for moving averages. - * Typical values are in the range of [0.9, 0.999]. - * - epsilon: Smoothing term to avoid divide by zero errors. - * Typical values are in the range of [1e-5, 1e-3]. - * - * Outputs: - * - dX: Gradient wrt `X`, of shape (N, D). 
- * - dgamma: Gradient wrt `W`, of shape (1, D). - * - dbeta: Gradient wrt `b`, of shape (1, D). - * - */ - N = nrow(X) - mean = cache_mean - var = cache_var - norm = cache_norm - centered = X-mean - - if (mode == 'train') { - # Compute gradients during training - dgamma = colSums(dout*norm) # shape (1, D) - dbeta = colSums(dout) # shape (1, D) - dnorm = dout * gamma # shape (N, D) - dvar = (-1/2) * colSums(centered * (var+epsilon)^(-3/2) * dnorm) # shape (1, D) - dmean = colSums((-dnorm/sqrt(var+epsilon)) + ((-2/N)*centered*dvar)) # shape (1, D) - dX = (dnorm/sqrt(var+epsilon)) + ((2/N)*centered*dvar) + ((1/N)*dmean) # shape (N, D) - } - else { - # Compute gradients during testing - dgamma = colSums(dout*norm) # shape (1, D) - dbeta = colSums(dout) # shape (1, D) - dnorm = dout * gamma # shape (N, D) - dX = dnorm / sqrt(var+epsilon) # shape (N, D) - } -} - -init = function(int D) - return (matrix[double] gamma, matrix[double] beta, - matrix[double] ema_mean, matrix[double] ema_var) { - /* - * Initialize the parameters of this layer. - * - * Note: This is just a convenience function, and parameters - * may be initialized manually if needed. - * - * Inputs: - * - D: Dimensionality of the input features (number of features). - * - * Outputs: - * - gamma: Scale parameters, of shape (1, D). - * - beta: Shift parameters, of shape (1, D). - * - ema_mean: Exponential moving average of the mean, of - * shape (1, D). - * - ema_var: Exponential moving average of the variance, of - * shape (1, D). - */ - gamma = matrix(1, rows=1, cols=D) - beta = matrix(0, rows=1, cols=D) - ema_mean = matrix(0, rows=1, cols=D) - ema_var = matrix(1, rows=1, cols=D) -} - http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/batch_norm2d.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/layers/batch_norm2d.dml b/scripts/staging/SystemML-NN/nn/layers/batch_norm2d.dml deleted file mode 100644 index 49c6746..0000000 --- a/scripts/staging/SystemML-NN/nn/layers/batch_norm2d.dml +++ /dev/null @@ -1,238 +0,0 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -/* - * 2D (Spatial) Batch Normalization layer. 
- */ -source("nn/util.dml") as util - -forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta, - int C, int Hin, int Win, string mode, - matrix[double] ema_mean, matrix[double] ema_var, - double mu, double epsilon) - return (matrix[double] out, matrix[double] ema_mean_upd, matrix[double] ema_var_upd, - matrix[double] cache_mean, matrix[double] cache_var, matrix[double] cache_norm) { - /* - * Computes the forward pass for a 2D (spatial) batch normalization - * layer. The input data has N examples, each represented as a 3D - * volume unrolled into a single vector. - * - * A spatial batch normalization layer uses the per-channel sample - * mean and per-channel uncorrected sample variance during training - * to normalize each channel of the input data. Additionally, it - * introduces learnable parameters (gamma, beta) to control the - * amount of normalization. - * - * `y = ((x-mean) / sqrt(var+eps)) * gamma + beta` - * - * This implementation maintains exponential moving averages of the - * mean and variance during training for use during testing. - * - * Reference: - * - Batch Normalization: Accelerating Deep Network Training by - * Reducing Internal Covariate Shift, S. Ioffe & C. Szegedy, 2015 - * - https://arxiv.org/abs/1502.03167 - * - * Inputs: - * - X: Inputs, of shape (N, C*Hin*Win). - * - gamma: Scale parameters, of shape (C, 1). - * - beta: Shift parameters, of shape (C, 1). - * - C: Number of input channels (dimensionality of input depth). - * - Hin: Input height. - * - Win: Input width. - * - mode: 'train' or 'test' to indicate if the model is currently - * being trained or tested. During training, the current batch - * mean and variance will be used to normalize the inputs, while - * during testing, the exponential average of the mean and - * variance over all previous batches will be used. - * - ema_mean: Exponential moving average of the mean, of - * shape (C, 1). - * - ema_var: Exponential moving average of the variance, of - * shape (C, 1). - * - mu: Momentum value for moving averages. - * Typical values are in the range of [0.9, 0.999]. - * - epsilon: Smoothing term to avoid divide by zero errors. - * Typical values are in the range of [1e-5, 1e-3]. - * - * Outputs: - * - out: Outputs, of shape (N, C*Hin*Win). - * - ema_mean_upd: Updated exponential moving average of the mean, - * of shape (C, 1). - * - ema_var_upd: Updated exponential moving average of the variance, - * of shape (C, 1). - * - cache_mean: Cache of the batch mean, of shape (C, 1). - * Note: This is used for performance during training. - * - cache_var: Cache of the batch variance, of shape (C, 1). - * Note: This is used for performance during training. - * - cache_norm: Cache of the normalized inputs, of - * shape (C, N*Hin*Win). Note: This is used for performance - * during training. - */ - N = nrow(X) - - if (mode == 'train') { - # Compute channel-wise mean and variance - # Since we don't have tensors, we will compute the means and variances in a piece-wise fashion. 
- # - mean of total group is mean of subgroup means - # - variance is the mean of the subgroup variances + the variance of the subgroup means - subgrp_means = matrix(colMeans(X), rows=C, cols=Hin*Win) - subgrp_vars = matrix(colVars(X) * ((N-1)/N), rows=C, cols=Hin*Win) # uncorrected variances - mean = rowMeans(subgrp_means) # shape (C, 1) - var = rowMeans(subgrp_vars) + rowVars(subgrp_means)*(((Hin*Win)-1)/(Hin*Win)) # shape (C, 1) - # Update moving averages - ema_mean_upd = mu*ema_mean + (1-mu)*mean - ema_var_upd = mu*ema_var + (1-mu)*var - } - else { - # Use moving averages of mean and variance during testing - mean = ema_mean - var = ema_var - ema_mean_upd = ema_mean - ema_var_upd = ema_var - } - - # Normalize, shift, and scale - # norm = (X-mean)*(var+epsilon)^(-1/2) - # = (X-mean) / sqrt(var+epsilon) - centered = bias_add(X, -mean) # shape (N, C*Hin*Win) - norm = bias_multiply(centered, 1/sqrt(var+epsilon)) # shape (N, C*Hin*Win) - # out = norm*gamma + beta - scaled = bias_multiply(norm, gamma) # shape (N, C*Hin*Win) - out = bias_add(scaled, beta) # shape (N, C*Hin*Win) - - # Save variable for backward pass - cache_mean = mean - cache_var = var - cache_norm = norm -} - -backward = function(matrix[double] dout, matrix[double] out, - matrix[double] ema_mean_upd, matrix[double] ema_var_upd, - matrix[double] cache_mean, matrix[double] cache_var, matrix[double] cache_norm, - matrix[double] X, matrix[double] gamma, matrix[double] beta, - int C, int Hin, int Win, string mode, - matrix[double] ema_mean, matrix[double] ema_var, - double mu, double epsilon) - return (matrix[double] dX, matrix[double] dgamma, matrix[double] dbeta) { - /* - * Computes the backward pass for a 2D (spatial) batch normalization - * layer. - * - * Inputs: - * - dout: Gradient wrt `out` from upstream, of shape (N, C*Hin*Win). - * - out: Outputs from the forward pass, of shape (N, C*Hin*Win). - * - ema_mean_upd: Updated exponential moving average of the mean - * from the forward pass, of shape (C, 1). - * - ema_var_upd: Updated exponential moving average of the variance - * from the forward pass, of shape (C, 1). - * - cache_mean: Cache of the batch mean from the forward pass, of - * shape (C, 1). Note: This is used for performance during - * training. - * - cache_var: Cache of the batch variance from the forward pass, - * of shape (C, 1). Note: This is used for performance during - * training. - * - cache_norm: Cache of the normalized inputs from the forward - * pass, of shape (C, N*Hin*Win). Note: This is used for - * performance during training. - * - X: Input data matrix to the forward pass, of - * shape (N, C*Hin*Win). - * - gamma: Scale parameters, of shape (C, 1). - * - beta: Shift parameters, of shape (C, 1). - * - C: Number of input channels (dimensionality of input depth). - * - Hin: Input height. - * - Win: Input width. - * - mode: 'train' or 'test' to indicate if the model is currently - * being trained or tested. During training, the current batch - * mean and variance will be used to normalize the inputs, while - * during testing, the exponential average of the mean and - * variance over all previous batches will be used. - * - ema_mean: Exponential moving average of the mean, of - * shape (C, 1). - * - ema_var: Exponential moving average of the variance, of - * shape (C, 1). - * - mu: Momentum value for moving averages. - * Typical values are in the range of [0.9, 0.999]. - * - epsilon: Smoothing term to avoid divide by zero errors. - * Typical values are in the range of [1e-5, 1e-3]. 
- * - * Outputs: - * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win). - * - dgamma: Gradient wrt `W`, of shape (C, 1). - * - dbeta: Gradient wrt `b`, of shape (C, 1). - * - */ - N = nrow(X) - mean = cache_mean - var = cache_var - norm = cache_norm - centered = bias_add(X, -mean) # shape (N, C*Hin*Win) - - if (mode == 'train') { - # Compute gradients during training - dgamma = util::channel_sums(dout*norm, C, Hin, Win) # shape (C, 1) - dbeta = util::channel_sums(dout, C, Hin, Win) # shape (C, 1) - dnorm = bias_multiply(dout, gamma) # shape (N, C*Hin*Win) - dvar = util::channel_sums((-1/2) * bias_multiply(centered, (var+epsilon)^(-3/2)) * dnorm, - C, Hin, Win) # shape (C, 1) - dmean_norm_branch = util::channel_sums(bias_multiply(dnorm, -1/sqrt(var+epsilon)), C, Hin, Win) - dmean_var_branch = util::channel_sums((-2/(N*Hin*Win)) * centered, C, Hin, Win) - dmean_var_branch = dmean_var_branch * dvar # we can't use a function within an expression yet - dmean = dmean_norm_branch + dmean_var_branch # shape (C, 1) - dX_norm_branch = bias_multiply(dnorm, 1/sqrt(var+epsilon)) - dX_mean_branch = (1/(N*Hin*Win)) * bias_add(matrix(0, rows=1, cols=C*Hin*Win), dmean) - dX_var_branch = (2/(N*Hin*Win)) * bias_multiply(centered, dvar) - dX = dX_norm_branch + dX_mean_branch + dX_var_branch # shape (N, C*Hin*Win) - } - else { - # Compute gradients during testing - dgamma = util::channel_sums(dout*norm, C, Hin, Win) # shape (C, 1) - dbeta = util::channel_sums(dout, C, Hin, Win) # shape (C, 1) - dnorm = bias_multiply(dout, gamma) # shape (N, C*Hin*Win) - dX = bias_multiply(dnorm, 1/sqrt(var+epsilon)) # shape (N, C*Hin*Win) - } -} - -init = function(int C) - return (matrix[double] gamma, matrix[double] beta, - matrix[double] ema_mean, matrix[double] ema_var) { - /* - * Initialize the parameters of this layer. - * - * Note: This is just a convenience function, and parameters - * may be initialized manually if needed. - * - * Inputs: - * - C: Number of input channels (dimensionality of input depth). - * - * Outputs: - * - gamma: Scale parameters, of shape (C, 1). - * - beta: Shift parameters, of shape (C, 1). - * - ema_mean: Exponential moving average of the mean, of - * shape (C, 1). - * - ema_var: Exponential moving average of the variance, of - * shape (C, 1). - */ - gamma = matrix(1, rows=C, cols=1) - beta = matrix(0, rows=C, cols=1) - ema_mean = matrix(0, rows=C, cols=1) - ema_var = matrix(1, rows=C, cols=1) -} - http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/conv2d.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/layers/conv2d.dml b/scripts/staging/SystemML-NN/nn/layers/conv2d.dml deleted file mode 100644 index 9d03568..0000000 --- a/scripts/staging/SystemML-NN/nn/layers/conv2d.dml +++ /dev/null @@ -1,194 +0,0 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -/* - * 2D Convolutional layer. - */ -source("nn/util.dml") as util - -forward = function(matrix[double] X, matrix[double] W, matrix[double] b, - int C, int Hin, int Win, int Hf, int Wf, - int strideh, int stridew, int padh, int padw) - return (matrix[double] out, int Hout, int Wout) { - /* - * Computes the forward pass for a 2D spatial convolutional layer with - * F filters. The input data has N examples, each represented as a 3D - * volume unrolled into a single vector. - * - * This implementation uses `im2col` internally for each image to - * extract local image regions (patches) into columns, and then - * performs a matrix multiplication with the filters to compute the - * output maps. - * - * Inputs: - * - X: Inputs, of shape (N, C*Hin*Win). - * - W: Weights, of shape (F, C*Hf*Wf). - * - b: Biases, of shape (F, 1). - * - C: Number of input channels (dimensionality of input depth). - * - Hin: Input height. - * - Win: Input width. - * - Hf: Filter height. - * - Wf: Filter width. - * - strideh: Stride over height. - * - stridew: Stride over width. - * - padh: Padding for top and bottom sides. - * For same output height as input, set `padh = (Hf - 1) / 2`, - * assuming `strideh = 1`. - * More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2` - * preserves the spatial dimensions of the input. - * - padw: Padding for left and right sides. - * For same output width as input, set `padw = (Wf - 1) / 2`, - * assuming `stridew = 1`. - * More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2` - * preserves the spatial dimensions of the input. - * - * Outputs: - * - out: Outputs, of shape (N, F*Hout*Wout). - * - Hout: Output height. - * - Wout: Output width. - */ - N = nrow(X) - F = nrow(W) - Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1)) - Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1)) - - # Create output volume - out = matrix(0, rows=N, cols=F*Hout*Wout) - - # Convolution - im2col implementation - parfor (n in 1:N) { # all examples - Xn = matrix(X[n,], rows=C, cols=Hin*Win) # reshape - - # Pad image - Xn_padded = util::pad_image(Xn, Hin, Win, padh, padw, 0) # shape (C, (Hin+2*padh)*(Win+2*padw)) - - # Extract local image patches into columns with im2col, of shape (C*Hf*Wf, Hout*Wout) - Xn_padded_cols = util::im2col(Xn_padded, Hin+2*padh, Win+2*padw, Hf, Wf, strideh, stridew) - - # Convolve patches with filters - outn = W %*% Xn_padded_cols + b # shape (F, Hout*Wout) - out[n,] = matrix(outn, rows=1, cols=F*Hout*Wout) # reshape - } -} - -backward = function(matrix[double] dout, int Hout, int Wout, - matrix[double] X, matrix[double] W, matrix[double] b, - int C, int Hin, int Win, int Hf, int Wf, - int strideh, int stridew, int padh, int padw) - return (matrix[double] dX, matrix[double] dW, matrix[double] db) { - /* - * Computes the backward pass for a 2D spatial convolutional layer - * with F filters. - * - * This implementation uses `im2col` and `col2im` internally. - * - * Inputs: - * - dout: Gradient wrt `out` from upstream, of - * shape (N, F*Hout*Wout). - * - Hout: Output height. 
- * - Wout: Output width. - * - X: Inputs, of shape (N, C*Hin*Win). - * - W: Weights, of shape (F, C*Hf*Wf). - * - b: Biases, of shape (F, 1). - * - C: Number of input channels (dimensionality of input depth). - * - Hin: Input height. - * - Win: Input width. - * - Hf: Filter height. - * - Wf: Filter width. - * - strideh: Stride over height. - * - stridew: Stride over width. - * - padh: Padding for top and bottom sides. - * - padw: Padding for left and right sides. - * - * Outputs: - * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win). - * - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf). - * - db: Gradient wrt `b`, of shape (F, 1). - */ - N = nrow(X) - F = nrow(W) - - # Create gradient volumes - # Note: Create convenience gradient volumes for dW and db that will - # allow for one gradient to be stored per example, allowing for - # parallel computation at the expense of memory. We will reduce at - # the end. - dX = matrix(0, rows=N, cols=C*Hin*Win) - dWN = matrix(0, rows=N, cols=F*C*Hf*Wf) # dW = matrix(0, rows=F, cols=C*Hf*Wf) - dbN = matrix(0, rows=N, cols=F) # db = matrix(0, rows=F, cols=1) - - # Partial derivatives for convolution - im2col implementation - parfor (n in 1:N) { # all examples - doutn = matrix(dout[n,], rows=F, cols=Hout*Wout) - - # Compute dW - Xn = matrix(X[n,], rows=C, cols=Hin*Win) # reshape - Xn_padded = util::pad_image(Xn, Hin, Win, padh, padw, 0) # shape (C, (Hin+2*padh)*(Win+2*padw)) - Xn_padded_cols = util::im2col(Xn_padded, Hin+2*padh, Win+2*padw, Hf, Wf, strideh, stridew) - # dW = dW + doutn %*% t(Xn_padded_cols) - dWN[n,] = matrix(doutn %*% t(Xn_padded_cols), rows=1, cols=F*C*Hf*Wf) - - # Compute db - # db = db + rowSums(doutn) - dbN[n,] = matrix(rowSums(doutn), rows=1, cols=F) - - # Compute dX - dXn_padded_cols = t(W) %*% doutn # shape (C*Hf*Wf, Hout*Wout) - dXn_padded = util::col2im(dXn_padded_cols, C, Hin+2*padh, Win+2*padw, Hf, Wf, - strideh, stridew, "add") - dXn = util::unpad_image(dXn_padded, Hin, Win, padh, padw) - dX[n,] = matrix(dXn, rows=1, cols=C*Hin*Win) # reshape - } - - # Reduce convenience gradient volumes with one gradient per example - # into single gradients for W and b. - dW = matrix(colSums(dWN), rows=F, cols=C*Hf*Wf) - db = matrix(colSums(dbN), rows=F, cols=1) -} - -init = function(int F, int C, int Hf, int Wf) - return (matrix[double] W, matrix[double] b) { - /* - * Initialize the parameters of this layer. - * - * Note: This is just a convenience function, and parameters - * may be initialized manually if needed. - * - * We use the heuristic by He et al., which limits the magnification - * of inputs/gradients during forward/backward passes by scaling - * unit-Gaussian weights by a factor of sqrt(2/n), under the - * assumption of relu neurons. - * - http://arxiv.org/abs/1502.01852 - * - * Inputs: - * - F: Number of filters. - * - C: Number of input channels (dimensionality of depth). - * - Hf: Filter height. - * - Wf: Filter width. - * - * Outputs: - * - W: Weights, of shape (F, C*Hf*Wf). - * - b: Biases, of shape (F, 1). 
- */ - W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf)) - b = matrix(0, rows=F, cols=1) -} - http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/conv2d_builtin.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/layers/conv2d_builtin.dml b/scripts/staging/SystemML-NN/nn/layers/conv2d_builtin.dml deleted file mode 100644 index bda7a9c..0000000 --- a/scripts/staging/SystemML-NN/nn/layers/conv2d_builtin.dml +++ /dev/null @@ -1,160 +0,0 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -/* - * 2D Convolutional layer. - * - * This implementation uses a built-in operator for higher performance. - */ - -forward = function(matrix[double] X, matrix[double] W, matrix[double] b, - int C, int Hin, int Win, int Hf, int Wf, - int strideh, int stridew, int padh, int padw) - return (matrix[double] out, int Hout, int Wout) { - /* - * Computes the forward pass for a 2D spatial convolutional layer with - * F filters. The input data has N examples, each represented as a 3D - * volume unrolled into a single vector. - * - * This implementation uses a built-in operator for higher - * performance. - * - * Inputs: - * - X: Inputs, of shape (N, C*Hin*Win). - * - W: Weights, of shape (F, C*Hf*Wf). - * - b: Biases, of shape (F, 1). - * - C: Number of input channels (dimensionality of depth). - * - Hin: Input height. - * - Win: Input width. - * - Hf: Filter height. - * - Wf: Filter width. - * - strideh: Stride over height. - * - stridew: Stride over width. - * - padh: Padding for top and bottom sides. - * For same output height as input, set `padh = (Hf - 1) / 2`, - * assuming `strideh = 1`. - * More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2` - * preserves the spatial dimensions of the input. - * - padw: Padding for left and right sides. - * For same output width as input, set `padw = (Wf - 1) / 2`, - * assuming `stridew = 1`. - * More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2` - * preserves the spatial dimensions of the input. - * - * Outputs: - * - out: Outputs, of shape (N, F*Hout*Wout). - * - Hout: Output height. - * - Wout: Output width. 
- */ - N = nrow(X) - F = nrow(W) - Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1)) - Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1)) - - # Convolution - built-in implementation - out = conv2d(X, W, input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf], - stride=[strideh,stridew], padding=[padh,padw]) - - # Add bias term to each output filter - out = bias_add(out, b) -} - -backward = function(matrix[double] dout, int Hout, int Wout, - matrix[double] X, matrix[double] W, matrix[double] b, - int C, int Hin, int Win, int Hf, int Wf, - int strideh, int stridew, int padh, int padw) - return (matrix[double] dX, matrix[double] dW, matrix[double] db) { - /* - * Computes the backward pass for a 2D spatial convolutional layer - * with F filters. - * - * Inputs: - * - dout: Gradient wrt `out` from upstream, of - * shape (N, F*Hout*Wout). - * - Hout: Output height. - * - Wout: Output width. - * - X: Inputs, of shape (N, C*Hin*Win). - * - W: Weights, of shape (F, C*Hf*Wf). - * - b: Biases, of shape (F, 1). - * - C: Number of input channels (dimensionality of depth). - * - Hin: Input height. - * - Win: Input width. - * - Hf: Filter height. - * - Wf: Filter width. - * - strideh: Stride over height. - * - stridew: Stride over width. - * - padh: Padding for top and bottom sides. - * For same output height as input, set `padh = (Hf - 1) / 2`, - * assuming `strideh = 1`. - * More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2` - * preserves the spatial dimensions of the input. - * - padw: Padding for left and right sides. - * For same output width as input, set `padw = (Wf - 1) / 2`, - * assuming `stridew = 1`. - * More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2` - * preserves the spatial dimensions of the input. - * - * Outputs: - * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win). - * - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf). - * - db: Gradient wrt `b`, of shape (F, 1). - */ - N = nrow(X) - F = nrow(W) - - # Partial derivatives for convolution - built-in implementation - dW = conv2d_backward_filter(X, dout, stride=[strideh,stridew], padding=[padh,padw], - input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf]) - dX = conv2d_backward_data(W, dout, stride=[strideh, stridew], padding=[padh,padw], - input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf]) - - # Partial derivatives for bias vector - db = rowSums(matrix(colSums(dout), rows=F, cols=Hout*Wout)) -} - -init = function(int F, int C, int Hf, int Wf) - return (matrix[double] W, matrix[double] b) { - /* - * Initialize the parameters of this layer. - * - * Note: This is just a convenience function, and parameters - * may be initialized manually if needed. - * - * We use the heuristic by He et al., which limits the magnification - * of inputs/gradients during forward/backward passes by scaling - * unit-Gaussian weights by a factor of sqrt(2/n), under the - * assumption of relu neurons. - * - http://arxiv.org/abs/1502.01852 - * - * Inputs: - * - F: Number of filters. - * - C: Number of input channels (dimensionality of depth). - * - Hf: Filter height. - * - Wf: Filter width. - * - * Outputs: - * - W: Weights, of shape (F, C*Hf*Wf). - * - b: Biases, of shape (F, 1). 
- */ - W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf)) - b = matrix(0, rows=F, cols=1) -} - http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/cross_entropy_loss.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/layers/cross_entropy_loss.dml b/scripts/staging/SystemML-NN/nn/layers/cross_entropy_loss.dml deleted file mode 100644 index 63db502..0000000 --- a/scripts/staging/SystemML-NN/nn/layers/cross_entropy_loss.dml +++ /dev/null @@ -1,78 +0,0 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -/* - * Cross-Entropy loss function. - */ - -forward = function(matrix[double] pred, matrix[double] y) - return (double loss) { - /* - * Computes the forward pass for a cross-entropy loss function. The - * inputs consist of N examples, each with K dimensions corresponding - * to normalized probabilities of K classes. - * - * ``` - * L_i = -y_i^T * log(pred_i) - * L = (1/N) sum(L_i) for i=1 to N - * ``` - * - * In these equations, `L` is the total loss, `L_i` is the loss for - * example `i`, `y_i` is the K-dimensional vector of target class - * probabilities, `pred_i` is K-dimensional vector of predicted - * class probabilities, and `N` is the number of examples. - * - * This can be interpreted as the negative log-likelihood assuming - * a Bernoulli distribution generalized to K dimensions, or a - * Multinomial with one observation. - * - * Inputs: - * - pred: Predictions, of shape (N, K). - * - y: Targets, of shape (N, K). - * - * Outputs: - * - loss: Average loss. - */ - N = nrow(y) - eps = 1e-10 # numerical stability to avoid log(0) - losses = rowSums(-y * log(pred+eps)) - loss = sum(losses) / N -} - -backward = function(matrix[double] pred, matrix[double] y) - return (matrix[double] dpred) { - /* - * Computes the backward pass of a cross-entropy loss function. The - * inputs consist of N examples, each with K dimensions corresponding - * to normalized probabilities of K classes. - * - * Inputs: - * - pred: Predictions, of shape (N, K). - * - y: Targets, of shape (N, K). - * - * Outputs: - * - dpred: Gradient wrt `pred`, of shape (N, K). 
- */ - N = nrow(y) - eps = 1e-10 # numerical stability to avoid divide-by-zero - dpred = (1/N) * -y * (1/(pred+eps)) -} - http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/dropout.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/layers/dropout.dml b/scripts/staging/SystemML-NN/nn/layers/dropout.dml deleted file mode 100644 index a36878b..0000000 --- a/scripts/staging/SystemML-NN/nn/layers/dropout.dml +++ /dev/null @@ -1,76 +0,0 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -/* - * Dropout layer. - */ - -forward = function(matrix[double] X, double p, int seed) - return (matrix[double] out, matrix[double] mask) { - /* - * Computes the forward pass for an inverted dropout layer. - * - * Drops the inputs element-wise with a probability p, and divides - * by p to maintain the expected values of those inputs (which are - * the outputs of neurons) at test time. - * - * Inputs: - * - X: Inputs, of shape (any, any). - * - p: Probability of keeping a neuron output. - * - seed: [Optional: -1] Random number generator seed to allow for - * deterministic evaluation. Set to -1 for a random seed. - * - * Outputs: - * - out: Outputs, of same shape as `X`. - * - mask: Dropout mask used to compute the output. - */ - # Normally, we might use something like - # `mask = rand(rows=nrow(X), cols=ncol(X), min=0, max=1, seed=seed) <= p` - # to create a dropout mask. Fortunately, SystemML has a `sparsity` parameter on - # the `rand` function that allows use to create a mask directly. - if (seed == -1) { - mask = rand(rows=nrow(X), cols=ncol(X), min=1, max=1, sparsity=p) - } else { - mask = rand(rows=nrow(X), cols=ncol(X), min=1, max=1, sparsity=p, seed=seed) - } - out = X * mask / p -} - -backward = function(matrix[double] dout, matrix[double] X, double p, matrix[double] mask) - return (matrix[double] dX) { - /* - * Computes the backward pass for an inverted dropout layer. - * - * Applies the mask to the upstream gradient, and divides by p to - * maintain the expected values at test time. - * - * Inputs: - * - dout: Gradient wrt `out`, of same shape as `X`. - * - X: Inputs, of shape (any, any). - * - p: Probability of keeping a neuron output. - * - mask: Dropout mask used to compute the output. - * - * Outputs: - * - dX: Gradient wrt `X`, of same shape as `X`. 
- */ - dX = mask / p * dout -} - http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/l1_loss.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/layers/l1_loss.dml b/scripts/staging/SystemML-NN/nn/layers/l1_loss.dml deleted file mode 100644 index b74566d..0000000 --- a/scripts/staging/SystemML-NN/nn/layers/l1_loss.dml +++ /dev/null @@ -1,72 +0,0 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -/* - * L1 loss function. - */ - -forward = function(matrix[double] pred, matrix[double] y) - return (double loss) { - /* - * Computes the forward pass for an L1 loss function. The inputs - * consist of N examples, each with M dimensions to predict. - * - * ``` - * L_i = sum_j(abs((pred_i)_j - (y_i)_j)) for all j. - * L = (1/N) sum(L_i) for i=1 to N - * ``` - * - * In these equations, `L` is the total loss, `L_i` is the loss for - * example `i`, `y_i` is the scalar target, `pred_i` is the scalar - * prediction, and `N` is the number of examples. - * - * This can be interpreted as the negative log-likelihood assuming - * a Laplace distribution. - * - * Inputs: - * - pred: Predictions, of shape (N, M). - * - y: Targets, of shape (N, M). - * - * Outputs: - * - loss: Average loss. - */ - N = nrow(y) - losses = rowSums(abs(pred-y)) - loss = sum(losses) / N -} - -backward = function(matrix[double] pred, matrix[double] y) - return (matrix[double] dpred) { - /* - * Computes the backward pass for an L1 loss function. The inputs - * consist of N examples, each with M dimensions to predict. - * - * Inputs: - * - pred: Predictions, of shape (N, M). - * - y: Targets, of shape (N, M). - * - * Outputs: - * - dpred: Gradient wrt `pred`, of shape (N, M). - */ - N = nrow(y) - dpred = sign(pred-y) / N -} - http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/l1_reg.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/layers/l1_reg.dml b/scripts/staging/SystemML-NN/nn/layers/l1_reg.dml deleted file mode 100644 index 2b81c0b..0000000 --- a/scripts/staging/SystemML-NN/nn/layers/l1_reg.dml +++ /dev/null @@ -1,56 +0,0 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -/* - * L1 regularization. - */ - -forward = function(matrix[double] X, double lambda) - return (double reg_loss) { - /* - * Computes the forward pass for an L1 regularization function. - * - * Inputs: - * - X: Inputs, of shape (any, any). - * - lambda: Regularization strength. - * A typical value is 0.01. - * - * Outputs: - * - reg_loss: Total regularization loss. - */ - reg_loss = lambda * sum(abs(X)) -} - -backward = function(matrix[double] X, double lambda) - return (matrix[double] dX) { - /* - * Computes the backward pass for an L1 regularization function. - * - * Inputs: - * - X: Inputs, of shape (any, any). - * - lambda: Regularization strength. - * - * Outputs: - * - dX: Gradient wrt `X`, of same shape as `X`. - */ - dX = lambda * sign(X) -} - http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/l2_loss.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/layers/l2_loss.dml b/scripts/staging/SystemML-NN/nn/layers/l2_loss.dml deleted file mode 100644 index 0482f25..0000000 --- a/scripts/staging/SystemML-NN/nn/layers/l2_loss.dml +++ /dev/null @@ -1,72 +0,0 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -/* - * L2 loss function. - */ - -forward = function(matrix[double] pred, matrix[double] y) - return (double loss) { - /* - * Computes the forward pass for an L2 loss function. The inputs - * consist of N examples, each with M dimensions to predict. - * - * ``` - * L_i = (1/2) norm(pred_i - y_i)^2 - * L = (1/N) sum(L_i) for i=1 to N - * ``` - * - * In these equations, `L` is the total loss, `L_i` is the loss for - * example `i`, `y_i` is the scalar target, `pred_i` is the scalar - * prediction, and `N` is the number of examples. - * - * This can be interpreted as the negative log-likelihood assuming - * a Gaussian distribution. - * - * Inputs: - * - pred: Predictions, of shape (N, M). - * - y: Targets, of shape (N, M). 
- * - * Outputs: - * - loss: Average loss. - */ - N = nrow(y) - losses = 0.5 * rowSums((pred-y)^2) - loss = sum(losses) / N -} - -backward = function(matrix[double] pred, matrix[double] y) - return (matrix[double] dpred) { - /* - * Computes the backward pass for an L2 loss function. The inputs - * consist of N examples, each with M dimensions to predict. - * - * Inputs: - * - pred: Predictions, of shape (N, M). - * - y: Targets, of shape (N, M). - * - * Outputs: - * - dpred: Gradient wrt `pred`, of shape (N, M). - */ - N = nrow(y) - dpred = (pred-y) / N -} - http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/l2_reg.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/layers/l2_reg.dml b/scripts/staging/SystemML-NN/nn/layers/l2_reg.dml deleted file mode 100644 index 7255efe..0000000 --- a/scripts/staging/SystemML-NN/nn/layers/l2_reg.dml +++ /dev/null @@ -1,56 +0,0 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -/* - * L2 regularization. - */ - -forward = function(matrix[double] X, double lambda) - return (double reg_loss) { - /* - * Computes the forward pass for an L2 regularization function. - * - * Inputs: - * - X: Inputs, of shape (any, any). - * - lambda: Regularization strength. - * A typical value is 0.01. - * - * Outputs: - * - reg_loss: Total regularization loss. - */ - reg_loss = 0.5 * lambda * sum(X^2) -} - -backward = function(matrix[double] X, double lambda) - return (matrix[double] dX) { - /* - * Computes the backward pass for an L2 regularization function. - * - * Inputs: - * - X: Inputs, of shape (any, any). - * - lambda: Regularization strength. - * - * Outputs: - * - dX: Gradient wrt `X`, of same shape as `X`. - */ - dX = lambda * X -} - http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/log_loss.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/layers/log_loss.dml b/scripts/staging/SystemML-NN/nn/layers/log_loss.dml deleted file mode 100644 index 15914f7..0000000 --- a/scripts/staging/SystemML-NN/nn/layers/log_loss.dml +++ /dev/null @@ -1,76 +0,0 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -/* - * Log loss function. - */ - -forward = function(matrix[double] pred, matrix[double] y) - return (double loss) { - /* - * Computes the forward pass for a log loss function. - * - * ``` - * L_i = -y_i*log(pred_i) - (1-y_i)*log(1-pred_i) - * L = (1/N) sum(L_i) for i=1 to N - * ``` - * - * In these equations, `L` is the total loss, `L_i` is the loss for - * example `i`, `y_i` is the binary target, `pred_i` is probability - * of the true class (i.e. `y=1`), and `N` is the number of examples. - * - * This can be interpreted as the negative log-likelihood assuming - * a Bernoulli distribution. - * - * Inputs: - * - pred: Predictions, of shape (N, 1). - * Predictions should be probabilities of the true - * class (i.e. probability of `y=1`). - * - y: Targets, of shape (N, 1). - * Targets should be binary in the set {0, 1}. - * - * Outputs: - * - loss: Average loss. - */ - N = nrow(y) - losses = -y*log(pred) - (1-y)*log(1-pred) - loss = sum(losses) / N -} - -backward = function(matrix[double] pred, matrix[double] y) - return (matrix[double] dpred) { - /* - * Computes the backward pass for a log loss function. - * - * Inputs: - * - pred: Predictions, of shape (N, 1). - * Predictions should be probabilities of the true - * class (i.e. probability of `y=1`). - * - y: Targets, of shape (N, 1). - * Targets should be binary in the set {0, 1}. - * - * Outputs: - * - dpred: Gradient wrt `pred`, of shape (N, 1). - */ - N = nrow(y) - dpred = (1/N) * (pred-y) / (pred*(1-pred)) -} - http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/lstm.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/layers/lstm.dml b/scripts/staging/SystemML-NN/nn/layers/lstm.dml deleted file mode 100644 index a75add4..0000000 --- a/scripts/staging/SystemML-NN/nn/layers/lstm.dml +++ /dev/null @@ -1,260 +0,0 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -/* - * LSTM layer. 
- */ -source("nn/layers/sigmoid.dml") as sigmoid -source("nn/layers/tanh.dml") as tanh - -forward = function(matrix[double] X, matrix[double] W, matrix[double] b, int T, int D, - boolean return_sequences, matrix[double] out0, matrix[double] c0) - return (matrix[double] out, matrix[double] c, - matrix[double] cache_out, matrix[double] cache_c, matrix[double] cache_ifog) { - /* - * Computes the forward pass for an LSTM layer with M neurons. - * The input data has N sequences of T examples, each with D features. - * - * In an LSTM, an internal cell state is maintained, additive - * interactions operate over the cell state at each timestep, and - * some amount of this cell state is exposed as output at each - * timestep. Additionally, the output of the previous timestep is fed - * back in as an additional input at the current timestep. - * - * Reference: - * - Long Short-Term Memory, Hochreiter, 1997 - * - http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf - * - * Inputs: - * - X: Inputs, of shape (N, T*D). - * - W: Weights, of shape (D+M, 4M). - * - b: Biases, of shape (1, 4M). - * - T: Length of example sequences (number of timesteps). - * - D: Dimensionality of the input features (number of features). - * - return_sequences: Whether to return `out` at all timesteps, - * or just for the final timestep. - * - out0: Outputs from previous timestep, of shape (N, M). - * Note: This is *optional* and could just be an empty matrix. - * - c0: Initial cell state, of shape (N, M). - * Note: This is *optional* and could just be an empty matrix. - * - * Outputs: - * - out: If `return_sequences` is True, outputs for all timesteps, - * of shape (N, T*M). Else, outputs for the final timestep, of - * shape (N, M). - * - c: Cell state for final timestep, of shape (N, M). - * - cache_out: Cache of outputs, of shape (T, N*M). - * Note: This is used for performance during training. - * - cache_c: Cache of cell state, of shape (T, N*M). - * Note: This is used for performance during training. - * - cache_ifog: Cache of intermediate values, of shape (T, N*4M). - * Note: This is used for performance during training. 
- */ - N = nrow(X) - M = as.integer(ncol(W)/4) - out_prev = out0 - c_prev = c0 - c = c_prev - if (return_sequences) { - out = matrix(0, rows=N, cols=T*M) - } - else { - out = matrix(0, rows=N, cols=M) - } - # caches to be used during the backward pass for performance - cache_out = matrix(0, rows=T, cols=N*M) - cache_c = matrix(0, rows=T, cols=N*M) - cache_ifog = matrix(0, rows=T, cols=N*4*M) - - for (t in 1:T) { # each timestep - X_t = X[,(t-1)*D+1:t*D] # shape (N, D) - input = cbind(X_t, out_prev) # shape (N, D+M) - ifog = input %*% W + b # input, forget, output, and g gates; shape (N, 4M) - tmp = sigmoid::forward(ifog[,1:3*M]) # i,f,o gates squashed with sigmoid - ifog[,1:3*M] = tmp - tmp = tanh::forward(ifog[,3*M+1:4*M]) # g gate squashed with tanh - ifog[,3*M+1:4*M] = tmp - # c_t = f*prev_c + i*g - c = ifog[,M+1:2*M]*c_prev + ifog[,1:M]*ifog[,3*M+1:4*M] # shape (N, M) - # out_t = o*tanh(c) - tmp = tanh::forward(c) - out_t = ifog[,2*M+1:3*M] * tmp # shape (N, M) - - # store - if (return_sequences) { - out[,(t-1)*M+1:t*M] = out_t - } - else { - out = out_t - } - out_prev = out_t - c_prev = c - cache_out[t,] = matrix(out_t, rows=1, cols=N*M) # reshape - cache_c[t,] = matrix(c, rows=1, cols=N*M) # reshape - cache_ifog[t,] = matrix(ifog, rows=1, cols=N*4*M) # reshape - } -} - -backward = function(matrix[double] dout, matrix[double] dc, - matrix[double] X, matrix[double] W, matrix[double] b, int T, int D, - boolean given_sequences, matrix[double] out0, matrix[double] c0, - matrix[double] cache_out, matrix[double] cache_c, matrix[double] cache_ifog) - return (matrix[double] dX, matrix[double] dW, matrix[double] db, - matrix[double] dout0, matrix[double] dc0) { - /* - * Computes the backward pass for an LSTM layer with M neurons. - * - * Inputs: - * - dout: Gradient wrt `out`. If `given_sequences` is `True`, - * contains gradients on outputs for all timesteps, of - * shape (N, T*M). Else, contains the gradient on the output - * for the final timestep, of shape (N, M). - * - dc: Gradient wrt `c` (from later in time), of shape (N, M). - * This would come from later in time if the cell state was used - * downstream as the initial cell state for another LSTM layer. - * Typically, this would be used when a sequence was cut at - * timestep `T` and then continued in the next batch. If `c` - * was not used downstream, then `dc` would be an empty matrix. - * - X: Inputs, of shape (N, T*D). - * - W: Weights, of shape (D+M, 4M). - * - b: Biases, of shape (1, 4M). - * - T: Length of example sequences (number of timesteps). - * - D: Dimensionality of the input features. - * - given_sequences: Whether `dout` is for all timesteps, - * or just for the final timestep. This is based on whether - * `return_sequences` was true in the forward pass. - * - out0: Outputs from previous timestep, of shape (N, M). - * Note: This is *optional* and could just be an empty matrix. - * - c0: Initial cell state, of shape (N, M). - * Note: This is *optional* and could just be an empty matrix. - * - cache_out: Cache of outputs, of shape (T, N*M). - * Note: This is used for performance during training. - * - cache_c: Cache of cell state, of shape (T, N*M). - * Note: This is used for performance during training. - * - cache_ifog: Cache of intermediate values, of shape (T, N*4*M). - * Note: This is used for performance during training. - * - * Outputs: - * - dX: Gradient wrt `X`, of shape (N, T*D). - * - dW: Gradient wrt `W`, of shape (D+M, 4M). - * - db: Gradient wrt `b`, of shape (1, 4M). 
- * - dout0: Gradient wrt `out0`, of shape (N, M). - * - dc0: Gradient wrt `c0`, of shape (N, M). - */ - N = nrow(X) - M = as.integer(ncol(W)/4) - dX = matrix(0, rows=N, cols=T*D) - dW = matrix(0, rows=D+M, cols=4*M) - db = matrix(0, rows=1, cols=4*M) - dout0 = matrix(0, rows=N, cols=M) - dc0 = matrix(0, rows=N, cols=M) - dct = dc - if (!given_sequences) { - # only given dout for output at final timestep, so prepend empty douts for all other timesteps - dout = cbind(matrix(0, rows=N, cols=(T-1)*D), dout) # shape (N, T*M) - } - - t = T - for (iter in 1:T) { # each timestep in reverse order - X_t = X[,(t-1)*D+1:t*D] # shape (N, D) - dout_t = dout[,(t-1)*M+1:t*M] # shape (N, M) - out_t = matrix(cache_out[t,], rows=N, cols=M) # shape (N, M) - ct = matrix(cache_c[t,], rows=N, cols=M) # shape (N, M) - if (t == 1) { - out_prev = out0 # shape (N, M) - c_prev = c0 # shape (N, M) - } - else { - out_prev = matrix(cache_out[t-1,], rows=N, cols=M) # shape (N, M) - c_prev = matrix(cache_c[t-1,], rows=N, cols=M) # shape (N, M) - } - input = cbind(X_t, out_prev) # shape (N, D+M) - ifog = matrix(cache_ifog[t,], rows=N, cols=4*M) - i = ifog[,1:M] # input gate, shape (N, M) - f = ifog[,M+1:2*M] # forget gate, shape (N, M) - o = ifog[,2*M+1:3*M] # output gate, shape (N, M) - g = ifog[,3*M+1:4*M] # g gate, shape (N, M) - - tmp = tanh::backward(dout_t, ct) - dct = dct + o*tmp # shape (N, M) - tmp = tanh::forward(ct) - do = tmp * dout_t # output gate, shape (N, M) - df = c_prev * dct # forget gate, shape (N, M) - dc_prev = f * dct # shape (N, M) - di = g * dct # input gate, shape (N, M) - dg = i * dct # g gate, shape (N, M) - - di_raw = i * (1-i) * di - df_raw = f * (1-f) * df - do_raw = o * (1-o) * do - dg_raw = (1-g^2) * dg - difog_raw = cbind(di_raw, cbind(df_raw, cbind(do_raw, dg_raw))) # shape (N, 4M) - - dW = dW + t(input) %*% difog_raw # shape (D+M, 4M) - db = db + colSums(difog_raw) # shape (1, 4M) - dinput = difog_raw %*% t(W) # shape (N, D+M) - dX[,(t-1)*D+1:t*D] = dinput[,1:D] - dout_prev = dinput[,D+1:D+M] # shape (N, M) - if (t == 1) { - dout0 = dout_prev # shape (N, M) - dc0 = dc_prev # shape (N, M) - } - else { - dout[,(t-2)*M+1:(t-1)*M] = dout[,(t-2)*M+1:(t-1)*M] + dout_prev # shape (N, M) - dct = dc_prev # shape (N, M) - } - t = t - 1 - } -} - -init = function(int N, int D, int M) - return (matrix[double] W, matrix[double] b, matrix[double] out0, matrix[double] c0) { - /* - * Initialize the parameters of this layer. - * - * Note: This is just a convenience function, and parameters - * may be initialized manually if needed. - * - * We use the Glorot uniform heuristic which limits the magnification - * of inputs/gradients during forward/backward passes by scaling - * uniform weights by a factor of sqrt(6/(fan_in + fan_out)). - * - http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf - * - * Inputs: - * - N: Number of examples in batch. - * - D: Dimensionality of the input features (number of features). - * - M: Number of neurons in this layer. - * - * Outputs: - * - W: Weights, of shape (D+M, 4M). - * - b: Biases, of shape (1, 4M). - * - out0: Empty previous timestep output matrix, of shape (N, M). - * - c0: Empty initial cell state matrix, of shape (N, M). 
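 *
 * A small worked example of the Glorot scale (with placeholder sizes
 * D=64, M=128): fan_in = D+M = 192 and fan_out = 4*M = 512, so
 * scale = sqrt(6/(192+512)) ~= 0.0923, and W of shape (192, 512) is
 * drawn uniformly from [-0.0923, 0.0923].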
- */ - fan_in = D+M - fan_out = 4*M - scale = sqrt(6/(fan_in+fan_out)) - W = rand(rows=D+M, cols=4*M, min=-scale, max=scale, pdf="uniform") - b = matrix(0, rows=1, cols=4*M) - out0 = matrix(0, rows=N, cols=M) - c0 = matrix(0, rows=N, cols=M) -} - http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/max_pool2d.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/layers/max_pool2d.dml b/scripts/staging/SystemML-NN/nn/layers/max_pool2d.dml deleted file mode 100644 index fba1a4c..0000000 --- a/scripts/staging/SystemML-NN/nn/layers/max_pool2d.dml +++ /dev/null @@ -1,159 +0,0 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -/* - * Max Pooling layer. - */ -source("nn/util.dml") as util - -forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf, - int strideh, int stridew, int padh, int padw) - return (matrix[double] out, int Hout, int Wout) { - /* - * Computes the forward pass for a 2D spatial max pooling layer. - * The input data has N examples, each represented as a 3D volume - * unrolled into a single vector. - * - * This implementation uses `im2col` internally for each image to - * extract local image regions (patches) of each channel slice into - * columns, and then performs max pooling over the patches to compute - * the output maps. - * - * Inputs: - * - X: Inputs, of shape (N, C*Hin*Win). - * - C: Number of input channels (dimensionality of input depth). - * - Hin: Input height. - * - Win: Input width. - * - Hf: Filter height. - * - Wf: Filter width. - * - strideh: Stride over height. - * - stridew: Stride over width. - * - padh: Padding for top and bottom sides. - * A typical value is 0. - * - padw: Padding for left and right sides. - * A typical value is 0. - * - * Outputs: - * - out: Outputs, of shape (N, C*Hout*Wout). - * - Hout: Output height. - * - Wout: Output width. 
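 *
 * A small worked example of the output size (with placeholder values
 * Hin = Win = 28, Hf = Wf = 2, strideh = stridew = 2, padh = padw = 0):
 * Hout = floor((28 + 2*0 - 2)/2 + 1) = 14 and likewise Wout = 14, so an
 * input of shape (N, C*28*28) yields an output of shape (N, C*14*14).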
- */ - N = nrow(X) - Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1)) - Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1)) - pad_value = -1/0 # in max pooling we pad with -infinity - - # Create output volume - out = matrix(0, rows=N, cols=C*Hout*Wout) - - # Max pooling - im2col implementation - parfor (n in 1:N) { # all examples - img = matrix(X[n,], rows=C, cols=Hin*Win) # reshape - - if (padh > 0 | padw > 0) { - # Pad image to shape (C, (Hin+2*padh)*(Win+2*padw)) - img = util::pad_image(img, Hin, Win, padh, padw, pad_value) - } - - img_maxes = matrix(0, rows=C, cols=Hout*Wout) # zeros - parfor (c in 1:C) { # all channels - # Extract local image slice patches into columns with im2col, of shape (Hf*Wf, Hout*Wout) - img_slice_cols = util::im2col(img[c,], Hin+2*padh, Win+2*padw, Hf, Wf, strideh, stridew) - - # Max pooling on patches - img_maxes[c,] = colMaxs(img_slice_cols) - } - - out[n,] = matrix(img_maxes, rows=1, cols=C*Hout*Wout) - } -} - -backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X, - int C, int Hin, int Win, int Hf, int Wf, - int strideh, int stridew, int padh, int padw) - return (matrix[double] dX) { - /* - * Computes the backward pass for a 2D spatial max pooling layer. - * The input data has N examples, each represented as a 3D volume - * unrolled into a single vector. - * - * Inputs: - * - dout: Gradient wrt `out` from upstream, of - * shape (N, C*Hout*Wout). - * - Hout: Output height. - * - Wout: Output width. - * - X: Input data matrix, of shape (N, C*Hin*Win). - * - C: Number of input channels (dimensionality of input depth). - * - Hin: Input height. - * - Win: Input width. - * - Hf: Filter height. - * - Wf: Filter width. - * - strideh: Stride over height. - * - stridew: Stride over width. - * - padh: Padding for top and bottom sides. - * A typical value is 0. - * - padw: Padding for left and right sides. - * A typical value is 0. - * - * Outputs: - * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win). 
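 *
 * A small worked example of the gradient routing (for a single 2x2 patch
 * [[1, 3], [2, 3]] with upstream gradient g at the corresponding output
 * cell): the maximum value 3 occurs twice, so both of those positions
 * receive the full gradient g, while the positions holding 1 and 2
 * receive 0.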
- */ - N = nrow(X) - pad_value = -1/0 # in max pooling we pad with -infinity - - # Create gradient volume - dX = matrix(0, rows=N, cols=C*Hin*Win) - - # Gradient of max pooling - parfor (n in 1:N, check=0) { # all examples - img = matrix(X[n,], rows=C, cols=Hin*Win) - if (padh > 0 | padw > 0) { - # Pad image to shape (C, (Hin+2*padh)*(Win+2*padw)) - img = util::pad_image(img, Hin, Win, padh, padw, pad_value) - } - - dimg = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw)) - parfor (c in 1:C, check=0) { # all channels - img_slice = matrix(img[c,], rows=Hin+2*padh, cols=Win+2*padw) - dimg_slice = matrix(0, rows=Hin+2*padh, cols=Win+2*padw) - for (hout in 1:Hout, check=0) { # all output rows - hin = (hout-1)*strideh + 1 - for (wout in 1:Wout) { # all output columns - win = (wout-1)*stridew + 1 - img_slice_patch = img_slice[hin:hin+Hf-1, win:win+Wf-1] - max_val_ind = img_slice_patch == max(img_slice_patch) # max value indicator matrix - # gradient passes through only for the max value(s) in this patch - dimg_slice_patch = max_val_ind * dout[n, (c-1)*Hout*Wout + (hout-1)*Wout + wout] - dimg_slice[hin:hin+Hf-1, win:win+Wf-1] = dimg_slice[hin:hin+Hf-1, win:win+Wf-1] - + dimg_slice_patch - } - } - dimg[c,] = matrix(dimg_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw)) - } - - if (padh > 0 | padw > 0) { - # Unpad image gradient - dimg = util::unpad_image(dimg, Hin, Win, padh, padw) # shape (C, (Hin+2*padh)*(Win+2*padw)) - } - dX[n,] = matrix(dimg, rows=1, cols=C*Hin*Win) - } -} - http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/max_pool2d_builtin.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/layers/max_pool2d_builtin.dml b/scripts/staging/SystemML-NN/nn/layers/max_pool2d_builtin.dml deleted file mode 100644 index 880f818..0000000 --- a/scripts/staging/SystemML-NN/nn/layers/max_pool2d_builtin.dml +++ /dev/null @@ -1,103 +0,0 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -/* - * 2D Max Pooling layer. - * - * This implementation uses a built-in operator for higher performance. - */ - -forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf, - int strideh, int stridew, int padh, int padw) - return (matrix[double] out, int Hout, int Wout) { - /* - * Computes the forward pass for a 2D spatial max pooling layer. - * The input data has N examples, each represented as a 3D volume - * unrolled into a single vector. - * - * This implementation uses a built-in operator for higher - * performance. - * - * Inputs: - * - X: Inputs, of shape (N, C*Hin*Win). 
- * - C: Number of input channels (dimensionality of input depth). - * - Hin: Input height. - * - Win: Input width. - * - Hf: Filter height. - * - Wf: Filter width. - * - strideh: Stride over height. - * - stridew: Stride over width. - * - padh: Padding for top and bottom sides. - * A typical value is 0. - * - padw: Padding for left and right sides. - * A typical value is 0. - * - * Outputs: - * - out: Outputs, of shape (N, C*Hout*Wout). - * - Hout: Output height. - * - Wout: Output width. - */ - N = nrow(X) - Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1)) - Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1)) - - # Max pooling - built-in implementation - out = max_pool(X, input_shape=[N,C,Hin,Win], pool_size=[Hf,Wf], - stride=[strideh,stridew], padding=[padh,padw]) -} - -backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X, - int C, int Hin, int Win, int Hf, int Wf, - int strideh, int stridew, int padh, int padw) - return (matrix[double] dX) { - /* - * Computes the backward pass for a 2D spatial max pooling layer. - * The input data has N examples, each represented as a 3D volume - * unrolled into a single vector. - * - * Inputs: - * - dout: Gradient wrt `out` from upstream, of - * shape (N, C*Hout*Wout). - * - Hout: Output height. - * - Wout: Output width. - * - X: Inputs, of shape (N, C*Hin*Win). - * - C: Number of input channels (dimensionality of input depth). - * - Hin: Input height. - * - Win: Input width. - * - Hf: Filter height. - * - Wf: Filter width. - * - strideh: Stride over height. - * - stridew: Stride over width. - * - padh: Padding for top and bottom sides. - * A typical value is 0. - * - padw: Padding for left and right sides. - * A typical value is 0. - * - * Outputs: - * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win). - */ - N = nrow(X) - - # Gradient of max pooling - dX = max_pool_backward(X, dout, input_shape=[N,C,Hin,Win], pool_size=[Hf,Wf], - stride=[strideh,stridew], padding=[padh,padw]) -} - http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/relu.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/layers/relu.dml b/scripts/staging/SystemML-NN/nn/layers/relu.dml deleted file mode 100644 index 93a6e90..0000000 --- a/scripts/staging/SystemML-NN/nn/layers/relu.dml +++ /dev/null @@ -1,59 +0,0 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -/* - * Rectified Linear Unit (ReLU) nonlinearity layer. - */ - -forward = function(matrix[double] X) - return (matrix[double] out) { - /* - * Computes the forward pass for a ReLU nonlinearity layer. 
- * - * Performs an element-wise evaluation of `f(input) = max(0, input)`. - * - * Inputs: - * - X: Inputs, of shape (any, any). - * - * Outputs: - * - out: Outputs, of same shape as `X`. - */ - out = max(X, 0) -} - -backward = function(matrix[double] dout, matrix[double] X) - return (matrix[double] dX) { - /* - * Computes the backward pass for a ReLU nonlinearity layer. - * - * Essentially performs a pass-through of the upstream gradient - * for cells > 0. - * - * Inputs: - * - dout: Gradient wrt `out` from upstream, of same shape as `X`. - * - X: Previous input data matrix, of shape (any, any). - * - * Outputs: - * - dX: Gradient wrt `X`, of same shape as `X`. - */ - dX = (X > 0) * dout -} -
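A minimal end-to-end sketch in DML that exercises a few of the layer functions removed above (the `nn/...` source paths mirror those shown in this diff, and all sizes and the 0.01 regularization strength are placeholder values):

```
source("nn/layers/relu.dml") as relu
source("nn/layers/l2_loss.dml") as l2_loss
source("nn/layers/l2_reg.dml") as l2_reg

# Placeholder data: 4 examples, 3 values each
X = rand(rows=4, cols=3, min=-1, max=1)
y = rand(rows=4, cols=3)

# Forward: ReLU nonlinearity followed by the L2 loss
out = relu::forward(X)
loss = l2_loss::forward(out, y)

# Backward: chain the loss gradient back through the ReLU
dout = l2_loss::backward(out, y)
dX = relu::backward(dout, X)
print("mean abs gradient wrt X: " + mean(abs(dX)))

# L2 regularization of a placeholder parameter matrix W
W = rand(rows=3, cols=2, min=-1, max=1)
reg_loss = l2_reg::forward(W, 0.01)
dW_reg = l2_reg::backward(W, 0.01)
print("total loss: " + (loss + reg_loss))
```

Here the ReLU backward call simply masks the upstream gradient with `X > 0`, matching the `dX = (X > 0) * dout` definition above, and the regularization gradient `dW_reg` would be added to the data-loss gradient of `W` in a full training script.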