Repository: systemml
Updated Branches:
  refs/heads/master 62b64b32d -> 532da1bc5
http://git-wip-us.apache.org/repos/asf/systemml/blob/532da1bc/projects/breast_cancer/convnet.dml
----------------------------------------------------------------------
diff --git a/projects/breast_cancer/convnet.dml b/projects/breast_cancer/convnet.dml
deleted file mode 100644
index 6cbea39..0000000
--- a/projects/breast_cancer/convnet.dml
+++ /dev/null
@@ -1,495 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Breast Cancer LeNet-like ConvNet Model
- */
-# Imports
-source("nn/layers/affine.dml") as affine
-source("nn/layers/conv2d_builtin.dml") as conv2d
-source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
-source("nn/layers/dropout.dml") as dropout
-source("nn/layers/l2_reg.dml") as l2_reg
-source("nn/layers/max_pool2d_builtin.dml") as max_pool2d
-source("nn/layers/relu.dml") as relu
-source("nn/layers/softmax.dml") as softmax
-#source("nn/optim/adam.dml") as adam
-source("nn/optim/sgd_nesterov.dml") as sgd_nesterov
-
-train = function(matrix[double] X, matrix[double] Y,
-                 matrix[double] X_val, matrix[double] Y_val,
-                 int C, int Hin, int Win,
-                 double lr, double mu, double decay, double lambda,
-                 int batch_size, int epochs, int log_interval,
-                 string checkpoint_dir)
-    return (matrix[double] Wc1, matrix[double] bc1,
-            matrix[double] Wc2, matrix[double] bc2,
-            matrix[double] Wc3, matrix[double] bc3,
-            matrix[double] Wa1, matrix[double] ba1,
-            matrix[double] Wa2, matrix[double] ba2) {
-  /*
-   * Trains a convolutional net using a "LeNet"-like architecture.
-   *
-   * The input matrix, X, has N examples, each represented as a 3D
-   * volume unrolled into a single vector. The targets, Y, have K
-   * classes, and are one-hot encoded.
-   *
-   * Inputs:
-   *  - X: Input data matrix, of shape (N, C*Hin*Win).
-   *  - Y: Target matrix, of shape (N, K).
-   *  - X_val: Input validation data matrix, of shape (N, C*Hin*Win).
-   *  - Y_val: Target validation matrix, of shape (N, K).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - lr: Learning rate.
-   *  - mu: Momentum value.
-   *      Typical values are in the range of [0.5, 0.99], usually
-   *      started at the lower end and annealed towards the higher end.
-   *  - decay: Learning rate decay rate.
-   *  - lambda: Regularization strength.
-   *  - batch_size: Size of mini-batches to train on.
-   *  - epochs: Total number of full training loops over the full data set.
-   *  - log_interval: Interval, in iterations, between log outputs.
-   *  - checkpoint_dir: Directory to store model checkpoints.
-   *
-   * Outputs:
-   *  - Wc1: 1st layer weights (parameters) matrix, of shape (F1, C*Hf*Wf).
-   *  - bc1: 1st layer biases vector, of shape (F1, 1).
-   *  - Wc2: 2nd layer weights (parameters) matrix, of shape (F2, F1*Hf*Wf).
-   *  - bc2: 2nd layer biases vector, of shape (F2, 1).
-   *  - Wc3: 3rd layer weights (parameters) matrix, of shape (F3, F2*Hf*Wf).
-   *  - bc3: 3rd layer biases vector, of shape (F3, 1).
-   *  - Wa1: 4th layer weights (parameters) matrix, of shape (F3*(Hin/2^3)*(Win/2^3), N1).
-   *  - ba1: 4th layer biases vector, of shape (1, N1).
-   *  - Wa2: 5th layer weights (parameters) matrix, of shape (N1, K).
-   *  - ba2: 5th layer biases vector, of shape (1, K).
-   */
-  N = nrow(X)
-  K = ncol(Y)
-
-  # Create network:
-  # conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> conv3 -> relu3 -> pool3
-  #  -> affine1 -> relu1 -> dropout1 -> affine2 -> softmax
-  Hf = 3  # filter height
-  Wf = 3  # filter width
-  stride = 1
-  pad = 1  # For same dimensions, (Hf - stride) / 2
-  F1 = 32  # num conv filters in conv1
-  F2 = 32  # num conv filters in conv2
-  F3 = 32  # num conv filters in conv3
-  N1 = 512  # num nodes in affine1
-  # Note: affine2 has K nodes, which is equal to the number of target dimensions (num classes)
-  [Wc1, bc1] = conv2d::init(F1, C, Hf, Wf)  # inputs: (N, C*Hin*Win)
-  [Wc2, bc2] = conv2d::init(F2, F1, Hf, Wf)  # inputs: (N, F1*(Hin/2)*(Win/2))
-  [Wc3, bc3] = conv2d::init(F3, F2, Hf, Wf)  # inputs: (N, F2*(Hin/2^2)*(Win/2^2))
-  [Wa1, ba1] = affine::init(F3*(Hin/2^3)*(Win/2^3), N1)  # inputs: (N, F3*(Hin/2^3)*(Win/2^3))
-  [Wa2, ba2] = affine::init(N1, K)  # inputs: (N, N1)
-  Wa2 = Wa2 / sqrt(2)  # different initialization, since being fed into softmax, instead of relu
-
-  # TODO: Compare optimizers once training is faster.
-  # Initialize SGD w/ Nesterov momentum optimizer
-  vWc1 = sgd_nesterov::init(Wc1);  vbc1 = sgd_nesterov::init(bc1)
-  vWc2 = sgd_nesterov::init(Wc2);  vbc2 = sgd_nesterov::init(bc2)
-  vWc3 = sgd_nesterov::init(Wc3);  vbc3 = sgd_nesterov::init(bc3)
-  vWa1 = sgd_nesterov::init(Wa1);  vba1 = sgd_nesterov::init(ba1)
-  vWa2 = sgd_nesterov::init(Wa2);  vba2 = sgd_nesterov::init(ba2)
-  #[mWc1, vWc1] = adam::init(Wc1)  # optimizer 1st & 2nd moment state for Wc1
-  #[mbc1, vbc1] = adam::init(bc1)  # optimizer 1st & 2nd moment state for bc1
-  #[mWc2, vWc2] = adam::init(Wc2)  # optimizer 1st & 2nd moment state for Wc2
-  #[mbc2, vbc2] = adam::init(bc2)  # optimizer 1st & 2nd moment state for bc2
-  #[mWc3, vWc3] = adam::init(Wc3)  # optimizer 1st & 2nd moment state for Wc3
-  #[mbc3, vbc3] = adam::init(bc3)  # optimizer 1st & 2nd moment state for bc3
-  #[mWa1, vWa1] = adam::init(Wa1)  # optimizer 1st & 2nd moment state for Wa1
-  #[mba1, vba1] = adam::init(ba1)  # optimizer 1st & 2nd moment state for ba1
-  #[mWa2, vWa2] = adam::init(Wa2)  # optimizer 1st & 2nd moment state for Wa2
-  #[mba2, vba2] = adam::init(ba2)  # optimizer 1st & 2nd moment state for ba2
-  #beta1 = 0.9
-  #beta2 = 0.999
-  #eps = 1e-8
-
-  # TODO: Enable starting val metrics once fast, distributed predictions are available.
-  # Starting validation loss & accuracy
-  #probs_val = predict(X_val, C, Hin, Win, Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2)
-  #loss_val = cross_entropy_loss::forward(probs_val, Y_val)
-  #accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(Y_val))
-  ## Output results
-  #print("Start: Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val)
-
-  # Optimize
-  print("Starting optimization")
-  iters = ceil(N / batch_size)
-  for (e in 1:epochs) {
-    for(i in 1:iters) {
-      # Get next batch
-      beg = ((i-1) * batch_size) %% N + 1
-      end = min(N, beg + batch_size - 1)
-      X_batch = X[beg:end,]
-      y_batch = Y[beg:end,]
-
-      # Compute forward pass
-      ## conv layer 1: conv1 -> relu1 -> pool1
-      [outc1, Houtc1, Woutc1] = conv2d::forward(X_batch, Wc1, bc1, C, Hin, Win, Hf, Wf,
-                                                stride, stride, pad, pad)
-      outc1r = relu::forward(outc1)
-      [outc1p, Houtc1p, Woutc1p] = max_pool2d::forward(outc1r, F1, Houtc1, Woutc1, Hf=2, Wf=2,
-                                                       strideh=2, stridew=2, padh=0, padw=0)
-      ## conv layer 2: conv2 -> relu2 -> pool2
-      [outc2, Houtc2, Woutc2] = conv2d::forward(outc1p, Wc2, bc2, F1, Houtc1p, Woutc1p, Hf, Wf,
-                                                stride, stride, pad, pad)
-      outc2r = relu::forward(outc2)
-      [outc2p, Houtc2p, Woutc2p] = max_pool2d::forward(outc2r, F2, Houtc2, Woutc2, Hf=2, Wf=2,
-                                                       strideh=2, stridew=2, padh=0, padw=0)
-      ## conv layer 3: conv3 -> relu3 -> pool3
-      [outc3, Houtc3, Woutc3] = conv2d::forward(outc2p, Wc3, bc3, F2, Houtc2p, Woutc2p, Hf, Wf,
-                                                stride, stride, pad, pad)
-      outc3r = relu::forward(outc3)
-      [outc3p, Houtc3p, Woutc3p] = max_pool2d::forward(outc3r, F3, Houtc3, Woutc3, Hf=2, Wf=2,
-                                                       strideh=2, stridew=2, padh=0, padw=0)
-      ## affine layer 1: affine1 -> relu1 -> dropout1
-      outa1 = affine::forward(outc3p, Wa1, ba1)
-      outa1r = relu::forward(outa1)
-      [outa1d, maskad1] = dropout::forward(outa1r, 0.5, -1)
-      ## affine layer 2: affine2 -> softmax
-      outa2 = affine::forward(outa1d, Wa2, ba2)
-      probs = softmax::forward(outa2)
-
-      # Compute data backward pass
-      ## loss:
-      dprobs = cross_entropy_loss::backward(probs, y_batch)
-      ## affine layer 2: affine2 -> softmax
-      douta2 = softmax::backward(dprobs, outa2)
-      [douta1d, dWa2, dba2] = affine::backward(douta2, outa1d, Wa2, ba2)
-      ## affine layer 1: affine1 -> relu1 -> dropout1
-      douta1r = dropout::backward(douta1d, outa1r, 0.5, maskad1)
-      douta1 = relu::backward(douta1r, outa1)
-      [doutc3p, dWa1, dba1] = affine::backward(douta1, outc3p, Wa1, ba1)
-      ## conv layer 3: conv3 -> relu3 -> pool3
-      doutc3r = max_pool2d::backward(doutc3p, Houtc3p, Woutc3p, outc3r, F3, Houtc3, Woutc3,
-                                     Hf=2, Wf=2, strideh=2, stridew=2, padh=0, padw=0)
-      doutc3 = relu::backward(doutc3r, outc3)
-      [doutc2p, dWc3, dbc3] = conv2d::backward(doutc3, Houtc3, Woutc3, outc2p, Wc3, bc3, F2,
-                                               Houtc2p, Woutc2p, Hf, Wf, stride, stride, pad, pad)
-      ## conv layer 2: conv2 -> relu2 -> pool2
-      doutc2r = max_pool2d::backward(doutc2p, Houtc2p, Woutc2p, outc2r, F2, Houtc2, Woutc2,
-                                     Hf=2, Wf=2, strideh=2, stridew=2, padh=0, padw=0)
-      doutc2 = relu::backward(doutc2r, outc2)
-      [doutc1p, dWc2, dbc2] = conv2d::backward(doutc2, Houtc2, Woutc2, outc1p, Wc2, bc2, F1,
-                                               Houtc1p, Woutc1p, Hf, Wf, stride, stride, pad, pad)
-      ## conv layer 1: conv1 -> relu1 -> pool1
-      doutc1r = max_pool2d::backward(doutc1p, Houtc1p, Woutc1p, outc1r, F1, Houtc1, Woutc1,
-                                     Hf=2, Wf=2, strideh=2, stridew=2, padh=0, padw=0)
-      doutc1 = relu::backward(doutc1r, outc1)
-      [dX_batch, dWc1, dbc1] = conv2d::backward(doutc1, Houtc1, Woutc1, X_batch, Wc1, bc1, C,
-                                                Hin, Win, Hf, Wf, stride, stride, pad, pad)
-
-      # Compute regularization backward pass
-      dWc1_reg = l2_reg::backward(Wc1, lambda)
-      dWc2_reg = l2_reg::backward(Wc2, lambda)
-      dWc3_reg = l2_reg::backward(Wc3, lambda)
-      dWa1_reg = l2_reg::backward(Wa1, lambda)
-      dWa2_reg = l2_reg::backward(Wa2, lambda)
-      dWc1 = dWc1 + dWc1_reg
-      dWc2 = dWc2 + dWc2_reg
-      dWc3 = dWc3 + dWc3_reg
-      dWa1 = dWa1 + dWa1_reg
-      dWa2 = dWa2 + dWa2_reg
-
-      # Optimize with SGD w/ Nesterov momentum
-      [Wc1, vWc1] = sgd_nesterov::update(Wc1, dWc1, lr, mu, vWc1)
-      [bc1, vbc1] = sgd_nesterov::update(bc1, dbc1, lr, mu, vbc1)
-      [Wc2, vWc2] = sgd_nesterov::update(Wc2, dWc2, lr, mu, vWc2)
-      [bc2, vbc2] = sgd_nesterov::update(bc2, dbc2, lr, mu, vbc2)
-      [Wc3, vWc3] = sgd_nesterov::update(Wc3, dWc3, lr, mu, vWc3)
-      [bc3, vbc3] = sgd_nesterov::update(bc3, dbc3, lr, mu, vbc3)
-      [Wa1, vWa1] = sgd_nesterov::update(Wa1, dWa1, lr, mu, vWa1)
-      [ba1, vba1] = sgd_nesterov::update(ba1, dba1, lr, mu, vba1)
-      [Wa2, vWa2] = sgd_nesterov::update(Wa2, dWa2, lr, mu, vWa2)
-      [ba2, vba2] = sgd_nesterov::update(ba2, dba2, lr, mu, vba2)
-      #t = e*i - 1
-      #[Wc1, mWc1, vWc1] = adam::update(Wc1, dWc1, lr, beta1, beta2, eps, t, mWc1, vWc1)
-      #[bc1, mbc1, vbc1] = adam::update(bc1, dbc1, lr, beta1, beta2, eps, t, mbc1, vbc1)
-      #[Wc2, mWc2, vWc2] = adam::update(Wc2, dWc2, lr, beta1, beta2, eps, t, mWc2, vWc2)
-      #[bc2, mbc2, vbc2] = adam::update(bc2, dbc2, lr, beta1, beta2, eps, t, mbc2, vbc2)
-      #[Wc3, mWc3, vWc3] = adam::update(Wc3, dWc3, lr, beta1, beta2, eps, t, mWc3, vWc3)
-      #[bc3, mbc3, vbc3] = adam::update(bc3, dbc3, lr, beta1, beta2, eps, t, mbc3, vbc3)
-      #[Wa1, mWa1, vWa1] = adam::update(Wa1, dWa1, lr, beta1, beta2, eps, t, mWa1, vWa1)
-      #[ba1, mba1, vba1] = adam::update(ba1, dba1, lr, beta1, beta2, eps, t, mba1, vba1)
-      #[Wa2, mWa2, vWa2] = adam::update(Wa2, dWa2, lr, beta1, beta2, eps, t, mWa2, vWa2)
-      #[ba2, mba2, vba2] = adam::update(ba2, dba2, lr, beta1, beta2, eps, t, mba2, vba2)
-
-      # Compute loss & accuracy for training & validation data every `log_interval` iterations.
-      if (i %% log_interval == 0) {
-        # Compute training loss & accuracy
-        loss_data = cross_entropy_loss::forward(probs, y_batch)
-        loss_reg_Wc1 = l2_reg::forward(Wc1, lambda)
-        loss_reg_Wc2 = l2_reg::forward(Wc2, lambda)
-        loss_reg_Wc3 = l2_reg::forward(Wc3, lambda)
-        loss_reg_Wa1 = l2_reg::forward(Wa1, lambda)
-        loss_reg_Wa2 = l2_reg::forward(Wa2, lambda)
-        loss = loss_data + loss_reg_Wc1 + loss_reg_Wc2 + loss_reg_Wc3 + loss_reg_Wa1 + loss_reg_Wa2
-        accuracy = mean(rowIndexMax(probs) == rowIndexMax(y_batch))
-
-        # TODO: Consider enabling val metrics here once fast, distributed predictions are available.
-        ## Compute validation loss & accuracy
-        #probs_val = predict(X_val, C, Hin, Win, Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2)
-        #loss_val = cross_entropy_loss::forward(probs_val, Y_val)
-        #accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(Y_val))
-
-        ## Output results
-        #print("Epoch: " + e + ", Iter: " + i + ", Train Loss: " + loss + ", Train Accuracy: "
-        #      + accuracy + ", Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val
-        #      + ", lr: " + lr + ", mu: " + mu)
-        # Output results
-        print("Epoch: " + e + "/" + epochs + ", Iter: " + i + "/" + iters
-              + ", Train Loss: " + loss + ", Train Accuracy: " + accuracy)
-      }
-    }
-
-    # Compute loss & accuracy for validation data every epoch
-    probs_val = predict(X_val, C, Hin, Win, Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2)
-    loss_val = cross_entropy_loss::forward(probs_val, Y_val)
-    accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(Y_val))
-
-    # Output results
-    print("Epoch: " + e + "/" + epochs + ", Val Loss: " + loss_val
-          + ", Val Accuracy: " + accuracy_val + ", lr: " + lr + ", mu: " + mu)
-
-    # Checkpoint model
-    dir = checkpoint_dir + e + "/"
-    dummy = checkpoint(dir, Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2)
-    str = "lr: " + lr + ", mu: " + mu + ", decay: " + decay + ", lambda: " + lambda
-          + ", batch_size: " + batch_size
-    name = dir + accuracy_val
-    write(str, name)
-
-    # Anneal momentum towards 0.999
-    mu = mu + (0.999 - mu)/(1+epochs-e)
-    # Decay learning rate
-    lr = lr * decay
-  }
-}
-
-checkpoint = function(string dir,
-                      matrix[double] Wc1, matrix[double] bc1,
-                      matrix[double] Wc2, matrix[double] bc2,
-                      matrix[double] Wc3, matrix[double] bc3,
-                      matrix[double] Wa1, matrix[double] ba1,
-                      matrix[double] Wa2, matrix[double] ba2) {
-  /*
-   * Saves the model parameters to `dir` in binary format.
-   *
-   * Inputs:
-   *  - dir: Directory in which to save model parameters.
-   *  - Wc1: 1st conv layer weights (parameters) matrix, of shape (F1, C*Hf*Wf).
-   *  - bc1: 1st conv layer biases vector, of shape (F1, 1).
-   *  - Wc2: 2nd conv layer weights (parameters) matrix, of shape (F2, F1*Hf*Wf).
-   *  - bc2: 2nd conv layer biases vector, of shape (F2, 1).
-   *  - Wc3: 3rd conv layer weights (parameters) matrix, of shape (F3, F2*Hf*Wf).
-   *  - bc3: 3rd conv layer biases vector, of shape (F3, 1).
-   *  - Wa1: 1st affine layer weights (parameters) matrix, of shape (F3*(Hin/2^3)*(Win/2^3), N1).
-   *  - ba1: 1st affine layer biases vector, of shape (1, N1).
-   *  - Wa2: 2nd affine layer weights (parameters) matrix, of shape (N1, K).
-   *  - ba2: 2nd affine layer biases vector, of shape (1, K).
-   *
-   * Outputs:
-   *  - None; the parameters are written to files under `dir`.
-   */
-  write(Wc1, dir + "Wc1", format="binary")
-  write(bc1, dir + "bc1", format="binary")
-  write(Wc2, dir + "Wc2", format="binary")
-  write(bc2, dir + "bc2", format="binary")
-  write(Wc3, dir + "Wc3", format="binary")
-  write(bc3, dir + "bc3", format="binary")
-  write(Wa1, dir + "Wa1", format="binary")
-  write(ba1, dir + "ba1", format="binary")
-  write(Wa2, dir + "Wa2", format="binary")
-  write(ba2, dir + "ba2", format="binary")
-}
-
-predict = function(matrix[double] X, int C, int Hin, int Win,
-                   matrix[double] Wc1, matrix[double] bc1,
-                   matrix[double] Wc2, matrix[double] bc2,
-                   matrix[double] Wc3, matrix[double] bc3,
-                   matrix[double] Wa1, matrix[double] ba1,
-                   matrix[double] Wa2, matrix[double] ba2)
-    return (matrix[double] probs) {
-  /*
-   * Computes the class probability predictions of a convolutional
-   * net using the "LeNet" architecture.
-   *
-   * The input matrix, X, has N examples, each represented as a 3D
-   * volume unrolled into a single vector.
-   *
-   * Inputs:
-   *  - X: Input data matrix, of shape (N, C*Hin*Win).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - Wc1: 1st conv layer weights (parameters) matrix, of shape (F1, C*Hf*Wf).
-   *  - bc1: 1st conv layer biases vector, of shape (F1, 1).
-   *  - Wc2: 2nd conv layer weights (parameters) matrix, of shape (F2, F1*Hf*Wf).
-   *  - bc2: 2nd conv layer biases vector, of shape (F2, 1).
-   *  - Wc3: 3rd conv layer weights (parameters) matrix, of shape (F3, F2*Hf*Wf).
-   *  - bc3: 3rd conv layer biases vector, of shape (F3, 1).
-   *  - Wa1: 1st affine layer weights (parameters) matrix, of shape (F3*(Hin/2^3)*(Win/2^3), N1).
-   *  - ba1: 1st affine layer biases vector, of shape (1, N1).
-   *  - Wa2: 2nd affine layer weights (parameters) matrix, of shape (N1, K).
-   *  - ba2: 2nd affine layer biases vector, of shape (1, K).
-   *
-   * Outputs:
-   *  - probs: Class probabilities, of shape (N, K).
-   */
-  N = nrow(X)
-
-  # Network:
-  # conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> conv3 -> relu3 -> pool3
-  #  -> affine1 -> relu1 -> affine2 -> softmax
-  Hf = 3  # filter height
-  Wf = 3  # filter width
-  stride = 1
-  pad = 1  # For same dimensions, (Hf - stride) / 2
-
-  F1 = nrow(Wc1)  # num conv filters in conv1
-  F2 = nrow(Wc2)  # num conv filters in conv2
-  F3 = nrow(Wc3)  # num conv filters in conv3
-  N1 = ncol(Wa1)  # num nodes in affine1
-  K = ncol(Wa2)  # num nodes in affine2, equal to number of target dimensions (num classes)
-
-  # TODO: Implement fast, distributed conv & max pooling operators so that predictions
-  # can be computed in a full-batch, distributed manner. Alternatively, improve `parfor`
-  # so that it can be efficiently used for parallel predictions.
-  ## Compute forward pass
-  ### conv layer 1: conv1 -> relu1 -> pool1
-  #[outc1, Houtc1, Woutc1] = conv2d::forward(X, Wc1, bc1, C, Hin, Win, Hf, Wf, stride, stride,
-  #                                          pad, pad)
-  #outc1r = relu::forward(outc1)
-  #[outc1p, Houtc1p, Woutc1p] = max_pool2d::forward(outc1r, F1, Houtc1, Woutc1, Hf=2, Wf=2,
-  #                                                 strideh=2, stridew=2, padh=0, padw=0)
-  ### conv layer 2: conv2 -> relu2 -> pool2
-  #[outc2, Houtc2, Woutc2] = conv2d::forward(outc1p, Wc2, bc2, F1, Houtc1p, Woutc1p, Hf, Wf,
-  #                                          stride, stride, pad, pad)
-  #outc2r = relu::forward(outc2)
-  #[outc2p, Houtc2p, Woutc2p] = max_pool2d::forward(outc2r, F2, Houtc2, Woutc2, Hf=2, Wf=2,
-  #                                                 strideh=2, stridew=2, padh=0, padw=0)
-  ### conv layer 3: conv3 -> relu3 -> pool3
-  #[outc3, Houtc3, Woutc3] = conv2d::forward(outc2p, Wc3, bc3, F2, Houtc2p, Woutc2p, Hf, Wf,
-  #                                          stride, stride, pad, pad)
-  #outc3r = relu::forward(outc3)
-  #[outc3p, Houtc3p, Woutc3p] = max_pool2d::forward(outc3r, F3, Houtc3, Woutc3, Hf=2, Wf=2,
-  #                                                 strideh=2, stridew=2, padh=0, padw=0)
-  ### affine layer 1: affine1 -> relu1 -> dropout
-  #outa1 = affine::forward(outc3p, Wa1, ba1)
-  #outa1r = relu::forward(outa1)
-  ##[outa1d, maskad1] = dropout::forward(outa1r, 0.5, -1)
-  ### affine layer 2: affine2 -> softmax
-  #outa2 = affine::forward(outa1r, Wa2, ba2)
-  #probs = softmax::forward(outa2)
-
-  # Compute predictions over mini-batches
-  probs = matrix(0, rows=N, cols=K)
-  batch_size = 50
-  iters = ceil(N / batch_size)
-  for(i in 1:iters) {
-    # TODO: `parfor` should work here, possibly as an alternative to distributed predictions.
-    #parfor(i in 1:iters, check=0, mode=REMOTE_SPARK, resultmerge=REMOTE_SPARK) {
-    # Get next batch
-    beg = ((i-1) * batch_size) %% N + 1
-    end = min(N, beg + batch_size - 1)
-    X_batch = X[beg:end,]
-
-    # Compute forward pass
-    ## conv layer 1: conv1 -> relu1 -> pool1
-    [outc1, Houtc1, Woutc1] = conv2d::forward(X_batch, Wc1, bc1, C, Hin, Win, Hf, Wf,
-                                              stride, stride, pad, pad)
-    outc1r = relu::forward(outc1)
-    [outc1p, Houtc1p, Woutc1p] = max_pool2d::forward(outc1r, F1, Houtc1, Woutc1, Hf=2, Wf=2,
-                                                     strideh=2, stridew=2, padh=0, padw=0)
-    ## conv layer 2: conv2 -> relu2 -> pool2
-    [outc2, Houtc2, Woutc2] = conv2d::forward(outc1p, Wc2, bc2, F1, Houtc1p, Woutc1p, Hf, Wf,
-                                              stride, stride, pad, pad)
-    outc2r = relu::forward(outc2)
-    [outc2p, Houtc2p, Woutc2p] = max_pool2d::forward(outc2r, F2, Houtc2, Woutc2, Hf=2, Wf=2,
-                                                     strideh=2, stridew=2, padh=0, padw=0)
-    ## conv layer 3: conv3 -> relu3 -> pool3
-    [outc3, Houtc3, Woutc3] = conv2d::forward(outc2p, Wc3, bc3, F2, Houtc2p, Woutc2p, Hf, Wf,
-                                              stride, stride, pad, pad)
-    outc3r = relu::forward(outc3)
-    [outc3p, Houtc3p, Woutc3p] = max_pool2d::forward(outc3r, F3, Houtc3, Woutc3, Hf=2, Wf=2,
-                                                     strideh=2, stridew=2, padh=0, padw=0)
-    ## affine layer 1: affine1 -> relu1 -> dropout
-    outa1 = affine::forward(outc3p, Wa1, ba1)
-    outa1r = relu::forward(outa1)
-    #[outa1d, maskad1] = dropout::forward(outa1r, 0.5, -1)  # dropout is disabled at prediction time
-    ## affine layer 2: affine2 -> softmax
-    outa2 = affine::forward(outa1r, Wa2, ba2)
-    probs_batch = softmax::forward(outa2)
-
-    # Store predictions
-    probs[beg:end,] = probs_batch
-  }
-}
-
-eval = function(matrix[double] probs, matrix[double] Y)
-    return (double loss, double accuracy) {
-  /*
-   * Evaluates a convolutional net using the "LeNet" architecture.
-   *
-   * The probs matrix contains the class probability predictions
-   * of K classes over N examples. The targets, Y, have K classes,
-   * and are one-hot encoded.
-   *
-   * Inputs:
-   *  - probs: Class probabilities, of shape (N, K).
-   *  - Y: Target matrix, of shape (N, K).
-   *
-   * Outputs:
-   *  - loss: Scalar loss, of shape (1).
-   *  - accuracy: Scalar accuracy, of shape (1).
-   */
-  # Compute loss & accuracy
-  loss = cross_entropy_loss::forward(probs, Y)
-  correct_pred = rowIndexMax(probs) == rowIndexMax(Y)
-  accuracy = mean(correct_pred)
-}
-
-generate_dummy_data = function()
-    return (matrix[double] X, matrix[double] Y, int C, int Hin, int Win) {
-  /*
-   * Generate a dummy dataset similar to the breast cancer dataset.
-   *
-   * Outputs:
-   *  - X: Input data matrix, of shape (N, D).
-   *  - Y: Target matrix, of shape (N, K).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   */
-  # Generate dummy input data
-  N = 1024  # num examples
-  C = 3  # num input channels
-  Hin = 256  # input height
-  Win = 256  # input width
-  K = 3  # num target classes
-  X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
-  classes = round(rand(rows=N, cols=1, min=1, max=K, pdf="uniform"))
-  Y = table(seq(1, N), classes)  # one-hot encoding
-}

http://git-wip-us.apache.org/repos/asf/systemml/blob/532da1bc/projects/breast_cancer/hyperparam_tuning.dml
----------------------------------------------------------------------
diff --git a/projects/breast_cancer/hyperparam_tuning.dml b/projects/breast_cancer/hyperparam_tuning.dml
index 4f054c3..c5e0382 100644
--- a/projects/breast_cancer/hyperparam_tuning.dml
+++ b/projects/breast_cancer/hyperparam_tuning.dml
@@ -23,7 +23,7 @@
  * Hyperparameter Tuning Script For LeNet-like CNN Model
  */
 # Imports
-source("cnn.dml") as clf
+source("breastcancer/convnet.dml") as clf
 
 # Read data
 # X = read("data/X_0.01_sample_binary")

http://git-wip-us.apache.org/repos/asf/systemml/blob/532da1bc/projects/breast_cancer/nn
----------------------------------------------------------------------
diff --git a/projects/breast_cancer/nn b/projects/breast_cancer/nn
deleted file mode 120000
index 9c0c967..0000000
--- a/projects/breast_cancer/nn
+++ /dev/null
@@ -1 +0,0 @@
-../../scripts/staging/SystemML-NN/nn
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/systemml/blob/532da1bc/projects/breast_cancer/preprocess.py
----------------------------------------------------------------------
diff --git a/projects/breast_cancer/preprocess.py b/projects/breast_cancer/preprocess.py
index 95b9f36..167fa61 100644
--- a/projects/breast_cancer/preprocess.py
+++ b/projects/breast_cancer/preprocess.py
@@ -69,7 +69,7 @@ num_partitions = 20000
 add_row_indices = True
 train_frac = 0.8
 split_seed = 24
-folder = "/home/MDM/breast_cancer/data"
+folder = "data"  # Linux-filesystem directory to read raw data
 save_folder = "data"  # Hadoop-supported directory in which to save DataFrames
 df_path = os.path.join(save_folder, "samples_{}_{}{}.parquet".format(
   "labels" if training else "testing", sample_size, "_grayscale" if grayscale else ""))

http://git-wip-us.apache.org/repos/asf/systemml/blob/532da1bc/projects/breast_cancer/softmax_clf.dml
----------------------------------------------------------------------
diff --git a/projects/breast_cancer/softmax_clf.dml b/projects/breast_cancer/softmax_clf.dml
deleted file mode 100644
index 35fd545..0000000
--- a/projects/breast_cancer/softmax_clf.dml
+++ /dev/null
@@ -1,207 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Breast Cancer Softmax Model
- */
-# Imports
-source("nn/layers/affine.dml") as affine
-source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
-source("nn/layers/softmax.dml") as softmax
-#source("nn/optim/adam.dml") as adam
-source("nn/optim/sgd_nesterov.dml") as sgd_nesterov
-
-train = function(matrix[double] X, matrix[double] Y,
-                 matrix[double] X_val, matrix[double] Y_val,
-                 double lr, double mu, double decay,
-                 int batch_size, int epochs, int log_interval)
-    return (matrix[double] W, matrix[double] b) {
-  /*
-   * Trains a softmax classifier.
-   *
-   * The input matrix, X, has N examples, each with D features.
-   * The targets, Y, have K classes, and are one-hot encoded.
-   *
-   * Inputs:
-   *  - X: Input data matrix, of shape (N, D).
-   *  - Y: Target matrix, of shape (N, K).
-   *  - X_val: Input validation data matrix, of shape (N, D).
-   *  - Y_val: Target validation matrix, of shape (N, K).
-   *  - lr: Learning rate.
-   *  - mu: Momentum value.
-   *      Typical values are in the range of [0.5, 0.99], usually
-   *      started at the lower end and annealed towards the higher end.
-   *  - decay: Learning rate decay rate.
-   *  - batch_size: Size of mini-batches to train on.
-   *  - epochs: Total number of full training loops over the full data set.
-   *  - log_interval: Interval, in iterations, between log outputs.
-   *
-   * Outputs:
-   *  - W: Weights (parameters) matrix, of shape (D, K).
-   *  - b: Biases vector, of shape (1, K).
-   */
-  N = nrow(Y)  # num examples
-  D = ncol(X)  # num features
-  K = ncol(Y)  # num classes
-
-  # Create softmax classifier:
-  # affine -> softmax
-  [W, b] = affine::init(D, K)
-  W = W / sqrt(2.0/(D)) * sqrt(1/(D))  # rescale the sqrt(2/D) init to sqrt(1/D), since W feeds softmax rather than relu
-
-  # Initialize SGD w/ Nesterov momentum optimizer
-  vW = sgd_nesterov::init(W)  # optimizer momentum state for W
-  vb = sgd_nesterov::init(b)  # optimizer momentum state for b
-  #[mW, vW] = adam::init(W)  # optimizer 1st & 2nd moment state for W
-  #[mb, vb] = adam::init(b)  # optimizer 1st & 2nd moment state for b
-
-  # Starting validation loss & accuracy
-  probs_val = predict(X_val, W, b)
-  loss_val = cross_entropy_loss::forward(probs_val, Y_val)
-  accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(Y_val))
-  # Output results
-  print("Start: Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val)
-
-  # Optimize
-  print("Starting optimization")
-  iters = ceil(N / batch_size)
-  for (e in 1:epochs) {
-    for(i in 1:iters) {
-      # Get next batch
-      beg = ((i-1) * batch_size) %% N + 1
-      end = min(N, beg + batch_size - 1)
-      #print("Epoch: " + e + ", Iter: " + i + ", X[" + beg + ":" + end + ",]")
-      X_batch = X[beg:end,]
-      Y_batch = Y[beg:end,]
-
-      # Compute forward pass
-      ## affine & softmax:
-      out = affine::forward(X_batch, W, b)
-      probs = softmax::forward(out)
-
-      # Compute backward pass
-      ## loss:
-      dprobs = cross_entropy_loss::backward(probs, Y_batch)
-      ## affine & softmax:
-      dout = softmax::backward(dprobs, out)
-      [dX_batch, dW, db] = affine::backward(dout, X_batch, W, b)
-
-      # Optimize with SGD w/ Nesterov momentum
-      [W, vW] = sgd_nesterov::update(W, dW, lr, mu, vW)
-      [b, vb] = sgd_nesterov::update(b, db, lr, mu, vb)
-      #[W, mW, vW] = adam::update(W, dW, lr, 0.9, 0.999, 1e-8, e*i-1, mW, vW)
-      #[b, mb, vb] = adam::update(b, db, lr, 0.9, 0.999, 1e-8, e*i-1, mb, vb)
-
-      # Compute loss & accuracy for training & validation data every `log_interval` iterations.
-      if (i %% log_interval == 0) {
-        #print("Eval time! - i: " + i)
-        # Compute training loss & accuracy
-        loss = cross_entropy_loss::forward(probs, Y_batch)
-        accuracy = mean(rowIndexMax(probs) == rowIndexMax(Y_batch))
-
-        # Compute validation loss & accuracy
-        probs_val = predict(X_val, W, b)
-        loss_val = cross_entropy_loss::forward(probs_val, Y_val)
-        accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(Y_val))
-
-        # Output results
-        print("Epoch: " + e + "/" + epochs + ", Iter: " + i + "/" + iters
-              + ", Train Loss: " + loss + ", Train Accuracy: " + accuracy + ", Val Loss: "
-              + loss_val + ", Val Accuracy: " + accuracy_val + ", lr: " + lr + ", mu: " + mu)
-      }
-    }
-    # Anneal momentum towards 0.999
-    mu = mu + (0.999 - mu)/(1+epochs-e)
-    # Decay learning rate
-    lr = lr * decay
-  }
-}
-
-predict = function(matrix[double] X, matrix[double] W, matrix[double] b)
-    return (matrix[double] probs) {
-  /*
-   * Computes the class probability predictions of a softmax classifier.
-   *
-   * The input matrix, X, has N examples, each with D features.
-   *
-   * Inputs:
-   *  - X: Input data matrix, of shape (N, D).
-   *  - W: Weights (parameters) matrix, of shape (D, K).
-   *  - b: Biases vector, of shape (1, K).
-   *
-   * Outputs:
-   *  - probs: Class probabilities, of shape (N, K).
-   */
-  N = nrow(X)  # num examples
-  K = ncol(W)  # num classes
-
-  # Compute forward pass
-  ## affine & softmax:
-  out = affine::forward(X, W, b)
-  probs = softmax::forward(out)
-}
-
-eval = function(matrix[double] probs, matrix[double] Y)
-    return (double loss, double accuracy) {
-  /*
-   * Evaluates a softmax classifier.
-   *
-   * The probs matrix contains the class probability predictions
-   * of K classes over N examples. The targets, Y, have K classes,
-   * and are one-hot encoded.
-   *
-   * Inputs:
-   *  - probs: Class probabilities, of shape (N, K).
-   *  - Y: Target matrix, of shape (N, K).
-   *
-   * Outputs:
-   *  - loss: Scalar loss, of shape (1).
-   *  - accuracy: Scalar accuracy, of shape (1).
-   */
-  # Compute loss & accuracy
-  loss = cross_entropy_loss::forward(probs, Y)
-  correct_pred = rowIndexMax(probs) == rowIndexMax(Y)
-  accuracy = mean(correct_pred)
-}
-
-generate_dummy_data = function()
-    return (matrix[double] X, matrix[double] Y, int C, int Hin, int Win) {
-  /*
-   * Generate a dummy dataset similar to the breast cancer dataset.
-   *
-   * Outputs:
-   *  - X: Input data matrix, of shape (N, D).
-   *  - Y: Target matrix, of shape (N, K).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   */
-  # Generate dummy input data
-  N = 1024  # num examples
-  C = 3  # num input channels
-  Hin = 256  # input height
-  Win = 256  # input width
-  T = 10  # num targets
-  X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
-  classes = round(rand(rows=N, cols=1, min=1, max=T, pdf="uniform"))
-  Y = table(seq(1, N), classes)  # one-hot encoding
-}
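
For reference, a minimal, untested driver sketch in DML showing how the softmax classifier above is intended to be wired together. The source path and all hyperparameter values here are illustrative assumptions, not part of this commit:

# Hypothetical driver for the deleted softmax_clf.dml (illustrative sketch only)
source("softmax_clf.dml") as clf  # assumes the module is still available at this path

# Build a dummy dataset and hold out a validation split
[X, Y, C, Hin, Win] = clf::generate_dummy_data()
n = nrow(X)
n_val = 128  # illustrative validation-set size
X_val = X[1:n_val,]
Y_val = Y[1:n_val,]
X_train = X[(n_val+1):n,]
Y_train = Y[(n_val+1):n,]

# Train; lr, mu, decay, batch_size, epochs, and log_interval are illustrative values
[W, b] = clf::train(X_train, Y_train, X_val, Y_val, 0.01, 0.9, 0.99, 50, 5, 10)

# Score and evaluate on the held-out set
probs = clf::predict(X_val, W, b)
[loss, accuracy] = clf::eval(probs, Y_val)
print("Final: Val Loss: " + loss + ", Val Accuracy: " + accuracy)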