This is an automated email from the ASF dual-hosted git repository.

baunsgaard pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/main by this push:
     new 154b9ff6cd [SYSTEMDS-3397] Python NN testExample
154b9ff6cd is described below

commit 154b9ff6cdc15a917b4714914171372f84eb82e3
Author: baunsgaard <[email protected]>
AuthorDate: Tue Jun 28 22:40:53 2022 +0200

    [SYSTEMDS-3397] Python NN testExample
    
    A simple Python example of neural network training and preprocessing.
    Two different scenarios are tested (a condensed sketch follows below):

    1. Train and measure accuracy in one go.
    2. Train a model and save it, then load it again and predict.
    
    Closes #1648
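
For orientation, the two scenarios condense to roughly the following Python. This is an illustrative sketch assembled from the new test_adult_neural.py further down in this diff; the constants, paths, and API calls are taken from that test, and the scale()/scaleApply() step the test also exercises is omitted for brevity.

```python
from systemds.context import SystemDSContext
from systemds.examples.tutorials.adult import DataManager

sds = SystemDSContext()
d = DataManager()

# Preprocessing, condensed from the test's prepare_x()/prepare_y() helpers:
# encode the features, apply the same encoding to the test split,
# and recode + one-hot encode the income label.
jspec = d.get_jspec(sds)
train_x, M1 = d.get_train_data(sds)[0:15000].transform_encode(spec=jspec)
test_x = d.get_test_data(sds)[0:300].transform_apply(spec=jspec, meta=M1)

jspec_dict = {"recode": ["income"]}
jspec_labels = sds.scalar(f'"{jspec_dict}"')
train_y, M2 = d.get_train_labels(sds)[0:15000].transform_encode(spec=jspec_labels)
test_y = d.get_test_labels(sds)[0:300].transform_apply(spec=jspec_labels, meta=M2)
train_y, test_y = train_y.to_one_hot(2), test_y.to_one_hot(2)

fnn = sds.source("tests/examples/tutorials/neural_net_source.dml", "fnn")

# Scenario 1: train and measure accuracy in one go.
network = fnn.train(train_x, train_y, 4, 16, 0.01, 1)
accuracy = fnn.eval(fnn.predict(test_x, network), test_y).compute()

# Scenario 2: train and save the model, then load it again and predict.
network = fnn.train(train_x, train_y, 4, 16, 0.01, 1)
network.write("tests/examples/tutorials/model/fnn").compute()
reloaded = sds.read("tests/examples/tutorials/model/fnn")
probs = fnn.predict(test_x, reloaded)
accuracy_reloaded = fnn.eval(probs, test_y).compute()

sds.close()
```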
---
 src/main/python/.gitignore                         |   5 +-
 src/main/python/tests/README.md                    |   4 +
 .../tests/examples/tutorials/neural_net_source.dml | 129 +++++-------
 .../python/tests/examples/tutorials/test_adult.py  | 217 +--------------------
 .../tests/examples/tutorials/test_adult_neural.py  | 130 ++++++++++++
 5 files changed, 191 insertions(+), 294 deletions(-)

diff --git a/src/main/python/.gitignore b/src/main/python/.gitignore
index f60a72e9ad..ad5ef4dc44 100644
--- a/src/main/python/.gitignore
+++ b/src/main/python/.gitignore
@@ -1,5 +1,4 @@
 
-
 # Git ignore for python files.
 systemds/lib/
 systemds.egg-info/
@@ -15,10 +14,10 @@ tests/onnx_systemds/output_test
 tests/onnx_systemds/dml_output
 tests/onnx_systemds/test_models/*.onnx
 
-# git ignore tmp federated files
+# git ignore tmp test files
 tests/federated/output
 tests/federated/worker
 tests/federated/tmp
-
 tests/list/tmp
 tests/algorithms/readwrite/
+tests/examples/tutorials/model
diff --git a/src/main/python/tests/README.md b/src/main/python/tests/README.md
index bf6b6e35f0..24e0f01863 100644
--- a/src/main/python/tests/README.md
+++ b/src/main/python/tests/README.md
@@ -24,7 +24,11 @@ Tests are easily executed using unittest:
 But before executing the tests it is recommended to go through systemds [Setting SYSTEMDS_ROOT environment](/bin/README.md)
 
 ```bash
+# Single thread:
 python -m unittest discover -s tests -p 'test_*.py'
+
+# Parallel
+unittest-parallel -t . -s tests --module-fixtures
 ```
 
 This command searches through the test directory and finds all python files starting with `test_` and executes them.
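
If you only want to run the new tutorial tests from this commit, a small Python equivalent of the discover command above could look like this (an illustrative sketch using only the standard unittest module; it is not part of the commit):

```python
# Discover and run only the neural-network tutorial test added in this commit.
import unittest

suite = unittest.defaultTestLoader.discover(
    "tests/examples/tutorials", pattern="test_adult_neural.py")
unittest.TextTestRunner(verbosity=2).run(suite)
```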
diff --git a/src/main/python/tests/examples/tutorials/neural_net_source.dml b/src/main/python/tests/examples/tutorials/neural_net_source.dml
index 037ba7891b..8615f04991 100644
--- a/src/main/python/tests/examples/tutorials/neural_net_source.dml
+++ b/src/main/python/tests/examples/tutorials/neural_net_source.dml
@@ -21,9 +21,10 @@
 
 # Imports
 source("nn/layers/affine.dml") as affine
-source("nn/layers/logcosh_loss.dml") as logcosh
-source("nn/layers/elu.dml") as elu
+source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
+source("nn/layers/relu.dml") as relu
 source("nn/layers/sigmoid.dml") as sigmoid
+source("nn/layers/softmax.dml") as softmax
 source("nn/optim/sgd.dml") as sgd
 
 init_model = function(Integer inputDimension, Integer outputDimension, int seed = -1)
@@ -36,26 +37,23 @@ init_model = function(Integer inputDimension, Integer outputDimension, int seed
   model = list(W1, W2, W3, b1, b2, b3)
 }
 
-
 predict = function(matrix[double] X,
                    list[unknown] model)
     return (matrix[double] probs) {
 
-  W1 = as.matrix(model[1])
-  W2 = as.matrix(model[2])
-  W3 = as.matrix(model[3])
-  b1 = as.matrix(model[4])
-  b2 = as.matrix(model[5])
-  b3 = as.matrix(model[6])
+  W1 = as.matrix(model[1]);  b1 = as.matrix(model[4])
+  W2 = as.matrix(model[2]);  b2 = as.matrix(model[5])
+  W3 = as.matrix(model[3]);  b3 = as.matrix(model[6])
 
-  out1elu = elu::forward(affine::forward(X, W1, b1),1)
-  out2elu = elu::forward(affine::forward(out1elu, W2, b2),1)
-  probs = elu::forward(affine::forward(out2elu, W3, b3),1)
+  out1a = sigmoid::forward(affine::forward(X, W1, b1))
+  out2a = relu::forward(affine::forward(out1a, W2, b2))
+  probs = softmax::forward(affine::forward(out2a, W3, b3))
 }
 
 eval = function(matrix[double] probs, matrix[double] y)
-    return (double loss) {
-  loss = logcosh::forward(probs, y)
+    return (double accuracy) {
+  correct_pred = rowIndexMax(probs) == rowIndexMax(y)
+  accuracy = mean(correct_pred)
 }
 
 gradients = function(list[unknown] model,
@@ -64,33 +62,31 @@ gradients = function(list[unknown] model,
                      matrix[double] labels)
     return (list[unknown] gradients) {
 
-  W1 = as.matrix(model[1])
-  W2 = as.matrix(model[2])
-  W3 = as.matrix(model[3])
-  b1 = as.matrix(model[4])
-  b2 = as.matrix(model[5])
-  b3 = as.matrix(model[6])
-
+  W1 = as.matrix(model[1]); b1 = as.matrix(model[4])
+  W2 = as.matrix(model[2]); b2 = as.matrix(model[5])
+  W3 = as.matrix(model[3]); b3 = as.matrix(model[6])
+  
   # Compute forward pass
   out1 = affine::forward(features, W1, b1)
-  out1elu = elu::forward(out1, 1)
-  out2 = affine::forward(out1elu, W2, b2)
-  out2elu = elu::forward(out2, 1)
-  out3 = affine::forward(out2elu, W3, b3)
-  probs = elu::forward(out3,1)
+  out1a = sigmoid::forward(out1)
+  out2 = affine::forward(out1a, W2, b2)
+  out2a = relu::forward(out2)
+  out3 = affine::forward(out2a, W3, b3)
+  probs = softmax::forward(out3)
 
   # Compute loss & accuracy for training data
-  loss = logcosh::forward(probs, labels)
+  loss = cross_entropy_loss::forward(probs, labels)
   print("Batch loss: " + loss)
 
   # Compute data backward pass
-  dprobs = logcosh::backward(probs, labels)
-  dout3 = elu::backward(dprobs, out3, 1)
-  [dout2elu, dW3, db3] = affine::backward(dout3, out2elu, W3, b3)
-  dout2 = elu::backward(dout2elu, out2, 1)
-  [dout1elu, dW2, db2] = affine::backward(dout2, out1elu, W2, b2)
-  dout1 = elu::backward(dout1elu, out1, 1)
-  [dfeatures, dW1, db1] = affine::backward(dout1, features, W1, b1)
+  # Note: backward takes the same arguments as forward, with one extra argument (the upstream gradient) in front
+  dloss = cross_entropy_loss::backward(probs, labels)
+  dout3 = softmax::backward(dloss, out3)
+  [dout2a, dW3, db3] = affine::backward(dout3, out2a, W3, b3)
+  dout2 = relu::backward(dout2a, out2)
+  [dout1a, dW2, db2] = affine::backward(dout2, out1a, W2, b2)
+  dout1 = sigmoid::backward(dout1a, out1)
+  [a, dW1, db1] = affine::backward(dout1, features, W1, b1)
 
   gradients = list(dW1, dW2, dW3, db1, db2, db3)
 }
@@ -100,18 +96,13 @@ aggregation = function(list[unknown] model,
                        list[unknown] gradients)
     return (list[unknown] model_result) {
 
-  W1 = as.matrix(model[1])
-  W2 = as.matrix(model[2])
-  W3 = as.matrix(model[3])
-  b1 = as.matrix(model[4])
-  b2 = as.matrix(model[5])
-  b3 = as.matrix(model[6])
-  dW1 = as.matrix(gradients[1])
-  dW2 = as.matrix(gradients[2])
-  dW3 = as.matrix(gradients[3])
-  db1 = as.matrix(gradients[4])
-  db2 = as.matrix(gradients[5])
-  db3 = as.matrix(gradients[6])
+  W1 = as.matrix(model[1]); dW1 = as.matrix(gradients[1])
+  W2 = as.matrix(model[2]); dW2 = as.matrix(gradients[2])
+  W3 = as.matrix(model[3]); dW3 = as.matrix(gradients[3])
+  b1 = as.matrix(model[4]); db1 = as.matrix(gradients[4])
+  b2 = as.matrix(model[5]); db2 = as.matrix(gradients[5])
+  b3 = as.matrix(model[6]); db3 = as.matrix(gradients[6])
+  
   learning_rate = as.double(as.scalar(hyperparams["learning_rate"]))
 
   # Optimize with SGD
@@ -125,7 +116,6 @@ aggregation = function(list[unknown] model,
   model_result = list(W1, W2, W3, b1, b2, b3)
 }
 
-
 train = function(matrix[double] X, matrix[double] y,
                  int epochs, int batch_size, double learning_rate, 
                  int seed = -1)
@@ -136,12 +126,9 @@ train = function(matrix[double] X, matrix[double] y,
   K = ncol(y)  # num classes
 
   model = init_model(D, K, seed)
-  W1 = as.matrix(model[1])
-  W2 = as.matrix(model[2])
-  W3 = as.matrix(model[3])
-  b1 = as.matrix(model[4])
-  b2 = as.matrix(model[5])
-  b3 = as.matrix(model[6])
+  W1 = as.matrix(model[1]); b1 = as.matrix(model[4])
+  W2 = as.matrix(model[2]); b2 = as.matrix(model[5])
+  W3 = as.matrix(model[3]); b3 = as.matrix(model[6])
   
   # Create the hyper parameter list
   hyperparams = list(learning_rate=learning_rate)
@@ -163,13 +150,10 @@ train = function(matrix[double] X, matrix[double] y,
       gradients_list = gradients(model_list, hyperparams, X_batch, y_batch)
       model_updated = aggregation(model_list, hyperparams, gradients_list)
 
-      W1 = as.matrix(model_updated[1])
-      W2 = as.matrix(model_updated[2])
-      W3 = as.matrix(model_updated[3])
-      b1 = as.matrix(model_updated[4])
-      b2 = as.matrix(model_updated[5])
-      b3 = as.matrix(model_updated[6])
-
+      W1 = as.matrix(model_updated[1]); b1 = as.matrix(model_updated[4])
+      W2 = as.matrix(model_updated[2]); b2 = as.matrix(model_updated[5])
+      W3 = as.matrix(model_updated[3]); b3 = as.matrix(model_updated[6])
+      
     }
   }
 
@@ -178,9 +162,13 @@ train = function(matrix[double] X, matrix[double] y,
 
 train_paramserv = function(matrix[Double] X, matrix[Double] y,
     Integer epochs, Integer batch_size, Double learning_rate, Integer workers,
-    String utype, String freq, String mode, Integer seed)
+    Integer seed)
     return (list[unknown] model_trained) {
 
+  utype = "BSP"
+  freq = "BATCH"
+  mode = "LOCAL"
+
   N = nrow(X)  # num examples
   D = ncol(X)  # num features
   K = ncol(y)  # num classes
@@ -194,24 +182,9 @@ train_paramserv = function(matrix[Double] X, matrix[Double] y,
   # Use paramserv function
   model_trained = paramserv(model=model_list, features=X, labels=y, 
     val_features=matrix(0, rows=0, cols=0), val_labels=matrix(0, rows=0, cols=0), 
-    upd="./network/TwoNN.dml::gradients", 
agg="./network/TwoNN.dml::aggregation",
+    upd="./tests/examples/tutorials/neural_net_source.dml::gradients",
+    agg="./tests/examples/tutorials/neural_net_source.dml::aggregation",
     mode=mode, utype=utype, freq=freq, epochs=epochs, batchsize=batch_size,
     k=workers, hyperparams=params, checkpointing="NONE")
 
 }
-
-save_model = function (list[unknown] model, String baseFolder){
-  W1  = as.matrix(model[1])
-  W2  = as.matrix(model[2])
-  W3  = as.matrix(model[3])
-  b1  = as.matrix(model[4])
-  b2  = as.matrix(model[5])
-  b3  = as.matrix(model[6])
-
-  write(W1, (baseFolder + "/W1.data"), format="binary")
-  write(W2, (baseFolder + "/W2.data"), format="binary")
-  write(W3, (baseFolder + "/W3.data"), format="binary")
-  write(b1, (baseFolder + "/b1.data"), format="binary")
-  write(b2, (baseFolder + "/b2.data") , format="binary")
-  write(b3, (baseFolder + "/b3.data") , format="binary")
-}
\ No newline at end of file
diff --git a/src/main/python/tests/examples/tutorials/test_adult.py b/src/main/python/tests/examples/tutorials/test_adult.py
index ddafc96e28..d327676977 100644
--- a/src/main/python/tests/examples/tutorials/test_adult.py
+++ b/src/main/python/tests/examples/tutorials/test_adult.py
@@ -18,20 +18,17 @@
 # under the License.
 #
 # -------------------------------------------------------------
-import os
+
 import unittest
 
 import numpy as np
 from systemds.context import SystemDSContext
 from systemds.examples.tutorials.adult import DataManager
-from systemds.operator import Frame, Matrix, OperationNode
-from systemds.operator.algorithm import (confusionMatrix, kmeans, l2svm,
-                                         multiLogReg, multiLogRegPredict,
-                                         scale, scaleApply, split, winsorize)
-from systemds.script_building import DMLScript
+from systemds.operator.algorithm import (confusionMatrix, 
+                                         multiLogReg, multiLogRegPredict)
 
 
-class Test_DMLScript(unittest.TestCase):
+class TestAdultStandardML(unittest.TestCase):
     """
     Test class for adult dml script tutorial code.
     """
@@ -152,212 +149,6 @@ class Test_DMLScript(unittest.TestCase):
         self.assertTrue(confusion_numpy[1][1] > 0.5)
         self.assertTrue(confusion_numpy[1][0] < 0.2)
 
-    # def test_neural_net(self):
-    #     # Reduced because we want the tests to finish a bit faster.
-    #     train_count = 15000
-    #     test_count = 5000
-
-    #     train_data, train_labels, test_data, test_labels = self.d.get_preprocessed_dataset(interpolate=True, standardize=True, dimred=0.1)
-
-    #     # Train data
-    #     X = self.sds.from_numpy( train_data[:train_count])
-    #     Y = self.sds.from_numpy( train_labels[:train_count])
-
-    #     # Test data
-    #     Xt = self.sds.from_numpy(test_data[:test_count])
-    #     Yt = self.sds.from_numpy(test_labels[:test_count])
-
-    #     FFN_package = self.sds.source(self.neural_net_src_path, "fnn", print_imported_methods=True)
-
-    #     network = FFN_package.train(X, Y, 1, 16, 0.01, 1)
-
-    #     self.assertTrue(type(network) is not None) # sourcing and training seems to works
-
-    #     FFN_package.save_model(network, '"model/python_FFN/"').compute(verbose=True)
-
-    #     # TODO This does not work yet, not sure what the problem is
-    #     #probs = FFN_package.predict(Xt, network).compute(True)
-    #     # FFN_package.eval(Yt, Yt).compute()
-
-    # def test_level1(self):
-    #     # Reduced because we want the tests to finish a bit faster.
-    #     train_count = 15000
-    #     test_count = 5000
-    #     train_data, train_labels, test_data, test_labels = self.d.get_preprocessed_dataset(interpolate=True,
-    #                                                                                        standardize=True, dimred=0.1)
-    #     # Train data
-    #     X = self.sds.from_numpy(train_data[:train_count])
-    #     Y = self.sds.from_numpy(train_labels[:train_count])
-    #     Y = Y + 1.0
-
-    #     # Test data
-    #     Xt = self.sds.from_numpy(test_data[:test_count])
-    #     Yt = self.sds.from_numpy(test_labels[:test_count])
-    #     Yt = Yt + 1.0
-
-    #     betas = multiLogReg(X, Y)
-
-    #     [_, y_pred, acc] = multiLogRegPredict(Xt, betas, Yt).compute()
-    #     self.assertGreater(acc, 80) #Todo remove?
-    #     # todo add text how high acc should be with this config
-
-    #     confusion_matrix_abs, _ = confusionMatrix(self.sds.from_numpy(y_pred), Yt).compute()
-    #     # todo print confusion matrix? Explain cm?
-    #     self.assertTrue(
-    #         np.allclose(
-    #             confusion_matrix_abs,
-    #             np.array([[3583, 502],
-    #                       [245, 670]])
-    #         )
-    #     )
-
-    # def test_level2(self):
-
-    #     train_count = 32561
-    #     test_count = 16281
-
-    #     SCHEMA = '"DOUBLE,STRING,DOUBLE,STRING,DOUBLE,STRING,STRING,STRING,STRING,STRING,DOUBLE,DOUBLE,DOUBLE,STRING,STRING"'
-
-    #     F1 = self.sds.read(
-    #         self.dataset_path_train,
-    #         schema=SCHEMA
-    #     )
-    #     F2 = self.sds.read(
-    #         self.dataset_path_test,
-    #         schema=SCHEMA
-    #     )
-
-    #     jspec = self.sds.read(self.dataset_jspec, data_type="scalar", value_type="string")
-    #     PREPROCESS_package = self.sds.source(self.preprocess_src_path, "preprocess", print_imported_methods=True)
-
-    #     X1 = F1.rbind(F2)
-    #     X1, M1 = X1.transform_encode(spec=jspec)
-
-    #     X = PREPROCESS_package.get_X(X1, 1, train_count)
-    #     Y = PREPROCESS_package.get_Y(X1, 1, train_count)
-
-    #     Xt = PREPROCESS_package.get_X(X1, train_count, train_count+test_count)
-    #     Yt = PREPROCESS_package.get_Y(X1, train_count, train_count+test_count)
-
-    #     Yt = PREPROCESS_package.replace_value(Yt, 3.0, 1.0)
-    #     Yt = PREPROCESS_package.replace_value(Yt, 4.0, 2.0)
-
-    #     # better alternative for encoding. This was intended, but it does not work
-    #     #F2 = F2.replace("<=50K.", "<=50K")
-    #     #F2 = F2.replace(">50K.", ">50K")
-    #     #X1, M = F1.transform_encode(spec=jspec)
-    #     #X2 = F2.transform_apply(spec=jspec, meta=M)
-
-    #     #X = PREPROCESS_package.get_X(X1, 1, train_count)
-    #     #Y = PREPROCESS_package.get_Y(X1, 1, train_count)
-    #     #Xt = PREPROCESS_package.get_X(X2, 1, test_count)
-    #     #Yt = PREPROCESS_package.get_Y(X2, 1, test_count)
-
-    #     # TODO somehow throws error at predict with this included
-    #     #X, mean, sigma = scale(X, True, True)
-    #     #Xt = scaleApply(Xt, mean, sigma)
-
-    #     betas = multiLogReg(X, Y)
-
-    #     [_, y_pred, acc] = multiLogRegPredict(Xt, betas, Yt)
-
-    #     confusion_matrix_abs, _ = confusionMatrix(y_pred, Yt).compute()
-    #     print(confusion_matrix_abs)
-    #     self.assertTrue(
-    #         np.allclose(
-    #             confusion_matrix_abs,
-    #             np.array([[11593.,  1545.],
-    #                       [842., 2302.]])
-    #         )
-    #     )
-
-    # def test_level3(self):
-    #     train_count = 32561
-    #     test_count = 16281
-
-    #     SCHEMA = '"DOUBLE,STRING,DOUBLE,STRING,DOUBLE,STRING,STRING,STRING,STRING,STRING,DOUBLE,DOUBLE,DOUBLE,STRING,STRING"'
-
-    #     F1 = self.sds.read(
-    #         self.dataset_path_train,
-    #         schema=SCHEMA
-    #     )
-    #     F2 = self.sds.read(
-    #         self.dataset_path_test,
-    #         schema=SCHEMA
-    #     )
-
-    #     jspec = self.sds.read(self.dataset_jspec, data_type="scalar", value_type="string")
-    #     PREPROCESS_package = self.sds.source(self.preprocess_src_path, "preprocess", print_imported_methods=True)
-
-    #     X1 = F1.rbind(F2)
-    #     X1, M1 = X1.transform_encode(spec=jspec)
-
-    #     X = PREPROCESS_package.get_X(X1, 1, train_count)
-    #     Y = PREPROCESS_package.get_Y(X1, 1, train_count)
-
-    #     Xt = PREPROCESS_package.get_X(X1, train_count, train_count + test_count)
-    #     Yt = PREPROCESS_package.get_Y(X1, train_count, train_count + test_count)
-
-    #     Yt = PREPROCESS_package.replace_value(Yt, 3.0, 1.0)
-    #     Yt = PREPROCESS_package.replace_value(Yt, 4.0, 2.0)
-
-    #     # better alternative for encoding
-    #     # F2 = F2.replace("<=50K.", "<=50K")
-    #     # F2 = F2.replace(">50K.", ">50K")
-    #     # X1, M = F1.transform_encode(spec=jspec)
-    #     # X2 = F2.transform_apply(spec=jspec, meta=M)
-
-    #     # X = PREPROCESS_package.get_X(X1, 1, train_count)
-    #     # Y = PREPROCESS_package.get_Y(X1, 1, train_count)
-    #     # Xt = PREPROCESS_package.get_X(X2, 1, test_count)
-    #     # Yt = PREPROCESS_package.get_Y(X2, 1, test_count)
-
-    #     # TODO somehow throws error at predict with this included
-    #     # X, mean, sigma = scale(X, True, True)
-    #     # Xt = scaleApply(Xt, mean, sigma)
-
-    #     FFN_package = self.sds.source(self.neural_net_src_path, "fnn", print_imported_methods=True)
-
-    #     epochs = 1
-    #     batch_size = 16
-    #     learning_rate = 0.01
-    #     seed = 42
-
-    #     network = FFN_package.train(X, Y, epochs, batch_size, learning_rate, seed)
-
-    #     """
-    #     If more ressources are available, one can also choose to train the model using a parameter server.
-    #     Here we use the same parameters as before, however we need to specifiy a few more.
-    #     """
-    #     ################################################################################################################
-    #     # workers = 1
-    #     # utype = '"BSP"'
-    #     # freq = '"EPOCH"'
-    #     # mode = '"LOCAL"'
-    #     # network = FFN_package.train_paramserv(X, Y, epochs,
-    #     #                                       batch_size, learning_rate, workers, utype, freq, mode,
-    #     #                                       seed)
-    #     ################################################################################################################
-
-    #     FFN_package.save_model(network, '"model/python_FFN/"').compute(verbose=True)
-
-    #     """
-    #     Next we evaluate our network on the test set which was not used for training.
-    #     The predict function with the test features and our trained network returns a matrix of class probabilities.
-    #     This matrix contains for each test sample the probabilities for each class.
-    #     For predicting the most likely class of a sample, we choose the class with the highest probability.
-    #     """
-    #     ################################################################################################################
-    #     #probs = FFN_package.predict(Xt, network)
-    #     ################################################################################################################
-    #     """
-    #     To evaluate how well our model performed on the test set, we can use the probability matrix from the predict call and the real test labels
-    #     and compute the log-cosh loss.
-    #     """
-    #     ################################################################################################################
-    #     #FFN_package.eval(Xt, Yt).compute(True)
-    #     ################################################################################################################
-
 
 if __name__ == "__main__":
     unittest.main(exit=False)
diff --git a/src/main/python/tests/examples/tutorials/test_adult_neural.py b/src/main/python/tests/examples/tutorials/test_adult_neural.py
new file mode 100644
index 0000000000..1323ff22cf
--- /dev/null
+++ b/src/main/python/tests/examples/tutorials/test_adult_neural.py
@@ -0,0 +1,130 @@
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+import shutil
+import unittest
+
+from systemds.context import SystemDSContext
+from systemds.examples.tutorials.adult import DataManager
+from systemds.operator.algorithm.builtin.scale import scale
+from systemds.operator.algorithm.builtin.scaleApply import scaleApply
+
+
+class TestAdultNeural(unittest.TestCase):
+    """
+    Test class for adult neural network code
+    """
+
+    sds: SystemDSContext = None
+    d: DataManager = None
+    neural_net_src_path: str = "tests/examples/tutorials/neural_net_source.dml"
+    preprocess_src_path: str = "tests/examples/tutorials/preprocess.dml"
+    dataset_path_train: str = "../../test/resources/datasets/adult/train_data.csv"
+    dataset_path_train_mtd: str = "../../test/resources/datasets/adult/train_data.csv.mtd"
+    dataset_path_test: str = "../../test/resources/datasets/adult/test_data.csv"
+    dataset_path_test_mtd: str = "../../test/resources/datasets/adult/test_data.csv.mtd"
+    dataset_jspec: str = "../../test/resources/datasets/adult/jspec.json"
+
+    train_count: int = 15000
+    test_count: int = 300
+
+    network_dir: str = "tests/examples/tutorials/model"
+    network: str = network_dir + "/fnn"
+
+    @classmethod
+    def setUpClass(cls):
+        cls.sds = SystemDSContext()
+        cls.d = DataManager()
+        shutil.rmtree(cls.network_dir, ignore_errors=True)
+
+    @classmethod
+    def tearDownClass(cls):
+        cls.sds.close()
+        shutil.rmtree(cls.network_dir, ignore_errors=True)
+
+    # Tests
+
+    def test_train_neural_net(self):
+        self.train_neural_net_and_save()
+        self.eval_neural_net()
+
+    def test_train_predict(self):
+        self.train_neural_net_and_predict()
+
+    # Helper methods
+
+    def prepare_x(self):
+        jspec = self.d.get_jspec(self.sds)
+        train_x_frame = self.d.get_train_data(self.sds)[0:self.train_count]
+        train_x, M1 = train_x_frame.transform_encode(spec=jspec)
+        test_x_frame = self.d.get_test_data(self.sds)[0:self.test_count]
+        test_x = test_x_frame.transform_apply(spec=jspec, meta=M1)
+        # Scale and shift: not strictly needed because of the sigmoid layer,
+        # but it could be useful, so it is exercised here.
+        [train_x, ce, sc] = scale(train_x)
+        test_x = scaleApply(test_x, ce, sc)
+        return [train_x, test_x]
+
+    def prepare_y(self):
+        jspec_dict = {"recode": ["income"]}
+        jspec_labels = self.sds.scalar(f'"{jspec_dict}"')
+        train_y_frame = self.d.get_train_labels(self.sds)[0:self.train_count]
+        train_y, M2 = train_y_frame.transform_encode(spec=jspec_labels)
+        test_y_frame = self.d.get_test_labels(self.sds)[0:self.test_count]
+        test_y = test_y_frame.transform_apply(spec=jspec_labels, meta=M2)
+        labels = 2
+        train_y = train_y.to_one_hot(labels)
+        test_y = test_y.to_one_hot(labels)
+        return [train_y, test_y]
+
+    def prepare(self):
+        x = self.prepare_x()
+        y = self.prepare_y()
+        return [x[0], x[1], y[0], y[1]]
+
+    def train_neural_net_and_save(self):
+        [train_x, _, train_y, _] = self.prepare()
+        FFN_package = self.sds.source(self.neural_net_src_path, "fnn")
+        network = FFN_package.train(train_x, train_y, 4, 16, 0.01, 1)
+        network.write(self.network).compute()
+
+    def train_neural_net_and_predict(self):
+        [train_x, test_x, train_y, test_y] = self.prepare()
+        FFN_package = self.sds.source(self.neural_net_src_path, "fnn")
+        network = FFN_package.train_paramserv(
+            train_x, train_y, 4, 16, 0.01, 2,  1)
+        probs = FFN_package.predict(test_x, network)
+        accuracy = FFN_package.eval(probs, test_y).compute()
+        # accuracy is returned as a fraction between 0 and 1
+        self.assertTrue(accuracy > 0.80)
+
+    def eval_neural_net(self):
+        [_, test_x, _, test_y] = self.prepare()
+        network = self.sds.read(self.network)
+        FFN_package = self.sds.source(self.neural_net_src_path, "fnn")
+        probs = FFN_package.predict(test_x, network)
+        accuracy = FFN_package.eval(probs, test_y).compute()
+        # accuracy is returned as a fraction between 0 and 1
+        self.assertTrue(accuracy > 0.80)
+
+
+if __name__ == "__main__":
+    unittest.main(exit=False)
