[systemml] branch master updated: [MINOR] Script-level improvements mice builtin function

mboehm7 Sat, 25 Apr 2020 10:42:43 -0700

This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git



The following commit(s) were added to refs/heads/master by this push:
     new f450ead  [MINOR] Script-level improvements mice builtin function
f450ead is described below

commit f450ead5506d1615b5979bee85b39891e0f0fc00
Author: Matthias Boehm <mboe...@gmail.com>
AuthorDate: Sat Apr 25 19:40:58 2020 +0200

    [MINOR] Script-level improvements mice builtin function
    
    * Loop vectorization of scalar assignment
    * Removed unnecessary branch for table padding
    * Minor modifications of removeEmpty (rmEmpty) usage to enable more
    common subexpression elimination
---
 scripts/builtin/mice.dml | 44 +++++++++++++++++++-------------------------
 1 file changed, 19 insertions(+), 25 deletions(-)

diff --git a/scripts/builtin/mice.dml b/scripts/builtin/mice.dml
index 3f3c325..99d2be2 100644
--- a/scripts/builtin/mice.dml
+++ b/scripts/builtin/mice.dml
@@ -56,12 +56,12 @@ return(Frame[String] dataset, Frame[String] singleSet)
   col = ncol(F) 
   Result = matrix(0, rows=1, cols = col)
   Mask_Result = matrix(0, rows=1, cols=col)
-  cat = t(cMask) * seq(1, ncol(cMask))
-  cat = removeEmpty(target = cat, margin = "rows")
+  scat = seq(1, ncol(cMask))
+  cat = removeEmpty(target=scat, margin="rows", select=t(cMask))
   s=""
   for(i in 1: nrow(cat), check =0)
-    s = s+as.integer(as.scalar(cat[i, 1]))+",";  
-      
+    s = s+as.integer(as.scalar(cat[i, 1]))+",";
+  
   
   # encoding categorical columns using recode transformation
   jspecR = "{ids:true, recode:["+s+"]}";
@@ -70,7 +70,7 @@ return(Frame[String] dataset, Frame[String] singleSet)
   XO = replace(target=X, pattern=NaN, replacement=0);
 
   # remove categorical features and impute continous features with mean
-  eX_n = removeEmpty(target=X, margin="cols", select=(1-cMask))
+  eX_n = removeEmpty(target=X, margin="cols", select=(cMask==0))
   col_n = ncol(eX_n);
   # storing the mask/address of missing values
   Mask_n = is.na(eX_n);
@@ -80,7 +80,7 @@ return(Frame[String] dataset, Frame[String] singleSet)
   # filling the missing data with their means
   X2_n = eX_n+(Mask_n*colMeans(eX_n))
   # matrices for computing actul data
-  p_n = table( (seq(1, ncol(eX_n))) , (removeEmpty(target = t(cMask==0)*seq(1, ncol(cMask)), margin ="rows")) ,  1 )
+  p_n = table(seq(1, ncol(eX_n)), removeEmpty(target=scat, margin="rows", select=t(cMask==0)))
   if(ncol(p_n) < ncol(cMask))
     p_n = cbind(p_n, matrix(0, nrow(p_n), ncol(cMask)-ncol(p_n)))
   q = XO * cMask
@@ -91,8 +91,7 @@ return(Frame[String] dataset, Frame[String] singleSet)
   eX_c2 = removeEmpty(target = eX_c, margin = "rows", select = (rowSums(eX_c != 0)==col_c))
   colMod = matrix(0, 1, ncol(eX_c))
   # compute columnwise mode
-  parfor(i in 1: col_c)
-  {
+  parfor(i in 1: col_c) {
     f = eX_c2[, i] # adding one in data for dealing with zero category
     cat_counts = table(f, 1, n, 1);  # counts for each category
     mode = as.scalar(rowIndexMax(t(cat_counts)));
@@ -100,13 +99,10 @@ return(Frame[String] dataset, Frame[String] singleSet)
   }
   
   # find the mask of missing values 
-  tmpMask_c = (eX_c == 0);
-  tmpMask_c = (tmpMask_c * colMod) # fill missing values with mode
+  tmpMask_c = (eX_c==0) * colMod # fill missing values with mode
   
   # Generate a matrix of actual length
-  p_c = table((seq(1, ncol(tmpMask_c))) , (removeEmpty(target = t(cMask)*seq(1, ncol(cMask)), margin ="rows")), 1)
-  if(ncol(p_c) < ncol(cMask))
-    p_c = cbind(p_c, matrix(0, nrow(p_c), ncol(cMask)-ncol(p_c)))
+  p_c = table(seq(1, ncol(tmpMask_c)), removeEmpty(target=scat, margin ="rows", select=t(cMask)), ncol(tmpMask_c), ncol(cMask))
 
   Mask_c = tmpMask_c %*% p_c 
   inverseMask_c = Mask_c == 0
@@ -131,14 +127,13 @@ return(Frame[String] dataset, Frame[String] singleSet)
   dXMask = matrix(0, 1, ncol(dX))
   index = 1
   for(k in 1:col) {
-    if(as.scalar(dcDistincts[1,k]) != 0) {
-      for(l in 1:as.scalar(dcDistincts[1,k])){
-        dXMask[1,index] = 1
-        index = index +1
-      }
+    nDistk = as.scalar(dcDistincts[1,k]);
+    if(nDistk != 0) {
+      dXMask[1,index:(index+nDistk-1)] = matrix(1,1,nDistk)
+      index += nDistk;
     }
     else
-      index = index +1
+      index += 1
   }
   
   #multiple imputations
@@ -149,7 +144,6 @@ return(Frame[String] dataset, Frame[String] singleSet)
     in_n = 1; in_c = 1; i=1; j=1; # varibales for index selection
     while(i <= ncol(dX))
     {
-      
       if(as.scalar(dXMask[1,i]) == 0)
       {
         # construct column selector
@@ -175,7 +169,7 @@ return(Frame[String] dataset, Frame[String] singleSet)
       }
      
       if((as.scalar(dXMask[1,i]) == 1) & (sum(Mask_c[, in_c]) != 0))
-      {  
+      {
         j = (i + as.scalar(dist[1,in_c])) - 1
 
         # construct column selector
@@ -194,8 +188,8 @@ return(Frame[String] dataset, Frame[String] singleSet)
         test_X =  removeEmpty(target = slice2, margin = "cols", select = selX);
         test_Y = slice2a[,in_c]
        
-        # train clasification model  
-        beta = multiLogReg(X=train_X,  Y=train_Y, icpt = 1, tol = 0.00000001, reg = 0.001, maxi = 100, maxii=0, verbose=FALSE)
+        # train clasification model
+        beta = multiLogReg(X=train_X, Y=train_Y, icpt = 1, tol = 0.00000001, reg = 0.001, maxi = 100, maxii=0, verbose=FALSE)
         # predicting missing values 
         [prob,pred,acc] = multiLogRegPredict(X=test_X, B=beta, Y = test_Y)
         # imputing missing column values (assumes Mask_Filled being 0/1-matrix)
@@ -209,7 +203,7 @@ return(Frame[String] dataset, Frame[String] singleSet)
       in_c = in_c + 1
       i = i+1;
     }
-      
+
     nM = ((Mask_Filled_n) %*% p_n) + Mask_Filled_c
     Result = rbind(Result, nM+XO)
     Mask_Result = rbind(Mask_Result, nM)
@@ -266,7 +260,7 @@ return (Matrix[Double] agg)
         if(sum(u1 != s ) == 0)
           uCount = uCount + 1
         if(sum(v1 != s) == 0)
-          vCount = vCount + 1   
+          vCount = vCount + 1
       }
       # copy the results of u in v
       if(uCount > vCount)

[systemml] branch master updated: [MINOR] Script-level improvements mice builtin function

Reply via email to