phaniarnab commented on a change in pull request #972:
URL: https://github.com/apache/systemml/pull/972#discussion_r439627882



##########
File path: scripts/builtin/mice.dml
##########
@@ -19,266 +19,176 @@
 #
 #-------------------------------------------------------------
 
-# Builtin function Implements Multiple Imputation using Chained Equations 
(MICE) for nominal data
+# Built-in function Implements Multiple Imputation using Chained Equations 
(MICE) 
 #
 # INPUT PARAMETERS:
 # 
---------------------------------------------------------------------------------------------
 # NAME            TYPE    DEFAULT     MEANING
 # 
---------------------------------------------------------------------------------------------
-# F               String    ---        Data Frame
-# cMask           Double    ---        A 0/1 row vector for identifying 
numeric (0) adn categorical features (1)
+# X               String    ---        Data Matrix (Recoded Matrix for 
categorical features)
+# cMask           Double    ---        A 0/1 row vector for identifying 
numeric (0) and categorical features (1)
 # iter            Integer    3         Number of iteration for multiple 
imputations 
-# complete        Integer    3         A complete dataset generated though a 
specific iteration
 # 
---------------------------------------------------------------------------------------------
  
 
 #Output(s)
 # 
---------------------------------------------------------------------------------------------
 # NAME                  TYPE    DEFAULT     MEANING
 # 
---------------------------------------------------------------------------------------------
-# dataset               Double   ---        imputed dataset
-# singleSet             Double   ---        A complete dataset generated 
though a specific iteration
-
-# Assumption missing value are represented with empty string i.e ",," in csv 
file  
-# variables with suffix n are storing continuous/numeric data and variables 
with suffix c are storing categorical data
-s_mice= function(Frame[String] F, Matrix[Double] cMask, Integer iter = 3, 
Integer complete = 3, Boolean verbose = FALSE)
-return(Frame[String] dataset, Frame[String] singleSet)
-{
-
-  if(ncol(F) == 1)
-    stop("invalid argument: can not apply mice on single column")
-    
-  if(complete > iter)
-    complete = iter
+# output               Double   ---        imputed dataset
 
 
-  # adding a temporary  feature (in-case all attributes are of same type)
-  F = cbind(F,  as.frame(matrix(1,nrow(F), 1)))
-  cMask = cbind(cMask, matrix(1,1,1))
+# Assumption missing value are represented with empty string i.e ",," in CSV 
file  
+# variables with suffix n are storing continuos/numeric data and variables 
with suffix c are storing categorical data
+m_mice= function(Matrix[Double] X, Matrix[Double] cMask, Integer iter = 3, 
Boolean verbose = FALSE)
+return(Matrix[Double] output)
+{
 
-  n = nrow(F)
-  row = n*complete;
-  col = ncol(F) 
-  Result = matrix(0, rows=1, cols = col)
-  Mask_Result = matrix(0, rows=1, cols=col)
-  scat = seq(1, ncol(cMask))
-  cat = removeEmpty(target=scat, margin="rows", select=t(cMask))
+  lastIndex = ncol(X)
+  if(sum(cMask) == 0) # if all features are numeric add a categorical features
+  {
+    X = cbind(X, matrix(1, nrow(X), 1))
+    cMask = cbind(cMask, matrix(1, 1, 1))
+  }
+  else if(sum(cMask) == ncol(cMask)) # if all features are categorical add a 
numeric features
+  {
+    X = cbind(X, matrix(1, nrow(X), 1))
+    cMask = cbind(cMask, matrix(0, 1, 1))
+  } 
 
-  if(nrow(cat) == ncol(F))
-    cMask[1,ncol(cMask)] = 0
-  
-  s=""
-  for(i in 1: nrow(cat), check =0)
-    s = s+as.integer(as.scalar(cat[i, 1]))+",";
-  
+  # separate categorical and continuous features
+  nX = removeEmpty(target=X, margin="cols", select=(cMask==0))
+  cX = removeEmpty(target=X, margin="cols", select= cMask)
   
-  # encoding categorical columns using recode transformation
-  jspecR = "{ids:true, recode:["+s+"]}";
-  [X, M] = transformencode(target=F, spec=jspecR);
-  
-  XO = replace(target=X, pattern=NaN, replacement=0);
-
-  # remove categorical features and impute continuous features with mean
-  eX_n = removeEmpty(target=X, margin="cols", select=(cMask==0))
-  col_n = ncol(eX_n);
-  # storing the mask/address of missing values
-  Mask_n = is.na(eX_n);
-  inverseMask_n = 1 - Mask_n;
-  # replacing the empty cells in encoded data with 0 
-  eX_n = replace(target=eX_n, pattern=NaN, replacement=0);
-  # filling the missing data with their means
-  X2_n = eX_n+(Mask_n*colMeans(eX_n))
-  # matrices for computing actul data
-  p_n = table(seq(1, ncol(eX_n)), removeEmpty(target=scat, margin="rows", 
select=t(cMask==0)))
-  if(ncol(p_n) < ncol(cMask))
-    p_n = cbind(p_n, matrix(0, nrow(p_n), ncol(cMask)-ncol(p_n)))
-  q = XO * cMask
+  # store the mask of numeric missing values 
+  Mask_n = is.na(nX);  
+  nX = replace(target=nX, pattern=NaN, replacement=0);
+  # initial mean imputation
+  X_n = nX+(Mask_n*colMeans(nX))
+    
+  # store the mask of categorical missing values 
+  Mask_c = is.na(cX);
+  cX = replace(target=cX, pattern=NaN, replacement=0);
+  colMode = colMode(cX)
+  # initial mode imputation
+  X_c = cX+(Mask_c*colMode)
   
-  # Taking out the categorical features for initial imputation by mode
-  eX_c = removeEmpty(target = q, margin = "cols")
-  col_c = ncol(eX_c);
-  eX_c2 = removeEmpty(target = eX_c, margin = "rows", select = (rowSums(eX_c 
!= 0)==col_c))
-  colMod = matrix(0, 1, ncol(eX_c))
-  # compute columnwise mode
-  parfor(i in 1: col_c) {
-    f = eX_c2[, i] # adding one in data for dealing with zero category
-    cat_counts = table(f, 1, n, 1);  # counts for each category
-    mode = as.scalar(rowIndexMax(t(cat_counts)));
-    colMod[1,i] = mode
-  }
+  # reconstruct original matrix using sparse matrices p and q 
+  p = table(seq(1, ncol(nX)), removeEmpty(target=seq(1, ncol(cMask)), 
margin="rows", select=t(cMask==0)), ncol(nX), ncol(X))
+  q = table(seq(1, ncol(cX)), removeEmpty(target=seq(1, ncol(cMask)), 
margin="rows", select=t(cMask)), ncol(cX), ncol(X))
+  X1 = (X_n %*% p) + (X_c %*% q)
+  Mask1 =  is.na(X)
   
-  # find the mask of missing values 
-  tmpMask_c = (eX_c==0) * colMod # fill missing values with mode
+  X = replace(target=X, pattern=NaN, replacement=0);
+  d = ncol(X1)
+  n = nrow(X1)
   
-  # Generate a matrix of actual length
-  p_c = table(seq(1, ncol(tmpMask_c)), removeEmpty(target=scat, margin 
="rows", select=t(cMask)), ncol(tmpMask_c), ncol(cMask))
-
-  Mask_c = tmpMask_c %*% p_c 
-  inverseMask_c = Mask_c == 0
-  r = X2_n %*% p_n
-  qr = q + r
-  X2_c = qr + Mask_c
-  Mask_c = Mask_c != 0
+  # compute index of categorical features
+  encodeIndex = removeEmpty(target=t(seq(1, ncol(X1))), margin="cols", 
select=cMask) 
 
+  s=""
+  if(ncol(encodeIndex) == 1)
+    s = as.integer(as.scalar(encodeIndex[1, 1]))
+  else{
+    for(i in 1: ncol(encodeIndex)-1)
+     s = s+as.integer(as.scalar(encodeIndex[1, i]))+",";
+     s = s+as.integer(as.scalar(encodeIndex[1, ncol(encodeIndex)]))
+    }
 
-  # one-hot encoding of categorical features
+  # specifications for one-hot encoding of categorical features
   jspecDC = "{ids:true, dummycode:["+s+"]}";
-  [dX, dM] = transformencode(target=as.frame(X2_c), spec=jspecDC);
-  
-  # recoding of metadata of OHE features to get the number of distinct elements
-  [metaTransform, metaTransformMeta] = transformencode(target=dM, spec=jspecR);
-  metaTransform = replace(target=metaTransform, pattern=NaN, replacement=0)
-  # counting distinct elements in each categorical feature
-  dcDistincts = colMaxs(metaTransform)
-  dist = dcDistincts + (1-cMask) 
-
-  # creating a mask matrix of OHE features
-  dXMask = matrix(0, 1, ncol(dX))
-  index = 1
-  for(k in 1:col) {
-    nDistk = as.scalar(dcDistincts[1,k]);
-    if(nDistk != 0) {
-      dXMask[1,index:(index+nDistk-1)] = matrix(1,1,nDistk)
-      index += nDistk;
-    }
-    else
-      index += 1
-  }
   
-  #multiple imputations
-  for(k in 1:iter)
+  for(k in 1:iter) # start iterative imputation
   {
-    Mask_Filled_n = Mask_n;
-    Mask_Filled_c = Mask_c
-    in_n = 1; in_c = 1; i=1; j=1; # variables for index selection
-    while(i <= ncol(dX))
+    Mask_Filled = Mask1
+    inverseMask = Mask1 == 0
+    # OHE of categorical features
+    [dX, dM] = transformencode(target=as.frame(X1), spec=jspecDC);

Review comment:
       Sorry for being naive, can we not move this transformencode outside, to 
have no frame operations?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to