This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git
The following commit(s) were added to refs/heads/master by this push: new f450ead [MINOR] Script-level improvements mice builtin function f450ead is described below commit f450ead5506d1615b5979bee85b39891e0f0fc00 Author: Matthias Boehm <mboe...@gmail.com> AuthorDate: Sat Apr 25 19:40:58 2020 +0200 [MINOR] Script-level improvements mice builtin function * Loop vectorization of scalar assignment * Removed unnecessary branch for table padding * Minor modifications of rmEmpty use to increase common subexpression elimination --- scripts/builtin/mice.dml | 44 +++++++++++++++++++------------------------- 1 file changed, 19 insertions(+), 25 deletions(-) diff --git a/scripts/builtin/mice.dml b/scripts/builtin/mice.dml index 3f3c325..99d2be2 100644 --- a/scripts/builtin/mice.dml +++ b/scripts/builtin/mice.dml @@ -56,12 +56,12 @@ return(Frame[String] dataset, Frame[String] singleSet) col = ncol(F) Result = matrix(0, rows=1, cols = col) Mask_Result = matrix(0, rows=1, cols=col) - cat = t(cMask) * seq(1, ncol(cMask)) - cat = removeEmpty(target = cat, margin = "rows") + scat = seq(1, ncol(cMask)) + cat = removeEmpty(target=scat, margin="rows", select=t(cMask)) s="" for(i in 1: nrow(cat), check =0) - s = s+as.integer(as.scalar(cat[i, 1]))+","; - + s = s+as.integer(as.scalar(cat[i, 1]))+","; + # encoding categorical columns using recode transformation jspecR = "{ids:true, recode:["+s+"]}"; @@ -70,7 +70,7 @@ return(Frame[String] dataset, Frame[String] singleSet) XO = replace(target=X, pattern=NaN, replacement=0); # remove categorical features and impute continous features with mean - eX_n = removeEmpty(target=X, margin="cols", select=(1-cMask)) + eX_n = removeEmpty(target=X, margin="cols", select=(cMask==0)) col_n = ncol(eX_n); # storing the mask/address of missing values Mask_n = is.na(eX_n); @@ -80,7 +80,7 @@ return(Frame[String] dataset, Frame[String] singleSet) # filling the missing data with their means X2_n = eX_n+(Mask_n*colMeans(eX_n)) # matrices for computing actul data - p_n = table( (seq(1, ncol(eX_n))) , (removeEmpty(target = t(cMask==0)*seq(1, ncol(cMask)), margin ="rows")) , 1 ) + p_n = table(seq(1, ncol(eX_n)), removeEmpty(target=scat, margin="rows", select=t(cMask==0))) if(ncol(p_n) < ncol(cMask)) p_n = cbind(p_n, matrix(0, nrow(p_n), ncol(cMask)-ncol(p_n))) q = XO * cMask @@ -91,8 +91,7 @@ return(Frame[String] dataset, Frame[String] singleSet) eX_c2 = removeEmpty(target = eX_c, margin = "rows", select = (rowSums(eX_c != 0)==col_c)) colMod = matrix(0, 1, ncol(eX_c)) # compute columnwise mode - parfor(i in 1: col_c) - { + parfor(i in 1: col_c) { f = eX_c2[, i] # adding one in data for dealing with zero category cat_counts = table(f, 1, n, 1); # counts for each category mode = as.scalar(rowIndexMax(t(cat_counts))); @@ -100,13 +99,10 @@ return(Frame[String] dataset, Frame[String] singleSet) } # find the mask of missing values - tmpMask_c = (eX_c == 0); - tmpMask_c = (tmpMask_c * colMod) # fill missing values with mode + tmpMask_c = (eX_c==0) * colMod # fill missing values with mode # Generate a matrix of actual length - p_c = table((seq(1, ncol(tmpMask_c))) , (removeEmpty(target = t(cMask)*seq(1, ncol(cMask)), margin ="rows")), 1) - if(ncol(p_c) < ncol(cMask)) - p_c = cbind(p_c, matrix(0, nrow(p_c), ncol(cMask)-ncol(p_c))) + p_c = table(seq(1, ncol(tmpMask_c)), removeEmpty(target=scat, margin ="rows", select=t(cMask)), ncol(tmpMask_c), ncol(cMask)) Mask_c = tmpMask_c %*% p_c inverseMask_c = Mask_c == 0 @@ -131,14 +127,13 @@ return(Frame[String] dataset, Frame[String] singleSet) dXMask = matrix(0, 1, ncol(dX)) index = 1 for(k in 1:col) { - if(as.scalar(dcDistincts[1,k]) != 0) { - for(l in 1:as.scalar(dcDistincts[1,k])){ - dXMask[1,index] = 1 - index = index +1 - } + nDistk = as.scalar(dcDistincts[1,k]); + if(nDistk != 0) { + dXMask[1,index:(index+nDistk-1)] = matrix(1,1,nDistk) + index += nDistk; } else - index = index +1 + index += 1 } #multiple imputations @@ -149,7 +144,6 @@ return(Frame[String] dataset, Frame[String] singleSet) in_n = 1; in_c = 1; i=1; j=1; # varibales for index selection while(i <= ncol(dX)) { - if(as.scalar(dXMask[1,i]) == 0) { # construct column selector @@ -175,7 +169,7 @@ return(Frame[String] dataset, Frame[String] singleSet) } if((as.scalar(dXMask[1,i]) == 1) & (sum(Mask_c[, in_c]) != 0)) - { + { j = (i + as.scalar(dist[1,in_c])) - 1 # construct column selector @@ -194,8 +188,8 @@ return(Frame[String] dataset, Frame[String] singleSet) test_X = removeEmpty(target = slice2, margin = "cols", select = selX); test_Y = slice2a[,in_c] - # train clasification model - beta = multiLogReg(X=train_X, Y=train_Y, icpt = 1, tol = 0.00000001, reg = 0.001, maxi = 100, maxii=0, verbose=FALSE) + # train clasification model + beta = multiLogReg(X=train_X, Y=train_Y, icpt = 1, tol = 0.00000001, reg = 0.001, maxi = 100, maxii=0, verbose=FALSE) # predicting missing values [prob,pred,acc] = multiLogRegPredict(X=test_X, B=beta, Y = test_Y) # imputing missing column values (assumes Mask_Filled being 0/1-matrix) @@ -209,7 +203,7 @@ return(Frame[String] dataset, Frame[String] singleSet) in_c = in_c + 1 i = i+1; } - + nM = ((Mask_Filled_n) %*% p_n) + Mask_Filled_c Result = rbind(Result, nM+XO) Mask_Result = rbind(Mask_Result, nM) @@ -266,7 +260,7 @@ return (Matrix[Double] agg) if(sum(u1 != s ) == 0) uCount = uCount + 1 if(sum(v1 != s) == 0) - vCount = vCount + 1 + vCount = vCount + 1 } # copy the results of u in v if(uCount > vCount)