This is an automated email from the ASF dual-hosted git repository.
ssiddiqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/master by this push:
new 88f1063 [MINOR] Passing quantiles as function parameters in winsorize
builtin. This commit also removes the parfor from logical pipelines'
enumerator to stabilize the results.
88f1063 is described below
commit 88f1063f0197d7197f90ec1cb1113bfc8173b12b
Author: Shafaq Siddiqi <[email protected]>
AuthorDate: Wed Sep 8 16:29:35 2021 +0200
[MINOR] Passing quantiles as function parameters in winsorize builtin
This commit also removes the parfor from logical pipelines' enumerator to
stabilize the results
Closes #1387.
---
scripts/builtin/winsorize.dml | 17 +++++++++--------
scripts/pipelines/properties/param.csv | 2 +-
scripts/pipelines/scripts/enumerateLogical.dml | 2 +-
src/test/scripts/functions/builtin/multipleBuiltins.dml | 2 +-
src/test/scripts/functions/builtin/winsorize.dml | 2 +-
src/test/scripts/functions/builtin/winsorizeFoo.dml | 2 +-
.../scripts/functions/misc/FunPotpourriMultiEval.dml | 2 +-
src/test/scripts/functions/misc/Functions15b.dml | 2 +-
8 files changed, 16 insertions(+), 15 deletions(-)
diff --git a/scripts/builtin/winsorize.dml b/scripts/builtin/winsorize.dml
index 5e7eb61..614630d 100644
--- a/scripts/builtin/winsorize.dml
+++ b/scripts/builtin/winsorize.dml
@@ -19,18 +19,19 @@
#
#-------------------------------------------------------------
-m_winsorize = function(Matrix[Double] X, Boolean verbose) return (Matrix[Double] Y) {
+m_winsorize = function(Matrix[Double] X, Double ql = 0.05, Double qu = 0.95, Boolean verbose)
+return (Matrix[Double] Y) {
+
Y = matrix(0, nrow(X), ncol(X))
- parfor(i in 1:ncol(X))
- Y[,i] = fixOutliersWinsorize(X[,i])
+ parfor(i in 1:ncol(X), check=0) {
+ q1 = quantile(X[,i], ql)
+ q2 = quantile(X[,i], qu)
+ Y[, i] = fixOutliersWinsorize(X[,i], q1, q2)
+ }
}
-fixOutliersWinsorize = function(Matrix[Double] X) return (Matrix[Double] Y)
+fixOutliersWinsorize = function(Matrix[Double] X, Double ql, Double qu) return (Matrix[Double] Y)
{
- # compute quantiles for lower and upper probs
- q = quantile(X, matrix("0.05 0.95", rows=2, cols=1));
- ql = as.scalar(q[1,1]);
- qu = as.scalar(q[2,1]);
# replace values outside [ql,qu] w/ ql and qu respectively
Y = ifelse(X < ql, ql, X);
Y = ifelse(Y > qu, qu, Y);
diff --git a/scripts/pipelines/properties/param.csv b/scripts/pipelines/properties/param.csv
index fc454c4..e2068b1 100644
--- a/scripts/pipelines/properties/param.csv
+++ b/scripts/pipelines/properties/param.csv
@@ -1,7 +1,7 @@
name,param_no,maskFlag,FDFlag,yFlag,verboseFlag,dataFlag,dt1,dt2,dt3,dt4,st1,en1,st2,en2,st3,en3,st4,en4
outlierByIQR,3,0,0,0,1,0,FP,INT,INT,1,7,2,2,1,1,,,
outlierBySd,3,0,0,0,1,0,INT,INT,INT,1,7,1,2,2,1,,,
-winsorize,0,0,0,0,1,0,,,,,,,,,,,,
+winsorize,2,0,0,0,1,0,FP,FP,0.01,0.05,0.95,1,,,,,,
normalize,0,0,0,0,0,0,,,,,,,,,,,,
imputeByMean,0,1,0,0,0,2,,,,,,,,,,,,
imputeByMedian,0,1,0,0,0,2,,,,,,,,,,,,
diff --git a/scripts/pipelines/scripts/enumerateLogical.dml b/scripts/pipelines/scripts/enumerateLogical.dml
index 29ac78c..977c0a2 100644
--- a/scripts/pipelines/scripts/enumerateLogical.dml
+++ b/scripts/pipelines/scripts/enumerateLogical.dml
@@ -85,7 +85,7 @@ return (Frame[Unknown] bestLg, Double pre_best)
# # # execute the physical pipelines
scores = matrix(0, nrow(physicalPipList), 1)
# TODO better parfor-dep handling of multi-assignments to avoid check=0
- parfor(i in 1:length(physicalPipList), check=0) {
+ for(i in 1:length(physicalPipList), check=0) {
lp2 = as.frame(logicalPipList[((i-1)%/%num_inst)+1,])
pp2 = as.frame(physicalPipList[i,])
# # append configuration keys for extracting the pipeline later on
diff --git a/src/test/scripts/functions/builtin/multipleBuiltins.dml b/src/test/scripts/functions/builtin/multipleBuiltins.dml
index 7a8315b..a771f59 100644
--- a/src/test/scripts/functions/builtin/multipleBuiltins.dml
+++ b/src/test/scripts/functions/builtin/multipleBuiltins.dml
@@ -20,6 +20,6 @@
#-------------------------------------------------------------
X = read($1);
-Y = winsorize(X, FALSE);
+Y = winsorize(X=X, verbose=FALSE);
Z = outlier(Y, FALSE);
write(Z, $2);
diff --git a/src/test/scripts/functions/builtin/winsorize.dml b/src/test/scripts/functions/builtin/winsorize.dml
index eeba09d..00725e4 100644
--- a/src/test/scripts/functions/builtin/winsorize.dml
+++ b/src/test/scripts/functions/builtin/winsorize.dml
@@ -20,5 +20,5 @@
#-------------------------------------------------------------
X = read($1);
-Y = winsorize(X, FALSE);
+Y = winsorize(X=X, ql=0.05, qu= 0.95, verbose=FALSE);
write(Y, $2)
diff --git a/src/test/scripts/functions/builtin/winsorizeFoo.dml b/src/test/scripts/functions/builtin/winsorizeFoo.dml
index 78472bd..9c1c53b 100644
--- a/src/test/scripts/functions/builtin/winsorizeFoo.dml
+++ b/src/test/scripts/functions/builtin/winsorizeFoo.dml
@@ -25,5 +25,5 @@ foo = function(Matrix[Double] X, Boolean verbose)
while(FALSE){} #no inlining
if( verbose )
print( min(X)+" "+max(X) )
- R = winsorize(X, verbose);
+ R = winsorize(X=X, verbose=verbose);
}
diff --git a/src/test/scripts/functions/misc/FunPotpourriMultiEval.dml b/src/test/scripts/functions/misc/FunPotpourriMultiEval.dml
index 72b7b7b..3d5fe37 100644
--- a/src/test/scripts/functions/misc/FunPotpourriMultiEval.dml
+++ b/src/test/scripts/functions/misc/FunPotpourriMultiEval.dml
@@ -23,7 +23,7 @@ X = rand(rows=10, cols= 10)
t1 = interQuartileMean(X[,7]);
for(i in 1:5)
- X = eval("winsorize", list(X, FALSE))
+ X = eval("winsorize", list(X=X, ql = 0.05, qu=0.95, verbose=FALSE))
t2 = interQuartileMean(X[,7]);
print("expected=TRUE, actual="+(t2 < t1))
diff --git a/src/test/scripts/functions/misc/Functions15b.dml b/src/test/scripts/functions/misc/Functions15b.dml
index 6834f49..074c7e4 100644
--- a/src/test/scripts/functions/misc/Functions15b.dml
+++ b/src/test/scripts/functions/misc/Functions15b.dml
@@ -22,5 +22,5 @@
foo = function(Matrix[Double] X)
return (Matrix[Double] Y)
{
- Y = winsorize(X, FALSE)
+ Y = winsorize(X=X, verbose=FALSE)
}