This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/main by this push:
     new 62a66b0  [MINOR] Performance and cleanup winsorizeApply built-in function
62a66b0 is described below

commit 62a66b09f7d8c5cd01bc2e9bd6f0d5f85071e305
Author: Matthias Boehm <[email protected]>
AuthorDate: Sun Mar 6 20:52:10 2022 +0100

    [MINOR] Performance and cleanup winsorizeApply built-in function
    
    On a full run of top-k cleaning pipeline enumeration on the EEG dataset,
    this patch improved the winsorizeApply function as follows (the
    different counts are due to different seeds):
    
    From HeavyHitter Statistics:
    ---
    old: 18  m_winsorizeApply                523.546      19455
    new: 31  m_winsorizeApply                170.390      12600
---
 scripts/builtin/winsorize.dml      |  7 ++-----
 scripts/builtin/winsorizeApply.dml | 35 +++++++++++++++++------------------
 2 files changed, 19 insertions(+), 23 deletions(-)

diff --git a/scripts/builtin/winsorize.dml b/scripts/builtin/winsorize.dml
index afb5a7a..eddb1da 100644
--- a/scripts/builtin/winsorize.dml
+++ b/scripts/builtin/winsorize.dml
@@ -43,13 +43,10 @@ m_winsorize = function(Matrix[Double] X, Double ql = 0.05, Double qu = 0.95, Boo
 return (Matrix[Double] Y, Matrix[Double] qLower, Matrix[Double] qUpper) {
   qLower = matrix(0, rows=1, cols=ncol(X))
   qUpper = matrix(0, rows=1, cols=ncol(X))
-  Y = matrix(0, nrow(X), ncol(X))
   Xtemp = replace(target=X, pattern=NaN, replacement=0)
   parfor(i in 1:ncol(X), check=0) {
-    q1 = quantile(Xtemp[,i], ql)
-    q2 = quantile(Xtemp[,i], qu)
-    qLower[1, i] = q1
-    qUpper[1, i] = q2
+    qLower[1,i] = quantile(Xtemp[,i], ql)
+    qUpper[1,i] = quantile(Xtemp[,i], qu)
   }
   Y = winsorizeApply(X, qLower, qUpper)
 }
diff --git a/scripts/builtin/winsorizeApply.dml b/scripts/builtin/winsorizeApply.dml
index b5e169e..968a0da 100644
--- a/scripts/builtin/winsorizeApply.dml
+++ b/scripts/builtin/winsorizeApply.dml
@@ -19,31 +19,30 @@
 #
 #-------------------------------------------------------------
 
-# The winsorizeAPply takes the upper and lower quantile values and remove outliers.
+# winsorizeApply takes the upper and lower quantile values per column, and
+# removes outliers by replacing them with these upper and lower bound values.
 #
 # INPUT PARAMETERS:
-# ----------------------------------------------------------------------------------------------------------------------
-# NAME      TYPE       DEFAULT       MEANING
-# ----------------------------------------------------------------------------------------------------------------------
-# X    Matrix[Double]   ---          Input feature matrix
-# ql   Double           ---          lower quantile
-# qu   Double           ---          upper quantile 
+# ------------------------------------------------------------------------------
+# NAME     TYPE             DEFAULT   MEANING
+# ------------------------------------------------------------------------------
+# X        Matrix[Double]   ---       Input feature matrix
+# qLower   Matrix[Double]   ---       row vector of lower bounds per column
+# qUpper   Matrix[Double]   ---       row vector of upper bounds per column
 #
-# 
----------------------------------------------------------------------------------------------------------------------
+# 
------------------------------------------------------------------------------
 #
 # OUTPUT:
-# ----------------------------------------------------------------------------------------------------------------------
-# NAME     TYPE                             MEANING
-# ----------------------------------------------------------------------------------------------------------------------
-# Y        Matrix[Double]                   Matrix without outlier values
-# ----------------------------------------------------------------------------------------------------------------------
+# ------------------------------------------------------------------------------
+# NAME     TYPE                       MEANING
+# ------------------------------------------------------------------------------
+# Y        Matrix[Double]             Matrix without outlier values
+# ------------------------------------------------------------------------------
 
 
-m_winsorizeApply = function(Matrix[Double] X,  Matrix[Double] qLower, Matrix[Double] qUpper) return (Matrix[Double] Y)
+m_winsorizeApply = function(Matrix[Double] X,  Matrix[Double] qLower, Matrix[Double] qUpper)
+  return (Matrix[Double] Y)
 {
   # replace values outside [ql,qu] w/ ql and qu respectively
-  t1 = (X < qLower)
-  Y = ifelse(t1, (X * (t1 == 0))  + ( t1 * qLower), X);
-  t2 = Y > qUpper
-  Y = ifelse(t2, (Y * (t2 == 0))+ (t2 * qUpper), Y);
+  Y = min(max(X, qLower), qUpper);
 }

Reply via email to