This is an automated email from the ASF dual-hosted git repository.

ssiddiqi pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/main by this push:
     new 181d18d55a [MINOR] Minor fixes in cleaning scripts
181d18d55a is described below

commit 181d18d55a967d33df9a09edf0beb3bf17d3b6b2
Author: Shafaq Siddiqi <[email protected]>
AuthorDate: Wed Aug 24 13:28:52 2022 +0200

    [MINOR] Minor fixes in cleaning scripts
---
 scripts/builtin/executePipeline.dml            |  4 ++--
 scripts/builtin/tomeklink.dml                  |  4 +++-
 scripts/pipelines/scripts/enumerateLogical.dml |  3 +--
 scripts/pipelines/scripts/utils.dml            | 15 ++++++++-------
 4 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/scripts/builtin/executePipeline.dml b/scripts/builtin/executePipeline.dml
index 06e9132cba..cdde19e2ef 100644
--- a/scripts/builtin/executePipeline.dml
+++ b/scripts/builtin/executePipeline.dml
@@ -376,7 +376,7 @@ return (Matrix[Double] X, Matrix[Double] Y)
     minClass = min(classes)
     maxClass = max(classes)
     diff = (maxClass - minClass)/sum(classes)
-    if(diff > 0.2)
+    if(diff > 0.2 & max(Y) <=2)
     {
       XY = order(target = cbind(Y, X),  by = 1, decreasing=FALSE, 
index.return=FALSE)
       synthesized = matrix(0,0,0) # initialize variable
@@ -449,7 +449,7 @@ flipLabels = function(Matrix[Double] X, Matrix[Double] Y, Double threshold, Inte
 return (Matrix[Double] X, Matrix[Double] Y)
 {
   classes1 = table(Y, 1)
-  if(min(Y) != max(Y) & nrow(Y) > 1)
+  if(min(Y) != max(Y) & nrow(Y) > 1 & max(Y) <= 2)
   {
     betas = multiLogReg(X=X, Y=Y, icpt=1, reg=1e-4, maxi=100, maxii=0, 
verbose=FALSE)
     [prob, yhat, accuracy] = multiLogRegPredict(X, betas, Y, FALSE)
diff --git a/scripts/builtin/tomeklink.dml b/scripts/builtin/tomeklink.dml
index e9853caa32..9bb72f007d 100644
--- a/scripts/builtin/tomeklink.dml
+++ b/scripts/builtin/tomeklink.dml
@@ -48,9 +48,10 @@ return (Matrix[Double] X_under, Matrix[Double] y_under, Matrix[Double] drop_idx)
   majority_label = as.scalar(rowIndexMax(t(label)))
 
   tomek_links = get_links(X, y, majority_label)
-  drop_idx = tomek_links * seq(1, nrow(X)) 
+
   if(sum(tomek_links == 0) > 0)
   {
+    drop_idx = tomek_links * seq(1, nrow(X)) 
     X_under = removeEmpty(target=X, margin="rows", select = (tomek_links == 0))
     y_under = removeEmpty(target=y, margin="rows", select = (tomek_links == 0))
     drop_idx = removeEmpty(target=drop_idx, margin="rows", select = 
tomek_links)
@@ -80,6 +81,7 @@ return (Matrix[Double] nn) {
 # find the tomek links
 get_links = function(Matrix[Double] X, Matrix[Double] y, double majority_label)
 return (Matrix[Double] tomek_links) {
+  tomek_links = matrix(-1, 1, 1)
   nn = get_nn(X)
   perm = table(seq(1, nrow(y)), nn, nrow(y), nrow(y))
   nn_labels = perm %*% y
diff --git a/scripts/pipelines/scripts/enumerateLogical.dml b/scripts/pipelines/scripts/enumerateLogical.dml
index 17a3ead2f0..19de3a6fe4 100644
--- a/scripts/pipelines/scripts/enumerateLogical.dml
+++ b/scripts/pipelines/scripts/enumerateLogical.dml
@@ -281,8 +281,7 @@ getOps = function( Frame[string] allOps, Frame[String] refSol, Integer dist, Int
   else {
     allOps = map(allOps, "x -> (!x.equals(\"dummycoding\") & 
!x.equals(\"frequencyEncode\") & !x.equals(\"tomeklink\")
       & !x.equals(\"dbscan\") & !x.equals(\"WoE\") & !x.equals(\"pca\") & 
!x.equals(\"ppca\") &
-      !x.equals(\"abstain\") & !x.equals(\"underSampling\") & 
!x.equals(\"flipLabels\") & !x.equals(\"SMOTE\"))?x:\"0\"")
-    # & !x.equals(\"mice\") & !x.equals(\"dbscan\")
+      !x.equals(\"abstain\") & !x.equals(\"underSampling\") & 
!x.equals(\"flipLabels\") & !x.equals(\"mice\") & 
!x.equals(\"SMOTE\"))?x:\"0\"") 
     ref = frame(["imputeByMean", "winsorize", "scale"], rows=1, cols=3)
   }
   if(as.scalar(refSol[1,1]) == "NaN")
diff --git a/scripts/pipelines/scripts/utils.dml b/scripts/pipelines/scripts/utils.dml
index 3f9378c9d2..e9d5488511 100644
--- a/scripts/pipelines/scripts/utils.dml
+++ b/scripts/pipelines/scripts/utils.dml
@@ -77,8 +77,9 @@ doSample = function(Matrix[Double] eX, Matrix[Double] eY, Double ratio, Boolean
       sampledX = P %*% eX
       sampledY = eY 
     }
+    print("sampled rows "+nrow(sampledY)+" out of "+nrow(eY))
   }
-  print("sampled rows "+nrow(sampledY)+" out of "+nrow(eY))
+
 }
 
 # #######################################################################
@@ -139,16 +140,16 @@ return(Frame[Unknown] data, List[Unknown] distanceMatrix, List[Unknown] dictiona
   print(prefix+" convert strings to lower case");
   data = map(data, "x -> x.toLowerCase()")
   # step 2 fix invalid lengths
-  q0 = 0.05
-  q1 = 0.95
-  print(prefix+" fixing invalid lengths between "+q0+" and "+q1+" quantile");
+  # q0 = 0.05
+  # q1 = 0.95
+  # print(prefix+" fixing invalid lengths between "+q0+" and "+q1+" quantile");
 
-  [data, mask, qlow, qup] = fixInvalidLengths(data, mask, q0, q1)
+  # [data, mask, qlow, qup] = fixInvalidLengths(data, mask, q0, q1)
 
   
     # # step 3 fix swap values
-  print(prefix+" value swap fixing");
-  data = valueSwap(data, schema)
+  # print(prefix+" value swap fixing");
+  # data = valueSwap(data, schema)
   
   # step 3 drop invalid types
   print(prefix+" drop values with type mismatch");

Reply via email to