This is an automated email from the ASF dual-hosted git repository.
ssiddiqi pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new 181d18d55a [MINOR] Minors fixes in cleaning scripts
181d18d55a is described below
commit 181d18d55a967d33df9a09edf0beb3bf17d3b6b2
Author: Shafaq Siddiqi <[email protected]>
AuthorDate: Wed Aug 24 13:28:52 2022 +0200
[MINOR] Minors fixes in cleaning scripts
---
scripts/builtin/executePipeline.dml | 4 ++--
scripts/builtin/tomeklink.dml | 4 +++-
scripts/pipelines/scripts/enumerateLogical.dml | 3 +--
scripts/pipelines/scripts/utils.dml | 15 ++++++++-------
4 files changed, 14 insertions(+), 12 deletions(-)
diff --git a/scripts/builtin/executePipeline.dml b/scripts/builtin/executePipeline.dml
index 06e9132cba..cdde19e2ef 100644
--- a/scripts/builtin/executePipeline.dml
+++ b/scripts/builtin/executePipeline.dml
@@ -376,7 +376,7 @@ return (Matrix[Double] X, Matrix[Double] Y)
minClass = min(classes)
maxClass = max(classes)
diff = (maxClass - minClass)/sum(classes)
- if(diff > 0.2)
+ if(diff > 0.2 & max(Y) <=2)
{
XY = order(target = cbind(Y, X), by = 1, decreasing=FALSE,
index.return=FALSE)
synthesized = matrix(0,0,0) # initialize variable
@@ -449,7 +449,7 @@ flipLabels = function(Matrix[Double] X, Matrix[Double] Y, Double threshold, Inte
return (Matrix[Double] X, Matrix[Double] Y)
{
classes1 = table(Y, 1)
- if(min(Y) != max(Y) & nrow(Y) > 1)
+ if(min(Y) != max(Y) & nrow(Y) > 1 & max(Y) <= 2)
{
betas = multiLogReg(X=X, Y=Y, icpt=1, reg=1e-4, maxi=100, maxii=0,
verbose=FALSE)
[prob, yhat, accuracy] = multiLogRegPredict(X, betas, Y, FALSE)
diff --git a/scripts/builtin/tomeklink.dml b/scripts/builtin/tomeklink.dml
index e9853caa32..9bb72f007d 100644
--- a/scripts/builtin/tomeklink.dml
+++ b/scripts/builtin/tomeklink.dml
@@ -48,9 +48,10 @@ return (Matrix[Double] X_under, Matrix[Double] y_under, Matrix[Double] drop_idx)
majority_label = as.scalar(rowIndexMax(t(label)))
tomek_links = get_links(X, y, majority_label)
- drop_idx = tomek_links * seq(1, nrow(X))
+
if(sum(tomek_links == 0) > 0)
{
+ drop_idx = tomek_links * seq(1, nrow(X))
X_under = removeEmpty(target=X, margin="rows", select = (tomek_links == 0))
y_under = removeEmpty(target=y, margin="rows", select = (tomek_links == 0))
drop_idx = removeEmpty(target=drop_idx, margin="rows", select =
tomek_links)
@@ -80,6 +81,7 @@ return (Matrix[Double] nn) {
# find the tomek links
get_links = function(Matrix[Double] X, Matrix[Double] y, double majority_label)
return (Matrix[Double] tomek_links) {
+ tomek_links = matrix(-1, 1, 1)
nn = get_nn(X)
perm = table(seq(1, nrow(y)), nn, nrow(y), nrow(y))
nn_labels = perm %*% y
diff --git a/scripts/pipelines/scripts/enumerateLogical.dml b/scripts/pipelines/scripts/enumerateLogical.dml
index 17a3ead2f0..19de3a6fe4 100644
--- a/scripts/pipelines/scripts/enumerateLogical.dml
+++ b/scripts/pipelines/scripts/enumerateLogical.dml
@@ -281,8 +281,7 @@ getOps = function( Frame[string] allOps, Frame[String] refSol, Integer dist, Int
else {
allOps = map(allOps, "x -> (!x.equals(\"dummycoding\") &
!x.equals(\"frequencyEncode\") & !x.equals(\"tomeklink\")
& !x.equals(\"dbscan\") & !x.equals(\"WoE\") & !x.equals(\"pca\") &
!x.equals(\"ppca\") &
- !x.equals(\"abstain\") & !x.equals(\"underSampling\") &
!x.equals(\"flipLabels\") & !x.equals(\"SMOTE\"))?x:\"0\"")
- # & !x.equals(\"mice\") & !x.equals(\"dbscan\")
+ !x.equals(\"abstain\") & !x.equals(\"underSampling\") &
!x.equals(\"flipLabels\") & !x.equals(\"mice\") &
!x.equals(\"SMOTE\"))?x:\"0\"")
ref = frame(["imputeByMean", "winsorize", "scale"], rows=1, cols=3)
}
if(as.scalar(refSol[1,1]) == "NaN")
diff --git a/scripts/pipelines/scripts/utils.dml b/scripts/pipelines/scripts/utils.dml
index 3f9378c9d2..e9d5488511 100644
--- a/scripts/pipelines/scripts/utils.dml
+++ b/scripts/pipelines/scripts/utils.dml
@@ -77,8 +77,9 @@ doSample = function(Matrix[Double] eX, Matrix[Double] eY, Double ratio, Boolean
sampledX = P %*% eX
sampledY = eY
}
+ print("sampled rows "+nrow(sampledY)+" out of "+nrow(eY))
}
- print("sampled rows "+nrow(sampledY)+" out of "+nrow(eY))
+
}
# #######################################################################
@@ -139,16 +140,16 @@ return(Frame[Unknown] data, List[Unknown] distanceMatrix, List[Unknown] dictiona
print(prefix+" convert strings to lower case");
data = map(data, "x -> x.toLowerCase()")
# step 2 fix invalid lengths
- q0 = 0.05
- q1 = 0.95
- print(prefix+" fixing invalid lengths between "+q0+" and "+q1+" quantile");
+ # q0 = 0.05
+ # q1 = 0.95
+ # print(prefix+" fixing invalid lengths between "+q0+" and "+q1+" quantile");
- [data, mask, qlow, qup] = fixInvalidLengths(data, mask, q0, q1)
+ # [data, mask, qlow, qup] = fixInvalidLengths(data, mask, q0, q1)
# # step 3 fix swap values
- print(prefix+" value swap fixing");
- data = valueSwap(data, schema)
+ # print(prefix+" value swap fixing");
+ # data = valueSwap(data, schema)
# step 3 drop invalid types
print(prefix+" drop values with type mismatch");