This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new 41c21bf9ce [SYSTEMDS-3822] Fix incorrect sampling in top-k cleaning
pipelines
41c21bf9ce is described below
commit 41c21bf9ce225e05edee4dc616c330d2526e6533
Author: Matthias Boehm <[email protected]>
AuthorDate: Wed Jan 29 10:51:05 2025 +0100
[SYSTEMDS-3822] Fix incorrect sampling in top-k cleaning pipelines
This patch fixes a bug in top-k cleaning pipeline enumeration, where
for datasets with more than 200K rows the sampling ratio was ignored
and always set to 0.6, which means we actually ran with larger data
than expected if people wanted to sample very large datasets.
---
scripts/pipelines/scripts/utils.dml | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/scripts/pipelines/scripts/utils.dml
b/scripts/pipelines/scripts/utils.dml
index 45688db883..ff98d13910 100644
--- a/scripts/pipelines/scripts/utils.dml
+++ b/scripts/pipelines/scripts/utils.dml
@@ -62,9 +62,8 @@ doSample = function(Matrix[Double] eX, Matrix[Double] eY,
Double ratio, Matrix[D
MIN_SAMPLE = 1000
sampledX = eX
sampledY = eY
- ratio = ifelse(nrow(eY) > 200000, 0.6, ratio)
sampled = floor(nrow(eX) * ratio)
-
+
if(sampled > MIN_SAMPLE & ratio != 1.0)
{
sampleVec = sample(nrow(eX), sampled, FALSE, 23)
@@ -76,7 +75,7 @@ doSample = function(Matrix[Double] eX, Matrix[Double] eY,
Double ratio, Matrix[D
}
else if(nrow(eY) == 1) { # for clustering
sampledX = P %*% eX
- sampledY = eY
+ sampledY = eY
}
print("sampled rows "+nrow(sampledY)+" out of "+nrow(eY))
}
@@ -271,4 +270,4 @@ return(Frame[Unknown] data)
# data[, idx] = map(data[, idx], "x -> UtilFunctions.getTimestamp(x)",
margin=2)
# }
# }
-}
\ No newline at end of file
+}