This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new e2fafd6 [SYSTEMDS-3302] Fix imputeByFD built-in (correctness and
performance)
e2fafd6 is described below
commit e2fafd615f4bbad603eaf58af1c1dfd351c5daa4
Author: Matthias Boehm <[email protected]>
AuthorDate: Sun Mar 6 16:05:42 2022 +0100
[SYSTEMDS-3302] Fix imputeByFD built-in (correctness and performance)
This patch fixes correctness and performance issues of the builtin
function imputeByFD. First, in the old version, if no target value
qualified for imputation by a robust functional dependency, the last
target value index was still incorrectly used for imputation. Second, the
check for qualifying robust functional dependency values
unnecessarily created large intermediates. We now perform these operations
on small vectors.
In a scenario that runs the following script on a recoded version of the
NashvilleTrafficAccidents train dataset, the end-to-end runtime
improved from 59s to 17s (11s for imputeByFD).
for(i in 1:(ncol(X)-1))
for(j in (i+1):ncol(X)) {
print("Impute FD for "+i+"-"+j);
R = imputeByFD(X=X[,i], Y=X[,j], threshold=0.9);
}
---
scripts/builtin/imputeByFD.dml | 16 +++++++++-------
.../functions/builtin/part1/BuiltinImputeFDTest.java | 3 +++
2 files changed, 12 insertions(+), 7 deletions(-)
diff --git a/scripts/builtin/imputeByFD.dml b/scripts/builtin/imputeByFD.dml
index 526a0fb..da553e1 100644
--- a/scripts/builtin/imputeByFD.dml
+++ b/scripts/builtin/imputeByFD.dml
@@ -25,23 +25,24 @@
#
----------------------------------------------------------------------------------------------------------------------
# NAME TYPE DEFAULT MEANING
#
----------------------------------------------------------------------------------------------------------------------
-# X Matrix[Double] --- Matrix X
-# source Integer --- source attribute to use for
imputation and error correction
-# target Integer --- attribute to be fixed
+# X Matrix[Double] --- Vector X, source attribute of
functional dependency
+# Y Matrix[Double] --- Vector Y, target attribute of
functional dependency and imputation
# threshold Double --- threshold value in interval [0,
1] for robust FDs
+# verbose Boolean FALSE flag for printing verbose debug
output
#
----------------------------------------------------------------------------------------------------------------------
#
# OUTPUT:
#
----------------------------------------------------------------------------------------------------------------------
# NAME TYPE MEANING
#
----------------------------------------------------------------------------------------------------------------------
-# X Matrix[Double] Matrix with possible imputations
+# Y Matrix[Double] Vector Y, with missing values
mapped to a new max value
+# Y_imp Matrix[Double] Vector Y, with imputed missing
values
#
----------------------------------------------------------------------------------------------------------------------
m_imputeByFD = function(Matrix[Double] X, Matrix[Double] Y, Double threshold,
Boolean verbose = FALSE)
return(Matrix[Double] Y, Matrix[Double] Y_imp)
{
- # # validation checks
+ # validation checks
if( threshold < 0 | threshold > 1 )
stop("Stopping due to invalid input, threshold required in interval [0, 1]
found "+threshold)
@@ -77,9 +78,10 @@ imputeAndCorrect = function(Matrix[Double] X, Matrix[Double]
Y, Double threshold
if(sum(missing_mask_Y) > 0 & ncol(ctab) > 1)
ctab = ctab[,1:ncol(ctab)-1]
- ctab = ctab/(rowSums(ctab)) > threshold
+ # compute vector of qualifying max count per row (source value)
+ I = (rowMaxs(ctab)/rowSums(ctab)) > threshold
# Get the most frequent mapped value of Y
- filled = rowIndexMax(ctab) #(ctab == rowMaxs(ctab)) * t(seq(1, ncol(ctab)))
#
+ filled = rowIndexMax(ctab) * I
imputed_Y = imputeByFDApply(X, filled)
}
diff --git
a/src/test/java/org/apache/sysds/test/functions/builtin/part1/BuiltinImputeFDTest.java
b/src/test/java/org/apache/sysds/test/functions/builtin/part1/BuiltinImputeFDTest.java
index 9814696..9ec95d4 100644
---
a/src/test/java/org/apache/sysds/test/functions/builtin/part1/BuiltinImputeFDTest.java
+++
b/src/test/java/org/apache/sysds/test/functions/builtin/part1/BuiltinImputeFDTest.java
@@ -70,6 +70,9 @@ public class BuiltinImputeFDTest extends AutomatedTestBase {
public void test4() throws IOException {
runImpute_RFDTests(2,3, 0.4, 2, ExecType.SPARK);
}
+
+ //TODO negative tests
+
private void runImpute_RFDTests(int source, int target, double
threshold, int test, ExecType instType)
throws IOException
{