This is an automated email from the ASF dual-hosted git repository.

baunsgaard pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/main by this push:
     new d443178a0f [SYSTEMDS-3153] Missing value imputation using KNN
d443178a0f is described below

commit d443178a0fd3d341189c8be96abe7bce42870dd2
Author: Christina Dionysio <[email protected]>
AuthorDate: Fri Jan 5 17:14:09 2024 +0100

    [SYSTEMDS-3153] Missing value imputation using KNN
    
    This commit adds a perf test case for missing value imputation using
    KNN. It is integrated into our perf suite.
    
    Closes #1943
---
 scripts/perftest/KnnMissingValueImputation.sh | 54 +++++++++++++++++++++++++++
 scripts/perftest/runAll.sh                    |  1 +
 scripts/perftest/scripts/ImputeByKNN.dml      | 52 ++++++++++++++++++++++++++
 3 files changed, 107 insertions(+)

diff --git a/scripts/perftest/KnnMissingValueImputation.sh b/scripts/perftest/KnnMissingValueImputation.sh
new file mode 100755
index 0000000000..aa7bf04be7
--- /dev/null
+++ b/scripts/perftest/KnnMissingValueImputation.sh
@@ -0,0 +1,54 @@
+#!/usr/bin/env bash
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+CMD=$1    # command used to launch the DML script
+MAXMEM=$2 # memory limit; sizes below are computed as bytes / 10^6 (presumably MB - confirm)
+
+echo "KNN MISSING VALUE IMPUTATION" >>results/times.txt
+mkdir -p logs
+LogName='logs/KnnMissingValueImputation.log'
+rm -f "$LogName"     # full log file
+rm -f "$LogName.log" # Reduced log file
+
+# Row counts to benchmark, stored as separate array elements instead of one split string.
+is=(1000 10000 100000 1000000 10000000)
+for i in "${is[@]}"; do
+  for method in "dist" "dist_missing" "dist_sample"; do
+    # Skip "dist" / "dist_missing" runs whose estimated size (bytes / 10^6) exceeds MAXMEM.
+    if [ "$((i * i * 8 / 10**6))" -gt "$MAXMEM" ] && [ "$method" = "dist" ]; then
+      continue
+    elif [ "$((i * 9 * i * 8 / 100 / 10**6))" -gt "$MAXMEM" ] && [ "$method" = "dist_missing" ]; then
+      continue
+    fi
+    # ${CMD} is intentionally unquoted so that a multi-word command still splits into words.
+    tstart=$(date +%s.%N)
+    ${CMD} -f ./scripts/ImputeByKNN.dml \
+    --config conf/SystemDS-config.xml \
+    --stats \
+    --nvargs "num_rows=$i" "method=$method" "max_mem=$MAXMEM" \
+    >>"$LogName" 2>&1
+    ttrain=$(echo "$(date +%s.%N) - $tstart - .4" | bc) # elapsed seconds minus a fixed 0.4 offset
+    echo "KNN Missing Value Imputation $i rows, $method method:" "$ttrain" >>results/times.txt
+  done
+done
+
+echo -e "\n\n" >>results/times.txt
\ No newline at end of file
diff --git a/scripts/perftest/runAll.sh b/scripts/perftest/runAll.sh
index 9b20606c1d..6d39043a74 100755
--- a/scripts/perftest/runAll.sh
+++ b/scripts/perftest/runAll.sh
@@ -126,6 +126,7 @@ echo -e "\n\n" >> results/times.txt
 ./runAllClustering.sh ${CMD} ${TEMPFOLDER} ${MAXMEM}
 ./runAllDimensionReduction.sh ${CMD} ${TEMPFOLDER} ${MAXMEM}
 ./runAllALS.sh ${CMD} ${TEMPFOLDER} ${MAXMEM}
+./KnnMissingValueImputation.sh ${CMD} ${MAXMEM}
 
 ### IO Benchmarks:
 ./runAllIO.sh ${CMD} ${TEMPFOLDER} ${MAXMEM}
diff --git a/scripts/perftest/scripts/ImputeByKNN.dml b/scripts/perftest/scripts/ImputeByKNN.dml
new file mode 100755
index 0000000000..0ec2ef6af8
--- /dev/null
+++ b/scripts/perftest/scripts/ImputeByKNN.dml
@@ -0,0 +1,52 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+max_mem = $max_mem
+num_rows = $num_rows
+method = $method
+num_nan = num_rows * 0.1
+
+print("Testing method " + method + " with " + num_rows + " rows and " + num_nan + " rows containing missing values.")
+
+# Prepare the data
+X = Rand (rows = num_rows, cols = 10, min = 0.0, max = 1.0, pdf = "uniform");
+
+# Shrink sample_fraction / 10^exp (starting at 100 / 10^2) until the size estimate below fits max_mem.
+sample_fraction = 100
+exp = 2
+while ((sample_fraction / 10^exp * num_rows * 0.9 * num_rows * 0.1 * 8 / 10^6) > max_mem) {
+  sample_fraction = (sample_fraction - 1)
+  # Numerator exhausted: restart at 100 with a ten times larger denominator.
+  if (sample_fraction == 0) {
+    sample_fraction = 100
+    exp = exp + 1
+  }
+}
+
+sample_fraction = sample_fraction / 10^exp
+
+# Overwrite column 1 of the first num_nan rows with NaN to create the missing values.
+for (i in 1:num_nan) {
+  X[i, 1] = 'NaN';
+}
+
+#Perform the KNN imputation
+result = imputeByKNN(X = X, method = method, seed = 42, sample_frac = sample_fraction)

Reply via email to