This is an automated email from the ASF dual-hosted git repository.
baunsgaard pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new d443178a0f [SYSTEMDS-3153] Missing value imputation using KNN
d443178a0f is described below
commit d443178a0fd3d341189c8be96abe7bce42870dd2
Author: Christina Dionysio <[email protected]>
AuthorDate: Fri Jan 5 17:14:09 2024 +0100
[SYSTEMDS-3153] Missing value imputation using KNN
This commit adds a perf test case for missing value imputation using
KNN. It is integrated into our perf suite.
Closes #1943
---
scripts/perftest/KnnMissingValueImputation.sh | 54 +++++++++++++++++++++++++++
scripts/perftest/runAll.sh | 1 +
scripts/perftest/scripts/ImputeByKNN.dml | 52 ++++++++++++++++++++++++++
3 files changed, 107 insertions(+)
diff --git a/scripts/perftest/KnnMissingValueImputation.sh b/scripts/perftest/KnnMissingValueImputation.sh
new file mode 100755
index 0000000000..aa7bf04be7
--- /dev/null
+++ b/scripts/perftest/KnnMissingValueImputation.sh
@@ -0,0 +1,54 @@
+#!/usr/bin/env bash
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+CMD=$1
+MAXMEM=$2
+
+echo "KNN MISSING VALUE IMPUTATION" >>results/times.txt
+
+mkdir -p logs
+LogName='logs/KnnMissingValueImputation.log'
+rm -f $LogName # full log file
+rm -f $LogName.log # Reduced log file
+
+is=("1000 10000 100000 1000000 10000000")
+
+for i in $is; do
+ for method in "dist" "dist_missing" "dist_sample"; do
+ if [ $(((i*i*8)/10**6)) -gt $MAXMEM ] && [ $method == "dist" ]; then
+ continue;
+ elif [ $(((i*9*i*8/100)/10**6)) -gt $MAXMEM ] && [ $method == "dist_missing" ]; then
+ continue;
+ fi
+
+ tstart=$(date +%s.%N)
+ ${CMD} -f ./scripts/ImputeByKNN.dml \
+ --config conf/SystemDS-config.xml \
+ --stats \
+ --nvargs num_rows=$i method=$method max_mem=$MAXMEM \
+ >>$LogName 2>&1
+ ttrain=$(echo "$(date +%s.%N) - $tstart - .4" | bc)
+ echo "KNN Missing Value Imputation $i rows, $method method:" $ttrain >>results/times.txt
+ done
+done
+
+echo -e "\n\n" >>results/times.txt
\ No newline at end of file
diff --git a/scripts/perftest/runAll.sh b/scripts/perftest/runAll.sh
index 9b20606c1d..6d39043a74 100755
--- a/scripts/perftest/runAll.sh
+++ b/scripts/perftest/runAll.sh
@@ -126,6 +126,7 @@ echo -e "\n\n" >> results/times.txt
./runAllClustering.sh ${CMD} ${TEMPFOLDER} ${MAXMEM}
./runAllDimensionReduction.sh ${CMD} ${TEMPFOLDER} ${MAXMEM}
./runAllALS.sh ${CMD} ${TEMPFOLDER} ${MAXMEM}
+./KnnMissingValueImputation.sh ${CMD} ${MAXMEM}
### IO Benchmarks:
./runAllIO.sh ${CMD} ${TEMPFOLDER} ${MAXMEM}
diff --git a/scripts/perftest/scripts/ImputeByKNN.dml b/scripts/perftest/scripts/ImputeByKNN.dml
new file mode 100755
index 0000000000..0ec2ef6af8
--- /dev/null
+++ b/scripts/perftest/scripts/ImputeByKNN.dml
@@ -0,0 +1,52 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+max_mem = $max_mem
+num_rows = $num_rows
+method = $method
+num_nan = num_rows * 0.1
+
+print("Testing method " + method + " with " + num_rows + " rows and " + num_nan + " rows containing missing values.")
+
+# Prepare the data
+X = Rand (rows = num_rows, cols = 10, min = 0.0, max = 1.0, pdf = "uniform");
+
+sample_fraction = 100
+exp = 2
+while ((sample_fraction / 10^exp * num_rows * 0.9 * num_rows * 0.1 * 8 / 10^6) > max_mem) {
+ sample_fraction = (sample_fraction - 1)
+
+ if (sample_fraction == 0) {
+ sample_fraction = 100
+ exp = exp + 1
+ }
+}
+
+
+sample_fraction = sample_fraction / 10^exp
+
+
+for (i in 1:num_nan) {
+ X[i, 1] = 'NaN';
+}
+
+#Perform the KNN imputation
+result = imputeByKNN(X = X, method = method, seed = 42, sample_frac = sample_fraction)