This is an automated email from the ASF dual-hosted git repository.
ssiddiqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/master by this push:
new e8cc1de [SYSTEMDS-2797] Builtin function statsNA for computing NA
statistics Co-authored-by: Ismael Ibrahim <[email protected]>
DIA project WS2020/21. Closes #1117.
e8cc1de is described below
commit e8cc1de36777a91a23d02ba5998c2083bbb224b0
Author: haubitzer <[email protected]>
AuthorDate: Sat Jan 16 15:53:50 2021 +0100
[SYSTEMDS-2797] Builtin function statsNA for computing NA statistics
Co-authored-by: Ismael Ibrahim <[email protected]>
DIA project WS2020/21.
Closes #1117.
---
scripts/builtin/statsNA.dml | 204 +++++++++++++++++++++
.../java/org/apache/sysds/common/Builtins.java | 1 +
.../test/functions/builtin/BuiltinStatsNATest.java | 96 ++++++++++
src/test/scripts/functions/builtin/statsNATest.R | 45 +++++
src/test/scripts/functions/builtin/statsNATest.dml | 26 +++
5 files changed, 372 insertions(+)
diff --git a/scripts/builtin/statsNA.dml b/scripts/builtin/statsNA.dml
new file mode 100644
index 0000000..2547175
--- /dev/null
+++ b/scripts/builtin/statsNA.dml
@@ -0,0 +1,204 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Print summary stats about the distribution of missing values in a univariate
time series.
+#
------------------------------------------------------------------------------
+# NAME TYPE DEFAULT MEANING
+#
------------------------------------------------------------------------------
+# X Matrix --- Numeric Vector (‘vector’)
object containing NAs
+# bins Integer 4 Split number for bin
stats. Number of bins the time series gets divided into.
+# For each bin information
about amount/percentage of missing values is printed.
+# Default value is 4 - which
means stats about the 1st,2nd,3rd,4th quarter of the time series are shown.
+# verbose   Boolean   TRUE     Choose whether the function also prints the statistics.
+#                              The function always returns the stats vector described below.
+#                              For verbose = TRUE the missing value stats are additionally printed.
+#                              If verbose is changed to FALSE, nothing is printed and the function only returns the vector.
+#                              Print gives a little bit more information,
+#                              since the returned vector does not include "Stats for Bins" and "overview NA series"
+#
------------------------------------------------------------------------------
+# stats Matrix Double Column vector where each
row correspond to following,
+# 1. "Length of time series"
- Number of observations in the time-series (including NAs)
+# 2. "Number of Missing
Values" - Number of missing values in the time series
+#                              3. "Percentage of Missing Values" - Share of missing values in the time series, stored as a fraction in [0, 1]
+# 4. "Number of Gaps" -
Number of NA gaps (consisting of one or more consecutive NAs) in the time series
+# 5. "Average Gap Size" -
Average size of consecutive NAs for the NA gaps in the time series
+# 6. "Longest NA gap" -
Longest series of consecutive missing values (NAs in a row) in the time series
+# 7. "Most frequent gap
size" - Most frequent occurring series of missing values in the time series
+# 8. "Gap size accounting
for most NAs" - The series of consecutive missing values that accounts for most
missing
+# values overall in the
time series
+#
------------------------------------------------------------------------------
+
+# Computes summary statistics about the missing values (NaN) of a single-column
+# time series X and returns them as an 8x1 column vector:
+#   1. length of the series          5. average gap size
+#   2. number of missing values      6. longest gap
+#   3. fraction of missing values    7. most frequent gap size
+#   4. number of gaps                8. gap size accounting for most NAs
+# If verbose is TRUE the statistics, the per-bin NA counts and an overview of
+# the observed gap sizes are printed as well. The bins argument only affects
+# the printed per-bin statistics, never the returned vector.
+# Stops if X has more than one column, is empty, or contains no missing value.
+m_statsNA = function(Matrix[Double] X, Integer bins = 4, Boolean verbose = TRUE)
+  return( Matrix[Double] stats) {
+
+  stats = matrix(0, rows=8, cols=1)
+
+  if(ncol(X) != 1) {
+    stop("statsNA: expect a matrix with only one column");
+  }
+
+  # Count total entries
+  length_series = length(X);
+  if (length_series == 0) {
+    stop("EMPTY MATRIX")
+  }
+  stats[1, 1] = length_series
+
+  # Validate the number of bins: it must be at least 1 and must not exceed
+  # the number of observations, otherwise bins would be empty.
+  if (bins < 1) {
+    print("Warning: bin value can not be zero, bin value was changed to 1");
+    bins = 1;
+  } else if (length_series < bins) {
+    # fall back to the default of 4, capped by the series length
+    bins = as.integer(min(4, length_series));
+    print("Warning: data is less than no. of bins, bins value was changed to " + bins);
+  }
+
+  # Count NaNs
+  p_position_nans = is.na(X)
+  number_nans = sum(p_position_nans);
+  # stop if no null value found in data
+  if(number_nans == 0)
+    stop("No missing value found in the data.")
+
+  stats[2, 1] = number_nans
+
+  # Fraction of NaNs
+  stats[3, 1] = number_nans / length_series;
+
+  # Histogram of gap sizes: entry k holds the number of gaps of length k
+  p_gaps_vector = matrix(0, length_series, 1);
+  p_length_of_gap = 0;
+  for (i in 1:length_series) {
+    if (as.scalar(p_position_nans[i,1]) == 1) {
+      p_length_of_gap += 1;
+    }
+    else if (p_length_of_gap != 0){
+      p_gaps_vector[p_length_of_gap, 1] = as.scalar(p_gaps_vector[p_length_of_gap, 1]) + 1;
+      p_length_of_gap = 0;
+    }
+  }
+
+  # A gap that reaches the last element is not closed inside the loop,
+  # so it has to be recorded here.
+  if(p_length_of_gap > 0)
+    p_gaps_vector[p_length_of_gap, 1] = as.scalar(p_gaps_vector[p_length_of_gap, 1]) + 1;
+
+  # Count number of gaps
+  number_nan_gaps = sum(p_gaps_vector);
+  stats[4, 1] = number_nan_gaps
+  # Calculate average gap size
+  stats[5, 1] = number_nans / number_nan_gaps
+
+  # Find longest gap: largest gap size that occurred at least once
+  gap_sizes = seq(1, length_series)
+  longest_nan_gap = max(gap_sizes * (p_gaps_vector>0))
+  stats[6, 1] = longest_nan_gap
+
+  # Find most frequent gap size
+  stats[7, 1] = as.scalar(rowIndexMax(t(p_gaps_vector)));
+
+  # Gap size that accounts for most NaNs (gap size times its frequency)
+  p_gaps_vector_with_weight = gap_sizes * p_gaps_vector;
+  stats[8, 1] = as.scalar(rowIndexMax(t(p_gaps_vector_with_weight)));
+
+  # Print results
+  #---
+  if (verbose) {
+    print("-------------------------")
+    print("Length of time series:");
+    print(as.scalar(stats[1, 1]));
+    print("-------------------------");
+    print("Number of Missing Values:");
+    print(as.scalar(stats[2, 1]));
+    print("-------------------------");
+    print("Percentage of Missing Values:");
+    # stats holds a fraction, scale it for display
+    print("%3.2f %%", 100 * as.scalar(stats[3, 1]));
+    print("-------------------------");
+    print("Number of Gaps:");
+    print(as.scalar(stats[4, 1]));
+    print("-------------------------");
+    print("Average Gap Size:");
+    print("%3.2f", as.scalar(stats[5, 1]));
+    print("-------------------------");
+    print("Longest NA gap (series of consecutive NAs)");
+    print(as.scalar(stats[6, 1]));
+    print("-------------------------");
+    print("Most frequent gap size (series of consecutive NA series)");
+    print(as.scalar(stats[7, 1]));
+    print("-------------------------");
+    print("Gap size accounting for most NAs");
+    print(as.scalar(stats[8, 1]));
+    print("-------------------------");
+
+    # Per-bin statistics; the last bins are clipped to the series length so
+    # that no index beyond the data is ever read.
+    print("Stats for Bins")
+    bin_length = ceiling(length_series / bins)
+    for (i in 1:bins) {
+      s = (i - 1) * bin_length + 1;
+      e = min(i * bin_length, length_series);
+      l = max(e - s + 1, 0);
+      n = 0.0;
+      p = 0.0;
+      if (l > 0) {
+        n = sum(p_position_nans[s:e, 1]);
+        p = 100 * n / l;
+      }
+      print("  Bin %d (%2.0f values from %2.0f to %2.0f):%5.0f NAs (%3.2f %%)", i,l,s,e,n,p);
+    }
+    print("-------------------------")
+
+    # Overview of gap sizes; sizes above the longest gap never occur
+    print("Overview NA series")
+    for (i in 1:as.integer(longest_nan_gap)) {
+      v = as.scalar(p_gaps_vector[i,1]);
+      if(v > 0) {
+        print("  %d NA in a row: %.0f times", i, v);
+      }
+    }
+    print("-------------------------")
+  }
+}
+
diff --git a/src/main/java/org/apache/sysds/common/Builtins.java
b/src/main/java/org/apache/sysds/common/Builtins.java
index f18ec9c..8215501 100644
--- a/src/main/java/org/apache/sysds/common/Builtins.java
+++ b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -203,6 +203,7 @@ public enum Builtins {
SMOTE("smote", true),
SOLVE("solve", false),
SPLIT("split", true),
+ STATSNA("statsNA", true),
SQRT("sqrt", false),
SUM("sum", false),
SVD("svd", false, ReturnType.MULTI_RETURN),
diff --git
a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinStatsNATest.java
b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinStatsNATest.java
new file mode 100644
index 0000000..733a6fc
--- /dev/null
+++
b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinStatsNATest.java
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.functions.builtin;
+
+import org.apache.sysds.common.Types;
+import org.apache.sysds.lops.LopProperties;
+import org.apache.sysds.runtime.matrix.data.MatrixValue;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Test;
+
+import java.util.HashMap;
+
+/**
+ * Compares the DML builtin statsNA against the R reference script
+ * (statsNATest.R) on random single-column inputs of different sizes and
+ * bin counts.
+ */
+public class BuiltinStatsNATest extends AutomatedTestBase {
+	private final static String TEST_NAME = "statsNATest";
+	private final static String TEST_DIR = "functions/builtin/";
+	// the class directory must be derived from this test class, not from another test
+	private final static String TEST_CLASS_DIR = TEST_DIR + BuiltinStatsNATest.class.getSimpleName() + "/";
+	private final static double eps = 1e-3;
+
+	@Override
+	public void setUp() {
+		TestUtils.clearAssertionInformation();
+		addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[]{"B",}));
+	}
+
+	@Test
+	public void testStatsNA1() {
+		runStatsNA(1, 100, LopProperties.ExecType.CP);
+	}
+
+	@Test
+	public void testStatsNA2() {
+		runStatsNA(4, 100, LopProperties.ExecType.CP);
+	}
+
+	@Test
+	public void testStatsNA3() {
+		runStatsNA(100, 1000, LopProperties.ExecType.CP);
+	}
+
+	@Test
+	public void testStatsNA4() {
+		runStatsNA(100, 10000, LopProperties.ExecType.CP);
+	}
+
+	/**
+	 * Runs the DML and the R script on the same random column vector and
+	 * compares the resulting statistics.
+	 *
+	 * @param bins     number of bins passed to both scripts
+	 * @param size     number of rows of the generated input vector
+	 * @param instType execution type used for the DML run
+	 */
+	private void runStatsNA(int bins, int size, LopProperties.ExecType instType) {
+		Types.ExecMode platformOld = setExecMode(instType);
+		try
+		{
+			loadTestConfiguration(getTestConfiguration(TEST_NAME));
+			String HOME = SCRIPT_DIR + TEST_DIR;
+			fullDMLScriptName = HOME + TEST_NAME + ".dml";
+			programArgs = new String[]{ "-nvargs", "X=" + input("A"), "bins=" + bins, "Out=" + output("Out")};
+
+			// sparsity 0.6: the zero cells are turned into missing values by both scripts
+			double[][] A = getRandomMatrix(size, 1, -10, 10, 0.6, 7);
+			writeInputMatrixWithMTD("A", A, true);
+
+			fullRScriptName = HOME + TEST_NAME + ".R";
+			rCmd = getRCmd(inputDir(), Integer.toString(bins), expectedDir());
+
+			runTest(true, false, null, -1);
+			runRScript(true);
+
+			//compare matrices
+			HashMap<MatrixValue.CellIndex, Double> dmlfileOut1 = readDMLMatrixFromOutputDir("Out");
+			HashMap<MatrixValue.CellIndex, Double> rfileOut1 = readRMatrixFromExpectedDir("Out");
+			TestUtils.compareMatrices(dmlfileOut1, rfileOut1, eps, "Stat-DML", "Stat-R");
+		}
+		catch(Exception e) {
+			// propagate instead of only logging, otherwise a broken run would count as a passed test
+			throw new RuntimeException(e);
+		}
+		finally {
+			rtplatform = platformOld;
+		}
+	}
+}
\ No newline at end of file
diff --git a/src/test/scripts/functions/builtin/statsNATest.R
b/src/test/scripts/functions/builtin/statsNATest.R
new file mode 100644
index 0000000..d68be25
--- /dev/null
+++ b/src/test/scripts/functions/builtin/statsNATest.R
@@ -0,0 +1,45 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Reference script: computes the NA statistics with imputeTS::statsNA and
+# writes them as an 8x1 matrix.
+# args[1] = input directory, args[2] = number of bins, args[3] = output directory
+args <- commandArgs(TRUE)
+
+library("Matrix")
+library("imputeTS")
+
+# Read the series and treat every zero cell as a missing value
+series <- as.matrix(readMM(paste0(args[1], "A.mtx")))
+series[series == 0] <- NA
+
+n_bins <- as.numeric(args[2])
+
+reference <- statsNA(series, bins = n_bins, print_only = FALSE)
+
+# Result fields in the row order of the output vector
+fields <- c(
+  "length_series",
+  "number_NAs",
+  "percentage_NAs",
+  "number_na_gaps",
+  "average_size_na_gaps",
+  "longest_na_gap",
+  "most_frequent_na_gap",
+  "most_weighty_na_gap"
+)
+
+output <- matrix(0, nrow = 8, ncol = 1)
+for (i in seq_along(fields)) {
+  value <- reference[[fields[i]]]
+  if (fields[i] == "percentage_NAs") {
+    # strip the "%" sign and convert to a fraction
+    value <- as.numeric(sub("%", "", value, fixed = TRUE)) / 100
+  }
+  output[i, 1] <- value
+}
+
+writeMM(as(output, "CsparseMatrix"), paste0(args[3], "Out"))
+
diff --git a/src/test/scripts/functions/builtin/statsNATest.dml
b/src/test/scripts/functions/builtin/statsNATest.dml
new file mode 100644
index 0000000..fd9be6e
--- /dev/null
+++ b/src/test/scripts/functions/builtin/statsNATest.dml
@@ -0,0 +1,26 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Read the input vector, mark every zero cell as missing (NaN),
+# compute the NA statistics and write them out.
+A = read($X);
+A_na = replace(target=A, pattern=0, replacement=NaN)
+stats = statsNA(X=A_na, bins=$bins, verbose=TRUE)
+write(stats, $Out);