[systemds] 01/02: [SYSTEMDS-2797] Cleanup new statsNA built-in function

mboehm7 Sat, 16 Jan 2021 13:21:44 -0800

This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git


commit 6165b509c3b3fcfc0b0690d52d1afe6e13d3fc17
Author: Matthias Boehm <[email protected]>
AuthorDate: Sat Jan 16 21:47:31 2021 +0100

    [SYSTEMDS-2797] Cleanup new statsNA built-in function
    
    * Vectorized all loops of statsNA
    * Fix statsNA verbose printing of gaps vector
    * Fix statsNA test formatting and warnings
    * Fix statsNA documentation (formatting, conciseness)
    
    DIA project WS2020/21, part 2
    Co-authored-by: haubitzer <[email protected]>
    Co-authored-by: Ismael Ibrahim <[email protected]>
---
 scripts/builtin/statsNA.dml                        | 121 ++++++++-------------
 .../sysds/runtime/compress/lib/LibRightMultBy.java |   1 -
 .../compress/ParCompressedMatrixTest.java          |   1 -
 .../test/functions/builtin/BuiltinStatsNATest.java | 105 +++++++++---------
 4 files changed, 95 insertions(+), 133 deletions(-)

diff --git a/scripts/builtin/statsNA.dml b/scripts/builtin/statsNA.dml
index 2547175..d858966 100644
--- a/scripts/builtin/statsNA.dml
+++ b/scripts/builtin/statsNA.dml
@@ -21,40 +21,37 @@
 
 # Print summary stats about the distribution of missing values in a univariate 
time series.
 # 
------------------------------------------------------------------------------
-# NAME                      TYPE        DEFAULT     MEANING
+# NAME    TYPE        DEFAULT     MEANING
 # 
------------------------------------------------------------------------------
-# X                         Matrix      ---         Numeric Vector (‘vector’) 
object containing NAs
-# bins                      Integer     4           Split number for bin 
stats. Number of bins the time series gets divided into. 
-#                                                   For each bin information 
about amount/percentage of missing values is printed. 
-#                                                   Default value is 4 - which 
means stats about the 1st,2nd,3rd,4th quarter of the time series are shown.
-# verbose                   Boolean     TRUE        Choose if the function 
print or Returns. 
-#                                                   For print_only = TRUE the 
function has no return value and just prints out  missing value stats. 
-#                                                   If print_only is changed 
to FALSE, nothing is printed and the function returns a list.
-#                                                   Print gives a little bit 
more information, 
-#                                                   since the returned list 
does not include "Stats for Bins" and "overview NA series"
+# X       Matrix      ---         Numeric Vector ('vector') object containing 
NAs
+# bins    Integer     4           Split number for bin stats. Number of bins 
the time series gets 
+#                                 divided into. For each bin information about 
amount/percentage of
+#                                 missing values is printed. 
+# verbose Boolean     TRUE        Print detailed information. 
+#                                 For verbose = TRUE, the missing value 
stats are printed with
+#                                 more information ("Stats for Bins" and 
"overview NA series").
 # 
------------------------------------------------------------------------------
-# stats                    Matrix      Double       Column vector where each 
row correspond to following, 
-#                                                   1. "Length of time series" 
- Number of observations in the time-series (including NAs)
-#                                                   2. "Number of Missing 
Values" - Number of missing values in the time series
-#                                                   3. "Percentage of Missing 
Values" - Percentage of missing values in the time series
-#                                                   4. "Number of Gaps" - 
Number of NA gaps (consisting of one or more consecutive NAs) in the time series
-#                                                   5. "Average Gap Size" - 
Average size of consecutive NAs for the NA gaps in the time series
-#                                                   6. "Longest NA gap" - 
Longest series of consecutive missing values (NAs in a row) in the time series
-#                                                   7. "Most frequent gap 
size" - Most frequent occurring series of missing values in the time series
-#                                                   8. "Gap size accounting 
for most NAs" - The series of consecutive missing values that accounts for most 
missing 
-#                                                       values overall in the 
time series
+# stats   Matrix      Double      Column vector where each row corresponds to 
the following:
+#                                 1. Length of time series (including NAs)
+#                                 2. Number of Missing Values (NAs)
+#                                 3. Percentage of Missing Values (#2/#1)
+#                                 4. Number of Gaps (consisting of one or more 
consecutive NAs)
+#                                 5. Average Gap Size - Average size of 
consecutive NAs for the NA gaps
+#                                 6. Longest NA gap - Longest series of 
consecutive missing values
+#                                 7. Most frequent gap size - Most frequently 
occurring gap size
+#                                 8. Gap size accounting for most NAs
 # 
------------------------------------------------------------------------------
 
 m_statsNA = function(Matrix[Double] X, Integer bins = 4, Boolean verbose = 
TRUE)
-  return( Matrix[Double] stats) {
-  
+  return( Matrix[Double] stats) 
+{
   longest_nan_gap = -1
   most_frequent_nan_gap = -1
   most_weighty_nan_gap = -1
   stats = matrix(0, rows=8, cols=1)
 
   if(ncol(X) != 1) {
-     stop("statsNA: expect a matrix with only one column");
+    stop("statsNA: expect a matrix with only one column");
   }
 
   # Count total entries
@@ -63,7 +60,7 @@ m_statsNA = function(Matrix[Double] X, Integer bins = 4, 
Boolean verbose = TRUE)
   stats[1, 1] = length_series
 
   if (length_series == 0) {
-     stop("EMPTY MATRIX")
+    stop("EMPTY MATRIX")
   }
 
   if (length_series < bins) {
@@ -80,76 +77,48 @@ m_statsNA = function(Matrix[Double] X, Integer bins = 4, 
Boolean verbose = TRUE)
   # stop if no null value found in data
   if(number_nans == 0)
     stop("No missing value found in the data.")
-  
   stats[2, 1] =  number_nans
-  
+
   # Calculate percentage of NaNs
   stats[3, 1]  = number_nans / length_series;
-  # Create Vector with numbers of gaps
-  p_gaps_vector = matrix(0, length_series, 1);
-  p_length_of_gap = 0;
-  for (i in 1:length_series) {
-    if (as.scalar(p_position_nans[i,1]) == 1) {
-      p_length_of_gap += 1;
-    } 
-    else if (p_length_of_gap != 0){
-      p_gaps_vector[p_length_of_gap, 1] = 
as.scalar(p_gaps_vector[p_length_of_gap, 1]) + 1;
-      p_length_of_gap = 0;
-    }
-  }
-   
-  # The last element can also be a NaN but the loop will not update our vector 
map, so this workaround is needed.
-  if(p_length_of_gap > 0) 
-    p_gaps_vector[p_length_of_gap, 1] = 
as.scalar(p_gaps_vector[p_length_of_gap, 1]) + 1;
-  
+
+  # Create Vector with length of gaps
+  #  input:  0 0 1 1 1 0 0 0 1 1 1 1 0 1
+  #  csgaps: 0 0 1 2 3 0 0 0 1 2 3 4 0 1
+  #  output: 0 0 0 0 3 0 0 0 0 0 0 4 0 1
+  csgaps = cumsumprod(cbind(p_position_nans,p_position_nans));
+  csmask = matrix(0, length_series, 1);
+  csmask[1:(length_series-1)] = csgaps[2:length_series]
+  gap_lengths = csgaps * (csgaps > csmask)
+  gap_lengths = removeEmpty(target=gap_lengths, margin="rows")
+  p_gaps_vector = table(gap_lengths, 1);
 
   # Count number of gaps
   number_nan_gaps = sum(p_gaps_vector);
   stats[4, 1] = number_nan_gaps
+
   # Calculate average gap size
   stats[5, 1]  = number_nans / number_nan_gaps
 
-    
   # Find longest gap
-  longest_nan_gap = max(seq(1, length_series) * (p_gaps_vector>0))
-  stats[6, 1] = longest_nan_gap
+  stats[6, 1] = as.scalar(rowIndexMax(t(p_gaps_vector>0)))
 
   # Find most frequent gap size
   stats[7, 1]  = as.scalar(rowIndexMax(t(p_gaps_vector)));
 
   # Gap size that has most NaNs
-  p_gaps_vector_with_weight = matrix(0, rows=length_series, cols=1);
-  for(i in 1:length_series) {
-    p_gaps_vector_with_weight[i, 1] = i * as.scalar(p_gaps_vector[i,1]);
-  }
-  # Find most gap size with most weight
+  p_gaps_vector_with_weight = seq(1,nrow(p_gaps_vector)) * p_gaps_vector;
   stats[8, 1]  = as.scalar(rowIndexMax(t(p_gaps_vector_with_weight)));
 
   # Calculate bins
   #---
-  bins_start = matrix(0, bins, 1);
-  bins_end = matrix(0, bins, 1);
-  bins_nans = matrix(0, bins, 1);
-  bins_percentage = matrix(0, bins, 1);
   bin_length = ceiling(length_series / bins)
 
   # Calculate where a bin starts and ends
-  tmp_splitter = 0
-  for(i in 1:bins) {
-    bins_start[i,1] = tmp_splitter + 1;
-    tmp_splitter = tmp_splitter + bin_length;
-    bins_end[i,1] = tmp_splitter;
-  }
-
-  for(i in 1:bins) {
-    start = as.scalar(bins_start[i,1]);
-    end = as.scalar(bins_end[i,1]);
-    tmp_nans = sum(p_position_nans[start:end, 1]);
-
-    bins_nans[i,1] = tmp_nans;
-    bins_percentage[i,1] = tmp_nans / bin_length;
-  }
-    #---
+  bins_start = seq(1, bins*bin_length, bin_length);
+  bins_end = seq(bin_length, bins*bin_length, bin_length)
+  bins_nans = rowSums(matrix(p_position_nans, bins, bin_length))
+  bins_percentage = bins_nans/bin_length;
 
   # Print results
   #---
@@ -191,14 +160,12 @@ m_statsNA = function(Matrix[Double] X, Integer bins = 4, 
Boolean verbose = TRUE)
       }
       print("-------------------------")
     }
-    print("Stats for Bins")
-    for (i in 1:bins) {
+    print("Overview NA Series")
+    for (i in 1:nrow(p_gaps_vector)) {
       v = as.scalar(p_gaps_vector[i,1]);
-      if(v > 0) {
+      if(v > 0)
         print(" %.0f NA in a row: %d times", v, i);
-      }
     }
     print("-------------------------")
-  } 
+  }
 }
-
diff --git 
a/src/main/java/org/apache/sysds/runtime/compress/lib/LibRightMultBy.java 
b/src/main/java/org/apache/sysds/runtime/compress/lib/LibRightMultBy.java
index d164419..aac6dcf 100644
--- a/src/main/java/org/apache/sysds/runtime/compress/lib/LibRightMultBy.java
+++ b/src/main/java/org/apache/sysds/runtime/compress/lib/LibRightMultBy.java
@@ -34,7 +34,6 @@ import org.apache.sysds.runtime.DMLRuntimeException;
 import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
 import org.apache.sysds.runtime.compress.CompressionSettings;
 import org.apache.sysds.runtime.compress.colgroup.ColGroup;
-import org.apache.sysds.runtime.compress.colgroup.ColGroupOLE;
 import org.apache.sysds.runtime.compress.colgroup.ColGroupUncompressed;
 import org.apache.sysds.runtime.compress.colgroup.ColGroupValue;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
diff --git 
a/src/test/java/org/apache/sysds/test/component/compress/ParCompressedMatrixTest.java
 
b/src/test/java/org/apache/sysds/test/component/compress/ParCompressedMatrixTest.java
index 2251c1c..87890b2 100644
--- 
a/src/test/java/org/apache/sysds/test/component/compress/ParCompressedMatrixTest.java
+++ 
b/src/test/java/org/apache/sysds/test/component/compress/ParCompressedMatrixTest.java
@@ -21,7 +21,6 @@ package org.apache.sysds.test.component.compress;
 
 import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
 import org.apache.sysds.runtime.compress.CompressionSettings;
-import 
org.apache.sysds.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
 import org.apache.sysds.runtime.instructions.InstructionUtils;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 import org.apache.sysds.runtime.matrix.operators.AggregateBinaryOperator;
diff --git 
a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinStatsNATest.java 
b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinStatsNATest.java
index 733a6fc..088d914 100644
--- 
a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinStatsNATest.java
+++ 
b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinStatsNATest.java
@@ -30,67 +30,64 @@ import org.junit.Test;
 import java.util.HashMap;
 
 public class BuiltinStatsNATest extends AutomatedTestBase {
-    private final static String TEST_NAME = "statsNATest";
-    private final static String TEST_DIR = "functions/builtin/";
-    private final static String TEST_CLASS_DIR = TEST_DIR + 
BuiltinSplitTest.class.getSimpleName() + "/";
-    private final static double eps = 1e-3;
+       private final static String TEST_NAME = "statsNATest";
+       private final static String TEST_DIR = "functions/builtin/";
+       private final static String TEST_CLASS_DIR = TEST_DIR + 
BuiltinSplitTest.class.getSimpleName() + "/";
+       private final static double eps = 1e-3;
 
-    @Override
-    public void setUp() {
-        TestUtils.clearAssertionInformation();
-        addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, 
TEST_NAME, new String[]{"B",}));
-    }
+       @Override
+       public void setUp() {
+               TestUtils.clearAssertionInformation();
+               addTestConfiguration(TEST_NAME, new 
TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[]{"B",}));
+       }
 
-    @Test
-    public void testStatsNA1() {
-        runStatsNA(1, 100, LopProperties.ExecType.CP);
-    }
+       @Test
+       public void testStatsNA1() {
+               runStatsNA(1, 100, LopProperties.ExecType.CP);
+       }
 
-    @Test
-    public void testStatsNA2() {
-        runStatsNA(4, 100, LopProperties.ExecType.CP);
-    }
+       @Test
+       public void testStatsNA2() {
+               runStatsNA(4, 100, LopProperties.ExecType.CP);
+       }
 
-    @Test
-    public void testStatsNA3() {
-        runStatsNA(100, 1000, LopProperties.ExecType.CP);
-    }
+       @Test
+       public void testStatsNA3() {
+               runStatsNA(100, 1000, LopProperties.ExecType.CP);
+       }
 
-    @Test
-    public void testStatsNA4() {
-        runStatsNA(100, 10000, LopProperties.ExecType.CP);
-    }
+       @Test
+       public void testStatsNA4() {
+               runStatsNA(100, 10000, LopProperties.ExecType.CP);
+       }
 
 
-    private void runStatsNA(int bins, int size, LopProperties.ExecType 
instType) {
-        Types.ExecMode platformOld = setExecMode(instType);
-        try
-        {
-            loadTestConfiguration(getTestConfiguration(TEST_NAME));
-            String HOME = SCRIPT_DIR + TEST_DIR;
-            fullDMLScriptName = HOME + TEST_NAME + ".dml";
-            programArgs = new String[]{ "-nvargs", "X=" + input("A"), "bins=" 
+ bins, "Out=" + output("Out")};
+       private void runStatsNA(int bins, int size, LopProperties.ExecType 
instType) {
+               Types.ExecMode platformOld = setExecMode(instType);
+               try {
+                       loadTestConfiguration(getTestConfiguration(TEST_NAME));
+                       String HOME = SCRIPT_DIR + TEST_DIR;
+                       fullDMLScriptName = HOME + TEST_NAME + ".dml";
+                       programArgs = new String[]{ "-nvargs", "X=" + 
input("A"), "bins=" + bins, "Out=" + output("Out")};
 
-            double[][] A = getRandomMatrix(size, 1, -10, 10, 0.6, 7);
-            writeInputMatrixWithMTD("A", A, true);
+                       double[][] A = getRandomMatrix(size, 1, -10, 10, 0.6, 
7);
+                       writeInputMatrixWithMTD("A", A, true);
 
-            fullRScriptName = HOME + TEST_NAME + ".R";
-            rCmd = getRCmd(inputDir(), Integer.toString(bins), expectedDir());
+                       fullRScriptName = HOME + TEST_NAME + ".R";
+                       rCmd = getRCmd(inputDir(), Integer.toString(bins), 
expectedDir());
 
-            runTest(true, false, null, -1);
-            runRScript(true);
-            //compare matrices
-            HashMap<MatrixValue.CellIndex, Double> dmlfileOut1 = 
readDMLMatrixFromOutputDir("Out");
-            HashMap<MatrixValue.CellIndex, Double> rfileOut1 = 
readRMatrixFromExpectedDir("Out");
-            MatrixValue.CellIndex key_ce = new MatrixValue.CellIndex(1, 1);
-
-            TestUtils.compareMatrices(dmlfileOut1, rfileOut1, eps, "Stat-DML", 
"Stat-R");
-        }
-        catch(Exception e) {
-            e.printStackTrace();
-        }
-        finally {
-            rtplatform = platformOld;
-        }
-    }
-}
\ No newline at end of file
+                       runTest(true, false, null, -1);
+                       runRScript(true);
+                       //compare matrices
+                       HashMap<MatrixValue.CellIndex, Double> dmlfileOut1 = 
readDMLMatrixFromOutputDir("Out");
+                       HashMap<MatrixValue.CellIndex, Double> rfileOut1 = 
readRMatrixFromExpectedDir("Out");
+                       TestUtils.compareMatrices(dmlfileOut1, rfileOut1, eps, 
"Stat-DML", "Stat-R");
+               }
+               catch(Exception e) {
+                       e.printStackTrace();
+               }
+               finally {
+                       rtplatform = platformOld;
+               }
+       }
+}

[systemds] 01/02: [SYSTEMDS-2797] Cleanup new statsNA built-in function

Reply via email to