This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemds.git
commit 6165b509c3b3fcfc0b0690d52d1afe6e13d3fc17 Author: Matthias Boehm <[email protected]> AuthorDate: Sat Jan 16 21:47:31 2021 +0100 [SYSTEMDS-2797] Cleanup new statsNA built-in function * Vectorized all loops of statsNA * Fix statsNA verbose printing of gaps vector * Fix statsNA test formatting and warnings, * Fix statsNA documentation (formatting, conciseness) DIA project WS2020/21, part 2 Co-authored-by: haubitzer <[email protected]> Co-authored-by: Ismael Ibrahim <[email protected]> --- scripts/builtin/statsNA.dml | 121 ++++++++------------- .../sysds/runtime/compress/lib/LibRightMultBy.java | 1 - .../compress/ParCompressedMatrixTest.java | 1 - .../test/functions/builtin/BuiltinStatsNATest.java | 105 +++++++++--------- 4 files changed, 95 insertions(+), 133 deletions(-) diff --git a/scripts/builtin/statsNA.dml b/scripts/builtin/statsNA.dml index 2547175..d858966 100644 --- a/scripts/builtin/statsNA.dml +++ b/scripts/builtin/statsNA.dml @@ -21,40 +21,37 @@ # Print summary stats about the distribution of missing values in a univariate time series. # ------------------------------------------------------------------------------ -# NAME TYPE DEFAULT MEANING +# NAME TYPE DEFAULT MEANING # ------------------------------------------------------------------------------ -# X Matrix --- Numeric Vector (‘vector’) object containing NAs -# bins Integer 4 Split number for bin stats. Number of bins the time series gets divided into. -# For each bin information about amount/percentage of missing values is printed. -# Default value is 4 - which means stats about the 1st,2nd,3rd,4th quarter of the time series are shown. -# verbose Boolean TRUE Choose if the function print or Returns. -# For print_only = TRUE the function has no return value and just prints out missing value stats. -# If print_only is changed to FALSE, nothing is printed and the function returns a list. -# Print gives a little bit more information, -# since the returned list does not include "Stats for Bins" and "overview NA series" +# X Matrix --- Numeric Vector ('vector') object containing NAs +# bins Integer 4 Split number for bin stats. Number of bins the time series gets +# divided into. For each bin information about amount/percentage of +# missing values is printed. +# verbose Boolean TRUE Print detailed information. +# For print_only = TRUE, the missing value stats are printed with +# more information ("Stats for Bins" and "overview NA series"). # ------------------------------------------------------------------------------ -# stats Matrix Double Column vector where each row correspond to following, -# 1. "Length of time series" - Number of observations in the time-series (including NAs) -# 2. "Number of Missing Values" - Number of missing values in the time series -# 3. "Percentage of Missing Values" - Percentage of missing values in the time series -# 4. "Number of Gaps" - Number of NA gaps (consisting of one or more consecutive NAs) in the time series -# 5. "Average Gap Size" - Average size of consecutive NAs for the NA gaps in the time series -# 6. "Longest NA gap" - Longest series of consecutive missing values (NAs in a row) in the time series -# 7. "Most frequent gap size" - Most frequent occurring series of missing values in the time series -# 8. "Gap size accounting for most NAs" - The series of consecutive missing values that accounts for most missing -# values overall in the time series +# stats Matrix Double Column vector where each row correspond to following, +# 1. Length of time series (including NAs) +# 2. Number of Missing Values (NAs) +# 3. Percentage of Missing Values (#2/#1) +# 4. Number of Gaps (consisting of one or more consecutive NAs) +# 5. Average Gap Size - Average size of consecutive NAs for the NA gaps +# 6. Longest NA gap - Longest series of consecutive missing values +# 7. Most frequent gap size - Most frequently occurring gap size +# 8. Gap size accounting for most NAs # ------------------------------------------------------------------------------ m_statsNA = function(Matrix[Double] X, Integer bins = 4, Boolean verbose = TRUE) - return( Matrix[Double] stats) { - + return( Matrix[Double] stats) +{ longest_nan_gap = -1 most_frequent_nan_gap = -1 most_weighty_nan_gap = -1 stats = matrix(0, rows=8, cols=1) if(ncol(X) != 1) { - stop("statsNA: expect a matrix with only one column"); + stop("statsNA: expect a matrix with only one column"); } # Count total entries @@ -63,7 +60,7 @@ m_statsNA = function(Matrix[Double] X, Integer bins = 4, Boolean verbose = TRUE) stats[1, 1] = length_series if (length_series == 0) { - stop("EMPTY MATRIX") + stop("EMPTY MATRIX") } if (length_series < bins) { @@ -80,76 +77,48 @@ m_statsNA = function(Matrix[Double] X, Integer bins = 4, Boolean verbose = TRUE) # stop if no null value found in data if(number_nans == 0) stop("No missing value found in the data.") - stats[2, 1] = number_nans - + # Calculate percentage of NaNs stats[3, 1] = number_nans / length_series; - # Create Vector with numbers of gaps - p_gaps_vector = matrix(0, length_series, 1); - p_length_of_gap = 0; - for (i in 1:length_series) { - if (as.scalar(p_position_nans[i,1]) == 1) { - p_length_of_gap += 1; - } - else if (p_length_of_gap != 0){ - p_gaps_vector[p_length_of_gap, 1] = as.scalar(p_gaps_vector[p_length_of_gap, 1]) + 1; - p_length_of_gap = 0; - } - } - - # The last element can also be a NaN but the loop will not update our vector map, so this workaround is needed. - if(p_length_of_gap > 0) - p_gaps_vector[p_length_of_gap, 1] = as.scalar(p_gaps_vector[p_length_of_gap, 1]) + 1; - + + # Create Vector with length of gaps + # input: 0 0 1 1 1 0 0 0 1 1 1 1 0 1 + # csgaps: 0 0 1 2 3 0 0 0 1 2 3 4 0 1 + # output: 0 0 0 0 3 0 0 0 0 0 0 4 0 1 + csgaps = cumsumprod(cbind(p_position_nans,p_position_nans)); + csmask = matrix(0, length_series, 1); + csmask[1:(length_series-1)] = csgaps[2:length_series] + gap_lengths = csgaps * (csgaps > csmask) + gap_lengths = removeEmpty(target=gap_lengths, margin="rows") + p_gaps_vector = table(gap_lengths, 1); # Count number of gaps number_nan_gaps = sum(p_gaps_vector); stats[4, 1] = number_nan_gaps + # Calculate average gap size stats[5, 1] = number_nans / number_nan_gaps - # Find longest gap - longest_nan_gap = max(seq(1, length_series) * (p_gaps_vector>0)) - stats[6, 1] = longest_nan_gap + stats[6, 1] = as.scalar(rowIndexMax(t(p_gaps_vector>0))) # Find most frequent gap size stats[7, 1] = as.scalar(rowIndexMax(t(p_gaps_vector))); # Gap size that has most NaNs - p_gaps_vector_with_weight = matrix(0, rows=length_series, cols=1); - for(i in 1:length_series) { - p_gaps_vector_with_weight[i, 1] = i * as.scalar(p_gaps_vector[i,1]); - } - # Find most gap size with most weight + p_gaps_vector_with_weight = seq(1,nrow(p_gaps_vector)) * p_gaps_vector; stats[8, 1] = as.scalar(rowIndexMax(t(p_gaps_vector_with_weight))); # Calculate bins #--- - bins_start = matrix(0, bins, 1); - bins_end = matrix(0, bins, 1); - bins_nans = matrix(0, bins, 1); - bins_percentage = matrix(0, bins, 1); bin_length = ceiling(length_series / bins) # Calculate where a bin starts and ends - tmp_splitter = 0 - for(i in 1:bins) { - bins_start[i,1] = tmp_splitter + 1; - tmp_splitter = tmp_splitter + bin_length; - bins_end[i,1] = tmp_splitter; - } - - for(i in 1:bins) { - start = as.scalar(bins_start[i,1]); - end = as.scalar(bins_end[i,1]); - tmp_nans = sum(p_position_nans[start:end, 1]); - - bins_nans[i,1] = tmp_nans; - bins_percentage[i,1] = tmp_nans / bin_length; - } - #--- + bins_start = seq(1, bins*bin_length, bin_length); + bins_end = seq(bin_length, bins*bin_length, bin_length) + bins_nans = rowSums(matrix(p_position_nans, bins, bin_length)) + bins_percentage = bins_nans/bin_length; # Print results #--- @@ -191,14 +160,12 @@ m_statsNA = function(Matrix[Double] X, Integer bins = 4, Boolean verbose = TRUE) } print("-------------------------") } - print("Stats for Bins") - for (i in 1:bins) { + print("Overview NA Series") + for (i in 1:nrow(p_gaps_vector)) { v = as.scalar(p_gaps_vector[i,1]); - if(v > 0) { + if(v > 0) print(" %.0f NA in a row: %d times", v, i); - } } print("-------------------------") - } + } } - diff --git a/src/main/java/org/apache/sysds/runtime/compress/lib/LibRightMultBy.java b/src/main/java/org/apache/sysds/runtime/compress/lib/LibRightMultBy.java index d164419..aac6dcf 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/lib/LibRightMultBy.java +++ b/src/main/java/org/apache/sysds/runtime/compress/lib/LibRightMultBy.java @@ -34,7 +34,6 @@ import org.apache.sysds.runtime.DMLRuntimeException; import org.apache.sysds.runtime.compress.CompressedMatrixBlock; import org.apache.sysds.runtime.compress.CompressionSettings; import org.apache.sysds.runtime.compress.colgroup.ColGroup; -import org.apache.sysds.runtime.compress.colgroup.ColGroupOLE; import org.apache.sysds.runtime.compress.colgroup.ColGroupUncompressed; import org.apache.sysds.runtime.compress.colgroup.ColGroupValue; import org.apache.sysds.runtime.matrix.data.MatrixBlock; diff --git a/src/test/java/org/apache/sysds/test/component/compress/ParCompressedMatrixTest.java b/src/test/java/org/apache/sysds/test/component/compress/ParCompressedMatrixTest.java index 2251c1c..87890b2 100644 --- a/src/test/java/org/apache/sysds/test/component/compress/ParCompressedMatrixTest.java +++ b/src/test/java/org/apache/sysds/test/component/compress/ParCompressedMatrixTest.java @@ -21,7 +21,6 @@ package org.apache.sysds.test.component.compress; import org.apache.sysds.runtime.compress.CompressedMatrixBlock; import org.apache.sysds.runtime.compress.CompressionSettings; -import org.apache.sysds.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer; import org.apache.sysds.runtime.instructions.InstructionUtils; import org.apache.sysds.runtime.matrix.data.MatrixBlock; import org.apache.sysds.runtime.matrix.operators.AggregateBinaryOperator; diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinStatsNATest.java b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinStatsNATest.java index 733a6fc..088d914 100644 --- a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinStatsNATest.java +++ b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinStatsNATest.java @@ -30,67 +30,64 @@ import org.junit.Test; import java.util.HashMap; public class BuiltinStatsNATest extends AutomatedTestBase { - private final static String TEST_NAME = "statsNATest"; - private final static String TEST_DIR = "functions/builtin/"; - private final static String TEST_CLASS_DIR = TEST_DIR + BuiltinSplitTest.class.getSimpleName() + "/"; - private final static double eps = 1e-3; + private final static String TEST_NAME = "statsNATest"; + private final static String TEST_DIR = "functions/builtin/"; + private final static String TEST_CLASS_DIR = TEST_DIR + BuiltinSplitTest.class.getSimpleName() + "/"; + private final static double eps = 1e-3; - @Override - public void setUp() { - TestUtils.clearAssertionInformation(); - addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[]{"B",})); - } + @Override + public void setUp() { + TestUtils.clearAssertionInformation(); + addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[]{"B",})); + } - @Test - public void testStatsNA1() { - runStatsNA(1, 100, LopProperties.ExecType.CP); - } + @Test + public void testStatsNA1() { + runStatsNA(1, 100, LopProperties.ExecType.CP); + } - @Test - public void testStatsNA2() { - runStatsNA(4, 100, LopProperties.ExecType.CP); - } + @Test + public void testStatsNA2() { + runStatsNA(4, 100, LopProperties.ExecType.CP); + } - @Test - public void testStatsNA3() { - runStatsNA(100, 1000, LopProperties.ExecType.CP); - } + @Test + public void testStatsNA3() { + runStatsNA(100, 1000, LopProperties.ExecType.CP); + } - @Test - public void testStatsNA4() { - runStatsNA(100, 10000, LopProperties.ExecType.CP); - } + @Test + public void testStatsNA4() { + runStatsNA(100, 10000, LopProperties.ExecType.CP); + } - private void runStatsNA(int bins, int size, LopProperties.ExecType instType) { - Types.ExecMode platformOld = setExecMode(instType); - try - { - loadTestConfiguration(getTestConfiguration(TEST_NAME)); - String HOME = SCRIPT_DIR + TEST_DIR; - fullDMLScriptName = HOME + TEST_NAME + ".dml"; - programArgs = new String[]{ "-nvargs", "X=" + input("A"), "bins=" + bins, "Out=" + output("Out")}; + private void runStatsNA(int bins, int size, LopProperties.ExecType instType) { + Types.ExecMode platformOld = setExecMode(instType); + try { + loadTestConfiguration(getTestConfiguration(TEST_NAME)); + String HOME = SCRIPT_DIR + TEST_DIR; + fullDMLScriptName = HOME + TEST_NAME + ".dml"; + programArgs = new String[]{ "-nvargs", "X=" + input("A"), "bins=" + bins, "Out=" + output("Out")}; - double[][] A = getRandomMatrix(size, 1, -10, 10, 0.6, 7); - writeInputMatrixWithMTD("A", A, true); + double[][] A = getRandomMatrix(size, 1, -10, 10, 0.6, 7); + writeInputMatrixWithMTD("A", A, true); - fullRScriptName = HOME + TEST_NAME + ".R"; - rCmd = getRCmd(inputDir(), Integer.toString(bins), expectedDir()); + fullRScriptName = HOME + TEST_NAME + ".R"; + rCmd = getRCmd(inputDir(), Integer.toString(bins), expectedDir()); - runTest(true, false, null, -1); - runRScript(true); - //compare matrices - HashMap<MatrixValue.CellIndex, Double> dmlfileOut1 = readDMLMatrixFromOutputDir("Out"); - HashMap<MatrixValue.CellIndex, Double> rfileOut1 = readRMatrixFromExpectedDir("Out"); - MatrixValue.CellIndex key_ce = new MatrixValue.CellIndex(1, 1); - - TestUtils.compareMatrices(dmlfileOut1, rfileOut1, eps, "Stat-DML", "Stat-R"); - } - catch(Exception e) { - e.printStackTrace(); - } - finally { - rtplatform = platformOld; - } - } -} \ No newline at end of file + runTest(true, false, null, -1); + runRScript(true); + //compare matrices + HashMap<MatrixValue.CellIndex, Double> dmlfileOut1 = readDMLMatrixFromOutputDir("Out"); + HashMap<MatrixValue.CellIndex, Double> rfileOut1 = readRMatrixFromExpectedDir("Out"); + TestUtils.compareMatrices(dmlfileOut1, rfileOut1, eps, "Stat-DML", "Stat-R"); + } + catch(Exception e) { + e.printStackTrace(); + } + finally { + rtplatform = platformOld; + } + } +}
