This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git


The following commit(s) were added to refs/heads/master by this push:
     new 955365c  [SYSTEMML-2121] PCA test for codegenalg suite
955365c is described below

commit 955365c5da1a916541d734a4e9494ab61c932503
Author: Janardhan Pulivarthi <j...@protonmail.com>
AuthorDate: Sat Apr 25 22:15:06 2020 +0200

    [SYSTEMML-2121] PCA test for codegenalg suite
    
    This patch adds a PCA algorithm test that runs with codegen enabled
      and compares the results against a reference R script.
    
      The test matrix is as follows:
    
              | Rewrite | Sparse |  FuseAll | FuseNoRedundancy |
              | ------- | ------ | -------- | ---------------- |
      - Spark |   1     |    0   |     0    |        0         |
        or CP |   1     |    1   |     0    |        0         |
              |   0     |    0   |     0    |        0         |
              |   0     |    1   |     0    |        0         |
              |   1     |    0   |     1    |        0         |
              |   1     |    1   |     1    |        0         |
              |   1     |    0   |     0    |        1         |
              |   1     |    1   |     0    |        1         |
    
    Closes #889.
---
 scripts/algorithms/PCA.dml                         |  14 +-
 .../functions/codegenalg/partone/AlgorithmPCA.java | 213 +++++++++++++++++++++
 .../scripts/functions/codegenalg/Algorithm_PCA.R   |  87 +++++++++
 3 files changed, 301 insertions(+), 13 deletions(-)

diff --git a/scripts/algorithms/PCA.dml b/scripts/algorithms/PCA.dml
index d165351..ea7afd7 100644
--- a/scripts/algorithms/PCA.dml
+++ b/scripts/algorithms/PCA.dml
@@ -62,19 +62,7 @@ if (model != "") {
        D = ncol(A);
 
        # perform z-scoring (centering and scaling)
-       if (center == 1) {
-           cm = colMeans(A);
-           A = A - cm;
-       }
-       if (scale == 1) {
-           cvars = (colSums (A^2));    
-           if (center == 1){
-               cm = colMeans(A);
-               cvars = (cvars - N*(cm^2))/(N-1);                   
-           }
-           Azscored = (A)/sqrt(cvars);
-            A = Azscored;
-       }       
+       A = scale(A, center==1, scale==1);
 
        # co-variance matrix 
        mu = colSums(A)/N;
diff --git a/src/test/java/org/apache/sysds/test/functions/codegenalg/partone/AlgorithmPCA.java b/src/test/java/org/apache/sysds/test/functions/codegenalg/partone/AlgorithmPCA.java
new file mode 100644
index 0000000..e0a1906
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/functions/codegenalg/partone/AlgorithmPCA.java
@@ -0,0 +1,213 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.functions.codegenalg.partone;
+
+import java.io.File;
+import java.util.HashMap;
+
+import org.junit.Test;
+import org.apache.sysds.common.Types.ExecMode;
+import org.apache.sysds.hops.OptimizerUtils;
+import org.apache.sysds.lops.LopProperties.ExecType;
+import org.apache.sysds.runtime.matrix.data.MatrixValue.CellIndex;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Assert;
+
+/**
+ * Codegen algorithm test for PCA. Each test runs scripts/algorithms/PCA.dml with
+ * one of the codegen configurations and compares the resulting eigenvalues,
+ * eigenvectors and standard deviations against the output of an R script.
+ */
+public class AlgorithmPCA extends AutomatedTestBase
+{
+       private final static String TEST_NAME1 = "Algorithm_PCA";
+       private final static String TEST_DIR = "functions/codegenalg/";
+       private final static String TEST_CLASS_DIR = TEST_DIR + AlgorithmPCA.class.getSimpleName() + "/";
+       private final static String TEST_CONF_DEFAULT = "SystemDS-config-codegen.xml";
+       private final static File TEST_CONF_FILE_DEFAULT = new File(SCRIPT_DIR + TEST_DIR, TEST_CONF_DEFAULT);
+       private final static String TEST_CONF_FUSE_ALL = "SystemDS-config-codegen-fuse-all.xml";
+       private final static File TEST_CONF_FILE_FUSE_ALL = new File(SCRIPT_DIR + TEST_DIR, TEST_CONF_FUSE_ALL);
+       private final static String TEST_CONF_FUSE_NO_REDUNDANCY = "SystemDS-config-codegen-fuse-no-redundancy.xml";
+       private final static File TEST_CONF_FILE_FUSE_NO_REDUNDANCY = new File(SCRIPT_DIR + TEST_DIR,
+                       TEST_CONF_FUSE_NO_REDUNDANCY);
+
+       private enum TestType { DEFAULT, FUSE_ALL, FUSE_NO_REDUNDANCY }
+
+       private final static double eps = 1e-5;
+
+       private final static int rows = 1468;
+       private final static int cols1 = 1007; //used for CP
+       private final static int cols2 = 387;  //used for Spark
+
+       private final static double sparsity1 = 0.7; //dense
+       private final static double sparsity2 = 0.1; //sparse
+
+       // Read by getConfigTemplateFile() to select the codegen configuration file.
+       private TestType currentTestType = TestType.DEFAULT;
+
+       @Override
+       public void setUp() {
+               TestUtils.clearAssertionInformation();
+               addTestConfiguration(TEST_NAME1, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME1, new String[] { "w" }));
+       }
+
+       @Test
+       public void testPCADenseRewritesCP() {
+               runPCATest(TEST_NAME1, true, false, ExecType.CP, TestType.DEFAULT);
+       }
+
+       @Test
+       public void testPCASparseRewritesCP() {
+               runPCATest(TEST_NAME1, true, true, ExecType.CP, TestType.DEFAULT);
+       }
+
+       @Test
+       public void testPCADenseCP() {
+               runPCATest(TEST_NAME1, false, false, ExecType.CP, TestType.DEFAULT);
+       }
+
+       @Test
+       public void testPCASparseCP() {
+               runPCATest(TEST_NAME1, false, true, ExecType.CP, TestType.DEFAULT);
+       }
+
+       @Test
+       public void testPCADenseRewritesSP() {
+               runPCATest(TEST_NAME1, true, false, ExecType.SPARK, TestType.DEFAULT);
+       }
+
+       @Test
+       public void testPCASparseRewritesSP() {
+               runPCATest(TEST_NAME1, true, true, ExecType.SPARK, TestType.DEFAULT);
+       }
+
+       @Test
+       public void testPCADenseSP() {
+               runPCATest(TEST_NAME1, false, false, ExecType.SPARK, TestType.DEFAULT);
+       }
+
+       @Test
+       public void testPCASparseSP() {
+               runPCATest(TEST_NAME1, false, true, ExecType.SPARK, TestType.DEFAULT);
+       }
+
+       @Test
+       public void testPCADenseRewritesCPFuseAll() {
+               runPCATest(TEST_NAME1, true, false, ExecType.CP, TestType.FUSE_ALL);
+       }
+
+       @Test
+       public void testPCASparseRewritesCPFuseAll() {
+               runPCATest(TEST_NAME1, true, true, ExecType.CP, TestType.FUSE_ALL);
+       }
+
+       @Test
+       public void testPCADenseRewritesSPFuseAll() {
+               runPCATest(TEST_NAME1, true, false, ExecType.SPARK, TestType.FUSE_ALL);
+       }
+
+       @Test
+       public void testPCASparseRewritesSPFuseAll() {
+               runPCATest(TEST_NAME1, true, true, ExecType.SPARK, TestType.FUSE_ALL);
+       }
+
+       @Test
+       public void testPCADenseRewritesCPFuseNoRedundancy() {
+               runPCATest(TEST_NAME1, true, false, ExecType.CP, TestType.FUSE_NO_REDUNDANCY);
+       }
+
+       @Test
+       public void testPCASparseRewritesCPFuseNoRedundancy() {
+               runPCATest(TEST_NAME1, true, true, ExecType.CP, TestType.FUSE_NO_REDUNDANCY);
+       }
+
+       @Test
+       public void testPCADenseRewritesSPFuseNoRedundancy() {
+               runPCATest(TEST_NAME1, true, false, ExecType.SPARK, TestType.FUSE_NO_REDUNDANCY);
+       }
+
+       @Test
+       public void testPCASparseRewritesSPFuseNoRedundancy() {
+               runPCATest(TEST_NAME1, true, true, ExecType.SPARK, TestType.FUSE_NO_REDUNDANCY);
+       }
+
+       /**
+        * Runs PCA.dml and the R script on the same random input and compares the results.
+        *
+        * @param testname name of the registered test configuration
+        * @param rewrites value assigned to OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION for this run
+        * @param sparse   if true the input is generated with sparsity2, otherwise with sparsity1
+        * @param instType execution type; SPARK uses cols2 columns, anything else cols1
+        * @param testType codegen configuration returned by getConfigTemplateFile()
+        */
+       private void runPCATest(String testname, boolean rewrites, boolean sparse, ExecType instType, TestType testType)
+       {
+               boolean oldFlag = OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION;
+               ExecMode platformOld = setExecMode(instType);
+               // Without this assignment getConfigTemplateFile() always returns the default
+               // configuration and the FUSE_ALL / FUSE_NO_REDUNDANCY tests are not exercised.
+               currentTestType = testType;
+
+               try {
+                       TestConfiguration config = getTestConfiguration(testname);
+                       loadTestConfiguration(config);
+
+                       fullDMLScriptName = "scripts/algorithms/PCA.dml";
+                       // pass OFMT=text flag, since readDMLMatrixFromHDFS() uses " " separator, not a "," separator.
+                       programArgs = new String[]{ "-explain", "-stats", "-nvargs", "OFMT=TEXT","INPUT="+input("A"),
+                                       "OUTPUT="+output("")};
+
+                       rCmd = getRCmd(inputDir(), expectedDir());
+
+                       OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION = rewrites;
+
+                       //generate actual datasets
+                       int cols = (instType==ExecType.SPARK) ? cols2 : cols1;
+                       double[][] A = getRandomMatrix(rows, cols, 0, 1, sparse?sparsity2:sparsity1, 714);
+                       writeInputMatrixWithMTD("A", A, true);
+
+                       runTest(true, false, null, -1);
+                       runRScript(true);
+
+                       //compare matrices
+                       HashMap<CellIndex, Double> dmleval = readDMLMatrixFromHDFS("dominant.eigen.values");
+                       HashMap<CellIndex, Double> reval   = readRMatrixFromFS("dominant.eigen.values");
+                       HashMap<CellIndex, Double> dmlevec = readDMLMatrixFromHDFS("dominant.eigen.vectors");
+                       // read the R result here; reading the DML output twice made this check vacuous
+                       HashMap<CellIndex, Double> revec   = readRMatrixFromFS("dominant.eigen.vectors");
+                       HashMap<CellIndex, Double> dmlstd  = readDMLMatrixFromHDFS("dominant.eigen.standard.deviations");
+                       HashMap<CellIndex, Double> rstd    = readRMatrixFromFS("dominant.eigen.standard.deviations");
+                       TestUtils.compareMatrices(dmleval, reval, eps, "Stat-DML", "Stat-R");
+                       // NOTE(review): eigenvectors are only unique up to sign, so the two
+                       // implementations may legitimately return v and -v; compare magnitudes.
+                       // TODO confirm this is sufficient for both eigen solvers.
+                       TestUtils.compareMatrices(absValues(dmlevec), absValues(revec), eps, "Stat-DML", "Stat-R");
+                       TestUtils.compareMatrices(dmlstd, rstd, eps, "Stat-DML", "Stat-R");
+                       Assert.assertTrue(heavyHittersContainsSubString("spoof") || heavyHittersContainsSubString("sp_spoof"));
+               }
+               finally {
+                       resetExecMode(platformOld);
+                       OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION = oldFlag;
+                       OptimizerUtils.ALLOW_AUTO_VECTORIZATION = true;
+                       OptimizerUtils.ALLOW_OPERATOR_FUSION = true;
+               }
+       }
+
+       /**
+        * Returns a copy of the given cell map in which every value is replaced by its absolute value.
+        *
+        * @param cells cell index to value map; not modified
+        * @return new map with the same keys and absolute values
+        */
+       private static HashMap<CellIndex, Double> absValues(HashMap<CellIndex, Double> cells) {
+               HashMap<CellIndex, Double> result = new HashMap<>(cells.size());
+               for( CellIndex ix : cells.keySet() )
+                       result.put(ix, Math.abs(cells.get(ix)));
+               return result;
+       }
+
+       /**
+        * Override default configuration with custom test configuration to ensure
+        * scratch space and local temporary directory locations are also updated.
+        *
+        * @return the configuration file matching the current test type
+        */
+       @Override
+       protected File getConfigTemplateFile() {
+               final File template;
+               if( currentTestType == TestType.FUSE_ALL )
+                       template = TEST_CONF_FILE_FUSE_ALL;
+               else if( currentTestType == TestType.FUSE_NO_REDUNDANCY )
+                       template = TEST_CONF_FILE_FUSE_NO_REDUNDANCY;
+               else
+                       template = TEST_CONF_FILE_DEFAULT;
+               // Instrumentation in this test's output log to show custom configuration file used for template.
+               System.out.println("This test case overrides default configuration with " + template.getPath());
+               return template;
+       }
+}
diff --git a/src/test/scripts/functions/codegenalg/Algorithm_PCA.R b/src/test/scripts/functions/codegenalg/Algorithm_PCA.R
new file mode 100644
index 0000000..338e6a1
--- /dev/null
+++ b/src/test/scripts/functions/codegenalg/Algorithm_PCA.R
@@ -0,0 +1,87 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#
+# This script performs Principal Component Analysis (PCA) on the given input data.
+#
+# Arguments:
+#   args[1]  path prefix from which "A.mtx" (MatrixMarket format) is read
+#   args[2]  path prefix under which the three result matrices are written
+#
+
+args <- commandArgs(TRUE)
+library("Matrix")
+
+A = readMM(paste(args[1], "A.mtx", sep=""));
+K = ncol(A);
+projectData = 0;
+model = "";
+center = 0;
+scale = 0;
+
+
+if (model != "") {
+  # reuse existing model to project data
+} else if (model == "") {
+
+  N = nrow(A);
+  D = ncol(A);
+
+  # 1. perform z-scoring (centering and scaling)
+  if (center == 1) {
+    cm = matrix(1, nrow(A), 1) %*% colMeans(A);
+    A = A - cm
+  }
+  if (scale == 1) {
+    cvars = (colSums(A^2));
+    if (center == 1){
+      cvars = (cvars - N*(colMeans(A)^2))/(N-1);
+    }
+    # divide column j by sqrt(cvars[j]); a plain `A / sqrt(cvars)` would recycle
+    # the vector down the rows and scale by the wrong column's value
+    A = t(t(A) / sqrt(cvars));
+  }
+
+  # 2. compute co-variance matrix
+  mu = colSums(A)/N;
+  C = (t(A) %*% A)/(N-1) - (N/(N-1))*(mu) %*% t(mu);
+
+  # 3. compute eigen vectors and values
+  R <- eigen(C);
+  evalues = R$values;
+  evectors = R$vectors;
+
+  # 4. make an index of values sorted according to magnitude of evalues
+  decreasing_Idx = order(as.vector(evalues), decreasing=TRUE);
+  diagmat = table(seq(1,D), decreasing_Idx);
+  # 5. sorts eigen values by decreasing order
+  evalues = diagmat %*% evalues;
+  # 6. sorts eigen vectors column-wise in the order of decreasing eigen values
+  evectors = evectors %*% diagmat;
+
+  # 7. select K dominant eigen vectors (eigen vectors are stored as columns)
+  eval_dominant = evalues[1:K, 1];
+  evec_dominant = evectors[, 1:K];
+
+  # 8. compute the std. deviation of dominant evalues
+  eval_stdev_dominant = sqrt(eval_dominant);
+
+  writeMM(as(eval_stdev_dominant, "CsparseMatrix"), paste(args[2],"dominant.eigen.standard.deviations", sep=""));
+  writeMM(as(eval_dominant, "CsparseMatrix"), paste(args[2], "dominant.eigen.values", sep=""));
+  writeMM(as(evec_dominant, "CsparseMatrix"), paste(args[2],"dominant.eigen.vectors", sep=""));
+}
\ No newline at end of file

Reply via email to