This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemml.git
The following commit(s) were added to refs/heads/master by this push:
     new 955365c  [SYSTEMML-2121] PCA test for codegenalg suite
955365c is described below

commit 955365c5da1a916541d734a4e9494ab61c932503
Author: Janardhan Pulivarthi <j...@protonmail.com>
AuthorDate: Sat Apr 25 22:15:06 2020 +0200

    [SYSTEMML-2121] PCA test for codegenalg suite
    
    This patch adds a test case for algorithm test with codegen enabled
    against an R script. The test matrix is as follows:
    
            | Rewrite | Sparse | FuseAll  | FuseNoRedundancy |
            | ------- | ------ | -------- | ---------------- |
    - Spark |    1    |    0   |    0     |        0         |
    or CP   |    1    |    1   |    0     |        0         |
            |    0    |    0   |    0     |        0         |
            |    0    |    1   |    0     |        0         |
            |    0    |    0   |    1     |        0         |
            |    0    |    1   |    1     |        0         |
            |    0    |    0   |    0     |        1         |
            |    0    |    1   |    0     |        1         |
    
    Closes #889.
---
 scripts/algorithms/PCA.dml                         |  14 +-
 .../functions/codegenalg/partone/AlgorithmPCA.java | 233 +++++++++++++++++++++
 .../scripts/functions/codegenalg/Algorithm_PCA.R   |  87 +++++++++
 3 files changed, 321 insertions(+), 13 deletions(-)

diff --git a/scripts/algorithms/PCA.dml b/scripts/algorithms/PCA.dml
index d165351..ea7afd7 100644
--- a/scripts/algorithms/PCA.dml
+++ b/scripts/algorithms/PCA.dml
@@ -62,19 +62,7 @@ if (model != "") {
 	D = ncol(A);
 
 	# perform z-scoring (centering and scaling)
-	if (center == 1) {
-	    cm = colMeans(A);
-	    A = A - cm;
-	}
-	if (scale == 1) {
-	    cvars = (colSums (A^2));
-	    if (center == 1){
-	        cm = colMeans(A);
-	        cvars = (cvars - N*(cm^2))/(N-1);
-	    }
-	    Azscored = (A)/sqrt(cvars);
-	    A = Azscored;
-	}
+	A = scale(A, center==1, scale==1);
 
 	# co-variance matrix
 	mu = colSums(A)/N;
diff --git a/src/test/java/org/apache/sysds/test/functions/codegenalg/partone/AlgorithmPCA.java b/src/test/java/org/apache/sysds/test/functions/codegenalg/partone/AlgorithmPCA.java
new file mode 100644
index 0000000..e0a1906
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/functions/codegenalg/partone/AlgorithmPCA.java
@@ -0,0 +1,233 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.functions.codegenalg.partone;
+
+import java.io.File;
+import java.util.HashMap;
+
+import org.junit.Test;
+import org.apache.sysds.common.Types.ExecMode;
+import org.apache.sysds.hops.OptimizerUtils;
+import org.apache.sysds.lops.LopProperties.ExecType;
+import org.apache.sysds.runtime.matrix.data.MatrixValue.CellIndex;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Assert;
+
+/**
+ * Runs the PCA algorithm script with code generation enabled and compares its
+ * outputs against the R reference script, across CP/Spark execution types,
+ * dense/sparse inputs, rewrites on/off, and the codegen fusion configurations.
+ */
+public class AlgorithmPCA extends AutomatedTestBase
+{
+	private final static String TEST_NAME1 = "Algorithm_PCA";
+	private final static String TEST_DIR = "functions/codegenalg/";
+	private final static String TEST_CLASS_DIR = TEST_DIR + AlgorithmPCA.class.getSimpleName() + "/";
+	private final static String TEST_CONF_DEFAULT = "SystemDS-config-codegen.xml";
+	private final static File TEST_CONF_FILE_DEFAULT = new File(SCRIPT_DIR + TEST_DIR, TEST_CONF_DEFAULT);
+	private final static String TEST_CONF_FUSE_ALL = "SystemDS-config-codegen-fuse-all.xml";
+	private final static File TEST_CONF_FILE_FUSE_ALL = new File(SCRIPT_DIR + TEST_DIR, TEST_CONF_FUSE_ALL);
+	private final static String TEST_CONF_FUSE_NO_REDUNDANCY = "SystemDS-config-codegen-fuse-no-redundancy.xml";
+	private final static File TEST_CONF_FILE_FUSE_NO_REDUNDANCY = new File(SCRIPT_DIR + TEST_DIR,
+		TEST_CONF_FUSE_NO_REDUNDANCY);
+
+	private enum TestType { DEFAULT, FUSE_ALL, FUSE_NO_REDUNDANCY }
+
+	private final static double eps = 1e-5;
+
+	private final static int rows = 1468;
+	private final static int cols1 = 1007;
+	private final static int cols2 = 387;
+
+	private final static double sparsity1 = 0.7; //dense
+	private final static double sparsity2 = 0.1; //sparse
+
+	private TestType currentTestType = TestType.DEFAULT;
+
+	@Override
+	public void setUp() {
+		TestUtils.clearAssertionInformation();
+		addTestConfiguration(TEST_NAME1, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME1, new String[] { "w" }));
+	}
+
+	@Test
+	public void testPCADenseRewritesCP() {
+		runPCATest(TEST_NAME1, true, false, ExecType.CP, TestType.DEFAULT);
+	}
+
+	@Test
+	public void testPCASparseRewritesCP() {
+		runPCATest(TEST_NAME1, true, true, ExecType.CP, TestType.DEFAULT);
+	}
+
+	@Test
+	public void testPCADenseCP() {
+		runPCATest(TEST_NAME1, false, false, ExecType.CP, TestType.DEFAULT);
+	}
+
+	@Test
+	public void testPCASparseCP() {
+		runPCATest(TEST_NAME1, false, true, ExecType.CP, TestType.DEFAULT);
+	}
+
+	@Test
+	public void testPCADenseRewritesSP() {
+		runPCATest(TEST_NAME1, true, false, ExecType.SPARK, TestType.DEFAULT);
+	}
+
+	@Test
+	public void testPCASparseRewritesSP() {
+		runPCATest(TEST_NAME1, true, true, ExecType.SPARK, TestType.DEFAULT);
+	}
+
+	@Test
+	public void testPCADenseSP() {
+		runPCATest(TEST_NAME1, false, false, ExecType.SPARK, TestType.DEFAULT);
+	}
+
+	@Test
+	public void testPCASparseSP() {
+		runPCATest(TEST_NAME1, false, true, ExecType.SPARK, TestType.DEFAULT);
+	}
+
+	@Test
+	public void testPCADenseRewritesCPFuseAll() {
+		runPCATest(TEST_NAME1, true, false, ExecType.CP, TestType.FUSE_ALL);
+	}
+
+	@Test
+	public void testPCASparseRewritesCPFuseAll() {
+		runPCATest(TEST_NAME1, true, true, ExecType.CP, TestType.FUSE_ALL);
+	}
+
+	@Test
+	public void testPCADenseRewritesSPFuseAll() {
+		runPCATest(TEST_NAME1, true, false, ExecType.SPARK, TestType.FUSE_ALL);
+	}
+
+	@Test
+	public void testPCASparseRewritesSPFuseAll() {
+		runPCATest(TEST_NAME1, true, true, ExecType.SPARK, TestType.FUSE_ALL);
+	}
+
+	@Test
+	public void testPCADenseRewritesCPFuseNoRedundancy() {
+		runPCATest(TEST_NAME1, true, false, ExecType.CP, TestType.FUSE_NO_REDUNDANCY);
+	}
+
+	@Test
+	public void testPCASparseRewritesCPFuseNoRedundancy() {
+		runPCATest(TEST_NAME1, true, true, ExecType.CP, TestType.FUSE_NO_REDUNDANCY);
+	}
+
+	@Test
+	public void testPCADenseRewritesSPFuseNoRedundancy() {
+		runPCATest(TEST_NAME1, true, false, ExecType.SPARK, TestType.FUSE_NO_REDUNDANCY);
+	}
+
+	@Test
+	public void testPCASparseRewritesSPFuseNoRedundancy() {
+		runPCATest(TEST_NAME1, true, true, ExecType.SPARK, TestType.FUSE_NO_REDUNDANCY);
+	}
+
+	/**
+	 * Executes PCA.dml and Algorithm_PCA.R on the same random input and compares
+	 * the dominant eigen values, eigen vectors, and standard deviations.
+	 *
+	 * @param testname name of the registered test configuration
+	 * @param rewrites whether algebraic simplification rewrites are enabled
+	 * @param sparse   if true, input is generated with sparsity2, otherwise sparsity1
+	 * @param instType execution type; SPARK uses the smaller column count cols2
+	 * @param testType selects the codegen configuration template file
+	 */
+	private void runPCATest(String testname, boolean rewrites, boolean sparse, ExecType instType, TestType testType)
+	{
+		boolean oldFlag = OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION;
+		ExecMode platformOld = setExecMode(instType);
+		// record the requested type before any configuration is loaded so that
+		// getConfigTemplateFile() returns the matching template instead of DEFAULT
+		currentTestType = testType;
+
+		try {
+			String TEST_NAME = testname;
+			TestConfiguration config = getTestConfiguration(TEST_NAME);
+			loadTestConfiguration(config);
+
+			fullDMLScriptName = "scripts/algorithms/PCA.dml";
+			// pass OFMT=text flag, since readDMLMatrixFromHDFS() uses " " separator, not a "," separator.
+			programArgs = new String[]{ "-explain", "-stats", "-nvargs", "OFMT=TEXT","INPUT="+input("A"),
+				"OUTPUT="+output("")};
+
+			rCmd = getRCmd(inputDir(), expectedDir());
+
+			OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION = rewrites;
+
+			//generate actual datasets
+			int cols = (instType==ExecType.SPARK) ? cols2 : cols1;
+			double[][] A = getRandomMatrix(rows, cols, 0, 1, sparse?sparsity2:sparsity1, 714);
+			writeInputMatrixWithMTD("A", A, true);
+
+			runTest(true, false, null, -1);
+			runRScript(true);
+
+			//compare matrices
+			HashMap<CellIndex, Double> dmleval = readDMLMatrixFromHDFS("dominant.eigen.values");
+			HashMap<CellIndex, Double> reval = readRMatrixFromFS("dominant.eigen.values");
+			HashMap<CellIndex, Double> dmlevec = readDMLMatrixFromHDFS("dominant.eigen.vectors");
+			// read the R reference (previously re-read the DML output, which made this check vacuous)
+			// NOTE(review): eigen vectors are only unique up to sign; normalize signs if DML and R disagree
+			HashMap<CellIndex, Double> revec = readRMatrixFromFS("dominant.eigen.vectors");
+			HashMap<CellIndex, Double> dmlstd = readDMLMatrixFromHDFS("dominant.eigen.standard.deviations");
+			HashMap<CellIndex, Double> rstd = readRMatrixFromFS("dominant.eigen.standard.deviations");
+			TestUtils.compareMatrices(dmleval, reval, eps, "Stat-DML", "Stat-R");
+			TestUtils.compareMatrices(dmlevec, revec, eps, "Stat-DML", "Stat-R");
+			TestUtils.compareMatrices(dmlstd, rstd, eps, "Stat-DML", "Stat-R");
+			Assert.assertTrue(heavyHittersContainsSubString("spoof") || heavyHittersContainsSubString("sp_spoof"));
+		}
+		finally {
+			resetExecMode(platformOld);
+			OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION = oldFlag;
+			OptimizerUtils.ALLOW_AUTO_VECTORIZATION = true;
+			OptimizerUtils.ALLOW_OPERATOR_FUSION = true;
+		}
+	}
+
+	/**
+	 * Override default configuration with custom test configuration to ensure
+	 * scratch space and local temporary directory locations are also updated.
+	 */
+	@Override
+	protected File getConfigTemplateFile() {
+		// Instrumentation in this test's output log to show custom configuration file used for template.
+		String message = "This test case overrides default configuration with ";
+		if(currentTestType == AlgorithmPCA.TestType.FUSE_ALL){
+			System.out.println(message + TEST_CONF_FILE_FUSE_ALL.getPath());
+			return TEST_CONF_FILE_FUSE_ALL;
+		} else if(currentTestType == TestType.FUSE_NO_REDUNDANCY){
+			System.out.println(message + TEST_CONF_FILE_FUSE_NO_REDUNDANCY.getPath());
+			return TEST_CONF_FILE_FUSE_NO_REDUNDANCY;
+		} else {
+			System.out.println(message + TEST_CONF_FILE_DEFAULT.getPath());
+			return TEST_CONF_FILE_DEFAULT;
+		}
+	}
+}
diff --git a/src/test/scripts/functions/codegenalg/Algorithm_PCA.R b/src/test/scripts/functions/codegenalg/Algorithm_PCA.R
new file mode 100644
index 0000000..338e6a1
--- /dev/null
+++ b/src/test/scripts/functions/codegenalg/Algorithm_PCA.R
@@ -0,0 +1,87 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#
+# This script performs Principal Component Analysis (PCA) on the given input data.
+#
+
+args <- commandArgs(TRUE)
+library("Matrix")
+
+A = readMM(paste(args[1], "A.mtx", sep=""));
+K = ncol(A);
+projectData = 0;
+model = "";
+center = 0;
+scale = 0;
+
+
+if (model != "") {
+  # reuse existing model to project data
+} else if (model == "") {
+
+  N = nrow(A);
+  D = ncol(A);
+
+  # 1. perform z-scoring (centering and scaling)
+  if (center == 1) {
+    cm = matrix(1, nrow(A), 1) %*% colMeans(A);
+    A = A - cm
+  }
+  if (scale == 1) {
+    cvars = (colSums(A^2));
+    if (center == 1){
+      #cm = colMeans(A);
+      cvars = (cvars - N*(colMeans(A)^2))/(N-1);
+    }
+    Azscored = A / sqrt(cvars);
+    A = Azscored;
+  }
+
+  # 2. compute co-variance matrix
+  mu = colSums(A)/N;
+  C = (t(A) %*% A)/(N-1) - (N/(N-1))*(mu) %*% t(mu);
+
+  # 3. compute eigen vectors and values
+  R <- eigen(C);
+  evalues = R$values;
+  evectors = R$vectors;
+
+  # 4. make an index of values sorted according to magnitude of evalues
+  decreasing_Idx = order(as.vector(evalues), decreasing=TRUE);
+  diagmat = table(seq(1,D), decreasing_Idx);
+  # 5. sorts eigen values by decreasing order
+  evalues = diagmat %*% evalues;
+  # 6. sorts eigen vectors column-wise in the order of decreasing eigen values
+  evectors = evectors %*% diagmat;
+
+  # 7. select K dominant eigen vectors (eigen vectors are the columns of evectors)
+  nvec = ncol(evectors); # Here `nvec=K`
+  eval_dominant = evalues[1:K, 1];
+  evec_dominant = evectors[, 1:K];
+
+  # 8. compute the std. deviation of dominant evalues
+  eval_stdev_dominant = sqrt(eval_dominant);
+
+  writeMM(as(eval_stdev_dominant, "CsparseMatrix"), paste(args[2],"dominant.eigen.standard.deviations", sep=""));
+  writeMM(as(eval_dominant, "CsparseMatrix"), paste(args[2], "dominant.eigen.values", sep=""));
+  writeMM(as(evec_dominant, "CsparseMatrix"), paste(args[2],"dominant.eigen.vectors", sep=""));
+}
\ No newline at end of file