This is an automated email from the ASF dual-hosted git repository.

ssiddiqi pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/main by this push:
     new d1e1068  [SYSTEMDS-3260] Builtin for Matthews Correlation Coefficient 
DIA project WS2021/22 Closes #1496.
d1e1068 is described below

commit d1e1068d067b02e19477ec7c3cd43e76d920b4ce
Author: Bernhard Leder <[email protected]>
AuthorDate: Tue Jan 11 19:12:02 2022 +0100

    [SYSTEMDS-3260] Builtin for Matthews Correlation Coefficient
    DIA project WS2021/22
    Closes #1496.
---
 scripts/builtin/mcc.dml                            |  80 ++++++++++++
 .../java/org/apache/sysds/common/Builtins.java     |   1 +
 .../functions/builtin/part2/BuiltinMCCTest.java    | 137 +++++++++++++++++++++
 src/test/scripts/functions/builtin/mcc.R           |  31 +++++
 src/test/scripts/functions/builtin/mcc.dml         |  26 ++++
 src/test/scripts/installDependencies.R             |   1 +
 6 files changed, 276 insertions(+)

diff --git a/scripts/builtin/mcc.dml b/scripts/builtin/mcc.dml
new file mode 100644
index 0000000..60456c7
--- /dev/null
+++ b/scripts/builtin/mcc.dml
@@ -0,0 +1,80 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Built-in function mcc: Matthews' Correlation Coefficient for binary classification evaluation
+#
+# INPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# NAME            TYPE               DEFAULT     MEANING
+# ---------------------------------------------------------------------------------------------
+# predictions     Matrix[Double]       ---     Vector of predicted 0/1 values
+#                                                 (same number of rows as 'labels')
+# labels          Matrix[Double]       ---     Vector of true 0/1 labels
+#                                                 (must contain both classes)
+# ---------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ---------------------------------------------------------------------------------------------
+# NAME            TYPE    DEFAULT     MEANING
+# ---------------------------------------------------------------------------------------------
+# mattCC          Double    ---       Matthews' Correlation Coefficient
+# ---------------------------------------------------------------------------------------------
+
+m_mcc = function(Matrix[Double] predictions = matrix(0,0,0), Matrix[Double] labels = matrix(0,0,0))
+return (Double mattCC)
+{
+  # validation checks
+  # the declared defaults are empty matrices, so reject them with a clear message
+  if (length(labels) == 0 | length(predictions) == 0)
+    stop("MCC Input Error: predictions and labels must not be empty")
+
+  if (nrow(predictions) != nrow(labels))
+    stop("MCC Input Error: rows in predictions != rows in labels")
+
+  # a cell is invalid iff it differs from both 0 and 1; this also rejects negative,
+  # fractional and NaN cells, and (unlike a min(...) != 0 test) accepts constant
+  # all-ones predictions, which computeMCC maps to 0 just like all-zeros predictions
+  if (sum((labels != 0) * (labels != 1)) > 0 | sum((predictions != 0) * (predictions != 1)) > 0)
+    stop("MCC Input Error: accepts 0/1 vector only")
+
+  # checked after the 0/1 validation so that a zero sum really means "only zeros"
+  if (sum(labels) == 0)
+    stop("MCC Input Error: labels contains only zeros")
+
+  # with 0/1 values and at least one 1, equal min and max means "only ones"
+  if (min(labels) == max(labels))
+    stop("MCC Input Error: labels contains single class")
+
+  # add 1 to predictions and labels because table does not accept zero
+  labels = labels + 1
+  predictions = predictions + 1
+  # 2x2 confusion matrix: rows index the labels, columns index the predictions
+  confM = table(labels, predictions, 2, 2)
+  mattCC = computeMCC(confM)
+}
+
+computeMCC = function(Matrix[Double] confusionM)
+  return (Double mattCC)
+{
+  # Computes Matthews' Correlation Coefficient from a 2x2 confusion matrix whose cells
+  # are read as [1,1]=TN, [1,2]=FP, [2,1]=FN, [2,2]=TP, using the formula from
+  # https://bmcgenomics.biomedcentral.com/articles/10.1186/s12864-019-6413-7
+  #   MCC = (TP*TN - FP*FN) / sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN))
+  trueNeg  = as.scalar(confusionM[1,1])
+  falsePos = as.scalar(confusionM[1,2])
+  falseNeg = as.scalar(confusionM[2,1])
+  truePos  = as.scalar(confusionM[2,2])
+
+  # an all-zero row or column makes the denominator zero; the result is then set
+  # to 0.0 directly instead of evaluating the formula
+  hasZeroRow = (min(rowSums(confusionM)) == 0)
+  hasZeroCol = (min(colSums(confusionM)) == 0)
+
+  if (hasZeroRow | hasZeroCol) {
+    mattCC = 0.0
+  } else {
+    numerator = truePos*trueNeg - falsePos*falseNeg
+    denominator = sqrt((truePos + falsePos) * (truePos + falseNeg) * (trueNeg + falsePos) * (trueNeg + falseNeg))
+    mattCC = numerator / denominator
+  }
+}
diff --git a/src/main/java/org/apache/sysds/common/Builtins.java b/src/main/java/org/apache/sysds/common/Builtins.java
index 977c1f2..aa9c58c 100644
--- a/src/main/java/org/apache/sysds/common/Builtins.java
+++ b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -200,6 +200,7 @@ public enum Builtins {
        MAX("max", "pmax", false),
        MAX_POOL("max_pool", false),
        MAX_POOL_BACKWARD("max_pool_backward", false),
+       MCC("mcc", true),
        MEAN("mean", "avg", false),
        MEDIAN("median", false),
        MICE("mice", true),
diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinMCCTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinMCCTest.java
new file mode 100644
index 0000000..24cb57c
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinMCCTest.java
@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.functions.builtin.part2;
+
+import org.apache.sysds.common.Types.ExecMode;
+
+
+import org.apache.commons.lang.ArrayUtils;
+import org.apache.sysds.runtime.lineage.LineageCacheConfig.ReuseCacheType;
+import org.apache.sysds.runtime.matrix.data.MatrixValue.CellIndex;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Assert;
+import org.junit.Test;
+
+
+/**
+ * Component test for the {@code mcc} builtin: runs {@code mcc.dml} and, for inputs that are
+ * expected to succeed, compares its scalar output with the result of the R script {@code mcc.R}.
+ */
+public class BuiltinMCCTest extends AutomatedTestBase {
+    private static final String TEST_NAME = "mcc";
+    private static final String TEST_DIR = "functions/builtin/";
+    private static final String TEST_CLASS_DIR = TEST_DIR + BuiltinMCCTest.class.getSimpleName() + "/";
+
+    private static final String OUTPUT_IDENTIFIER = "mattCorrCoeff.scalar";
+    private static final double epsilon = 1e-10;
+
+    @Override
+    public void setUp() {
+        addTestConfiguration(TEST_NAME,
+            new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[]{OUTPUT_IDENTIFIER}));
+    }
+
+    /** Mixed 0/1 predictions against labels containing both classes, HYBRID mode. */
+    @Test
+    public void testMCCCorrect1() {
+        double[][] predictions = {{1},{1},{1},{0},{1},{1},{0},{0},{0},{1}};
+        double[][] labels = {{1},{1},{1},{1},{1},{0},{0},{0},{0},{0}};
+        final boolean shouldFail = false;
+        runMCCTest(predictions, labels, false, ExecMode.HYBRID, shouldFail);
+    }
+
+    /** All-zero predictions against labels containing both classes, HYBRID mode. */
+    @Test
+    public void testMCCCorrect_2() {
+        double[][] predictions = {{0},{0},{0},{0},{0},{0},{0},{0},{0},{0}};
+        double[][] labels = {{1},{1},{1},{0},{1},{0},{1},{1},{0},{1}};
+        final boolean shouldFail = false;
+        runMCCTest(predictions, labels, false, ExecMode.HYBRID, shouldFail);
+    }
+
+    /** Valid 0/1 inputs executed in SPARK mode; no exception is expected. */
+    @Test
+    public void testMCCIncorrectSP() {
+        double[][] predictions = {{0},{1},{1},{0},{1},{1},{0},{0},{0},{1}};
+        double[][] labels = {{1},{1},{1},{1},{1},{0},{0},{0},{0},{0}};
+        final boolean shouldFail = false;
+        runMCCTest(predictions, labels, false, ExecMode.SPARK, shouldFail);
+    }
+
+    /** 100000 random values per vector, rounded to 0/1, HYBRID mode. */
+    @Test
+    public void testMCCCorrectLarge() {
+        double[][] predictions = getRandomMatrix(100000, 1, 0.0, 1.0, 1.0, 7);
+        double[][] labels = getRandomMatrix(100000, 1, 0.0, 1.0, 1.0, 11);
+        roundFirstColumn(predictions);
+        roundFirstColumn(labels);
+        final boolean shouldFail = false;
+        runMCCTest(predictions, labels, false, ExecMode.HYBRID, shouldFail);
+    }
+
+    /** Constant vectors of -1 and 99; the DML run is expected to throw. */
+    @Test
+    public void testMCCIncorrect_1() {
+        double[][] predictions = {{-1},{-1},{-1},{-1},{-1},{-1},{-1},{-1},{-1},{-1}};
+        double[][] labels = {{99},{99},{99},{99},{99},{99},{99},{99},{99},{99}};
+        final boolean shouldFail = true;
+        runMCCTest(predictions, labels, false, ExecMode.HYBRID, shouldFail);
+    }
+
+    /** Mostly 0/1 vectors with a single -1 and a single 99; the DML run is expected to throw. */
+    @Test
+    public void testMCCIncorrect_2() {
+        double[][] predictions = {{1},{1},{1},{0},{1},{1},{0},{0},{0},{-1}};
+        double[][] labels = {{99},{1},{1},{1},{1},{0},{0},{0},{0},{0}};
+        final boolean shouldFail = true;
+        runMCCTest(predictions, labels, false, ExecMode.HYBRID, shouldFail);
+    }
+
+    /** Replaces the first entry of every row with its {@link Math#round(double)} value. */
+    private static void roundFirstColumn(double[][] matrix) {
+        for (double[] row : matrix) {
+            row[0] = Math.round(row[0]);
+        }
+    }
+
+    /**
+     * Writes both input vectors, runs the DML script in the given execution mode and, unless an
+     * exception is expected, runs the R script and compares the two scalar results.
+     *
+     * @param predictions     values written as input "predictions"
+     * @param labels          values written as input "labels"
+     * @param lineage         if true, appends "-stats", "-lineage" and "reuse_hybrid" to the program arguments
+     * @param mode            execution mode set for the duration of the run and reset afterwards
+     * @param expectException value passed to {@code runTest} as the exception-expected flag; if true, the R comparison is skipped
+     */
+    private void runMCCTest(double[][] predictions, double[][] labels, boolean lineage, ExecMode mode,
+        boolean expectException) {
+        final ExecMode previousMode = setExecMode(mode);
+        try {
+            loadTestConfiguration(getTestConfiguration(TEST_NAME));
+            final String scriptHome = SCRIPT_DIR + TEST_DIR;
+            fullDMLScriptName = scriptHome + TEST_NAME + ".dml";
+
+            String[] dmlArgs = {
+                "-nvargs",
+                "predictions=" + input("predictions"),
+                "labels=" + input("labels"),
+                "mattCorrCoeff=" + output(OUTPUT_IDENTIFIER)
+            };
+            if (lineage) {
+                String[] lineageArgs = {"-stats", "-lineage", ReuseCacheType.REUSE_HYBRID.name().toLowerCase()};
+                dmlArgs = (String[]) ArrayUtils.addAll(dmlArgs, lineageArgs);
+            }
+            programArgs = dmlArgs;
+
+            writeInputMatrixWithMTD("labels", labels, true);
+            writeInputMatrixWithMTD("predictions", predictions, true);
+
+            fullRScriptName = scriptHome + TEST_NAME + ".R";
+            rCmd = getRCmd(inputDir(), expected(OUTPUT_IDENTIFIER));
+
+            runTest(true, expectException, null, -1);
+            if (expectException) {
+                // nothing to compare when the DML run is expected to fail
+                return;
+            }
+
+            runRScript(true);
+            final CellIndex scalarCell = new CellIndex(1, 1);
+            Double dmlResult = readDMLScalarFromOutputDir(OUTPUT_IDENTIFIER).get(scalarCell);
+            // the DML result must lie in [-1, 1]
+            Assert.assertTrue(-1 <= dmlResult && dmlResult <= 1);
+            Double rResult = readRScalarFromExpectedDir(OUTPUT_IDENTIFIER).get(scalarCell);
+            TestUtils.compareScalars(dmlResult, rResult, epsilon);
+        } finally {
+            resetExecMode(previousMode);
+        }
+    }
+
+}
diff --git a/src/test/scripts/functions/builtin/mcc.R b/src/test/scripts/functions/builtin/mcc.R
new file mode 100644
index 0000000..32cf442
--- /dev/null
+++ b/src/test/scripts/functions/builtin/mcc.R
@@ -0,0 +1,31 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+# Reference script for the mcc builtin test.
+# args[1]: path prefix under which predictions.mtx and labels.mtx are read
+# args[2]: file the resulting scalar is written to
+args <- commandArgs(TRUE)
+options(digits=22)
+
+library("Matrix")
+library("mltools")
+
+inputPrefix <- args[1]
+predictions <- as.vector(readMM(paste0(inputPrefix, "predictions.mtx")))
+labels <- as.vector(readMM(paste0(inputPrefix, "labels.mtx")))
+
+# Matthews correlation coefficient of the predictions with respect to the labels
+mattCorrCoeff <- mcc(preds=predictions, actuals=labels)
+
+write(mattCorrCoeff, args[2])
diff --git a/src/test/scripts/functions/builtin/mcc.dml b/src/test/scripts/functions/builtin/mcc.dml
new file mode 100644
index 0000000..ad1aa6f
--- /dev/null
+++ b/src/test/scripts/functions/builtin/mcc.dml
@@ -0,0 +1,26 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Test driver: reads the two input matrices, evaluates the mcc builtin on them
+# and writes the resulting scalar to the path given by $mattCorrCoeff.
+yTrue = read($labels)
+yPred = read($predictions)
+
+mattCorrCoeff = mcc(predictions=yPred, labels=yTrue)
+write(mattCorrCoeff, $mattCorrCoeff)
diff --git a/src/test/scripts/installDependencies.R b/src/test/scripts/installDependencies.R
index 5a19477..af89f2b 100644
--- a/src/test/scripts/installDependencies.R
+++ b/src/test/scripts/installDependencies.R
@@ -63,6 +63,7 @@ custom_install("class");
 custom_install("unbalanced");
 custom_install("naivebayes");
 custom_install("BiocManager");
+custom_install("mltools");
 BiocManager::install("rhdf5");
 
 print("Installation Done")

Reply via email to