This is an automated email from the ASF dual-hosted git repository.
ssiddiqi pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new d1e1068 [SYSTEMDS-3260] Builtin for Matthews Correlation Coefficient
DIA project WS2021/22 Closes #1496.
d1e1068 is described below
commit d1e1068d067b02e19477ec7c3cd43e76d920b4ce
Author: Bernhard Leder <[email protected]>
AuthorDate: Tue Jan 11 19:12:02 2022 +0100
[SYSTEMDS-3260] Builtin for Matthews Correlation Coefficient
DIA project WS2021/22
Closes #1496.
---
scripts/builtin/mcc.dml | 80 ++++++++++++
.../java/org/apache/sysds/common/Builtins.java | 1 +
.../functions/builtin/part2/BuiltinMCCTest.java | 137 +++++++++++++++++++++
src/test/scripts/functions/builtin/mcc.R | 31 +++++
src/test/scripts/functions/builtin/mcc.dml | 26 ++++
src/test/scripts/installDependencies.R | 1 +
6 files changed, 276 insertions(+)
diff --git a/scripts/builtin/mcc.dml b/scripts/builtin/mcc.dml
new file mode 100644
index 0000000..60456c7
--- /dev/null
+++ b/scripts/builtin/mcc.dml
@@ -0,0 +1,80 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Built-in function mcc: Matthews' Correlation Coefficient for binary classification evaluation
+#
+# INPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# NAME          TYPE             DEFAULT          MEANING
+# ---------------------------------------------------------------------------------------------
+# predictions   Matrix[Double]   matrix(0,0,0)    Vector of predicted 0/1 values.
+#                                                 (requires setting 'labels' parameter)
+# labels        Matrix[Double]   matrix(0,0,0)    Vector of 0/1 labels.
+# ---------------------------------------------------------------------------------------------
+
+#Output(s)
+# ---------------------------------------------------------------------------------------------
+# NAME          TYPE             DEFAULT          MEANING
+# ---------------------------------------------------------------------------------------------
+# mattCC        Double           ---              Matthews' Correlation Coefficient
+# ---------------------------------------------------------------------------------------------
+
+m_mcc = function(Matrix[Double] predictions = matrix(0,0,0), Matrix[Double] labels = matrix(0,0,0))
+return (Double mattCC)
+{
+  # validation checks: each failed check aborts the script via stop()
+  if ((length(labels) > 0 & sum(labels) == 0))
+    stop("MCC Input Error: labels contains only zeros")
+
+  if (nrow(predictions) != nrow(labels))
+    stop("MCC Input Error: rows in predictions != rows in labels")
+
+  # lower bound of the 0/1 range: only negative entries are rejected here.
+  # A minimum of 1 is legitimate (e.g., predictions that are all ones) and must
+  # not be reported as a non-0/1 vector; all-ones labels are reported by the
+  # single-class check below instead.
+  if (min(labels) < 0 | min(predictions) < 0)
+    stop("MCC Input Error: accepts 0/1 vector only")
+
+  if (min(labels) == max(labels))
+    stop("MCC Input Error: labels contains single class")
+
+  # upper bound of the 0/1 range
+  if (max(labels) > 1 | max(predictions) > 1)
+    stop("MCC Input Error: accepts 0/1 vector only")
+
+  # add 1 to predictions and labels because table does not accept zero
+  labels = labels + 1
+  predictions = predictions + 1
+  # rows index the (shifted) labels, columns the (shifted) predictions;
+  # the trailing 2, 2 request a 2x2 result
+  confM = table(labels, predictions, 2, 2)
+  mattCC = computeMCC(confM)
+}
+
+computeMCC = function(Matrix[Double] confusionM)
+  return (Double mattCC) {
+
+  # Computes Matthews' Correlation Coefficient from a 2x2 confusion matrix.
+  # Cell layout read below: [1,1] = TN, [1,2] = FP, [2,1] = FN, [2,2] = TP.
+  #
+  # Formula (see
+  # https://bmcgenomics.biomedcentral.com/articles/10.1186/s12864-019-6413-7):
+  #   MCC = (TP*TN - FP*FN) / sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN))
+  trueNeg  = as.scalar(confusionM[1,1])
+  falsePos = as.scalar(confusionM[1,2])
+  falseNeg = as.scalar(confusionM[2,1])
+  truePos  = as.scalar(confusionM[2,2])
+
+  smallestRowTotal = min(rowSums(confusionM))
+  smallestColTotal = min(colSums(confusionM))
+
+  if (smallestRowTotal == 0 | smallestColTotal == 0) {
+    # An all-zero row and/or column makes the denominator zero; the epsilon
+    # approximation tends to 0, so 0 is returned directly without computing it.
+    mattCC = 0.0
+  }
+  else {
+    numerator = truePos*trueNeg - falsePos*falseNeg
+    denominator = sqrt((truePos + falsePos) * (truePos + falseNeg) * (trueNeg + falsePos) * (trueNeg + falseNeg))
+    mattCC = numerator / denominator
+  }
+}
diff --git a/src/main/java/org/apache/sysds/common/Builtins.java
b/src/main/java/org/apache/sysds/common/Builtins.java
index 977c1f2..aa9c58c 100644
--- a/src/main/java/org/apache/sysds/common/Builtins.java
+++ b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -200,6 +200,7 @@ public enum Builtins {
MAX("max", "pmax", false),
MAX_POOL("max_pool", false),
MAX_POOL_BACKWARD("max_pool_backward", false),
+ MCC("mcc", true),
MEAN("mean", "avg", false),
MEDIAN("median", false),
MICE("mice", true),
diff --git
a/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinMCCTest.java
b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinMCCTest.java
new file mode 100644
index 0000000..24cb57c
--- /dev/null
+++
b/src/test/java/org/apache/sysds/test/functions/builtin/part2/BuiltinMCCTest.java
@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.functions.builtin.part2;
+
+import org.apache.sysds.common.Types.ExecMode;
+
+
+import org.apache.commons.lang.ArrayUtils;
+import org.apache.sysds.runtime.lineage.LineageCacheConfig.ReuseCacheType;
+import org.apache.sysds.runtime.matrix.data.MatrixValue.CellIndex;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Assert;
+import org.junit.Test;
+
+
+/**
+ * Tests the DML builtin "mcc" (Matthews' Correlation Coefficient).
+ * Valid inputs are compared against the result of the R reference script;
+ * invalid inputs are expected to make the DML script fail.
+ */
+public class BuiltinMCCTest extends AutomatedTestBase {
+	private final static String TEST_NAME = "mcc";
+	private final static String TEST_DIR = "functions/builtin/";
+	private final static String TEST_CLASS_DIR = TEST_DIR + BuiltinMCCTest.class.getSimpleName() + "/";
+
+	private final static String OUTPUT_IDENTIFIER = "mattCorrCoeff.scalar";
+	// maximum tolerated difference between the DML and the R result
+	private final static double TOLERANCE = 1e-10;
+
+	@Override
+	public void setUp() {
+		addTestConfiguration(TEST_NAME,
+			new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[]{OUTPUT_IDENTIFIER}));
+	}
+
+	@Test
+	public void testMCCCorrect1() {
+		double[][] predictions = {{1},{1},{1},{0},{1},{1},{0},{0},{0},{1}};
+		double[][] labels = {{1},{1},{1},{1},{1},{0},{0},{0},{0},{0}};
+		runMCCTest(predictions, labels, false, ExecMode.HYBRID, false);
+	}
+
+	@Test
+	public void testMCCCorrect_2() {
+		// predictions contain a single class only
+		double[][] predictions = {{0},{0},{0},{0},{0},{0},{0},{0},{0},{0}};
+		double[][] labels = {{1},{1},{1},{0},{1},{0},{1},{1},{0},{1}};
+		runMCCTest(predictions, labels, false, ExecMode.HYBRID, false);
+	}
+
+	@Test
+	public void testMCCIncorrectSP() {
+		// valid 0/1 input, executed in Spark mode; no exception is expected
+		double[][] predictions = {{0},{1},{1},{0},{1},{1},{0},{0},{0},{1}};
+		double[][] labels = {{1},{1},{1},{1},{1},{0},{0},{0},{0},{0}};
+		runMCCTest(predictions, labels, false, ExecMode.SPARK, false);
+	}
+
+	@Test
+	public void testMCCCorrectLarge() {
+		double[][] predictions = getRandomMatrix(100000, 1, 0.0, 1.0, 1.0, 7);
+		double[][] labels = getRandomMatrix(100000, 1, 0.0, 1.0, 1.0, 11);
+		// turn the random values from [0,1] into 0/1 entries
+		roundFirstColumn(predictions);
+		roundFirstColumn(labels);
+		runMCCTest(predictions, labels, false, ExecMode.HYBRID, false);
+	}
+
+	@Test
+	public void testMCCIncorrect_1() {
+		// neither vector contains 0/1 values
+		double[][] predictions = {{-1},{-1},{-1},{-1},{-1},{-1},{-1},{-1},{-1},{-1}};
+		double[][] labels = {{99},{99},{99},{99},{99},{99},{99},{99},{99},{99}};
+		runMCCTest(predictions, labels, false, ExecMode.HYBRID, true);
+	}
+
+	@Test
+	public void testMCCIncorrect_2() {
+		// a single out-of-range entry in each vector
+		double[][] predictions = {{1},{1},{1},{0},{1},{1},{0},{0},{0},{-1}};
+		double[][] labels = {{99},{1},{1},{1},{1},{0},{0},{0},{0},{0}};
+		runMCCTest(predictions, labels, false, ExecMode.HYBRID, true);
+	}
+
+	/** Rounds the first entry of every row in place. */
+	private static void roundFirstColumn(double[][] rows) {
+		for (double[] row : rows)
+			row[0] = Math.round(row[0]);
+	}
+
+	/**
+	 * Writes both input vectors, runs the DML script and, unless an exception
+	 * is expected, compares its scalar output with the R reference result.
+	 *
+	 * @param predictions     column vector of predictions
+	 * @param labels          column vector of labels
+	 * @param lineage         if true, the "-stats" and "-lineage" arguments are appended
+	 * @param mode            execution mode set for the duration of the test
+	 * @param expectException whether the DML run is expected to fail
+	 */
+	private void runMCCTest(double[][] predictions, double[][] labels, boolean lineage, ExecMode mode, boolean expectException) {
+		final ExecMode previousMode = setExecMode(mode);
+		try {
+			loadTestConfiguration(getTestConfiguration(TEST_NAME));
+			final String scriptHome = SCRIPT_DIR + TEST_DIR;
+			fullDMLScriptName = scriptHome + TEST_NAME + ".dml";
+
+			String[] baseArgs = new String[]{
+				"-nvargs",
+				"predictions="+input("predictions"),
+				"labels=" + input("labels"),
+				"mattCorrCoeff=" + output(OUTPUT_IDENTIFIER),
+			};
+			programArgs = !lineage ? baseArgs
+				: (String[]) ArrayUtils.addAll(baseArgs, new String[] {
+					"-stats","-lineage", ReuseCacheType.REUSE_HYBRID.name().toLowerCase()});
+
+			writeInputMatrixWithMTD("labels", labels, true);
+			writeInputMatrixWithMTD("predictions", predictions, true);
+
+			fullRScriptName = scriptHome + TEST_NAME + ".R";
+			rCmd = getRCmd(inputDir(), expected(OUTPUT_IDENTIFIER));
+
+			runTest(true, expectException, null, -1);
+			if (expectException)
+				return; // nothing to compare; the finally block still restores the mode
+
+			runRScript(true);
+			Double fromDML = readDMLScalarFromOutputDir(OUTPUT_IDENTIFIER).get(new CellIndex(1,1));
+			// the coefficient must lie within [-1, 1]
+			Assert.assertTrue(fromDML >= -1 && fromDML <= 1);
+			Double fromR = readRScalarFromExpectedDir(OUTPUT_IDENTIFIER).get(new CellIndex(1,1));
+			TestUtils.compareScalars(fromDML, fromR, TOLERANCE);
+		}
+		finally {
+			resetExecMode(previousMode);
+		}
+	}
+
+}
diff --git a/src/test/scripts/functions/builtin/mcc.R
b/src/test/scripts/functions/builtin/mcc.R
new file mode 100644
index 0000000..32cf442
--- /dev/null
+++ b/src/test/scripts/functions/builtin/mcc.R
@@ -0,0 +1,31 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+# Reference implementation for the mcc builtin test.
+# args[1]: input directory prefix containing predictions.mtx and labels.mtx
+# args[2]: file the resulting coefficient is written to
+args <- commandArgs(TRUE)
+options(digits=22)
+
+library("Matrix")
+library("mltools")
+
+inputPrefix <- args[1]
+resultFile <- args[2]
+
+# read both MatrixMarket files and flatten them into plain vectors
+predictions <- as.vector(readMM(paste0(inputPrefix, "predictions.mtx")))
+labels <- as.vector(readMM(paste0(inputPrefix, "labels.mtx")))
+
+# Matthews' Correlation Coefficient as computed by mltools
+mattCorrCoeff <- mcc(preds=predictions, actuals=labels)
+
+write(mattCorrCoeff, resultFile)
diff --git a/src/test/scripts/functions/builtin/mcc.dml
b/src/test/scripts/functions/builtin/mcc.dml
new file mode 100644
index 0000000..ad1aa6f
--- /dev/null
+++ b/src/test/scripts/functions/builtin/mcc.dml
@@ -0,0 +1,26 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Test driver for the mcc builtin: reads the two input vectors given as
+# named arguments, computes the coefficient and writes it to $mattCorrCoeff.
+predVector = read($predictions)
+labelVector = read($labels)
+
+coefficient = mcc(predictions=predVector, labels=labelVector)
+write(coefficient, $mattCorrCoeff)
diff --git a/src/test/scripts/installDependencies.R
b/src/test/scripts/installDependencies.R
index 5a19477..af89f2b 100644
--- a/src/test/scripts/installDependencies.R
+++ b/src/test/scripts/installDependencies.R
@@ -63,6 +63,7 @@ custom_install("class");
custom_install("unbalanced");
custom_install("naivebayes");
custom_install("BiocManager");
+custom_install("mltools");
BiocManager::install("rhdf5");
print("Installation Done")