This is an automated email from the ASF dual-hosted git repository.

baunsgaard pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/main by this push:
     new e0a62ff741 [SYSTEMDS-3740] Python API Builtin countDistinct
e0a62ff741 is described below

commit e0a62ff7416638b559501e2d4cd2be630ae73117
Author: e-strauss <[email protected]>
AuthorDate: Tue Sep 3 18:39:30 2024 +0200

    [SYSTEMDS-3740] Python API Builtin countDistinct
    
    Closes #2087
---
 src/main/python/systemds/operator/nodes/matrix.py | 16 ++++++++
 src/main/python/tests/matrix/test_aggregations.py | 47 +++++++++++++++++++++--
 2 files changed, 60 insertions(+), 3 deletions(-)

diff --git a/src/main/python/systemds/operator/nodes/matrix.py b/src/main/python/systemds/operator/nodes/matrix.py
index 2862686ca3..a7c4202e88 100644
--- a/src/main/python/systemds/operator/nodes/matrix.py
+++ b/src/main/python/systemds/operator/nodes/matrix.py
@@ -244,6 +244,22 @@ class Matrix(OperationNode):
         raise ValueError(
             f"Axis has to be either 0, 1 or None, for column, row or complete {self.operation}")
 
+    def countDistinct(self, axis: int = None) -> 'OperationNode':
+        """Calculate the number of distinct values of matrix.
+
+        :param axis: 0 for column aggregation, 1 for row aggregation, None for the full matrix
+        :return: `Matrix` representing operation
+        """
+        if axis == 0:
+            return Matrix(self.sds_context, 'colCountDistinct', [self])
+        elif axis == 1:
+            return Matrix(self.sds_context, 'rowCountDistinct', [self])
+        elif axis is None:
+            return Scalar(self.sds_context, 'countDistinct', [self])
+        raise ValueError(
+            f"Axis has to be either 0, 1 or None, for column, row or complete {self.operation}")
+
+
     def var(self, axis: int = None) -> 'OperationNode':
         """Calculate variance of matrix.
 
diff --git a/src/main/python/tests/matrix/test_aggregations.py b/src/main/python/tests/matrix/test_aggregations.py
index 1b345d6b21..6313615122 100644
--- a/src/main/python/tests/matrix/test_aggregations.py
+++ b/src/main/python/tests/matrix/test_aggregations.py
@@ -30,10 +30,10 @@ m1 = np.array(np.random.randint(100, size=dim * dim) + 1.01, dtype=np.double)
 m1.shape = (dim, dim)
 m2 = np.array(np.random.randint(5, size=dim * dim) + 1, dtype=np.double)
 m2.shape = (dim, dim)
-
+m3 = np.array(np.random.randint(10, size=dim * dim * 10) + 1, dtype=np.double)
+m3.shape = (dim * 10, dim)
 
 class TestMatrixAggFn(unittest.TestCase):
-
     sds: SystemDSContext = None
 
     @classmethod
@@ -70,7 +70,7 @@ class TestMatrixAggFn(unittest.TestCase):
 
     def test_full(self):
         self.assertTrue(np.allclose(
-            self.sds.full( (2, 3), 10.1).compute(), np.full((2, 3), 10.1)))
+            self.sds.full((2, 3), 10.1).compute(), np.full((2, 3), 10.1)))
 
     def test_seq(self):
         self.assertTrue(np.allclose(
@@ -121,5 +121,46 @@ class TestMatrixAggFn(unittest.TestCase):
             self.sds.from_numpy(m2).trace().compute(), m2.trace()))
 
 
+    def test_countDistinct1(self):
+        self.assertTrue(np.allclose(
+            self.sds.from_numpy(m1).countDistinct().compute(), len(np.unique(m1))))
+
+    def test_countDistinct2(self):
+        self.assertTrue(np.allclose(
+            self.sds.from_numpy(m2).countDistinct().compute(), len(np.unique(m2))))
+
+    def test_countDistinct3(self):
+        self.assertTrue(np.allclose(
+            self.sds.from_numpy(m3).countDistinct().compute(), len(np.unique(m3))))
+
+    def test_countDistinct4(self):
+        self.assertTrue(np.allclose(
+            self.sds.from_numpy(m1).countDistinct(0).compute(), [len(np.unique(col)) for col in m1.T]))
+
+    def test_countDistinct5(self):
+        self.assertTrue(np.allclose(
+            self.sds.from_numpy(m2).countDistinct(0).compute(), [len(np.unique(col)) for col in m2.T]))
+
+    def test_countDistinct6(self):
+        self.assertTrue(np.allclose(
+            self.sds.from_numpy(m3).countDistinct(0).compute(), [len(np.unique(col)) for col in m3.T]))
+
+    def test_countDistinct7(self):
+        self.assertTrue(np.allclose(
+            self.sds.from_numpy(m1).countDistinct(1).compute(), np.array([[len(np.unique(col))] for col in m1])))
+
+    def test_countDistinct8(self):
+        self.assertTrue(np.allclose(
+            self.sds.from_numpy(m2).countDistinct(1).compute(), np.array([[len(np.unique(col))] for col in m2])))
+
+    def test_countDistinct9(self):
+        self.assertTrue(np.allclose(
+            self.sds.from_numpy(m3).countDistinct(1).compute(), np.array([[len(np.unique(col))] for col in m3])))
+
+    def test_countDistinct10(self):
+        with self.assertRaises(ValueError):
+            self.sds.from_numpy(m3).countDistinct(2)
+
+
 if __name__ == "__main__":
     unittest.main(exit=False)

Reply via email to