This is an automated email from the ASF dual-hosted git repository.
baunsgaard pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new e0a62ff741 [SYSTEMDS-3740] Python API Builtin countDistinct
e0a62ff741 is described below
commit e0a62ff7416638b559501e2d4cd2be630ae73117
Author: e-strauss <[email protected]>
AuthorDate: Tue Sep 3 18:39:30 2024 +0200
[SYSTEMDS-3740] Python API Builtin countDistinct
Closes #2087
---
src/main/python/systemds/operator/nodes/matrix.py | 16 ++++++++
src/main/python/tests/matrix/test_aggregations.py | 47 +++++++++++++++++++++--
2 files changed, 60 insertions(+), 3 deletions(-)
diff --git a/src/main/python/systemds/operator/nodes/matrix.py b/src/main/python/systemds/operator/nodes/matrix.py
index 2862686ca3..a7c4202e88 100644
--- a/src/main/python/systemds/operator/nodes/matrix.py
+++ b/src/main/python/systemds/operator/nodes/matrix.py
@@ -244,6 +244,22 @@ class Matrix(OperationNode):
raise ValueError(
f"Axis has to be either 0, 1 or None, for column, row or complete {self.operation}")
+ def countDistinct(self, axis: int = None) -> 'OperationNode':
+ """Calculate the number of distinct values of matrix.
+
+ :param axis: can be 0 or 1 to do either column or row aggregation
+ :return: `Matrix` representing operation
+ """
+ if axis == 0:
+ return Matrix(self.sds_context, 'colCountDistinct', [self])
+ elif axis == 1:
+ return Matrix(self.sds_context, 'rowCountDistinct', [self])
+ elif axis is None:
+ return Scalar(self.sds_context, 'countDistinct', [self])
+ raise ValueError(
+ f"Axis has to be either 0, 1 or None, for column, row or complete {self.operation}")
+
+
def var(self, axis: int = None) -> 'OperationNode':
"""Calculate variance of matrix.
diff --git a/src/main/python/tests/matrix/test_aggregations.py b/src/main/python/tests/matrix/test_aggregations.py
index 1b345d6b21..6313615122 100644
--- a/src/main/python/tests/matrix/test_aggregations.py
+++ b/src/main/python/tests/matrix/test_aggregations.py
@@ -30,10 +30,10 @@ m1 = np.array(np.random.randint(100, size=dim * dim) + 1.01, dtype=np.double)
m1.shape = (dim, dim)
m2 = np.array(np.random.randint(5, size=dim * dim) + 1, dtype=np.double)
m2.shape = (dim, dim)
-
+m3 = np.array(np.random.randint(10, size=dim * dim * 10) + 1, dtype=np.double)
+m3.shape = (dim * 10, dim)
class TestMatrixAggFn(unittest.TestCase):
-
sds: SystemDSContext = None
@classmethod
@@ -70,7 +70,7 @@ class TestMatrixAggFn(unittest.TestCase):
def test_full(self):
self.assertTrue(np.allclose(
- self.sds.full( (2, 3), 10.1).compute(), np.full((2, 3), 10.1)))
+ self.sds.full((2, 3), 10.1).compute(), np.full((2, 3), 10.1)))
def test_seq(self):
self.assertTrue(np.allclose(
@@ -121,5 +121,46 @@ class TestMatrixAggFn(unittest.TestCase):
self.sds.from_numpy(m2).trace().compute(), m2.trace()))
+ def test_countDistinct1(self):
+ self.assertTrue(np.allclose(
+ self.sds.from_numpy(m1).countDistinct().compute(), len(np.unique(m1))))
+
+ def test_countDistinct2(self):
+ self.assertTrue(np.allclose(
+ self.sds.from_numpy(m2).countDistinct().compute(), len(np.unique(m2))))
+
+ def test_countDistinct3(self):
+ self.assertTrue(np.allclose(
+ self.sds.from_numpy(m3).countDistinct().compute(), len(np.unique(m3))))
+
+ def test_countDistinct4(self):
+ self.assertTrue(np.allclose(
+ self.sds.from_numpy(m1).countDistinct(0).compute(), [len(np.unique(col)) for col in m1.T]))
+
+ def test_countDistinct5(self):
+ self.assertTrue(np.allclose(
+ self.sds.from_numpy(m2).countDistinct(0).compute(), [len(np.unique(col)) for col in m2.T]))
+
+ def test_countDistinct6(self):
+ self.assertTrue(np.allclose(
+ self.sds.from_numpy(m3).countDistinct(0).compute(), [len(np.unique(col)) for col in m3.T]))
+
+ def test_countDistinct7(self):
+ self.assertTrue(np.allclose(
+ self.sds.from_numpy(m1).countDistinct(1).compute(), np.array([[len(np.unique(col))] for col in m1])))
+
+ def test_countDistinct8(self):
+ self.assertTrue(np.allclose(
+ self.sds.from_numpy(m2).countDistinct(1).compute(), np.array([[len(np.unique(col))] for col in m2])))
+
+ def test_countDistinct9(self):
+ self.assertTrue(np.allclose(
+ self.sds.from_numpy(m3).countDistinct(1).compute(), np.array([[len(np.unique(col))] for col in m3])))
+
+ def test_countDistinct10(self):
+ with self.assertRaises(ValueError):
+ self.sds.from_numpy(m3).countDistinct(2)
+
+
if __name__ == "__main__":
unittest.main(exit=False)