This is an automated email from the ASF dual-hosted git repository.

baunsgaard pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/main by this push:
     new 3d244415da [SYSTEMDS-3741] Python API Builtin countDistinctApprox
3d244415da is described below

commit 3d244415dae9b615a8ada6c16b7deabeabd9bef8
Author: e-strauss <92718421+e-stra...@users.noreply.github.com>
AuthorDate: Tue Sep 3 18:45:04 2024 +0200

    [SYSTEMDS-3741] Python API Builtin countDistinctApprox
    
    Closes #2088
---
 src/main/python/systemds/operator/nodes/matrix.py | 14 ++++++++++++
 src/main/python/tests/matrix/test_aggregations.py | 26 +++++++++++++++++++++++
 2 files changed, 40 insertions(+)

diff --git a/src/main/python/systemds/operator/nodes/matrix.py 
b/src/main/python/systemds/operator/nodes/matrix.py
index a7c4202e88..07566ebfd0 100644
--- a/src/main/python/systemds/operator/nodes/matrix.py
+++ b/src/main/python/systemds/operator/nodes/matrix.py
@@ -260,6 +260,20 @@ class Matrix(OperationNode):
             f"Axis has to be either 0, 1 or None, for column, row or complete 
{self.operation}")
 
 
+    def countDistinctApprox(self, axis: int = None) -> 'OperationNode':
+        """Approximate the number of distinct values in this matrix.
+        :param axis: 0 aggregates per column, 1 per row, None over the whole matrix
+        :return: `Matrix` for axis 0 or 1, `Scalar` for axis None; any other axis raises ValueError
+        """
+        if axis is None:
+            return Scalar(self.sds_context, 'countDistinctApprox', [self])
+        if axis == 0:
+            return Matrix(self.sds_context, 'colCountDistinctApprox', [self])
+        if axis == 1:
+            return Matrix(self.sds_context, 'rowCountDistinctApprox', [self])
+        raise ValueError(
+            f"Axis has to be either 0, 1 or None, for column, row or complete {self.operation}")
+
     def var(self, axis: int = None) -> 'OperationNode':
         """Calculate variance of matrix.
 
diff --git a/src/main/python/tests/matrix/test_aggregations.py 
b/src/main/python/tests/matrix/test_aggregations.py
index 6313615122..8627d2547c 100644
--- a/src/main/python/tests/matrix/test_aggregations.py
+++ b/src/main/python/tests/matrix/test_aggregations.py
@@ -120,6 +120,32 @@ class TestMatrixAggFn(unittest.TestCase):
         self.assertTrue(np.allclose(
             self.sds.from_numpy(m2).trace().compute(), m2.trace()))
 
+    def test_countDistinctApprox1(self):
+        data = np.round(np.random.random((1000, 1000)) * 99)  # at most 100 distinct values (0..99)
+        approx = self.sds.from_numpy(data).countDistinctApprox().compute()
+        exact = len(np.unique(data))
+        # NOTE(review): rtol=1 tolerates a 100% relative error; tighten if a stricter bound (e.g. 1%) is intended
+        self.assertTrue(np.allclose(approx, exact, rtol=1))
+
+    def test_countDistinctApprox2(self):
+        data = np.round(np.random.random((10000, 100)) * 999)  # at most 1000 distinct values (0..999)
+        approx = self.sds.from_numpy(data).countDistinctApprox(0).compute()
+        exact = [len(np.unique(col)) for col in data.T]  # one exact count per column
+        # NOTE(review): rtol=10 tolerates a 1000% relative error; tighten once the sketch's error bound is confirmed
+        self.assertTrue(np.allclose(approx, exact, rtol=10))
+
+    def test_countDistinctApprox3(self):
+        data = np.round(np.random.random((100, 10000)) * 999)  # at most 1000 distinct values (0..999)
+        approx = self.sds.from_numpy(data).countDistinctApprox(1).compute()
+        exact = np.array([[len(np.unique(row))] for row in data])  # column vector, one exact count per row
+        # NOTE(review): rtol=10 tolerates a 1000% relative error; tighten if a stricter bound (e.g. 1%) is intended
+        self.assertTrue(np.allclose(approx, exact, rtol=10))
+
+    def test_countDistinctApprox4(self):
+        data = np.round(np.random.random((2, 2)))
+        with self.assertRaises(ValueError):  # only axis 0, 1 or None is accepted
+            self.sds.from_numpy(data).countDistinctApprox(axis=2)
+
 
     def test_countDistinct1(self):
         self.assertTrue(np.allclose(

Reply via email to