This is an automated email from the ASF dual-hosted git repository. baunsgaard pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
     new 3d244415da [SYSTEMDS-3741] Python API Builtin countDistinctApprox
3d244415da is described below

commit 3d244415dae9b615a8ada6c16b7deabeabd9bef8
Author: e-strauss <92718421+e-stra...@users.noreply.github.com>
AuthorDate: Tue Sep 3 18:45:04 2024 +0200

    [SYSTEMDS-3741] Python API Builtin countDistinctApprox

    Closes #2088
---
 src/main/python/systemds/operator/nodes/matrix.py | 14 ++++++++++++
 src/main/python/tests/matrix/test_aggregations.py | 26 +++++++++++++++++++++++
 2 files changed, 40 insertions(+)

diff --git a/src/main/python/systemds/operator/nodes/matrix.py b/src/main/python/systemds/operator/nodes/matrix.py
index a7c4202e88..07566ebfd0 100644
--- a/src/main/python/systemds/operator/nodes/matrix.py
+++ b/src/main/python/systemds/operator/nodes/matrix.py
@@ -260,6 +260,20 @@ class Matrix(OperationNode):
             f"Axis has to be either 0, 1 or None, for column, row or complete {self.operation}")
 
+    def countDistinctApprox(self, axis: int = None) -> 'OperationNode':
+        """Calculate the approximate number of distinct values of matrix.
+        :param axis: can be 0 or 1 to do either row or column aggregation
+        :return: `Matrix` representing operation
+        """
+        if axis == 0:
+            return Matrix(self.sds_context, 'colCountDistinctApprox', [self])
+        elif axis == 1:
+            return Matrix(self.sds_context, 'rowCountDistinctApprox', [self])
+        elif axis is None:
+            return Scalar(self.sds_context, 'countDistinctApprox', [self])
+        raise ValueError(
+            f"Axis has to be either 0, 1 or None, for column, row or complete {self.operation}")
+
     def var(self, axis: int = None) -> 'OperationNode':
         """Calculate variance of matrix.
 
diff --git a/src/main/python/tests/matrix/test_aggregations.py b/src/main/python/tests/matrix/test_aggregations.py
index 6313615122..8627d2547c 100644
--- a/src/main/python/tests/matrix/test_aggregations.py
+++ b/src/main/python/tests/matrix/test_aggregations.py
@@ -120,6 +120,32 @@ class TestMatrixAggFn(unittest.TestCase):
         self.assertTrue(np.allclose(
             self.sds.from_numpy(m2).trace().compute(), m2.trace()))
 
+    def test_countDistinctApprox1(self):
+        distinct = 100
+        m = np.round(np.random.random((1000, 1000))*(distinct - 1))
+        # allow and error of 1%
+        self.assertTrue(np.allclose(
+            self.sds.from_numpy(m).countDistinctApprox().compute(), len(np.unique(m)), 1))
+
+    def test_countDistinctApprox2(self):
+        distinct = 1000
+        m = np.round(np.random.random((10000, 100))*(distinct - 1))
+        # allow and error of 1%
+        self.assertTrue(np.allclose(
+            self.sds.from_numpy(m).countDistinctApprox(0).compute(), [len(np.unique(col))*100 for col in m.T], 10))
+
+    def test_countDistinctApprox3(self):
+        distinct = 1000
+        m = np.round(np.random.random((100, 10000))*(distinct - 1))
+        # allow and error of 1%
+        self.assertTrue(np.allclose(
+            self.sds.from_numpy(m).countDistinctApprox(1).compute(), np.array([[len(np.unique(col))] for col in m]), 10))
+
+    def test_countDistinctApprox4(self):
+        m = np.round(np.random.random((2, 2)))
+        with self.assertRaises(ValueError):
+            self.sds.from_numpy(m).countDistinctApprox(2)
+
     def test_countDistinct1(self):
         self.assertTrue(np.allclose(