This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new abe1998c597 [SPARK-39756][PS] Better error messages for missing pandas
scalars
abe1998c597 is described below
commit abe1998c5975a95eee0d73724301f7a7e3668878
Author: Xinrong Meng <[email protected]>
AuthorDate: Sat Jul 16 16:55:53 2022 +0900
[SPARK-39756][PS] Better error messages for missing pandas scalars
### What changes were proposed in this pull request?
pandas scalars are intentionally not reimplemented in pandas API on Spark,
as part of the initial design principle.
Users can use pandas scalars in pandas API on Spark directly.
However, error messages are confusing when users mistakenly assume pandas
scalars are reimplemented, for example, calling `ps.Timestamp` as below
```py
>>> ps.Series([ps.Timestamp(1994, 1, 31)])
Traceback (most recent call last):
...
AttributeError: module 'pyspark.pandas' has no attribute 'Timestamp'
```
Users may jump to the conclusion that a Series of timestamp data is not
supported.
However, we do support that by using `pd.Timestamp` as below:
```py
>>> ps.Series([pd.Timestamp(1994, 1, 31)])
0 1994-01-31
dtype: datetime64[ns]
```
We should inform users to use pandas scalars instead.
In addition, `PandasNotImplementedError` should be raised rather than
`AttributeError` for clarity.
### Why are the changes needed?
Error messages should be clear and explain how to fix the error.
That enhances usability and debuggability, and ultimately user adoption.
### Does this PR introduce _any_ user-facing change?
Yes. Error messages change. For example:
**Before**
```py
>>> ps.Series([ps.Timestamp(1994, 1, 31)])
Traceback (most recent call last):
...
AttributeError: module 'pyspark.pandas' has no attribute 'Timestamp'
```
**After**
```py
>>> ps.Series([ps.Timestamp(1994, 1, 31)])
Traceback (most recent call last):
...
pyspark.pandas.exceptions.PandasNotImplementedError: The scalar
`ps.Timestamp` is not reimplemented in pyspark.pandas; use `pd.Timestamp`.
```
### How was this patch tested?
Unit tests.
Closes #37168 from xinrong-meng/ps_missing_scalar.
Authored-by: Xinrong Meng <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
dev/sparktestsupport/modules.py | 1 +
python/pyspark/pandas/__init__.py | 3 ++
python/pyspark/pandas/exceptions.py | 10 +++++-
python/pyspark/pandas/missing/scalars.py | 29 +++++++++++++++
python/pyspark/pandas/supported_api_gen.py | 5 +--
python/pyspark/pandas/tests/test_scalars.py | 55 +++++++++++++++++++++++++++++
6 files changed, 100 insertions(+), 3 deletions(-)
diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
index 8b06965332b..d776410fd2c 100644
--- a/dev/sparktestsupport/modules.py
+++ b/dev/sparktestsupport/modules.py
@@ -647,6 +647,7 @@ pyspark_pandas = Module(
"pyspark.pandas.tests.test_resample",
"pyspark.pandas.tests.test_reshape",
"pyspark.pandas.tests.test_rolling",
+ "pyspark.pandas.tests.test_scalars",
"pyspark.pandas.tests.test_series_conversion",
"pyspark.pandas.tests.test_series_datetime",
"pyspark.pandas.tests.test_series_string",
diff --git a/python/pyspark/pandas/__init__.py
b/python/pyspark/pandas/__init__.py
index e367ef5e252..518326c0c5e 100644
--- a/python/pyspark/pandas/__init__.py
+++ b/python/pyspark/pandas/__init__.py
@@ -27,6 +27,7 @@ from distutils.version import LooseVersion
from typing import Any
from pyspark.pandas.missing.general_functions import
_MissingPandasLikeGeneralFunctions
+from pyspark.pandas.missing.scalars import _MissingPandasLikeScalars
from pyspark.sql.pandas.utils import require_minimum_pandas_version,
require_minimum_pyarrow_version
try:
@@ -158,6 +159,8 @@ from pyspark.pandas.sql_formatter import sql
def __getattr__(key: str) -> Any:
if key.startswith("__"):
raise AttributeError(key)
+ if hasattr(_MissingPandasLikeScalars, key):
+ raise getattr(_MissingPandasLikeScalars, key)
if hasattr(_MissingPandasLikeGeneralFunctions, key):
return getattr(_MissingPandasLikeGeneralFunctions, key)
else:
diff --git a/python/pyspark/pandas/exceptions.py
b/python/pyspark/pandas/exceptions.py
index 829c753769e..d93f0bf0b68 100644
--- a/python/pyspark/pandas/exceptions.py
+++ b/python/pyspark/pandas/exceptions.py
@@ -69,10 +69,13 @@ class PandasNotImplementedError(NotImplementedError):
method_name: Optional[str] = None,
arg_name: Optional[str] = None,
property_name: Optional[str] = None,
+ scalar_name: Optional[str] = None,
deprecated: bool = False,
reason: str = "",
):
- assert (method_name is None) != (property_name is None)
+ assert [method_name is not None, property_name is not None,
scalar_name is not None].count(
+ True
+ ) == 1
self.class_name = class_name
self.method_name = method_name
self.arg_name = arg_name
@@ -95,6 +98,11 @@ class PandasNotImplementedError(NotImplementedError):
msg = "The method `{0}.{1}()` is not
implemented{2}".format(
class_name, method_name, reason
)
+ elif scalar_name is not None:
+ msg = (
+ "The scalar `{0}.{1}` is not reimplemented in pyspark.pandas;"
+ " use `pd.{1}`.".format(class_name, scalar_name)
+ )
else:
if deprecated:
msg = (
diff --git a/python/pyspark/pandas/missing/scalars.py
b/python/pyspark/pandas/missing/scalars.py
new file mode 100644
index 00000000000..a9c0277be01
--- /dev/null
+++ b/python/pyspark/pandas/missing/scalars.py
@@ -0,0 +1,29 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from pyspark.pandas.exceptions import PandasNotImplementedError
+
+
+def _unsupported_scalar(scalar_name):
+ return PandasNotImplementedError(class_name="ps", scalar_name=scalar_name)
+
+
+class _MissingPandasLikeScalars:
+ Timestamp = _unsupported_scalar("Timestamp")
+ Timedelta = _unsupported_scalar("Timedelta")
+ Period = _unsupported_scalar("Period")
+ Interval = _unsupported_scalar("Interval")
+ Categorical = _unsupported_scalar("Categorical")
diff --git a/python/pyspark/pandas/supported_api_gen.py
b/python/pyspark/pandas/supported_api_gen.py
index 019ed0ce254..df4e11ebd8c 100644
--- a/python/pyspark/pandas/supported_api_gen.py
+++ b/python/pyspark/pandas/supported_api_gen.py
@@ -27,6 +27,7 @@ from typing import Any, Callable, Dict, List, NamedTuple,
Set, TextIO, Tuple
import pyspark.pandas as ps
import pyspark.pandas.groupby as psg
import pyspark.pandas.window as psw
+from pyspark.pandas.exceptions import PandasNotImplementedError
import pandas as pd
import pandas.core.groupby as pdg
@@ -131,7 +132,7 @@ def _create_supported_by_module(
pd_module = getattr(pd_module_group, module_name) if module_name else
pd_module_group
try:
ps_module = getattr(ps_module_group, module_name) if module_name else
ps_module_group
- except AttributeError:
+ except (AttributeError, PandasNotImplementedError):
# module not implemented
return {}
@@ -262,7 +263,7 @@ def _transform_missing(
def _get_pd_modules(pd_module_group: Any) -> List[str]:
"""
- Returns sorted pandas memeber list from pandas module path.
+ Returns sorted pandas member list from pandas module path.
Parameters
----------
diff --git a/python/pyspark/pandas/tests/test_scalars.py
b/python/pyspark/pandas/tests/test_scalars.py
new file mode 100644
index 00000000000..47efde46b26
--- /dev/null
+++ b/python/pyspark/pandas/tests/test_scalars.py
@@ -0,0 +1,55 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import inspect
+
+import pyspark.pandas as ps
+from pyspark.pandas.exceptions import PandasNotImplementedError
+from pyspark.pandas.missing.scalars import _MissingPandasLikeScalars
+from pyspark.testing.pandasutils import PandasOnSparkTestCase
+
+
+class ScalarTest(PandasOnSparkTestCase):
+ def test_missing(self):
+ missing_scalars = inspect.getmembers(_MissingPandasLikeScalars)
+
+ missing_scalars = [
+ name
+ for (name, type_) in missing_scalars
+ if isinstance(type_, PandasNotImplementedError)
+ ]
+
+ for scalar_name in missing_scalars:
+ with self.assertRaisesRegex(
+ PandasNotImplementedError,
+ "The scalar `ps.{0}` is not reimplemented in pyspark.pandas;"
+ " use `pd.{0}`.".format(scalar_name),
+ ):
+ getattr(ps, scalar_name)
+
+
+if __name__ == "__main__":
+ import unittest
+ from pyspark.pandas.tests.test_scalars import * # noqa: F401
+
+ try:
+ import xmlrunner # type: ignore[import]
+
+ testRunner = xmlrunner.XMLTestRunner(output="target/test-reports",
verbosity=2)
+ except ImportError:
+ testRunner = None
+ unittest.main(testRunner=testRunner, verbosity=2)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]