This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 2a9999fe5bf0 [SPARK-53671][PYTHON] Exclude 0-args from `@udf` eval
type inference
2a9999fe5bf0 is described below
commit 2a9999fe5bf0d4122073ae92ca39c20769868573
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Wed Sep 24 08:10:57 2025 +0800
[SPARK-53671][PYTHON] Exclude 0-args from `@udf` eval type inference
### What changes were proposed in this pull request?
Exclude 0-args from `udf` eval type inference
### Why are the changes needed?
The following case fails after https://github.com/apache/spark/pull/52323:
```
In [5]: from pyspark.sql.functions import udf
In [6]: @udf()
...: def f1() -> int:
...: return 1
...:
In [7]: spark.range(10).select(f1().alias("res")).collect()
25/09/23 10:08:16 ERROR Executor: Exception in task 0.0 in stage 5.0 (TID
21)
org.apache.spark.api.python.PythonException: Traceback (most recent call
last):
File
"/Users/ruifeng.zheng/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py",
line 473, in _create_array
return pa.Array.from_pandas(
~~~~~~~~~~~~~~~~~~~~^
series, mask=mask, type=arrow_type, safe=self._safecheck
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
)
^
File "pyarrow/array.pxi", line 1259, in pyarrow.lib.Array.from_pandas
File "pyarrow/array.pxi", line 365, in pyarrow.lib.array
File "pyarrow/array.pxi", line 91, in pyarrow.lib._ndarray_to_array
File "pyarrow/error.pxi", line 92, in pyarrow.lib.check_status
pyarrow.lib.ArrowTypeError: Expected a string or bytes dtype, got int64
The above exception was the direct cause of the following exception:
```
That is because `f1` is inferred as a `SQL_GROUPED_AGG_PANDAS_UDF`.
I think a 0-arg `SQL_GROUPED_AGG_PANDAS_UDF` doesn't make sense, but to
be conservative, I only exclude 0-arg functions on the `udf` side in this PR.
We can revisit `pandas_udf` / `arrow_udf` later.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
Added unit tests (`test_0_args` in `python/pyspark/sql/tests/test_unified_udf.py`).
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #52416 from zhengruifeng/fix_udf_type_hint.
Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
---
python/pyspark/sql/connect/udf.py | 4 ++--
python/pyspark/sql/pandas/typehints.py | 6 ++++--
python/pyspark/sql/tests/test_unified_udf.py | 16 ++++++++++++++++
python/pyspark/sql/udf.py | 4 ++--
4 files changed, 24 insertions(+), 6 deletions(-)
diff --git a/python/pyspark/sql/connect/udf.py
b/python/pyspark/sql/connect/udf.py
index fc5a4c79d8ad..3bc2f87e4bd3 100644
--- a/python/pyspark/sql/connect/udf.py
+++ b/python/pyspark/sql/connect/udf.py
@@ -92,11 +92,11 @@ def _create_py_udf(
eval_type: Optional[int] = None
if useArrow is None:
# If the user doesn't explicitly set useArrow
- from pyspark.sql.pandas.typehints import infer_eval_type_from_func
+ from pyspark.sql.pandas.typehints import infer_eval_type_for_udf
try:
# Try to infer the eval type from type hints
- eval_type = infer_eval_type_from_func(f)
+ eval_type = infer_eval_type_for_udf(f)
except Exception:
warnings.warn("Cannot infer the eval type from type hints. ",
UserWarning)
diff --git a/python/pyspark/sql/pandas/typehints.py
b/python/pyspark/sql/pandas/typehints.py
index 4252060f8b22..610bd1df40ac 100644
--- a/python/pyspark/sql/pandas/typehints.py
+++ b/python/pyspark/sql/pandas/typehints.py
@@ -278,7 +278,8 @@ def infer_eval_type(
return eval_type
-def infer_eval_type_from_func( # type: ignore[no-untyped-def]
+# infer the eval type for @udf
+def infer_eval_type_for_udf( # type: ignore[no-untyped-def]
f,
) -> Optional[
Union[
@@ -291,7 +292,8 @@ def infer_eval_type_from_func( # type:
ignore[no-untyped-def]
]
]:
argspec = getfullargspec(f)
- if len(argspec.annotations) > 0:
+ # different from inference of @pandas_udf/@arrow_udf, 0-arg is not allowed
here
+ if len(argspec.args) > 0 and len(argspec.annotations) > 0:
try:
type_hints = get_type_hints(f)
except NameError:
diff --git a/python/pyspark/sql/tests/test_unified_udf.py
b/python/pyspark/sql/tests/test_unified_udf.py
index d74e404d7528..3c105637c791 100644
--- a/python/pyspark/sql/tests/test_unified_udf.py
+++ b/python/pyspark/sql/tests/test_unified_udf.py
@@ -423,6 +423,22 @@ class UnifiedUDFTestsMixin:
result =
self.spark.range(10).select(f("id").alias("res")).collect()
self.assertEqual(result, expected)
+ def test_0_args(self):
+ with self.sql_conf({"spark.sql.execution.pythonUDF.arrow.enabled":
False}):
+
+ @udf()
+ def f1() -> int:
+ return 1
+
+ @udf(returnType=LongType())
+ def f2() -> int:
+ return 1
+
+ for f in [f1, f2]:
+ self.assertEqual(f.evalType, PythonEvalType.SQL_BATCHED_UDF)
+ result =
self.spark.range(10).select(f().alias("res")).collect()
+ self.assertEqual(len(result), 10)
+
class UnifiedUDFTests(UnifiedUDFTestsMixin, ReusedSQLTestCase):
pass
diff --git a/python/pyspark/sql/udf.py b/python/pyspark/sql/udf.py
index 4e37ec88846b..b28bccb04bc7 100644
--- a/python/pyspark/sql/udf.py
+++ b/python/pyspark/sql/udf.py
@@ -125,11 +125,11 @@ def _create_py_udf(
eval_type: Optional[int] = None
if useArrow is None:
# If the user doesn't explicitly set useArrow
- from pyspark.sql.pandas.typehints import infer_eval_type_from_func
+ from pyspark.sql.pandas.typehints import infer_eval_type_for_udf
try:
# Try to infer the eval type from type hints
- eval_type = infer_eval_type_from_func(f)
+ eval_type = infer_eval_type_for_udf(f)
except Exception:
warnings.warn("Cannot infer the eval type from type hints. ",
UserWarning)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]