(spark) branch master updated: [SPARK-53671][PYTHON] Exclude 0-args from `@udf` eval type inference

ruifengz Tue, 23 Sep 2025 17:11:20 -0700

This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new 2a9999fe5bf0 [SPARK-53671][PYTHON] Exclude 0-args from `@udf` eval 
type inference
2a9999fe5bf0 is described below

commit 2a9999fe5bf0d4122073ae92ca39c20769868573
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Wed Sep 24 08:10:57 2025 +0800

    [SPARK-53671][PYTHON] Exclude 0-args from `@udf` eval type inference
    
    ### What changes were proposed in this pull request?
    Exclude 0-arg functions from `@udf` eval type inference.
    
    ### Why are the changes needed?
    The following case fails after https://github.com/apache/spark/pull/52323:
    
    ```
    In [5]: from pyspark.sql.functions import udf
    
    In [6]: @udf()
       ...: def f1() -> int:
       ...:     return 1
       ...:
    
    In [7]: spark.range(10).select(f1().alias("res")).collect()
    25/09/23 10:08:16 ERROR Executor: Exception in task 0.0 in stage 5.0 (TID 
21)
    org.apache.spark.api.python.PythonException: Traceback (most recent call 
last):
      File 
"/Users/ruifeng.zheng/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py",
 line 473, in _create_array
        return pa.Array.from_pandas(
               ~~~~~~~~~~~~~~~~~~~~^
            series, mask=mask, type=arrow_type, safe=self._safecheck
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        )
        ^
      File "pyarrow/array.pxi", line 1259, in pyarrow.lib.Array.from_pandas
      File "pyarrow/array.pxi", line 365, in pyarrow.lib.array
      File "pyarrow/array.pxi", line 91, in pyarrow.lib._ndarray_to_array
      File "pyarrow/error.pxi", line 92, in pyarrow.lib.check_status
    pyarrow.lib.ArrowTypeError: Expected a string or bytes dtype, got int64
    
    The above exception was the direct cause of the following exception:
    ```
    
    This is because `f1` is inferred as a `SQL_GROUPED_AGG_PANDAS_UDF`.
    I think a 0-arg function doesn't make sense for `SQL_GROUPED_AGG_PANDAS_UDF`, but to 
be conservative, this PR only excludes 0-arg functions on the `udf` side. We can revisit 
`pandas_udf` / `arrow_udf` later.
    
    ### Does this PR introduce _any_ user-facing change?
    No
    
    ### How was this patch tested?
    Added a unit test (`test_0_args` in `test_unified_udf.py`).
    
    ### Was this patch authored or co-authored using generative AI tooling?
    no
    
    Closes #52416 from zhengruifeng/fix_udf_type_hint.
    
    Authored-by: Ruifeng Zheng <[email protected]>
    Signed-off-by: Ruifeng Zheng <[email protected]>
---
 python/pyspark/sql/connect/udf.py            |  4 ++--
 python/pyspark/sql/pandas/typehints.py       |  6 ++++--
 python/pyspark/sql/tests/test_unified_udf.py | 16 ++++++++++++++++
 python/pyspark/sql/udf.py                    |  4 ++--
 4 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/python/pyspark/sql/connect/udf.py 
b/python/pyspark/sql/connect/udf.py
index fc5a4c79d8ad..3bc2f87e4bd3 100644
--- a/python/pyspark/sql/connect/udf.py
+++ b/python/pyspark/sql/connect/udf.py
@@ -92,11 +92,11 @@ def _create_py_udf(
     eval_type: Optional[int] = None
     if useArrow is None:
         # If the user doesn't explicitly set useArrow
-        from pyspark.sql.pandas.typehints import infer_eval_type_from_func
+        from pyspark.sql.pandas.typehints import infer_eval_type_for_udf
 
         try:
             # Try to infer the eval type from type hints
-            eval_type = infer_eval_type_from_func(f)
+            eval_type = infer_eval_type_for_udf(f)
         except Exception:
             warnings.warn("Cannot infer the eval type from type hints. ", 
UserWarning)
 
diff --git a/python/pyspark/sql/pandas/typehints.py 
b/python/pyspark/sql/pandas/typehints.py
index 4252060f8b22..610bd1df40ac 100644
--- a/python/pyspark/sql/pandas/typehints.py
+++ b/python/pyspark/sql/pandas/typehints.py
@@ -278,7 +278,8 @@ def infer_eval_type(
     return eval_type
 
 
-def infer_eval_type_from_func(  # type: ignore[no-untyped-def]
+# infer the eval type for @udf
+def infer_eval_type_for_udf(  # type: ignore[no-untyped-def]
     f,
 ) -> Optional[
     Union[
@@ -291,7 +292,8 @@ def infer_eval_type_from_func(  # type: 
ignore[no-untyped-def]
     ]
 ]:
     argspec = getfullargspec(f)
-    if len(argspec.annotations) > 0:
+    # different from inference of @pandas_udf/@arrow_udf, 0-arg is not allowed 
here
+    if len(argspec.args) > 0 and len(argspec.annotations) > 0:
         try:
             type_hints = get_type_hints(f)
         except NameError:
diff --git a/python/pyspark/sql/tests/test_unified_udf.py 
b/python/pyspark/sql/tests/test_unified_udf.py
index d74e404d7528..3c105637c791 100644
--- a/python/pyspark/sql/tests/test_unified_udf.py
+++ b/python/pyspark/sql/tests/test_unified_udf.py
@@ -423,6 +423,22 @@ class UnifiedUDFTestsMixin:
             result = 
self.spark.range(10).select(f("id").alias("res")).collect()
             self.assertEqual(result, expected)
 
+    def test_0_args(self):
+        with self.sql_conf({"spark.sql.execution.pythonUDF.arrow.enabled": 
False}):
+
+            @udf()
+            def f1() -> int:
+                return 1
+
+            @udf(returnType=LongType())
+            def f2() -> int:
+                return 1
+
+            for f in [f1, f2]:
+                self.assertEqual(f.evalType, PythonEvalType.SQL_BATCHED_UDF)
+                result = 
self.spark.range(10).select(f().alias("res")).collect()
+                self.assertEqual(len(result), 10)
+
 
 class UnifiedUDFTests(UnifiedUDFTestsMixin, ReusedSQLTestCase):
     pass
diff --git a/python/pyspark/sql/udf.py b/python/pyspark/sql/udf.py
index 4e37ec88846b..b28bccb04bc7 100644
--- a/python/pyspark/sql/udf.py
+++ b/python/pyspark/sql/udf.py
@@ -125,11 +125,11 @@ def _create_py_udf(
     eval_type: Optional[int] = None
     if useArrow is None:
         # If the user doesn't explicitly set useArrow
-        from pyspark.sql.pandas.typehints import infer_eval_type_from_func
+        from pyspark.sql.pandas.typehints import infer_eval_type_for_udf
 
         try:
             # Try to infer the eval type from type hints
-            eval_type = infer_eval_type_from_func(f)
+            eval_type = infer_eval_type_for_udf(f)
         except Exception:
             warnings.warn("Cannot infer the eval type from type hints. ", 
UserWarning)
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(spark) branch master updated: [SPARK-53671][PYTHON] Exclude 0-args from `@udf` eval type inference

Reply via email to