[ https://issues.apache.org/jira/browse/SPARK-48086?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Dongjoon Hyun closed SPARK-48086.
---------------------------------
> Different Arrow versions in client and server
> ----------------------------------------------
>
> Key: SPARK-48086
> URL: https://issues.apache.org/jira/browse/SPARK-48086
> Project: Spark
> Issue Type: Sub-task
> Components: Connect, PySpark, SQL
> Affects Versions: 4.0.0
> Reporter: Hyukjin Kwon
> Assignee: Hyukjin Kwon
> Priority: Major
> Labels: pull-request-available
> Fix For: 3.5.2
>
>
> {code}
> ======================================================================
> FAIL [1.071s]: test_pandas_udf_arrow_overflow (pyspark.sql.tests.connect.test_parity_pandas_udf.PandasUDFParityTests.test_pandas_udf_arrow_overflow)
> ----------------------------------------------------------------------
> pyspark.errors.exceptions.connect.PythonException:
> An exception was thrown from the Python worker. Please see the stack trace below.
> Traceback (most recent call last):
>   File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 302, in _create_array
>     return pa.Array.from_pandas(
>            ^^^^^^^^^^^^^^^^^^^^^
>   File "pyarrow/array.pxi", line 1054, in pyarrow.lib.Array.from_pandas
>   File "pyarrow/array.pxi", line 323, in pyarrow.lib.array
>   File "pyarrow/array.pxi", line 83, in pyarrow.lib._ndarray_to_array
>   File "pyarrow/error.pxi", line 100, in pyarrow.lib.check_status
> pyarrow.lib.ArrowInvalid: Integer value 128 not in range: -128 to 127
> The above exception was the direct cause of the following exception:
> Traceback (most recent call last):
>   File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1834, in main
>     process()
>   File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1826, in process
>     serializer.dump_stream(out_iter, outfile)
>   File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 531, in dump_stream
>     return ArrowStreamSerializer.dump_stream(self, init_stream_yield_batches(), stream)
>            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>   File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 104, in dump_stream
>     for batch in iterator:
>   File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 525, in init_stream_yield_batches
>     batch = self._create_batch(series)
>             ^^^^^^^^^^^^^^^^^^^^^^^^^^
>   File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 511, in _create_batch
>     arrs.append(self._create_array(s, t, arrow_cast=self._arrow_cast))
>                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>   File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 330, in _create_array
>     raise PySparkValueError(error_msg % (series.dtype, series.na...
> During handling of the above exception, another exception occurred:
> Traceback (most recent call last):
>   File "/home/runner/work/spark/spark-3.5/python/pyspark/sql/tests/pandas/test_pandas_udf.py", line 299, in test_pandas_udf_arrow_overflow
>     with self.assertRaisesRegex(
> AssertionError: "Exception thrown when converting pandas.Series" does not match "
> An exception was thrown from the Python worker. Please see the stack trace below.
> Traceback (most recent call last):
>   File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 302, in _create_array
>     return pa.Array.from_pandas(
>            ^^^^^^^^^^^^^^^^^^^^^
>   File "pyarrow/array.pxi", line 1054, in pyarrow.lib.Array.from_pandas
>   File "pyarrow/array.pxi", line 323, in pyarrow.lib.array
>   File "pyarrow/array.pxi", line 83, in pyarrow.lib._ndarray_to_array
>   File "pyarrow/error.pxi", line 100, in pyarrow.lib.check_status
> pyarrow.lib.ArrowInvalid: Integer value 128 not in range: -128 to 127
> The above exception was the direct cause of the following exception:
> Traceback (most recent call last):
>   File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1834, in main
>     process()
>   File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1826, in process
>     serializer.dump_stream(out_iter, outfile)
>   File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 531, in dump_stream
> Traceback (most recent call last):
>   File "/home/runner/work/spark/spark-3.5/python/pyspark/sql/tests/pandas/test_pandas_udf.py", line 279, in test_pandas_udf_detect_unsafe_type_conversion
>     with self.assertRaisesRegex(
> AssertionError: "Exception thrown when converting pandas.Series" does not match "
> An exception was thrown from the Python worker. Please see the stack trace below.
> Traceback (most recent call last):
>   File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 302, in _create_array
>     return pa.Array.from_pandas(
>            ^^^^^^^^^^^^^^^^^^^^^
>   File "pyarrow/array.pxi", line 1054, in pyarrow.lib.Array.from_pandas
>   File "pyarrow/array.pxi", line 323, in pyarrow.lib.array
>   File "pyarrow/array.pxi", line 83, in pyarrow.lib._ndarray_to_array
>   File "pyarrow/error.pxi", line 100, in pyarrow.lib.check_status
> pyarrow.lib.ArrowInvalid: Float value 0.5 was truncated converting to int32
> The above exception was the direct cause of the following exception:
> Traceback (most recent call last):
>   File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1834, in main
>     process()
>   File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1826, in process
>     serializer.dump_stream(out_iter, outfile)
>   File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 531, in dump_stream
>     return ArrowStreamSerializer.dump_stream(self, init_stream_yield_batches(), stream)
>            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>   File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 104, in dump_stream
>     for batch in iterator:
>   File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 525, in init_stream_yield_batches
>     batch = self._create_batch(series)
>             ^^^^^^^^^^^^^^^^^^^^^^^^^^
>   File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 511, in _create_batch
>     arrs.append(self._create_array(s, t, arrow_cast=self._arrow_cast))
>                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>   File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 330, in _create_array
>     raise PySparkValueError(error_msg % (series.dtype, ser..."
> ----------------------------------------------------------------------
> {code}
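>
> Both regex-mismatch failures above trace back to PyArrow's safe-cast errors, whose exact message text and exception chaining vary across Arrow versions. A minimal sketch of the underlying behavior, using plain PyArrow with no Spark involved (the printed wording is version-dependent):
> {code}
> import pandas as pd
> import pyarrow as pa
>
> # Safe casting is PyArrow's default: 128 does not fit into int8, so
> # from_pandas raises ArrowInvalid ("Integer value 128 not in range: -128 to 127").
> try:
>     pa.Array.from_pandas(pd.Series([128]), type=pa.int8())
> except pa.lib.ArrowInvalid as e:
>     print(e)
>
> # Likewise, 0.5 cannot be safely converted to int32
> # ("Float value 0.5 was truncated converting to int32").
> try:
>     pa.Array.from_pandas(pd.Series([0.5]), type=pa.int32())
> except pa.lib.ArrowInvalid as e:
>     print(e)
> {code}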
> {code}
> ======================================================================
> FAIL [0.162s]: test_vectorized_udf_exception (pyspark.sql.tests.connect.test_parity_pandas_udf_scalar.PandasUDFScalarParityTests.test_vectorized_udf_exception)
> ----------------------------------------------------------------------
> pyspark.errors.exceptions.connect.PythonException:
> An exception was thrown from the Python worker. Please see the stack trace below.
> Traceback (most recent call last):
>   File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1834, in main
>     process()
>   File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1826, in process
>     serializer.dump_stream(out_iter, outfile)
>   File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 531, in dump_stream
>     return ArrowStreamSerializer.dump_stream(self, init_stream_yield_batches(), stream)
>            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>   File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 104, in dump_stream
>     for batch in iterator:
>   File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 524, in init_stream_yield_batches
>     for series in iterator:
>   File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1734, in mapper
>     result = tuple(f(*[a[o] for o in arg_offsets]) for arg_offsets, f in udfs)
>              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>   File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1734, in <genexpr>
>     result = tuple(f(*[a[o] for o in arg_offsets]) for arg_offsets, f in udfs)
>                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>   File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 146, in <lambda>
>     verify_result_length(verify_result_type(func(*a)), len(a[0])),
>                                             ^^^^^^^^
>   File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/util.py", line 118, in wrapper
>     return f(*args, **kwargs)
>            ^^^^^^^^^^^^^^^^^^
>   File "/home/runner/work/spark/spark-3.5/python/pyspark/sql/tests/pandas/test_pandas_udf_scalar.py", line 650, in <lambda>
>     scalar_raise_exception = pandas_udf(lambda x: x * (1 / 0), LongType())
>                                                       ~~^~...
> During handling of the above exception, another exception occurred:
> Traceback (most recent call last):
>   File "/home/runner/work/spark/spark-3.5/python/pyspark/sql/tests/connect/test_parity_pandas_udf_scalar.py", line 35, in test_vectorized_udf_exception
>     self.check_vectorized_udf_exception()
>   File "/home/runner/work/spark/spark-3.5/python/pyspark/sql/tests/pandas/test_pandas_udf_scalar.py", line 658, in check_vectorized_udf_exception
>     with self.assertRaisesRegex(Exception, "division( or modulo)? by zero"):
> AssertionError: "division( or modulo)? by zero" does not match "
> An exception was thrown from the Python worker. Please see the stack trace below.
> Traceback (most recent call last):
>   File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1834, in main
>     process()
>   File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1826, in process
>     serializer.dump_stream(out_iter, outfile)
>   File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 531, in dump_stream
>     return ArrowStreamSerializer.dump_stream(self, init_stream_yield_batches(), stream)
>            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>   File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 104, in dump_stream
>     for batch in iterator:
>   File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 524, in init_stream_yield_batches
>     for series in iterator:
>   File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1734, in mapper
>     result = tuple(f(*[a[o] for o in arg_offsets]) for arg_offsets, f in udfs)
>              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>   File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1734, in <genexpr>
>     result = tuple(f(*[a[o] for o in arg_offsets]) for arg_offsets, f in udfs)
>                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>   File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 146, in <lambda>
>     verify_result_length(verify_result_type(func(*a)), len(a[0])),
>                                             ^^^^^^^^
>   File "/home/runner/work/spark/spark/python/lib/pyspark.zip/pyspark/util.py", line 118, in wrapper
>     return f(*args, **kwargs)
>            ^^^^^^^^^^^^^^^^^^
>   File "/home/runner/work/spark/spark-3.5/python/pyspark/sql/tests/pandas/test_pandas_udf_scalar.py", line 650, in <lambda>
>     scalar_raise_exception = pandas_udf(lambda x: x * (1 / 0), LongType())
>                                                       ~~^~..."
> ----------------------------------------------------------------------
> {code}
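>
> Since the ticket is about mismatched Arrow versions between the Spark Connect client and the server-side Python workers, a quick diagnostic is to print both versions side by side. A rough sketch only, not the fix from the linked pull request; the connect URL and the UDF name are illustrative:
> {code}
> import pyarrow
>
> from pyspark.sql import SparkSession
> from pyspark.sql.functions import udf
>
> # Connect to a Spark Connect server; adjust the URL for your deployment.
> spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()
>
> @udf("string")
> def server_pyarrow_version():
>     # Runs inside the server-side Python worker, not on the client.
>     import pyarrow
>     return pyarrow.__version__
>
> print("client pyarrow:", pyarrow.__version__)
> spark.range(1).select(server_pyarrow_version().alias("server_pyarrow")).show(truncate=False)
> {code}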