[ https://issues.apache.org/jira/browse/SPARK-46148?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Hyukjin Kwon updated SPARK-46148: --------------------------------- Description: {code} ********************************************************************** File "/__w/spark/spark/python/pyspark/pandas/mlflow.py", line 172, in pyspark.pandas.mlflow.load_model Failed example: prediction_df Exception raised: Traceback (most recent call last): File "/usr/lib/python3.10/doctest.py", line 1350, in __run exec(compile(example.source, filename, "single", File "<doctest pyspark.pandas.mlflow.load_model[18]>", line 1, in <module> prediction_df File "/__w/spark/spark/python/pyspark/pandas/frame.py", line 13291, in __repr__ pdf = cast("DataFrame", self._get_or_create_repr_pandas_cache(max_display_count)) File "/__w/spark/spark/python/pyspark/pandas/frame.py", line 13282, in _get_or_create_repr_pandas_cache self, "_repr_pandas_cache", {n: self.head(n + 1)._to_internal_pandas()} File "/__w/spark/spark/python/pyspark/pandas/frame.py", line 13277, in _to_internal_pandas return self._internal.to_pandas_frame File "/__w/spark/spark/python/pyspark/pandas/utils.py", line 599, in wrapped_lazy_property setattr(self, attr_name, fn(self)) File "/__w/spark/spark/python/pyspark/pandas/internal.py", line 1110, in to_pandas_frame pdf = sdf.toPandas() File "/__w/spark/spark/python/pyspark/sql/pandas/conversion.py", line 213, in toPandas rows = self.collect() File "/__w/spark/spark/python/pyspark/sql/dataframe.py", line 1369, in collect sock_info = self._jdf.collectToPython() File "/__w/spark/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1322, in __call__ return_value = get_return_value( File "/__w/spark/spark/python/pyspark/errors/exceptions/captured.py", line 188, in deco raise converted from None pyspark.errors.exceptions.captured.PythonException: An exception was thrown from the Python worker. Please see the stack trace below. 
Traceback (most recent call last): File "/__w/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1523, in main process() File "/__w/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1515, in process serializer.dump_stream(out_iter, outfile) File "/__w/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 485, in dump_stream return ArrowStreamSerializer.dump_stream(self, init_stream_yield_batches(), stream) File "/__w/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 101, in dump_stream for batch in iterator: File "/__w/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 478, in init_stream_yield_batches for series in iterator: File "/__w/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1284, in func for result_batch, result_type in result_iter: File "/usr/local/lib/python3.10/dist-packages/mlflow/pyfunc/__init__.py", line 1619, in udf yield _predict_row_batch(batch_predict_fn, row_batch_args) File "/usr/local/lib/python3.10/dist-packages/mlflow/pyfunc/__init__.py", line 1383, in _predict_row_batch result = predict_fn(pdf, params) File "/usr/local/lib/python3.10/dist-packages/mlflow/pyfunc/__init__.py", line 1601, in batch_predict_fn return loaded_model.predict(pdf, params=params) File "/usr/local/lib/python3.10/dist-packages/mlflow/pyfunc/__init__.py", line 491, in predict return _predict() File "/usr/local/lib/python3.10/dist-packages/mlflow/pyfunc/__init__.py", line 477, in _predict return self._predict_fn(data, params=params) File "/usr/local/lib/python3.10/dist-packages/mlflow/sklearn/__init__.py", line 517, in predict return self.sklearn_model.predict(data) File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_base.py", line 386, in predict return self._decision_function(X) File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_base.py", line 369, in _decision_function X = self._validate_data(X, accept_sparse=["csr", "csc", "coo"], 
reset=False) File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 580, in _validate_data self._check_feature_names(X, reset=reset) File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 507, in _check_feature_names raise ValueError(message) ValueError: The feature names should match those that were passed during fit. Feature names unseen at fit time: - 0 - 1 Feature names seen at fit time, yet now missing: - x1 - x2 JVM stacktrace: org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 1.0 failed 1 times, most recent failure: Lost task 2.0 in stage 1.0 (TID 3) (localhost executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last): File "/__w/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1523, in main process() File "/__w/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1515, in process serializer.dump_stream(out_iter, outfile) File "/__w/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 485, in dump_stream return ArrowStreamSerializer.dump_stream(self, init_stream_yield_batches(), stream) File "/__w/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 101, in dump_stream for batch in iterator: File "/__w/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 478, in init_stream_yield_batches for series in iterator: File "/__w/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1284, in func for result_batch, result_type in result_iter: File "/usr/local/lib/python3.10/dist-packages/mlflow/pyfunc/__init__.py", line 1619, in udf yield _predict_row_batch(batch_predict_fn, row_batch_args) File "/usr/local/lib/python3.10/dist-packages/mlflow/pyfunc/__init__.py", line 1383, in _predict_row_batch at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:583) at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:140) at 
org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:57) at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:111) at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:54) at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166) at org.apache.spark.scheduler.Task.run(Task.scala:141) at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:628) at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64) at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61) at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:96) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:631) at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136) at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635) at java.base/java.lang.Thread.run(Thread.java:840) ********************************************************************** {code} See https://github.com/apache/spark/actions/runs/7020654429/job/19100965399 was: {code} ********************************************************************** File "/__w/spark/spark/python/pyspark/pandas/mlflow.py", line 172, in pyspark.pandas.mlflow.load_model Failed example: prediction_df Exception raised: Traceback (most recent call last): File "/usr/lib/python3.10/doctest.py", line 1350, in __run exec(compile(example.source, filename, "single", File "<doctest pyspark.pandas.mlflow.load_model[18]>", line 1, in <module> prediction_df File "/__w/spark/spark/python/pyspark/pandas/frame.py", line 13291, in __repr__ pdf = cast("DataFrame", self._get_or_create_repr_pandas_cache(max_display_count)) File "/__w/spark/spark/python/pyspark/pandas/frame.py", line 13282, in _get_or_create_repr_pandas_cache self, "_repr_pandas_cache", {n: self.head(n + 1)._to_internal_pandas()} File 
"/__w/spark/spark/python/pyspark/pandas/frame.py", line 13277, in _to_internal_pandas return self._internal.to_pandas_frame File "/__w/spark/spark/python/pyspark/pandas/utils.py", line 599, in wrapped_lazy_property setattr(self, attr_name, fn(self)) File "/__w/spark/spark/python/pyspark/pandas/internal.py", line 1110, in to_pandas_frame pdf = sdf.toPandas() File "/__w/spark/spark/python/pyspark/sql/pandas/conversion.py", line 213, in toPandas rows = self.collect() File "/__w/spark/spark/python/pyspark/sql/dataframe.py", line 1369, in collect sock_info = self._jdf.collectToPython() File "/__w/spark/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1322, in __call__ return_value = get_return_value( File "/__w/spark/spark/python/pyspark/errors/exceptions/captured.py", line 188, in deco raise converted from None pyspark.errors.exceptions.captured.PythonException: An exception was thrown from the Python worker. Please see the stack trace below. Traceback (most recent call last): File "/__w/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1523, in main process() File "/__w/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1515, in process serializer.dump_stream(out_iter, outfile) File "/__w/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 485, in dump_stream return ArrowStreamSerializer.dump_stream(self, init_stream_yield_batches(), stream) File "/__w/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 101, in dump_stream for batch in iterator: File "/__w/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 478, in init_stream_yield_batches for series in iterator: File "/__w/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1284, in func for result_batch, result_type in result_iter: File "/usr/local/lib/python3.10/dist-packages/mlflow/pyfunc/__init__.py", line 1619, in udf yield _predict_row_batch(batch_predict_fn, row_batch_args) File 
"/usr/local/lib/python3.10/dist-packages/mlflow/pyfunc/__init__.py", line 1383, in _predict_row_batch result = predict_fn(pdf, params) File "/usr/local/lib/python3.10/dist-packages/mlflow/pyfunc/__init__.py", line 1601, in batch_predict_fn return loaded_model.predict(pdf, params=params) File "/usr/local/lib/python3.10/dist-packages/mlflow/pyfunc/__init__.py", line 491, in predict return _predict() File "/usr/local/lib/python3.10/dist-packages/mlflow/pyfunc/__init__.py", line 477, in _predict return self._predict_fn(data, params=params) File "/usr/local/lib/python3.10/dist-packages/mlflow/sklearn/__init__.py", line 517, in predict return self.sklearn_model.predict(data) File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_base.py", line 386, in predict return self._decision_function(X) File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_base.py", line 369, in _decision_function X = self._validate_data(X, accept_sparse=["csr", "csc", "coo"], reset=False) File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 580, in _validate_data self._check_feature_names(X, reset=reset) File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 507, in _check_feature_names raise ValueError(message) ValueError: The feature names should match those that were passed during fit. 
Feature names unseen at fit time: - 0 - 1 Feature names seen at fit time, yet now missing: - x1 - x2 JVM stacktrace: org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 1.0 failed 1 times, most recent failure: Lost task 2.0 in stage 1.0 (TID 3) (localhost executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last): File "/__w/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1523, in main process() File "/__w/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1515, in process serializer.dump_stream(out_iter, outfile) File "/__w/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 485, in dump_stream return ArrowStreamSerializer.dump_stream(self, init_stream_yield_batches(), stream) File "/__w/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 101, in dump_stream for batch in iterator: File "/__w/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 478, in init_stream_yield_batches for series in iterator: File "/__w/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line 1284, in func for result_batch, result_type in result_iter: File "/usr/local/lib/python3.10/dist-packages/mlflow/pyfunc/__init__.py", line 1619, in udf yield _predict_row_batch(batch_predict_fn, row_batch_args) File "/usr/local/lib/python3.10/dist-packages/mlflow/pyfunc/__init__.py", line 1383, in _predict_row_batch at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:583) at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:140) at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:57) at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:111) at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:54) at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166) at 
org.apache.spark.scheduler.Task.run(Task.scala:141) at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:628) at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64) at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61) at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:96) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:631) at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136) at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635) at java.base/java.lang.Thread.run(Thread.java:840) ********************************************************************** {code} SEe https://github.com/apache/spark/actions/runs/7020654429/job/19100965399 > Fix pyspark.pandas.mlflow.load_model test (Python 3.12) > ------------------------------------------------------- > > Key: SPARK-46148 > URL: https://issues.apache.org/jira/browse/SPARK-46148 > Project: Spark > Issue Type: Sub-task > Components: PySpark > Affects Versions: 4.0.0 > Reporter: Hyukjin Kwon > Priority: Major > > {code} > ********************************************************************** > File "/__w/spark/spark/python/pyspark/pandas/mlflow.py", line 172, in > pyspark.pandas.mlflow.load_model > Failed example: > prediction_df > Exception raised: > Traceback (most recent call last): > File "/usr/lib/python3.10/doctest.py", line 1350, in __run > exec(compile(example.source, filename, "single", > File "<doctest pyspark.pandas.mlflow.load_model[18]>", line 1, in > <module> > prediction_df > File "/__w/spark/spark/python/pyspark/pandas/frame.py", line 13291, in > __repr__ > pdf = cast("DataFrame", > self._get_or_create_repr_pandas_cache(max_display_count)) > File "/__w/spark/spark/python/pyspark/pandas/frame.py", line 13282, in > _get_or_create_repr_pandas_cache > self, "_repr_pandas_cache", {n: self.head(n + > 
1)._to_internal_pandas()} > File "/__w/spark/spark/python/pyspark/pandas/frame.py", line 13277, in > _to_internal_pandas > return self._internal.to_pandas_frame > File "/__w/spark/spark/python/pyspark/pandas/utils.py", line 599, in > wrapped_lazy_property > setattr(self, attr_name, fn(self)) > File "/__w/spark/spark/python/pyspark/pandas/internal.py", line 1110, > in to_pandas_frame > pdf = sdf.toPandas() > File "/__w/spark/spark/python/pyspark/sql/pandas/conversion.py", line > 213, in toPandas > rows = self.collect() > File "/__w/spark/spark/python/pyspark/sql/dataframe.py", line 1369, in > collect > sock_info = self._jdf.collectToPython() > File > "/__w/spark/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", > line 1322, in __call__ > return_value = get_return_value( > File "/__w/spark/spark/python/pyspark/errors/exceptions/captured.py", > line 188, in deco > raise converted from None > pyspark.errors.exceptions.captured.PythonException: > An exception was thrown from the Python worker. Please see the stack > trace below. 
> Traceback (most recent call last): > File "/__w/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line > 1523, in main > process() > File "/__w/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line > 1515, in process > serializer.dump_stream(out_iter, outfile) > File > "/__w/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", > line 485, in dump_stream > return ArrowStreamSerializer.dump_stream(self, > init_stream_yield_batches(), stream) > File > "/__w/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", > line 101, in dump_stream > for batch in iterator: > File > "/__w/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", > line 478, in init_stream_yield_batches > for series in iterator: > File "/__w/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line > 1284, in func > for result_batch, result_type in result_iter: > File > "/usr/local/lib/python3.10/dist-packages/mlflow/pyfunc/__init__.py", line > 1619, in udf > yield _predict_row_batch(batch_predict_fn, row_batch_args) > File > "/usr/local/lib/python3.10/dist-packages/mlflow/pyfunc/__init__.py", line > 1383, in _predict_row_batch > result = predict_fn(pdf, params) > File > "/usr/local/lib/python3.10/dist-packages/mlflow/pyfunc/__init__.py", line > 1601, in batch_predict_fn > return loaded_model.predict(pdf, params=params) > File > "/usr/local/lib/python3.10/dist-packages/mlflow/pyfunc/__init__.py", line > 491, in predict > return _predict() > File > "/usr/local/lib/python3.10/dist-packages/mlflow/pyfunc/__init__.py", line > 477, in _predict > return self._predict_fn(data, params=params) > File > "/usr/local/lib/python3.10/dist-packages/mlflow/sklearn/__init__.py", line > 517, in predict > return self.sklearn_model.predict(data) > File > "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_base.py", line > 386, in predict > return self._decision_function(X) > File > 
"/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_base.py", line > 369, in _decision_function > X = self._validate_data(X, accept_sparse=["csr", "csc", "coo"], > reset=False) > File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line > 580, in _validate_data > self._check_feature_names(X, reset=reset) > File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line > 507, in _check_feature_names > raise ValueError(message) > ValueError: The feature names should match those that were passed during > fit. > Feature names unseen at fit time: > - 0 > - 1 > Feature names seen at fit time, yet now missing: > - x1 > - x2 > JVM stacktrace: > org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 > in stage 1.0 failed 1 times, most recent failure: Lost task 2.0 in stage 1.0 > (TID 3) (localhost executor driver): > org.apache.spark.api.python.PythonException: Traceback (most recent call > last): > File "/__w/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line > 1523, in main > process() > File "/__w/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line > 1515, in process > serializer.dump_stream(out_iter, outfile) > File > "/__w/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", > line 485, in dump_stream > return ArrowStreamSerializer.dump_stream(self, > init_stream_yield_batches(), stream) > File > "/__w/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", > line 101, in dump_stream > for batch in iterator: > File > "/__w/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", > line 478, in init_stream_yield_batches > for series in iterator: > File "/__w/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line > 1284, in func > for result_batch, result_type in result_iter: > File > "/usr/local/lib/python3.10/dist-packages/mlflow/pyfunc/__init__.py", line > 1619, in udf > yield _predict_row_batch(batch_predict_fn, row_batch_args) > File > 
"/usr/local/lib/python3.10/dist-packages/mlflow/pyfunc/__init__.py", line > 1383, in _predict_row_batch > at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:583) > at > org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:140) > at > org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:57) > at > org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:111) > at > org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:54) > at > org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166) > at org.apache.spark.scheduler.Task.run(Task.scala:141) > at > org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:628) > at > org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64) > at > org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61) > at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:96) > at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:631) > at > java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136) > at > java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635) > at java.base/java.lang.Thread.run(Thread.java:840) > ********************************************************************** > {code} > See https://github.com/apache/spark/actions/runs/7020654429/job/19100965399 -- This message was sent by Atlassian Jira (v8.20.10#820010) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org For additional commands, e-mail: issues-help@spark.apache.org