[ https://issues.apache.org/jira/browse/SPARK-46177?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Hyukjin Kwon resolved SPARK-46177. ---------------------------------- Fix Version/s: 4.0.0 Resolution: Fixed Issue resolved by pull request 44086 [https://github.com/apache/spark/pull/44086] > Skip 'CrossValidatorTests.test_crossvalidator_with_fold_col' with Python 3.12 > ------------------------------------------------------------------------------ > > Key: SPARK-46177 > URL: https://issues.apache.org/jira/browse/SPARK-46177 > Project: Spark > Issue Type: Sub-task > Components: ML, PySpark, Tests > Affects Versions: 4.0.0 > Reporter: Hyukjin Kwon > Assignee: Hyukjin Kwon > Priority: Minor > Labels: pull-request-available > Fix For: 4.0.0 > > > {code} > ====================================================================== > ERROR [68.963s]: test_crossvalidator_with_fold_col > (pyspark.ml.tests.connect.test_legacy_mode_tuning.CrossValidatorTests.test_crossvalidator_with_fold_col) > ---------------------------------------------------------------------- > Traceback (most recent call last): > File > "/__w/spark/spark/python/pyspark/ml/tests/connect/test_legacy_mode_tuning.py", > line 274, in test_crossvalidator_with_fold_col > cv.fit(train_dataset) > File "/__w/spark/spark/python/pyspark/ml/connect/base.py", line 105, in fit > return self._fit(dataset) > ^^^^^^^^^^^^^^^^^^ > File "/__w/spark/spark/python/pyspark/ml/connect/tuning.py", line 447, in > _fit > bestModel = cast(Model, est.fit(dataset, epm[bestIndex])) > ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ > File "/__w/spark/spark/python/pyspark/ml/connect/base.py", line 103, in fit > return self.copy(params)._fit(dataset) > ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ > File "/__w/spark/spark/python/pyspark/ml/connect/classification.py", line > 251, in _fit > model_state_dict = distributor._train_on_dataframe( > ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ > File "/__w/spark/spark/python/pyspark/ml/torch/distributor.py", line 1043, > in _train_on_dataframe > return self._run_distributed_training( > ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ > File "/__w/spark/spark/python/pyspark/ml/torch/distributor.py", line 804, > in _run_distributed_training > ).collect() > ^^^^^^^^^ > File "/__w/spark/spark/python/pyspark/sql/dataframe.py", line 1369, in > collect > sock_info = self._jdf.collectToPython() > ^^^^^^^^^^^^^^^^^^^^^^^^^^^ > File > "/__w/spark/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", > line 1322, in __call__ > return_value = get_return_value( > ^^^^^^^^^^^^^^^^^ > File "/__w/spark/spark/python/pyspark/errors/exceptions/captured.py", line > 182, in deco > return f(*a, **kw) > ^^^^^^^^^^^ > File "/__w/spark/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/protocol.py", > line 326, in get_return_value > raise Py4JJavaError( > py4j.protocol.Py4JJavaError: An error occurred while calling > o2582.collectToPython. > : org.apache.spark.SparkException: Job aborted due to stage failure: Could > not recover from a failed barrier ResultStage. Most recent failure reason: > Stage failed because barrier task ResultTask(89, 0) finished unsuccessfully. > org.apache.spark.api.python.PythonException: Traceback (most recent call > last): > File "/__w/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line > 1535, in main > process() > File "/__w/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line > 1527, in process > serializer.dump_stream(out_iter, outfile) > File > "/__w/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", > line 162, in dump_stream > return super(ArrowStreamUDFSerializer, > self).dump_stream(wrap_and_init_stream(), stream) > > ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ > File > "/__w/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", > line 101, in dump_stream > for batch in iterator: > File > "/__w/spark/spark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", > line 147, in wrap_and_init_stream > for batch, _ in iterator: > File "/__w/spark/spark/python/lib/pyspark.zip/pyspark/worker.py", line > 1296, in func > for result_batch, result_type in result_iter: > File "/__w/spark/spark/python/pyspark/ml/torch/distributor.py", line 721, > in wrapped_train_fn > output = TorchDistributor._get_output_from_framework_wrapper( > ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ > File > "/__w/spark/spark/python/lib/pyspark.zip/pyspark/ml/torch/distributor.py", > line 567, in _get_output_from_framework_wrapper > return framework_wrapper( > ^^^^^^^^^^^^^^^^^^ > File > "/__w/spark/spark/python/lib/pyspark.zip/pyspark/ml/torch/distributor.py", > line 908, in _run_training_on_pytorch_function > raise RuntimeError( > RuntimeError: TorchDistributor failed during training.View stdout logs for > detailed error message. > at > org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:517) > at > org.apache.spark.sql.execution.python.PythonArrowOutput$$anon$1.read(PythonArrowOutput.scala:117) > at > org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:473) > at > org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37) > at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:601) > at scala.collection.Iterator$$anon$9.hasNext(Iterator.scala:583) > at > org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:388) > at > org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:891) > at > org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:891) > at > org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:329) > at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93) > at > org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166) > at org.apache.spark.scheduler.Task.run(Task.scala:141) > at > org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:628) > at > org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64) > at > org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61) > at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:96) > at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:631) > at > java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136) > at > java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635) > at java.base/java.lang.Thread.run(Thread.java:840) > at > org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2820) > at > org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2817) > at scala.collection.immutable.List.foreach(List.scala:333) > at > org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2817) > at > org.apache.spark.scheduler.DAGScheduler.handleTaskCompletion(DAGScheduler.scala:2252) > at > org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3081) > at > org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:3021) > at > org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:3010) > at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49) > at > org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:990) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2428) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2449) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2468) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2493) > at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1047) > at > org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) > at > org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112) > at org.apache.spark.rdd.RDD.withScope(RDD.scala:408) > at org.apache.spark.rdd.RDD.collect(RDD.scala:1046) > at > org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:448) > at > org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$executeCollect$1(AdaptiveSparkPlanExec.scala:380) > at > org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.withFinalPlanUpdate(AdaptiveSparkPlanExec.scala:408) > at > org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.executeCollect(AdaptiveSparkPlanExec.scala:380) > at > org.apache.spark.sql.Dataset.$anonfun$collectToPython$1(Dataset.scala:4246) > at > org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4420) > at > org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:557) > at > org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4418) > at > org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId0$6(SQLExecution.scala:150) > at > org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:241) > at > org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId0$1(SQLExecution.scala:116) > at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:918) > at > org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId0(SQLExecution.scala:72) > at > org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:196) > at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4418) > at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:4243) > at jdk.internal.reflect.GeneratedMethodAccessor84.invoke(Unknown Source) > at > java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) > at java.base/java.lang.reflect.Method.invoke(Method.java:568) > at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) > at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374) > at py4j.Gateway.invoke(Gateway.java:282) > at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) > at py4j.commands.CallCommand.execute(CallCommand.java:79) > at > py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182) > at py4j.ClientServerConnection.run(ClientServerConnection.java:106) > at java.base/java.lang.Thread.run(Thread.java:840) > ---------------------------------------------------------------------- > {code} -- This message was sent by Atlassian Jira (v8.20.10#820010) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org