[ https://issues.apache.org/jira/browse/SPARK-32174 ]
Ramin Hazegh updated SPARK-32174:
---------------------------------
Description:

h4. Converting a Spark DataFrame to a pandas DataFrame using toPandas() fails.

*Spark 3.0.0 running in stand-alone mode* using docker containers based on the jupyter docker stack here: [https://github.com/jupyter/docker-stacks/blob/master/pyspark-notebook/Dockerfile]

$ conda list | grep arrow
*arrow-cpp 0.17.1* py38h1234567_5_cpu conda-forge
*pyarrow 0.17.1* py38h1234567_5_cpu conda-forge

$ conda list | grep pandas
*pandas 1.0.5* py38hcb8c335_0 conda-forge

*To reproduce:*
{code:python}
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.master('spark://10.0.1.40:7077') \
    .appName('test_arrow') \
    .getOrCreate()

# Generate a pandas DataFrame
pdf = pd.DataFrame(np.random.rand(100, 3))

# Create a Spark DataFrame from a pandas DataFrame using Arrow
df = spark.createDataFrame(pdf)

# Convert the Spark DataFrame back to a pandas DataFrame using Arrow
result_pdf = df.select("*").toPandas()
{code}

======================

/usr/local/spark/python/pyspark/sql/pandas/conversion.py:134: UserWarning: toPandas attempted Arrow optimization because 'spark.sql.execution.arrow.pyspark.enabled' is set to true, but has reached the error below and can not continue. Note that 'spark.sql.execution.arrow.pyspark.fallback.enabled' does not have an effect on failures in the middle of computation.

An error occurred while calling o55.getResult. : org.apache.spark.SparkException: Exception thrown in awaitResult: at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:302) at org.apache.spark.security.SocketAuthServer.getResult(SocketAuthServer.scala:88) at org.apache.spark.security.SocketAuthServer.getResult(SocketAuthServer.scala:84) at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.base/java.lang.reflect.Method.invoke(Method.java:566) at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357) at py4j.Gateway.invoke(Gateway.java:282) at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) at py4j.commands.CallCommand.execute(CallCommand.java:79) at py4j.GatewayConnection.run(GatewayConnection.java:238) at java.base/java.lang.Thread.run(Thread.java:834) Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 14 in stage 0.0 failed 4 times, most recent failure: Lost task 14.3 in stage 0.0 (TID 31, 10.0.1.43, executor 0): java.lang.UnsupportedOperationException: sun.misc.Unsafe or java.nio.DirectByteBuffer.<init>(long, int) not available at io.netty.util.internal.PlatformDependent.directBuffer(PlatformDependent.java:490) at io.netty.buffer.NettyArrowBuf.getDirectBuffer(NettyArrowBuf.java:243) at io.netty.buffer.NettyArrowBuf.nioBuffer(NettyArrowBuf.java:233) at io.netty.buffer.ArrowBuf.nioBuffer(ArrowBuf.java:245) at org.apache.arrow.vector.ipc.ReadChannel.readFully(ReadChannel.java:81) at org.apache.arrow.vector.ipc.message.MessageSerializer.readMessageBody(MessageSerializer.java:696) at org.apache.arrow.vector.ipc.message.MessageSerializer.deserializeRecordBatch(MessageSerializer.java:344) at org.apache.spark.sql.execution.arrow.ArrowConverters$.loadBatch(ArrowConverters.scala:189) at 
org.apache.spark.sql.execution.arrow.ArrowConverters$$anon$2.nextBatch(ArrowConverters.scala:165) at org.apache.spark.sql.execution.arrow.ArrowConverters$$anon$2.<init>(ArrowConverters.scala:144) at org.apache.spark.sql.execution.arrow.ArrowConverters$.fromBatchIterator(ArrowConverters.scala:143) at org.apache.spark.sql.execution.arrow.ArrowConverters$.$anonfun$toDataFrame$1(ArrowConverters.scala:203) at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:837) at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:837) at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349) at org.apache.spark.rdd.RDD.iterator(RDD.scala:313) at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349) at org.apache.spark.rdd.RDD.iterator(RDD.scala:313) at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349) at org.apache.spark.rdd.RDD.iterator(RDD.scala:313) at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90) at org.apache.spark.scheduler.Task.run(Task.scala:127) at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:444) at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:447) at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128) at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628) at java.base/java.lang.Thread.run(Thread.java:834) Driver stacktrace: at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2023) at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:1972) at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:1971) at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62) at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55) at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49) at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1971) at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:950) at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:950) at scala.Option.foreach(Option.scala:407) at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:950) at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2203) at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2152) at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2141) at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49) at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:752) at org.apache.spark.SparkContext.runJob(SparkContext.scala:2093) at org.apache.spark.SparkContext.runJob(SparkContext.scala:2188) at org.apache.spark.sql.Dataset.$anonfun$collectAsArrowToPython$5(Dataset.scala:3558) at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23) at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377) at org.apache.spark.sql.Dataset.$anonfun$collectAsArrowToPython$2(Dataset.scala:3562) at 
org.apache.spark.sql.Dataset.$anonfun$collectAsArrowToPython$2$adapted(Dataset.scala:3539) at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3616) at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:100) at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:160) at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:87) at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:763) at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64) at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3614) at org.apache.spark.sql.Dataset.$anonfun$collectAsArrowToPython$1(Dataset.scala:3539) at org.apache.spark.sql.Dataset.$anonfun$collectAsArrowToPython$1$adapted(Dataset.scala:3538) at org.apache.spark.security.SocketAuthServer$.$anonfun$serveToStream$2(SocketAuthServer.scala:130) at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23) at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377) at org.apache.spark.security.SocketAuthServer$.$anonfun$serveToStream$1(SocketAuthServer.scala:132) at org.apache.spark.security.SocketAuthServer$.$anonfun$serveToStream$1$adapted(SocketAuthServer.scala:127) at org.apache.spark.security.SocketFuncServer.handleConnection(SocketAuthServer.scala:104) at org.apache.spark.security.SocketFuncServer.handleConnection(SocketAuthServer.scala:98) at org.apache.spark.security.SocketAuthServer$$anon$1.$anonfun$run$1(SocketAuthServer.scala:60) at scala.util.Try$.apply(Try.scala:213) at org.apache.spark.security.SocketAuthServer$$anon$1.run(SocketAuthServer.scala:60) Caused by: java.lang.UnsupportedOperationException: sun.misc.Unsafe or java.nio.DirectByteBuffer.<init>(long, int) not available at io.netty.util.internal.PlatformDependent.directBuffer(PlatformDependent.java:490) at io.netty.buffer.NettyArrowBuf.getDirectBuffer(NettyArrowBuf.java:243) at io.netty.buffer.NettyArrowBuf.nioBuffer(NettyArrowBuf.java:233) at io.netty.buffer.ArrowBuf.nioBuffer(ArrowBuf.java:245) at org.apache.arrow.vector.ipc.ReadChannel.readFully(ReadChannel.java:81) at org.apache.arrow.vector.ipc.message.MessageSerializer.readMessageBody(MessageSerializer.java:696) at org.apache.arrow.vector.ipc.message.MessageSerializer.deserializeRecordBatch(MessageSerializer.java:344) at org.apache.spark.sql.execution.arrow.ArrowConverters$.loadBatch(ArrowConverters.scala:189) at org.apache.spark.sql.execution.arrow.ArrowConverters$$anon$2.nextBatch(ArrowConverters.scala:165) at org.apache.spark.sql.execution.arrow.ArrowConverters$$anon$2.<init>(ArrowConverters.scala:144) at org.apache.spark.sql.execution.arrow.ArrowConverters$.fromBatchIterator(ArrowConverters.scala:143) at org.apache.spark.sql.execution.arrow.ArrowConverters$.$anonfun$toDataFrame$1(ArrowConverters.scala:203) at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:837) at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:837) at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349) at org.apache.spark.rdd.RDD.iterator(RDD.scala:313) at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349) at org.apache.spark.rdd.RDD.iterator(RDD.scala:313) at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) at 
org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349) at org.apache.spark.rdd.RDD.iterator(RDD.scala:313) at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90) at org.apache.spark.scheduler.Task.run(Task.scala:127) at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:444) at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:447) at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128) at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628) at java.base/java.lang.Thread.run(Thread.java:834) warnings.warn(msg) --------------------------------------------------------------------------- Py4JJavaError Traceback (most recent call last) <ipython-input-1-35099e1ba5bf> in <module> 15 16 # Convert the Spark DataFrame back to a pandas DataFrame using Arrow ---> 17 result_pdf = df.select("*").toPandas() /usr/local/spark/python/pyspark/sql/pandas/conversion.py in toPandas(self) 106 # Rename columns to avoid duplicated column names. 107 tmp_column_names = ['col_{}'.format(i) for i in range(len(self.columns))] --> 108 batches = self.toDF(*tmp_column_names)._collect_as_arrow() 109 if len(batches) > 0: 110 table = pyarrow.Table.from_batches(batches) /usr/local/spark/python/pyspark/sql/pandas/conversion.py in _collect_as_arrow(self) 242 finally: 243 # Join serving thread and raise any exceptions from collectAsArrowToPython --> 244 jsocket_auth_server.getResult() 245 246 # Separate RecordBatches from batch order indices in results /usr/local/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py in __call__(self, *args) 1302 1303 answer = self.gateway_client.send_command(command) -> 1304 return_value = get_return_value( 1305 answer, self.gateway_client, self.target_id, self.name) 1306 /usr/local/spark/python/pyspark/sql/utils.py in deco(*a, **kw) 129 def deco(*a, **kw): 130 try: --> 131 return f(*a, **kw) 132 except py4j.protocol.Py4JJavaError as e: 133 converted = convert_exception(e.java_exception) /usr/local/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name) 324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client) 325 if answer[1] == REFERENCE_TYPE: --> 326 raise Py4JJavaError( 327 "An error occurred while calling \{0} {1} {2} .\n". 328 format(target_id, ".", name), value) Py4JJavaError: An error occurred while calling o55.getResult. 
: org.apache.spark.SparkException: Exception thrown in awaitResult: at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:302) at org.apache.spark.security.SocketAuthServer.getResult(SocketAuthServer.scala:88) at org.apache.spark.security.SocketAuthServer.getResult(SocketAuthServer.scala:84) at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.base/java.lang.reflect.Method.invoke(Method.java:566) at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357) at py4j.Gateway.invoke(Gateway.java:282) at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) at py4j.commands.CallCommand.execute(CallCommand.java:79) at py4j.GatewayConnection.run(GatewayConnection.java:238) at java.base/java.lang.Thread.run(Thread.java:834) Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 14 in stage 0.0 failed 4 times, most recent failure: Lost task 14.3 in stage 0.0 (TID 31, 10.0.1.43, executor 0): java.lang.UnsupportedOperationException: sun.misc.Unsafe or java.nio.DirectByteBuffer.<init>(long, int) not available at io.netty.util.internal.PlatformDependent.directBuffer(PlatformDependent.java:490) at io.netty.buffer.NettyArrowBuf.getDirectBuffer(NettyArrowBuf.java:243) at io.netty.buffer.NettyArrowBuf.nioBuffer(NettyArrowBuf.java:233) at io.netty.buffer.ArrowBuf.nioBuffer(ArrowBuf.java:245) at org.apache.arrow.vector.ipc.ReadChannel.readFully(ReadChannel.java:81) at org.apache.arrow.vector.ipc.message.MessageSerializer.readMessageBody(MessageSerializer.java:696) at org.apache.arrow.vector.ipc.message.MessageSerializer.deserializeRecordBatch(MessageSerializer.java:344) at org.apache.spark.sql.execution.arrow.ArrowConverters$.loadBatch(ArrowConverters.scala:189) at org.apache.spark.sql.execution.arrow.ArrowConverters$$anon$2.nextBatch(ArrowConverters.scala:165) at org.apache.spark.sql.execution.arrow.ArrowConverters$$anon$2.<init>(ArrowConverters.scala:144) at org.apache.spark.sql.execution.arrow.ArrowConverters$.fromBatchIterator(ArrowConverters.scala:143) at org.apache.spark.sql.execution.arrow.ArrowConverters$.$anonfun$toDataFrame$1(ArrowConverters.scala:203) at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:837) at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:837) at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349) at org.apache.spark.rdd.RDD.iterator(RDD.scala:313) at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349) at org.apache.spark.rdd.RDD.iterator(RDD.scala:313) at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349) at org.apache.spark.rdd.RDD.iterator(RDD.scala:313) at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90) at org.apache.spark.scheduler.Task.run(Task.scala:127) at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:444) at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:447) at 
java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128) at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628) at java.base/java.lang.Thread.run(Thread.java:834) Driver stacktrace: at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2023) at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:1972) at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:1971) at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62) at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55) at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49) at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1971) at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:950) at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:950) at scala.Option.foreach(Option.scala:407) at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:950) at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2203) at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2152) at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2141) at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49) at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:752) at org.apache.spark.SparkContext.runJob(SparkContext.scala:2093) at org.apache.spark.SparkContext.runJob(SparkContext.scala:2188) at org.apache.spark.sql.Dataset.$anonfun$collectAsArrowToPython$5(Dataset.scala:3558) at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23) at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377) at org.apache.spark.sql.Dataset.$anonfun$collectAsArrowToPython$2(Dataset.scala:3562) at org.apache.spark.sql.Dataset.$anonfun$collectAsArrowToPython$2$adapted(Dataset.scala:3539) at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3616) at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:100) at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:160) at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:87) at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:763) at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64) at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3614) at org.apache.spark.sql.Dataset.$anonfun$collectAsArrowToPython$1(Dataset.scala:3539) at org.apache.spark.sql.Dataset.$anonfun$collectAsArrowToPython$1$adapted(Dataset.scala:3538) at org.apache.spark.security.SocketAuthServer$.$anonfun$serveToStream$2(SocketAuthServer.scala:130) at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23) at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377) at org.apache.spark.security.SocketAuthServer$.$anonfun$serveToStream$1(SocketAuthServer.scala:132) at org.apache.spark.security.SocketAuthServer$.$anonfun$serveToStream$1$adapted(SocketAuthServer.scala:127) at org.apache.spark.security.SocketFuncServer.handleConnection(SocketAuthServer.scala:104) at org.apache.spark.security.SocketFuncServer.handleConnection(SocketAuthServer.scala:98) at 
org.apache.spark.security.SocketAuthServer$$anon$1.$anonfun$run$1(SocketAuthServer.scala:60) at scala.util.Try$.apply(Try.scala:213) at org.apache.spark.security.SocketAuthServer$$anon$1.run(SocketAuthServer.scala:60) Caused by: java.lang.UnsupportedOperationException: sun.misc.Unsafe or java.nio.DirectByteBuffer.<init>(long, int) not available at io.netty.util.internal.PlatformDependent.directBuffer(PlatformDependent.java:490) at io.netty.buffer.NettyArrowBuf.getDirectBuffer(NettyArrowBuf.java:243) at io.netty.buffer.NettyArrowBuf.nioBuffer(NettyArrowBuf.java:233) at io.netty.buffer.ArrowBuf.nioBuffer(ArrowBuf.java:245) at org.apache.arrow.vector.ipc.ReadChannel.readFully(ReadChannel.java:81) at org.apache.arrow.vector.ipc.message.MessageSerializer.readMessageBody(MessageSerializer.java:696) at org.apache.arrow.vector.ipc.message.MessageSerializer.deserializeRecordBatch(MessageSerializer.java:344) at org.apache.spark.sql.execution.arrow.ArrowConverters$.loadBatch(ArrowConverters.scala:189) at org.apache.spark.sql.execution.arrow.ArrowConverters$$anon$2.nextBatch(ArrowConverters.scala:165) at org.apache.spark.sql.execution.arrow.ArrowConverters$$anon$2.<init>(ArrowConverters.scala:144) at org.apache.spark.sql.execution.arrow.ArrowConverters$.fromBatchIterator(ArrowConverters.scala:143) at org.apache.spark.sql.execution.arrow.ArrowConverters$.$anonfun$toDataFrame$1(ArrowConverters.scala:203) at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:837) at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:837) at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349) at org.apache.spark.rdd.RDD.iterator(RDD.scala:313) at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349) at org.apache.spark.rdd.RDD.iterator(RDD.scala:313) at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349) at org.apache.spark.rdd.RDD.iterator(RDD.scala:313) at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90) at org.apache.spark.scheduler.Task.run(Task.scala:127) at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:444) at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:447) at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128) at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628) at java.base/java.lang.Thread.run(Thread.java:834)
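One way to narrow this down: the warning above names the relevant setting, and the {{toDataFrame}}/{{fromBatchIterator}} frames in the trace show the failing Arrow work actually comes from the {{createDataFrame(pdf)}} step (it only runs when {{toPandas()}} triggers the job), so the Arrow flag has to be toggled before the DataFrame is created, not just before collecting. The snippet below is a minimal sketch of that check, reusing the stand-alone master URL from the report. The commented {{spark.executor.extraJavaOptions}} line is an assumption to verify, not a confirmed fix: the Spark 3.0 docs say Java 11 deployments need {{-Dio.netty.tryReflectionSetAccessible=true}} for Apache Arrow, and the {{java.base/...}} frames show these JVMs are on Java 9+.

{code:python}
# Minimal sketch: same pandas <-> Spark round trip, once with Arrow and once
# without, using the config key quoted in the UserWarning above.
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master('spark://10.0.1.40:7077')   # stand-alone master from the report
    .appName('test_arrow')
    # Assumption to verify (Spark 3.0 on Java 11 + Arrow/Netty), not a confirmed
    # fix. The driver JVM would additionally need the same flag at launch time,
    # e.g. via spark-defaults.conf, which cannot be set from a running session.
    # .config('spark.executor.extraJavaOptions',
    #         '-Dio.netty.tryReflectionSetAccessible=true')
    .getOrCreate()
)

pdf = pd.DataFrame(np.random.rand(100, 3))

# 1) Arrow-enabled path: reproduces the failure reported above.
spark.conf.set('spark.sql.execution.arrow.pyspark.enabled', 'true')
try:
    df = spark.createDataFrame(pdf)
    df.select("*").toPandas()
except Exception as exc:
    print('Arrow-enabled round trip failed:', exc)

# 2) Arrow disabled end to end: both createDataFrame(pdf) and toPandas() fall
#    back to the plain serializers; this should succeed if only the Arrow/Netty
#    code path is broken.
spark.conf.set('spark.sql.execution.arrow.pyspark.enabled', 'false')
df = spark.createDataFrame(pdf)
print(df.select("*").toPandas().head())
{code}

If the non-Arrow round trip succeeds, and the Arrow one only recovers once the Netty flag reaches both the driver (at launch) and the executors, that would point at the Java 11 / Netty reflection restriction rather than at {{toPandas()}} itself.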

> toPandas attempted Arrow optimization but has reached an error and can not continue
> -----------------------------------------------------------------------------------
>
> Key: SPARK-32174
> URL: https://issues.apache.org/jira/browse/SPARK-32174
> Project: Spark
> Issue Type: Bug
> Components: Optimizer, PySpark
> Affects Versions: 3.0.0
> Environment: Spark 3.0.0, running in *stand-alone* mode using docker
> containers based on jupyter docker stack here:
> [https://github.com/jupyter/docker-stacks/blob/master/pyspark-notebook/Dockerfile]
>
> $ conda list | grep arrow
> *arrow-cpp 0.17.1* py38h1234567_5_cpu conda-forge
> *pyarrow 0.17.1* py38h1234567_5_cpu conda-forge
>
> $ conda list | grep pandas
> *pandas 1.0.5* py38hcb8c335_0 conda-forge
>
> *Code to reproduce the error:*
> {{import numpy as np}}
> {{import pandas as pd}}
> {{from pyspark.sql import SparkSession}}
> {{url = }}{{'spark://10.0.1.40:7077'}}{{}}
> {{spark = SparkSession.builder.master(url) \}}
> {{ .appName('test_arrow') \}}
> {{ .getOrCreate()}}
>
> {{# Generate a pandas DataFrame}}
> {{pdf = pd.DataFrame(np.random.rand(100, 3))}}
> {{# Create a Spark DataFrame from a pandas DataFrame using Arrow}}
> {{df = spark.createDataFrame(pdf)}}
> {{# Convert the Spark DataFrame back to a pandas DataFrame using Arrow}}
> {{result_pdf = df.select("*").toPandas()}}
> Reporter: Ramin Hazegh
> Priority: Major
> : org.apache.spark.SparkException: Exception thrown in awaitResult: > at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:302) > at > org.apache.spark.security.SocketAuthServer.getResult(SocketAuthServer.scala:88) > at > org.apache.spark.security.SocketAuthServer.getResult(SocketAuthServer.scala:84) > at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native > Method) > at > java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) > at > java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) > at java.base/java.lang.reflect.Method.invoke(Method.java:566) > at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) > at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357) > at py4j.Gateway.invoke(Gateway.java:282) > at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) > at py4j.commands.CallCommand.execute(CallCommand.java:79) > at py4j.GatewayConnection.run(GatewayConnection.java:238) > at java.base/java.lang.Thread.run(Thread.java:834) > Caused by: org.apache.spark.SparkException: Job aborted due to stage > failure: Task 14 in stage 0.0 failed 4 times, most recent failure: Lost task > 14.3 in stage 0.0 (TID 31, 10.0.1.43, executor 0): > java.lang.UnsupportedOperationException: sun.misc.Unsafe or > java.nio.DirectByteBuffer.<init>(long, int) not available > at > io.netty.util.internal.PlatformDependent.directBuffer(PlatformDependent.java:490) > at io.netty.buffer.NettyArrowBuf.getDirectBuffer(NettyArrowBuf.java:243) > at io.netty.buffer.NettyArrowBuf.nioBuffer(NettyArrowBuf.java:233) > at io.netty.buffer.ArrowBuf.nioBuffer(ArrowBuf.java:245) > at org.apache.arrow.vector.ipc.ReadChannel.readFully(ReadChannel.java:81) > at > org.apache.arrow.vector.ipc.message.MessageSerializer.readMessageBody(MessageSerializer.java:696) > at > org.apache.arrow.vector.ipc.message.MessageSerializer.deserializeRecordBatch(MessageSerializer.java:344) > at > org.apache.spark.sql.execution.arrow.ArrowConverters$.loadBatch(ArrowConverters.scala:189) > at > org.apache.spark.sql.execution.arrow.ArrowConverters$$anon$2.nextBatch(ArrowConverters.scala:165) > at > org.apache.spark.sql.execution.arrow.ArrowConverters$$anon$2.<init>(ArrowConverters.scala:144) > at > org.apache.spark.sql.execution.arrow.ArrowConverters$.fromBatchIterator(ArrowConverters.scala:143) > at > org.apache.spark.sql.execution.arrow.ArrowConverters$.$anonfun$toDataFrame$1(ArrowConverters.scala:203) > at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:837) > at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:837) > at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:313) > at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:313) > at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:313) > at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90) > at org.apache.spark.scheduler.Task.run(Task.scala:127) > at > org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:444) > at 
org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377) > at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:447) > at > java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128) > at > java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628) > at java.base/java.lang.Thread.run(Thread.java:834) > Driver stacktrace: > at > org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2023) > at > org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:1972) > at > org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:1971) > at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62) > at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55) > at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49) > at > org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1971) > at > org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:950) > at > org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:950) > at scala.Option.foreach(Option.scala:407) > at > org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:950) > at > org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2203) > at > org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2152) > at > org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2141) > at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49) > at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:752) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2093) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2188) > at > org.apache.spark.sql.Dataset.$anonfun$collectAsArrowToPython$5(Dataset.scala:3558) > at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23) > at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377) > at > org.apache.spark.sql.Dataset.$anonfun$collectAsArrowToPython$2(Dataset.scala:3562) > at > org.apache.spark.sql.Dataset.$anonfun$collectAsArrowToPython$2$adapted(Dataset.scala:3539) > at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3616) > at > org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:100) > at > org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:160) > at > org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:87) > at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:763) > at > org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64) > at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3614) > at > org.apache.spark.sql.Dataset.$anonfun$collectAsArrowToPython$1(Dataset.scala:3539) > at > org.apache.spark.sql.Dataset.$anonfun$collectAsArrowToPython$1$adapted(Dataset.scala:3538) > at > org.apache.spark.security.SocketAuthServer$.$anonfun$serveToStream$2(SocketAuthServer.scala:130) > at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23) > at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377) > at > org.apache.spark.security.SocketAuthServer$.$anonfun$serveToStream$1(SocketAuthServer.scala:132) > at > 
org.apache.spark.security.SocketAuthServer$.$anonfun$serveToStream$1$adapted(SocketAuthServer.scala:127) > at > org.apache.spark.security.SocketFuncServer.handleConnection(SocketAuthServer.scala:104) > at > org.apache.spark.security.SocketFuncServer.handleConnection(SocketAuthServer.scala:98) > at > org.apache.spark.security.SocketAuthServer$$anon$1.$anonfun$run$1(SocketAuthServer.scala:60) > at scala.util.Try$.apply(Try.scala:213) > at > org.apache.spark.security.SocketAuthServer$$anon$1.run(SocketAuthServer.scala:60) > Caused by: java.lang.UnsupportedOperationException: sun.misc.Unsafe or > java.nio.DirectByteBuffer.<init>(long, int) not available > at > io.netty.util.internal.PlatformDependent.directBuffer(PlatformDependent.java:490) > at io.netty.buffer.NettyArrowBuf.getDirectBuffer(NettyArrowBuf.java:243) > at io.netty.buffer.NettyArrowBuf.nioBuffer(NettyArrowBuf.java:233) > at io.netty.buffer.ArrowBuf.nioBuffer(ArrowBuf.java:245) > at org.apache.arrow.vector.ipc.ReadChannel.readFully(ReadChannel.java:81) > at > org.apache.arrow.vector.ipc.message.MessageSerializer.readMessageBody(MessageSerializer.java:696) > at > org.apache.arrow.vector.ipc.message.MessageSerializer.deserializeRecordBatch(MessageSerializer.java:344) > at > org.apache.spark.sql.execution.arrow.ArrowConverters$.loadBatch(ArrowConverters.scala:189) > at > org.apache.spark.sql.execution.arrow.ArrowConverters$$anon$2.nextBatch(ArrowConverters.scala:165) > at > org.apache.spark.sql.execution.arrow.ArrowConverters$$anon$2.<init>(ArrowConverters.scala:144) > at > org.apache.spark.sql.execution.arrow.ArrowConverters$.fromBatchIterator(ArrowConverters.scala:143) > at > org.apache.spark.sql.execution.arrow.ArrowConverters$.$anonfun$toDataFrame$1(ArrowConverters.scala:203) > at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:837) > at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:837) > at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:313) > at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:313) > at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52) > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:349) > at org.apache.spark.rdd.RDD.iterator(RDD.scala:313) > at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90) > at org.apache.spark.scheduler.Task.run(Task.scala:127) > at > org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:444) > at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1377) > at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:447) > at > java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128) > at > java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628) > at java.base/java.lang.Thread.run(Thread.java:834) > warnings.warn(msg) > --------------------------------------------------------------------------- > Py4JJavaError Traceback (most recent call last) > <ipython-input-1-35099e1ba5bf> in <module> > 15 > 16 # Convert the Spark DataFrame back to a pandas DataFrame using Arrow > ---> 17 result_pdf = df.select("*").toPandas() > /usr/local/spark/python/pyspark/sql/pandas/conversion.py in toPandas(self) > 106 # Rename columns to 
    107             tmp_column_names = ['col_{}'.format(i) for i in range(len(self.columns))]
--> 108             batches = self.toDF(*tmp_column_names)._collect_as_arrow()
    109             if len(batches) > 0:
    110                 table = pyarrow.Table.from_batches(batches)

/usr/local/spark/python/pyspark/sql/pandas/conversion.py in _collect_as_arrow(self)
    242         finally:
    243             # Join serving thread and raise any exceptions from collectAsArrowToPython
--> 244             jsocket_auth_server.getResult()
    245
    246         # Separate RecordBatches from batch order indices in results

/usr/local/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
   1302
   1303         answer = self.gateway_client.send_command(command)
-> 1304         return_value = get_return_value(
   1305             answer, self.gateway_client, self.target_id, self.name)
   1306

/usr/local/spark/python/pyspark/sql/utils.py in deco(*a, **kw)
    129     def deco(*a, **kw):
    130         try:
--> 131             return f(*a, **kw)
    132         except py4j.protocol.Py4JJavaError as e:
    133             converted = convert_exception(e.java_exception)

/usr/local/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
    324             value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
    325             if answer[1] == REFERENCE_TYPE:
--> 326                 raise Py4JJavaError(
    327                     "An error occurred while calling {0}{1}{2}.\n".
    328                     format(target_id, ".", name), value)

Py4JJavaError: An error occurred while calling o55.getResult.
: org.apache.spark.SparkException: Exception thrown in awaitResult:
(the Java stack trace attached to this Py4JJavaError is identical to the SparkException / UnsupportedOperationException trace shown above)
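The root cause in the trace, java.lang.UnsupportedOperationException: sun.misc.Unsafe or java.nio.DirectByteBuffer.<init>(long, int) not available, is the error Netty raises on Java 9+ when it cannot allocate direct buffers reflectively, and the Spark 3.0 documentation notes that Arrow-based conversions on Java 11 additionally require -Dio.netty.tryReflectionSetAccessible=true. Below is a minimal workaround sketch, assuming the driver and executor JVMs in this standalone cluster are Java 9+ and that the flag can be passed through the session builder; the config keys are standard Spark settings, not taken from this report, and the snippet has not been verified against this environment.

{code:python}
from pyspark.sql import SparkSession

# Assumed workaround: on Java 9+, Netty (used by Arrow) needs this system
# property so it can allocate direct buffers; it must reach both the driver
# and the executor JVMs.
netty_flag = "-Dio.netty.tryReflectionSetAccessible=true"

spark = (
    SparkSession.builder
    .config("spark.driver.extraJavaOptions", netty_flag)
    .config("spark.executor.extraJavaOptions", netty_flag)
    .getOrCreate()
)
{code}

If the error persists with the flag set on both sides, setting spark.sql.execution.arrow.pyspark.enabled to false should at least fall back to the non-Arrow conversion path, at the cost of performance.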