[ https://issues.apache.org/jira/browse/SPARK-34351?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17279128#comment-17279128 ]
Huseyin Elci commented on SPARK-34351:
--------------------------------------

I searched Stack Overflow for this issue but found nothing, and I have spent more than three days trying to solve it. I also went through http://spark.apache.org/community.html, which lists many "Py4JJavaError" reports; I checked several of them, but almost none describe the same issue, and those that do offer no fix for this particular "Py4JJavaError".

Running into "Py4JJavaError" while counting a text file or list using PySpark in a Jupyter notebook
---------------------------------------------------------------------------------------------------

                Key: SPARK-34351
                URL: https://issues.apache.org/jira/browse/SPARK-34351
            Project: Spark
         Issue Type: Bug
         Components: PySpark
   Affects Versions: 2.3.1
        Environment:
PS> python --version
*Python 3.6.8*

PS> jupyter --version
*jupyter core     : 4.7.0*
*jupyter-notebook : 6.2.0*
qtconsole        : 5.0.2
ipython          : 7.16.1
ipykernel        : 5.4.3
jupyter client   : 6.1.11
jupyter lab      : not installed
nbconvert        : 6.0.7
ipywidgets       : 7.6.3
nbformat         : 5.1.2
traitlets        : 4.3.3

PS> java -version
*java version "1.8.0_271"*
Java(TM) SE Runtime Environment (build 1.8.0_271-b09)
Java HotSpot(TM) 64-Bit Server VM (build 25.271-b09, mixed mode)

Spark version
*spark-2.3.1-bin-hadoop2.7*

           Reporter: Huseyin Elci
           Priority: Major

I run into the following error with both of the code samples below. Any help resolving it is greatly appreciated.

*My Code 1:*
{code:python}
import findspark
findspark.init(r"C:\Spark")  # point findspark at the local Spark install

from pyspark.sql import SparkSession
from pyspark.conf import SparkConf  # imported but unused in this snippet

spark = SparkSession.builder \
    .master("local[4]") \
    .appName("WordCount_RDD") \
    .getOrCreate()
sc = spark.sparkContext

data = "D:\\05 Spark\\data\\MyArticle.txt"
story_rdd = sc.textFile(data)
story_rdd.count()  # action: the error is raised here
{code}

*My Code 2:*
{code:python}
import findspark
findspark.init(r"C:\Spark")

from pyspark import SparkContext

sc = SparkContext()

mylist = [1, 2, 2, 3, 5, 48, 98, 62, 14, 55]
mylist_rdd = sc.parallelize(mylist)
mylist_rdd.map(lambda x: x * x)            # transformation only; evaluated lazily
mylist_rdd.map(lambda x: x * x).collect()  # action: the error is raised here
{code}
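Worker start-up failures like the one below often come down to which Python interpreter the notebook kernel and the Spark workers resolve. A minimal diagnostic sketch for checking that from the same session (the variables queried are standard PySpark environment variables; the output naturally differs per machine):

{code:python}
import os
import sys

# Interpreter running the driver (the Jupyter kernel itself).
print("driver python:", sys.executable)

# Environment variables PySpark consults when launching the JVM and workers.
for var in ("SPARK_HOME", "JAVA_HOME", "PYSPARK_PYTHON", "PYSPARK_DRIVER_PYTHON"):
    print(var, "=", os.environ.get(var))
{code}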
*ERROR:*
Both code samples fail with the same error:
{code:python}
---------------------------------------------------------------------------
Py4JJavaError                             Traceback (most recent call last)
<ipython-input-9-1af9abd2340f> in <module>
----> 1 story_rdd.count()

C:\Spark\python\pyspark\rdd.py in count(self)
   1071         3
   1072         """
-> 1073         return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
   1074
   1075     def stats(self):

C:\Spark\python\pyspark\rdd.py in sum(self)
   1062         6.0
   1063         """
-> 1064         return self.mapPartitions(lambda x: [sum(x)]).fold(0, operator.add)
   1065
   1066     def count(self):

C:\Spark\python\pyspark\rdd.py in fold(self, zeroValue, op)
    933         # zeroValue provided to each partition is unique from the one provided
    934         # to the final reduce call
--> 935         vals = self.mapPartitions(func).collect()
    936         return reduce(op, vals, zeroValue)
    937

C:\Spark\python\pyspark\rdd.py in collect(self)
    832         """
    833         with SCCallSiteSync(self.context) as css:
--> 834             sock_info = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
    835         return list(_load_from_socket(sock_info, self._jrdd_deserializer))
    836

C:\Spark\python\lib\py4j-0.10.7-src.zip\py4j\java_gateway.py in __call__(self, *args)
   1255         answer = self.gateway_client.send_command(command)
   1256         return_value = get_return_value(
-> 1257             answer, self.gateway_client, self.target_id, self.name)
   1258
   1259         for temp_arg in temp_args:

C:\Spark\python\pyspark\sql\utils.py in deco(*a, **kw)
     61     def deco(*a, **kw):
     62         try:
---> 63             return f(*a, **kw)
     64         except py4j.protocol.Py4JJavaError as e:
     65             s = e.java_exception.toString()

C:\Spark\python\lib\py4j-0.10.7-src.zip\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
    326                 raise Py4JJavaError(
    327                     "An error occurred while calling {0}{1}{2}.\n".
--> 328                     format(target_id, ".", name), value)
    329             else:
    330                 raise Py4JError(

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 0.0 failed 1 times, most recent failure: Lost task 1.0 in stage 0.0 (TID 1, localhost, executor driver): org.apache.spark.SparkException: Python worker failed to connect back.
  at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:148)
  at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:76)
  at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:117)
  at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:86)
  at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:64)
  at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
  at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
  at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
  at org.apache.spark.scheduler.Task.run(Task.scala:109)
  at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
  at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
  at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
  at java.lang.Thread.run(Thread.java:748)
Caused by: java.net.SocketTimeoutException: Accept timed out
  at java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
  at java.net.DualStackPlainSocketImpl.socketAccept(DualStackPlainSocketImpl.java:131)
  at java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:535)
  at java.net.PlainSocketImpl.accept(PlainSocketImpl.java:189)
  at java.net.ServerSocket.implAccept(ServerSocket.java:545)
  at java.net.ServerSocket.accept(ServerSocket.java:513)
  at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:142)
  ... 12 more

Driver stacktrace:
  at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1602)
  at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1590)
  at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1589)
  at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
  at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
  at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1589)
  at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
  at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
  at scala.Option.foreach(Option.scala:257)
  at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
  at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1823)
  at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1772)
  at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1761)
  at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
  at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
  at org.apache.spark.SparkContext.runJob(SparkContext.scala:2034)
  at org.apache.spark.SparkContext.runJob(SparkContext.scala:2055)
  at org.apache.spark.SparkContext.runJob(SparkContext.scala:2074)
  at org.apache.spark.SparkContext.runJob(SparkContext.scala:2099)
  at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:939)
  at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
  at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
  at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
  at org.apache.spark.rdd.RDD.collect(RDD.scala:938)
  at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:162)
  at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
  at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
  at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
  at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
  at java.lang.reflect.Method.invoke(Method.java:498)
  at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
  at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
  at py4j.Gateway.invoke(Gateway.java:282)
  at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
  at py4j.commands.CallCommand.execute(CallCommand.java:79)
  at py4j.GatewayConnection.run(GatewayConnection.java:238)
  at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Python worker failed to connect back.
  at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:148)
  at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:76)
  at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:117)
  at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:86)
  at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:64)
  at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
  at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
  at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
  at org.apache.spark.scheduler.Task.run(Task.scala:109)
  at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
  at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
  at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
  ... 1 more
Caused by: java.net.SocketTimeoutException: Accept timed out
  at java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
  at java.net.DualStackPlainSocketImpl.socketAccept(DualStackPlainSocketImpl.java:131)
  at java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:535)
  at java.net.PlainSocketImpl.accept(PlainSocketImpl.java:189)
  at java.net.ServerSocket.implAccept(ServerSocket.java:545)
  at java.net.ServerSocket.accept(ServerSocket.java:513)
  at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:142)
  ... 12 more
{code}
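On Windows, the combination of "Python worker failed to connect back" and "java.net.SocketTimeoutException: Accept timed out" is commonly worked around by pinning the worker interpreter explicitly before the SparkContext is created, via the standard PYSPARK_PYTHON / PYSPARK_DRIVER_PYTHON environment variables. A minimal sketch, assuming a hypothetical interpreter location (substitute the actual python.exe path from the failing machine):

{code:python}
import os

# Hypothetical interpreter path; replace with the real python.exe location.
PYTHON_EXE = r"C:\Python36\python.exe"

# Make the driver and the launched workers use the same interpreter.
os.environ["PYSPARK_PYTHON"] = PYTHON_EXE
os.environ["PYSPARK_DRIVER_PYTHON"] = PYTHON_EXE

import findspark
findspark.init(r"C:\Spark")

from pyspark import SparkContext

sc = SparkContext(master="local[4]", appName="ConnectBackCheck")
print(sc.parallelize(range(10)).map(lambda x: x * x).collect())
sc.stop()
{code}

If this sketch runs cleanly, the original failure most likely came from the workers resolving a different (or missing) Python interpreter than the driver, rather than from the jobs themselves.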