[ https://issues.apache.org/jira/browse/SPARK-10189?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
ABHISHEK CHOUDHARY updated SPARK-10189: --------------------------------------- Description: I am trying to use wholeTextFiles with pyspark , and now I am getting the same error - {code} textFiles = sc.wholeTextFiles('/file/content') textFiles.take(1) Traceback (most recent call last): File "<stdin>", line 1, in <module> File "/Volumes/work/bigdata/CHD5.4/spark-1.4.1-bin-hadoop2.6/python/pyspark/rdd.py", line 1277, in take res = self.context.runJob(self, takeUpToNumLeft, p, True) File "/Volumes/work/bigdata/CHD5.4/spark-1.4.1-bin-hadoop2.6/python/pyspark/context.py", line 898, in runJob return list(_load_from_socket(port, mappedRDD._jrdd_deserializer)) File "/Volumes/work/bigdata/CHD5.4/spark-1.4.1-bin-hadoop2.6/python/pyspark/rdd.py", line 138, in _load_from_socket raise Exception("could not open socket") Exception: could not open socket >>> 15/08/24 20:09:27 ERROR PythonRDD: Error while sending iterator java.net.SocketTimeoutException: Accept timed out at java.net.PlainSocketImpl.socketAccept(Native Method) at java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:404) at java.net.ServerSocket.implAccept(ServerSocket.java:545) at java.net.ServerSocket.accept(ServerSocket.java:513) at org.apache.spark.api.python.PythonRDD$$anon$2.run(PythonRDD.scala:623) {code} Current piece of code in rdd.py- {code:title=rdd.py|borderStyle=solid} def _load_from_socket(port, serializer): sock = None # Support for both IPv4 and IPv6. # On most of IPv6-ready systems, IPv6 will take precedence. 
for res in socket.getaddrinfo("localhost", port, socket.AF_UNSPEC, socket.SOCK_STREAM): af, socktype, proto, canonname, sa = res try: sock = socket.socket(af, socktype, proto) sock.settimeout(3) sock.connect(sa) except socket.error: sock = None continue break if not sock: raise Exception("could not open socket") try: rf = sock.makefile("rb", 65536) for item in serializer.load_stream(rf): yield item finally: sock.close() {code} On further investigating the issue, I realized that in context.py, runJob is not actually starting the server, so there is nothing to connect to - {code:title=context.py|borderStyle=solid} port = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, partitions) {code} was: I am trying to use wholeTextFiles with pyspark , and now I am getting the same error - {code} textFiles = sc.wholeTextFiles('/file/content') textFiles.take(1) Traceback (most recent call last): File "<stdin>", line 1, in <module> File "/Volumes/work/bigdata/CHD5.4/spark-1.4.1-bin-hadoop2.6/python/pyspark/rdd.py", line 1277, in take res = self.context.runJob(self, takeUpToNumLeft, p, True) File "/Volumes/work/bigdata/CHD5.4/spark-1.4.1-bin-hadoop2.6/python/pyspark/context.py", line 898, in runJob return list(_load_from_socket(port, mappedRDD._jrdd_deserializer)) File "/Volumes/work/bigdata/CHD5.4/spark-1.4.1-bin-hadoop2.6/python/pyspark/rdd.py", line 138, in _load_from_socket raise Exception("could not open socket") Exception: could not open socket >>> 15/08/24 20:09:27 ERROR PythonRDD: Error while sending iterator java.net.SocketTimeoutException: Accept timed out at java.net.PlainSocketImpl.socketAccept(Native Method) at java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:404) at java.net.ServerSocket.implAccept(ServerSocket.java:545) at java.net.ServerSocket.accept(ServerSocket.java:513) at org.apache.spark.api.python.PythonRDD$$anon$2.run(PythonRDD.scala:623) {code} Current piece of code in rdd.py- {code:title=rdd.py|borderStyle=solid} def
_load_from_socket(port, serializer): sock = None # Support for both IPv4 and IPv6. # On most of IPv6-ready systems, IPv6 will take precedence. for res in socket.getaddrinfo("localhost", port, socket.AF_UNSPEC, socket.SOCK_STREAM): af, socktype, proto, canonname, sa = res try: sock = socket.socket(af, socktype, proto) sock.settimeout(3) sock.connect(sa) except socket.error: sock = None continue break if not sock: raise Exception("could not open socket") try: rf = sock.makefile("rb", 65536) for item in serializer.load_stream(rf): yield item finally: sock.close() {code} On further investigate the issue , i realized that in context.py , runJob is not actually triggering the server and so there is nothing to connect - ``` port = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, partitions) ``` > python rdd socket connection problem > ------------------------------------ > > Key: SPARK-10189 > URL: https://issues.apache.org/jira/browse/SPARK-10189 > Project: Spark > Issue Type: Bug > Components: PySpark > Affects Versions: 1.4.1 > Reporter: ABHISHEK CHOUDHARY > Labels: pyspark, socket > > I am trying to use wholeTextFiles with pyspark , and now I am getting the > same error - > {code} > textFiles = sc.wholeTextFiles('/file/content') > textFiles.take(1) > Traceback (most recent call last): > File "<stdin>", line 1, in <module> > File > "/Volumes/work/bigdata/CHD5.4/spark-1.4.1-bin-hadoop2.6/python/pyspark/rdd.py", > line 1277, in take > res = self.context.runJob(self, takeUpToNumLeft, p, True) > File > "/Volumes/work/bigdata/CHD5.4/spark-1.4.1-bin-hadoop2.6/python/pyspark/context.py", > line 898, in runJob > return list(_load_from_socket(port, mappedRDD._jrdd_deserializer)) > File > "/Volumes/work/bigdata/CHD5.4/spark-1.4.1-bin-hadoop2.6/python/pyspark/rdd.py", > line 138, in _load_from_socket > raise Exception("could not open socket") > Exception: could not open socket > >>> 15/08/24 20:09:27 ERROR PythonRDD: Error while sending iterator > 
java.net.SocketTimeoutException: Accept timed out > at java.net.PlainSocketImpl.socketAccept(Native Method) > at > java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:404) > at java.net.ServerSocket.implAccept(ServerSocket.java:545) > at java.net.ServerSocket.accept(ServerSocket.java:513) > at org.apache.spark.api.python.PythonRDD$$anon$2.run(PythonRDD.scala:623) > {code} > Current piece of code in rdd.py- > {code:title=rdd.py|borderStyle=solid} > def _load_from_socket(port, serializer): > sock = None > # Support for both IPv4 and IPv6. > # On most of IPv6-ready systems, IPv6 will take precedence. > for res in socket.getaddrinfo("localhost", port, socket.AF_UNSPEC, > socket.SOCK_STREAM): > af, socktype, proto, canonname, sa = res > try: > sock = socket.socket(af, socktype, proto) > sock.settimeout(3) > sock.connect(sa) > except socket.error: > sock = None > continue > break > if not sock: > raise Exception("could not open socket") > try: > rf = sock.makefile("rb", 65536) > for item in serializer.load_stream(rf): > yield item > finally: > sock.close() > {code} > On further investigate the issue , i realized that in context.py , runJob is > not actually triggering the server and so there is nothing to connect - > {code:title=context.py|borderStyle=solid} > port = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, partitions) > {code} -- This message was sent by Atlassian JIRA (v6.3.4#6332) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org