Note the extra backslash at the end of the path below; I have sometimes seen this kind of issue.
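If it helps, here is a quick, untested probe for the usual Windows path spellings. It assumes a live SparkContext bound to sc, as in the original script, and the candidate list is only for illustration:

# Untested probe (illustrative only): try the usual Windows path spellings
# against sc.textFile and see which ones Hadoop accepts.
# Assumes a live SparkContext `sc`, as in the original script.
candidates = [
    "D:\\Project\\Spark\\code\\news\\jsonfeeds",     # no trailing separator
    "D:\\Project\\Spark\\code\\news\\jsonfeeds\\",   # trailing backslash
    "D:/Project/Spark/code/news/jsonfeeds",          # forward slashes also work on Windows
    "file:///D:/Project/Spark/code/news/jsonfeeds",  # explicit local-filesystem URI
]
for loc in candidates:
    try:
        print loc, "->", sc.textFile(loc).count()
    except Exception as e:
        print loc, "-> failed:", type(e).__name__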

On 25 April 2015 at 20:50, Jeetendra Gangele <gangele...@gmail.com> wrote:

> loc = "D:\\Project\\Spark\\code\\news\\jsonfeeds\\"
>
> On 25 April 2015 at 20:49, Jeetendra Gangele <gangele...@gmail.com> wrote:
>
>> Hi Ayan, can you try the line below:
>>
>> loc = "D:\\Project\\Spark\\code\\news\\jsonfeeds"
>>
>> On 25 April 2015 at 20:08, ayan guha <guha.a...@gmail.com> wrote:
>>
>>> Hi,
>>>
>>> I am facing this weird issue.
>>>
>>> I am on Windows, and I am trying to load all the files within a folder.
>>> Here is my code:
>>>
>>> loc = "D:\\Project\\Spark\\code\\news\\jsonfeeds"
>>> newsY = sc.textFile(loc)
>>> print newsY.count()
>>>
>>> Even this simple code fails. When I give exact file names instead,
>>> everything works.
>>>
>>> Am I missing something stupid here? Is anyone else facing this (does
>>> anyone still use Windows? :))
>>>
>>> Here is the error trace:
>>>
>>> D:\Project\Spark\code\news\jsonfeeds
>>>
>>> Traceback (most recent call last):
>>>   File "D:/Project/Spark/code/newsfeeder.py", line 28, in <module>
>>>     print newsY.count()
>>>   File "D:\spark\spark-1.3.1-bin-hadoop2.6\spark-1.3.1-bin-hadoop2.6\spark-1.3.1-bin-hadoop2.6\python\pyspark\rdd.py", line 932, in count
>>>     return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
>>>   File "D:\spark\spark-1.3.1-bin-hadoop2.6\spark-1.3.1-bin-hadoop2.6\spark-1.3.1-bin-hadoop2.6\python\pyspark\rdd.py", line 923, in sum
>>>     return self.mapPartitions(lambda x: [sum(x)]).reduce(operator.add)
>>>   File "D:\spark\spark-1.3.1-bin-hadoop2.6\spark-1.3.1-bin-hadoop2.6\spark-1.3.1-bin-hadoop2.6\python\pyspark\rdd.py", line 739, in reduce
>>>     vals = self.mapPartitions(func).collect()
>>>   File "D:\spark\spark-1.3.1-bin-hadoop2.6\spark-1.3.1-bin-hadoop2.6\spark-1.3.1-bin-hadoop2.6\python\pyspark\rdd.py", line 713, in collect
>>>     port = self.ctx._jvm.PythonRDD.collectAndServe(self._jrdd.rdd())
>>>   File "C:\Python27\lib\site-packages\py4j\java_gateway.py", line 537, in __call__
>>>     self.target_id, self.name)
>>>   File "C:\Python27\lib\site-packages\py4j\protocol.py", line 300, in get_return_value
>>>     format(target_id, '.', name), value)
>>> Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
>>> : java.lang.NullPointerException
>>>         at java.lang.ProcessBuilder.start(Unknown Source)
>>>         at org.apache.hadoop.util.Shell.runCommand(Shell.java:482)
>>>         at org.apache.hadoop.util.Shell.run(Shell.java:455)
>>>         at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:715)
>>>         at org.apache.hadoop.util.Shell.execCommand(Shell.java:808)
>>>         at org.apache.hadoop.util.Shell.execCommand(Shell.java:791)
>>>         at org.apache.hadoop.fs.FileUtil.execCommand(FileUtil.java:1097)
>>>         at org.apache.hadoop.fs.RawLocalFileSystem$DeprecatedRawLocalFileStatus.loadPermissionInfo(RawLocalFileSystem.java:582)
>>>         at org.apache.hadoop.fs.RawLocalFileSystem$DeprecatedRawLocalFileStatus.getPermission(RawLocalFileSystem.java:557)
>>>         at org.apache.hadoop.fs.LocatedFileStatus.<init>(LocatedFileStatus.java:42)
>>>         at org.apache.hadoop.fs.FileSystem$4.next(FileSystem.java:1699)
>>>         at org.apache.hadoop.fs.FileSystem$4.next(FileSystem.java:1681)
>>>         at org.apache.hadoop.mapred.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:268)
>>>         at org.apache.hadoop.mapred.FileInputFormat.listStatus(FileInputFormat.java:228)
>>>         at org.apache.hadoop.mapred.FileInputFormat.getSplits(FileInputFormat.java:313)
>>>         at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:203)
>>>         at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:219)
>>>         at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:217)
>>>         at scala.Option.getOrElse(Option.scala:120)
>>>         at org.apache.spark.rdd.RDD.partitions(RDD.scala:217)
>>>         at org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:32)
>>>         at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:219)
>>>         at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:217)
>>>         at scala.Option.getOrElse(Option.scala:120)
>>>         at org.apache.spark.rdd.RDD.partitions(RDD.scala:217)
>>>         at org.apache.spark.api.python.PythonRDD.getPartitions(PythonRDD.scala:57)
>>>         at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:219)
>>>         at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:217)
>>>         at scala.Option.getOrElse(Option.scala:120)
>>>         at org.apache.spark.rdd.RDD.partitions(RDD.scala:217)
>>>         at org.apache.spark.SparkContext.runJob(SparkContext.scala:1512)
>>>         at org.apache.spark.rdd.RDD.collect(RDD.scala:813)
>>>         at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:374)
>>>         at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
>>>         at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
>>>         at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
>>>         at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
>>>         at java.lang.reflect.Method.invoke(Unknown Source)
>>>         at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
>>>         at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379)
>>>         at py4j.Gateway.invoke(Gateway.java:259)
>>>         at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
>>>         at py4j.commands.CallCommand.execute(CallCommand.java:79)
>>>         at py4j.GatewayConnection.run(GatewayConnection.java:207)
>>>         at java.lang.Thread.run(Unknown Source)
>>>
>>> --
>>> Best Regards,
>>> Ayan Guha
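Ayan, since exact file names work for you, handing textFile the files rather than the bare folder may also be worth a try while the directory case is unclear. This is an untested sketch; it assumes a live SparkContext sc, and the *.json pattern is my guess at the folder contents, so adjust it to whatever is actually in there:

# Untested sketch: pass textFile the individual files instead of the folder.
# Assumes a live SparkContext `sc`; the *.json pattern is an assumption
# about the folder contents.
import glob

folder = "D:/Project/Spark/code/news/jsonfeeds"  # forward slashes also work on Windows

# Option 1: a wildcard -- textFile supports Hadoop-style globs.
newsY = sc.textFile(folder + "/*.json")

# Option 2: an explicit comma-separated list of files, mirroring the
# exact-file-name case that already works; normalize the separators
# that glob produces on Windows.
files = [f.replace("\\", "/") for f in glob.glob(folder + "/*.json")]
newsY = sc.textFile(",".join(files))

print newsY.count()

Separately, this particular NullPointerException coming out of java.lang.ProcessBuilder.start via org.apache.hadoop.util.Shell is, on Windows, often a symptom of Hadoop not finding winutils.exe. It may be worth checking that HADOOP_HOME is set and points at a directory containing bin\winutils.exe; that is a guess about your environment, not something the trace proves.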