[ https://issues.apache.org/jira/browse/SPARK-3972?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14174110#comment-14174110 ]
Michael Griffiths commented on SPARK-3972:
------------------------------------------

This issue does NOT occur if I build Spark from source, using Bash and sbt\sbt assembly. It is restricted to the pre-compiled version.

> PySpark Error on Windows with sc.wholeTextFiles
> -----------------------------------------------
>
>                  Key: SPARK-3972
>                  URL: https://issues.apache.org/jira/browse/SPARK-3972
>              Project: Spark
>           Issue Type: Bug
>           Components: Input/Output, PySpark, Windows
>     Affects Versions: 1.1.0
>          Environment: Windows 8.1 x64
>                       Java SE Version 8 Update 20 (build 1.8.0_20-b26);
>                       Python 2.7.7
>             Reporter: Michael Griffiths
>             Priority: Minor
>
> When running sc.wholeTextFiles() on a directory, I can run the command but not do anything with the resulting RDD – specifically, I get an error in py4j.protocol.Py4JJavaError; the error is unspecified. This occurs even though I can read the text file(s) individually with sc.textFile().
>
> Steps followed:
> 1) Download Spark 1.1.0 (pre-built for Hadoop 2.4: [spark-1.1.0-bin-hadoop2.4.tgz|http://d3kbcqa49mib13.cloudfront.net/spark-1.1.0-bin-hadoop2.4.tgz]).
> 2) Extract it into a folder at the root of the drive: **D:\spark**.
> 3) Create a test folder at **D:\testdata** with one (HTML) file in it.
> 4) Launch PySpark via **bin\pyspark**.
> 5) Try to use sc.wholeTextFiles('d:/testdata'); it fails.
>
> Note: I followed the instructions from the upcoming O'Reilly book [Learning Spark|http://shop.oreilly.com/product/0636920028512.do] for this. I do not have any related tools (e.g. Hadoop) installed on the Windows machine.
> See the session below, with tracebacks from the errors.
> {noformat}
> Welcome to
>       ____              __
>      / __/__  ___ _____/ /__
>     _\ \/ _ \/ _ `/ __/  '_/
>    /__ / .__/\_,_/_/ /_/\_\   version 1.1.0
>       /_/
>
> Using Python version 2.7.7 (default, Jun 11 2014 10:40:02)
> SparkContext available as sc.
> >>> file = sc.textFile("d:/testdata/0000cbcc5b470ec06f212990c68c8f76e887b884")
> >>> file.count()
> 732
> >>> file.first()
> u'<!DOCTYPE html>'
> >>> data = sc.wholeTextFiles('d:/testdata')
> >>> data.first()
> Traceback (most recent call last):
>   File "<stdin>", line 1, in <module>
>   File "D:\spark\python\pyspark\rdd.py", line 1167, in first
>     return self.take(1)[0]
>   File "D:\spark\python\pyspark\rdd.py", line 1126, in take
>     totalParts = self._jrdd.partitions().size()
>   File "D:\spark\python\lib\py4j-0.8.2.1-src.zip\py4j\java_gateway.py", line 538, in __call__
>   File "D:\spark\python\lib\py4j-0.8.2.1-src.zip\py4j\protocol.py", line 300, in get_return_value
> py4j.protocol.Py4JJavaError: An error occurred while calling o21.partitions.
> : java.lang.NullPointerException
>         at java.lang.ProcessBuilder.start(Unknown Source)
>         at org.apache.hadoop.util.Shell.runCommand(Shell.java:445)
>         at org.apache.hadoop.util.Shell.run(Shell.java:418)
>         at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:650)
>         at org.apache.hadoop.util.Shell.execCommand(Shell.java:739)
>         at org.apache.hadoop.util.Shell.execCommand(Shell.java:722)
>         at org.apache.hadoop.fs.FileUtil.execCommand(FileUtil.java:1097)
>         at org.apache.hadoop.fs.RawLocalFileSystem$DeprecatedRawLocalFileStatus.loadPermissionInfo(RawLocalFileSystem.java:559)
>         at org.apache.hadoop.fs.RawLocalFileSystem$DeprecatedRawLocalFileStatus.getPermission(RawLocalFileSystem.java:534)
>         at org.apache.hadoop.fs.LocatedFileStatus.<init>(LocatedFileStatus.java:42)
>         at org.apache.hadoop.fs.FileSystem$4.next(FileSystem.java:1697)
>         at org.apache.hadoop.fs.FileSystem$4.next(FileSystem.java:1679)
>         at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:302)
>         at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.listStatus(FileInputFormat.java:263)
>         at org.apache.spark.input.WholeTextFileInputFormat.setMaxSplitSize(WholeTextFileInputFormat.scala:54)
>         at org.apache.spark.rdd.WholeTextFileRDD.getPartitions(NewHadoopRDD.scala:219)
>         at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:204)
>         at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:202)
>         at scala.Option.getOrElse(Option.scala:120)
>         at org.apache.spark.rdd.RDD.partitions(RDD.scala:202)
>         at org.apache.spark.api.java.JavaRDDLike$class.partitions(JavaRDDLike.scala:50)
>         at org.apache.spark.api.java.JavaPairRDD.partitions(JavaPairRDD.scala:44)
>         at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
>         at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
>         at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
>         at java.lang.reflect.Method.invoke(Unknown Source)
>         at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
>         at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379)
>         at py4j.Gateway.invoke(Gateway.java:259)
>         at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
>         at py4j.commands.CallCommand.execute(CallCommand.java:79)
>         at py4j.GatewayConnection.run(GatewayConnection.java:207)
>         at java.lang.Thread.run(Unknown Source)
> >>> data.count()
> Traceback (most recent call last):
>   File "<stdin>", line 1, in <module>
>   File "D:\spark\python\pyspark\rdd.py", line 847, in count
>     return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum()
>   File "D:\spark\python\pyspark\rdd.py", line 838, in sum
>     return self.mapPartitions(lambda x: [sum(x)]).reduce(operator.add)
>   File "D:\spark\python\pyspark\rdd.py", line 759, in reduce
>     vals = self.mapPartitions(func).collect()
>   File "D:\spark\python\pyspark\rdd.py", line 723, in collect
>     bytesInJava = self._jrdd.collect().iterator()
>   File "D:\spark\python\lib\py4j-0.8.2.1-src.zip\py4j\java_gateway.py", line 538, in __call__
>   File "D:\spark\python\lib\py4j-0.8.2.1-src.zip\py4j\protocol.py", line 300, in get_return_value
> py4j.protocol.Py4JJavaError: An error occurred while calling o28.collect.
> : java.lang.NullPointerException
>         at java.lang.ProcessBuilder.start(Unknown Source)
>         at org.apache.hadoop.util.Shell.runCommand(Shell.java:445)
>         at org.apache.hadoop.util.Shell.run(Shell.java:418)
>         at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:650)
>         at org.apache.hadoop.util.Shell.execCommand(Shell.java:739)
>         at org.apache.hadoop.util.Shell.execCommand(Shell.java:722)
>         at org.apache.hadoop.fs.FileUtil.execCommand(FileUtil.java:1097)
>         at org.apache.hadoop.fs.RawLocalFileSystem$DeprecatedRawLocalFileStatus.loadPermissionInfo(RawLocalFileSystem.java:559)
>         at org.apache.hadoop.fs.RawLocalFileSystem$DeprecatedRawLocalFileStatus.getPermission(RawLocalFileSystem.java:534)
>         at org.apache.hadoop.fs.LocatedFileStatus.<init>(LocatedFileStatus.java:42)
>         at org.apache.hadoop.fs.FileSystem$4.next(FileSystem.java:1697)
>         at org.apache.hadoop.fs.FileSystem$4.next(FileSystem.java:1679)
>         at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:302)
>         at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.listStatus(FileInputFormat.java:263)
>         at org.apache.spark.input.WholeTextFileInputFormat.setMaxSplitSize(WholeTextFileInputFormat.scala:54)
>         at org.apache.spark.rdd.WholeTextFileRDD.getPartitions(NewHadoopRDD.scala:219)
>         at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:204)
>         at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:202)
>         at scala.Option.getOrElse(Option.scala:120)
>         at org.apache.spark.rdd.RDD.partitions(RDD.scala:202)
>         at org.apache.spark.api.python.PythonRDD.getPartitions(PythonRDD.scala:56)
>         at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:204)
>         at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:202)
>         at scala.Option.getOrElse(Option.scala:120)
>         at org.apache.spark.rdd.RDD.partitions(RDD.scala:202)
>         at org.apache.spark.SparkContext.runJob(SparkContext.scala:1135)
>         at org.apache.spark.rdd.RDD.collect(RDD.scala:774)
>         at org.apache.spark.api.java.JavaRDDLike$class.collect(JavaRDDLike.scala:305)
>         at org.apache.spark.api.java.JavaRDD.collect(JavaRDD.scala:32)
>         at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
>         at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
>         at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
>         at java.lang.reflect.Method.invoke(Unknown Source)
>         at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
>         at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379)
>         at py4j.Gateway.invoke(Gateway.java:259)
>         at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
>         at py4j.commands.CallCommand.execute(CallCommand.java:79)
>         at py4j.GatewayConnection.run(GatewayConnection.java:207)
>         at java.lang.Thread.run(Unknown Source)
> >>> data.map(lambda x: len(x)).take(1)
> Traceback (most recent call last):
>   File "<stdin>", line 1, in <module>
>   File "D:\spark\python\pyspark\rdd.py", line 1126, in take
>     totalParts = self._jrdd.partitions().size()
>   File "D:\spark\python\lib\py4j-0.8.2.1-src.zip\py4j\java_gateway.py", line 538, in __call__
>   File "D:\spark\python\lib\py4j-0.8.2.1-src.zip\py4j\protocol.py", line 300, in get_return_value
> py4j.protocol.Py4JJavaError: An error occurred while calling o61.partitions.
> : java.lang.NullPointerException
>         at java.lang.ProcessBuilder.start(Unknown Source)
>         at org.apache.hadoop.util.Shell.runCommand(Shell.java:445)
>         at org.apache.hadoop.util.Shell.run(Shell.java:418)
>         at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:650)
>         at org.apache.hadoop.util.Shell.execCommand(Shell.java:739)
>         at org.apache.hadoop.util.Shell.execCommand(Shell.java:722)
>         at org.apache.hadoop.fs.FileUtil.execCommand(FileUtil.java:1097)
>         at org.apache.hadoop.fs.RawLocalFileSystem$DeprecatedRawLocalFileStatus.loadPermissionInfo(RawLocalFileSystem.java:559)
>         at org.apache.hadoop.fs.RawLocalFileSystem$DeprecatedRawLocalFileStatus.getPermission(RawLocalFileSystem.java:534)
>         at org.apache.hadoop.fs.LocatedFileStatus.<init>(LocatedFileStatus.java:42)
>         at org.apache.hadoop.fs.FileSystem$4.next(FileSystem.java:1697)
>         at org.apache.hadoop.fs.FileSystem$4.next(FileSystem.java:1679)
>         at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.singleThreadedListStatus(FileInputFormat.java:302)
>         at org.apache.hadoop.mapreduce.lib.input.FileInputFormat.listStatus(FileInputFormat.java:263)
>         at org.apache.spark.input.WholeTextFileInputFormat.setMaxSplitSize(WholeTextFileInputFormat.scala:54)
>         at org.apache.spark.rdd.WholeTextFileRDD.getPartitions(NewHadoopRDD.scala:219)
>         at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:204)
>         at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:202)
>         at scala.Option.getOrElse(Option.scala:120)
>         at org.apache.spark.rdd.RDD.partitions(RDD.scala:202)
>         at org.apache.spark.api.python.PythonRDD.getPartitions(PythonRDD.scala:56)
>         at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:204)
>         at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:202)
>         at scala.Option.getOrElse(Option.scala:120)
>         at org.apache.spark.rdd.RDD.partitions(RDD.scala:202)
>         at org.apache.spark.api.java.JavaRDDLike$class.partitions(JavaRDDLike.scala:50)
>         at org.apache.spark.api.java.JavaRDD.partitions(JavaRDD.scala:32)
>         at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
>         at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
>         at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
>         at java.lang.reflect.Method.invoke(Unknown Source)
>         at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
>         at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379)
>         at py4j.Gateway.invoke(Gateway.java:259)
>         at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
>         at py4j.commands.CallCommand.execute(CallCommand.java:79)
>         at py4j.GatewayConnection.run(GatewayConnection.java:207)
>         at java.lang.Thread.run(Unknown Source)
> {noformat}
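For reference, the failure can also be reproduced outside the interactive shell with a short standalone script. The sketch below is illustrative only: it reuses the paths from the report above, but the explicit SparkContext construction and app name are assumptions, since the original session used the `sc` created by bin\pyspark.

{code:python}
# Minimal standalone reproduction sketch for SPARK-3972 (illustrative).
# Assumes the pre-built Spark 1.1.0 for Hadoop 2.4 extracted to D:\spark and a
# test directory D:\testdata containing a single HTML file, as described above.
from pyspark import SparkContext

# Creating the context directly is an assumption; the original session used
# the `sc` provided by bin\pyspark.
sc = SparkContext("local[1]", "wholeTextFiles-repro")

# Reading an individual file with textFile() works on the affected setup:
single = sc.textFile("d:/testdata/0000cbcc5b470ec06f212990c68c8f76e887b884")
print single.count()   # 732 in the session above
print single.first()   # u'<!DOCTYPE html>'

# wholeTextFiles() only fails once the RDD is evaluated, because that is when
# its partitions are computed and the directory listing runs:
pairs = sc.wholeTextFiles("d:/testdata")
print pairs.first()    # raises py4j.protocol.Py4JJavaError on the affected setup

sc.stop()
{code}

Consistent with this, every failing call in the session (first(), count(), take()) dies at the same point: computing the partitions of the wholeTextFiles RDD, where Hadoop's directory listing throws the NullPointerException shown in the traces.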