I'm trying to run a simple pipeline using PySpark, version 1.0.1. I've created an RDD over a Parquet file and am mapping the contents with a transformer function, and now wish to write the data out to HDFS.
All of the executors fail with the same stack trace (below). I do get a directory on HDFS, but it's empty except for a file named _temporary. Any ideas? java.io.IOException (java.io.IOException: Filesystem closed) org.apache.hadoop.hdfs.DFSClient.checkOpen(DFSClient.java:629) org.apache.hadoop.hdfs.DFSInputStream.readWithStrategy(DFSInputStream.java:735) org.apache.hadoop.hdfs.DFSInputStream.read(DFSInputStream.java:793) java.io.DataInputStream.readFully(DataInputStream.java:195) java.io.DataInputStream.readFully(DataInputStream.java:169) parquet.hadoop.ParquetFileReader$Chunk.<init>(ParquetFileReader.java:369) parquet.hadoop.ParquetFileReader$Chunk.<init>(ParquetFileReader.java:362) parquet.hadoop.ParquetFileReader.readColumnChunkPages(ParquetFileReader.java:411) parquet.hadoop.ParquetFileReader.readNextRowGroup(ParquetFileReader.java:349) parquet.hadoop.InternalParquetRecordReader.checkRead(InternalParquetRecordReader.java:100) parquet.hadoop.InternalParquetRecordReader.nextKeyValue(InternalParquetRecordReader.java:172) parquet.hadoop.ParquetRecordReader.nextKeyValue(ParquetRecordReader.java:130) org.apache.spark.rdd.NewHadoopRDD$$anon$1.hasNext(NewHadoopRDD.scala:122) org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:39) scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327) scala.collection.Iterator$$anon$14.hasNext(Iterator.scala:388) scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327) scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327) scala.collection.Iterator$GroupedIterator.fill(Iterator.scala:966) scala.collection.Iterator$GroupedIterator.hasNext(Iterator.scala:972) scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327) org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:293) org.apache.spark.api.python.PythonRDD$WriterThread$$anonfun$run$1.apply$mcV$sp(PythonRDD.scala:200) org.apache.spark.api.python.PythonRDD$WriterThread$$anonfun$run$1.apply(PythonRDD.scala:175) 
org.apache.spark.api.python.PythonRDD$WriterThread$$anonfun$run$1.apply(PythonRDD.scala:175) org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1160) org.apache.spark.api.python.PythonRDD$WriterThread.run(PythonRDD.scala:174)