I'm using Spark SQL to query one partition at a time from a Hive external table that sits atop .gzip data, and then saving that partition to a new HDFS location as a set of Snappy-compressed Parquet files using .saveAsParquetFile().
The query completes successfully, but then I get a vague error message that, I think, says one of the temporary files is missing. Any idea what the issue is here? -------------- --------------------------------------------------------------------------- Py4JJavaError Traceback (most recent call last) <ipython-input-22-39049a9c70f8> in <module>() 8 status, output = commands.getstatusoutput("hdfs dfs -rmr " + save_path) 9 partitions = int((df.count() // 750000)+1) ---> 10 df.repartition(partitions).saveAsParquetFile(save_path + '/' + str(dt1)[:10]) 11 12 dt1 += timedelta(days=1) /opt/cloudera/parcels/CDH/lib/spark/python/pyspark/sql/dataframe.py in saveAsParquetFile(self, path) 119 True 120 """ --> 121 self._jdf.saveAsParquetFile(path) 122 123 def registerTempTable(self, name): /opt/cloudera/parcels/CDH/lib/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in __call__(self, *args) 536 answer = self.gateway_client.send_command(command) 537 return_value = get_return_value(answer, self.gateway_client, --> 538 self.target_id, self.name) 539 540 for temp_arg in temp_args: /opt/cloudera/parcels/CDH/lib/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name) 298 raise Py4JJavaError( 299 'An error occurred while calling {0}{1}{2}.\n'. --> 300 format(target_id, '.', name), value) 301 else: 302 raise Py4JError( Py4JJavaError: An error occurred while calling o206.saveAsParquetFile. 
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 72 in stage 28.1 failed 4 times, most recent failure: Lost task 72.3 in stage 28.1 (TID 603, phd4.xxx.com): java.io.FileNotFoundException: /hdata/1/yarn/nm/usercache/me/appcache/application_1446009923448_20507/blockmgr-20140dd7-7f36-4971-afe7-0278241f4439/22/temp_shuffle_c274830e-f626-4ae6-8923-ad72c561a84e (No such file or directory) at java.io.FileOutputStream.open(Native Method) at java.io.FileOutputStream.<init>(FileOutputStream.java:221) at org.apache.spark.storage.DiskBlockObjectWriter.open(BlockObjectWriter.scala:130) at org.apache.spark.util.collection.ExternalSorter$$anonfun$spillToPartitionFiles$1.apply(ExternalSorter.scala:360) at org.apache.spark.util.collection.ExternalSorter$$anonfun$spillToPartitionFiles$1.apply(ExternalSorter.scala:355) at scala.Array$.fill(Array.scala:267) at org.apache.spark.util.collection.ExternalSorter.spillToPartitionFiles(ExternalSorter.scala:355) at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:211) at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:63) at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:68) at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41) at org.apache.spark.scheduler.Task.run(Task.scala:64) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:203) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) at java.lang.Thread.run(Thread.java:745) Driver stacktrace: at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1203) at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1192) at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1191) at 
scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47) at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1191) at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:693) at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:693) at scala.Option.foreach(Option.scala:236) at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:693) at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1393) at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1354) at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48) -- View this message in context: http://apache-spark-user-list.1001560.n3.nabble.com/Vague-Spark-SQL-error-message-with-saveAsParquetFile-tp25265.html Sent from the Apache Spark User List mailing list archive at Nabble.com. --------------------------------------------------------------------- To unsubscribe, e-mail: user-unsubscr...@spark.apache.org For additional commands, e-mail: user-h...@spark.apache.org