I'm using Spark SQL to query one partition at a time of a Hive external table
that sits atop .gzip data, and then saving that partition to a new HDFS
location as a set of Snappy-compressed Parquet files using .saveAsParquetFile().

The query completes successfully, but then I get a vague error message which,
I think, is saying that one of the temporary files is missing.  Any idea what
the issue is here?

--------------

---------------------------------------------------------------------------
Py4JJavaError                             Traceback (most recent call last)
<ipython-input-22-39049a9c70f8> in <module>()
      8     status, output = commands.getstatusoutput("hdfs dfs -rmr " +
save_path)
      9     partitions = int((df.count() // 750000)+1)
---> 10     df.repartition(partitions).saveAsParquetFile(save_path + '/' +
str(dt1)[:10])
     11 
     12     dt1 += timedelta(days=1)

/opt/cloudera/parcels/CDH/lib/spark/python/pyspark/sql/dataframe.py in
saveAsParquetFile(self, path)
    119         True
    120         """
--> 121         self._jdf.saveAsParquetFile(path)
    122 
    123     def registerTempTable(self, name):

/opt/cloudera/parcels/CDH/lib/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py
in __call__(self, *args)
    536         answer = self.gateway_client.send_command(command)
    537         return_value = get_return_value(answer, self.gateway_client,
--> 538                 self.target_id, self.name)
    539 
    540         for temp_arg in temp_args:

/opt/cloudera/parcels/CDH/lib/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py
in get_return_value(answer, gateway_client, target_id, name)
    298                 raise Py4JJavaError(
    299                     'An error occurred while calling {0}{1}{2}.\n'.
--> 300                     format(target_id, '.', name), value)
    301             else:
    302                 raise Py4JError(

Py4JJavaError: An error occurred while calling o206.saveAsParquetFile.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 72
in stage 28.1 failed 4 times, most recent failure: Lost task 72.3 in stage
28.1 (TID 603, phd4.xxx.com): java.io.FileNotFoundException:
/hdata/1/yarn/nm/usercache/me/appcache/application_1446009923448_20507/blockmgr-20140dd7-7f36-4971-afe7-0278241f4439/22/temp_shuffle_c274830e-f626-4ae6-8923-ad72c561a84e
(No such file or directory)
    at java.io.FileOutputStream.open(Native Method)
    at java.io.FileOutputStream.<init>(FileOutputStream.java:221)
    at
org.apache.spark.storage.DiskBlockObjectWriter.open(BlockObjectWriter.scala:130)
    at
org.apache.spark.util.collection.ExternalSorter$$anonfun$spillToPartitionFiles$1.apply(ExternalSorter.scala:360)
    at
org.apache.spark.util.collection.ExternalSorter$$anonfun$spillToPartitionFiles$1.apply(ExternalSorter.scala:355)
    at scala.Array$.fill(Array.scala:267)
    at
org.apache.spark.util.collection.ExternalSorter.spillToPartitionFiles(ExternalSorter.scala:355)
    at
org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:211)
    at
org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:63)
    at
org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:68)
    at
org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
    at org.apache.spark.scheduler.Task.run(Task.scala:64)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:203)
    at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
    at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
    at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
    at
org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1203)
    at
org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1192)
    at
org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1191)
    at
scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
    at
org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1191)
    at
org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:693)
    at
org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:693)
    at scala.Option.foreach(Option.scala:236)
    at
org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:693)
    at
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1393)
    at
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1354)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)



--
View this message in context: 
http://apache-spark-user-list.1001560.n3.nabble.com/Vague-Spark-SQL-error-message-with-saveAsParquetFile-tp25265.html
Sent from the Apache Spark User List mailing list archive at Nabble.com.

---------------------------------------------------------------------
To unsubscribe, e-mail: user-unsubscr...@spark.apache.org
For additional commands, e-mail: user-h...@spark.apache.org

Reply via email to