[ https://issues.apache.org/jira/browse/SPARK-18512?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15751329#comment-15751329 ]
Mathieu D commented on SPARK-18512:
-----------------------------------

I'm experiencing the same problem, but we're using HDFS, not S3.

{code}
16/12/15 11:25:18 ERROR InsertIntoHadoopFsRelationCommand: Aborting job.
java.io.FileNotFoundException: File hdfs://nameservice1/user/xdstore/rfs/rfsDB/_temporary/0 does not exist.
    at org.apache.hadoop.hdfs.DistributedFileSystem.listStatusInternal(DistributedFileSystem.java:795)
    at org.apache.hadoop.hdfs.DistributedFileSystem.access$700(DistributedFileSystem.java:106)
    at org.apache.hadoop.hdfs.DistributedFileSystem$18.doCall(DistributedFileSystem.java:853)
    at org.apache.hadoop.hdfs.DistributedFileSystem$18.doCall(DistributedFileSystem.java:849)
    at org.apache.hadoop.fs.FileSystemLinkResolver.resolve(FileSystemLinkResolver.java:81)
    at org.apache.hadoop.hdfs.DistributedFileSystem.listStatus(DistributedFileSystem.java:860)
    at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:1517)
    at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:1557)
    at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.getAllCommittedTaskPaths(FileOutputCommitter.java:291)
    at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.commitJobInternal(FileOutputCommitter.java:361)
    at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.commitJob(FileOutputCommitter.java:334)
    at org.apache.parquet.hadoop.ParquetOutputCommitter.commitJob(ParquetOutputCommitter.java:46)
    at org.apache.spark.sql.execution.datasources.BaseWriterContainer.commitJob(WriterContainer.scala:222)
    at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand$$anonfun$run$1.apply$mcV$sp(InsertIntoHadoopFsRelationCommand.scala:144)
    at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand$$anonfun$run$1.apply(InsertIntoHadoopFsRelationCommand.scala:115)
    at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand$$anonfun$run$1.apply(InsertIntoHadoopFsRelationCommand.scala:115)
    at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:57)
    at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:115)
    at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:58)
    at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:56)
    at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:74)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:115)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:115)
    at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:136)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
    at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:133)
    at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:114)
    at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:86)
    at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:86)
    at org.apache.spark.sql.execution.datasources.DataSource.write(DataSource.scala:525)
    at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:211)
    at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:194)
    at org.apache.spark.sql.DataFrameWriter.parquet(DataFrameWriter.scala:488)
    at com.bluedme.woda.ng.indexer.RfsRepository.append(RfsRepository.scala:36)
    at com.bluedme.woda.ng.indexer.RfsRepository.insert(RfsRepository.scala:23)
    at com.bluedme.woda.cmd.ShareDatasetImpl.runImmediate(ShareDatasetImpl.scala:33)
    at com.bluedme.woda.cmd.ShareDatasetImpl.runImmediate(ShareDatasetImpl.scala:13)
    at com.bluedme.woda.cmd.ImmediateCommandImpl$$anonfun$run$1.apply(CommandImpl.scala:21)
    at com.bluedme.woda.cmd.ImmediateCommandImpl$$anonfun$run$1.apply(CommandImpl.scala:21)
    at scala.concurrent.impl.Future$PromiseCompletingRunnable.liftedTree1$1(Future.scala:24)
    at scala.concurrent.impl.Future$PromiseCompletingRunnable.run(Future.scala:24)
    at scala.concurrent.impl.ExecutionContextImpl$AdaptedForkJoinTask.exec(ExecutionContextImpl.scala:121)
    at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
    at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
    at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
    at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
{code}

We're on CDH 5.7, Hadoop 2.6. Should I file another JIRA?
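For reference, a minimal sketch of the kind of write that reaches this commit path. The session setup and DataFrame are hypothetical stand-ins (the real call site is RfsRepository.append in the trace above); the output path is the one from the log. One suspected trigger, not a confirmed diagnosis: two jobs appending to the same directory at once, where the first commitJob cleans up _temporary out from under the second.

{code}
// Hypothetical repro sketch: an append-mode Parquet write that goes through
// InsertIntoHadoopFsRelationCommand and FileOutputCommitter, as in the trace.
import org.apache.spark.sql.{SaveMode, SparkSession}

object AppendSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("append-sketch").getOrCreate()
    import spark.implicits._

    // Stand-in for the dataset being appended.
    val df = Seq((1L, "a"), (2L, "b")).toDF("id", "value")

    // Same write mode and committer path as the failing job; two of these
    // running concurrently against one directory can race in _temporary/0.
    df.write
      .mode(SaveMode.Append)
      .parquet("hdfs://nameservice1/user/xdstore/rfs/rfsDB")

    spark.stop()
  }
}
{code}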
> FileNotFoundException on _temporary directory with Spark Streaming 2.0.1 and S3A
> ---------------------------------------------------------------------------------
>
>                 Key: SPARK-18512
>                 URL: https://issues.apache.org/jira/browse/SPARK-18512
>             Project: Spark
>          Issue Type: Bug
>          Components: Structured Streaming
>    Affects Versions: 2.0.1
>         Environment: AWS EMR 5.0.1
> Spark 2.0.1
> S3 EU-West-1 (S3A)
>            Reporter: Giuseppe Bonaccorso
>
> After a few hours of streaming processing and saving data in Parquet format, I always get this exception:
> {code:java}
> java.io.FileNotFoundException: No such file or directory: s3a://xxx/_temporary/0/task_xxxx
>     at org.apache.hadoop.fs.s3a.S3AFileSystem.getFileStatus(S3AFileSystem.java:1004)
>     at org.apache.hadoop.fs.s3a.S3AFileSystem.listStatus(S3AFileSystem.java:745)
>     at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.mergePaths(FileOutputCommitter.java:426)
>     at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.commitJobInternal(FileOutputCommitter.java:362)
>     at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.commitJob(FileOutputCommitter.java:334)
>     at org.apache.parquet.hadoop.ParquetOutputCommitter.commitJob(ParquetOutputCommitter.java:46)
>     at org.apache.spark.sql.execution.datasources.BaseWriterContainer.commitJob(WriterContainer.scala:222)
>     at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand$$anonfun$run$1.apply$mcV$sp(InsertIntoHadoopFsRelationCommand.scala:144)
>     at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand$$anonfun$run$1.apply(InsertIntoHadoopFsRelationCommand.scala:115)
>     at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand$$anonfun$run$1.apply(InsertIntoHadoopFsRelationCommand.scala:115)
>     at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:57)
>     at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:115)
>     at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:60)
>     at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:58)
>     at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:74)
>     at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:115)
>     at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:115)
>     at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:136)
>     at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
>     at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:133)
>     at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:114)
>     at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:86)
>     at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:86)
>     at org.apache.spark.sql.execution.datasources.DataSource.write(DataSource.scala:510)
>     at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:211)
>     at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:194)
>     at org.apache.spark.sql.DataFrameWriter.parquet(DataFrameWriter.scala:488)
> {code}
> I've also tried s3:// and s3n://, but it always happens after 3-5 hours.
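A hedged note on mitigations that come up in similar reports: the v2 commit algorithm (available in Hadoop 2.7+) moves task output at task-commit time instead of job-commit time, shrinking the window in which _temporary must remain listable, and disabling speculation avoids duplicate task attempts racing in the same directory. Neither makes S3 listing consistent, so treat this as a workaround sketch, not a fix:

{code}
// Workaround sketch, assuming Hadoop 2.7+ for the v2 commit algorithm.
import org.apache.spark.sql.SparkSession

object CommitWorkaroundSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("commit-workaround-sketch")
      // Avoid duplicate speculative task attempts writing under _temporary.
      .config("spark.speculation", "false")
      // FileOutputCommitter v2: move task output to the destination at task
      // commit, instead of merging _temporary paths at job commit.
      .config("spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version", "2")
      .getOrCreate()

    // ... run the write job as before ...
    spark.stop()
  }
}
{code}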