[jira] [Commented] (SPARK-2984) FileNotFoundException on _temporary directory
[ https://issues.apache.org/jira/browse/SPARK-2984?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15398289#comment-15398289 ]

Denis Serduik commented on SPARK-2984:
--------------------------------------

Something like this... The problem occurs when I change importPartition to importPartitionAsync. service.getHitRecordsFromFile returns a DataFrame.

{code}
import java.io.IOException
import java.net.URI
import java.nio.file.{Files, Path => JavaPath}
import java.util.concurrent.ConcurrentHashMap

import scala.concurrent.Future
import scala.concurrent.ExecutionContext.Implicits.global

import org.apache.hadoop.fs.{FileStatus, FileSystem, FileUtil, Path, PathFilter}
import org.apache.hadoop.util.Shell.ShellCommandExecutor
import org.apache.spark.sql.SaveMode

class DefaultHitDataManager(service: SparkService) extends DataSourceManager with Logging {

  val proceeded_prefix = "_proceeded_"
  val hit_data_pattern = "*_data*"
  val dsCache = new ConcurrentHashMap[String, HitsDataSource]()

  override def loadDataSource(remoteLocationBase: URI, alias: String): Future[HitsDataSource] = {
    val dsWorkingDir = Files.createTempDirectory(alias)
    val fs = FileSystem.get(remoteLocationBase, service.sqlContext.sparkContext.hadoopConfiguration)
    val loadFuture = scanPartitions(remoteLocationBase).flatMap { newPartitions =>
      Future.traverse(newPartitions.toList) { remotePartitionFile =>
        importPartitionAsync(alias, dsWorkingDir, fs, remotePartitionFile)
      }.flatMap { statuses =>
        FileUtil.fullyDelete(dsWorkingDir.toFile)
        val ds = HitsDataSource(remoteLocationBase, alias)
        dsCache.put(ds.alias, ds)
        Future.successful(ds)
      }
    }
    loadFuture
  }

  // XXX: async loading causes a race condition in FileSystem and the like
  // https://issues.apache.org/jira/browse/SPARK-2984
  def importPartitionAsync(alias: String, dsWorkingDir: JavaPath, partitionFS: FileSystem,
      remotePartitionFile: FileStatus) = Future {
    importPartition(alias, dsWorkingDir, partitionFS, remotePartitionFile)
  }

  def importPartition(alias: String, dsWorkingDir: JavaPath, partitionFS: FileSystem,
      remotePartitionFile: FileStatus) = {
    // Copy the remote partition archive into a local scratch directory.
    val partitionWorkingDir = Files.createTempDirectory(dsWorkingDir, remotePartitionFile.getPath.getName)
    val localPath = new Path("file://" + partitionWorkingDir.toString, remotePartitionFile.getPath.getName)
    partitionFS.copyToLocalFile(remotePartitionFile.getPath, localPath)

    // Extract only the hit-data files from the archive.
    val unzippedPartition = s"$partitionWorkingDir/hit_data*.tsv"
    val filePath = localPath.toUri.getPath
    val untarCommand = s"tar -xzf $filePath --wildcards --no-anchored '$hit_data_pattern'"
    val shellCmd = Array("bash", "-c", untarCommand)
    val shexec = new ShellCommandExecutor(shellCmd, partitionWorkingDir.toFile)
    shexec.execute()
    val exitcode = shexec.getExitCode
    if (exitcode != 0) {
      throw new IOException(s"Error untarring file $filePath. Tar process exited with exit code $exitcode")
    }

    // Append this partition's rows into the table for the alias.
    val partitionData = service.getHitRecordsFromFile("file://" + unzippedPartition)
    partitionData.coalesce(1).write.partitionBy("createdAtYear", "createdAtMonth")
      .mode(SaveMode.Append).saveAsTable(alias)

    // Mark the remote file as processed so the next scan skips it.
    val newPartitionPath = new Path(remotePartitionFile.getPath.getParent,
      proceeded_prefix + remotePartitionFile.getPath.getName)
    partitionFS.rename(remotePartitionFile.getPath, newPartitionPath)
  }

  def scanPartitions(locationBase: URI) = Future {
    val fs = FileSystem.get(locationBase, service.sqlContext.sparkContext.hadoopConfiguration)
    val location = new Path(locationBase)
    val newPartitions = if (fs.isDirectory(location)) {
      fs.listStatus(location, new PathFilter {
        override def accept(path: Path): Boolean = !path.getName.contains(proceeded_prefix)
      })
    } else {
      Array(fs.getFileStatus(location))
    }
    logDebug(s"Found new partitions: ${newPartitions.mkString(", ")}")
    newPartitions
  }
}
{code}

I hope this helps.

> FileNotFoundException on _temporary directory
> ----------------------------------------------
>
> Key: SPARK-2984
> URL: https://issues.apache.org/jira/browse/SPARK-2984
> Project: Spark
> Issue Type: Bug
> Components: Spark Core
> Affects Versions: 1.1.0
> Reporter: Andrew Ash
> Assignee: Josh Rosen
> Priority: Critical
> Fix For: 1.3.0
>
> We've seen several stacktraces and threads on the user mailing list where people are having issues with a {{FileNotFoundException}} stemming from an HDFS path containing {{_temporary}}.
> I ([~aash]) think this may be related to {{spark.speculation}}. I think the error condition might manifest in this circumstance:
> 1) task T starts on an executor E1
> 2) it takes a long time, so task T' is started on another executor E2
> 3) T finishes in E1 so moves its data from {{_temporary}} to the final destination and deletes the {{_temporary}} directory during cleanup
> 4) T' finishes in E2 and attempts to move its data from {{_temporary}}, but those files no longer exist!
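For reference, a minimal sketch of the sequential fallback that makes the error go away. The helper below is hypothetical (it is not part of the class above), but it chains the same importPartition calls with foldLeft so only one import runs at a time:

{code}
// Hypothetical helper, assuming the imports and method signatures from the
// snippet above. Each importPartition starts only after the previous one
// finishes, so no two writers ever race on the shared FileSystem state
// or a _temporary directory.
def importPartitionsSequentially(alias: String, dsWorkingDir: JavaPath, fs: FileSystem,
    partitions: List[FileStatus]): Future[Unit] =
  partitions.foldLeft(Future.successful(())) { (prev, partition) =>
    prev.map { _ =>
      importPartition(alias, dsWorkingDir, fs, partition)
      ()  // keep the accumulator at Future[Unit]
    }
  }
{code}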
[jira] [Commented] (SPARK-2984) FileNotFoundException on _temporary directory
[ https://issues.apache.org/jira/browse/SPARK-2984?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15397324#comment-15397324 ]

Denis Serduik commented on SPARK-2984:
--------------------------------------

I've faced such an error in Spark 1.6.1 and Hadoop 2.6.2, even on the local filesystem (during unit tests), when trying to append/import partitions in parallel. The issue goes away when I switch to sequential import.

> FileNotFoundException on _temporary directory
> ----------------------------------------------
>
> Key: SPARK-2984
> URL: https://issues.apache.org/jira/browse/SPARK-2984
> Project: Spark
> Issue Type: Bug
> Components: Spark Core
> Affects Versions: 1.1.0
> Reporter: Andrew Ash
> Assignee: Josh Rosen
> Priority: Critical
> Fix For: 1.3.0
>
> We've seen several stacktraces and threads on the user mailing list where people are having issues with a {{FileNotFoundException}} stemming from an HDFS path containing {{_temporary}}.
> I ([~aash]) think this may be related to {{spark.speculation}}. I think the error condition might manifest in this circumstance:
> 1) task T starts on an executor E1
> 2) it takes a long time, so task T' is started on another executor E2
> 3) T finishes in E1 so moves its data from {{_temporary}} to the final destination and deletes the {{_temporary}} directory during cleanup
> 4) T' finishes in E2 and attempts to move its data from {{_temporary}}, but those files no longer exist!
>
> Some samples:
> {noformat}
> 14/08/11 08:05:08 ERROR JobScheduler: Error running job streaming job 140774430 ms.0
> java.io.FileNotFoundException: File hdfs://hadoopc/user/csong/output/human_bot/-140774430.out/_temporary/0/task_201408110805__m_07 does not exist.
>     at org.apache.hadoop.hdfs.DistributedFileSystem.listStatusInternal(DistributedFileSystem.java:654)
>     at org.apache.hadoop.hdfs.DistributedFileSystem.access$600(DistributedFileSystem.java:102)
>     at org.apache.hadoop.hdfs.DistributedFileSystem$14.doCall(DistributedFileSystem.java:712)
>     at org.apache.hadoop.hdfs.DistributedFileSystem$14.doCall(DistributedFileSystem.java:708)
>     at org.apache.hadoop.fs.FileSystemLinkResolver.resolve(FileSystemLinkResolver.java:81)
>     at org.apache.hadoop.hdfs.DistributedFileSystem.listStatus(DistributedFileSystem.java:708)
>     at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.mergePaths(FileOutputCommitter.java:360)
>     at org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.commitJob(FileOutputCommitter.java:310)
>     at org.apache.hadoop.mapred.FileOutputCommitter.commitJob(FileOutputCommitter.java:136)
>     at org.apache.spark.SparkHadoopWriter.commitJob(SparkHadoopWriter.scala:126)
>     at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopDataset(PairRDDFunctions.scala:841)
>     at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:724)
>     at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:643)
>     at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1068)
>     at org.apache.spark.streaming.dstream.DStream$$anonfun$8.apply(DStream.scala:773)
>     at org.apache.spark.streaming.dstream.DStream$$anonfun$8.apply(DStream.scala:771)
>     at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:41)
>     at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:40)
>     at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:40)
>     at scala.util.Try$.apply(Try.scala:161)
>     at org.apache.spark.streaming.scheduler.Job.run(Job.scala:32)
>     at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:172)
>     at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
>     at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
>     at java.lang.Thread.run(Thread.java:745)
> {noformat}
> -- Chen Song at http://apache-spark-user-list.1001560.n3.nabble.com/saveAsTextFiles-file-not-found-exception-td10686.html
>
> {noformat}
> I am running a Spark Streaming job that uses saveAsTextFiles to save results into hdfs files. However, it has an exception after 20 batches
> result-140631234/_temporary/0/task_201407251119__m_03 does not exist.
> {noformat}
>
> and
>
> {noformat}
> org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.hdfs.server.namenode.LeaseExpiredException):
> No lease on /apps/data/vddil/real-time/checkpoint/temp: File does not exist.
> Holder DFSClient_NONMAPREDUCE_327993456_13 does not have any
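Since the description above pins the race on {{spark.speculation}}, one common mitigation is simply to keep speculative execution off. A minimal sketch using only the standard Spark configuration key (not something proposed in this thread):

{code}
import org.apache.spark.{SparkConf, SparkContext}

// Sketch: with speculation disabled, no duplicate attempt T' is ever
// launched, so nothing can delete _temporary out from under a live task.
val conf = new SparkConf()
  .setAppName("no-speculation")
  .set("spark.speculation", "false")  // the default is false; shown explicitly
val sc = new SparkContext(conf)
{code}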
[jira] [Commented] (SPARK-2019) Spark workers die/disappear when job fails for nearly any reason
[ https://issues.apache.org/jira/browse/SPARK-2019?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14166555#comment-14166555 ]

Denis Serduik commented on SPARK-2019:
--------------------------------------

I have noticed the same problem with worker behavior. My installation: Spark 1.0.2-hadoop2.0.0-mr1-cdh4.2.0 on Mesos 0.13. In my case, workers fail when there is an error while serializing the closure.

Spark workers die/disappear when job fails for nearly any reason
----------------------------------------------------------------

                Key: SPARK-2019
                URL: https://issues.apache.org/jira/browse/SPARK-2019
            Project: Spark
         Issue Type: Bug
   Affects Versions: 0.9.0
           Reporter: sam

We either have to reboot all the nodes, or run 'sudo service spark-worker restart' across our cluster. I don't think this should happen - the job failures are often not even that bad. There is a 5-upvote SO question here: http://stackoverflow.com/questions/22031006/spark-0-9-0-worker-keeps-dying-in-standalone-mode-when-job-fails
We shouldn't be giving restart privileges to our devs, and therefore our sysadmin has to frequently restart the workers. When the sysadmin is not around, there is nothing our devs can do. Many thanks
[jira] [Comment Edited] (SPARK-2019) Spark workers die/disappear when job fails for nearly any reason
[ https://issues.apache.org/jira/browse/SPARK-2019?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14166555#comment-14166555 ]

Denis Serduik edited comment on SPARK-2019 at 10/10/14 8:40 AM:
----------------------------------------------------------------

I have noticed the same problem with worker behavior. My installation: Spark 1.0.2-hadoop2.0.0-mr1-cdh4.2.0 on Mesos 0.13. In my case, workers fail when there is an error while serializing the closure. Also please note, we run Spark in coarse-grained mode.

was (Author: dmaverick):
I have noticed the same problem with worker behavior. My installation: Spark 1.0.2-hadoop2.0.0-mr1-cdh4.2.0 on Mesos 0.13. In my case, workers fail when there is an error while serializing the closure. Also please notice that we run Spark in coarse-grained mode.

Spark workers die/disappear when job fails for nearly any reason
----------------------------------------------------------------

                Key: SPARK-2019
                URL: https://issues.apache.org/jira/browse/SPARK-2019
            Project: Spark
         Issue Type: Bug
   Affects Versions: 0.9.0
           Reporter: sam

We either have to reboot all the nodes, or run 'sudo service spark-worker restart' across our cluster. I don't think this should happen - the job failures are often not even that bad. There is a 5-upvote SO question here: http://stackoverflow.com/questions/22031006/spark-0-9-0-worker-keeps-dying-in-standalone-mode-when-job-fails
We shouldn't be giving restart privileges to our devs, and therefore our sysadmin has to frequently restart the workers. When the sysadmin is not around, there is nothing our devs can do. Many thanks
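For context on the trigger Denis mentions, a hypothetical minimal reproducer (not from this report) of a closure-serialization failure: a task closure that captures a non-serializable enclosing instance.

{code}
import org.apache.spark.SparkContext

// Hypothetical reproducer: `factor` is a field, so the map() closure captures
// `this`, and Multiplier is not Serializable -- job submission fails with
// org.apache.spark.SparkException: Task not serializable.
class Multiplier(sc: SparkContext) {
  val factor = 3
  def run(): Long =
    sc.parallelize(1 to 100)
      .map(_ * factor)  // reads this.factor, dragging `this` into the closure
      .count()
}
{code}

The usual fix is to copy the field into a local val before the transformation so the closure captures only that value.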
[jira] [Commented] (SPARK-1305) Support persisting RDD's directly to Tachyon
[ https://issues.apache.org/jira/browse/SPARK-1305?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14066465#comment-14066465 ]

Denis Serduik commented on SPARK-1305:
--------------------------------------

I'm interested in using this feature, especially with SchemaRDD, to be able to cache intermediate results. Where can I find some code examples of how to use it? From SparkContext's code I see the following:

{code}
private[spark] def persistRDD(rdd: RDD[_]) {
  persistentRdds(rdd.id) = rdd
}

/**
 * Unpersist an RDD from memory and/or disk storage
 */
private[spark] def unpersistRDD(rddId: Int, blocking: Boolean = true) {
  env.blockManager.master.removeRdd(rddId, blocking)
  persistentRdds.remove(rddId)
  listenerBus.post(SparkListenerUnpersistRDD(rddId))
}
{code}

So persist doesn't put the RDD into the tachyonStore of the BlockManager? How does this feature work?

Support persisting RDD's directly to Tachyon
--------------------------------------------

                Key: SPARK-1305
                URL: https://issues.apache.org/jira/browse/SPARK-1305
            Project: Spark
         Issue Type: New Feature
         Components: Block Manager
           Reporter: Patrick Wendell
           Assignee: Haoyuan Li
           Priority: Blocker
            Fix For: 1.0.0

This is already an ongoing pull request - in a nutshell we want to support Tachyon as a storage level in Spark.
[jira] [Comment Edited] (SPARK-1305) Support persisting RDD's directly to Tachyon
[ https://issues.apache.org/jira/browse/SPARK-1305?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14066465#comment-14066465 ]

Denis Serduik edited comment on SPARK-1305 at 7/18/14 4:00 PM:
---------------------------------------------------------------

I'm interested in using this feature, especially with SchemaRDD, to be able to cache intermediate results. Where can I find some code examples of how to use it? From SparkContext's code I see the following:

{code}
private[spark] def persistRDD(rdd: RDD[_]) {
  persistentRdds(rdd.id) = rdd
}

/**
 * Unpersist an RDD from memory and/or disk storage
 */
private[spark] def unpersistRDD(rddId: Int, blocking: Boolean = true) {
  env.blockManager.master.removeRdd(rddId, blocking)
  persistentRdds.remove(rddId)
  listenerBus.post(SparkListenerUnpersistRDD(rddId))
}
{code}

So persist doesn't put the RDD into the tachyonStore of the BlockManager? How does this feature work?

was (Author: dmaverick):
I'm interesting in using this feature, especially with SchemaRDD, to be able to cache intermediate results. Where can I find some code examples of how to use it? From SparkContext's code I see the following:

{code}
private[spark] def persistRDD(rdd: RDD[_]) {
  persistentRdds(rdd.id) = rdd
}

/**
 * Unpersist an RDD from memory and/or disk storage
 */
private[spark] def unpersistRDD(rddId: Int, blocking: Boolean = true) {
  env.blockManager.master.removeRdd(rddId, blocking)
  persistentRdds.remove(rddId)
  listenerBus.post(SparkListenerUnpersistRDD(rddId))
}
{code}

So persist doesn't put the RDD into the tachyonStore of the BlockManager? How does this feature work?

Support persisting RDD's directly to Tachyon
--------------------------------------------

                Key: SPARK-1305
                URL: https://issues.apache.org/jira/browse/SPARK-1305
            Project: Spark
         Issue Type: New Feature
         Components: Block Manager
           Reporter: Patrick Wendell
           Assignee: Haoyuan Li
           Priority: Blocker
            Fix For: 1.0.0

This is already an ongoing pull request - in a nutshell we want to support Tachyon as a storage level in Spark.
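To answer the usage question for later readers: in Spark 1.x this feature surfaced as the OFF_HEAP storage level, backed by the BlockManager's tachyonStore. A minimal sketch, assuming an existing SparkContext sc (as in the shell) and a Tachyon master reachable at the configured spark.tachyonStore.url:

{code}
import org.apache.spark.storage.StorageLevel

// Sketch: OFF_HEAP is the Tachyon-backed storage level in Spark 1.x.
// persist() only records the desired level; the blocks land in the
// tachyonStore when an action first materializes the RDD.
val data = sc.parallelize(1 to 1000000)
data.persist(StorageLevel.OFF_HEAP)
data.count()  // materializes the RDD into Tachyon
{code}

This also explains the persistRDD snippet quoted above: it merely registers the RDD with the context, and the per-block writes to Tachyon happen inside the BlockManager at execution time.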
[jira] [Commented] (SPARK-1215) Clustering: Index out of bounds error
[ https://issues.apache.org/jira/browse/SPARK-1215?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14007090#comment-14007090 ]

Denis Serduik commented on SPARK-1215:
--------------------------------------

I don't think the problem is about the size of the dataset. I've faced a similar issue on a dataset with about 900 items. As a workaround, we decided to fall back to the random init mode.

Clustering: Index out of bounds error
-------------------------------------

                Key: SPARK-1215
                URL: https://issues.apache.org/jira/browse/SPARK-1215
            Project: Spark
         Issue Type: Bug
         Components: MLlib
           Reporter: dewshick
           Assignee: Xiangrui Meng
           Priority: Minor

code:

{code}
import org.apache.spark.mllib.clustering._

val test = sc.makeRDD(Array(4, 4, 4, 4, 4).map(e => Array(e.toDouble)))
val kmeans = new KMeans().setK(4)
kmeans.run(test)
{code}

fails with a java.lang.ArrayIndexOutOfBoundsException:

{noformat}
14/01/17 12:35:54 INFO scheduler.DAGScheduler: Stage 25 (collectAsMap at KMeans.scala:243) finished in 0.047 s
14/01/17 12:35:54 INFO spark.SparkContext: Job finished: collectAsMap at KMeans.scala:243, took 16.389537116 s
Exception in thread "main" java.lang.reflect.InvocationTargetException
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:606)
    at com.simontuffs.onejar.Boot.run(Boot.java:340)
    at com.simontuffs.onejar.Boot.main(Boot.java:166)
Caused by: java.lang.ArrayIndexOutOfBoundsException: -1
    at org.apache.spark.mllib.clustering.LocalKMeans$.kMeansPlusPlus(LocalKMeans.scala:47)
    at org.apache.spark.mllib.clustering.KMeans$$anonfun$19.apply(KMeans.scala:247)
    at org.apache.spark.mllib.clustering.KMeans$$anonfun$19.apply(KMeans.scala:244)
    at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:233)
    at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:233)
    at scala.collection.immutable.Range.foreach(Range.scala:81)
    at scala.collection.TraversableLike$class.map(TraversableLike.scala:233)
    at scala.collection.immutable.Range.map(Range.scala:46)
    at org.apache.spark.mllib.clustering.KMeans.initKMeansParallel(KMeans.scala:244)
    at org.apache.spark.mllib.clustering.KMeans.run(KMeans.scala:124)
    at Clustering$$anonfun$1.apply$mcDI$sp(Clustering.scala:21)
    at Clustering$$anonfun$1.apply(Clustering.scala:19)
    at Clustering$$anonfun$1.apply(Clustering.scala:19)
    at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:233)
    at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:233)
    at scala.collection.immutable.Range.foreach(Range.scala:78)
    at scala.collection.TraversableLike$class.map(TraversableLike.scala:233)
    at scala.collection.immutable.Range.map(Range.scala:46)
    at Clustering$.main(Clustering.scala:19)
    at Clustering.main(Clustering.scala)
    ... 6 more
{noformat}
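A sketch of the workaround mentioned above: switching KMeans from the default k-means|| initialization to random init. setInitializationMode and the "random" constant are part of the MLlib KMeans API; the rest reuses the reporter's `test` RDD.

{code}
import org.apache.spark.mllib.clustering.KMeans

// Workaround sketch: random initialization sidesteps the kMeansPlusPlus
// step inside initKMeansParallel that throws ArrayIndexOutOfBoundsException
// on degenerate data like the all-identical points above.
val kmeans = new KMeans()
  .setK(4)
  .setInitializationMode(KMeans.RANDOM)  // i.e. "random"
kmeans.run(test)
{code}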