Hey all,

I was able to spin up a cluster, but when I try to submit a simple jar via spark-submit, the job fails to run. I'm trying to run the simple "Standalone Application" from the quick start guide (http://spark.apache.org/docs/latest/quick-start.html). Oddly enough, I can get another application running through the spark-shell. What am I doing wrong here? :(
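To be concrete about what does work: from memory, my spark-shell session looked something like this (paraphrasing the quick start's interactive example, so the exact lines may have differed):

    $ /root/spark/bin/spark-shell
    scala> // the shell creates sc for you, so no SparkConf/SparkContext setup is needed
    scala> val logData = sc.textFile("/tmp/README.md", 2).cache()
    scala> logData.filter(line => line.contains("a")).count()
    scala> logData.filter(line => line.contains("b")).count()

That runs to completion and prints the counts without any exceptions.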
*Here's my setup:*

$ ls
project  simple.sbt  src  target

$ ls -R src
src:
main

src/main:
scala

src/main/scala:
SimpleApp.scala

$ cat src/main/scala/SimpleApp.scala
package main.scala

/* SimpleApp.scala */
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf

object SimpleApp {
  def main(args: Array[String]) {
    val logFile = "/tmp/README.md"
    val conf = new SparkConf().setAppName("Simple Application")
    val sc = new SparkContext(conf)
    val logData = sc.textFile(logFile, 2).cache()
    val numAs = logData.filter(line => line.contains("a")).count()
    val numBs = logData.filter(line => line.contains("b")).count()
    println("Lines with a: %s, Lines with b: %s".format(numAs, numBs))
  }
}

$ cat simple.sbt
name := "Simple Project"

version := "1.0"

scalaVersion := "2.10.4"

libraryDependencies += "org.apache.spark" %% "spark-core" % "1.0.1"

resolvers += "Akka Repository" at "http://repo.akka.io/releases/"
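In case anyone suspects a packaging problem: I built the jar with the standard quickstart step, and the compiled class can be checked against the jar with jar tf (just sketching the commands here; sbt is the only build tool involved):

    $ sbt package
    $ jar tf target/scala-2.10/simple-project_2.10-1.0.jar | grep SimpleApp

Given the "package main.scala" declaration above, that grep should show main/scala/SimpleApp.class (plus the companion main/scala/SimpleApp$.class), which lines up with the --class "main.scala.SimpleApp" argument I pass below.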
*Here's how I run the job:*

$ /root/spark/bin/spark-submit --class "main.scala.SimpleApp" --master local[4] ./target/scala-2.10/simple-project_2.10-1.0.jar

*Here's the error:*

14/07/31 16:23:56 INFO scheduler.DAGScheduler: Failed to run count at SimpleApp.scala:14
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0.0:1 failed 1 times, most recent failure: Exception failure in TID 1 on host localhost: java.io.IOException: No such file or directory
        java.io.UnixFileSystem.createFileExclusively(Native Method)
        java.io.File.createNewFile(File.java:1006)
        java.io.File.createTempFile(File.java:1989)
        org.apache.spark.util.Utils$.fetchFile(Utils.scala:326)
        org.apache.spark.executor.Executor$$anonfun$org$apache$spark$executor$Executor$$updateDependencies$6.apply(Executor.scala:332)
        org.apache.spark.executor.Executor$$anonfun$org$apache$spark$executor$Executor$$updateDependencies$6.apply(Executor.scala:330)
        scala.collection.TraversableLike$WithFilter$$anonfun$foreach$1.apply(TraversableLike.scala:772)
        scala.collection.mutable.HashMap$$anonfun$foreach$1.apply(HashMap.scala:98)
        scala.collection.mutable.HashMap$$anonfun$foreach$1.apply(HashMap.scala:98)
        scala.collection.mutable.HashTable$class.foreachEntry(HashTable.scala:226)
        scala.collection.mutable.HashMap.foreachEntry(HashMap.scala:39)
        scala.collection.mutable.HashMap.foreach(HashMap.scala:98)
        scala.collection.TraversableLike$WithFilter.foreach(TraversableLike.scala:771)
        org.apache.spark.executor.Executor.org$apache$spark$executor$Executor$$updateDependencies(Executor.scala:330)
        org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:168)
        java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
        java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
        java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
        at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1033)
        at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1017)
        at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1015)
        at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
        at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
        at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1015)
        at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:633)
        at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:633)
        at scala.Option.foreach(Option.scala:236)
        at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:633)
        at org.apache.spark.scheduler.DAGSchedulerEventProcessActor$$anonfun$receive$2.applyOrElse(DAGScheduler.scala:1207)
        at akka.actor.ActorCell.receiveMessage(ActorCell.scala:498)
        at akka.actor.ActorCell.invoke(ActorCell.scala:456)
        at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:237)
        at akka.dispatch.Mailbox.run(Mailbox.scala:219)
        at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:386)
        at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
        at scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
        at scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
        at scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
14/07/31 16:23:56 INFO scheduler.TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool

*Another note:*

I do not believe the issue is that the input file is missing. It is most certainly there, and I got a different error when I tried pointing the job at a non-existent file.

$ /root/ephemeral-hdfs/bin/hadoop fs -lsr /
Warning: $HADOOP_HOME is deprecated.

drwxr-xr-x   - root supergroup          0 2014-07-31 00:10 /tachyon
drwxr-xr-x   - root supergroup          0 2014-07-31 00:10 /tachyon/data
drwxr-xr-x   - root supergroup          0 2014-07-31 00:10 /tachyon/workers
drwxr-xr-x   - root supergroup          0 2014-07-31 01:01 /tmp
-rw-r--r--   3 root supergroup     281471 2014-07-31 00:17 /tmp/CHANGES.txt
-rw-r--r--   3 root supergroup       4221 2014-07-31 01:01 /tmp/README.md

Regards,
Ryan Tabora
http://ryantabora.com
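P.S. For anyone who wants to reproduce the sanity check on the input path, the quickest way I know is from spark-shell (hypothetical session; sc is provided by the shell):

    scala> // reads the same path the jar uses; a bad path fails loudly here
    scala> sc.textFile("/tmp/README.md", 2).count()

Pointing that (or the jar) at a path that doesn't exist fails with a different, much more explicit error about the input path, which is part of why I don't think the file is the problem here.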