I reworked my app using your idea of throwing the data in a map. It looks like it should work, but I'm getting some strange errors and my job gets terminated. I get:
"WARN TaskSchedulerImpl: Initial job has not accepted any resources; check your cluster UI to ensure that workers are registered and have sufficient memory" and before that in the Spark output I receive a "Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 2.0:0 failed 1 times, most recent failure: Exception failure in TID 3 on host localhost: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0.0:0 failed 4 times, most recent failure: TID 7 on host cloudera01.local.company.com failed for unknown reason Driver stacktrace: org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1033) org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1017) org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1015) scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47) org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1015) org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:633) org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:633) scala.Option.foreach(Option.scala:236) org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:633) org.apache.spark.scheduler.DAGSchedulerEventProcessActor$$anonfun$receive$2.applyOrElse(DAGScheduler.scala:1207) akka.actor.ActorCell.receiveMessage(ActorCell.scala:498) akka.actor.ActorCell.invoke(ActorCell.scala:456) akka.dispatch.Mailbox.processMailbox(Mailbox.scala:237) akka.dispatch.Mailbox.run(Mailbox.scala:219) akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:386) scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260) scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339) scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979) scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107) Driver stacktrace: at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1033) at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1017) at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1015) at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47) at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1015) at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:633) at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:633) at scala.Option.foreach(Option.scala:236) at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:633) at org.apache.spark.scheduler.DAGSchedulerEventProcessActor$$anonfun$receive$2.applyOrElse(DAGScheduler.scala:1207) at akka.actor.ActorCell.receiveMessage(ActorCell.scala:498) at akka.actor.ActorCell.invoke(ActorCell.scala:456) at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:237) at akka.dispatch.Mailbox.run(Mailbox.scala:219) at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(AbstractDispatcher.scala:386) at 
I am using CDH 5.1.2. After first hitting this failure, I read that adjusting the heap settings cured this problem for someone else, so I set worker_max_heapsize to its default value of 512 MB and executor_total_max_heapsize to its default value of 8 GB.

The code is below:

object App {
  def main(args: Array[String]) {
    val ssc = new StreamingContext("local[2]", "Data", Seconds(20))
    ssc.checkpoint("checkpoint")

    // One receiver thread on the uShip.Events topic; keep only the message bodies
    val eventMap = scala.collection.immutable.Map("uShip.Events" -> 1)
    val pipe = KafkaUtils.createStream(ssc, "dockerrepo,dockerrepo,dockerrepo", "Cons1", eventMap).map(_._2)

    // Parse each message and pair it with its event name
    val eventStream = pipe.map(data => {
      parse(data)
    }).map(json => {
      implicit val formats = DefaultFormats
      val eventName = (json \ "event").extractOpt[String]
      Event(eventName.getOrElse("*** NO EVENT NAME ***"), json)
    })

    eventStream.foreachRDD(rdd => {
      // Group the payloads in this batch by event name
      var eventMap: Map[String, scala.collection.mutable.MutableList[org.json4s.JValue]] = Map()
      val eventArray = rdd.toArray
      eventArray.foreach({ event =>
        if (eventMap.contains(event.EventName)) {
          var eventList = eventMap.getOrElse(event.EventName, new scala.collection.mutable.MutableList[org.json4s.JValue])
          eventList += event.Payload
        } else {
          var eventList = new scala.collection.mutable.MutableList[org.json4s.JValue] += event.Payload
          eventMap += (event.EventName -> eventList)
        }
      })

      // Write each event type's payloads to its own HDFS directory
      rdd.foreachPartition(rdd => {
        val conf = new SparkConf().setMaster("spark://cloudera01.local.company.com:7077")
        val sc = new SparkContext(conf)
        eventMap.foreach(event => {
          def uuid = java.util.UUID.randomUUID.toString
          val eventRDD = sc.makeRDD(event._2)
          eventRDD.saveAsTextFile("hdfs://cloudera01.local.company.com:8020/user/hdfs/" + event._1 + "/rdd=" + eventRDD.id + "_ID_" + uuid)
        })
      })
    })

    ssc.start()
    ssc.awaitTermination()
  }
}
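Or is something like the following the right shape instead? A rough, untested sketch that writes each event type from the existing driver-side context rather than building a new SparkContext inside foreachPartition (EventName and Payload are the fields of my Event case class above):

eventStream.foreachRDD(rdd => {
  // Collect the event names present in this batch on the driver,
  // then save each group using the context that already exists.
  val names = rdd.map(_.EventName).distinct().collect()
  names.foreach(name => {
    def uuid = java.util.UUID.randomUUID.toString
    val filtered = rdd.filter(_.EventName == name)
      .map(_.Payload.toString) // same text saveAsTextFile wrote before
    filtered.saveAsTextFile("hdfs://cloudera01.local.company.com:8020/user/hdfs/" + name + "/rdd=" + filtered.id + "_ID_" + uuid)
  })
})

I realize this scans the batch once per event name; caching the RDD first might help, but I wanted to confirm the overall shape first.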