I'm running Spark 1.3 on CDH 5.4.4, and trying to set up Spark to run via iPython Notebook. I'm getting collect() to work just fine, but take() errors. (I'm having issues with collect() on other datasets ... but take() seems to break every time I run it.)
My code is below. Any thoughts? >> sc <pyspark.context.SparkContext at 0x7ffbfa310f10> >> sys.version '2.7.10 |Anaconda 2.3.0 (64-bit)| (default, May 28 2015, 17:02:03) \n[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]' >> hourly = sc.textFile('tester') >> hourly.collect() [u'a man', u'a plan', u'a canal', u'panama'] >> hourly = sc.textFile('tester') >> hourly.take(2) --------------------------------------------------------------------------- Py4JJavaError Traceback (most recent call last) <ipython-input-15-1feecba5868b> in <module>() 1 hourly = sc.textFile('tester') ----> 2 hourly.take(2) /opt/cloudera/parcels/CDH/lib/spark/python/pyspark/rdd.py in take(self, num) 1223 1224 p = range(partsScanned, min(partsScanned + numPartsToTry, totalParts)) -> 1225 res = self.context.runJob(self, takeUpToNumLeft, p, True) 1226 1227 items += res /opt/cloudera/parcels/CDH/lib/spark/python/pyspark/context.py in runJob(self, rdd, partitionFunc, partitions, allowLocal) 841 # SparkContext#runJob. 842 mappedRDD = rdd.mapPartitions(partitionFunc) --> 843 it = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, javaPartitions, allowLocal) 844 return list(mappedRDD._collect_iterator_through_file(it)) 845 /opt/cloudera/parcels/CDH/lib/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in __call__(self, *args) 536 answer = self.gateway_client.send_command(command) 537 return_value = get_return_value(answer, self.gateway_client, --> 538 self.target_id, self.name) 539 540 for temp_arg in temp_args: /opt/cloudera/parcels/CDH/lib/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name) 298 raise Py4JJavaError( 299 'An error occurred while calling {0}{1}{2}.\n'. --> 300 format(target_id, '.', name), value) 301 else: 302 raise Py4JError( Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob. : org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 10.0 failed 4 times, most recent failure: Lost task 0.3 in stage 10.0 (TID 47, dhd490101.autotrader.com): org.apache.spark.api.python.PythonException: Traceback (most recent call last): File "/opt/cloudera/parcels/CDH-5.4.4-1.cdh5.4.4.p894.568/jars/spark-assembly-1.3.0-cdh5.4.4-hadoop2.6.0-cdh5.4.4.jar/pyspark/worker.py", line 101, in main process() File "/opt/cloudera/parcels/CDH-5.4.4-1.cdh5.4.4.p894.568/jars/spark-assembly-1.3.0-cdh5.4.4-hadoop2.6.0-cdh5.4.4.jar/pyspark/worker.py", line 96, in process serializer.dump_stream(func(split_index, iterator), outfile) File "/opt/cloudera/parcels/CDH-5.4.4-1.cdh5.4.4.p894.568/jars/spark-assembly-1.3.0-cdh5.4.4-hadoop2.6.0-cdh5.4.4.jar/pyspark/serializers.py", line 236, in dump_stream vs = list(itertools.islice(iterator, batch)) File "/opt/cloudera/parcels/CDH/lib/spark/python/pyspark/rdd.py", line 1220, in takeUpToNumLeft while taken < left: ImportError: No module named iter at org.apache.spark.api.python.PythonRDD$$anon$1.read(PythonRDD.scala:135) at org.apache.spark.api.python.PythonRDD$$anon$1.<init>(PythonRDD.scala:176) at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:94) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277) at org.apache.spark.rdd.RDD.iterator(RDD.scala:244) at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:61) at org.apache.spark.scheduler.Task.run(Task.scala:64) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:203) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) at java.lang.Thread.run(Thread.java:745) Driver stacktrace: at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1203) at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1192) at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1191) at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47) at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1191) at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:693) at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:693) at scala.Option.foreach(Option.scala:236) at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:693) at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1393) at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1354) at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48) -- View this message in context: http://apache-spark-user-list.1001560.n3.nabble.com/collect-works-take-returns-ImportError-No-module-named-iter-tp24199.html Sent from the Apache Spark User List mailing list archive at Nabble.com. --------------------------------------------------------------------- To unsubscribe, e-mail: user-unsubscr...@spark.apache.org For additional commands, e-mail: user-h...@spark.apache.org