There was a similar problem reported on this list before. Weird Python errors like this generally mean the nodes of your cluster are running different versions of Python. Can you check that?
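For example, something along these lines (a rough, untested sketch run from the same notebook / pyspark session; the RDD size and partition count are arbitrary) should show whether the executors report a different interpreter than the driver:

import sys

# Version of Python the driver (the notebook kernel) is running under
print("driver:    %s" % sys.version.split()[0])

# Ask the executors which interpreter they run. A mismatch here
# (e.g. 2.6.x on the workers vs 2.7.10 on the driver) would explain
# failures like the one below.
def worker_python_version(_):
    import platform
    yield platform.python_version()

versions = (sc.parallelize(range(100), 20)      # arbitrary small job
              .mapPartitions(worker_python_version)
              .distinct()
              .collect())
print("executors: %s" % versions)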
From the error stack you are using 2.7.10 |Anaconda 2.3.0, while the OS/CDH version of Python is probably 2.6 (one way to pin the executor interpreter is sketched at the bottom of this message).

-- 
Ruslan Dautkhanov

On Mon, Aug 10, 2015 at 3:53 PM, YaoPau <jonrgr...@gmail.com> wrote:

> I'm running Spark 1.3 on CDH 5.4.4, and trying to set up Spark to run via
> iPython Notebook. I'm getting collect() to work just fine, but take()
> errors. (I'm having issues with collect() on other datasets ... but take()
> seems to break every time I run it.)
>
> My code is below. Any thoughts?
>
> >> sc
> <pyspark.context.SparkContext at 0x7ffbfa310f10>
>
> >> sys.version
> '2.7.10 |Anaconda 2.3.0 (64-bit)| (default, May 28 2015, 17:02:03) \n[GCC
> 4.4.7 20120313 (Red Hat 4.4.7-1)]'
>
> >> hourly = sc.textFile('tester')
> >> hourly.collect()
> [u'a man',
>  u'a plan',
>  u'a canal',
>  u'panama']
>
> >> hourly = sc.textFile('tester')
> >> hourly.take(2)
> ---------------------------------------------------------------------------
> Py4JJavaError                             Traceback (most recent call last)
> <ipython-input-15-1feecba5868b> in <module>()
>       1 hourly = sc.textFile('tester')
> ----> 2 hourly.take(2)
>
> /opt/cloudera/parcels/CDH/lib/spark/python/pyspark/rdd.py in take(self, num)
>    1223
>    1224         p = range(partsScanned, min(partsScanned + numPartsToTry, totalParts))
> -> 1225         res = self.context.runJob(self, takeUpToNumLeft, p, True)
>    1226
>    1227         items += res
>
> /opt/cloudera/parcels/CDH/lib/spark/python/pyspark/context.py in runJob(self, rdd, partitionFunc, partitions, allowLocal)
>     841         # SparkContext#runJob.
>     842         mappedRDD = rdd.mapPartitions(partitionFunc)
> --> 843         it = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, javaPartitions, allowLocal)
>     844         return list(mappedRDD._collect_iterator_through_file(it))
>     845
>
> /opt/cloudera/parcels/CDH/lib/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
>     536         answer = self.gateway_client.send_command(command)
>     537         return_value = get_return_value(answer, self.gateway_client,
> --> 538             self.target_id, self.name)
>     539
>     540         for temp_arg in temp_args:
>
> /opt/cloudera/parcels/CDH/lib/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
>     298                 raise Py4JJavaError(
>     299                     'An error occurred while calling {0}{1}{2}.\n'.
> --> 300                     format(target_id, '.', name), value)
>     301             else:
>     302                 raise Py4JError(
>
> Py4JJavaError: An error occurred while calling
> z:org.apache.spark.api.python.PythonRDD.runJob.
> : org.apache.spark.SparkException: Job aborted due to stage failure: Task 0
> in stage 10.0 failed 4 times, most recent failure: Lost task 0.3 in stage
> 10.0 (TID 47, dhd490101.autotrader.com):
> org.apache.spark.api.python.PythonException: Traceback (most recent call last):
>   File "/opt/cloudera/parcels/CDH-5.4.4-1.cdh5.4.4.p894.568/jars/spark-assembly-1.3.0-cdh5.4.4-hadoop2.6.0-cdh5.4.4.jar/pyspark/worker.py", line 101, in main
>     process()
>   File "/opt/cloudera/parcels/CDH-5.4.4-1.cdh5.4.4.p894.568/jars/spark-assembly-1.3.0-cdh5.4.4-hadoop2.6.0-cdh5.4.4.jar/pyspark/worker.py", line 96, in process
>     serializer.dump_stream(func(split_index, iterator), outfile)
>   File "/opt/cloudera/parcels/CDH-5.4.4-1.cdh5.4.4.p894.568/jars/spark-assembly-1.3.0-cdh5.4.4-hadoop2.6.0-cdh5.4.4.jar/pyspark/serializers.py", line 236, in dump_stream
>     vs = list(itertools.islice(iterator, batch))
>   File "/opt/cloudera/parcels/CDH/lib/spark/python/pyspark/rdd.py", line 1220, in takeUpToNumLeft
>     while taken < left:
> ImportError: No module named iter
>
>         at org.apache.spark.api.python.PythonRDD$$anon$1.read(PythonRDD.scala:135)
>         at org.apache.spark.api.python.PythonRDD$$anon$1.<init>(PythonRDD.scala:176)
>         at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:94)
>         at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
>         at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
>         at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:61)
>         at org.apache.spark.scheduler.Task.run(Task.scala:64)
>         at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:203)
>         at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
>         at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
>         at java.lang.Thread.run(Thread.java:745)
>
> Driver stacktrace:
>         at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1203)
>         at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1192)
>         at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1191)
>         at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
>         at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
>         at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1191)
>         at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:693)
>         at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:693)
>         at scala.Option.foreach(Option.scala:236)
>         at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:693)
>         at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1393)
>         at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1354)
>         at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
>
>
> --
> View this message in context:
> http://apache-spark-user-list.1001560.n3.nabble.com/collect-works-take-returns-ImportError-No-module-named-iter-tp24199.html
> Sent from the Apache Spark User List mailing list archive at Nabble.com.
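P.S. If the executors do report a different Python, one common fix is to point PYSPARK_PYTHON at the same interpreter on every node, either in spark-env.sh on the cluster or in the environment that launches the notebook, before the SparkContext is created. A minimal sketch (the Anaconda path below is an assumption -- use wherever the matching interpreter actually lives on your nodes):

import os

# Must be set *before* the SparkContext is created; an existing sc
# will keep using whatever interpreter it was started with.
os.environ["PYSPARK_PYTHON"] = "/opt/anaconda/bin/python"   # hypothetical path

from pyspark import SparkConf, SparkContext
sc = SparkContext(conf=SparkConf().setAppName("take-test"))

The same effect can be had with `export PYSPARK_PYTHON=...` in spark-env.sh or in the shell that starts IPython, which is usually the cleaner option on a CDH cluster since it applies to every session.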