I'm running Spark 1.3 on CDH 5.4.4 and trying to set up Spark to run via
IPython Notebook.  collect() works just fine, but take() errors out.  (I'm
also having issues with collect() on other datasets ... but take() seems to
break every time I run it.)

My code is below.  Any thoughts?

>> sc
<pyspark.context.SparkContext at 0x7ffbfa310f10>
>> sys.version
'2.7.10 |Anaconda 2.3.0 (64-bit)| (default, May 28 2015, 17:02:03) \n[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]'
>> hourly = sc.textFile('tester')
>> hourly.collect()
[u'a man',
 u'a plan',
 u'a canal',
 u'panama']
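
As far as I can tell, collect() on a bare textFile never runs any of my
Python code on the executors (the rows come back through the JVM and get
deserialized on the driver), whereas take() ships a Python closure to the
workers.  On that hunch, the probe below ought to fail the same way as
take(), since the identity lambda forces the Python worker to start; I
haven't confirmed this, so treat it as an assumption:

    # Identity map: forces worker-side Python to run, which a bare
    # collect() on a textFile does not.
    hourly.map(lambda line: line).collect()
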
>> hourly = sc.textFile('tester')
>> hourly.take(2)
---------------------------------------------------------------------------
Py4JJavaError                             Traceback (most recent call last)
<ipython-input-15-1feecba5868b> in <module>()
      1 hourly = sc.textFile('tester')
----> 2 hourly.take(2)

/opt/cloudera/parcels/CDH/lib/spark/python/pyspark/rdd.py in take(self, num)
   1223 
   1224             p = range(partsScanned, min(partsScanned + numPartsToTry, totalParts))
-> 1225             res = self.context.runJob(self, takeUpToNumLeft, p, True)
   1226 
   1227             items += res

/opt/cloudera/parcels/CDH/lib/spark/python/pyspark/context.py in runJob(self, rdd, partitionFunc, partitions, allowLocal)
    841         # SparkContext#runJob.
    842         mappedRDD = rdd.mapPartitions(partitionFunc)
--> 843         it = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, javaPartitions, allowLocal)
    844         return list(mappedRDD._collect_iterator_through_file(it))
    845 

/opt/cloudera/parcels/CDH/lib/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py in __call__(self, *args)
    536         answer = self.gateway_client.send_command(command)
    537         return_value = get_return_value(answer, self.gateway_client,
--> 538                 self.target_id, self.name)
    539 
    540         for temp_arg in temp_args:

/opt/cloudera/parcels/CDH/lib/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
    298                 raise Py4JJavaError(
    299                     'An error occurred while calling {0}{1}{2}.\n'.
--> 300                     format(target_id, '.', name), value)
    301             else:
    302                 raise Py4JError(

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 10.0 failed 4 times, most recent failure: Lost task 0.3 in stage 10.0 (TID 47, dhd490101.autotrader.com): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/cloudera/parcels/CDH-5.4.4-1.cdh5.4.4.p894.568/jars/spark-assembly-1.3.0-cdh5.4.4-hadoop2.6.0-cdh5.4.4.jar/pyspark/worker.py", line 101, in main
    process()
  File "/opt/cloudera/parcels/CDH-5.4.4-1.cdh5.4.4.p894.568/jars/spark-assembly-1.3.0-cdh5.4.4-hadoop2.6.0-cdh5.4.4.jar/pyspark/worker.py", line 96, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/opt/cloudera/parcels/CDH-5.4.4-1.cdh5.4.4.p894.568/jars/spark-assembly-1.3.0-cdh5.4.4-hadoop2.6.0-cdh5.4.4.jar/pyspark/serializers.py", line 236, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/opt/cloudera/parcels/CDH/lib/spark/python/pyspark/rdd.py", line 1220, in takeUpToNumLeft
    while taken < left:
ImportError: No module named iter

        at org.apache.spark.api.python.PythonRDD$$anon$1.read(PythonRDD.scala:135)
        at org.apache.spark.api.python.PythonRDD$$anon$1.<init>(PythonRDD.scala:176)
        at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:94)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
        at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:61)
        at org.apache.spark.scheduler.Task.run(Task.scala:64)
        at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:203)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
        at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
        at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1203)
        at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1192)
        at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1191)
        at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
        at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
        at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1191)
        at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:693)
        at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:693)
        at scala.Option.foreach(Option.scala:236)
        at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:693)
        at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1393)
        at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1354)
        at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
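
To narrow it down, my next step is to compare the driver's interpreter
against whatever Python the executors actually launch.  This is just a
diagnostic sketch using the standard RDD API; the single-element RDD is
only there as a probe:

    import sys

    def worker_python(_):
        # Runs inside the executor's Python worker, so this reports the
        # interpreter the cluster is really using, not the driver's.
        import sys
        yield sys.version

    print 'driver: ', sys.version
    print 'workers:', sc.parallelize([0], 1).mapPartitions(worker_python).collect()

If the two differ (say, Anaconda 2.7.10 on the edge node but the system
Python on the workers), my understanding is that pointing PYSPARK_PYTHON
at the same interpreter on every node, before the SparkContext is created,
should line them up.  The path here is an assumption, not the real layout
on this cluster:

    import os
    # Assumed path; the interpreter must exist at this path on every worker.
    os.environ['PYSPARK_PYTHON'] = '/opt/anaconda/bin/python'
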



