kevindahl wrote:
> I'm trying to create a Spark DataFrame from a pandas DataFrame, but for
> even the most trivial of datasets I get an error along the lines of this:
>
> ---------------------------------------------------------------------------
> Py4JJavaError                             Traceback (most recent call last)
> <ipython-input-11-7857f9a55971> in <module>()
>       3 BabyDataSet = zip(names, births)
>       4 df = pd.DataFrame(data=BabyDataSet, columns=['Names', 'Births'])
> ----> 5 rdf = sqlCtx.createDataFrame(df)
>
> C:\spark\python\pyspark\sql\context.pyc in createDataFrame(self, data, schema, samplingRatio)
>     332
>     333         if isinstance(schema, (list, tuple)):
> --> 334             first = data.first()
>     335             if not isinstance(first, (list, tuple)):
>     336                 raise ValueError("each row in `rdd` should be list or tuple, "
>
> C:\spark\python\pyspark\rdd.pyc in first(self)
>    1241         ValueError: RDD is empty
>    1242         """
> -> 1243         rs = self.take(1)
>    1244         if rs:
>    1245             return rs[0]
>
> C:\spark\python\pyspark\rdd.pyc in take(self, num)
>    1223
>    1224             p = range(partsScanned, min(partsScanned + numPartsToTry, totalParts))
> -> 1225             res = self.context.runJob(self, takeUpToNumLeft, p, True)
>    1226
>    1227             items += res
>
> C:\spark\python\pyspark\context.pyc in runJob(self, rdd, partitionFunc, partitions, allowLocal)
>     841         # SparkContext#runJob.
>     842         mappedRDD = rdd.mapPartitions(partitionFunc)
> --> 843         it = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, javaPartitions, allowLocal)
>     844         return list(mappedRDD._collect_iterator_through_file(it))
>     845
>
> C:\spark\python\lib\py4j-0.8.2.1-src.zip\py4j\java_gateway.py in __call__(self, *args)
>     536         answer = self.gateway_client.send_command(command)
>     537         return_value = get_return_value(answer, self.gateway_client,
> --> 538                 self.target_id, self.name)
>     539
>     540         for temp_arg in temp_args:
>
> C:\spark\python\lib\py4j-0.8.2.1-src.zip\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
>     298                 raise Py4JJavaError(
>     299                     'An error occurred while calling {0}{1}{2}.\n'.
> --> 300                     format(target_id, '.', name), value)
>     301             else:
>     302                 raise Py4JError(
>
> Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
> : org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 4.0
> failed 1 times, most recent failure: Lost task 0.0 in stage 4.0 (TID 7, localhost):
> java.net.SocketException: Connection reset
>     at java.net.SocketInputStream.read(SocketInputStream.java:189)
>     at java.net.SocketInputStream.read(SocketInputStream.java:121)
>     at java.io.BufferedInputStream.fill(BufferedInputStream.java:246)
>     at java.io.BufferedInputStream.read(BufferedInputStream.java:265)
>     at java.io.DataInputStream.readInt(DataInputStream.java:387)
>     at org.apache.spark.api.python.PythonRDD$$anon$1.read(PythonRDD.scala:108)
>     at org.apache.spark.api.python.PythonRDD$$anon$1.<init>(PythonRDD.scala:176)
>     at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:94)
>     at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
>     at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
>     at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:61)
>     at org.apache.spark.scheduler.Task.run(Task.scala:64)
>     at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:203)
>     at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
>     at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
>     at java.lang.Thread.run(Thread.java:745)
>
> Driver stacktrace:
>     at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1203)
>     at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1192)
>     at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1191)
>     at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
>     at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
>     at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1191)
>     at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:693)
>     at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:693)
>     at scala.Option.foreach(Option.scala:236)
>     at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:693)
>     at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1393)
>     at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1354)
>     at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
>
> A simple repro:
>
> names = ['Bob', 'Jessica', 'Mary', 'John', 'Mel']
> births = [968, 155, 77, 578, 973]
> BabyDataSet = zip(names, births)
> df = pd.DataFrame(data=BabyDataSet, columns=['Names', 'Births'])
> rdf = sqlCtx.createDataFrame(df)
>
> On Win 8.1, Spark 1.3, Anaconda Python 2.7, IPython 3.0.0, pandas 0.15.2.
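Reading the trace, the failure isn't in the pandas handling itself: createDataFrame converts the frame to an RDD and then dies in data.first(), i.e. the first time an actual Python job runs, with the worker socket resetting. A quick way to check that (just a sketch, not verified on the Win 8.1 setup; it assumes the usual SparkContext named sc from the shell) is to run a trivial job with no pandas involved:

# Sanity-check sketch (untested on Win 8.1): if this also dies with
# java.net.SocketException: Connection reset, the PySpark worker itself
# is failing on this setup and createDataFrame is just the first caller.
print sc.parallelize([1, 2, 3]).count()   # expect: 3 on a healthy setup

If that count succeeds, then the problem really is specific to the pandas conversion path and would be worth a JIRA; if it fails the same way, any workaround at the createDataFrame level is unlikely to help.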
The original message may not have made it through, as I think I fumbled my subscription. Anyway, I have since tested the same thing with Spark 1.3 on Ubuntu, with the same Anaconda Python 2.7 / IPython 3.0.0 / pandas 0.15.2, and found that it works fine there.
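For reference, this is the round trip that succeeds on the Ubuntu box (a sketch with the same toy data, assuming the shell's sqlCtx; show() and toPandas() are both available on DataFrames in Spark 1.3):

import pandas as pd

# Same toy data as the repro; on the Ubuntu setup this completes cleanly.
names = ['Bob', 'Jessica', 'Mary', 'John', 'Mel']
births = [968, 155, 77, 578, 973]
df = pd.DataFrame(data=zip(names, births), columns=['Names', 'Births'])

rdf = sqlCtx.createDataFrame(df)  # pandas -> Spark DataFrame
rdf.show()                        # prints the five rows
print rdf.toPandas()              # Spark -> pandas round trip

Since the identical code only falls over on Windows, this looks platform-specific rather than an API misuse.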