Re: Spark 1.3 createDataFrame error with pandas df
On Mon, Mar 16, 2015 at 6:23 AM, kevindahl kevin.d...@gmail.com wrote: kevindahl wrote I'm trying to create a spark data frame from a pandas data frame, but for even the most trivial of datasets I get an error along the lines of this: --- Py4JJavaError Traceback (most recent call last) ipython-input-11-7857f9a55971 in module () 3 BabyDataSet = zip(names,births) 4 df = pd.DataFrame(data = BabyDataSet, columns=['Names', 'Births']) 5 rdf = sqlCtx.createDataFrame(df) C:\spark\python\pyspark\sql\context.pyc in createDataFrame(self, data, schema, samplingRatio) 332 333 if isinstance(schema, (list, tuple)): -- 334 first = data.first() 335 if not isinstance(first, (list, tuple)): 336 raise ValueError(each row in `rdd` should be list or tuple, C:\spark\python\pyspark\rdd.pyc in first(self) 1241 ValueError: RDD is empty 1242 - 1243 rs = self.take(1) 1244 if rs: 1245 return rs[0] C:\spark\python\pyspark\rdd.pyc in take(self, num) 1223 1224 p = range(partsScanned, min(partsScanned + numPartsToTry, totalParts)) - 1225 res = self.context.runJob(self, takeUpToNumLeft, p, True) 1226 1227 items += res C:\spark\python\pyspark\context.pyc in runJob(self, rdd, partitionFunc, partitions, allowLocal) 841 # SparkContext#runJob. 842 mappedRDD = rdd.mapPartitions(partitionFunc) -- 843 it = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, javaPartitions, allowLocal) 844 return list(mappedRDD._collect_iterator_through_file(it)) 845 C:\spark\python\lib\py4j-0.8.2.1-src.zip\py4j\java_gateway.py in __call__(self, *args) 536 answer = self.gateway_client.send_command(command) 537 return_value = get_return_value(answer, self.gateway_client, -- 538 self.target_id, self.name) 539 540 for temp_arg in temp_args: C:\spark\python\lib\py4j-0.8.2.1-src.zip\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name) 298 raise Py4JJavaError( 299 'An error occurred while calling {0}{1}{2}.\n'. 
-- 300 format(target_id, '.', name), value) 301 else: 302 raise Py4JError( Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob. : org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 4.0 failed 1 times, most recent failure: Lost task 0.0 in stage 4.0 (TID 7, localhost): java.net.SocketException: Connection reset The Python process had crashed; do you have any logging for this? at java.net.SocketInputStream.read(SocketInputStream.java:189) at java.net.SocketInputStream.read(SocketInputStream.java:121) at java.io.BufferedInputStream.fill(BufferedInputStream.java:246) at java.io.BufferedInputStream.read(BufferedInputStream.java:265) at java.io.DataInputStream.readInt(DataInputStream.java:387) at org.apache.spark.api.python.PythonRDD$$anon$1.read(PythonRDD.scala:108) at org.apache.spark.api.python.PythonRDD$$anon$1. init (PythonRDD.scala:176) at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:94) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277) at org.apache.spark.rdd.RDD.iterator(RDD.scala:244) at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:61) at org.apache.spark.scheduler.Task.run(Task.scala:64) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:203) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:745) Driver stacktrace: at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1203) at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1192) at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1191) at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) at 
scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47) at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1191) at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:693) at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:693) at
Re: Spark 1.3 createDataFrame error with pandas df
kevindahl wrote I'm trying to create a spark data frame from a pandas data frame, but for even the most trivial of datasets I get an error along the lines of this: --- Py4JJavaError Traceback (most recent call last) ipython-input-11-7857f9a55971 in module () 3 BabyDataSet = zip(names,births) 4 df = pd.DataFrame(data = BabyDataSet, columns=['Names', 'Births']) 5 rdf = sqlCtx.createDataFrame(df) C:\spark\python\pyspark\sql\context.pyc in createDataFrame(self, data, schema, samplingRatio) 332 333 if isinstance(schema, (list, tuple)): -- 334 first = data.first() 335 if not isinstance(first, (list, tuple)): 336 raise ValueError(each row in `rdd` should be list or tuple, C:\spark\python\pyspark\rdd.pyc in first(self) 1241 ValueError: RDD is empty 1242 - 1243 rs = self.take(1) 1244 if rs: 1245 return rs[0] C:\spark\python\pyspark\rdd.pyc in take(self, num) 1223 1224 p = range(partsScanned, min(partsScanned + numPartsToTry, totalParts)) - 1225 res = self.context.runJob(self, takeUpToNumLeft, p, True) 1226 1227 items += res C:\spark\python\pyspark\context.pyc in runJob(self, rdd, partitionFunc, partitions, allowLocal) 841 # SparkContext#runJob. 842 mappedRDD = rdd.mapPartitions(partitionFunc) -- 843 it = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, javaPartitions, allowLocal) 844 return list(mappedRDD._collect_iterator_through_file(it)) 845 C:\spark\python\lib\py4j-0.8.2.1-src.zip\py4j\java_gateway.py in __call__(self, *args) 536 answer = self.gateway_client.send_command(command) 537 return_value = get_return_value(answer, self.gateway_client, -- 538 self.target_id, self.name) 539 540 for temp_arg in temp_args: C:\spark\python\lib\py4j-0.8.2.1-src.zip\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name) 298 raise Py4JJavaError( 299 'An error occurred while calling {0}{1}{2}.\n'. 
-- 300 format(target_id, '.', name), value) 301 else: 302 raise Py4JError( Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob. : org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 4.0 failed 1 times, most recent failure: Lost task 0.0 in stage 4.0 (TID 7, localhost): java.net.SocketException: Connection reset at java.net.SocketInputStream.read(SocketInputStream.java:189) at java.net.SocketInputStream.read(SocketInputStream.java:121) at java.io.BufferedInputStream.fill(BufferedInputStream.java:246) at java.io.BufferedInputStream.read(BufferedInputStream.java:265) at java.io.DataInputStream.readInt(DataInputStream.java:387) at org.apache.spark.api.python.PythonRDD$$anon$1.read(PythonRDD.scala:108) at org.apache.spark.api.python.PythonRDD$$anon$1. init (PythonRDD.scala:176) at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:94) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277) at org.apache.spark.rdd.RDD.iterator(RDD.scala:244) at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:61) at org.apache.spark.scheduler.Task.run(Task.scala:64) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:203) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:745) Driver stacktrace: at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1203) at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1192) at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1191) at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47) at 
org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1191) at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:693) at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:693) at scala.Option.foreach(Option.scala:236) at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:693) at