Re: Spark 1.3 createDataframe error with pandas df

2015-03-19 Thread Davies Liu
On Mon, Mar 16, 2015 at 6:23 AM, kevindahl kevin.d...@gmail.com wrote:
 kevindahl wrote
 I'm trying to create a Spark DataFrame from a pandas DataFrame, but even for the most trivial of datasets I get an error along the lines of this:

 ---
 Py4JJavaError Traceback (most recent call last)
 <ipython-input-11-7857f9a55971> in <module>()
   3 BabyDataSet = zip(names,births)
   4 df = pd.DataFrame(data = BabyDataSet, columns=['Names', 'Births'])
 ----> 5 rdf = sqlCtx.createDataFrame(df)

 C:\spark\python\pyspark\sql\context.pyc in createDataFrame(self, data, schema, samplingRatio)
     332
     333         if isinstance(schema, (list, tuple)):
 --> 334             first = data.first()
     335             if not isinstance(first, (list, tuple)):
     336                 raise ValueError("each row in `rdd` should be list or tuple, "

 C:\spark\python\pyspark\rdd.pyc in first(self)
1241 ValueError: RDD is empty
1242 
 -> 1243         rs = self.take(1)
1244 if rs:
1245 return rs[0]

 C:\spark\python\pyspark\rdd.pyc in take(self, num)
1223
    1224             p = range(partsScanned, min(partsScanned + numPartsToTry, totalParts))
 -> 1225             res = self.context.runJob(self, takeUpToNumLeft, p, True)
1226
1227 items += res

 C:\spark\python\pyspark\context.pyc in runJob(self, rdd, partitionFunc, partitions, allowLocal)
 841 # SparkContext#runJob.
 842 mappedRDD = rdd.mapPartitions(partitionFunc)
 --> 843         it = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, javaPartitions, allowLocal)
 844 return list(mappedRDD._collect_iterator_through_file(it))
 845

 C:\spark\python\lib\py4j-0.8.2.1-src.zip\py4j\java_gateway.py in __call__(self, *args)
 536 answer = self.gateway_client.send_command(command)
     537         return_value = get_return_value(answer, self.gateway_client,
 --> 538                 self.target_id, self.name)
 539
 540 for temp_arg in temp_args:

 C:\spark\python\lib\py4j-0.8.2.1-src.zip\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
 298 raise Py4JJavaError(
     299                 'An error occurred while calling {0}{1}{2}.\n'.
 --> 300                 format(target_id, '.', name), value)
 301 else:
 302 raise Py4JError(

 Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
 : org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 4.0 failed 1 times, most recent failure: Lost task 0.0 in stage 4.0 (TID 7, localhost): java.net.SocketException: Connection reset

The Python process had crashed. Do you have any logging for this?
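
One way to narrow that down, since the traceback shows the crash inside take(1)/PythonRDD.runJob (i.e. in the Python worker) before anything pandas-specific matters: check whether a plain parallelized list and a non-pandas createDataFrame survive the same path. This is only a diagnostic sketch and assumes the same `sc` and `sqlCtx` as in the notebook above.

    # Exercises the same runJob/take(1) path that fails above, with no pandas involved.
    sc.parallelize([1, 2, 3]).first()

    # Same createDataFrame API, but fed a list of tuples plus column names instead
    # of a pandas DataFrame (the sample values here are made up).
    sqlCtx.createDataFrame([('Bob', 968), ('Jessica', 155)], ['Names', 'Births']).collect()

If these also die with "Connection reset", the worker setup (for example, which Python the workers launch on Windows) is the likelier culprit; if they work, the problem is specific to the pandas conversion.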

   at java.net.SocketInputStream.read(SocketInputStream.java:189)
   at java.net.SocketInputStream.read(SocketInputStream.java:121)
   at java.io.BufferedInputStream.fill(BufferedInputStream.java:246)
   at java.io.BufferedInputStream.read(BufferedInputStream.java:265)
   at java.io.DataInputStream.readInt(DataInputStream.java:387)
   at org.apache.spark.api.python.PythonRDD$$anon$1.read(PythonRDD.scala:108)
   at org.apache.spark.api.python.PythonRDD$$anon$1.<init>(PythonRDD.scala:176)
   at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:94)
   at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
   at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
   at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:61)
   at org.apache.spark.scheduler.Task.run(Task.scala:64)
   at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:203)
   at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
   at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
   at java.lang.Thread.run(Thread.java:745)

 Driver stacktrace:
   at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1203)
   at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1192)
   at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1191)
   at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
   at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
   at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1191)
   at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:693)
   at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:693)
   at 
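
For anyone trying to reproduce this, a self-contained version of the failing cell might look like the following. Only lines 3-5 of the cell appear in the traceback, so the sample data and the SparkContext/SQLContext setup here are assumed; this is a sketch of the report, not a fix.

    # Reconstructed repro; assumes Python 2 / Spark 1.3 as in the report.
    import pandas as pd
    from pyspark import SparkContext
    from pyspark.sql import SQLContext

    sc = SparkContext("local", "pandas-createDataFrame-repro")   # assumed local-mode setup
    sqlCtx = SQLContext(sc)

    names = ['Bob', 'Jessica', 'Mary', 'John', 'Mel']   # assumed sample values
    births = [968, 155, 77, 578, 973]                   # assumed sample values

    BabyDataSet = zip(names, births)
    df = pd.DataFrame(data=BabyDataSet, columns=['Names', 'Births'])
    rdf = sqlCtx.createDataFrame(df)   # raises the Py4JJavaError above on the reporter's setup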
