Repository: spark
Updated Branches:
  refs/heads/master 3689beb98 -> b60aab8a9


[SPARK-11258] Converting a Spark DataFrame into an R data.frame is slow / 
requires a lot of memory

https://issues.apache.org/jira/browse/SPARK-11258

I was not able to locate an existing unit test for this function so I wrote one.

Author: Frank Rosner <fr...@fam-rosner.de>

Closes #9222 from FRosner/master.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b60aab8a
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b60aab8a
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b60aab8a

Branch: refs/heads/master
Commit: b60aab8a95e2a35a1d4023a9d0a0d9724e4164f9
Parents: 3689beb
Author: Frank Rosner <fr...@fam-rosner.de>
Authored: Mon Oct 26 15:46:59 2015 -0700
Committer: Shivaram Venkataraman <shiva...@cs.berkeley.edu>
Committed: Mon Oct 26 15:46:59 2015 -0700

----------------------------------------------------------------------
 .../org/apache/spark/sql/api/r/SQLUtils.scala   | 16 +++++----
 .../apache/spark/sql/api/r/SQLUtilsSuite.scala  | 38 ++++++++++++++++++++
 2 files changed, 47 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/b60aab8a/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala
index b0120a8..b3f1346 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala
@@ -130,16 +130,18 @@ private[r] object SQLUtils {
   }
 
   def dfToCols(df: DataFrame): Array[Array[Any]] = {
-    // localDF is Array[Row]
-    val localDF = df.collect()
+    val localDF: Array[Row] = df.collect()
     val numCols = df.columns.length
+    val numRows = localDF.length
 
-    // result is Array[Array[Any]]
-    (0 until numCols).map { colIdx =>
-      localDF.map { row =>
-        row(colIdx)
+    val colArray = new Array[Array[Any]](numCols)
+    for (colNo <- 0 until numCols) {
+      colArray(colNo) = new Array[Any](numRows)
+      for (rowNo <- 0 until numRows) {
+        colArray(colNo)(rowNo) = localDF(rowNo)(colNo)
       }
-    }.toArray
+    }
+    colArray
   }
 
   def saveMode(mode: String): SaveMode = {

http://git-wip-us.apache.org/repos/asf/spark/blob/b60aab8a/sql/core/src/test/scala/org/apache/spark/sql/api/r/SQLUtilsSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/api/r/SQLUtilsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/api/r/SQLUtilsSuite.scala
new file mode 100644
index 0000000..f54e23e
--- /dev/null
+++ b/sql/core/src/test/scala/org/apache/spark/sql/api/r/SQLUtilsSuite.scala
@@ -0,0 +1,38 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+*    http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+package org.apache.spark.sql.api.r
+
+import org.apache.spark.sql.test.SharedSQLContext
+
+class SQLUtilsSuite extends SharedSQLContext {
+
+  import testImplicits._
+
+  test("dfToCols should collect and transpose a data frame") {
+    val df = Seq(
+      (1, 2, 3),
+      (4, 5, 6)
+    ).toDF
+    assert(SQLUtils.dfToCols(df) === Array(
+      Array(1, 4),
+      Array(2, 5),
+      Array(3, 6)
+    ))
+  }
+
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to