git commit: [SPARK-2376][SQL] Selecting list values inside nested JSON objects raises java.lang.IllegalArgumentException
Repository: spark Updated Branches: refs/heads/master f0496ee10 - 4352a2fda [SPARK-2376][SQL] Selecting list values inside nested JSON objects raises java.lang.IllegalArgumentException JIRA: https://issues.apache.org/jira/browse/SPARK-2376 Author: Yin Huai h...@cse.ohio-state.edu Closes #1320 from yhuai/SPARK-2376 and squashes the following commits: 0107417 [Yin Huai] Merge remote-tracking branch 'upstream/master' into SPARK-2376 480803d [Yin Huai] Correctly handling JSON arrays in PySpark. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4352a2fd Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4352a2fd Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4352a2fd Branch: refs/heads/master Commit: 4352a2fdaa64efee7158eabef65703460ff284ec Parents: f0496ee Author: Yin Huai h...@cse.ohio-state.edu Authored: Mon Jul 7 18:37:38 2014 -0700 Committer: Michael Armbrust mich...@databricks.com Committed: Mon Jul 7 18:37:38 2014 -0700 -- python/pyspark/sql.py | 24 ++- .../scala/org/apache/spark/sql/SchemaRDD.scala | 45 +--- 2 files changed, 44 insertions(+), 25 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4352a2fd/python/pyspark/sql.py -- diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py index 5051c82..ffe1775 100644 --- a/python/pyspark/sql.py +++ b/python/pyspark/sql.py @@ -152,10 +152,12 @@ class SQLContext: ofn.close() srdd = sqlCtx.jsonFile(jsonFile) sqlCtx.registerRDDAsTable(srdd, table1) - srdd2 = sqlCtx.sql(SELECT field1 AS f1, field2 as f2, field3 as f3 from table1) - srdd2.collect() == [{f1: 1, f2: row1, f3:{field4:11}}, -... {f1: 2, f2: row2, f3:{field4:22}}, -... {f1: 3, f2: row3, f3:{field4:33}}] + srdd2 = sqlCtx.sql( +... SELECT field1 AS f1, field2 as f2, field3 as f3, field6 as f4 from table1) + srdd2.collect() == [ +... {f1:1, f2:row1, f3:{field4:11, field5: None}, f4:None}, +... 
{f1:2, f2:None, f3:{field4:22, field5: [10, 11]}, f4:[{field7: row2}]}, +... {f1:None, f2:row3, f3:{field4:33, field5: []}, f4:None}] True jschema_rdd = self._ssql_ctx.jsonFile(path) @@ -167,10 +169,12 @@ class SQLContext: srdd = sqlCtx.jsonRDD(json) sqlCtx.registerRDDAsTable(srdd, table1) - srdd2 = sqlCtx.sql(SELECT field1 AS f1, field2 as f2, field3 as f3 from table1) - srdd2.collect() == [{f1: 1, f2: row1, f3:{field4:11}}, -... {f1: 2, f2: row2, f3:{field4:22}}, -... {f1: 3, f2: row3, f3:{field4:33}}] + srdd2 = sqlCtx.sql( +... SELECT field1 AS f1, field2 as f2, field3 as f3, field6 as f4 from table1) + srdd2.collect() == [ +... {f1:1, f2:row1, f3:{field4:11, field5: None}, f4:None}, +... {f1:2, f2:None, f3:{field4:22, field5: [10, 11]}, f4:[{field7: row2}]}, +... {f1:None, f2:row3, f3:{field4:33, field5: []}, f4:None}] True def func(split, iterator): @@ -492,8 +496,8 @@ def _test(): globs['rdd'] = sc.parallelize([{field1 : 1, field2 : row1}, {field1 : 2, field2: row2}, {field1 : 3, field2: row3}]) jsonStrings = ['{field1: 1, field2: row1, field3:{field4:11}}', - '{field1 : 2, field2: row2, field3:{field4:22}}', - '{field1 : 3, field2: row3, field3:{field4:33}}'] + '{field1 : 2, field3:{field4:22, field5: [10, 11]}, field6:[{field7: row2}]}', + '{field1 : null, field2: row3, field3:{field4:33, field5: []}}'] globs['jsonStrings'] = jsonStrings globs['json'] = sc.parallelize(jsonStrings) globs['nestedRdd1'] = sc.parallelize([ http://git-wip-us.apache.org/repos/asf/spark/blob/4352a2fd/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala index 8f9f54f..8bcfc7c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala @@ -17,6 +17,11 @@ package org.apache.spark.sql +import java.util.{Map = JMap, List = JList, Set = JSet} + +import 
scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ + import net.razorvine.pickle.Pickler import org.apache.spark.{Dependency, OneToOneDependency, Partition, Partitioner, TaskContext} @@ -27,10 +32,9 @@ import org.apache.spark.sql.catalyst.analysis._ import
git commit: [SPARK-2376][SQL] Selecting list values inside nested JSON objects raises java.lang.IllegalArgumentException
Repository: spark Updated Branches: refs/heads/branch-1.0 1032c2875 - 9dce7beff [SPARK-2376][SQL] Selecting list values inside nested JSON objects raises java.lang.IllegalArgumentException JIRA: https://issues.apache.org/jira/browse/SPARK-2376 Author: Yin Huai h...@cse.ohio-state.edu Closes #1320 from yhuai/SPARK-2376 and squashes the following commits: 0107417 [Yin Huai] Merge remote-tracking branch 'upstream/master' into SPARK-2376 480803d [Yin Huai] Correctly handling JSON arrays in PySpark. (cherry picked from commit 4352a2fdaa64efee7158eabef65703460ff284ec) Signed-off-by: Michael Armbrust mich...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9dce7bef Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9dce7bef Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9dce7bef Branch: refs/heads/branch-1.0 Commit: 9dce7beffb668e42ee05d961ae47f33047d579cc Parents: 1032c28 Author: Yin Huai h...@cse.ohio-state.edu Authored: Mon Jul 7 18:37:38 2014 -0700 Committer: Michael Armbrust mich...@databricks.com Committed: Mon Jul 7 18:52:51 2014 -0700 -- python/pyspark/sql.py | 24 ++- .../scala/org/apache/spark/sql/SchemaRDD.scala | 45 +--- 2 files changed, 44 insertions(+), 25 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9dce7bef/python/pyspark/sql.py -- diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py index 5051c82..ffe1775 100644 --- a/python/pyspark/sql.py +++ b/python/pyspark/sql.py @@ -152,10 +152,12 @@ class SQLContext: ofn.close() srdd = sqlCtx.jsonFile(jsonFile) sqlCtx.registerRDDAsTable(srdd, table1) - srdd2 = sqlCtx.sql(SELECT field1 AS f1, field2 as f2, field3 as f3 from table1) - srdd2.collect() == [{f1: 1, f2: row1, f3:{field4:11}}, -... {f1: 2, f2: row2, f3:{field4:22}}, -... {f1: 3, f2: row3, f3:{field4:33}}] + srdd2 = sqlCtx.sql( +... SELECT field1 AS f1, field2 as f2, field3 as f3, field6 as f4 from table1) + srdd2.collect() == [ +... 
{f1:1, f2:row1, f3:{field4:11, field5: None}, f4:None}, +... {f1:2, f2:None, f3:{field4:22, field5: [10, 11]}, f4:[{field7: row2}]}, +... {f1:None, f2:row3, f3:{field4:33, field5: []}, f4:None}] True jschema_rdd = self._ssql_ctx.jsonFile(path) @@ -167,10 +169,12 @@ class SQLContext: srdd = sqlCtx.jsonRDD(json) sqlCtx.registerRDDAsTable(srdd, table1) - srdd2 = sqlCtx.sql(SELECT field1 AS f1, field2 as f2, field3 as f3 from table1) - srdd2.collect() == [{f1: 1, f2: row1, f3:{field4:11}}, -... {f1: 2, f2: row2, f3:{field4:22}}, -... {f1: 3, f2: row3, f3:{field4:33}}] + srdd2 = sqlCtx.sql( +... SELECT field1 AS f1, field2 as f2, field3 as f3, field6 as f4 from table1) + srdd2.collect() == [ +... {f1:1, f2:row1, f3:{field4:11, field5: None}, f4:None}, +... {f1:2, f2:None, f3:{field4:22, field5: [10, 11]}, f4:[{field7: row2}]}, +... {f1:None, f2:row3, f3:{field4:33, field5: []}, f4:None}] True def func(split, iterator): @@ -492,8 +496,8 @@ def _test(): globs['rdd'] = sc.parallelize([{field1 : 1, field2 : row1}, {field1 : 2, field2: row2}, {field1 : 3, field2: row3}]) jsonStrings = ['{field1: 1, field2: row1, field3:{field4:11}}', - '{field1 : 2, field2: row2, field3:{field4:22}}', - '{field1 : 3, field2: row3, field3:{field4:33}}'] + '{field1 : 2, field3:{field4:22, field5: [10, 11]}, field6:[{field7: row2}]}', + '{field1 : null, field2: row3, field3:{field4:33, field5: []}}'] globs['jsonStrings'] = jsonStrings globs['json'] = sc.parallelize(jsonStrings) globs['nestedRdd1'] = sc.parallelize([ http://git-wip-us.apache.org/repos/asf/spark/blob/9dce7bef/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala index 8f9f54f..8bcfc7c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala @@ -17,6 +17,11 @@ package org.apache.spark.sql +import 
java.util.{Map => JMap, List => JList, Set => JSet} + +import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ + +import net.razorvine.pickle.Pickler import org.apache.spark.{Dependency,