git commit: [SPARK-1977][MLLIB] register mutable BitSet in MovieLenseALS
Repository: spark Updated Branches: refs/heads/branch-1.0 b459aa77f - 5044ba60a [SPARK-1977][MLLIB] register mutable BitSet in MovieLenseALS Author: Neville Li nevi...@spotify.com Closes #1319 from nevillelyh/gh/SPARK-1977 and squashes the following commits: 1f0a355 [Neville Li] [SPARK-1977][MLLIB] register mutable BitSet in MovieLenseALS (cherry picked from commit f7ce1b3b48f0354434456241188c6a5d954852e2) Signed-off-by: Xiangrui Meng m...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5044ba60 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5044ba60 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5044ba60 Branch: refs/heads/branch-1.0 Commit: 5044ba60a92495124b97ee97b87a45ce46d6073e Parents: b459aa7 Author: Neville Li nevi...@spotify.com Authored: Mon Jul 7 15:06:14 2014 -0700 Committer: Xiangrui Meng m...@databricks.com Committed: Mon Jul 7 15:08:10 2014 -0700 -- .../main/scala/org/apache/spark/examples/mllib/MovieLensALS.scala | 3 +++ 1 file changed, 3 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5044ba60/examples/src/main/scala/org/apache/spark/examples/mllib/MovieLensALS.scala -- diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/MovieLensALS.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/MovieLensALS.scala index 6eb41e7..7a5a4cc 100644 --- a/examples/src/main/scala/org/apache/spark/examples/mllib/MovieLensALS.scala +++ b/examples/src/main/scala/org/apache/spark/examples/mllib/MovieLensALS.scala @@ -17,6 +17,8 @@ package org.apache.spark.examples.mllib +import scala.collection.mutable + import com.esotericsoftware.kryo.Kryo import org.apache.log4j.{Level, Logger} import scopt.OptionParser @@ -41,6 +43,7 @@ object MovieLensALS { class ALSRegistrator extends KryoRegistrator { override def registerClasses(kryo: Kryo) { kryo.register(classOf[Rating]) + kryo.register(classOf[mutable.BitSet]) } }
svn commit: r1608626 - in /spark: downloads.md site/downloads.html
Author: rxin Date: Mon Jul 7 22:47:19 2014 New Revision: 1608626 URL: http://svn.apache.org/r1608626 Log: Fix 1.0.0 release notes link. Modified: spark/downloads.md spark/site/downloads.html Modified: spark/downloads.md URL: http://svn.apache.org/viewvc/spark/downloads.md?rev=1608626&r1=1608625&r2=1608626&view=diff == --- spark/downloads.md (original) +++ spark/downloads.md Mon Jul 7 22:47:19 2014 @@ -52,7 +52,7 @@ If you are interested in working with th Once you've downloaded Spark, you can find instructions for installing and building it on the a href={{site.url}}documentation.htmldocumentation page/a. h3 id=all-releasesAll Releases/h3 -* [Spark 1.0.0](http://d3kbcqa49mib13.cloudfront.net/spark-1.0.0.tgz) (May 30, 2014) [(release notes)]({{site.url}}releases/spark-release-1.0.0.html) (prebuilt: [Hadoop1 [HDP1, CDH3]](http://d3kbcqa49mib13.cloudfront.net/spark-1.0.0-bin-hadoop1.tgz), [CDH4](http://d3kbcqa49mib13.cloudfront.net/spark-1.0.0-bin-cdh4.tgz), [Hadoop2 [HDP2, CDH5]](http://d3kbcqa49mib13.cloudfront.net/spark-1.0.0-bin-hadoop2.tgz)) +* [Spark 1.0.0](http://d3kbcqa49mib13.cloudfront.net/spark-1.0.0.tgz) (May 30, 2014) [(release notes)]({{site.url}}releases/spark-release-1-0-0.html) (prebuilt: [Hadoop1 [HDP1, CDH3]](http://d3kbcqa49mib13.cloudfront.net/spark-1.0.0-bin-hadoop1.tgz), [CDH4](http://d3kbcqa49mib13.cloudfront.net/spark-1.0.0-bin-cdh4.tgz), [Hadoop2 [HDP2, CDH5]](http://d3kbcqa49mib13.cloudfront.net/spark-1.0.0-bin-hadoop2.tgz)) * [Spark 0.9.1](http://d3kbcqa49mib13.cloudfront.net/spark-0.9.1.tgz) (Apr 9, 2014) [(release notes)]({{site.url}}releases/spark-release-0-9-1.html) (prebuilt: [Hadoop1 [HDP1, CDH3]](http://d3kbcqa49mib13.cloudfront.net/spark-0.9.1-bin-hadoop1.tgz), [CDH4](http://d3kbcqa49mib13.cloudfront.net/spark-0.9.1-bin-cdh4.tgz), [Hadoop2 [HDP2, CDH5]](http://d3kbcqa49mib13.cloudfront.net/spark-0.9.1-bin-hadoop2.tgz)) * [Spark 0.9.0](http://d3kbcqa49mib13.cloudfront.net/spark-0.9.0-incubating.tgz) (Feb 2, 2014) [(release 
notes)]({{site.url}}releases/spark-release-0-9-0.html) (prebuilt: [Hadoop1 [HDP1, CDH3]](http://d3kbcqa49mib13.cloudfront.net/spark-0.9.0-incubating-bin-hadoop1.tgz), [CDH4](http://d3kbcqa49mib13.cloudfront.net/spark-0.9.0-incubating-bin-cdh4.tgz), [Hadoop2 [HDP2, CDH5]](http://d3kbcqa49mib13.cloudfront.net/spark-0.9.0-incubating-bin-hadoop2.tgz)) * [Spark 0.8.1](http://d3kbcqa49mib13.cloudfront.net/spark-0.8.1-incubating.tgz) (Dec 19, 2013) [(release notes)]({{site.url}}releases/spark-release-0-8-1.html) (prebuilt: [Hadoop1 [HDP1, CDH3]](http://d3kbcqa49mib13.cloudfront.net/spark-0.8.1-incubating-bin-hadoop1.tgz), [CDH4](http://d3kbcqa49mib13.cloudfront.net/spark-0.8.1-incubating-bin-cdh4.tgz), [Hadoop2 [HDP2, CDH5]](http://d3kbcqa49mib13.cloudfront.net/spark-0.8.1-incubating-bin-hadoop2.tgz)) Modified: spark/site/downloads.html URL: http://svn.apache.org/viewvc/spark/site/downloads.html?rev=1608626&r1=1608625&r2=1608626&view=diff == --- spark/site/downloads.html (original) +++ spark/site/downloads.html Mon Jul 7 22:47:19 2014 @@ -206,7 +206,7 @@ version: 1.0.0 h3 id=all-releasesAll Releases/h3 ul - lia href=http://d3kbcqa49mib13.cloudfront.net/spark-1.0.0.tgz;Spark 1.0.0/a (May 30, 2014) a href=/releases/spark-release-1.0.0.html(release notes)/a (prebuilt: a href=http://d3kbcqa49mib13.cloudfront.net/spark-1.0.0-bin-hadoop1.tgz;Hadoop1 [HDP1, CDH3]/a, a href=http://d3kbcqa49mib13.cloudfront.net/spark-1.0.0-bin-cdh4.tgz;CDH4/a, a href=http://d3kbcqa49mib13.cloudfront.net/spark-1.0.0-bin-hadoop2.tgz;Hadoop2 [HDP2, CDH5]/a)/li + lia href=http://d3kbcqa49mib13.cloudfront.net/spark-1.0.0.tgz;Spark 1.0.0/a (May 30, 2014) a href=/releases/spark-release-1-0-0.html(release notes)/a (prebuilt: a href=http://d3kbcqa49mib13.cloudfront.net/spark-1.0.0-bin-hadoop1.tgz;Hadoop1 [HDP1, CDH3]/a, a href=http://d3kbcqa49mib13.cloudfront.net/spark-1.0.0-bin-cdh4.tgz;CDH4/a, a href=http://d3kbcqa49mib13.cloudfront.net/spark-1.0.0-bin-hadoop2.tgz;Hadoop2 [HDP2, CDH5]/a)/li lia 
href=http://d3kbcqa49mib13.cloudfront.net/spark-0.9.1.tgz;Spark 0.9.1/a (Apr 9, 2014) a href=/releases/spark-release-0-9-1.html(release notes)/a (prebuilt: a href=http://d3kbcqa49mib13.cloudfront.net/spark-0.9.1-bin-hadoop1.tgz;Hadoop1 [HDP1, CDH3]/a, a href=http://d3kbcqa49mib13.cloudfront.net/spark-0.9.1-bin-cdh4.tgz;CDH4/a, a href=http://d3kbcqa49mib13.cloudfront.net/spark-0.9.1-bin-hadoop2.tgz;Hadoop2 [HDP2, CDH5]/a)/li lia href=http://d3kbcqa49mib13.cloudfront.net/spark-0.9.0-incubating.tgz;Spark 0.9.0/a (Feb 2, 2014) a href=/releases/spark-release-0-9-0.html(release notes)/a (prebuilt: a href=http://d3kbcqa49mib13.cloudfront.net/spark-0.9.0-incubating-bin-hadoop1.tgz;Hadoop1 [HDP1, CDH3]/a, a
git commit: [SPARK-2339][SQL] SQL parser in sql-core is case sensitive, but a table alias is converted to lower case when we create Subquery
Repository: spark Updated Branches: refs/heads/branch-1.0 5044ba60a - e522971e8 [SPARK-2339][SQL] SQL parser in sql-core is case sensitive, but a table alias is converted to lower case when we create Subquery Reported by http://apache-spark-user-list.1001560.n3.nabble.com/Spark-SQL-Join-throws-exception-td8599.html After we get the table from the catalog, because the table has an alias, we will temporarily insert a Subquery. Then, we convert the table alias to lower case no matter if the parser is case sensitive or not. To see the issue ... ``` val sqlContext = new org.apache.spark.sql.SQLContext(sc) import sqlContext.createSchemaRDD case class Person(name: String, age: Int) val people = sc.textFile(examples/src/main/resources/people.txt).map(_.split(,)).map(p = Person(p(0), p(1).trim.toInt)) people.registerAsTable(people) sqlContext.sql(select PEOPLE.name from people PEOPLE) ``` The plan is ... ``` == Query Plan == Project ['PEOPLE.name] ExistingRdd [name#0,age#1], MapPartitionsRDD[4] at mapPartitions at basicOperators.scala:176 ``` You can find that `PEOPLE.name` is not resolved. This PR introduces three changes. 1. If a table has an alias, the catalog will not lowercase the alias. If a lowercase alias is needed, the analyzer will do the work. 2. A catalog has a new val caseSensitive that indicates if this catalog is case sensitive or not. For example, a SimpleCatalog is case sensitive, but 3. Corresponding unit tests. With this PR, case sensitivity of database names and table names is handled by the catalog. Case sensitivity of other identifiers are handled by the analyzer. JIRA: https://issues.apache.org/jira/browse/SPARK-2339 Author: Yin Huai h...@cse.ohio-state.edu Closes #1317 from yhuai/SPARK-2339 and squashes the following commits: 12d8006 [Yin Huai] Handling case sensitivity correctly. This patch introduces three changes. 1. If a table has an alias, the catalog will not lowercase the alias. If a lowercase alias is needed, the analyzer will do the work. 2. 
A catalog has a new val caseSensitive that indicates if this catalog is case sensitive or not. For example, a SimpleCatalog is case sensitive, but 3. Corresponding unit tests. With this patch, case sensitivity of database names and table names is handled by the catalog. Case sensitivity of other identifiers is handled by the analyzer. (cherry picked from commit c0b4cf097de50eb2c4b0f0e67da53ee92efc1f77) Signed-off-by: Michael Armbrust mich...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e522971e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e522971e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e522971e Branch: refs/heads/branch-1.0 Commit: e522971e81efd3a7ec4a39b20082b890d11caa42 Parents: 5044ba6 Author: Yin Huai h...@cse.ohio-state.edu Authored: Mon Jul 7 17:01:44 2014 -0700 Committer: Michael Armbrust mich...@databricks.com Committed: Mon Jul 7 17:01:59 2014 -0700 -- .../spark/sql/catalyst/analysis/Catalog.scala | 55 .../sql/catalyst/analysis/AnalysisSuite.scala | 69 +--- .../scala/org/apache/spark/sql/SQLContext.scala | 2 +- .../spark/sql/hive/HiveMetastoreCatalog.scala | 23 --- ...ive table-0-5d14d21a239daa42b086cc895215009a | 14 .../sql/hive/execution/HiveQuerySuite.scala | 16 + 6 files changed, 149 insertions(+), 30 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e522971e/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala index f30b5d8..0d05d98 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala @@ -25,6 +25,9 @@ import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Subquery} * An interface 
for looking up relations by name. Used by an [[Analyzer]]. */ trait Catalog { + + def caseSensitive: Boolean + def lookupRelation( databaseName: Option[String], tableName: String, @@ -35,22 +38,44 @@ trait Catalog { def unregisterTable(databaseName: Option[String], tableName: String): Unit def unregisterAllTables(): Unit + + protected def processDatabaseAndTableName( + databaseName: Option[String], + tableName: String): (Option[String], String) = { +if (!caseSensitive) { + (databaseName.map(_.toLowerCase), tableName.toLowerCase) +} else { + (databaseName, tableName) +} + } + + protected def
git commit: [SPARK-2339][SQL] SQL parser in sql-core is case sensitive, but a table alias is converted to lower case when we create Subquery
Repository: spark Updated Branches: refs/heads/master f7ce1b3b4 - c0b4cf097 [SPARK-2339][SQL] SQL parser in sql-core is case sensitive, but a table alias is converted to lower case when we create Subquery Reported by http://apache-spark-user-list.1001560.n3.nabble.com/Spark-SQL-Join-throws-exception-td8599.html After we get the table from the catalog, because the table has an alias, we will temporarily insert a Subquery. Then, we convert the table alias to lower case no matter if the parser is case sensitive or not. To see the issue ... ``` val sqlContext = new org.apache.spark.sql.SQLContext(sc) import sqlContext.createSchemaRDD case class Person(name: String, age: Int) val people = sc.textFile(examples/src/main/resources/people.txt).map(_.split(,)).map(p = Person(p(0), p(1).trim.toInt)) people.registerAsTable(people) sqlContext.sql(select PEOPLE.name from people PEOPLE) ``` The plan is ... ``` == Query Plan == Project ['PEOPLE.name] ExistingRdd [name#0,age#1], MapPartitionsRDD[4] at mapPartitions at basicOperators.scala:176 ``` You can find that `PEOPLE.name` is not resolved. This PR introduces three changes. 1. If a table has an alias, the catalog will not lowercase the alias. If a lowercase alias is needed, the analyzer will do the work. 2. A catalog has a new val caseSensitive that indicates if this catalog is case sensitive or not. For example, a SimpleCatalog is case sensitive, but 3. Corresponding unit tests. With this PR, case sensitivity of database names and table names is handled by the catalog. Case sensitivity of other identifiers are handled by the analyzer. JIRA: https://issues.apache.org/jira/browse/SPARK-2339 Author: Yin Huai h...@cse.ohio-state.edu Closes #1317 from yhuai/SPARK-2339 and squashes the following commits: 12d8006 [Yin Huai] Handling case sensitivity correctly. This patch introduces three changes. 1. If a table has an alias, the catalog will not lowercase the alias. If a lowercase alias is needed, the analyzer will do the work. 2. 
A catalog has a new val caseSensitive that indicates if this catalog is case sensitive or not. For example, a SimpleCatalog is case sensitive, but 3. Corresponding unit tests. With this patch, case sensitivity of database names and table names is handled by the catalog. Case sensitivity of other identifiers is handled by the analyzer. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c0b4cf09 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c0b4cf09 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c0b4cf09 Branch: refs/heads/master Commit: c0b4cf097de50eb2c4b0f0e67da53ee92efc1f77 Parents: f7ce1b3 Author: Yin Huai h...@cse.ohio-state.edu Authored: Mon Jul 7 17:01:44 2014 -0700 Committer: Michael Armbrust mich...@databricks.com Committed: Mon Jul 7 17:01:44 2014 -0700 -- .../spark/sql/catalyst/analysis/Catalog.scala | 55 .../sql/catalyst/analysis/AnalysisSuite.scala | 69 +--- .../scala/org/apache/spark/sql/SQLContext.scala | 2 +- .../spark/sql/hive/HiveMetastoreCatalog.scala | 23 --- ...ive table-0-5d14d21a239daa42b086cc895215009a | 14 .../sql/hive/execution/HiveQuerySuite.scala | 16 + 6 files changed, 149 insertions(+), 30 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c0b4cf09/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala index f30b5d8..0d05d98 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Catalog.scala @@ -25,6 +25,9 @@ import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Subquery} * An interface for looking up relations by name. Used by an [[Analyzer]]. 
*/ trait Catalog { + + def caseSensitive: Boolean + def lookupRelation( databaseName: Option[String], tableName: String, @@ -35,22 +38,44 @@ trait Catalog { def unregisterTable(databaseName: Option[String], tableName: String): Unit def unregisterAllTables(): Unit + + protected def processDatabaseAndTableName( + databaseName: Option[String], + tableName: String): (Option[String], String) = { +if (!caseSensitive) { + (databaseName.map(_.toLowerCase), tableName.toLowerCase) +} else { + (databaseName, tableName) +} + } + + protected def processDatabaseAndTableName( + databaseName: String, + tableName: String): (String, String) = { +if (!caseSensitive) { +
git commit: [SPARK-2375][SQL] JSON schema inference may not resolve type conflicts correctly for a field inside an array of structs
Repository: spark Updated Branches: refs/heads/branch-1.0 691b554f3 - 1032c2875 [SPARK-2375][SQL] JSON schema inference may not resolve type conflicts correctly for a field inside an array of structs For example, for ``` {array: [{field:214748364700}, {field:1}]} ``` the type of field is resolved as IntType. While, for ``` {array: [{field:1}, {field:214748364700}]} ``` the type of field is resolved as LongType. JIRA: https://issues.apache.org/jira/browse/SPARK-2375 Author: Yin Huai huaiyin@gmail.com Closes #1308 from yhuai/SPARK-2375 and squashes the following commits: 3e2e312 [Yin Huai] Update unit test. 1b2ff9f [Yin Huai] Merge remote-tracking branch 'upstream/master' into SPARK-2375 10794eb [Yin Huai] Correctly resolve the type of a field inside an array of structs. (cherry picked from commit f0496ee10847db921a028a34f70385f9b740b3f3) Signed-off-by: Michael Armbrust mich...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1032c287 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1032c287 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1032c287 Branch: refs/heads/branch-1.0 Commit: 1032c28750ee2305756b44817e99951fe6385a63 Parents: 691b554 Author: Yin Huai huaiyin@gmail.com Authored: Mon Jul 7 17:05:59 2014 -0700 Committer: Michael Armbrust mich...@databricks.com Committed: Mon Jul 7 17:06:10 2014 -0700 -- .../src/main/scala/org/apache/spark/sql/json/JsonRDD.scala | 9 + .../test/scala/org/apache/spark/sql/json/JsonSuite.scala| 8 +--- .../test/scala/org/apache/spark/sql/json/TestJsonData.scala | 3 ++- 3 files changed, 12 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1032c287/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala index edf8677..f6cbca9 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/json/JsonRDD.scala @@ -198,11 +198,12 @@ private[sql] object JsonRDD extends Logging { * in this JSON object can appear in other JSON objects. */ private def allKeysWithValueTypes(m: Map[String, Any]): Set[(String, DataType)] = { -m.map{ +val keyValuePairs = m.map { // Quote the key with backticks to handle cases which have dots // in the field name. - case (key, dataType) = (s`$key`, dataType) -}.flatMap { + case (key, value) = (s`$key`, value) +}.toSet +keyValuePairs.flatMap { case (key: String, struct: Map[String, Any]) = { // The value associted with the key is an JSON object. allKeysWithValueTypes(struct).map { @@ -224,7 +225,7 @@ private[sql] object JsonRDD extends Logging { } } case (key: String, value) = (key, typeOfPrimitiveValue(value)) :: Nil -}.toSet +} } /** http://git-wip-us.apache.org/repos/asf/spark/blob/1032c287/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala index 10bd9f0..e765cfc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/json/JsonSuite.scala @@ -451,7 +451,9 @@ class JsonSuite extends QueryTest { val jsonSchemaRDD = jsonRDD(arrayElementTypeConflict) val expectedSchema = - AttributeReference(array, ArrayType(StringType), true)() :: Nil + AttributeReference(array1, ArrayType(StringType), true)() :: + AttributeReference(array2, ArrayType(StructType( +StructField(field, LongType, true) :: Nil)), true)() :: Nil comparePlans(Schema(expectedSchema), Schema(jsonSchemaRDD.logicalPlan.output)) @@ -460,12 +462,12 @@ class JsonSuite extends QueryTest { checkAnswer( sql(select * from jsonTable), Seq(Seq(1, 1.1, true, null, [], {}, [2,3,4], -{field:str})) :: Nil +{field:str}), 
Seq(Seq(214748364700L), Seq(1))) :: Nil ) // Treat an element as a number. checkAnswer( - sql(select array[0] + 1 from jsonTable), + sql(select array1[0] + 1 from jsonTable), 2 ) } http://git-wip-us.apache.org/repos/asf/spark/blob/1032c287/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/json/TestJsonData.scala
git commit: [SPARK-2376][SQL] Selecting list values inside nested JSON objects raises java.lang.IllegalArgumentException
Repository: spark Updated Branches: refs/heads/master f0496ee10 - 4352a2fda [SPARK-2376][SQL] Selecting list values inside nested JSON objects raises java.lang.IllegalArgumentException JIRA: https://issues.apache.org/jira/browse/SPARK-2376 Author: Yin Huai h...@cse.ohio-state.edu Closes #1320 from yhuai/SPARK-2376 and squashes the following commits: 0107417 [Yin Huai] Merge remote-tracking branch 'upstream/master' into SPARK-2376 480803d [Yin Huai] Correctly handling JSON arrays in PySpark. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4352a2fd Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4352a2fd Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4352a2fd Branch: refs/heads/master Commit: 4352a2fdaa64efee7158eabef65703460ff284ec Parents: f0496ee Author: Yin Huai h...@cse.ohio-state.edu Authored: Mon Jul 7 18:37:38 2014 -0700 Committer: Michael Armbrust mich...@databricks.com Committed: Mon Jul 7 18:37:38 2014 -0700 -- python/pyspark/sql.py | 24 ++- .../scala/org/apache/spark/sql/SchemaRDD.scala | 45 +--- 2 files changed, 44 insertions(+), 25 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4352a2fd/python/pyspark/sql.py -- diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py index 5051c82..ffe1775 100644 --- a/python/pyspark/sql.py +++ b/python/pyspark/sql.py @@ -152,10 +152,12 @@ class SQLContext: ofn.close() srdd = sqlCtx.jsonFile(jsonFile) sqlCtx.registerRDDAsTable(srdd, table1) - srdd2 = sqlCtx.sql(SELECT field1 AS f1, field2 as f2, field3 as f3 from table1) - srdd2.collect() == [{f1: 1, f2: row1, f3:{field4:11}}, -... {f1: 2, f2: row2, f3:{field4:22}}, -... {f1: 3, f2: row3, f3:{field4:33}}] + srdd2 = sqlCtx.sql( +... SELECT field1 AS f1, field2 as f2, field3 as f3, field6 as f4 from table1) + srdd2.collect() == [ +... {f1:1, f2:row1, f3:{field4:11, field5: None}, f4:None}, +... 
{f1:2, f2:None, f3:{field4:22, field5: [10, 11]}, f4:[{field7: row2}]}, +... {f1:None, f2:row3, f3:{field4:33, field5: []}, f4:None}] True jschema_rdd = self._ssql_ctx.jsonFile(path) @@ -167,10 +169,12 @@ class SQLContext: srdd = sqlCtx.jsonRDD(json) sqlCtx.registerRDDAsTable(srdd, table1) - srdd2 = sqlCtx.sql(SELECT field1 AS f1, field2 as f2, field3 as f3 from table1) - srdd2.collect() == [{f1: 1, f2: row1, f3:{field4:11}}, -... {f1: 2, f2: row2, f3:{field4:22}}, -... {f1: 3, f2: row3, f3:{field4:33}}] + srdd2 = sqlCtx.sql( +... SELECT field1 AS f1, field2 as f2, field3 as f3, field6 as f4 from table1) + srdd2.collect() == [ +... {f1:1, f2:row1, f3:{field4:11, field5: None}, f4:None}, +... {f1:2, f2:None, f3:{field4:22, field5: [10, 11]}, f4:[{field7: row2}]}, +... {f1:None, f2:row3, f3:{field4:33, field5: []}, f4:None}] True def func(split, iterator): @@ -492,8 +496,8 @@ def _test(): globs['rdd'] = sc.parallelize([{field1 : 1, field2 : row1}, {field1 : 2, field2: row2}, {field1 : 3, field2: row3}]) jsonStrings = ['{field1: 1, field2: row1, field3:{field4:11}}', - '{field1 : 2, field2: row2, field3:{field4:22}}', - '{field1 : 3, field2: row3, field3:{field4:33}}'] + '{field1 : 2, field3:{field4:22, field5: [10, 11]}, field6:[{field7: row2}]}', + '{field1 : null, field2: row3, field3:{field4:33, field5: []}}'] globs['jsonStrings'] = jsonStrings globs['json'] = sc.parallelize(jsonStrings) globs['nestedRdd1'] = sc.parallelize([ http://git-wip-us.apache.org/repos/asf/spark/blob/4352a2fd/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala index 8f9f54f..8bcfc7c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala @@ -17,6 +17,11 @@ package org.apache.spark.sql +import java.util.{Map = JMap, List = JList, Set = JSet} + +import 
scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ + import net.razorvine.pickle.Pickler import org.apache.spark.{Dependency, OneToOneDependency, Partition, Partitioner, TaskContext} @@ -27,10 +32,9 @@ import org.apache.spark.sql.catalyst.analysis._ import
git commit: [SPARK-2376][SQL] Selecting list values inside nested JSON objects raises java.lang.IllegalArgumentException
Repository: spark Updated Branches: refs/heads/branch-1.0 1032c2875 - 9dce7beff [SPARK-2376][SQL] Selecting list values inside nested JSON objects raises java.lang.IllegalArgumentException JIRA: https://issues.apache.org/jira/browse/SPARK-2376 Author: Yin Huai h...@cse.ohio-state.edu Closes #1320 from yhuai/SPARK-2376 and squashes the following commits: 0107417 [Yin Huai] Merge remote-tracking branch 'upstream/master' into SPARK-2376 480803d [Yin Huai] Correctly handling JSON arrays in PySpark. (cherry picked from commit 4352a2fdaa64efee7158eabef65703460ff284ec) Signed-off-by: Michael Armbrust mich...@databricks.com Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9dce7bef Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9dce7bef Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9dce7bef Branch: refs/heads/branch-1.0 Commit: 9dce7beffb668e42ee05d961ae47f33047d579cc Parents: 1032c28 Author: Yin Huai h...@cse.ohio-state.edu Authored: Mon Jul 7 18:37:38 2014 -0700 Committer: Michael Armbrust mich...@databricks.com Committed: Mon Jul 7 18:52:51 2014 -0700 -- python/pyspark/sql.py | 24 ++- .../scala/org/apache/spark/sql/SchemaRDD.scala | 45 +--- 2 files changed, 44 insertions(+), 25 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9dce7bef/python/pyspark/sql.py -- diff --git a/python/pyspark/sql.py b/python/pyspark/sql.py index 5051c82..ffe1775 100644 --- a/python/pyspark/sql.py +++ b/python/pyspark/sql.py @@ -152,10 +152,12 @@ class SQLContext: ofn.close() srdd = sqlCtx.jsonFile(jsonFile) sqlCtx.registerRDDAsTable(srdd, table1) - srdd2 = sqlCtx.sql(SELECT field1 AS f1, field2 as f2, field3 as f3 from table1) - srdd2.collect() == [{f1: 1, f2: row1, f3:{field4:11}}, -... {f1: 2, f2: row2, f3:{field4:22}}, -... {f1: 3, f2: row3, f3:{field4:33}}] + srdd2 = sqlCtx.sql( +... SELECT field1 AS f1, field2 as f2, field3 as f3, field6 as f4 from table1) + srdd2.collect() == [ +... 
{f1:1, f2:row1, f3:{field4:11, field5: None}, f4:None}, +... {f1:2, f2:None, f3:{field4:22, field5: [10, 11]}, f4:[{field7: row2}]}, +... {f1:None, f2:row3, f3:{field4:33, field5: []}, f4:None}] True jschema_rdd = self._ssql_ctx.jsonFile(path) @@ -167,10 +169,12 @@ class SQLContext: srdd = sqlCtx.jsonRDD(json) sqlCtx.registerRDDAsTable(srdd, table1) - srdd2 = sqlCtx.sql(SELECT field1 AS f1, field2 as f2, field3 as f3 from table1) - srdd2.collect() == [{f1: 1, f2: row1, f3:{field4:11}}, -... {f1: 2, f2: row2, f3:{field4:22}}, -... {f1: 3, f2: row3, f3:{field4:33}}] + srdd2 = sqlCtx.sql( +... SELECT field1 AS f1, field2 as f2, field3 as f3, field6 as f4 from table1) + srdd2.collect() == [ +... {f1:1, f2:row1, f3:{field4:11, field5: None}, f4:None}, +... {f1:2, f2:None, f3:{field4:22, field5: [10, 11]}, f4:[{field7: row2}]}, +... {f1:None, f2:row3, f3:{field4:33, field5: []}, f4:None}] True def func(split, iterator): @@ -492,8 +496,8 @@ def _test(): globs['rdd'] = sc.parallelize([{field1 : 1, field2 : row1}, {field1 : 2, field2: row2}, {field1 : 3, field2: row3}]) jsonStrings = ['{field1: 1, field2: row1, field3:{field4:11}}', - '{field1 : 2, field2: row2, field3:{field4:22}}', - '{field1 : 3, field2: row3, field3:{field4:33}}'] + '{field1 : 2, field3:{field4:22, field5: [10, 11]}, field6:[{field7: row2}]}', + '{field1 : null, field2: row3, field3:{field4:33, field5: []}}'] globs['jsonStrings'] = jsonStrings globs['json'] = sc.parallelize(jsonStrings) globs['nestedRdd1'] = sc.parallelize([ http://git-wip-us.apache.org/repos/asf/spark/blob/9dce7bef/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala index 8f9f54f..8bcfc7c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/SchemaRDD.scala @@ -17,6 +17,11 @@ package org.apache.spark.sql +import 
java.util.{Map = JMap, List = JList, Set = JSet} + +import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ + import net.razorvine.pickle.Pickler import org.apache.spark.{Dependency,