spark git commit: [SPARK-16546][SQL][PYSPARK] update python dataframe.drop
Repository: spark
Updated Branches: refs/heads/master 2e4075e2e -> 183242382

[SPARK-16546][SQL][PYSPARK] update python dataframe.drop

## What changes were proposed in this pull request?

Make the Python `dataframe.drop` API accept multiple column parameters, so that it matches the Scala API.

## How was this patch tested?

The doc tests.

Author: WeichenXu

Closes #14203 from WeichenXu123/drop_python_api.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/18324238
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/18324238
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/18324238

Branch: refs/heads/master
Commit: 1832423827fd518853b63f91c321e4568a39107d
Parents: 2e4075e
Author: WeichenXu
Authored: Thu Jul 14 22:55:49 2016 -0700
Committer: Reynold Xin
Committed: Thu Jul 14 22:55:49 2016 -0700

--
 python/pyspark/sql/dataframe.py | 27 +++
 1 file changed, 19 insertions(+), 8 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/18324238/python/pyspark/sql/dataframe.py
--
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index ab41e88..adf549d 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -1399,11 +1399,11 @@ class DataFrame(object):

     @since(1.4)
     @ignore_unicode_prefix
-    def drop(self, col):
+    def drop(self, *cols):
         """Returns a new :class:`DataFrame` that drops the specified column.

-        :param col: a string name of the column to drop, or a
-            :class:`Column` to drop.
+        :param cols: a string name of the column to drop, or a
+            :class:`Column` to drop, or a list of string name of the columns to drop.

         >>> df.drop('age').collect()
         [Row(name=u'Alice'), Row(name=u'Bob')]
@@ -1416,13 +1416,24 @@ class DataFrame(object):
         >>> df.join(df2, df.name == df2.name, 'inner').drop(df2.name).collect()
         [Row(age=5, name=u'Bob', height=85)]
+
+        >>> df.join(df2, 'name', 'inner').drop('age', 'height').collect()
+        [Row(name=u'Bob')]
         """
-        if isinstance(col, basestring):
-            jdf = self._jdf.drop(col)
-        elif isinstance(col, Column):
-            jdf = self._jdf.drop(col._jc)
+        if len(cols) == 1:
+            col = cols[0]
+            if isinstance(col, basestring):
+                jdf = self._jdf.drop(col)
+            elif isinstance(col, Column):
+                jdf = self._jdf.drop(col._jc)
+            else:
+                raise TypeError("col should be a string or a Column")
         else:
-            raise TypeError("col should be a string or a Column")
+            for col in cols:
+                if not isinstance(col, basestring):
+                    raise TypeError("each col in the param list should be a string")
+            jdf = self._jdf.drop(self._jseq(cols))
+
         return DataFrame(jdf, self.sql_ctx)

     @ignore_unicode_prefix
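For context, the Scala API this change brings PySpark in line with already accepts multiple column names. A small sketch of that Scala-side behaviour (the local SparkSession setup and column names are illustrative, not taken from the patch):

```scala
import org.apache.spark.sql.SparkSession

object MultiColumnDropExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("drop-example").getOrCreate()
    import spark.implicits._

    val df = Seq((5, "Bob", 85), (2, "Alice", 60)).toDF("age", "name", "height")

    // Scala's Dataset.drop already takes a varargs list of column names;
    // after this patch, PySpark's df.drop('age', 'height') behaves the same way.
    df.drop("age", "height").show()   // only the "name" column remains

    spark.stop()
  }
}
```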
spark git commit: [SPARK-16557][SQL] Remove stale doc in sql/README.md
Repository: spark Updated Branches: refs/heads/branch-2.0 aa4690b1b -> c5f935582 [SPARK-16557][SQL] Remove stale doc in sql/README.md ## What changes were proposed in this pull request? Most of the documentation in https://github.com/apache/spark/blob/master/sql/README.md is stale. It would be useful to keep the list of projects to explain what's going on, and everything else should be removed. ## How was this patch tested? N/A Author: Reynold XinCloses #14211 from rxin/SPARK-16557. (cherry picked from commit 2e4075e2ece9574100c79558cab054485e25c2ee) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c5f93558 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c5f93558 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c5f93558 Branch: refs/heads/branch-2.0 Commit: c5f935582c07787271dcabcfbd6a7b8e776d607a Parents: aa4690b Author: Reynold Xin Authored: Thu Jul 14 19:24:42 2016 -0700 Committer: Reynold Xin Committed: Thu Jul 14 19:24:47 2016 -0700 -- sql/README.md | 75 +- 1 file changed, 1 insertion(+), 74 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c5f93558/sql/README.md -- diff --git a/sql/README.md b/sql/README.md index b090398..58e9097 100644 --- a/sql/README.md +++ b/sql/README.md @@ -1,83 +1,10 @@ Spark SQL = -This module provides support for executing relational queries expressed in either SQL or a LINQ-like Scala DSL. +This module provides support for executing relational queries expressed in either SQL or the DataFrame/Dataset API. Spark SQL is broken up into four subprojects: - Catalyst (sql/catalyst) - An implementation-agnostic framework for manipulating trees of relational operators and expressions. - Execution (sql/core) - A query planner / execution engine for translating Catalyst's logical query plans into Spark RDDs. This component also includes a new public interface, SQLContext, that allows users to execute SQL or LINQ statements against existing RDDs and Parquet files. - Hive Support (sql/hive) - Includes an extension of SQLContext called HiveContext that allows users to write queries using a subset of HiveQL and access data from a Hive Metastore using Hive SerDes. There are also wrappers that allows users to run queries that include Hive UDFs, UDAFs, and UDTFs. - HiveServer and CLI support (sql/hive-thriftserver) - Includes support for the SQL CLI (bin/spark-sql) and a HiveServer2 (for JDBC/ODBC) compatible server. - - -Other dependencies for developers -- -In order to create new hive test cases (i.e. a test suite based on `HiveComparisonTest`), -you will need to setup your development environment based on the following instructions. - -If you are working with Hive 0.12.0, you will need to set several environmental variables as follows. - -``` -export HIVE_HOME="/hive/build/dist" -export HIVE_DEV_HOME="/hive/" -export HADOOP_HOME="/hadoop" -``` - -If you are working with Hive 0.13.1, the following steps are needed: - -1. Download Hive's [0.13.1](https://archive.apache.org/dist/hive/hive-0.13.1) and set `HIVE_HOME` with `export HIVE_HOME=""`. Please do not set `HIVE_DEV_HOME` (See [SPARK-4119](https://issues.apache.org/jira/browse/SPARK-4119)). -2. Set `HADOOP_HOME` with `export HADOOP_HOME=""` -3. Download all Hive 0.13.1a jars (Hive jars actually used by Spark) from [here](http://mvnrepository.com/artifact/org.spark-project.hive) and replace corresponding original 0.13.1 jars in `$HIVE_HOME/lib`. -4. 
Download [Kryo 2.21 jar](http://mvnrepository.com/artifact/com.esotericsoftware.kryo/kryo/2.21) (Note: 2.22 jar does not work) and [Javolution 5.5.1 jar](http://mvnrepository.com/artifact/javolution/javolution/5.5.1) to `$HIVE_HOME/lib`. -5. This step is optional. But, when generating golden answer files, if a Hive query fails and you find that Hive tries to talk to HDFS or you find weird runtime NPEs, set the following in your test suite... - -``` -val testTempDir = Utils.createTempDir() -// We have to use kryo to let Hive correctly serialize some plans. -sql("set hive.plan.serialization.format=kryo") -// Explicitly set fs to local fs. -sql(s"set fs.default.name=file://$testTempDir/") -// Ask Hive to run jobs in-process as a single map and reduce task. -sql("set mapred.job.tracker=local") -``` - -Using the console -= -An interactive scala console can be invoked by running `build/sbt hive/console`. -From here you can execute queries with HiveQl and manipulate DataFrame by using DSL. - -```scala -$ build/sbt
spark git commit: [SPARK-16557][SQL] Remove stale doc in sql/README.md
Repository: spark Updated Branches: refs/heads/master 972673aca -> 2e4075e2e [SPARK-16557][SQL] Remove stale doc in sql/README.md ## What changes were proposed in this pull request? Most of the documentation in https://github.com/apache/spark/blob/master/sql/README.md is stale. It would be useful to keep the list of projects to explain what's going on, and everything else should be removed. ## How was this patch tested? N/A Author: Reynold XinCloses #14211 from rxin/SPARK-16557. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2e4075e2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2e4075e2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2e4075e2 Branch: refs/heads/master Commit: 2e4075e2ece9574100c79558cab054485e25c2ee Parents: 972673a Author: Reynold Xin Authored: Thu Jul 14 19:24:42 2016 -0700 Committer: Reynold Xin Committed: Thu Jul 14 19:24:42 2016 -0700 -- sql/README.md | 75 +- 1 file changed, 1 insertion(+), 74 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2e4075e2/sql/README.md -- diff --git a/sql/README.md b/sql/README.md index b090398..58e9097 100644 --- a/sql/README.md +++ b/sql/README.md @@ -1,83 +1,10 @@ Spark SQL = -This module provides support for executing relational queries expressed in either SQL or a LINQ-like Scala DSL. +This module provides support for executing relational queries expressed in either SQL or the DataFrame/Dataset API. Spark SQL is broken up into four subprojects: - Catalyst (sql/catalyst) - An implementation-agnostic framework for manipulating trees of relational operators and expressions. - Execution (sql/core) - A query planner / execution engine for translating Catalyst's logical query plans into Spark RDDs. This component also includes a new public interface, SQLContext, that allows users to execute SQL or LINQ statements against existing RDDs and Parquet files. - Hive Support (sql/hive) - Includes an extension of SQLContext called HiveContext that allows users to write queries using a subset of HiveQL and access data from a Hive Metastore using Hive SerDes. There are also wrappers that allows users to run queries that include Hive UDFs, UDAFs, and UDTFs. - HiveServer and CLI support (sql/hive-thriftserver) - Includes support for the SQL CLI (bin/spark-sql) and a HiveServer2 (for JDBC/ODBC) compatible server. - - -Other dependencies for developers -- -In order to create new hive test cases (i.e. a test suite based on `HiveComparisonTest`), -you will need to setup your development environment based on the following instructions. - -If you are working with Hive 0.12.0, you will need to set several environmental variables as follows. - -``` -export HIVE_HOME="/hive/build/dist" -export HIVE_DEV_HOME="/hive/" -export HADOOP_HOME="/hadoop" -``` - -If you are working with Hive 0.13.1, the following steps are needed: - -1. Download Hive's [0.13.1](https://archive.apache.org/dist/hive/hive-0.13.1) and set `HIVE_HOME` with `export HIVE_HOME=""`. Please do not set `HIVE_DEV_HOME` (See [SPARK-4119](https://issues.apache.org/jira/browse/SPARK-4119)). -2. Set `HADOOP_HOME` with `export HADOOP_HOME=""` -3. Download all Hive 0.13.1a jars (Hive jars actually used by Spark) from [here](http://mvnrepository.com/artifact/org.spark-project.hive) and replace corresponding original 0.13.1 jars in `$HIVE_HOME/lib`. -4. 
Download [Kryo 2.21 jar](http://mvnrepository.com/artifact/com.esotericsoftware.kryo/kryo/2.21) (Note: 2.22 jar does not work) and [Javolution 5.5.1 jar](http://mvnrepository.com/artifact/javolution/javolution/5.5.1) to `$HIVE_HOME/lib`. -5. This step is optional. But, when generating golden answer files, if a Hive query fails and you find that Hive tries to talk to HDFS or you find weird runtime NPEs, set the following in your test suite... - -``` -val testTempDir = Utils.createTempDir() -// We have to use kryo to let Hive correctly serialize some plans. -sql("set hive.plan.serialization.format=kryo") -// Explicitly set fs to local fs. -sql(s"set fs.default.name=file://$testTempDir/") -// Ask Hive to run jobs in-process as a single map and reduce task. -sql("set mapred.job.tracker=local") -``` - -Using the console -= -An interactive scala console can be invoked by running `build/sbt hive/console`. -From here you can execute queries with HiveQl and manipulate DataFrame by using DSL. - -```scala -$ build/sbt hive/console - -[info] Starting scala interpreter... -import org.apache.spark.sql.catalyst.analysis._ -import
spark git commit: [SPARK-16555] Work around Jekyll error-handling bug which led to silent failures
Repository: spark Updated Branches: refs/heads/master 01c4c1fa5 -> 972673aca [SPARK-16555] Work around Jekyll error-handling bug which led to silent failures If a custom Jekyll template tag throws Ruby's equivalent of a "file not found" exception, then Jekyll will stop the doc building process but will exit with a successful status, causing our doc publishing jobs to silently fail. This is caused by https://github.com/jekyll/jekyll/issues/5104, a case of bad error-handling logic in Jekyll. This patch works around this by updating our `include_example.rb` plugin to catch the exception and exit rather than allowing it to bubble up and be ignored by Jekyll. I tested this manually with ``` rm ./examples/src/main/scala/org/apache/spark/examples/sql/SparkSQLExample.scala cd docs SKIP_API=1 jekyll build echo $? ``` Author: Josh RosenCloses #14209 from JoshRosen/fix-doc-building. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/972673ac Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/972673ac Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/972673ac Branch: refs/heads/master Commit: 972673aca562b24c885801d2ac48e0df95cde9eb Parents: 01c4c1f Author: Josh Rosen Authored: Thu Jul 14 15:55:36 2016 -0700 Committer: Reynold Xin Committed: Thu Jul 14 15:55:36 2016 -0700 -- docs/_plugins/include_example.rb | 10 +- 1 file changed, 9 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/972673ac/docs/_plugins/include_example.rb -- diff --git a/docs/_plugins/include_example.rb b/docs/_plugins/include_example.rb index 306..6ea1d43 100644 --- a/docs/_plugins/include_example.rb +++ b/docs/_plugins/include_example.rb @@ -45,7 +45,15 @@ module Jekyll @file = File.join(@code_dir, snippet_file) @lang = snippet_file.split('.').last - code = File.open(@file).read.encode("UTF-8") + begin +code = File.open(@file).read.encode("UTF-8") + rescue => e +# We need to explicitly exit on execptions here because Jekyll will silently swallow +# them, leading to silent build failures (see https://github.com/jekyll/jekyll/issues/5104) +puts(e) +puts(e.backtrace) +exit 1 + end code = select_lines(code) rendered_code = Pygments.highlight(code, :lexer => @lang) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16555] Work around Jekyll error-handling bug which led to silent failures
Repository: spark Updated Branches: refs/heads/branch-2.0 5c56bc00c -> aa4690b1b [SPARK-16555] Work around Jekyll error-handling bug which led to silent failures If a custom Jekyll template tag throws Ruby's equivalent of a "file not found" exception, then Jekyll will stop the doc building process but will exit with a successful status, causing our doc publishing jobs to silently fail. This is caused by https://github.com/jekyll/jekyll/issues/5104, a case of bad error-handling logic in Jekyll. This patch works around this by updating our `include_example.rb` plugin to catch the exception and exit rather than allowing it to bubble up and be ignored by Jekyll. I tested this manually with ``` rm ./examples/src/main/scala/org/apache/spark/examples/sql/SparkSQLExample.scala cd docs SKIP_API=1 jekyll build echo $? ``` Author: Josh RosenCloses #14209 from JoshRosen/fix-doc-building. (cherry picked from commit 972673aca562b24c885801d2ac48e0df95cde9eb) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/aa4690b1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/aa4690b1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/aa4690b1 Branch: refs/heads/branch-2.0 Commit: aa4690b1bbf86f5f927ca0038dc80dc17182b268 Parents: 5c56bc0 Author: Josh Rosen Authored: Thu Jul 14 15:55:36 2016 -0700 Committer: Reynold Xin Committed: Thu Jul 14 15:55:42 2016 -0700 -- docs/_plugins/include_example.rb | 10 +- 1 file changed, 9 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/aa4690b1/docs/_plugins/include_example.rb -- diff --git a/docs/_plugins/include_example.rb b/docs/_plugins/include_example.rb index 306..6ea1d43 100644 --- a/docs/_plugins/include_example.rb +++ b/docs/_plugins/include_example.rb @@ -45,7 +45,15 @@ module Jekyll @file = File.join(@code_dir, snippet_file) @lang = snippet_file.split('.').last - code = File.open(@file).read.encode("UTF-8") + begin +code = File.open(@file).read.encode("UTF-8") + rescue => e +# We need to explicitly exit on execptions here because Jekyll will silently swallow +# them, leading to silent build failures (see https://github.com/jekyll/jekyll/issues/5104) +puts(e) +puts(e.backtrace) +exit 1 + end code = select_lines(code) rendered_code = Pygments.highlight(code, :lexer => @lang) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16553][DOCS] Fix SQL example file name in docs
Repository: spark Updated Branches: refs/heads/branch-2.0 1fe0bcdd0 -> 5c56bc00c [SPARK-16553][DOCS] Fix SQL example file name in docs ## What changes were proposed in this pull request? Fixes a typo in the sql programming guide ## How was this patch tested? Building docs locally (If this patch involves UI changes, please attach a screenshot; otherwise, remove this) Author: Shivaram VenkataramanCloses #14208 from shivaram/spark-sql-doc-fix. (cherry picked from commit 01c4c1fa539a6c601ea0d8960363e895c17a8f76) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5c56bc00 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5c56bc00 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5c56bc00 Branch: refs/heads/branch-2.0 Commit: 5c56bc00ce6d873107010e574f53a4fa5a23bd27 Parents: 1fe0bcd Author: Shivaram Venkataraman Authored: Thu Jul 14 14:19:30 2016 -0700 Committer: Reynold Xin Committed: Thu Jul 14 14:19:35 2016 -0700 -- docs/sql-programming-guide.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5c56bc00/docs/sql-programming-guide.md -- diff --git a/docs/sql-programming-guide.md b/docs/sql-programming-guide.md index f5d1fee..a4127da 100644 --- a/docs/sql-programming-guide.md +++ b/docs/sql-programming-guide.md @@ -249,7 +249,7 @@ In addition to simple column references and expressions, DataFrames also have a The `sql` function on a `SparkSession` enables applications to run SQL queries programmatically and returns the result as a `DataFrame`. -{% include_example run_sql scala/org/apache/spark/examples/sql/SparkSQLExample.scala %} +{% include_example run_sql scala/org/apache/spark/examples/sql/SparkSqlExample.scala %} - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16313][SQL][BRANCH-1.6] Spark should not silently drop exceptions in file listing
Repository: spark Updated Branches: refs/heads/branch-1.6 4381e2121 -> 6ea7d4bd3 [SPARK-16313][SQL][BRANCH-1.6] Spark should not silently drop exceptions in file listing ## What changes were proposed in this pull request? Spark silently drops exceptions during file listing. This is a very bad behavior because it can mask legitimate errors and the resulting plan will silently have 0 rows. This patch changes it to not silently drop the errors. After making partition discovery not silently drop exceptions, HiveMetastoreCatalog can trigger partition discovery on empty tables, which cause FileNotFoundExceptions (these Exceptions were dropped by partition discovery silently). To address this issue, this PR introduces two **hacks** to workaround the issues. These two hacks try to avoid of triggering partition discovery on empty tables in HiveMetastoreCatalog. ## How was this patch tested? Manually tested. **Note: This is a backport of https://github.com/apache/spark/pull/13987** Author: Yin HuaiCloses #14139 from yhuai/SPARK-16313-branch-1.6. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6ea7d4bd Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6ea7d4bd Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6ea7d4bd Branch: refs/heads/branch-1.6 Commit: 6ea7d4bd393911d2d15b61e78df7473a7ea9b161 Parents: 4381e21 Author: Yin Huai Authored: Thu Jul 14 12:00:31 2016 -0700 Committer: Yin Huai Committed: Thu Jul 14 12:00:31 2016 -0700 -- .../apache/spark/sql/sources/interfaces.scala | 6 ++-- .../org/apache/spark/sql/SQLQuerySuite.scala| 2 +- .../spark/sql/hive/HiveMetastoreCatalog.scala | 30 +--- .../sql/hive/MetastoreDataSourcesSuite.scala| 23 --- 4 files changed, 49 insertions(+), 12 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6ea7d4bd/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala index ce5f3dc..5aba55c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/sources/interfaces.scala @@ -453,9 +453,9 @@ abstract class HadoopFsRelation private[sql]( val jobConf = new JobConf(hadoopConf, this.getClass()) val pathFilter = FileInputFormat.getInputPathFilter(jobConf) if (pathFilter != null) { -Try(fs.listStatus(qualified, pathFilter)).getOrElse(Array.empty) +fs.listStatus(qualified, pathFilter) } else { -Try(fs.listStatus(qualified)).getOrElse(Array.empty) +fs.listStatus(qualified) } }.filterNot { status => val name = status.getPath.getName @@ -903,7 +903,7 @@ private[sql] object HadoopFsRelation extends Logging { val hdfsPath = new Path(path) val fs = hdfsPath.getFileSystem(serializableConfiguration.value) val qualified = hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory) - Try(listLeafFiles(fs, fs.getFileStatus(qualified))).getOrElse(Array.empty) + listLeafFiles(fs, fs.getFileStatus(qualified)) }.map { status => FakeFileStatus( status.getPath.toString, http://git-wip-us.apache.org/repos/asf/spark/blob/6ea7d4bd/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 6fec580..be23043 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ 
b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -1757,7 +1757,7 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext { val e3 = intercept[AnalysisException] { sql("select * from json.invalid_file") } -assert(e3.message.contains("No input paths specified")) +assert(e3.message.contains("invalid_file does not exist")) } test("SortMergeJoin returns wrong results when using UnsafeRows") { http://git-wip-us.apache.org/repos/asf/spark/blob/6ea7d4bd/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala index 0562e33..03720c9 100644 ---
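The core of the change is dropping the `Try(...).getOrElse(Array.empty)` wrappers so that listing failures surface instead of silently producing empty results. A minimal, self-contained sketch of the difference in plain Scala — `listFiles` is a hypothetical stand-in for the real Hadoop `fs.listStatus` call, used only for illustration:

```scala
import scala.util.Try

object SilentListingSketch {
  // Stand-in for fs.listStatus: throws when the path does not exist.
  def listFiles(path: String): Array[String] =
    if (path == "/data/events") Array("part-00000", "part-00001")
    else throw new java.io.FileNotFoundException(path)

  def main(args: Array[String]): Unit = {
    // Old behaviour: the exception is swallowed and the caller just sees an
    // empty listing, so a mistyped path silently turns into a 0-row plan.
    val swallowed = Try(listFiles("/data/evnets")).getOrElse(Array.empty[String])
    println(s"swallowed: ${swallowed.length} files")   // prints 0, no error

    // New behaviour: call the listing directly and let the error propagate.
    try {
      listFiles("/data/evnets")
    } catch {
      case e: java.io.FileNotFoundException =>
        println(s"surfaced: $e")                       // the mistake is visible
    }
  }
}
```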
spark git commit: [SPARK-16540][YARN][CORE] Avoid adding jars twice for Spark running on yarn
Repository: spark Updated Branches: refs/heads/master 31ca741ae -> 91575cac3 [SPARK-16540][YARN][CORE] Avoid adding jars twice for Spark running on yarn ## What changes were proposed in this pull request? Currently when running spark on yarn, jars specified with --jars, --packages will be added twice, one is Spark's own file server, another is yarn's distributed cache, this can be seen from log: for example: ``` ./bin/spark-shell --master yarn-client --jars examples/target/scala-2.11/jars/scopt_2.11-3.3.0.jar ``` If specified the jar to be added is scopt jar, it will added twice: ``` ... 16/07/14 15:06:48 INFO Server: Started 5603ms 16/07/14 15:06:48 INFO Utils: Successfully started service 'SparkUI' on port 4040. 16/07/14 15:06:48 INFO SparkUI: Bound SparkUI to 0.0.0.0, and started at http://192.168.0.102:4040 16/07/14 15:06:48 INFO SparkContext: Added JAR file:/Users/sshao/projects/apache-spark/examples/target/scala-2.11/jars/scopt_2.11-3.3.0.jar at spark://192.168.0.102:63996/jars/scopt_2.11-3.3.0.jar with timestamp 1468480008637 16/07/14 15:06:49 INFO RMProxy: Connecting to ResourceManager at /0.0.0.0:8032 16/07/14 15:06:49 INFO Client: Requesting a new application from cluster with 1 NodeManagers 16/07/14 15:06:49 INFO Client: Verifying our application has not requested more than the maximum memory capability of the cluster (8192 MB per container) 16/07/14 15:06:49 INFO Client: Will allocate AM container, with 896 MB memory including 384 MB overhead 16/07/14 15:06:49 INFO Client: Setting up container launch context for our AM 16/07/14 15:06:49 INFO Client: Setting up the launch environment for our AM container 16/07/14 15:06:49 INFO Client: Preparing resources for our AM container 16/07/14 15:06:49 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME. 16/07/14 15:06:50 INFO Client: Uploading resource file:/private/var/folders/tb/8pw1511s2q78mj7plnq8p9g4gn/T/spark-a446300b-84bf-43ff-bfb1-3adfb0571a42/__spark_libs__6486179704064718817.zip -> hdfs://localhost:8020/user/sshao/.sparkStaging/application_1468468348998_0009/__spark_libs__6486179704064718817.zip 16/07/14 15:06:51 INFO Client: Uploading resource file:/Users/sshao/projects/apache-spark/examples/target/scala-2.11/jars/scopt_2.11-3.3.0.jar -> hdfs://localhost:8020/user/sshao/.sparkStaging/application_1468468348998_0009/scopt_2.11-3.3.0.jar 16/07/14 15:06:51 INFO Client: Uploading resource file:/private/var/folders/tb/8pw1511s2q78mj7plnq8p9g4gn/T/spark-a446300b-84bf-43ff-bfb1-3adfb0571a42/__spark_conf__326416236462420861.zip -> hdfs://localhost:8020/user/sshao/.sparkStaging/application_1468468348998_0009/__spark_conf__.zip ... ``` So here try to avoid adding jars to Spark's fileserver unnecessarily. ## How was this patch tested? Manually verified both in yarn client and cluster mode, also in standalone mode. Author: jerryshaoCloses #14196 from jerryshao/SPARK-16540. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/91575cac Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/91575cac Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/91575cac Branch: refs/heads/master Commit: 91575cac32e470d7079a55fb86d66332aba599d0 Parents: 31ca741 Author: jerryshao Authored: Thu Jul 14 10:40:59 2016 -0700 Committer: Marcelo Vanzin Committed: Thu Jul 14 10:40:59 2016 -0700 -- core/src/main/scala/org/apache/spark/util/Utils.scala| 4 ++-- .../src/main/scala/org/apache/spark/repl/SparkILoop.scala| 2 +- repl/scala-2.11/src/main/scala/org/apache/spark/repl/Main.scala | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/91575cac/core/src/main/scala/org/apache/spark/util/Utils.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 2e4ec4c..6ab9e99 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -2409,9 +2409,9 @@ private[spark] object Utils extends Logging { * "spark.yarn.dist.jars" properties, while in other modes it returns the jar files pointed by * only the "spark.jars" property. */ - def getUserJars(conf: SparkConf): Seq[String] = { + def getUserJars(conf: SparkConf, isShell: Boolean = false): Seq[String] = { val sparkJars = conf.getOption("spark.jars") -if (conf.get("spark.master") == "yarn") { +if (conf.get("spark.master") == "yarn" && isShell) { val yarnJars = conf.getOption("spark.yarn.dist.jars")
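Note on the mechanics (the diff above is cut off mid-method): jars passed with `--jars`/`--packages` show up both in `spark.jars` and, on YARN, in `spark.yarn.dist.jars`, and the fix only unions the two lists when the caller is the shell. The following is a simplified, self-contained sketch of that decision — `SimpleConf` and the method body are illustrative stand-ins, not the real `Utils.getUserJars`:

```scala
// Illustrative stand-in for SparkConf; only what the sketch needs.
final case class SimpleConf(settings: Map[String, String]) {
  def get(key: String): Option[String] = settings.get(key)
}

object UserJarsSketch {
  private def split(csv: Option[String]): Seq[String] =
    csv.map(_.split(",").toSeq.map(_.trim).filter(_.nonEmpty)).getOrElse(Seq.empty)

  // Mirrors the shape of getUserJars(conf, isShell): only fold in the YARN
  // distributed-cache jars when running a shell against a YARN master.
  def getUserJars(conf: SimpleConf, isShell: Boolean = false): Seq[String] = {
    val sparkJars = split(conf.get("spark.jars"))
    if (conf.get("spark.master").contains("yarn") && isShell) {
      (sparkJars ++ split(conf.get("spark.yarn.dist.jars"))).distinct
    } else {
      sparkJars
    }
  }

  def main(args: Array[String]): Unit = {
    val conf = SimpleConf(Map(
      "spark.master" -> "yarn",
      "spark.yarn.dist.jars" -> "scopt_2.11-3.3.0.jar"))
    println(getUserJars(conf, isShell = true))   // List(scopt_2.11-3.3.0.jar)
    println(getUserJars(conf))                   // List() – non-shell case
  }
}
```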
spark git commit: [SPARK-16540][YARN][CORE] Avoid adding jars twice for Spark running on yarn
Repository: spark Updated Branches: refs/heads/branch-2.0 23e1ab9c7 -> 1fe0bcdd0 [SPARK-16540][YARN][CORE] Avoid adding jars twice for Spark running on yarn ## What changes were proposed in this pull request? Currently when running spark on yarn, jars specified with --jars, --packages will be added twice, one is Spark's own file server, another is yarn's distributed cache, this can be seen from log: for example: ``` ./bin/spark-shell --master yarn-client --jars examples/target/scala-2.11/jars/scopt_2.11-3.3.0.jar ``` If specified the jar to be added is scopt jar, it will added twice: ``` ... 16/07/14 15:06:48 INFO Server: Started 5603ms 16/07/14 15:06:48 INFO Utils: Successfully started service 'SparkUI' on port 4040. 16/07/14 15:06:48 INFO SparkUI: Bound SparkUI to 0.0.0.0, and started at http://192.168.0.102:4040 16/07/14 15:06:48 INFO SparkContext: Added JAR file:/Users/sshao/projects/apache-spark/examples/target/scala-2.11/jars/scopt_2.11-3.3.0.jar at spark://192.168.0.102:63996/jars/scopt_2.11-3.3.0.jar with timestamp 1468480008637 16/07/14 15:06:49 INFO RMProxy: Connecting to ResourceManager at /0.0.0.0:8032 16/07/14 15:06:49 INFO Client: Requesting a new application from cluster with 1 NodeManagers 16/07/14 15:06:49 INFO Client: Verifying our application has not requested more than the maximum memory capability of the cluster (8192 MB per container) 16/07/14 15:06:49 INFO Client: Will allocate AM container, with 896 MB memory including 384 MB overhead 16/07/14 15:06:49 INFO Client: Setting up container launch context for our AM 16/07/14 15:06:49 INFO Client: Setting up the launch environment for our AM container 16/07/14 15:06:49 INFO Client: Preparing resources for our AM container 16/07/14 15:06:49 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME. 16/07/14 15:06:50 INFO Client: Uploading resource file:/private/var/folders/tb/8pw1511s2q78mj7plnq8p9g4gn/T/spark-a446300b-84bf-43ff-bfb1-3adfb0571a42/__spark_libs__6486179704064718817.zip -> hdfs://localhost:8020/user/sshao/.sparkStaging/application_1468468348998_0009/__spark_libs__6486179704064718817.zip 16/07/14 15:06:51 INFO Client: Uploading resource file:/Users/sshao/projects/apache-spark/examples/target/scala-2.11/jars/scopt_2.11-3.3.0.jar -> hdfs://localhost:8020/user/sshao/.sparkStaging/application_1468468348998_0009/scopt_2.11-3.3.0.jar 16/07/14 15:06:51 INFO Client: Uploading resource file:/private/var/folders/tb/8pw1511s2q78mj7plnq8p9g4gn/T/spark-a446300b-84bf-43ff-bfb1-3adfb0571a42/__spark_conf__326416236462420861.zip -> hdfs://localhost:8020/user/sshao/.sparkStaging/application_1468468348998_0009/__spark_conf__.zip ... ``` So here try to avoid adding jars to Spark's fileserver unnecessarily. ## How was this patch tested? Manually verified both in yarn client and cluster mode, also in standalone mode. Author: jerryshaoCloses #14196 from jerryshao/SPARK-16540. 
(cherry picked from commit 91575cac32e470d7079a55fb86d66332aba599d0) Signed-off-by: Marcelo Vanzin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1fe0bcdd Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1fe0bcdd Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1fe0bcdd Branch: refs/heads/branch-2.0 Commit: 1fe0bcdd0bf39dd4993bf2ec35f66eec1b949f5b Parents: 23e1ab9 Author: jerryshao Authored: Thu Jul 14 10:40:59 2016 -0700 Committer: Marcelo Vanzin Committed: Thu Jul 14 10:41:17 2016 -0700 -- core/src/main/scala/org/apache/spark/util/Utils.scala| 4 ++-- .../src/main/scala/org/apache/spark/repl/SparkILoop.scala| 2 +- repl/scala-2.11/src/main/scala/org/apache/spark/repl/Main.scala | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1fe0bcdd/core/src/main/scala/org/apache/spark/util/Utils.scala -- diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index a79d195..be1ae40 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -2405,9 +2405,9 @@ private[spark] object Utils extends Logging { * "spark.yarn.dist.jars" properties, while in other modes it returns the jar files pointed by * only the "spark.jars" property. */ - def getUserJars(conf: SparkConf): Seq[String] = { + def getUserJars(conf: SparkConf, isShell: Boolean = false): Seq[String] = { val sparkJars = conf.getOption("spark.jars") -if (conf.get("spark.master") == "yarn") { +if
spark git commit: [SPARK-16528][SQL] Fix NPE problem in HiveClientImpl
Repository: spark Updated Branches: refs/heads/branch-2.0 741801921 -> 23e1ab9c7 [SPARK-16528][SQL] Fix NPE problem in HiveClientImpl ## What changes were proposed in this pull request? There are some calls to methods or fields (getParameters, properties) which are then passed to Java/Scala collection converters. Unfortunately those fields can be null in some cases and then the conversions throws NPE. We fix it by wrapping calls to those fields and methods with option and then do the conversion. ## How was this patch tested? Manually tested with a custom Hive metastore. Author: Jacek LewandowskiCloses #14200 from jacek-lewandowski/SPARK-16528. (cherry picked from commit 31ca741aef9dd138529e064785c8e58b86140ff5) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/23e1ab9c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/23e1ab9c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/23e1ab9c Branch: refs/heads/branch-2.0 Commit: 23e1ab9c7d56946647cb2081bd384d174bce1882 Parents: 7418019 Author: Jacek Lewandowski Authored: Thu Jul 14 10:18:31 2016 -0700 Committer: Reynold Xin Committed: Thu Jul 14 10:19:01 2016 -0700 -- .../apache/spark/sql/hive/client/HiveClientImpl.scala | 14 -- 1 file changed, 8 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/23e1ab9c/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index 1c89d8c..6cdf3ef 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -293,7 +293,7 @@ private[hive] class HiveClientImpl( database.name, database.description, database.locationUri, -database.properties.asJava), +Option(database.properties).map(_.asJava).orNull), ignoreIfExists) } @@ -311,7 +311,7 @@ private[hive] class HiveClientImpl( database.name, database.description, database.locationUri, -database.properties.asJava)) +Option(database.properties).map(_.asJava).orNull)) } override def getDatabaseOption(name: String): Option[CatalogDatabase] = withHiveState { @@ -320,7 +320,7 @@ private[hive] class HiveClientImpl( name = d.getName, description = d.getDescription, locationUri = d.getLocationUri, -properties = d.getParameters.asScala.toMap) +properties = Option(d.getParameters).map(_.asScala.toMap).orNull) } } @@ -353,7 +353,7 @@ private[hive] class HiveClientImpl( unsupportedFeatures += "bucketing" } - val properties = h.getParameters.asScala.toMap + val properties = Option(h.getParameters).map(_.asScala.toMap).orNull CatalogTable( identifier = TableIdentifier(h.getTableName, Option(h.getDbName)), @@ -390,7 +390,8 @@ private[hive] class HiveClientImpl( outputFormat = Option(h.getOutputFormatClass).map(_.getName), serde = Option(h.getSerializationLib), compressed = h.getTTable.getSd.isCompressed, - serdeProperties = h.getTTable.getSd.getSerdeInfo.getParameters.asScala.toMap + serdeProperties = Option(h.getTTable.getSd.getSerdeInfo.getParameters) +.map(_.asScala.toMap).orNull ), properties = properties, viewOriginalText = Option(h.getViewOriginalText), @@ -815,6 +816,7 @@ private[hive] class HiveClientImpl( outputFormat = Option(apiPartition.getSd.getOutputFormat), serde = Option(apiPartition.getSd.getSerdeInfo.getSerializationLib), 
compressed = apiPartition.getSd.isCompressed, -serdeProperties = apiPartition.getSd.getSerdeInfo.getParameters.asScala.toMap)) +serdeProperties = Option(apiPartition.getSd.getSerdeInfo.getParameters) + .map(_.asScala.toMap).orNull)) } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
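The fix applies one pattern throughout HiveClientImpl: lift the possibly-null Java map into `Option` before converting, instead of calling `.asScala` on it directly. A small self-contained illustration of why — the null-returning getter here simulates the Hive metastore objects, it is not the real API:

```scala
import scala.collection.JavaConverters._

object NullSafeConversionSketch {
  // Simulates a metastore object whose getParameters() may return null.
  def getParameters(hasParams: Boolean): java.util.Map[String, String] =
    if (hasParams) java.util.Collections.singletonMap("owner", "alice") else null

  def main(args: Array[String]): Unit = {
    // Direct conversion throws a NullPointerException when the map is null:
    // val broken = getParameters(false).asScala.toMap

    // Pattern used in the patch: wrap in Option, convert inside map(),
    // and fall back to null (mirroring the .orNull in the diff above).
    val absent: Map[String, String] =
      Option(getParameters(false)).map(_.asScala.toMap).orNull
    println(absent)    // null, but no NPE

    val present = Option(getParameters(true)).map(_.asScala.toMap).orNull
    println(present)   // Map(owner -> alice)
  }
}
```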
spark git commit: [SPARK-16528][SQL] Fix NPE problem in HiveClientImpl
Repository: spark Updated Branches: refs/heads/master c576f9fb9 -> 31ca741ae [SPARK-16528][SQL] Fix NPE problem in HiveClientImpl ## What changes were proposed in this pull request? There are some calls to methods or fields (getParameters, properties) which are then passed to Java/Scala collection converters. Unfortunately those fields can be null in some cases and then the conversions throws NPE. We fix it by wrapping calls to those fields and methods with option and then do the conversion. ## How was this patch tested? Manually tested with a custom Hive metastore. Author: Jacek LewandowskiCloses #14200 from jacek-lewandowski/SPARK-16528. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/31ca741a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/31ca741a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/31ca741a Branch: refs/heads/master Commit: 31ca741aef9dd138529e064785c8e58b86140ff5 Parents: c576f9f Author: Jacek Lewandowski Authored: Thu Jul 14 10:18:31 2016 -0700 Committer: Reynold Xin Committed: Thu Jul 14 10:18:31 2016 -0700 -- .../apache/spark/sql/hive/client/HiveClientImpl.scala | 14 -- 1 file changed, 8 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/31ca741a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala index 7e0cef3..2f102a8 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala @@ -293,7 +293,7 @@ private[hive] class HiveClientImpl( database.name, database.description, database.locationUri, -database.properties.asJava), +Option(database.properties).map(_.asJava).orNull), ignoreIfExists) } @@ -311,7 +311,7 @@ private[hive] class HiveClientImpl( database.name, database.description, database.locationUri, -database.properties.asJava)) +Option(database.properties).map(_.asJava).orNull)) } override def getDatabaseOption(name: String): Option[CatalogDatabase] = withHiveState { @@ -320,7 +320,7 @@ private[hive] class HiveClientImpl( name = d.getName, description = d.getDescription, locationUri = d.getLocationUri, -properties = d.getParameters.asScala.toMap) +properties = Option(d.getParameters).map(_.asScala.toMap).orNull) } } @@ -353,7 +353,7 @@ private[hive] class HiveClientImpl( unsupportedFeatures += "bucketing" } - val properties = h.getParameters.asScala.toMap + val properties = Option(h.getParameters).map(_.asScala.toMap).orNull CatalogTable( identifier = TableIdentifier(h.getTableName, Option(h.getDbName)), @@ -390,7 +390,8 @@ private[hive] class HiveClientImpl( outputFormat = Option(h.getOutputFormatClass).map(_.getName), serde = Option(h.getSerializationLib), compressed = h.getTTable.getSd.isCompressed, - serdeProperties = h.getTTable.getSd.getSerdeInfo.getParameters.asScala.toMap + serdeProperties = Option(h.getTTable.getSd.getSerdeInfo.getParameters) +.map(_.asScala.toMap).orNull ), properties = properties, viewOriginalText = Option(h.getViewOriginalText), @@ -817,6 +818,7 @@ private[hive] class HiveClientImpl( outputFormat = Option(apiPartition.getSd.getOutputFormat), serde = Option(apiPartition.getSd.getSerdeInfo.getSerializationLib), compressed = apiPartition.getSd.isCompressed, -serdeProperties = 
apiPartition.getSd.getSerdeInfo.getParameters.asScala.toMap)) +serdeProperties = Option(apiPartition.getSd.getSerdeInfo.getParameters) + .map(_.asScala.toMap).orNull)) } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[1/2] spark git commit: Preparing Spark release v2.0.0-rc4
Repository: spark Updated Branches: refs/heads/branch-2.0 29281bc40 -> 0a651aa26 Preparing Spark release v2.0.0-rc4 Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e5f8c111 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e5f8c111 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e5f8c111 Branch: refs/heads/branch-2.0 Commit: e5f8c1117e0c48499f54d62b556bc693435afae0 Parents: 29281bc Author: Patrick WendellAuthored: Thu Jul 14 09:50:07 2016 -0700 Committer: Patrick Wendell Committed: Thu Jul 14 09:50:07 2016 -0700 -- assembly/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml| 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- core/pom.xml | 2 +- examples/pom.xml | 2 +- external/docker-integration-tests/pom.xml | 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml| 2 +- external/java8-tests/pom.xml | 2 +- external/kafka-0-10-assembly/pom.xml | 2 +- external/kafka-0-10/pom.xml | 2 +- external/kafka-0-8-assembly/pom.xml | 2 +- external/kafka-0-8/pom.xml| 2 +- external/kinesis-asl-assembly/pom.xml | 2 +- external/kinesis-asl/pom.xml | 2 +- external/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml| 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- repl/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- yarn/pom.xml | 2 +- 34 files changed, 34 insertions(+), 34 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e5f8c111/assembly/pom.xml -- diff --git a/assembly/pom.xml b/assembly/pom.xml index 507ddc7..5f546bb 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 -2.0.1-SNAPSHOT +2.0.0 ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/e5f8c111/common/network-common/pom.xml -- diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index bc3b0fe..2eaa810 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 -2.0.1-SNAPSHOT +2.0.0 ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/e5f8c111/common/network-shuffle/pom.xml -- diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index 2fb5835..f068d9d 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 -2.0.1-SNAPSHOT +2.0.0 ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/e5f8c111/common/network-yarn/pom.xml -- diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index 07d9f1c..fd22188 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 -2.0.1-SNAPSHOT +2.0.0 ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/e5f8c111/common/sketch/pom.xml -- diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index 5e02efd..a17aba5 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 -2.0.1-SNAPSHOT +2.0.0 ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/e5f8c111/common/tags/pom.xml -- diff --git a/common/tags/pom.xml b/common/tags/pom.xml index e7fc6a2..0bd8846 100644 ---
[2/2] spark git commit: Preparing development version 2.0.1-SNAPSHOT
Preparing development version 2.0.1-SNAPSHOT Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0a651aa2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0a651aa2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0a651aa2 Branch: refs/heads/branch-2.0 Commit: 0a651aa26b27ec679a5878154e75c92111689981 Parents: e5f8c11 Author: Patrick WendellAuthored: Thu Jul 14 09:50:16 2016 -0700 Committer: Patrick Wendell Committed: Thu Jul 14 09:50:16 2016 -0700 -- assembly/pom.xml | 2 +- common/network-common/pom.xml | 2 +- common/network-shuffle/pom.xml| 2 +- common/network-yarn/pom.xml | 2 +- common/sketch/pom.xml | 2 +- common/tags/pom.xml | 2 +- common/unsafe/pom.xml | 2 +- core/pom.xml | 2 +- examples/pom.xml | 2 +- external/docker-integration-tests/pom.xml | 2 +- external/flume-assembly/pom.xml | 2 +- external/flume-sink/pom.xml | 2 +- external/flume/pom.xml| 2 +- external/java8-tests/pom.xml | 2 +- external/kafka-0-10-assembly/pom.xml | 2 +- external/kafka-0-10/pom.xml | 2 +- external/kafka-0-8-assembly/pom.xml | 2 +- external/kafka-0-8/pom.xml| 2 +- external/kinesis-asl-assembly/pom.xml | 2 +- external/kinesis-asl/pom.xml | 2 +- external/spark-ganglia-lgpl/pom.xml | 2 +- graphx/pom.xml| 2 +- launcher/pom.xml | 2 +- mllib-local/pom.xml | 2 +- mllib/pom.xml | 2 +- pom.xml | 2 +- repl/pom.xml | 2 +- sql/catalyst/pom.xml | 2 +- sql/core/pom.xml | 2 +- sql/hive-thriftserver/pom.xml | 2 +- sql/hive/pom.xml | 2 +- streaming/pom.xml | 2 +- tools/pom.xml | 2 +- yarn/pom.xml | 2 +- 34 files changed, 34 insertions(+), 34 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0a651aa2/assembly/pom.xml -- diff --git a/assembly/pom.xml b/assembly/pom.xml index 5f546bb..507ddc7 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.11 -2.0.0 +2.0.1-SNAPSHOT ../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/0a651aa2/common/network-common/pom.xml -- diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index 2eaa810..bc3b0fe 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 -2.0.0 +2.0.1-SNAPSHOT ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/0a651aa2/common/network-shuffle/pom.xml -- diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml index f068d9d..2fb5835 100644 --- a/common/network-shuffle/pom.xml +++ b/common/network-shuffle/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 -2.0.0 +2.0.1-SNAPSHOT ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/0a651aa2/common/network-yarn/pom.xml -- diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml index fd22188..07d9f1c 100644 --- a/common/network-yarn/pom.xml +++ b/common/network-yarn/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 -2.0.0 +2.0.1-SNAPSHOT ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/0a651aa2/common/sketch/pom.xml -- diff --git a/common/sketch/pom.xml b/common/sketch/pom.xml index a17aba5..5e02efd 100644 --- a/common/sketch/pom.xml +++ b/common/sketch/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.11 -2.0.0 +2.0.1-SNAPSHOT ../../pom.xml http://git-wip-us.apache.org/repos/asf/spark/blob/0a651aa2/common/tags/pom.xml -- diff --git a/common/tags/pom.xml b/common/tags/pom.xml index 0bd8846..e7fc6a2 100644 --- a/common/tags/pom.xml +++ b/common/tags/pom.xml @@ -22,7 +22,7 @@ 
org.apache.spark
spark git commit: [SPARK-16529][SQL][TEST] `withTempDatabase` should set `default` database before dropping
Repository: spark Updated Branches: refs/heads/branch-2.0 0a651aa26 -> 741801921 [SPARK-16529][SQL][TEST] `withTempDatabase` should set `default` database before dropping ## What changes were proposed in this pull request? `SQLTestUtils.withTempDatabase` is a frequently used test harness to setup a temporary table and clean up finally. This issue improves like the following for usability. ```scala -try f(dbName) finally spark.sql(s"DROP DATABASE $dbName CASCADE") +try f(dbName) finally { + if (spark.catalog.currentDatabase == dbName) { +spark.sql(s"USE ${DEFAULT_DATABASE}") + } + spark.sql(s"DROP DATABASE $dbName CASCADE") +} ``` In case of forgetting to reset the databaes, `withTempDatabase` will not raise Exception. ## How was this patch tested? This improves test harness. Author: Dongjoon HyunCloses #14184 from dongjoon-hyun/SPARK-16529. (cherry picked from commit c576f9fb90853cce2e8e5dcc32a536a0f49cbbd8) Signed-off-by: Cheng Lian Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/74180192 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/74180192 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/74180192 Branch: refs/heads/branch-2.0 Commit: 7418019218e5a2bd4ae948bb1984816f161925cf Parents: 0a651aa Author: Dongjoon Hyun Authored: Fri Jul 15 00:51:11 2016 +0800 Committer: Cheng Lian Committed: Fri Jul 15 00:51:56 2016 +0800 -- .../test/scala/org/apache/spark/sql/test/SQLTestUtils.scala | 8 +++- 1 file changed, 7 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/74180192/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala index 853dd0f..26bd3fb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala @@ -29,6 +29,7 @@ import org.scalatest.BeforeAndAfterAll import org.apache.spark.SparkFunSuite import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.analysis.NoSuchTableException +import org.apache.spark.sql.catalyst.catalog.SessionCatalog.DEFAULT_DATABASE import org.apache.spark.sql.catalyst.FunctionIdentifier import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.util._ @@ -196,7 +197,12 @@ private[sql] trait SQLTestUtils fail("Failed to create temporary database", cause) } -try f(dbName) finally spark.sql(s"DROP DATABASE $dbName CASCADE") +try f(dbName) finally { + if (spark.catalog.currentDatabase == dbName) { +spark.sql(s"USE ${DEFAULT_DATABASE}") + } + spark.sql(s"DROP DATABASE $dbName CASCADE") +} } /** - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16529][SQL][TEST] `withTempDatabase` should set `default` database before dropping
Repository: spark Updated Branches: refs/heads/master 12005c88f -> c576f9fb9 [SPARK-16529][SQL][TEST] `withTempDatabase` should set `default` database before dropping ## What changes were proposed in this pull request? `SQLTestUtils.withTempDatabase` is a frequently used test harness to setup a temporary table and clean up finally. This issue improves like the following for usability. ```scala -try f(dbName) finally spark.sql(s"DROP DATABASE $dbName CASCADE") +try f(dbName) finally { + if (spark.catalog.currentDatabase == dbName) { +spark.sql(s"USE ${DEFAULT_DATABASE}") + } + spark.sql(s"DROP DATABASE $dbName CASCADE") +} ``` In case of forgetting to reset the databaes, `withTempDatabase` will not raise Exception. ## How was this patch tested? This improves test harness. Author: Dongjoon HyunCloses #14184 from dongjoon-hyun/SPARK-16529. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c576f9fb Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c576f9fb Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c576f9fb Branch: refs/heads/master Commit: c576f9fb90853cce2e8e5dcc32a536a0f49cbbd8 Parents: 12005c8 Author: Dongjoon Hyun Authored: Fri Jul 15 00:51:11 2016 +0800 Committer: Cheng Lian Committed: Fri Jul 15 00:51:11 2016 +0800 -- .../test/scala/org/apache/spark/sql/test/SQLTestUtils.scala | 8 +++- 1 file changed, 7 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c576f9fb/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala index 853dd0f..26bd3fb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala @@ -29,6 +29,7 @@ import org.scalatest.BeforeAndAfterAll import org.apache.spark.SparkFunSuite import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.analysis.NoSuchTableException +import org.apache.spark.sql.catalyst.catalog.SessionCatalog.DEFAULT_DATABASE import org.apache.spark.sql.catalyst.FunctionIdentifier import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.util._ @@ -196,7 +197,12 @@ private[sql] trait SQLTestUtils fail("Failed to create temporary database", cause) } -try f(dbName) finally spark.sql(s"DROP DATABASE $dbName CASCADE") +try f(dbName) finally { + if (spark.catalog.currentDatabase == dbName) { +spark.sql(s"USE ${DEFAULT_DATABASE}") + } + spark.sql(s"DROP DATABASE $dbName CASCADE") +} } /** - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
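Outside Spark's own test harness, the same try/finally shape is easy to reproduce. A minimal sketch, assuming a `SparkSession` named `spark` is passed in and hard-coding the `default` database name rather than importing Spark's internal `DEFAULT_DATABASE` constant:

```scala
import java.util.UUID
import org.apache.spark.sql.SparkSession

object TempDatabaseSketch {
  def withTempDatabase(spark: SparkSession)(f: String => Unit): Unit = {
    val dbName = s"temp_db_${UUID.randomUUID().toString.replace("-", "")}"
    spark.sql(s"CREATE DATABASE $dbName")
    try {
      f(dbName)
    } finally {
      // If the body switched into the temporary database and forgot to switch
      // back, restore `default` first so the shared session is not left
      // pointing at a database that is about to be dropped.
      if (spark.catalog.currentDatabase == dbName) {
        spark.sql("USE default")
      }
      spark.sql(s"DROP DATABASE $dbName CASCADE")
    }
  }
}
```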
[spark] Git Push Summary
Repository: spark Updated Tags: refs/tags/v2.0.0-rc4 [created] e5f8c1117
spark git commit: [SPARK-16538][SPARKR] fix R call with namespace operator on SparkSession functions
Repository: spark Updated Branches: refs/heads/master 093ebbc62 -> 12005c88f [SPARK-16538][SPARKR] fix R call with namespace operator on SparkSession functions ## What changes were proposed in this pull request? Fix function routing to work with and without namespace operator `SparkR::createDataFrame` ## How was this patch tested? manual, unit tests shivaram Author: Felix CheungCloses #14195 from felixcheung/rroutedefault. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/12005c88 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/12005c88 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/12005c88 Branch: refs/heads/master Commit: 12005c88fb24168d57b577cff73eddcd9d8963fc Parents: 093ebbc Author: Felix Cheung Authored: Thu Jul 14 09:45:30 2016 -0700 Committer: Shivaram Venkataraman Committed: Thu Jul 14 09:45:30 2016 -0700 -- R/pkg/R/SQLContext.R | 4 +++- R/pkg/inst/tests/testthat/test_sparkSQL.R | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/12005c88/R/pkg/R/SQLContext.R -- diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R index bc0daa2..d2ea155 100644 --- a/R/pkg/R/SQLContext.R +++ b/R/pkg/R/SQLContext.R @@ -48,7 +48,9 @@ getInternalType <- function(x) { #' @return whatever the target returns #' @noRd dispatchFunc <- function(newFuncSig, x, ...) { - funcName <- as.character(sys.call(sys.parent())[[1]]) + # When called with SparkR::createDataFrame, sys.call()[[1]] returns c(::, SparkR, createDataFrame) + callsite <- as.character(sys.call(sys.parent())[[1]]) + funcName <- callsite[[length(callsite)]] f <- get(paste0(funcName, ".default")) # Strip sqlContext from list of parameters and then pass the rest along. contextNames <- c("org.apache.spark.sql.SQLContext", http://git-wip-us.apache.org/repos/asf/spark/blob/12005c88/R/pkg/inst/tests/testthat/test_sparkSQL.R -- diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 8786823..a1b1f1c 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -2405,7 +2405,8 @@ test_that("createDataFrame sqlContext parameter backward compatibility", { a <- 1:3 b <- c("a", "b", "c") ldf <- data.frame(a, b) - df <- suppressWarnings(createDataFrame(sqlContext, ldf)) + # Call function with namespace :: operator - SPARK-16538 + df <- suppressWarnings(SparkR::createDataFrame(sqlContext, ldf)) expect_equal(columns(df), c("a", "b")) expect_equal(dtypes(df), list(c("a", "int"), c("b", "string"))) expect_equal(count(df), 3) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16538][SPARKR] fix R call with namespace operator on SparkSession functions
Repository: spark Updated Branches: refs/heads/branch-2.0 4e9080f44 -> 29281bc40 [SPARK-16538][SPARKR] fix R call with namespace operator on SparkSession functions ## What changes were proposed in this pull request? Fix function routing to work with and without namespace operator `SparkR::createDataFrame` ## How was this patch tested? manual, unit tests shivaram Author: Felix CheungCloses #14195 from felixcheung/rroutedefault. (cherry picked from commit 12005c88fb24168d57b577cff73eddcd9d8963fc) Signed-off-by: Shivaram Venkataraman Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/29281bc4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/29281bc4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/29281bc4 Branch: refs/heads/branch-2.0 Commit: 29281bc40cb83ce2946b0395981c8dce5630910c Parents: 4e9080f Author: Felix Cheung Authored: Thu Jul 14 09:45:30 2016 -0700 Committer: Shivaram Venkataraman Committed: Thu Jul 14 09:45:39 2016 -0700 -- R/pkg/R/SQLContext.R | 4 +++- R/pkg/inst/tests/testthat/test_sparkSQL.R | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/29281bc4/R/pkg/R/SQLContext.R -- diff --git a/R/pkg/R/SQLContext.R b/R/pkg/R/SQLContext.R index bc0daa2..d2ea155 100644 --- a/R/pkg/R/SQLContext.R +++ b/R/pkg/R/SQLContext.R @@ -48,7 +48,9 @@ getInternalType <- function(x) { #' @return whatever the target returns #' @noRd dispatchFunc <- function(newFuncSig, x, ...) { - funcName <- as.character(sys.call(sys.parent())[[1]]) + # When called with SparkR::createDataFrame, sys.call()[[1]] returns c(::, SparkR, createDataFrame) + callsite <- as.character(sys.call(sys.parent())[[1]]) + funcName <- callsite[[length(callsite)]] f <- get(paste0(funcName, ".default")) # Strip sqlContext from list of parameters and then pass the rest along. contextNames <- c("org.apache.spark.sql.SQLContext", http://git-wip-us.apache.org/repos/asf/spark/blob/29281bc4/R/pkg/inst/tests/testthat/test_sparkSQL.R -- diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 1bfdc34..20c750a 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -2397,7 +2397,8 @@ test_that("createDataFrame sqlContext parameter backward compatibility", { a <- 1:3 b <- c("a", "b", "c") ldf <- data.frame(a, b) - df <- suppressWarnings(createDataFrame(sqlContext, ldf)) + # Call function with namespace :: operator - SPARK-16538 + df <- suppressWarnings(SparkR::createDataFrame(sqlContext, ldf)) expect_equal(columns(df), c("a", "b")) expect_equal(dtypes(df), list(c("a", "int"), c("b", "string"))) expect_equal(count(df), 3) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16509][SPARKR] Rename window.partitionBy and window.orderBy to windowPartitionBy and windowOrderBy.
Repository: spark Updated Branches: refs/heads/master 56183b84f -> 093ebbc62 [SPARK-16509][SPARKR] Rename window.partitionBy and window.orderBy to windowPartitionBy and windowOrderBy. ## What changes were proposed in this pull request? Rename window.partitionBy and window.orderBy to windowPartitionBy and windowOrderBy to pass CRAN package check. ## How was this patch tested? SparkR unit tests. Author: Sun RuiCloses #14192 from sun-rui/SPARK-16509. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/093ebbc6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/093ebbc6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/093ebbc6 Branch: refs/heads/master Commit: 093ebbc628699b40f091b5b7083c119fffa9314b Parents: 56183b8 Author: Sun Rui Authored: Thu Jul 14 09:38:42 2016 -0700 Committer: Shivaram Venkataraman Committed: Thu Jul 14 09:38:42 2016 -0700 -- R/pkg/NAMESPACE | 4 +- R/pkg/R/WindowSpec.R | 4 +- R/pkg/R/generics.R| 8 ++-- R/pkg/R/window.R | 54 +++--- R/pkg/inst/tests/testthat/test_sparkSQL.R | 8 ++-- 5 files changed, 44 insertions(+), 34 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/093ebbc6/R/pkg/NAMESPACE -- diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index bc3aceb..fe52905 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -341,5 +341,5 @@ export("partitionBy", "rowsBetween", "rangeBetween") -export("window.partitionBy", - "window.orderBy") +export("windowPartitionBy", + "windowOrderBy") http://git-wip-us.apache.org/repos/asf/spark/blob/093ebbc6/R/pkg/R/WindowSpec.R -- diff --git a/R/pkg/R/WindowSpec.R b/R/pkg/R/WindowSpec.R index 9f3b1e4..e20d05d 100644 --- a/R/pkg/R/WindowSpec.R +++ b/R/pkg/R/WindowSpec.R @@ -22,10 +22,10 @@ NULL #' S4 class that represents a WindowSpec #' -#' WindowSpec can be created by using window.partitionBy() or window.orderBy() +#' WindowSpec can be created by using windowPartitionBy() or windowOrderBy() #' #' @rdname WindowSpec -#' @seealso \link{window.partitionBy}, \link{window.orderBy} +#' @seealso \link{windowPartitionBy}, \link{windowOrderBy} #' #' @param sws A Java object reference to the backing Scala WindowSpec #' @export http://git-wip-us.apache.org/repos/asf/spark/blob/093ebbc6/R/pkg/R/generics.R -- diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index df057bd..8416e5c 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -779,13 +779,13 @@ setGeneric("rowsBetween", function(x, start, end) { standardGeneric("rowsBetween #' @export setGeneric("rangeBetween", function(x, start, end) { standardGeneric("rangeBetween") }) -#' @rdname window.partitionBy +#' @rdname windowPartitionBy #' @export -setGeneric("window.partitionBy", function(col, ...) { standardGeneric("window.partitionBy") }) +setGeneric("windowPartitionBy", function(col, ...) { standardGeneric("windowPartitionBy") }) -#' @rdname window.orderBy +#' @rdname windowOrderBy #' @export -setGeneric("window.orderBy", function(col, ...) { standardGeneric("window.orderBy") }) +setGeneric("windowOrderBy", function(col, ...) { standardGeneric("windowOrderBy") }) ## Expression Function Methods ## http://git-wip-us.apache.org/repos/asf/spark/blob/093ebbc6/R/pkg/R/window.R -- diff --git a/R/pkg/R/window.R b/R/pkg/R/window.R index e4bc933..d9d069c 100644 --- a/R/pkg/R/window.R +++ b/R/pkg/R/window.R @@ -17,23 +17,28 @@ # window.R - Utility functions for defining window in DataFrames -#' window.partitionBy +#' windowPartitionBy #' #' Creates a WindowSpec with the partitioning defined. 
#' -#' @rdname window.partitionBy -#' @name window.partitionBy +#' @param col A column name or Column by which rows are partitioned to +#'windows. +#' @param ... Optional column names or Columns in addition to col, by +#'which rows are partitioned to windows. +#' +#' @rdname windowPartitionBy +#' @name windowPartitionBy #' @export #' @examples #' \dontrun{ -#' ws <- window.partitionBy("key1", "key2") +#' ws <- windowPartitionBy("key1", "key2") #' df1 <- select(df, over(lead("value", 1), ws)) #' -#' ws <- window.partitionBy(df$key1, df$key2) +#' ws <- windowPartitionBy(df$key1, df$key2) #' df1 <- select(df, over(lead("value", 1), ws)) #' } -#' @note
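The SparkR helpers renamed in SPARK-16509 mirror the existing Scala/Java `Window` builder. As a point of comparison only, a minimal Scala sketch of the equivalent call (the DataFrame `df` and the column names are placeholders):

```scala
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.lead

// Scala counterpart of windowPartitionBy(...); an ordering is added because
// lead() requires an ordered window to be resolvable.
val ws = Window.partitionBy("key1", "key2").orderBy("value")
val df1 = df.select(lead("value", 1).over(ws))
```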
spark git commit: [SPARK-16543][SQL] Rename the columns of `SHOW PARTITION/COLUMNS` commands
Repository: spark Updated Branches: refs/heads/master 1b5c9e52a -> 56183b84f [SPARK-16543][SQL] Rename the columns of `SHOW PARTITION/COLUMNS` commands ## What changes were proposed in this pull request? This PR changes the name of the columns returned by the `SHOW PARTITIONS` and `SHOW COLUMNS` commands. Currently, both commands use `result` as the column name. **Comparison: Column Name**

Command|Spark(Before)|Spark(After)|Hive
--|--|--|--
SHOW PARTITIONS|result|partition|partition
SHOW COLUMNS|result|col_name|field

Note that Spark/Hive uses `col_name` in `DESC TABLES`. So, this PR chooses `col_name` for consistency among Spark commands. **Before**
```scala
scala> sql("show partitions p").show()
+------+
|result|
+------+
|   b=2|
+------+

scala> sql("show columns in p").show()
+------+
|result|
+------+
|     a|
|     b|
+------+
```
**After**
```scala
scala> sql("show partitions p").show
+---------+
|partition|
+---------+
|      b=2|
+---------+

scala> sql("show columns in p").show
+--------+
|col_name|
+--------+
|       a|
|       b|
+--------+
```
## How was this patch tested? Manual. Author: Dongjoon Hyun. Closes #14199 from dongjoon-hyun/SPARK-16543. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/56183b84 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/56183b84 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/56183b84 Branch: refs/heads/master Commit: 56183b84fb64ea13977d89ec55a9dd3997b4dacf Parents: 1b5c9e5 Author: Dongjoon Hyun Authored: Thu Jul 14 17:18:34 2016 +0200 Committer: Herman van Hovell Committed: Thu Jul 14 17:18:34 2016 +0200 -- .../scala/org/apache/spark/sql/execution/command/tables.scala | 6 ++ 1 file changed, 2 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/56183b84/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala index 6651c33..6e52a46 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala @@ -622,9 +622,8 @@ case class ShowTablePropertiesCommand(table: TableIdentifier, propertyKey: Optio * }}} */ case class ShowColumnsCommand(table: TableIdentifier) extends RunnableCommand { - // The result of SHOW COLUMNS has one column called 'result' override val output: Seq[Attribute] = { -AttributeReference("result", StringType, nullable = false)() :: Nil +AttributeReference("col_name", StringType, nullable = false)() :: Nil } override def run(sparkSession: SparkSession): Seq[Row] = { @@ -652,9 +651,8 @@ case class ShowColumnsCommand(table: TableIdentifier) extends RunnableCommand { case class ShowPartitionsCommand( table: TableIdentifier, spec: Option[TablePartitionSpec]) extends RunnableCommand { - // The result of SHOW PARTITIONS has one column called 'result' override val output: Seq[Attribute] = { -AttributeReference("result", StringType, nullable = false)() :: Nil +AttributeReference("partition", StringType, nullable = false)() :: Nil } private def getPartName(spec: TablePartitionSpec, partColNames: Seq[String]): String = { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
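With the renamed output columns, downstream code can refer to them by a stable name. A small sketch, assuming an active `SparkSession` named `spark` and a partitioned table `p` that exists only for illustration:

```scala
// Select the renamed output columns of the SHOW commands by name.
import spark.implicits._

val partitionRows = spark.sql("SHOW PARTITIONS p").select("partition").as[String].collect()
val columnNames   = spark.sql("SHOW COLUMNS IN p").select("col_name").as[String].collect()
```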
spark git commit: [SPARK-16530][SQL][TRIVIAL] Wrong Parser Keyword in ALTER TABLE CHANGE COLUMN
Repository: spark Updated Branches: refs/heads/master b7b5e1787 -> 1b5c9e52a [SPARK-16530][SQL][TRIVIAL] Wrong Parser Keyword in ALTER TABLE CHANGE COLUMN ## What changes were proposed in this pull request? Based on the [Hive SQL syntax](https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL#LanguageManualDDL-ChangeColumnName/Type/Position/Comment), the command to change column name/type/position/comment is `ALTER TABLE CHANGE COLUMN`. However, in our .g4 file, it is `ALTER TABLE CHANGE COLUMNS`. Because it is the last optional keyword, the mistake has no actual effect. Thus, I filed the issue at the Trivial level. cc hvanhovell ## How was this patch tested? Existing test cases. Author: gatorsmile. Closes #14186 from gatorsmile/changeColumns. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1b5c9e52 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1b5c9e52 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1b5c9e52 Branch: refs/heads/master Commit: 1b5c9e52a7d5cdd3b4da1334ddff0518a8e14505 Parents: b7b5e17 Author: gatorsmile Authored: Thu Jul 14 17:15:51 2016 +0200 Committer: Herman van Hovell Committed: Thu Jul 14 17:15:51 2016 +0200 -- .../main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1b5c9e52/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 -- diff --git a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 index 7ccbb2d..5e10462 100644 --- a/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 +++ b/sql/catalyst/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBase.g4 @@ -170,7 +170,7 @@ unsupportedHiveNativeCommands | kw1=ALTER kw2=TABLE tableIdentifier partitionSpec? kw3=CONCATENATE | kw1=ALTER kw2=TABLE tableIdentifier partitionSpec? kw3=SET kw4=FILEFORMAT | kw1=ALTER kw2=TABLE tableIdentifier partitionSpec? kw3=ADD kw4=COLUMNS -| kw1=ALTER kw2=TABLE tableIdentifier partitionSpec? kw3=CHANGE kw4=COLUMNS? +| kw1=ALTER kw2=TABLE tableIdentifier partitionSpec? kw3=CHANGE kw4=COLUMN? | kw1=ALTER kw2=TABLE tableIdentifier partitionSpec? kw3=REPLACE kw4=COLUMNS | kw1=START kw2=TRANSACTION | kw1=COMMIT - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
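Note that this grammar rule lives under `unsupportedHiveNativeCommands`, so the fix only affects how the statement is recognized before being rejected. A rough sketch of the expected behaviour in a ScalaTest suite; the table and column names are made up and the exact error text is an assumption:

```scala
import org.apache.spark.sql.catalyst.parser.ParseException

// The statement now matches the unsupported-command rule with the correct
// CHANGE COLUMN keyword and is rejected with a dedicated parser error.
val e = intercept[ParseException] {
  spark.sql("ALTER TABLE t CHANGE COLUMN a a INT")
}
assert(e.getMessage.toLowerCase.contains("operation not allowed"))
```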
spark git commit: [SPARK-16505][YARN] Optionally propagate error during shuffle service startup.
Repository: spark Updated Branches: refs/heads/master c4bc2ed84 -> b7b5e1787 [SPARK-16505][YARN] Optionally propagate error during shuffle service startup. This prevents the NM from starting when something is wrong, which would lead to later errors which are confusing and harder to debug. Added a unit test to verify startup fails if something is wrong. Author: Marcelo VanzinCloses #14162 from vanzin/SPARK-16505. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b7b5e178 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b7b5e178 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b7b5e178 Branch: refs/heads/master Commit: b7b5e17876f65c6644505c356f1a0db24ce1d142 Parents: c4bc2ed Author: Marcelo Vanzin Authored: Thu Jul 14 09:42:32 2016 -0500 Committer: Tom Graves Committed: Thu Jul 14 09:42:32 2016 -0500 -- .../spark/network/yarn/YarnShuffleService.java | 75 +++- docs/job-scheduling.md | 13 +--- docs/running-on-yarn.md | 31 .../network/yarn/YarnShuffleServiceSuite.scala | 34 - 4 files changed, 106 insertions(+), 47 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b7b5e178/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java -- diff --git a/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java b/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java index 8a05628..df17dac 100644 --- a/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java +++ b/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java @@ -70,6 +70,11 @@ public class YarnShuffleService extends AuxiliaryService { private static final String RECOVERY_FILE_NAME = "registeredExecutors.ldb"; + // Whether failure during service initialization should stop the NM. + @VisibleForTesting + static final String STOP_ON_FAILURE_KEY = "spark.yarn.shuffle.stopOnFailure"; + private static final boolean DEFAULT_STOP_ON_FAILURE = false; + // An entity that manages the shuffle secret per application // This is used only if authentication is enabled private ShuffleSecretManager secretManager; @@ -119,44 +124,50 @@ public class YarnShuffleService extends AuxiliaryService { * Start the shuffle server with the given configuration. */ @Override - protected void serviceInit(Configuration conf) { + protected void serviceInit(Configuration conf) throws Exception { _conf = conf; -// In case this NM was killed while there were running spark applications, we need to restore -// lost state for the existing executors. We look for an existing file in the NM's local dirs. -// If we don't find one, then we choose a file to use to save the state next time. 
Even if -// an application was stopped while the NM was down, we expect yarn to call stopApplication() -// when it comes back -registeredExecutorFile = - new File(getRecoveryPath().toUri().getPath(), RECOVERY_FILE_NAME); - -TransportConf transportConf = new TransportConf("shuffle", new HadoopConfigProvider(conf)); -// If authentication is enabled, set up the shuffle server to use a -// special RPC handler that filters out unauthenticated fetch requests -boolean authEnabled = conf.getBoolean(SPARK_AUTHENTICATE_KEY, DEFAULT_SPARK_AUTHENTICATE); +boolean stopOnFailure = conf.getBoolean(STOP_ON_FAILURE_KEY, DEFAULT_STOP_ON_FAILURE); + try { + // In case this NM was killed while there were running spark applications, we need to restore + // lost state for the existing executors. We look for an existing file in the NM's local dirs. + // If we don't find one, then we choose a file to use to save the state next time. Even if + // an application was stopped while the NM was down, we expect yarn to call stopApplication() + // when it comes back + registeredExecutorFile = +new File(getRecoveryPath().toUri().getPath(), RECOVERY_FILE_NAME); + + TransportConf transportConf = new TransportConf("shuffle", new HadoopConfigProvider(conf)); blockHandler = new ExternalShuffleBlockHandler(transportConf, registeredExecutorFile); -} catch (Exception e) { - logger.error("Failed to initialize external shuffle service", e); -} -List bootstraps = Lists.newArrayList(); -if (authEnabled) { - secretManager = new ShuffleSecretManager(); - bootstraps.add(new SaslServerBootstrap(transportConf, secretManager)); -} + // If
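The flag added in SPARK-16505 is read from the NodeManager's Hadoop configuration. A minimal sketch of opting in, mirroring what the added test suite does; in a real deployment the property would be set in `yarn-site.xml` rather than built in code:

```scala
import org.apache.hadoop.yarn.conf.YarnConfiguration

// Ask the external shuffle service to fail NodeManager startup when it
// cannot initialize, instead of only logging the error.
val yarnConf = new YarnConfiguration()
yarnConf.setBoolean("spark.yarn.shuffle.stopOnFailure", true)
// A YarnShuffleService initialized with this conf now propagates
// initialization failures from serviceInit().
```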
spark git commit: [SPARK-14963][MINOR][YARN] Fix typo in YarnShuffleService recovery file name
Repository: spark Updated Branches: refs/heads/master e3f8a0336 -> c4bc2ed84 [SPARK-14963][MINOR][YARN] Fix typo in YarnShuffleService recovery file name ## What changes were proposed in this pull request? Due to the changes of [SPARK-14963](https://issues.apache.org/jira/browse/SPARK-14963), external shuffle recovery file name is changed mistakenly, so here change it back to the previous file name. This only affects the master branch, branch-2.0 is correct [here](https://github.com/apache/spark/blob/branch-2.0/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java#L195). ## How was this patch tested? N/A Author: jerryshaoCloses #14197 from jerryshao/fix-typo-file-name. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c4bc2ed8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c4bc2ed8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c4bc2ed8 Branch: refs/heads/master Commit: c4bc2ed844ea045d2e8218154690b5b2b023f1e5 Parents: e3f8a03 Author: jerryshao Authored: Thu Jul 14 08:31:04 2016 -0500 Committer: Tom Graves Committed: Thu Jul 14 08:31:04 2016 -0500 -- .../java/org/apache/spark/network/yarn/YarnShuffleService.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c4bc2ed8/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java -- diff --git a/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java b/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java index 9807383..8a05628 100644 --- a/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java +++ b/common/network-yarn/src/main/java/org/apache/spark/network/yarn/YarnShuffleService.java @@ -68,7 +68,7 @@ public class YarnShuffleService extends AuxiliaryService { private static final String SPARK_AUTHENTICATE_KEY = "spark.authenticate"; private static final boolean DEFAULT_SPARK_AUTHENTICATE = false; - private static final String RECOVERY_FILE_NAME = "registeredExecutor.ldb"; + private static final String RECOVERY_FILE_NAME = "registeredExecutors.ldb"; // An entity that manages the shuffle secret per application // This is used only if authentication is enabled - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16403][EXAMPLES] Cleanup to remove unused imports, consistent style, minor fixes
Repository: spark Updated Branches: refs/heads/master 252d4f27f -> e3f8a0336 [SPARK-16403][EXAMPLES] Cleanup to remove unused imports, consistent style, minor fixes ## What changes were proposed in this pull request? Cleanup of examples, mostly from PySpark-ML to fix minor issues: unused imports, style consistency, pipeline_example is a duplicate, use future print funciton, and a spelling error. * The "Pipeline Example" is duplicated by "Simple Text Classification Pipeline" in Scala, Python, and Java. * "Estimator Transformer Param Example" is duplicated by "Simple Params Example" in Scala, Python and Java * Synced random_forest_classifier_example.py with Scala by adding IndexToString label converted * Synced train_validation_split.py (in Scala ModelSelectionViaTrainValidationExample) by adjusting data split, adding grid for intercept. * RegexTokenizer was doing nothing in tokenizer_example.py and JavaTokenizerExample.java, synced with Scala version ## How was this patch tested? local tests and run modified examples Author: Bryan CutlerCloses #14081 from BryanCutler/examples-cleanup-SPARK-16403. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e3f8a033 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e3f8a033 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e3f8a033 Branch: refs/heads/master Commit: e3f8a033679261aaee15bda0f970a1890411e743 Parents: 252d4f2 Author: Bryan Cutler Authored: Thu Jul 14 09:12:46 2016 +0100 Committer: Sean Owen Committed: Thu Jul 14 09:12:46 2016 +0100 -- .../spark/examples/ml/JavaPipelineExample.java | 4 +- .../examples/ml/JavaSimpleParamsExample.java| 113 --- .../JavaSimpleTextClassificationPipeline.java | 93 --- .../ml/JavaStopWordsRemoverExample.java | 2 +- .../spark/examples/ml/JavaTokenizerExample.java | 19 +++- .../main/python/ml/aft_survival_regression.py | 2 +- .../main/python/ml/bisecting_k_means_example.py | 2 +- examples/src/main/python/ml/cross_validator.py | 3 +- .../src/main/python/ml/dataframe_example.py | 11 +- .../ml/decision_tree_classification_example.py | 2 +- .../ml/estimator_transformer_param_example.py | 20 ++-- .../main/python/ml/gaussian_mixture_example.py | 2 +- .../gradient_boosted_tree_classifier_example.py | 2 +- .../gradient_boosted_tree_regressor_example.py | 2 +- .../python/ml/isotonic_regression_example.py| 6 +- examples/src/main/python/ml/kmeans_example.py | 4 +- examples/src/main/python/ml/lda_example.py | 3 - .../ml/multilayer_perceptron_classification.py | 5 + examples/src/main/python/ml/n_gram_example.py | 2 + .../src/main/python/ml/naive_bayes_example.py | 4 +- .../src/main/python/ml/one_vs_rest_example.py | 5 +- .../main/python/ml/onehot_encoder_example.py| 1 + examples/src/main/python/ml/pca_example.py | 2 + examples/src/main/python/ml/pipeline_example.py | 10 +- .../python/ml/polynomial_expansion_example.py | 12 +- .../python/ml/quantile_discretizer_example.py | 8 +- .../ml/random_forest_classifier_example.py | 13 ++- .../ml/random_forest_regressor_example.py | 2 +- examples/src/main/python/ml/rformula_example.py | 2 + .../src/main/python/ml/simple_params_example.py | 95 .../ml/simple_text_classification_pipeline.py | 72 .../main/python/ml/stopwords_remover_example.py | 2 +- .../main/python/ml/string_indexer_example.py| 1 + examples/src/main/python/ml/tf_idf_example.py | 3 + .../src/main/python/ml/tokenizer_example.py | 13 ++- .../main/python/ml/train_validation_split.py| 8 +- .../main/python/ml/vector_assembler_example.py | 2 + 
.../main/python/ml/vector_indexer_example.py| 1 + examples/src/main/python/ml/word2vec_example.py | 2 + .../src/main/python/streaming/queue_stream.py | 1 - .../apache/spark/examples/BroadcastTest.scala | 1 - .../examples/ml/GaussianMixtureExample.scala| 6 +- .../examples/ml/IsotonicRegressionExample.scala | 2 - .../spark/examples/ml/KMeansExample.scala | 1 - ...electionViaTrainValidationSplitExample.scala | 1 + .../MultilayerPerceptronClassifierExample.scala | 6 + .../spark/examples/ml/NaiveBayesExample.scala | 1 + .../spark/examples/ml/PipelineExample.scala | 4 +- .../ml/QuantileDiscretizerExample.scala | 4 +- .../spark/examples/ml/RFormulaExample.scala | 2 + .../spark/examples/ml/SimpleParamsExample.scala | 104 - .../ml/SimpleTextClassificationPipeline.scala | 93 --- .../examples/ml/StopWordsRemoverExample.scala | 2 +- .../apache/spark/examples/ml/TfIdfExample.scala | 3 + .../spark/examples/ml/TokenizerExample.scala
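One of the fixes in SPARK-16403 is that `RegexTokenizer` "was doing nothing" in the tokenizer examples. A rough Scala sketch of the Tokenizer vs. RegexTokenizer comparison the synced examples now show; the input DataFrame `sentenceDF` with a string column "sentence" is a placeholder:

```scala
import org.apache.spark.ml.feature.{RegexTokenizer, Tokenizer}

// Whitespace tokenizer vs. regex tokenizer applied to the same input column.
val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
val regexTokenizer = new RegexTokenizer()
  .setInputCol("sentence")
  .setOutputCol("words")
  .setPattern("\\W")  // split on non-word characters

val tokenized = tokenizer.transform(sentenceDF)
val regexTokenized = regexTokenizer.transform(sentenceDF)
```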
spark git commit: [SPARK-16500][ML][MLLIB][OPTIMIZER] add LBFGS convergence warning for all used place in MLLib
Repository: spark Updated Branches: refs/heads/branch-2.0 b3ebecbb7 -> 240c42b28 [SPARK-16500][ML][MLLIB][OPTIMIZER] add LBFGS convergence warning for all used place in MLLib ## What changes were proposed in this pull request? Add a warning for the following cases when LBFGS training does not actually converge: 1) LogisticRegression 2) AFTSurvivalRegression 3) the LBFGS algorithm wrapper in the mllib package ## How was this patch tested? N/A Author: WeichenXu. Closes #14157 from WeichenXu123/add_lbfgs_convergence_warning_for_all_used_place. (cherry picked from commit 252d4f27f23b54892bcea25a2cea62d8cbab) Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/240c42b2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/240c42b2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/240c42b2 Branch: refs/heads/branch-2.0 Commit: 240c42b284b3f4bd302984fa51513c249f6d7648 Parents: b3ebecb Author: WeichenXu Authored: Thu Jul 14 09:11:04 2016 +0100 Committer: Sean Owen Committed: Thu Jul 14 09:11:14 2016 +0100 -- .../apache/spark/ml/classification/LogisticRegression.scala| 5 + .../org/apache/spark/ml/regression/AFTSurvivalRegression.scala | 5 + .../main/scala/org/apache/spark/mllib/optimization/LBFGS.scala | 6 ++ 3 files changed, 16 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/240c42b2/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index e157bde..4bab801 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -424,6 +424,11 @@ class LogisticRegression @Since("1.2.0") ( throw new SparkException(msg) } +if (!state.actuallyConverged) { + logWarning("LogisticRegression training fininshed but the result " + +s"is not converged because: ${state.convergedReason.get.reason}") +} + /* The coefficients are trained in the scaled space; we're converting them back to the original space. 
http://git-wip-us.apache.org/repos/asf/spark/blob/240c42b2/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala index 7c51845..366448f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala @@ -245,6 +245,11 @@ class AFTSurvivalRegression @Since("1.6.0") (@Since("1.6.0") override val uid: S throw new SparkException(msg) } + if (!state.actuallyConverged) { +logWarning("AFTSurvivalRegression training fininshed but the result " + + s"is not converged because: ${state.convergedReason.get.reason}") + } + state.x.toArray.clone() } http://git-wip-us.apache.org/repos/asf/spark/blob/240c42b2/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala b/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala index ec6ffe6..c61b2db 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/optimization/LBFGS.scala @@ -212,6 +212,12 @@ object LBFGS extends Logging { state = states.next() } lossHistory += state.value + +if (!state.actuallyConverged) { + logWarning("LBFGS training fininshed but the result " + +s"is not converged because: ${state.convergedReason.get.reason}") +} + val weights = Vectors.fromBreeze(state.x) val lossHistoryArray = lossHistory.result() - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
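The added check relies on the state object returned by Breeze's LBFGS iterations, as the diff shows. A standalone sketch of the same pattern on a toy quadratic objective; everything besides the Breeze API names is illustrative:

```scala
import breeze.linalg.{DenseVector => BDV}
import breeze.optimize.{DiffFunction, LBFGS => BreezeLBFGS}

// Minimize f(x) = x.x / 2, whose gradient is x, then inspect convergence.
val costFun = new DiffFunction[BDV[Double]] {
  override def calculate(x: BDV[Double]): (Double, BDV[Double]) = ((x dot x) / 2.0, x)
}
val lbfgs = new BreezeLBFGS[BDV[Double]](maxIter = 5, m = 10, tolerance = 1e-9)
val states = lbfgs.iterations(costFun, BDV.fill(3)(1.0))
var state = states.next()
while (states.hasNext) { state = states.next() }

if (!state.actuallyConverged) {
  // Same style of warning the patch adds in MLlib after optimization finishes.
  println(s"LBFGS finished but did not converge: ${state.convergedReason.get.reason}")
}
```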
spark git commit: [SPARK-16448] RemoveAliasOnlyProject should not remove alias with metadata
Repository: spark Updated Branches: refs/heads/master 39c836e97 -> db7317ac3 [SPARK-16448] RemoveAliasOnlyProject should not remove alias with metadata ## What changes were proposed in this pull request? `Alias` with metadata is not a no-op and we should not strip it in `RemoveAliasOnlyProject` rule. This PR also did some improvement for this rule: 1. extend the semantic of `alias-only`. Now we allow the project list to be partially aliased. 2. add unit test for this rule. ## How was this patch tested? new `RemoveAliasOnlyProjectSuite` Author: Wenchen FanCloses #14106 from cloud-fan/bug. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/db7317ac Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/db7317ac Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/db7317ac Branch: refs/heads/master Commit: db7317ac3c2fd2a11088d10060f168178dc99664 Parents: 39c836e Author: Wenchen Fan Authored: Thu Jul 14 15:48:22 2016 +0800 Committer: Cheng Lian Committed: Thu Jul 14 15:48:22 2016 +0800 -- .../sql/catalyst/optimizer/Optimizer.scala | 49 - .../optimizer/RemoveAliasOnlyProjectSuite.scala | 77 2 files changed, 108 insertions(+), 18 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/db7317ac/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 08fb019..c8e9d8e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -165,36 +165,49 @@ object PushProjectThroughSample extends Rule[LogicalPlan] { * but can also benefit other operators. */ object RemoveAliasOnlyProject extends Rule[LogicalPlan] { - // Check if projectList in the Project node has the same attribute names and ordering - // as its child node. + /** + * Returns true if the project list is semantically same as child output, after strip alias on + * attribute. + */ private def isAliasOnly( projectList: Seq[NamedExpression], childOutput: Seq[Attribute]): Boolean = { -if (!projectList.forall(_.isInstanceOf[Alias]) || projectList.length != childOutput.length) { +if (projectList.length != childOutput.length) { false } else { - projectList.map(_.asInstanceOf[Alias]).zip(childOutput).forall { case (a, o) => -a.child match { - case attr: Attribute if a.name == attr.name && attr.semanticEquals(o) => true - case _ => false -} + stripAliasOnAttribute(projectList).zip(childOutput).forall { +case (a: Attribute, o) if a semanticEquals o => true +case _ => false } } } + private def stripAliasOnAttribute(projectList: Seq[NamedExpression]) = { +projectList.map { + // Alias with metadata can not be stripped, or the metadata will be lost. + // If the alias name is different from attribute name, we can't strip it either, or we may + // accidentally change the output schema name of the root plan. 
+ case a @ Alias(attr: Attribute, name) if a.metadata == Metadata.empty && name == attr.name => +attr + case other => other +} + } + def apply(plan: LogicalPlan): LogicalPlan = { -val aliasOnlyProject = plan.find { - case Project(pList, child) if isAliasOnly(pList, child.output) => true - case _ => false +val aliasOnlyProject = plan.collectFirst { + case p @ Project(pList, child) if isAliasOnly(pList, child.output) => p } -aliasOnlyProject.map { case p: Project => - val aliases = p.projectList.map(_.asInstanceOf[Alias]) - val attrMap = AttributeMap(aliases.map(a => (a.toAttribute, a.child))) - plan.transformAllExpressions { -case a: Attribute if attrMap.contains(a) => attrMap(a) - }.transform { -case op: Project if op.eq(p) => op.child +aliasOnlyProject.map { case proj => + val attributesToReplace = proj.output.zip(proj.child.output).filterNot { +case (a1, a2) => a1 semanticEquals a2 + } + val attrMap = AttributeMap(attributesToReplace) + plan transform { +case plan: Project if plan eq proj => plan.child +case plan => plan transformExpressions { + case a: Attribute if attrMap.contains(a) => attrMap(a) +} } }.getOrElse(plan) }
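A DataFrame-level illustration of why an alias that merely carries metadata is not a no-op; this is not from the patch's own test suite, and `df` with an `id` column is a placeholder:

```scala
import org.apache.spark.sql.types.MetadataBuilder

// Same column name, but the alias attaches metadata; stripping the alias
// in RemoveAliasOnlyProject would silently lose the "comment" entry.
val meta = new MetadataBuilder().putString("comment", "user id").build()
val projected = df.select(df("id").as("id", meta))
assert(projected.schema("id").metadata.contains("comment"))
```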