svn commit: r24771 - in /dev/spark/2.4.0-SNAPSHOT-2018_02_06_20_01-c36fecc-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s
Author: pwendell
Date: Wed Feb 7 04:15:20 2018
New Revision: 24771

Log: Apache Spark 2.4.0-SNAPSHOT-2018_02_06_20_01-c36fecc docs

[This commit notification would consist of 1444 parts, which exceeds the limit of 50, so it was shortened to this summary.]
svn commit: r24765 - in /dev/spark/2.3.1-SNAPSHOT-2018_02_06_18_01-874d3f8-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s
Author: pwendell
Date: Wed Feb 7 02:15:30 2018
New Revision: 24765

Log: Apache Spark 2.3.1-SNAPSHOT-2018_02_06_18_01-874d3f8 docs

[This commit notification would consist of 1442 parts, which exceeds the limit of 50, so it was shortened to this summary.]
spark git commit: [SPARK-23327][SQL] Update the description and tests of three external API or functions
Repository: spark
Updated Branches:
  refs/heads/master b96a083b1 -> c36fecc3b

[SPARK-23327][SQL] Update the description and tests of three external API or functions

## What changes were proposed in this pull request?

Update the descriptions and tests of three external APIs or functions: `createFunction`, `length` and `repartitionByRange`.

## How was this patch tested?

N/A

Author: gatorsmile

Closes #20495 from gatorsmile/updateFunc.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c36fecc3
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c36fecc3
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c36fecc3
Branch: refs/heads/master
Commit: c36fecc3b416c38002779c3cf40b6a665ac4bf13
Parents: b96a083
Author: gatorsmile
Authored: Tue Feb 6 16:46:43 2018 -0800
Committer: gatorsmile
Committed: Tue Feb 6 16:46:43 2018 -0800

--
 R/pkg/R/functions.R                             |  4 +++-
 python/pyspark/sql/functions.py                 |  8 ---
 .../sql/catalyst/catalog/SessionCatalog.scala   |  7 --
 .../expressions/stringExpressions.scala         | 23 ++--
 .../scala/org/apache/spark/sql/Dataset.scala    |  2 ++
 .../spark/sql/execution/command/functions.scala | 14 +++-
 .../scala/org/apache/spark/sql/functions.scala  |  4 +++-
 .../sql/execution/command/DDLParserSuite.scala  | 10 -
 8 files changed, 44 insertions(+), 28 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/c36fecc3/R/pkg/R/functions.R
--
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index 55365a4..9f7c631 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -1026,7 +1026,9 @@ setMethod("last_day",
           })

 #' @details
-#' \code{length}: Computes the length of a given string or binary column.
+#' \code{length}: Computes the character length of a string data or number of bytes
+#' of a binary data. The length of string data includes the trailing spaces.
+#' The length of binary data includes binary zeros.
 #'
 #' @rdname column_string_functions
 #' @aliases length length,Column-method

http://git-wip-us.apache.org/repos/asf/spark/blob/c36fecc3/python/pyspark/sql/functions.py
--
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index 3c8fb4c..05031f5 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -1705,10 +1705,12 @@ def unhex(col):
 @ignore_unicode_prefix
 @since(1.5)
 def length(col):
-    """Calculates the length of a string or binary expression.
+    """Computes the character length of string data or number of bytes of binary data.
+    The length of character data includes the trailing spaces. The length of binary data
+    includes binary zeros.

-    >>> spark.createDataFrame([('ABC',)], ['a']).select(length('a').alias('length')).collect()
-    [Row(length=3)]
+    >>> spark.createDataFrame([('ABC ',)], ['a']).select(length('a').alias('length')).collect()
+    [Row(length=4)]
     """
     sc = SparkContext._active_spark_context
     return Column(sc._jvm.functions.length(_to_java_column(col)))

http://git-wip-us.apache.org/repos/asf/spark/blob/c36fecc3/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala
--
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala
index a129896..4b119c7 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala
@@ -988,8 +988,11 @@ class SessionCatalog(
   // ---

   /**
-   * Create a metastore function in the database specified in `funcDefinition`.
+   * Create a function in the database specified in `funcDefinition`.
    * If no such database is specified, create it in the current database.
+   *
+   * @param ignoreIfExists: When true, ignore if the function with the specified name exists
+   *                        in the specified database.
    */
   def createFunction(funcDefinition: CatalogFunction, ignoreIfExists: Boolean): Unit = {
     val db = formatDatabaseName(funcDefinition.identifier.database.getOrElse(getCurrentDatabase))
@@ -1061,7 +1064,7 @@ class SessionCatalog(
   }

   /**
-   * Check if the specified function exists.
+   * Check if the function with the specified name exists
    */
   def
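The documented semantics are easy to verify interactively; a minimal sketch, assuming an active SparkSession bound to `spark` as in the doctest above:

```python
# Minimal sketch of the documented length() semantics; assumes an active
# SparkSession named `spark`, as in the doctest in the patch above.
from pyspark.sql.functions import length

df = spark.createDataFrame([('ABC ',), ('AB',)], ['a'])
df.select('a', length('a').alias('len')).show()
# 'ABC ' -> 4 (the trailing space is counted); 'AB' -> 2
```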
spark git commit: [SPARK-23327][SQL] Update the description and tests of three external API or functions
Repository: spark
Updated Branches:
  refs/heads/branch-2.3 f9c913263 -> 874d3f89f

[SPARK-23327][SQL] Update the description and tests of three external API or functions

## What changes were proposed in this pull request?

Update the descriptions and tests of three external APIs or functions: `createFunction`, `length` and `repartitionByRange`.

## How was this patch tested?

N/A

Author: gatorsmile

Closes #20495 from gatorsmile/updateFunc.

(cherry picked from commit c36fecc3b416c38002779c3cf40b6a665ac4bf13)
Signed-off-by: gatorsmile

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/874d3f89
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/874d3f89
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/874d3f89
Branch: refs/heads/branch-2.3
Commit: 874d3f89fe0f903a6465520c3e6c4788a6865d9a
Parents: f9c9132
Author: gatorsmile
Authored: Tue Feb 6 16:46:43 2018 -0800
Committer: gatorsmile
Committed: Tue Feb 6 16:46:55 2018 -0800

--
 R/pkg/R/functions.R                             |  4 +++-
 python/pyspark/sql/functions.py                 |  8 ---
 .../sql/catalyst/catalog/SessionCatalog.scala   |  7 --
 .../expressions/stringExpressions.scala         | 23 ++--
 .../scala/org/apache/spark/sql/Dataset.scala    |  2 ++
 .../spark/sql/execution/command/functions.scala | 14 +++-
 .../scala/org/apache/spark/sql/functions.scala  |  4 +++-
 .../sql/execution/command/DDLParserSuite.scala  | 10 -
 8 files changed, 44 insertions(+), 28 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/874d3f89/R/pkg/R/functions.R
--
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
index 55365a4..9f7c631 100644
--- a/R/pkg/R/functions.R
+++ b/R/pkg/R/functions.R
@@ -1026,7 +1026,9 @@ setMethod("last_day",
           })

 #' @details
-#' \code{length}: Computes the length of a given string or binary column.
+#' \code{length}: Computes the character length of a string data or number of bytes
+#' of a binary data. The length of string data includes the trailing spaces.
+#' The length of binary data includes binary zeros.
 #'
 #' @rdname column_string_functions
 #' @aliases length length,Column-method

http://git-wip-us.apache.org/repos/asf/spark/blob/874d3f89/python/pyspark/sql/functions.py
--
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index b637707..52fab7c 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -1705,10 +1705,12 @@ def unhex(col):
 @ignore_unicode_prefix
 @since(1.5)
 def length(col):
-    """Calculates the length of a string or binary expression.
+    """Computes the character length of string data or number of bytes of binary data.
+    The length of character data includes the trailing spaces. The length of binary data
+    includes binary zeros.

-    >>> spark.createDataFrame([('ABC',)], ['a']).select(length('a').alias('length')).collect()
-    [Row(length=3)]
+    >>> spark.createDataFrame([('ABC ',)], ['a']).select(length('a').alias('length')).collect()
+    [Row(length=4)]
     """
     sc = SparkContext._active_spark_context
     return Column(sc._jvm.functions.length(_to_java_column(col)))

http://git-wip-us.apache.org/repos/asf/spark/blob/874d3f89/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala
--
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala
index a129896..4b119c7 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala
@@ -988,8 +988,11 @@ class SessionCatalog(
   // ---

   /**
-   * Create a metastore function in the database specified in `funcDefinition`.
+   * Create a function in the database specified in `funcDefinition`.
    * If no such database is specified, create it in the current database.
+   *
+   * @param ignoreIfExists: When true, ignore if the function with the specified name exists
+   *                        in the specified database.
    */
   def createFunction(funcDefinition: CatalogFunction, ignoreIfExists: Boolean): Unit = {
     val db = formatDatabaseName(funcDefinition.identifier.database.getOrElse(getCurrentDatabase))
@@ -1061,7 +1064,7 @@ class SessionCatalog(
   }
svn commit: r24764 - in /dev/spark/2.4.0-SNAPSHOT-2018_02_06_16_01-b96a083-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s
Author: pwendell
Date: Wed Feb 7 00:15:01 2018
New Revision: 24764

Log: Apache Spark 2.4.0-SNAPSHOT-2018_02_06_16_01-b96a083 docs

[This commit notification would consist of 1444 parts, which exceeds the limit of 50, so it was shortened to this summary.]
svn commit: r24762 - in /dev/spark/2.3.1-SNAPSHOT-2018_02_06_14_01-f9c9132-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s
Author: pwendell
Date: Tue Feb 6 22:15:24 2018
New Revision: 24762

Log: Apache Spark 2.3.1-SNAPSHOT-2018_02_06_14_01-f9c9132 docs

[This commit notification would consist of 1442 parts, which exceeds the limit of 50, so it was shortened to this summary.]
spark git commit: [SPARK-23315][SQL] failed to get output from canonicalized data source v2 related plans
Repository: spark
Updated Branches:
  refs/heads/branch-2.3 775e1 -> f9c913263

[SPARK-23315][SQL] failed to get output from canonicalized data source v2 related plans

## What changes were proposed in this pull request?

`DataSourceV2Relation` keeps a `fullOutput` and resolves the real output on demand by column name lookup, i.e.

```
lazy val output: Seq[Attribute] = reader.readSchema().map(_.name).map { name =>
  fullOutput.find(_.name == name).get
}
```

This will be broken after we canonicalize the plan, because all attribute names become "None", see https://github.com/apache/spark/blob/v2.3.0-rc1/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Canonicalize.scala#L42

To fix this, `DataSourceV2Relation` should just keep `output`, and update the `output` when doing column pruning.

## How was this patch tested?

a new test case

Author: Wenchen Fan

Closes #20485 from cloud-fan/canonicalize.

(cherry picked from commit b96a083b1c6ff0d2c588be9499b456e1adce97dc)
Signed-off-by: gatorsmile

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f9c91326
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f9c91326
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f9c91326
Branch: refs/heads/branch-2.3
Commit: f9c913263219f5e8a375542994142645dd0f6c6a
Parents: 775
Author: Wenchen Fan
Authored: Tue Feb 6 12:43:45 2018 -0800
Committer: gatorsmile
Committed: Tue Feb 6 12:43:53 2018 -0800

--
 .../datasources/v2/DataSourceReaderHolder.scala | 12 +++-
 .../datasources/v2/DataSourceV2Relation.scala   |  8 +++---
 .../datasources/v2/DataSourceV2ScanExec.scala   |  4 +--
 .../v2/PushDownOperatorsToDataSource.scala      | 29 ++--
 .../sql/sources/v2/DataSourceV2Suite.scala      | 20 +-
 5 files changed, 48 insertions(+), 25 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/f9c91326/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceReaderHolder.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceReaderHolder.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceReaderHolder.scala
index 6460c97..81219e9 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceReaderHolder.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceReaderHolder.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.datasources.v2

 import java.util.Objects

-import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
+import org.apache.spark.sql.catalyst.expressions.Attribute
 import org.apache.spark.sql.sources.v2.reader._

 /**
@@ -28,9 +28,9 @@ import org.apache.spark.sql.sources.v2.reader._
 trait DataSourceReaderHolder {

   /**
-   * The full output of the data source reader, without column pruning.
+   * The output of the data source reader, w.r.t. column pruning.
    */
-  def fullOutput: Seq[AttributeReference]
+  def output: Seq[Attribute]

   /**
    * The held data source reader.
@@ -46,7 +46,7 @@ trait DataSourceReaderHolder {
       case s: SupportsPushDownFilters => s.pushedFilters().toSet
       case _ => Nil
     }
-    Seq(fullOutput, reader.getClass, reader.readSchema(), filters)
+    Seq(output, reader.getClass, filters)
   }

   def canEqual(other: Any): Boolean
@@ -61,8 +61,4 @@ trait DataSourceReaderHolder {
   override def hashCode(): Int = {
     metadata.map(Objects.hashCode).foldLeft(0)((a, b) => 31 * a + b)
   }
-
-  lazy val output: Seq[Attribute] = reader.readSchema().map(_.name).map { name =>
-    fullOutput.find(_.name == name).get
-  }
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/f9c91326/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala
index eebfa29..38f6b15 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala
@@ -23,7 +23,7 @@ import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, Statistics}
 import org.apache.spark.sql.sources.v2.reader._

 case class DataSourceV2Relation(
-    fullOutput: Seq[AttributeReference],
+    output:
spark git commit: [SPARK-23315][SQL] failed to get output from canonicalized data source v2 related plans
Repository: spark
Updated Branches:
  refs/heads/master caf304456 -> b96a083b1

[SPARK-23315][SQL] failed to get output from canonicalized data source v2 related plans

## What changes were proposed in this pull request?

`DataSourceV2Relation` keeps a `fullOutput` and resolves the real output on demand by column name lookup, i.e.

```
lazy val output: Seq[Attribute] = reader.readSchema().map(_.name).map { name =>
  fullOutput.find(_.name == name).get
}
```

This will be broken after we canonicalize the plan, because all attribute names become "None", see https://github.com/apache/spark/blob/v2.3.0-rc1/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Canonicalize.scala#L42

To fix this, `DataSourceV2Relation` should just keep `output`, and update the `output` when doing column pruning.

## How was this patch tested?

a new test case

Author: Wenchen Fan

Closes #20485 from cloud-fan/canonicalize.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b96a083b
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b96a083b
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b96a083b
Branch: refs/heads/master
Commit: b96a083b1c6ff0d2c588be9499b456e1adce97dc
Parents: caf3044
Author: Wenchen Fan
Authored: Tue Feb 6 12:43:45 2018 -0800
Committer: gatorsmile
Committed: Tue Feb 6 12:43:45 2018 -0800

--
 .../datasources/v2/DataSourceReaderHolder.scala | 12 +++-
 .../datasources/v2/DataSourceV2Relation.scala   |  8 +++---
 .../datasources/v2/DataSourceV2ScanExec.scala   |  4 +--
 .../v2/PushDownOperatorsToDataSource.scala      | 29 ++--
 .../sql/sources/v2/DataSourceV2Suite.scala      | 20 +-
 5 files changed, 48 insertions(+), 25 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/b96a083b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceReaderHolder.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceReaderHolder.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceReaderHolder.scala
index 6460c97..81219e9 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceReaderHolder.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceReaderHolder.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.datasources.v2

 import java.util.Objects

-import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
+import org.apache.spark.sql.catalyst.expressions.Attribute
 import org.apache.spark.sql.sources.v2.reader._

 /**
@@ -28,9 +28,9 @@ import org.apache.spark.sql.sources.v2.reader._
 trait DataSourceReaderHolder {

   /**
-   * The full output of the data source reader, without column pruning.
+   * The output of the data source reader, w.r.t. column pruning.
    */
-  def fullOutput: Seq[AttributeReference]
+  def output: Seq[Attribute]

   /**
    * The held data source reader.
@@ -46,7 +46,7 @@ trait DataSourceReaderHolder {
       case s: SupportsPushDownFilters => s.pushedFilters().toSet
       case _ => Nil
     }
-    Seq(fullOutput, reader.getClass, reader.readSchema(), filters)
+    Seq(output, reader.getClass, filters)
   }

   def canEqual(other: Any): Boolean
@@ -61,8 +61,4 @@ trait DataSourceReaderHolder {
   override def hashCode(): Int = {
     metadata.map(Objects.hashCode).foldLeft(0)((a, b) => 31 * a + b)
   }
-
-  lazy val output: Seq[Attribute] = reader.readSchema().map(_.name).map { name =>
-    fullOutput.find(_.name == name).get
-  }
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/b96a083b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala
index eebfa29..38f6b15 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Relation.scala
@@ -23,7 +23,7 @@ import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, Statistics}
 import org.apache.spark.sql.sources.v2.reader._

 case class DataSourceV2Relation(
-    fullOutput: Seq[AttributeReference],
+    output: Seq[AttributeReference],
     reader: DataSourceReader)
   extends LeafNode with MultiInstanceRelation with
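To see why name-based resolution cannot survive canonicalization, here is a hypothetical Python analogue of the failing lookup; the attribute tuples and the `resolve` helper are illustrative inventions, not Spark APIs:

```python
# Hypothetical analogue (not Spark code) of fullOutput.find(_.name == name).get:
# once canonicalization replaces attribute names with None, lookup by name fails.
full_output = [("a", "int"), ("b", "string")]      # named attributes
canonicalized = [(None, "int"), (None, "string")]  # names erased to None

def resolve(attrs, wanted):
    # next(...) with no default raises StopIteration, mirroring the failing .get
    return [next(a for a in attrs if a[0] == name) for name in wanted]

resolve(full_output, ["a", "b"])      # resolves fine on the original plan
# resolve(canonicalized, ["a", "b"])  # StopIteration: no attribute matches
```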
spark git commit: [MINOR][TEST] Fix class name for Pandas UDF tests
Repository: spark
Updated Branches:
  refs/heads/branch-2.3 036a04b29 -> 775e1

[MINOR][TEST] Fix class name for Pandas UDF tests

## What changes were proposed in this pull request?

In https://github.com/apache/spark/commit/b2ce17b4c9fea58140a57ca1846b2689b15c0d61, I mistakenly renamed `VectorizedUDFTests` to `ScalarPandasUDF`. This PR fixes the mistake.

## How was this patch tested?

Existing tests.

Author: Li Jin

Closes #20489 from icexelloss/fix-scalar-udf-tests.

(cherry picked from commit caf30445632de6aec810309293499199e7a20892)
Signed-off-by: gatorsmile

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/775e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/775e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/775e
Branch: refs/heads/branch-2.3
Commit: 775e154b14f8a9cad829d7fd476e3b6405ce
Parents: 036a04b
Author: Li Jin
Authored: Tue Feb 6 12:30:04 2018 -0800
Committer: gatorsmile
Committed: Tue Feb 6 12:37:25 2018 -0800

--
 python/pyspark/sql/tests.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/775e/python/pyspark/sql/tests.py
--
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index 2577ed7..878d402 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -3740,7 +3740,7 @@ class PandasUDFTests(ReusedSQLTestCase):

 @unittest.skipIf(not _have_pandas or not _have_arrow, "Pandas or Arrow not installed")
-class VectorizedUDFTests(ReusedSQLTestCase):
+class ScalarPandasUDFTests(ReusedSQLTestCase):

     @classmethod
     def setUpClass(cls):
@@ -4253,7 +4253,7 @@ class VectorizedUDFTests(ReusedSQLTestCase):

 @unittest.skipIf(not _have_pandas or not _have_arrow, "Pandas or Arrow not installed")
-class GroupbyApplyTests(ReusedSQLTestCase):
+class GroupedMapPandasUDFTests(ReusedSQLTestCase):

     def assertFramesEqual(self, expected, result):
         msg = ("DataFrames are not equal: " +
spark git commit: [MINOR][TEST] Fix class name for Pandas UDF tests
Repository: spark
Updated Branches:
  refs/heads/master ac7454cac -> caf304456

[MINOR][TEST] Fix class name for Pandas UDF tests

## What changes were proposed in this pull request?

In https://github.com/apache/spark/commit/b2ce17b4c9fea58140a57ca1846b2689b15c0d61, I mistakenly renamed `VectorizedUDFTests` to `ScalarPandasUDF`. This PR fixes the mistake.

## How was this patch tested?

Existing tests.

Author: Li Jin

Closes #20489 from icexelloss/fix-scalar-udf-tests.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/caf30445
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/caf30445
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/caf30445
Branch: refs/heads/master
Commit: caf30445632de6aec810309293499199e7a20892
Parents: ac7454c
Author: Li Jin
Authored: Tue Feb 6 12:30:04 2018 -0800
Committer: gatorsmile
Committed: Tue Feb 6 12:30:04 2018 -0800

--
 python/pyspark/sql/tests.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/caf30445/python/pyspark/sql/tests.py
--
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index 89b7c21..53da7dd 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -3766,7 +3766,7 @@ class PandasUDFTests(ReusedSQLTestCase):

 @unittest.skipIf(not _have_pandas or not _have_arrow, "Pandas or Arrow not installed")
-class ScalarPandasUDF(ReusedSQLTestCase):
+class ScalarPandasUDFTests(ReusedSQLTestCase):

     @classmethod
     def setUpClass(cls):
@@ -4279,7 +4279,7 @@ class ScalarPandasUDF(ReusedSQLTestCase):

 @unittest.skipIf(not _have_pandas or not _have_arrow, "Pandas or Arrow not installed")
-class GroupbyApplyPandasUDFTests(ReusedSQLTestCase):
+class GroupedMapPandasUDFTests(ReusedSQLTestCase):

     @property
     def data(self):
@@ -4448,7 +4448,7 @@ class GroupbyApplyPandasUDFTests(ReusedSQLTestCase):

 @unittest.skipIf(not _have_pandas or not _have_arrow, "Pandas or Arrow not installed")
-class GroupbyAggPandasUDFTests(ReusedSQLTestCase):
+class GroupedAggPandasUDFTests(ReusedSQLTestCase):

     @property
     def data(self):
spark git commit: [SPARK-23312][SQL][FOLLOWUP] add a config to turn off vectorized cache reader
Repository: spark
Updated Branches:
  refs/heads/branch-2.3 7782fd03a -> 036a04b29

[SPARK-23312][SQL][FOLLOWUP] add a config to turn off vectorized cache reader

## What changes were proposed in this pull request?

https://github.com/apache/spark/pull/20483 tried to provide a way to turn off the new columnar cache reader, to restore the behavior in 2.2. However, even if we turn off that config, the behavior is still different from 2.2: if the output data are rows, we still enable whole-stage codegen for the scan node, which also differs from 2.2, so we should fix that as well.

## How was this patch tested?

existing tests.

Author: Wenchen Fan

Closes #20513 from cloud-fan/cache.

(cherry picked from commit ac7454cac04a1d9252b3856360eda5c3e8bcb8da)
Signed-off-by: gatorsmile

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/036a04b2
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/036a04b2
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/036a04b2
Branch: refs/heads/branch-2.3
Commit: 036a04b29c818ddbe695f7833577781e8bb16d3f
Parents: 7782fd0
Author: Wenchen Fan
Authored: Tue Feb 6 12:27:37 2018 -0800
Committer: gatorsmile
Committed: Tue Feb 6 12:27:52 2018 -0800

--
 .../spark/sql/execution/columnar/InMemoryTableScanExec.scala     | 3 +++
 .../src/test/scala/org/apache/spark/sql/CachedTableSuite.scala   | 3 ++-
 .../org/apache/spark/sql/execution/WholeStageCodegenSuite.scala  | 2 +-
 3 files changed, 6 insertions(+), 2 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/036a04b2/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala
index e972f8b..a93e8a1 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala
@@ -61,6 +61,9 @@ case class InMemoryTableScanExec(
     }) && !WholeStageCodegenExec.isTooManyFields(conf, relation.schema)
   }

+  // TODO: revisit this. Shall we always turn off whole stage codegen if the output data are rows?
+  override def supportCodegen: Boolean = supportsBatch
+
   override protected def needsUnsafeRowConversion: Boolean = false

   private val columnIndices =

http://git-wip-us.apache.org/repos/asf/spark/blob/036a04b2/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala
--
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala
index 9f27fa0..669e5f2 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala
@@ -787,7 +787,8 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with SharedSQLContext
     withSQLConf(SQLConf.CACHE_VECTORIZED_READER_ENABLED.key -> vectorized.toString) {
       val df = spark.range(10).cache()
       df.queryExecution.executedPlan.foreach {
-        case i: InMemoryTableScanExec => assert(i.supportsBatch == vectorized)
+        case i: InMemoryTableScanExec =>
+          assert(i.supportsBatch == vectorized && i.supportCodegen == vectorized)
         case _ =>
       }
     }

http://git-wip-us.apache.org/repos/asf/spark/blob/036a04b2/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala
--
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala
index 6e8d5a7..ef16292 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala
@@ -137,7 +137,7 @@ class WholeStageCodegenSuite extends QueryTest with SharedSQLContext {
     val dsStringFilter = dsString.filter(_ == "1")
     val planString = dsStringFilter.queryExecution.executedPlan
     assert(planString.collect {
-      case WholeStageCodegenExec(FilterExec(_, i: InMemoryTableScanExec)) if !i.supportsBatch => ()
+      case i: InMemoryTableScanExec if !i.supportsBatch => ()
     }.length == 1)
     assert(dsStringFilter.collect() === Array("1"))
   }
spark git commit: [SPARK-23312][SQL][FOLLOWUP] add a config to turn off vectorized cache reader
Repository: spark
Updated Branches:
  refs/heads/master 7db9979ba -> ac7454cac

[SPARK-23312][SQL][FOLLOWUP] add a config to turn off vectorized cache reader

## What changes were proposed in this pull request?

https://github.com/apache/spark/pull/20483 tried to provide a way to turn off the new columnar cache reader, to restore the behavior in 2.2. However, even if we turn off that config, the behavior is still different from 2.2: if the output data are rows, we still enable whole-stage codegen for the scan node, which also differs from 2.2, so we should fix that as well.

## How was this patch tested?

existing tests.

Author: Wenchen Fan

Closes #20513 from cloud-fan/cache.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ac7454ca
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ac7454ca
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ac7454ca
Branch: refs/heads/master
Commit: ac7454cac04a1d9252b3856360eda5c3e8bcb8da
Parents: 7db9979
Author: Wenchen Fan
Authored: Tue Feb 6 12:27:37 2018 -0800
Committer: gatorsmile
Committed: Tue Feb 6 12:27:37 2018 -0800

--
 .../spark/sql/execution/columnar/InMemoryTableScanExec.scala     | 3 +++
 .../src/test/scala/org/apache/spark/sql/CachedTableSuite.scala   | 3 ++-
 .../org/apache/spark/sql/execution/WholeStageCodegenSuite.scala  | 2 +-
 3 files changed, 6 insertions(+), 2 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/ac7454ca/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala
index e972f8b..a93e8a1 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/columnar/InMemoryTableScanExec.scala
@@ -61,6 +61,9 @@ case class InMemoryTableScanExec(
     }) && !WholeStageCodegenExec.isTooManyFields(conf, relation.schema)
   }

+  // TODO: revisit this. Shall we always turn off whole stage codegen if the output data are rows?
+  override def supportCodegen: Boolean = supportsBatch
+
   override protected def needsUnsafeRowConversion: Boolean = false

   private val columnIndices =

http://git-wip-us.apache.org/repos/asf/spark/blob/ac7454ca/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala
--
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala
index 9f27fa0..669e5f2 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/CachedTableSuite.scala
@@ -787,7 +787,8 @@ class CachedTableSuite extends QueryTest with SQLTestUtils with SharedSQLContext
     withSQLConf(SQLConf.CACHE_VECTORIZED_READER_ENABLED.key -> vectorized.toString) {
       val df = spark.range(10).cache()
       df.queryExecution.executedPlan.foreach {
-        case i: InMemoryTableScanExec => assert(i.supportsBatch == vectorized)
+        case i: InMemoryTableScanExec =>
+          assert(i.supportsBatch == vectorized && i.supportCodegen == vectorized)
         case _ =>
       }
     }

http://git-wip-us.apache.org/repos/asf/spark/blob/ac7454ca/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala
--
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala
index 6e8d5a7..ef16292 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/WholeStageCodegenSuite.scala
@@ -137,7 +137,7 @@ class WholeStageCodegenSuite extends QueryTest with SharedSQLContext {
     val dsStringFilter = dsString.filter(_ == "1")
     val planString = dsStringFilter.queryExecution.executedPlan
     assert(planString.collect {
-      case WholeStageCodegenExec(FilterExec(_, i: InMemoryTableScanExec)) if !i.supportsBatch => ()
+      case i: InMemoryTableScanExec if !i.supportsBatch => ()
     }.length == 1)
     assert(dsStringFilter.collect() === Array("1"))
   }
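For context, the behavior exercised by these tests can also be checked from PySpark. A hedged sketch, assuming the key behind `SQLConf.CACHE_VECTORIZED_READER_ENABLED` is `spark.sql.inMemoryColumnarStorage.enableVectorizedReader` (verify against your Spark build):

```python
# Sketch: disable the vectorized cache reader and inspect the cached scan plan.
# Assumption: the config key below is the one referenced by
# SQLConf.CACHE_VECTORIZED_READER_ENABLED; confirm it for your version.
spark.conf.set("spark.sql.inMemoryColumnarStorage.enableVectorizedReader", "false")
df = spark.range(10).cache()
df.count()    # materializes the cache
df.explain()  # the InMemoryTableScan should now run without batches or codegen
```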
svn commit: r24747 - in /dev/spark/2.4.0-SNAPSHOT-2018_02_06_12_01-7db9979-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s
Author: pwendell
Date: Tue Feb 6 20:15:32 2018
New Revision: 24747

Log: Apache Spark 2.4.0-SNAPSHOT-2018_02_06_12_01-7db9979 docs

[This commit notification would consist of 1444 parts, which exceeds the limit of 50, so it was shortened to this summary.]
spark git commit: [SPARK-23310][CORE][FOLLOWUP] Fix Java style check issues.
Repository: spark
Updated Branches:
  refs/heads/branch-2.3 a51154482 -> 7782fd03a

[SPARK-23310][CORE][FOLLOWUP] Fix Java style check issues.

## What changes were proposed in this pull request?

This is a follow-up of #20492 which broke lint-java checks. This pr fixes the lint-java issues.

```
[ERROR] src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java:[79] (sizes) LineLength: Line is longer than 100 characters (found 114).
```

## How was this patch tested?

Checked manually in my local environment.

Author: Takuya UESHIN

Closes #20514 from ueshin/issues/SPARK-23310/fup1.

(cherry picked from commit 7db9979babe52d15828967c86eb77e3fb2791579)
Signed-off-by: Marcelo Vanzin

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7782fd03
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7782fd03
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7782fd03
Branch: refs/heads/branch-2.3
Commit: 7782fd03ab95552dff1d1477887632bbc8f6ee51
Parents: a511544
Author: Takuya UESHIN
Authored: Tue Feb 6 10:46:48 2018 -0800
Committer: Marcelo Vanzin
Committed: Tue Feb 6 10:47:04 2018 -0800

--
 .../util/collection/unsafe/sort/UnsafeSorterSpillReader.java | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/7782fd03/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java
--
diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java
index 71e7c7a..2c53c8d 100644
--- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java
+++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java
@@ -76,8 +76,8 @@ public final class UnsafeSorterSpillReader extends UnsafeSorterIterator implemen
       SparkEnv.get() == null ? 0.5 :
         SparkEnv.get().conf().getDouble("spark.unsafe.sorter.spill.read.ahead.fraction", 0.5);

-    // SPARK-23310: Disable read-ahead input stream, because it is causing lock contention and perf regression for
-    // TPC-DS queries.
+    // SPARK-23310: Disable read-ahead input stream, because it is causing lock contention and perf
+    // regression for TPC-DS queries.
     final boolean readAheadEnabled = SparkEnv.get() != null &&
       SparkEnv.get().conf().getBoolean("spark.unsafe.sorter.spill.read.ahead.enabled", false);
spark git commit: [SPARK-23310][CORE][FOLLOWUP] Fix Java style check issues.
Repository: spark
Updated Branches:
  refs/heads/master 63c5bf13c -> 7db9979ba

[SPARK-23310][CORE][FOLLOWUP] Fix Java style check issues.

## What changes were proposed in this pull request?

This is a follow-up of #20492 which broke lint-java checks. This pr fixes the lint-java issues.

```
[ERROR] src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java:[79] (sizes) LineLength: Line is longer than 100 characters (found 114).
```

## How was this patch tested?

Checked manually in my local environment.

Author: Takuya UESHIN

Closes #20514 from ueshin/issues/SPARK-23310/fup1.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7db9979b
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7db9979b
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7db9979b
Branch: refs/heads/master
Commit: 7db9979babe52d15828967c86eb77e3fb2791579
Parents: 63c5bf1
Author: Takuya UESHIN
Authored: Tue Feb 6 10:46:48 2018 -0800
Committer: Marcelo Vanzin
Committed: Tue Feb 6 10:46:48 2018 -0800

--
 .../util/collection/unsafe/sort/UnsafeSorterSpillReader.java | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/7db9979b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java
--
diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java
index 71e7c7a..2c53c8d 100644
--- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java
+++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java
@@ -76,8 +76,8 @@ public final class UnsafeSorterSpillReader extends UnsafeSorterIterator implemen
       SparkEnv.get() == null ? 0.5 :
         SparkEnv.get().conf().getDouble("spark.unsafe.sorter.spill.read.ahead.fraction", 0.5);

-    // SPARK-23310: Disable read-ahead input stream, because it is causing lock contention and perf regression for
-    // TPC-DS queries.
+    // SPARK-23310: Disable read-ahead input stream, because it is causing lock contention and perf
+    // regression for TPC-DS queries.
     final boolean readAheadEnabled = SparkEnv.get() != null &&
       SparkEnv.get().conf().getBoolean("spark.unsafe.sorter.spill.read.ahead.enabled", false);
spark-website git commit: Direct security-related emails to secur...@apache.org for Spark
Repository: spark-website
Updated Branches:
  refs/heads/asf-site 464ddc703 -> 3f874c90a

Direct security-related emails to secur...@apache.org for Spark

Project: http://git-wip-us.apache.org/repos/asf/spark-website/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark-website/commit/3f874c90
Tree: http://git-wip-us.apache.org/repos/asf/spark-website/tree/3f874c90
Diff: http://git-wip-us.apache.org/repos/asf/spark-website/diff/3f874c90
Branch: refs/heads/asf-site
Commit: 3f874c90ac708d90e304bdb613dcbaed6384f167
Parents: 464ddc7
Author: Sean Owen
Authored: Tue Feb 6 10:11:43 2018 -0600
Committer: Sean Owen
Committed: Tue Feb 6 10:11:43 2018 -0600

--
 security.md        | 4 ++--
 site/security.html | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark-website/blob/3f874c90/security.md
--
diff --git a/security.md b/security.md
index fd1fe46..bd2e66f 100644
--- a/security.md
+++ b/security.md
@@ -12,8 +12,8 @@ navigation:

 Apache Spark uses the standard process outlined by the [Apache Security Team](https://www.apache.org/security/) for reporting vulnerabilities.

-If you need to report a possible security vulnerability, please email `priv...@spark.apache.org`. This is a
-non-public list that will reach the Spark PMC. Messages to `secur...@apache.org` will also reach the PMC.
+To report a possible security vulnerability, please email `secur...@apache.org`. This is a
+non-public list that will reach the Apache Security team, as well as the Spark PMC.

 Known Security Issues

http://git-wip-us.apache.org/repos/asf/spark-website/blob/3f874c90/site/security.html
--
diff --git a/site/security.html b/site/security.html
index 7101faa..cf789dd 100644
--- a/site/security.html
+++ b/site/security.html
@@ -199,8 +199,8 @@

 Apache Spark uses the standard process outlined by the <a href="https://www.apache.org/security/">Apache Security Team</a> for reporting vulnerabilities.

-If you need to report a possible security vulnerability, please email priv...@spark.apache.org. This is a
-non-public list that will reach the Spark PMC. Messages to secur...@apache.org will also reach the PMC.
+To report a possible security vulnerability, please email secur...@apache.org. This is a
+non-public list that will reach the Apache Security team, as well as the Spark PMC.

 Known Security Issues
svn commit: r24725 - in /dev/spark/2.4.0-SNAPSHOT-2018_02_06_04_01-63c5bf1-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s
Author: pwendell
Date: Tue Feb 6 12:21:01 2018
New Revision: 24725

Log: Apache Spark 2.4.0-SNAPSHOT-2018_02_06_04_01-63c5bf1 docs

[This commit notification would consist of 1444 parts, which exceeds the limit of 50, so it was shortened to this summary.]
svn commit: r24723 - in /dev/spark/2.3.1-SNAPSHOT-2018_02_06_02_01-a511544-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s
Author: pwendell
Date: Tue Feb 6 10:15:57 2018
New Revision: 24723

Log: Apache Spark 2.3.1-SNAPSHOT-2018_02_06_02_01-a511544 docs

[This commit notification would consist of 1442 parts, which exceeds the limit of 50, so it was shortened to this summary.]
spark git commit: [SPARK-23334][SQL][PYTHON] Fix pandas_udf with return type StringType() to handle str type properly in Python 2.
Repository: spark
Updated Branches:
  refs/heads/branch-2.3 44933033e -> a51154482

[SPARK-23334][SQL][PYTHON] Fix pandas_udf with return type StringType() to handle str type properly in Python 2.

## What changes were proposed in this pull request?

In Python 2, when `pandas_udf` tries to return string type values created in the udf with `".."`, the execution fails. E.g.,

```python
from pyspark.sql.functions import pandas_udf, col
import pandas as pd

df = spark.range(10)
str_f = pandas_udf(lambda x: pd.Series(["%s" % i for i in x]), "string")
df.select(str_f(col('id'))).show()
```

raises the following exception:

```
...
java.lang.AssertionError: assertion failed: Invalid schema from pandas_udf: expected StringType, got BinaryType
    at scala.Predef$.assert(Predef.scala:170)
    at org.apache.spark.sql.execution.python.ArrowEvalPythonExec$$anon$2.<init>(ArrowEvalPythonExec.scala:93)
...
```

It seems pyarrow ignores the `type` parameter of `pa.Array.from_pandas()` and considers the array binary type when the declared type is string type but the values are `str` instead of `unicode` in Python 2. This pr adds a workaround for that case.

## How was this patch tested?

Added a test and existing tests.

Author: Takuya UESHIN

Closes #20507 from ueshin/issues/SPARK-23334.

(cherry picked from commit 63c5bf13ce5cd3b8d7e7fb88de881ed207fde720)
Signed-off-by: hyukjinkwon

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a5115448
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a5115448
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a5115448
Branch: refs/heads/branch-2.3
Commit: a511544822be6e3fc9c6bb5080a163b9acbb41f2
Parents: 4493303
Author: Takuya UESHIN
Authored: Tue Feb 6 18:30:50 2018 +0900
Committer: hyukjinkwon
Committed: Tue Feb 6 18:31:06 2018 +0900

--
 python/pyspark/serializers.py | 4 ++++
 python/pyspark/sql/tests.py   | 9 +++++++++
 2 files changed, 13 insertions(+)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/a5115448/python/pyspark/serializers.py
--
diff --git a/python/pyspark/serializers.py b/python/pyspark/serializers.py
index e870325..91a7f09 100644
--- a/python/pyspark/serializers.py
+++ b/python/pyspark/serializers.py
@@ -230,6 +230,10 @@ def _create_batch(series, timezone):
             s = _check_series_convert_timestamps_internal(s.fillna(0), timezone)
             # TODO: need cast after Arrow conversion, ns values cause error with pandas 0.19.2
             return pa.Array.from_pandas(s, mask=mask).cast(t, safe=False)
+        elif t is not None and pa.types.is_string(t) and sys.version < '3':
+            # TODO: need decode before converting to Arrow in Python 2
+            return pa.Array.from_pandas(s.apply(
+                lambda v: v.decode("utf-8") if isinstance(v, str) else v), mask=mask, type=t)
         return pa.Array.from_pandas(s, mask=mask, type=t)

     arrs = [create_array(s, t) for s, t in series]

http://git-wip-us.apache.org/repos/asf/spark/blob/a5115448/python/pyspark/sql/tests.py
--
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index 95b9c0e..2577ed7 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -3896,6 +3896,15 @@ class VectorizedUDFTests(ReusedSQLTestCase):
         res = df.select(str_f(col('str')))
         self.assertEquals(df.collect(), res.collect())

+    def test_vectorized_udf_string_in_udf(self):
+        from pyspark.sql.functions import pandas_udf, col
+        import pandas as pd
+        df = self.spark.range(10)
+        str_f = pandas_udf(lambda x: pd.Series(map(str, x)), StringType())
+        actual = df.select(str_f(col('id')))
+        expected = df.select(col('id').cast('string'))
+        self.assertEquals(expected.collect(), actual.collect())
+
     def test_vectorized_udf_datatype_string(self):
         from pyspark.sql.functions import pandas_udf, col
         df = self.spark.range(10).select(
spark git commit: [SPARK-23334][SQL][PYTHON] Fix pandas_udf with return type StringType() to handle str type properly in Python 2.
Repository: spark
Updated Branches:
  refs/heads/master 8141c3e3d -> 63c5bf13c

[SPARK-23334][SQL][PYTHON] Fix pandas_udf with return type StringType() to handle str type properly in Python 2.

## What changes were proposed in this pull request?

In Python 2, when `pandas_udf` tries to return string type values created in the udf with `".."`, the execution fails. E.g.,

```python
from pyspark.sql.functions import pandas_udf, col
import pandas as pd

df = spark.range(10)
str_f = pandas_udf(lambda x: pd.Series(["%s" % i for i in x]), "string")
df.select(str_f(col('id'))).show()
```

raises the following exception:

```
...
java.lang.AssertionError: assertion failed: Invalid schema from pandas_udf: expected StringType, got BinaryType
    at scala.Predef$.assert(Predef.scala:170)
    at org.apache.spark.sql.execution.python.ArrowEvalPythonExec$$anon$2.<init>(ArrowEvalPythonExec.scala:93)
...
```

It seems pyarrow ignores the `type` parameter of `pa.Array.from_pandas()` and considers the array binary type when the declared type is string type but the values are `str` instead of `unicode` in Python 2. This pr adds a workaround for that case.

## How was this patch tested?

Added a test and existing tests.

Author: Takuya UESHIN

Closes #20507 from ueshin/issues/SPARK-23334.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/63c5bf13
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/63c5bf13
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/63c5bf13
Branch: refs/heads/master
Commit: 63c5bf13ce5cd3b8d7e7fb88de881ed207fde720
Parents: 8141c3e
Author: Takuya UESHIN
Authored: Tue Feb 6 18:30:50 2018 +0900
Committer: hyukjinkwon
Committed: Tue Feb 6 18:30:50 2018 +0900

--
 python/pyspark/serializers.py | 4 ++++
 python/pyspark/sql/tests.py   | 9 +++++++++
 2 files changed, 13 insertions(+)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/63c5bf13/python/pyspark/serializers.py
--
diff --git a/python/pyspark/serializers.py b/python/pyspark/serializers.py
index e870325..91a7f09 100644
--- a/python/pyspark/serializers.py
+++ b/python/pyspark/serializers.py
@@ -230,6 +230,10 @@ def _create_batch(series, timezone):
             s = _check_series_convert_timestamps_internal(s.fillna(0), timezone)
             # TODO: need cast after Arrow conversion, ns values cause error with pandas 0.19.2
             return pa.Array.from_pandas(s, mask=mask).cast(t, safe=False)
+        elif t is not None and pa.types.is_string(t) and sys.version < '3':
+            # TODO: need decode before converting to Arrow in Python 2
+            return pa.Array.from_pandas(s.apply(
+                lambda v: v.decode("utf-8") if isinstance(v, str) else v), mask=mask, type=t)
         return pa.Array.from_pandas(s, mask=mask, type=t)

     arrs = [create_array(s, t) for s, t in series]

http://git-wip-us.apache.org/repos/asf/spark/blob/63c5bf13/python/pyspark/sql/tests.py
--
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index 545ec5a..89b7c21 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -3922,6 +3922,15 @@ class ScalarPandasUDF(ReusedSQLTestCase):
         res = df.select(str_f(col('str')))
         self.assertEquals(df.collect(), res.collect())

+    def test_vectorized_udf_string_in_udf(self):
+        from pyspark.sql.functions import pandas_udf, col
+        import pandas as pd
+        df = self.spark.range(10)
+        str_f = pandas_udf(lambda x: pd.Series(map(str, x)), StringType())
+        actual = df.select(str_f(col('id')))
+        expected = df.select(col('id').cast('string'))
+        self.assertEquals(expected.collect(), actual.collect())
+
     def test_vectorized_udf_datatype_string(self):
         from pyspark.sql.functions import pandas_udf, col
         df = self.spark.range(10).select(
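On clusters that do not yet carry this fix, a user-side workaround follows from the diagnosis above: return `unicode` from the UDF so pyarrow never sees Python 2 `str` values. A hedged sketch:

```python
# Hedged workaround sketch for unpatched Python 2 builds: emit unicode (u"...")
# so pa.Array.from_pandas() infers a string type rather than binary.
from pyspark.sql.functions import pandas_udf, col
import pandas as pd

df = spark.range(10)
str_f = pandas_udf(lambda x: pd.Series([u"%s" % i for i in x]), "string")
df.select(str_f(col('id'))).show()  # no "expected StringType, got BinaryType"
```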
spark git commit: [SPARK-23290][SQL][PYTHON][BACKPORT-2.3] Use datetime.date for date type when converting Spark DataFrame to Pandas DataFrame.
Repository: spark
Updated Branches:
  refs/heads/branch-2.3 521494d7b -> 44933033e

[SPARK-23290][SQL][PYTHON][BACKPORT-2.3] Use datetime.date for date type when converting Spark DataFrame to Pandas DataFrame.

## What changes were proposed in this pull request?

This is a backport of #20506.

In #18664, there was a change in how `DateType` is being returned to users ([line 1968 in dataframe.py](https://github.com/apache/spark/pull/18664/files#diff-6fc344560230bf0ef711bb9b5573f1faR1968)). This can cause client code which works in Spark 2.2 to fail. See [SPARK-23290](https://issues.apache.org/jira/browse/SPARK-23290?focusedCommentId=16350917=com.atlassian.jira.plugin.system.issuetabpanels%3Acomment-tabpanel#comment-16350917) for an example.

This pr modifies to use `datetime.date` for date type as Spark 2.2 does.

## How was this patch tested?

Tests modified to fit the new behavior and existing tests.

Author: Takuya UESHIN

Closes #20515 from ueshin/issues/SPARK-23290_2.3.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/44933033
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/44933033
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/44933033
Branch: refs/heads/branch-2.3
Commit: 44933033e9216ccb2e533b9dc6e6cb03cce39817
Parents: 521494d
Author: Takuya UESHIN
Authored: Tue Feb 6 18:29:37 2018 +0900
Committer: hyukjinkwon
Committed: Tue Feb 6 18:29:37 2018 +0900

--
 python/pyspark/serializers.py   |  9 --
 python/pyspark/sql/dataframe.py |  7 ++---
 python/pyspark/sql/tests.py     | 57 ++--
 python/pyspark/sql/types.py     | 15 ++
 4 files changed, 66 insertions(+), 22 deletions(-)
--

http://git-wip-us.apache.org/repos/asf/spark/blob/44933033/python/pyspark/serializers.py
--
diff --git a/python/pyspark/serializers.py b/python/pyspark/serializers.py
index 88d6a19..e870325 100644
--- a/python/pyspark/serializers.py
+++ b/python/pyspark/serializers.py
@@ -267,12 +267,15 @@ class ArrowStreamPandasSerializer(Serializer):
         """
         Deserialize ArrowRecordBatches to an Arrow table and return as a list of pandas.Series.
         """
-        from pyspark.sql.types import _check_dataframe_localize_timestamps
+        from pyspark.sql.types import from_arrow_schema, _check_dataframe_convert_date, \
+            _check_dataframe_localize_timestamps
         import pyarrow as pa
         reader = pa.open_stream(stream)
+        schema = from_arrow_schema(reader.schema)
         for batch in reader:
-            # NOTE: changed from pa.Columns.to_pandas, timezone issue in conversion fixed in 0.7.1
-            pdf = _check_dataframe_localize_timestamps(batch.to_pandas(), self._timezone)
+            pdf = batch.to_pandas()
+            pdf = _check_dataframe_convert_date(pdf, schema)
+            pdf = _check_dataframe_localize_timestamps(pdf, self._timezone)
             yield [c for _, c in pdf.iteritems()]

     def __repr__(self):

http://git-wip-us.apache.org/repos/asf/spark/blob/44933033/python/pyspark/sql/dataframe.py
--
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 2e55407..59a4170 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -1923,7 +1923,8 @@ class DataFrame(object):

         if self.sql_ctx.getConf("spark.sql.execution.arrow.enabled", "false").lower() == "true":
             try:
-                from pyspark.sql.types import _check_dataframe_localize_timestamps
+                from pyspark.sql.types import _check_dataframe_convert_date, \
+                    _check_dataframe_localize_timestamps
                 from pyspark.sql.utils import require_minimum_pyarrow_version
                 import pyarrow
                 require_minimum_pyarrow_version()
@@ -1931,6 +1932,7 @@ class DataFrame(object):
                 if tables:
                     table = pyarrow.concat_tables(tables)
                     pdf = table.to_pandas()
+                    pdf = _check_dataframe_convert_date(pdf, self.schema)
                     return _check_dataframe_localize_timestamps(pdf, timezone)
                 else:
                     return pd.DataFrame.from_records([], columns=self.columns)
@@ -2009,7 +2011,6 @@ def _to_corrected_pandas_type(dt):
     """
     When converting Spark SQL records to Pandas DataFrame, the inferred data type may be wrong.
     This method gets the corrected data type for Pandas if that type may be inferred uncorrectly.
-    NOTE: DateType is inferred incorrectly as 'object', TimestampType is correct with
svn commit: r24720 - in /dev/spark/2.4.0-SNAPSHOT-2018_02_06_00_01-8141c3e-docs: ./ _site/ _site/api/ _site/api/R/ _site/api/java/ _site/api/java/lib/ _site/api/java/org/ _site/api/java/org/apache/ _s
Author: pwendell
Date: Tue Feb 6 08:15:48 2018
New Revision: 24720

Log: Apache Spark 2.4.0-SNAPSHOT-2018_02_06_00_01-8141c3e docs

[This commit notification would consist of 1444 parts, which exceeds the limit of 50, so it was shortened to this summary.]