[spark] branch master updated: [SPARK-39268][SQL] AttachDistributedSequenceExec do not checkpoint childRDD with single partition
This is an automated email from the ASF dual-hosted git repository. ruifengz pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new f673ebd8afc [SPARK-39268][SQL] AttachDistributedSequenceExec do not checkpoint childRDD with single partition f673ebd8afc is described below commit f673ebd8afc94a3b434a0156b61366fede80b8f9 Author: Ruifeng Zheng AuthorDate: Thu May 26 12:30:25 2022 +0800 [SPARK-39268][SQL] AttachDistributedSequenceExec do not checkpoint childRDD with single partition ### What changes were proposed in this pull request? do not checkpoint child rdd when it only has 1 partition ### Why are the changes needed? avoid necessary checkpoint when child rdd only has 1 partition, `zipWithIndex` will not trigger an action ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? existing suites Closes #36648 from zhengruifeng/sql_do_not_checkpoint_with_single_partition. Authored-by: Ruifeng Zheng Signed-off-by: Ruifeng Zheng --- python/pyspark/pandas/tests/test_groupby.py | 9 ++--- .../sql/execution/python/AttachDistributedSequenceExec.scala | 11 --- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/python/pyspark/pandas/tests/test_groupby.py b/python/pyspark/pandas/tests/test_groupby.py index 045cbaf5274..ac1e73f9d5d 100644 --- a/python/pyspark/pandas/tests/test_groupby.py +++ b/python/pyspark/pandas/tests/test_groupby.py @@ -2256,9 +2256,12 @@ class GroupByTest(PandasOnSparkTestCase, TestUtils): acc += 1 return np.sum(x) -actual = psdf.groupby("d").apply(sum_with_acc_frame).sort_index() +actual = psdf.groupby("d").apply(sum_with_acc_frame) actual.columns = ["d", "v"] -self.assert_eq(actual, pdf.groupby("d").apply(sum).sort_index().reset_index(drop=True)) +self.assert_eq( +actual.to_pandas().sort_index(), +pdf.groupby("d").apply(sum).sort_index().reset_index(drop=True), +) self.assert_eq(acc.value, 2) def sum_with_acc_series(x) -> np.float64: @@ -2267,7 +2270,7 @@ class GroupByTest(PandasOnSparkTestCase, TestUtils): return np.sum(x) self.assert_eq( -psdf.groupby("d")["v"].apply(sum_with_acc_series).sort_index(), + psdf.groupby("d")["v"].apply(sum_with_acc_series).to_pandas().sort_index(), pdf.groupby("d")["v"].apply(sum).sort_index().reset_index(drop=True), ) self.assert_eq(acc.value, 4) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/AttachDistributedSequenceExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/AttachDistributedSequenceExec.scala index 203fb6d7d50..5f722826fc7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/python/AttachDistributedSequenceExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/python/AttachDistributedSequenceExec.scala @@ -41,9 +41,14 @@ case class AttachDistributedSequenceExec( override def outputPartitioning: Partitioning = child.outputPartitioning override protected def doExecute(): RDD[InternalRow] = { -child.execute().map(_.copy()) -.localCheckpoint() // to avoid execute multiple jobs. zipWithIndex launches a Spark job. -.zipWithIndex().mapPartitions { iter => +val childRDD = child.execute().map(_.copy()) +val checkpointed = if (childRDD.getNumPartitions > 1) { + // to avoid execute multiple jobs. zipWithIndex launches a Spark job. 
+      childRDD.localCheckpoint()
+    } else {
+      childRDD
+    }
+    checkpointed.zipWithIndex().mapPartitions { iter =>
       val unsafeProj = UnsafeProjection.create(output, output)
       val joinedRow = new JoinedRow
       val unsafeRowWriter =

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
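The optimization above leans on an RDD detail: `zipWithIndex` only launches a preparatory Spark job to compute per-partition offsets when the RDD has more than one partition, so the defensive `localCheckpoint` buys nothing for a single-partition child. A minimal PySpark sketch of that behavior (the data and partition counts are illustrative, not taken from the patch):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

# One partition: zipWithIndex can assign indices directly with no extra job,
# which is why checkpointing the child first is now skipped in this case.
single = sc.parallelize(range(5), numSlices=1)
print(single.zipWithIndex().collect())  # [(0, 0), (1, 1), (2, 2), (3, 3), (4, 4)]

# Several partitions: zipWithIndex first runs a job to count elements per
# partition, so the exec still localCheckpoints the child to avoid recomputing it.
multi = sc.parallelize(range(5), numSlices=3)
print(multi.getNumPartitions())         # 3
print(multi.zipWithIndex().collect())
```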
[spark] branch master updated: [SPARK-39277][SQL] Make Optimizer extends SQLConfHelper and remove incorrect comment
This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new ddb8a4036c2 [SPARK-39277][SQL] Make Optimizer extends SQLConfHelper and remove incorrect comment ddb8a4036c2 is described below commit ddb8a4036c28b7893713b1565cc0de3d60bd363b Author: Yuming Wang AuthorDate: Thu May 26 12:08:13 2022 +0800 [SPARK-39277][SQL] Make Optimizer extends SQLConfHelper and remove incorrect comment ### What changes were proposed in this pull request? Make Optimizer extends SQLConfHelper and remove incorrect comment. ### Why are the changes needed? Make the code easier to maintain. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Existing test. Closes #36657 from wangyum/SPARK-39277. Authored-by: Yuming Wang Signed-off-by: Wenchen Fan --- .../org/apache/spark/sql/catalyst/optimizer/Optimizer.scala | 9 - 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 6b9746a880f..a84959f0991 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.optimizer import scala.collection.mutable +import org.apache.spark.sql.catalyst.SQLConfHelper import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.catalyst.catalog.{InMemoryCatalog, SessionCatalog} import org.apache.spark.sql.catalyst.expressions._ @@ -40,7 +41,7 @@ import org.apache.spark.util.Utils * Optimizers can override this. */ abstract class Optimizer(catalogManager: CatalogManager) - extends RuleExecutor[LogicalPlan] { + extends RuleExecutor[LogicalPlan] with SQLConfHelper { // Check for structural integrity of the plan in test mode. // Currently we check after the execution of each rule if a plan: @@ -65,7 +66,7 @@ abstract class Optimizer(catalogManager: CatalogManager) protected def fixedPoint = FixedPoint( - SQLConf.get.optimizerMaxIterations, + conf.optimizerMaxIterations, maxIterationsSetting = SQLConf.OPTIMIZER_MAX_ITERATIONS.key) /** @@ -142,8 +143,6 @@ abstract class Optimizer(catalogManager: CatalogManager) InferFiltersFromConstraints) :: Batch("Operator Optimization after Inferring Filters", fixedPoint, operatorOptimizationRuleSet: _*) :: - // Set strategy to Once to avoid pushing filter every time because we do not change the - // join condition. Batch("Push extra predicate through join", fixedPoint, PushExtraPredicateThroughJoin, PushDownPredicates) :: Nil @@ -389,7 +388,7 @@ abstract class Optimizer(catalogManager: CatalogManager) */ final override def batches: Seq[Batch] = { val excludedRulesConf = - SQLConf.get.optimizerExcludedRules.toSeq.flatMap(Utils.stringToSeq) + conf.optimizerExcludedRules.toSeq.flatMap(Utils.stringToSeq) val excludedRules = excludedRulesConf.filter { ruleName => val nonExcludable = nonExcludableRules.contains(ruleName) if (nonExcludable) { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
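The refactor is purely internal (reading settings through `SQLConfHelper.conf` instead of `SQLConf.get`), but the two settings it touches are ordinary SQL confs that can be inspected or set from any Spark API. A hedged PySpark sketch; the excluded rule named below is only illustrative, and rules in the optimizer's non-excludable set are kept with a warning rather than removed:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Read the optimizer's fixed-point iteration limit (falls back to the supplied
# default string if the conf has never been set explicitly).
print(spark.conf.get("spark.sql.optimizer.maxIterations", "100"))

# Exclude an optimizer rule by its fully qualified name; illustrative only.
spark.conf.set(
    "spark.sql.optimizer.excludedRules",
    "org.apache.spark.sql.catalyst.optimizer.ConstantFolding",
)
```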
[spark] branch master updated: [SPARK-39253][DOCS][PYTHON] Improve PySpark API reference to be more readable
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 48b89539e76 [SPARK-39253][DOCS][PYTHON] Improve PySpark API reference to be more readable 48b89539e76 is described below commit 48b89539e761ef9f3d4c966ee4c4a86027b0c3d1 Author: itholic AuthorDate: Thu May 26 10:48:52 2022 +0900 [SPARK-39253][DOCS][PYTHON] Improve PySpark API reference to be more readable ### What changes were proposed in this pull request? This PR proposes to improve the PySpark API reference page to be more readable, So far, the PySpark documentation especially ["Spark SQL" part](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql.html#) is not-well organized so it's a bit uncomfortable to be read as below: https://user-images.githubusercontent.com/44108233/169951148-f77ba1d1-3e0f-411e-81be-65a5d669f75d.png";> For example, whereas [pandas API on Spark reference page](https://spark.apache.org/docs/latest/api/python/reference/pyspark.pandas/index.html) is relatively well-organized as below: https://user-images.githubusercontent.com/44108233/169951169-938e13c1-36ba-4d5c-b02e-7b7e80366e2c.png";> ### Why are the changes needed? The improvement of document readability will also improve the usability for PySpark. ### Does this PR introduce _any_ user-facing change? Yes, now the documentation is categorized by its class or their own purpose more clearly as below: https://user-images.githubusercontent.com/44108233/169951517-f8b9cb72-7408-46d6-8cd7-15ae890a7a7f.png";> ### How was this patch tested? The CI already include the doc build, so the existing test should cover. Closes #36647 from itholic/SPARK-39253. Authored-by: itholic Signed-off-by: Hyukjin Kwon --- python/docs/source/reference/index.rst | 2 +- python/docs/source/reference/pyspark.sql.rst | 663 - .../{index.rst => pyspark.sql/catalog.rst} | 49 +- .../{index.rst => pyspark.sql/column.rst} | 59 +- .../{index.rst => pyspark.sql/configuration.rst} | 20 +- .../{index.rst => pyspark.sql/core_classes.rst}| 40 +- .../{index.rst => pyspark.sql/data_types.rst} | 47 +- .../source/reference/pyspark.sql/dataframe.rst | 133 + .../source/reference/pyspark.sql/functions.rst | 343 +++ .../{index.rst => pyspark.sql/grouping.rst}| 39 +- .../source/reference/{ => pyspark.sql}/index.rst | 34 +- python/docs/source/reference/pyspark.sql/io.rst| 54 ++ .../reference/{index.rst => pyspark.sql/row.rst} | 24 +- .../source/reference/pyspark.sql/spark_session.rst | 53 ++ .../{index.rst => pyspark.sql/window.rst} | 39 +- 15 files changed, 773 insertions(+), 826 deletions(-) diff --git a/python/docs/source/reference/index.rst b/python/docs/source/reference/index.rst index 5ea127f6d9e..b16c614d34c 100644 --- a/python/docs/source/reference/index.rst +++ b/python/docs/source/reference/index.rst @@ -27,7 +27,7 @@ Pandas API on Spark follows the API specifications of latest pandas release. .. toctree:: :maxdepth: 2 - pyspark.sql + pyspark.sql/index pyspark.pandas/index pyspark.ss pyspark.ml diff --git a/python/docs/source/reference/pyspark.sql.rst b/python/docs/source/reference/pyspark.sql.rst deleted file mode 100644 index adc1958822e..000 --- a/python/docs/source/reference/pyspark.sql.rst +++ /dev/null @@ -1,663 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one -or more contributor license agreements. 
See the NOTICE file -distributed with this work for additional information -regarding copyright ownership. The ASF licenses this file -to you under the Apache License, Version 2.0 (the -"License"); you may not use this file except in compliance -with the License. You may obtain a copy of the License at - -..http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, -software distributed under the License is distributed on an -"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -KIND, either express or implied. See the License for the -specific language governing permissions and limitations -under the License. - - -= -Spark SQL -= - -Core Classes - - -.. currentmodule:: pyspark.sql - -.. autosummary:: -:toctree: api/ - -SparkSession -Catalog -DataFrame -Column -Observation -Row -GroupedData -PandasCogroupedOps -DataFrameNaFunctions -DataFrameStatFunctions -Window - - -Spark Session APIs --- - -.. currentmodule:: pyspark.sql - -The entry point to programming Spark with
[spark] branch branch-3.0 updated: [SPARK-39293][SQL] Fix the accumulator of ArrayAggregate to handle complex types properly
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.0 by this push: new 9b268122f68 [SPARK-39293][SQL] Fix the accumulator of ArrayAggregate to handle complex types properly 9b268122f68 is described below commit 9b268122f68718ed46d9ffd97c402c5a1e7db73a Author: Takuya UESHIN AuthorDate: Thu May 26 10:36:03 2022 +0900 [SPARK-39293][SQL] Fix the accumulator of ArrayAggregate to handle complex types properly Fix the accumulator of `ArrayAggregate` to handle complex types properly. The accumulator of `ArrayAggregate` should copy the intermediate result if string, struct, array, or map. If the intermediate data of `ArrayAggregate` holds reusable data, the result will be duplicated. ```scala import org.apache.spark.sql.functions._ val reverse = udf((s: String) => s.reverse) val df = Seq(Array("abc", "def")).toDF("array") val testArray = df.withColumn( "agg", aggregate( col("array"), array().cast("array"), (acc, s) => concat(acc, array(reverse(s) aggArray.show(truncate=false) ``` should be: ``` +--+--+ |array |agg | +--+--+ |[abc, def]|[cba, fed]| +--+--+ ``` but: ``` +--+--+ |array |agg | +--+--+ |[abc, def]|[fed, fed]| +--+--+ ``` Yes, this fixes the correctness issue. Added a test. Closes #36674 from ueshin/issues/SPARK-39293/array_aggregate. Authored-by: Takuya UESHIN Signed-off-by: Hyukjin Kwon (cherry picked from commit d6a11cb4b411c8136eb241aac167bc96990f5421) Signed-off-by: Hyukjin Kwon (cherry picked from commit 92e82fdf8e2faec5add61e2448f11272dfb19c6e) Signed-off-by: Hyukjin Kwon (cherry picked from commit 68d69501576ba21e182791aad91b82a1e7282d11) Signed-off-by: Hyukjin Kwon --- .../catalyst/expressions/higherOrderFunctions.scala | 2 +- .../scala/org/apache/spark/sql/DataFrameSuite.scala | 19 +++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala index a530ce5da27..4a8c366107c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala @@ -752,7 +752,7 @@ case class ArrayAggregate( var i = 0 while (i < arr.numElements()) { elementVar.value.set(arr.get(i, elementVar.dataType)) -accForMergeVar.value.set(mergeForEval.eval(input)) + accForMergeVar.value.set(InternalRow.copyValue(mergeForEval.eval(input))) i += 1 } accForFinishVar.value.set(accForMergeVar.value.get) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 7984336beba..1d752a675dd 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -2515,6 +2515,25 @@ class DataFrameSuite extends QueryTest checkAnswer(df3.select($"*-#&% ?.`a``b.c`"), Row("col1")) } + test("SPARK-39293: The accumulator of ArrayAggregate to handle complex types properly") { +val reverse = udf((s: String) => s.reverse) + +val df = Seq(Array("abc", "def")).toDF("array") +val testArray = df.select( + aggregate( +col("array"), +array().cast("array"), +(acc, s) => concat(acc, array(reverse(s) +checkAnswer(testArray, Row(Array("cba", "fed")) 
:: Nil)
+
+    val testMap = df.select(
+      aggregate(
+        col("array"),
+        map().cast("map"),
+        (acc, s) => map_concat(acc, map(s, reverse(s)
+    checkAnswer(testMap, Row(Map("abc" -> "cba", "def" -> "fed")) :: Nil)
+  }
+
   test("SPARK-35886: PromotePrecision should be subexpr replaced") {
     withTable("tbl") {
       sql(

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.1 updated: [SPARK-39293][SQL] Fix the accumulator of ArrayAggregate to handle complex types properly
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.1 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.1 by this push: new 68d69501576 [SPARK-39293][SQL] Fix the accumulator of ArrayAggregate to handle complex types properly 68d69501576 is described below commit 68d69501576ba21e182791aad91b82a1e7282d11 Author: Takuya UESHIN AuthorDate: Thu May 26 10:36:03 2022 +0900 [SPARK-39293][SQL] Fix the accumulator of ArrayAggregate to handle complex types properly Fix the accumulator of `ArrayAggregate` to handle complex types properly. The accumulator of `ArrayAggregate` should copy the intermediate result if string, struct, array, or map. If the intermediate data of `ArrayAggregate` holds reusable data, the result will be duplicated. ```scala import org.apache.spark.sql.functions._ val reverse = udf((s: String) => s.reverse) val df = Seq(Array("abc", "def")).toDF("array") val testArray = df.withColumn( "agg", aggregate( col("array"), array().cast("array"), (acc, s) => concat(acc, array(reverse(s) aggArray.show(truncate=false) ``` should be: ``` +--+--+ |array |agg | +--+--+ |[abc, def]|[cba, fed]| +--+--+ ``` but: ``` +--+--+ |array |agg | +--+--+ |[abc, def]|[fed, fed]| +--+--+ ``` Yes, this fixes the correctness issue. Added a test. Closes #36674 from ueshin/issues/SPARK-39293/array_aggregate. Authored-by: Takuya UESHIN Signed-off-by: Hyukjin Kwon (cherry picked from commit d6a11cb4b411c8136eb241aac167bc96990f5421) Signed-off-by: Hyukjin Kwon (cherry picked from commit 92e82fdf8e2faec5add61e2448f11272dfb19c6e) Signed-off-by: Hyukjin Kwon --- .../catalyst/expressions/higherOrderFunctions.scala | 2 +- .../scala/org/apache/spark/sql/DataFrameSuite.scala | 19 +++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala index a4e069d652b..3cfb0e399f1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala @@ -753,7 +753,7 @@ case class ArrayAggregate( var i = 0 while (i < arr.numElements()) { elementVar.value.set(arr.get(i, elementVar.dataType)) -accForMergeVar.value.set(mergeForEval.eval(input)) + accForMergeVar.value.set(InternalRow.copyValue(mergeForEval.eval(input))) i += 1 } accForFinishVar.value.set(accForMergeVar.value.get) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index 8730aeb91fb..78dbddc7494 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -2683,6 +2683,25 @@ class DataFrameSuite extends QueryTest checkAnswer(test10, Row(Array(Row("cbaihg"), Row("fedlkj"))) :: Nil) } + test("SPARK-39293: The accumulator of ArrayAggregate to handle complex types properly") { +val reverse = udf((s: String) => s.reverse) + +val df = Seq(Array("abc", "def")).toDF("array") +val testArray = df.select( + aggregate( +col("array"), +array().cast("array"), +(acc, s) => concat(acc, array(reverse(s) +checkAnswer(testArray, Row(Array("cba", "fed")) :: Nil) + +val testMap = df.select( + aggregate( +col("array"), +map().cast("map"), 
+        (acc, s) => map_concat(acc, map(s, reverse(s)
+    checkAnswer(testMap, Row(Map("abc" -> "cba", "def" -> "fed")) :: Nil)
+  }
+
   test("SPARK-35886: PromotePrecision should be subexpr replaced") {
     withTable("tbl") {
       sql(

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.2 updated: [SPARK-39293][SQL] Fix the accumulator of ArrayAggregate to handle complex types properly
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.2 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.2 by this push: new 7b33e39d64c [SPARK-39293][SQL] Fix the accumulator of ArrayAggregate to handle complex types properly 7b33e39d64c is described below commit 7b33e39d64c6a77b30d7c7957a4f2c6f14899644 Author: Takuya UESHIN AuthorDate: Thu May 26 10:36:03 2022 +0900 [SPARK-39293][SQL] Fix the accumulator of ArrayAggregate to handle complex types properly ### What changes were proposed in this pull request? Fix the accumulator of `ArrayAggregate` to handle complex types properly. The accumulator of `ArrayAggregate` should copy the intermediate result if string, struct, array, or map. ### Why are the changes needed? If the intermediate data of `ArrayAggregate` holds reusable data, the result will be duplicated. ```scala import org.apache.spark.sql.functions._ val reverse = udf((s: String) => s.reverse) val df = Seq(Array("abc", "def")).toDF("array") val testArray = df.withColumn( "agg", aggregate( col("array"), array().cast("array"), (acc, s) => concat(acc, array(reverse(s) aggArray.show(truncate=false) ``` should be: ``` +--+--+ |array |agg | +--+--+ |[abc, def]|[cba, fed]| +--+--+ ``` but: ``` +--+--+ |array |agg | +--+--+ |[abc, def]|[fed, fed]| +--+--+ ``` ### Does this PR introduce _any_ user-facing change? Yes, this fixes the correctness issue. ### How was this patch tested? Added a test. Closes #36674 from ueshin/issues/SPARK-39293/array_aggregate. Authored-by: Takuya UESHIN Signed-off-by: Hyukjin Kwon (cherry picked from commit d6a11cb4b411c8136eb241aac167bc96990f5421) Signed-off-by: Hyukjin Kwon --- .../catalyst/expressions/higherOrderFunctions.scala | 2 +- .../scala/org/apache/spark/sql/DataFrameSuite.scala | 19 +++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala index bbcd3b49572..0ec817836a5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala @@ -829,7 +829,7 @@ case class ArrayAggregate( var i = 0 while (i < arr.numElements()) { elementVar.value.set(arr.get(i, elementVar.dataType)) -accForMergeVar.value.set(mergeForEval.eval(input)) + accForMergeVar.value.set(InternalRow.copyValue(mergeForEval.eval(input))) i += 1 } accForFinishVar.value.set(accForMergeVar.value.get) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index d9b75c7794b..e9483f84ac3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -2852,6 +2852,25 @@ class DataFrameSuite extends QueryTest checkAnswer(test10, Row(Array(Row("cbaihg"), Row("fedlkj"))) :: Nil) } + test("SPARK-39293: The accumulator of ArrayAggregate to handle complex types properly") { +val reverse = udf((s: String) => s.reverse) + +val df = Seq(Array("abc", "def")).toDF("array") +val testArray = df.select( + aggregate( +col("array"), +array().cast("array"), +(acc, s) => concat(acc, array(reverse(s) +checkAnswer(testArray, Row(Array("cba", "fed")) :: Nil) + +val 
testMap = df.select(
+      aggregate(
+        col("array"),
+        map().cast("map"),
+        (acc, s) => map_concat(acc, map(s, reverse(s)
+    checkAnswer(testMap, Row(Map("abc" -> "cba", "def" -> "fed")) :: Nil)
+  }
+
   test("SPARK-34882: Aggregate with multiple distinct null sensitive aggregators") {
     withUserDefinedFunction(("countNulls", true)) {
       spark.udf.register("countNulls", udaf(new Aggregator[JLong, JLong, JLong] {

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.3 updated: [SPARK-39293][SQL] Fix the accumulator of ArrayAggregate to handle complex types properly
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.3 in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/branch-3.3 by this push: new 92e82fdf8e2 [SPARK-39293][SQL] Fix the accumulator of ArrayAggregate to handle complex types properly 92e82fdf8e2 is described below commit 92e82fdf8e2faec5add61e2448f11272dfb19c6e Author: Takuya UESHIN AuthorDate: Thu May 26 10:36:03 2022 +0900 [SPARK-39293][SQL] Fix the accumulator of ArrayAggregate to handle complex types properly ### What changes were proposed in this pull request? Fix the accumulator of `ArrayAggregate` to handle complex types properly. The accumulator of `ArrayAggregate` should copy the intermediate result if string, struct, array, or map. ### Why are the changes needed? If the intermediate data of `ArrayAggregate` holds reusable data, the result will be duplicated. ```scala import org.apache.spark.sql.functions._ val reverse = udf((s: String) => s.reverse) val df = Seq(Array("abc", "def")).toDF("array") val testArray = df.withColumn( "agg", aggregate( col("array"), array().cast("array"), (acc, s) => concat(acc, array(reverse(s) aggArray.show(truncate=false) ``` should be: ``` +--+--+ |array |agg | +--+--+ |[abc, def]|[cba, fed]| +--+--+ ``` but: ``` +--+--+ |array |agg | +--+--+ |[abc, def]|[fed, fed]| +--+--+ ``` ### Does this PR introduce _any_ user-facing change? Yes, this fixes the correctness issue. ### How was this patch tested? Added a test. Closes #36674 from ueshin/issues/SPARK-39293/array_aggregate. Authored-by: Takuya UESHIN Signed-off-by: Hyukjin Kwon (cherry picked from commit d6a11cb4b411c8136eb241aac167bc96990f5421) Signed-off-by: Hyukjin Kwon --- .../catalyst/expressions/higherOrderFunctions.scala | 2 +- .../scala/org/apache/spark/sql/DataFrameSuite.scala | 19 +++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala index f9b2ade9a60..fa444a670f2 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala @@ -826,7 +826,7 @@ case class ArrayAggregate( var i = 0 while (i < arr.numElements()) { elementVar.value.set(arr.get(i, elementVar.dataType)) -accForMergeVar.value.set(mergeForEval.eval(input)) + accForMergeVar.value.set(InternalRow.copyValue(mergeForEval.eval(input))) i += 1 } accForFinishVar.value.set(accForMergeVar.value.get) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index d16416d600d..728ba3d6456 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -2933,6 +2933,25 @@ class DataFrameSuite extends QueryTest checkAnswer(test10, Row(Array(Row("cbaihg"), Row("fedlkj"))) :: Nil) } + test("SPARK-39293: The accumulator of ArrayAggregate to handle complex types properly") { +val reverse = udf((s: String) => s.reverse) + +val df = Seq(Array("abc", "def")).toDF("array") +val testArray = df.select( + aggregate( +col("array"), +array().cast("array"), +(acc, s) => concat(acc, array(reverse(s) +checkAnswer(testArray, Row(Array("cba", "fed")) :: Nil) + +val 
testMap = df.select(
+      aggregate(
+        col("array"),
+        map().cast("map"),
+        (acc, s) => map_concat(acc, map(s, reverse(s)
+    checkAnswer(testMap, Row(Map("abc" -> "cba", "def" -> "fed")) :: Nil)
+  }
+
   test("SPARK-34882: Aggregate with multiple distinct null sensitive aggregators") {
     withUserDefinedFunction(("countNulls", true)) {
       spark.udf.register("countNulls", udaf(new Aggregator[JLong, JLong, JLong] {

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-39293][SQL] Fix the accumulator of ArrayAggregate to handle complex types properly
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new d6a11cb4b41 [SPARK-39293][SQL] Fix the accumulator of ArrayAggregate to handle complex types properly d6a11cb4b41 is described below commit d6a11cb4b411c8136eb241aac167bc96990f5421 Author: Takuya UESHIN AuthorDate: Thu May 26 10:36:03 2022 +0900 [SPARK-39293][SQL] Fix the accumulator of ArrayAggregate to handle complex types properly ### What changes were proposed in this pull request? Fix the accumulator of `ArrayAggregate` to handle complex types properly. The accumulator of `ArrayAggregate` should copy the intermediate result if string, struct, array, or map. ### Why are the changes needed? If the intermediate data of `ArrayAggregate` holds reusable data, the result will be duplicated. ```scala import org.apache.spark.sql.functions._ val reverse = udf((s: String) => s.reverse) val df = Seq(Array("abc", "def")).toDF("array") val testArray = df.withColumn( "agg", aggregate( col("array"), array().cast("array"), (acc, s) => concat(acc, array(reverse(s) aggArray.show(truncate=false) ``` should be: ``` +--+--+ |array |agg | +--+--+ |[abc, def]|[cba, fed]| +--+--+ ``` but: ``` +--+--+ |array |agg | +--+--+ |[abc, def]|[fed, fed]| +--+--+ ``` ### Does this PR introduce _any_ user-facing change? Yes, this fixes the correctness issue. ### How was this patch tested? Added a test. Closes #36674 from ueshin/issues/SPARK-39293/array_aggregate. Authored-by: Takuya UESHIN Signed-off-by: Hyukjin Kwon --- .../catalyst/expressions/higherOrderFunctions.scala | 2 +- .../scala/org/apache/spark/sql/DataFrameSuite.scala | 19 +++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala index 0fa2c3debe0..79b76f799d9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala @@ -826,7 +826,7 @@ case class ArrayAggregate( var i = 0 while (i < arr.numElements()) { elementVar.value.set(arr.get(i, elementVar.dataType)) -accForMergeVar.value.set(mergeForEval.eval(input)) + accForMergeVar.value.set(InternalRow.copyValue(mergeForEval.eval(input))) i += 1 } accForFinishVar.value.set(accForMergeVar.value.get) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index c5c718088f3..d7dc945ae13 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -2957,6 +2957,25 @@ class DataFrameSuite extends QueryTest checkAnswer(test10, Row(Array(Row("cbaihg"), Row("fedlkj"))) :: Nil) } + test("SPARK-39293: The accumulator of ArrayAggregate to handle complex types properly") { +val reverse = udf((s: String) => s.reverse) + +val df = Seq(Array("abc", "def")).toDF("array") +val testArray = df.select( + aggregate( +col("array"), +array().cast("array"), +(acc, s) => concat(acc, array(reverse(s) +checkAnswer(testArray, Row(Array("cba", "fed")) :: Nil) + +val testMap = df.select( + aggregate( +col("array"), +map().cast("map"), +(acc, s) => map_concat(acc, map(s, 
reverse(s)
+    checkAnswer(testMap, Row(Map("abc" -> "cba", "def" -> "fed")) :: Nil)
+  }
+
   test("SPARK-34882: Aggregate with multiple distinct null sensitive aggregators") {
     withUserDefinedFunction(("countNulls", true)) {
       spark.udf.register("countNulls", udaf(new Aggregator[JLong, JLong, JLong] {

-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
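The Scala repro above has a close PySpark analogue via `pyspark.sql.functions.aggregate` (available since Spark 3.1). Note that this sketch substitutes the built-in string `reverse` for the Scala UDF used in the commit, so it shows the same fold shape and the expected post-fix result rather than reproducing the pre-fix duplication, which needed a UDF's reusable output buffer:

```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(["abc", "def"],)], ["array"])

# Fold the array column into a new array of transformed elements, mirroring the
# shape of the test added in this commit.
result = df.select(
    F.aggregate(
        F.col("array"),
        F.array().cast("array<string>"),
        lambda acc, s: F.concat(acc, F.array(F.reverse(s))),
    ).alias("agg")
)
result.show(truncate=False)  # expected: [cba, fed]
```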
[spark] branch branch-3.3 updated (37a2416ca4c -> 6c4e07dbe38)
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a change to branch branch-3.3 in repository https://gitbox.apache.org/repos/asf/spark.git from 37a2416ca4c [SPARK-39252][PYSPARK][TESTS] Remove flaky test_df_is_empty add 6c4e07dbe38 [SPARK-39255][SQL][3.3] Improve error messages No new revisions were added by this update. Summary of changes: core/src/main/resources/error/error-classes.json | 12 ++-- .../spark/sql/errors/QueryCompilationErrors.scala | 4 +- .../apache/spark/sql/errors/QueryErrorsBase.scala | 10 +++- .../spark/sql/errors/QueryExecutionErrors.scala| 2 +- .../apache/spark/sql/types/StructTypeSuite.scala | 22 --- .../resources/sql-tests/results/ansi/cast.sql.out | 68 +++--- .../resources/sql-tests/results/ansi/date.sql.out | 6 +- .../results/ansi/datetime-parsing-invalid.sql.out | 4 +- .../sql-tests/results/ansi/interval.sql.out| 20 +++ .../results/ansi/string-functions.sql.out | 8 +-- .../test/resources/sql-tests/results/pivot.sql.out | 2 +- .../sql-tests/results/postgreSQL/boolean.sql.out | 32 +- .../sql-tests/results/postgreSQL/float4.sql.out| 8 +-- .../sql-tests/results/postgreSQL/float8.sql.out| 8 +-- .../sql-tests/results/postgreSQL/text.sql.out | 4 +- .../results/postgreSQL/window_part2.sql.out| 2 +- .../results/postgreSQL/window_part3.sql.out| 2 +- .../results/postgreSQL/window_part4.sql.out| 2 +- .../results/timestampNTZ/timestamp-ansi.sql.out| 2 +- .../sql-tests/results/udf/udf-pivot.sql.out| 2 +- .../spark/sql/connector/InsertIntoTests.scala | 4 +- 21 files changed, 117 insertions(+), 107 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-39261][CORE] Improve newline formatting for error messages
This is an automated email from the ASF dual-hosted git repository. gengliang pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 11760e37993 [SPARK-39261][CORE] Improve newline formatting for error messages 11760e37993 is described below commit 11760e3799374d65372b5fcdca699f4daa2162e6 Author: Karen Feng AuthorDate: Wed May 25 22:34:08 2022 +0800 [SPARK-39261][CORE] Improve newline formatting for error messages ### What changes were proposed in this pull request? Error messages in the JSON file should not contain newline characters; newlines are delineated as different elements in the array. This PR: - Checks that newline characters do not exist, and improves the formatting of the JSON file so that each array element is on a new line - Checks that messages are trimmed, and improves the formatting of the JSON file by adding spaces after all newlines and between error class messages/subclass messages in the code - Introduces an environment variable to generate a formatted file: `SPARK_GENERATE_GOLDEN_FILES=1 build/sbt "core/testOnly *SparkThrowableSuite"` ### Why are the changes needed? Improves the readability of the error message JSON file. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? UTs Closes #36639 from karenfeng/newline-error-msg. Authored-by: Karen Feng Signed-off-by: Gengliang Wang --- core/src/main/resources/error/error-classes.json | 415 - .../main/scala/org/apache/spark/ErrorInfo.scala| 2 +- .../org/apache/spark/SparkThrowableSuite.scala | 45 ++- .../resources/sql-tests/results/ansi/date.sql.out | 6 +- .../results/ansi/datetime-parsing-invalid.sql.out | 16 +- .../sql-tests/results/ansi/timestamp.sql.out | 12 +- .../test/resources/sql-tests/results/date.sql.out | 6 +- .../results/datetime-formatting-invalid.sql.out| 44 +-- .../results/datetime-parsing-invalid.sql.out | 16 +- .../sql-tests/results/json-functions.sql.out | 4 +- .../resources/sql-tests/results/timestamp.sql.out | 12 +- .../results/timestampNTZ/timestamp-ansi.sql.out| 2 +- .../results/timestampNTZ/timestamp.sql.out | 2 +- .../native/stringCastAndExpressions.sql.out| 6 +- .../sql/errors/QueryExecutionAnsiErrorsSuite.scala | 2 +- .../sql/errors/QueryExecutionErrorsSuite.scala | 10 +- 16 files changed, 428 insertions(+), 172 deletions(-) diff --git a/core/src/main/resources/error/error-classes.json b/core/src/main/resources/error/error-classes.json index 23f99524a7e..e1b95d99ab0 100644 --- a/core/src/main/resources/error/error-classes.json +++ b/core/src/main/resources/error/error-classes.json @@ -1,336 +1,555 @@ { "AMBIGUOUS_FIELD_NAME" : { -"message" : [ "Field name is ambiguous and has matching fields in the struct." ], +"message" : [ + "Field name is ambiguous and has matching fields in the struct." +], "sqlState" : "42000" }, "ARITHMETIC_OVERFLOW" : { -"message" : [ ". If necessary set to \"false\" (except for ANSI interval type) to bypass this error." ], +"message" : [ + ". If necessary set to \"false\" (except for ANSI interval type) to bypass this error." +], "sqlState" : "22003" }, "CANNOT_CAST_DATATYPE" : { -"message" : [ "Cannot cast to ." ], +"message" : [ + "Cannot cast to ." +], "sqlState" : "22005" }, "CANNOT_CHANGE_DECIMAL_PRECISION" : { -"message" : [ " cannot be represented as Decimal(, ). If necessary set to \"false\" to bypass this error." ], +"message" : [ + " cannot be represented as Decimal(, ). 
If necessary set to \"false\" to bypass this error." +], "sqlState" : "22005" }, "CANNOT_PARSE_DECIMAL" : { -"message" : [ "Cannot parse decimal" ], +"message" : [ + "Cannot parse decimal" +], "sqlState" : "42000" }, "CANNOT_UP_CAST_DATATYPE" : { -"message" : [ "Cannot up cast from to .\n" ] +"message" : [ + "Cannot up cast from to .", + "" +] }, "CAST_INVALID_INPUT" : { -"message" : [ "The value of the type cannot be cast to because it is malformed. Correct the value as per the syntax, or change its target type. To return NULL instead, use `try_cast`. If necessary set to \"false\" to bypass this error." ], +"message" : [ + "The value of the type cannot be cast to because it is malformed. Correct the value as per the syntax, or change its target type. To return NULL instead, use `try_cast`. If necessary set to \"false\" to bypass this error." +], "sqlState" : "42000" }, "CAST_OVERFLOW" : { -"message" : [ "The value of the type cannot be cast to due to an overflow. To return NU
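The convention the new test enforces is that an error-class message lives in the JSON file as an array of trimmed, single-line strings and is reassembled with newlines when rendered. A toy sketch of that rendering follows; the error class and its template are invented for illustration, not copied from error-classes.json:

```python
# Illustrative entry following the formatting rules this change checks for:
# one line per array element, no embedded "\n", no untrimmed whitespace.
error_classes = {
    "EXAMPLE_ERROR": {
        "message": [
            "Something went wrong with <objectName>.",
            "If necessary set <config> to \"false\" to bypass this error.",
        ],
        "sqlState": "42000",
    }
}

# Rendering joins the elements with newlines, which is what the
# one-element-per-line JSON layout is meant to mirror.
template = "\n".join(error_classes["EXAMPLE_ERROR"]["message"])
print(template)
```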
[spark] branch master updated: [SPARK-38982][PYTHON][PS][TESTS] Skip categories setter test
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 2172b0f4471 [SPARK-38982][PYTHON][PS][TESTS] Skip categories setter test 2172b0f4471 is described below commit 2172b0f44719cca2ddc799c1d0b18f45a62de990 Author: Yikun Jiang AuthorDate: Wed May 25 20:08:15 2022 +0900 [SPARK-38982][PYTHON][PS][TESTS] Skip categories setter test ### What changes were proposed in this pull request? Since https://github.com/pandas-dev/pandas/commit/126a19d038b65493729e21ca969fbb58dab9a408, pandas changes behavior. Before pandas 1.4, the pandas will refresh dtypes according to categories, since panda 1.4, `categories.setter` dtype refresh will not work. According to https://github.com/pandas-dev/pandas/issues/46820 , the complete support of `categories.setter` will never back. And also only categories is refreshed (but dtype not) is useless behavior so we'd better to only fix test and keep current PS behavior, then remove this setter support when we remove all deprecated methods. ### Why are the changes needed? Make CI passed with pandas 1.4.x ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? test_categories_setter passed with 1.3.X and also 1.4.x Closes #36355 from Yikun/SPARK-38982. Authored-by: Yikun Jiang Signed-off-by: Hyukjin Kwon --- python/pyspark/pandas/tests/indexes/test_category.py | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/pyspark/pandas/tests/indexes/test_category.py b/python/pyspark/pandas/tests/indexes/test_category.py index 14c39aa713c..5122fb076de 100644 --- a/python/pyspark/pandas/tests/indexes/test_category.py +++ b/python/pyspark/pandas/tests/indexes/test_category.py @@ -82,7 +82,11 @@ class CategoricalIndexTest(PandasOnSparkTestCase, TestUtils): pidx.categories = ["z", "y", "x"] psidx.categories = ["z", "y", "x"] -if LooseVersion(pd.__version__) >= LooseVersion("1.1"): +# Pandas deprecated all the in-place category-setting behaviors, dtypes also not be +# refreshed in categories.setter since Pandas 1.4+, we should also consider to clean up +# this test when in-place category-setting removed: +# https://github.com/pandas-dev/pandas/issues/46820 +if LooseVersion("1.4") >= LooseVersion(pd.__version__) >= LooseVersion("1.1"): self.assert_eq(pidx, psidx) self.assert_eq(pdf, psdf) else: - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
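For context, the in-place `categories` setter exercised by this test was deprecated in pandas 1.4, where it also stopped refreshing the dtype, which is why the equality assertion is now limited to pandas 1.1 through 1.4. A condensed sketch of what the test does, using standalone indexes rather than the DataFrame-derived ones in the suite and assuming pandas < 2.0 (where the setter still exists):

```python
import pandas as pd
import pyspark.pandas as ps

pidx = pd.CategoricalIndex(["x", "y", "z"])
psidx = ps.CategoricalIndex(["x", "y", "z"])

# In-place category assignment: deprecated since pandas 1.4, and its
# dtype-refreshing side effect changed there, so pandas-on-Spark keeps its
# existing behavior and the comparison only runs on pandas 1.1 through 1.4.
pidx.categories = ["z", "y", "x"]
psidx.categories = ["z", "y", "x"]

print(pidx)
print(psidx.to_pandas())
```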
[spark] branch master updated: [SPARK-39272][SQL] Increase the start position of query context by 1
This is an automated email from the ASF dual-hosted git repository. gengliang pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 4f567f49cb1 [SPARK-39272][SQL] Increase the start position of query context by 1 4f567f49cb1 is described below commit 4f567f49cb19007b9adcce850f5f309c02375ac3 Author: Gengliang Wang AuthorDate: Wed May 25 17:32:45 2022 +0800 [SPARK-39272][SQL] Increase the start position of query context by 1 ### What changes were proposed in this pull request? Increase the start position of query context by 1 ### Why are the changes needed? Currently, the line number starts from 1, while the start position starts from 0. Thus it's better to increase the start position by 1 for consistency. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? UT Closes #36651 from gengliangwang/increase1. Authored-by: Gengliang Wang Signed-off-by: Gengliang Wang --- .../apache/spark/sql/catalyst/trees/TreeNode.scala | 4 +- .../spark/sql/catalyst/trees/TreeNodeSuite.scala | 2 +- .../resources/sql-tests/results/ansi/cast.sql.out | 70 +++--- .../resources/sql-tests/results/ansi/date.sql.out | 6 +- .../results/ansi/datetime-parsing-invalid.sql.out | 4 +- .../ansi/decimalArithmeticOperations.sql.out | 20 +++ .../sql-tests/results/ansi/interval.sql.out| 34 +-- .../resources/sql-tests/results/ansi/map.sql.out | 8 +-- .../results/ansi/string-functions.sql.out | 8 +-- .../resources/sql-tests/results/interval.sql.out | 12 ++-- .../sql-tests/results/postgreSQL/boolean.sql.out | 32 +- .../sql-tests/results/postgreSQL/float4.sql.out| 8 +-- .../sql-tests/results/postgreSQL/float8.sql.out| 8 +-- .../sql-tests/results/postgreSQL/int4.sql.out | 12 ++-- .../sql-tests/results/postgreSQL/int8.sql.out | 14 ++--- .../results/postgreSQL/select_having.sql.out | 2 +- .../sql-tests/results/postgreSQL/text.sql.out | 4 +- .../results/postgreSQL/window_part2.sql.out| 2 +- .../results/postgreSQL/window_part3.sql.out| 2 +- .../results/postgreSQL/window_part4.sql.out| 2 +- .../udf/postgreSQL/udf-select_having.sql.out | 2 +- .../sql/errors/QueryExecutionAnsiErrorsSuite.scala | 8 +-- 22 files changed, 133 insertions(+), 131 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala index 54c64515ee4..fcbebf3ac7a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/trees/TreeNode.scala @@ -79,7 +79,9 @@ case class Origin( "" } else { val positionContext = if (line.isDefined && startPosition.isDefined) { -s"(line ${line.get}, position ${startPosition.get})" +// Note that the line number starts from 1, while the start position starts from 0. +// Here we increase the start position by 1 for consistency. 
+s"(line ${line.get}, position ${startPosition.get + 1})" } else { "" } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala index 899a740bdae..1e1206c0e1e 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/trees/TreeNodeSuite.scala @@ -876,7 +876,7 @@ class TreeNodeSuite extends SparkFunSuite with SQLHelper { objectType = Some("VIEW"), objectName = Some("some_view")) val expected = - """== SQL of VIEW some_view(line 3, position 38) == + """== SQL of VIEW some_view(line 3, position 39) == |...7890 + 1234567890 + 1234567890, cast('a' | |as /* comment */ diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/cast.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/cast.sql.out index 891cd34b7c5..45024dcffa7 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/cast.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/cast.sql.out @@ -9,7 +9,7 @@ struct<> -- !query output org.apache.spark.SparkNumberFormatException [CAST_INVALID_INPUT] The value '1.23' of the type "STRING" cannot be cast to "INT" because it is malformed. Correct the value as per the syntax, or change its target type. To return NULL instead, use `try_cast`. If necessary set "spark.sql.ansi.enabled" to "false" to bypass this error. -== SQL(line 1, position 7) == +== SQL(li
[spark] 02/02: [SPARK-39252][PYSPARK][TESTS] Remove flaky test_df_is_empty
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.1 in repository https://gitbox.apache.org/repos/asf/spark.git commit 3cc5d4a6138c58b1d290748304156dcf2fd63b6d Author: Ivan Sadikov AuthorDate: Wed May 25 11:39:54 2022 +0900 [SPARK-39252][PYSPARK][TESTS] Remove flaky test_df_is_empty This PR removes flaky `test_df_is_empty` as reported in https://issues.apache.org/jira/browse/SPARK-39252. I will open a follow-up PR to reintroduce the test and fix the flakiness (or see if it was a regression). No. Existing unit tests. Closes #36656 from sadikovi/SPARK-39252. Authored-by: Ivan Sadikov Signed-off-by: Hyukjin Kwon (cherry picked from commit 9823bb385cd6dca7c4fb5a6315721420ad42f80a) Signed-off-by: Hyukjin Kwon --- python/pyspark/sql/tests/test_dataframe.py | 36 -- 1 file changed, 36 deletions(-) diff --git a/python/pyspark/sql/tests/test_dataframe.py b/python/pyspark/sql/tests/test_dataframe.py index 6b9ac24d8c1..e3977e81851 100644 --- a/python/pyspark/sql/tests/test_dataframe.py +++ b/python/pyspark/sql/tests/test_dataframe.py @@ -21,7 +21,6 @@ import shutil import tempfile import time import unittest -import uuid from pyspark.sql import SparkSession, Row from pyspark.sql.types import StringType, IntegerType, DoubleType, StructType, StructField, \ @@ -838,41 +837,6 @@ class DataFrameTests(ReusedSQLTestCase): finally: shutil.rmtree(tpath) -def test_df_is_empty(self): -# SPARK-39084: Fix df.rdd.isEmpty() resulting in JVM crash. - -# This particular example of DataFrame reproduces an issue in isEmpty call -# which could result in JVM crash. -data = [] -for t in range(0, 1): -id = str(uuid.uuid4()) -if t == 0: -for i in range(0, 99): -data.append((id,)) -elif t < 10: -for i in range(0, 75): -data.append((id,)) -elif t < 100: -for i in range(0, 50): -data.append((id,)) -elif t < 1000: -for i in range(0, 25): -data.append((id,)) -else: -for i in range(0, 10): -data.append((id,)) - -tmpPath = tempfile.mkdtemp() -shutil.rmtree(tmpPath) -try: -df = self.spark.createDataFrame(data, ["col"]) -df.coalesce(1).write.parquet(tmpPath) - -res = self.spark.read.parquet(tmpPath).groupBy("col").count() -self.assertFalse(res.rdd.isEmpty()) -finally: -shutil.rmtree(tmpPath) - class QueryExecutionListenerTests(unittest.TestCase, SQLTestUtils): # These tests are separate because it uses 'spark.sql.queryExecutionListeners' which is - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.1 updated (ad42284f3f1 -> 3cc5d4a6138)
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a change to branch branch-3.1 in repository https://gitbox.apache.org/repos/asf/spark.git from ad42284f3f1 [SPARK-39252][PYSPARK][TESTS] Remove flaky test_df_is_empty new 661cc5298c3 Revert "[SPARK-39252][PYSPARK][TESTS] Remove flaky test_df_is_empty" new 3cc5d4a6138 [SPARK-39252][PYSPARK][TESTS] Remove flaky test_df_is_empty The 2 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: python/pyspark/sql/tests/test_dataframe.py | 1 - 1 file changed, 1 deletion(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] 01/02: Revert "[SPARK-39252][PYSPARK][TESTS] Remove flaky test_df_is_empty"
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.1 in repository https://gitbox.apache.org/repos/asf/spark.git commit 661cc5298c3a46e7381b1159eb90764ad616c87a Author: Hyukjin Kwon AuthorDate: Wed May 25 16:32:41 2022 +0900 Revert "[SPARK-39252][PYSPARK][TESTS] Remove flaky test_df_is_empty" This reverts commit ad42284f3f1aa3d3d691b95ea76ea8eae535abe2. --- python/pyspark/sql/tests/test_dataframe.py | 37 +- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/python/pyspark/sql/tests/test_dataframe.py b/python/pyspark/sql/tests/test_dataframe.py index 562e5bbda08..6b9ac24d8c1 100644 --- a/python/pyspark/sql/tests/test_dataframe.py +++ b/python/pyspark/sql/tests/test_dataframe.py @@ -21,7 +21,7 @@ import shutil import tempfile import time import unittest -from typing import cast +import uuid from pyspark.sql import SparkSession, Row from pyspark.sql.types import StringType, IntegerType, DoubleType, StructType, StructField, \ @@ -838,6 +838,41 @@ class DataFrameTests(ReusedSQLTestCase): finally: shutil.rmtree(tpath) +def test_df_is_empty(self): +# SPARK-39084: Fix df.rdd.isEmpty() resulting in JVM crash. + +# This particular example of DataFrame reproduces an issue in isEmpty call +# which could result in JVM crash. +data = [] +for t in range(0, 1): +id = str(uuid.uuid4()) +if t == 0: +for i in range(0, 99): +data.append((id,)) +elif t < 10: +for i in range(0, 75): +data.append((id,)) +elif t < 100: +for i in range(0, 50): +data.append((id,)) +elif t < 1000: +for i in range(0, 25): +data.append((id,)) +else: +for i in range(0, 10): +data.append((id,)) + +tmpPath = tempfile.mkdtemp() +shutil.rmtree(tmpPath) +try: +df = self.spark.createDataFrame(data, ["col"]) +df.coalesce(1).write.parquet(tmpPath) + +res = self.spark.read.parquet(tmpPath).groupBy("col").count() +self.assertFalse(res.rdd.isEmpty()) +finally: +shutil.rmtree(tmpPath) + class QueryExecutionListenerTests(unittest.TestCase, SQLTestUtils): # These tests are separate because it uses 'spark.sql.queryExecutionListeners' which is - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch branch-3.1 updated (3b2c1b916a2 -> ad42284f3f1)
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a change to branch branch-3.1 in repository https://gitbox.apache.org/repos/asf/spark.git from 3b2c1b916a2 [SPARK-39252][PYSPARK][TESTS] Remove flaky test_df_is_empty new a7d0edf800c Revert "[SPARK-39252][PYSPARK][TESTS] Remove flaky test_df_is_empty" new ad42284f3f1 [SPARK-39252][PYSPARK][TESTS] Remove flaky test_df_is_empty The 2 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: python/pyspark/sql/tests/test_dataframe.py | 79 -- 1 file changed, 79 deletions(-) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] 01/02: Revert "[SPARK-39252][PYSPARK][TESTS] Remove flaky test_df_is_empty"
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.1 in repository https://gitbox.apache.org/repos/asf/spark.git commit a7d0edf800cae9dbe7c65f84178baf7386f8a29a Author: Hyukjin Kwon AuthorDate: Wed May 25 16:28:31 2022 +0900 Revert "[SPARK-39252][PYSPARK][TESTS] Remove flaky test_df_is_empty" This reverts commit 3b2c1b916a29fef0463dc3a6d9df2e46a91cf446. --- python/pyspark/sql/tests/test_dataframe.py | 46 +- 1 file changed, 1 insertion(+), 45 deletions(-) diff --git a/python/pyspark/sql/tests/test_dataframe.py b/python/pyspark/sql/tests/test_dataframe.py index 72fed2856a6..6b9ac24d8c1 100644 --- a/python/pyspark/sql/tests/test_dataframe.py +++ b/python/pyspark/sql/tests/test_dataframe.py @@ -21,7 +21,7 @@ import shutil import tempfile import time import unittest -from typing import cast +import uuid from pyspark.sql import SparkSession, Row from pyspark.sql.types import StringType, IntegerType, DoubleType, StructType, StructField, \ @@ -838,7 +838,6 @@ class DataFrameTests(ReusedSQLTestCase): finally: shutil.rmtree(tpath) -<<< HEAD def test_df_is_empty(self): # SPARK-39084: Fix df.rdd.isEmpty() resulting in JVM crash. @@ -873,49 +872,6 @@ class DataFrameTests(ReusedSQLTestCase): self.assertFalse(res.rdd.isEmpty()) finally: shutil.rmtree(tmpPath) -=== -def test_df_show(self): -# SPARK-35408: ensure better diagnostics if incorrect parameters are passed -# to DataFrame.show - -df = self.spark.createDataFrame([("foo",)]) -df.show(5) -df.show(5, True) -df.show(5, 1, True) -df.show(n=5, truncate="1", vertical=False) -df.show(n=5, truncate=1.5, vertical=False) - -with self.assertRaisesRegex(TypeError, "Parameter 'n'"): -df.show(True) -with self.assertRaisesRegex(TypeError, "Parameter 'vertical'"): -df.show(vertical="foo") -with self.assertRaisesRegex(TypeError, "Parameter 'truncate=foo'"): -df.show(truncate="foo") - -@unittest.skipIf( -not have_pandas or not have_pyarrow, -cast(str, pandas_requirement_message or pyarrow_requirement_message), -) -def test_pandas_api(self): -import pandas as pd -from pandas.testing import assert_frame_equal - -sdf = self.spark.createDataFrame([("a", 1), ("b", 2), ("c", 3)], ["Col1", "Col2"]) -psdf_from_sdf = sdf.pandas_api() -psdf_from_sdf_with_index = sdf.pandas_api(index_col="Col1") -pdf = pd.DataFrame({"Col1": ["a", "b", "c"], "Col2": [1, 2, 3]}) -pdf_with_index = pdf.set_index("Col1") - -assert_frame_equal(pdf, psdf_from_sdf.to_pandas()) -assert_frame_equal(pdf_with_index, psdf_from_sdf_with_index.to_pandas()) - -# test for SPARK-36337 -def test_create_nan_decimal_dataframe(self): -self.assertEqual( -self.spark.createDataFrame(data=[Decimal("NaN")], schema="decimal").collect(), -[Row(value=None)], -) ->>> 9823bb385cd ([SPARK-39252][PYSPARK][TESTS] Remove flaky test_df_is_empty) class QueryExecutionListenerTests(unittest.TestCase, SQLTestUtils): - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] 02/02: [SPARK-39252][PYSPARK][TESTS] Remove flaky test_df_is_empty
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.1 in repository https://gitbox.apache.org/repos/asf/spark.git commit ad42284f3f1aa3d3d691b95ea76ea8eae535abe2 Author: Ivan Sadikov AuthorDate: Wed May 25 11:39:54 2022 +0900 [SPARK-39252][PYSPARK][TESTS] Remove flaky test_df_is_empty This PR removes flaky `test_df_is_empty` as reported in https://issues.apache.org/jira/browse/SPARK-39252. I will open a follow-up PR to reintroduce the test and fix the flakiness (or see if it was a regression). No. Existing unit tests. Closes #36656 from sadikovi/SPARK-39252. Authored-by: Ivan Sadikov Signed-off-by: Hyukjin Kwon (cherry picked from commit 9823bb385cd6dca7c4fb5a6315721420ad42f80a) Signed-off-by: Hyukjin Kwon --- python/pyspark/sql/tests/test_dataframe.py | 37 +- 1 file changed, 1 insertion(+), 36 deletions(-) diff --git a/python/pyspark/sql/tests/test_dataframe.py b/python/pyspark/sql/tests/test_dataframe.py index 6b9ac24d8c1..562e5bbda08 100644 --- a/python/pyspark/sql/tests/test_dataframe.py +++ b/python/pyspark/sql/tests/test_dataframe.py @@ -21,7 +21,7 @@ import shutil import tempfile import time import unittest -import uuid +from typing import cast from pyspark.sql import SparkSession, Row from pyspark.sql.types import StringType, IntegerType, DoubleType, StructType, StructField, \ @@ -838,41 +838,6 @@ class DataFrameTests(ReusedSQLTestCase): finally: shutil.rmtree(tpath) -def test_df_is_empty(self): -# SPARK-39084: Fix df.rdd.isEmpty() resulting in JVM crash. - -# This particular example of DataFrame reproduces an issue in isEmpty call -# which could result in JVM crash. -data = [] -for t in range(0, 1): -id = str(uuid.uuid4()) -if t == 0: -for i in range(0, 99): -data.append((id,)) -elif t < 10: -for i in range(0, 75): -data.append((id,)) -elif t < 100: -for i in range(0, 50): -data.append((id,)) -elif t < 1000: -for i in range(0, 25): -data.append((id,)) -else: -for i in range(0, 10): -data.append((id,)) - -tmpPath = tempfile.mkdtemp() -shutil.rmtree(tmpPath) -try: -df = self.spark.createDataFrame(data, ["col"]) -df.coalesce(1).write.parquet(tmpPath) - -res = self.spark.read.parquet(tmpPath).groupBy("col").count() -self.assertFalse(res.rdd.isEmpty()) -finally: -shutil.rmtree(tmpPath) - class QueryExecutionListenerTests(unittest.TestCase, SQLTestUtils): # These tests are separate because it uses 'spark.sql.queryExecutionListeners' which is - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org