[spark] branch master updated: [SPARK-44024][SQL] Change to use `map` when `unzip` is only used to extract a single element
This is an automated email from the ASF dual-hosted git repository. srowen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new ad6cd60ca74 [SPARK-44024][SQL] Change to use `map` when `unzip` only used to extract a single element ad6cd60ca74 is described below commit ad6cd60ca7408018d8c6259597456e9c2fe8b376 Author: yangjie01 AuthorDate: Sun Jun 18 07:19:56 2023 -0500 [SPARK-44024][SQL] Change to use `map` when `unzip` only used to extract a single element ### What changes were proposed in this pull request? A minor code simplification, use `map` instead of `unzip` when `unzip` only used to extract a single element. ### Why are the changes needed? Code simplification ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Pass GitHub Actions Closes #41548 from LuciferYang/SPARK-44024. Lead-authored-by: yangjie01 Co-authored-by: YangJie Signed-off-by: Sean Owen --- .../scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala | 2 +- .../apache/spark/sql/execution/datasources/v2/CreateIndexExec.scala | 2 +- .../spark/sql/execution/datasources/v2/V2ScanRelationPushDown.scala | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index 568e3d30e34..c70dba01808 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -861,7 +861,7 @@ object ColumnPruning extends Rule[LogicalPlan] { val newProjects = e.projections.map { proj => proj.zip(e.output).filter { case (_, a) => newOutput.contains(a) -}.unzip._1 +}.map(_._1) } a.copy(child = Expand(newProjects, newOutput, grandChild)) diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CreateIndexExec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CreateIndexExec.scala index 20ccf991af6..8dac6737334 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CreateIndexExec.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/CreateIndexExec.scala @@ -52,7 +52,7 @@ case class CreateIndexExec( } try { table.createIndex( -indexName, columns.unzip._1.toArray, colProperties, propertiesWithIndexType.asJava) +indexName, columns.map(_._1).toArray, colProperties, propertiesWithIndexType.asJava) } catch { case _: IndexAlreadyExistsException if ignoreIfExists => logWarning(s"Index $indexName already exists in table ${table.name}. Ignoring.") diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2ScanRelationPushDown.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2ScanRelationPushDown.scala index 49a6c7232ec..e58fe7844ab 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2ScanRelationPushDown.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/V2ScanRelationPushDown.scala @@ -192,11 +192,11 @@ object V2ScanRelationPushDown extends Rule[LogicalPlan] with PredicateHelper { val groupOutputMap = normalizedGroupingExpr.zipWithIndex.map { case (e, i) => AttributeReference(s"group_col_$i", e.dataType)() -> e } - val groupOutput = groupOutputMap.unzip._1 + val groupOutput = groupOutputMap.map(_._1) val aggOutputMap = finalAggExprs.zipWithIndex.map { case (e, i) => AttributeReference(s"agg_func_$i", e.dataType)() -> e } - val aggOutput = aggOutputMap.unzip._1 + val aggOutput = aggOutputMap.map(_._1) val newOutput = groupOutput ++ aggOutput val groupByExprToOutputOrdinal = mutable.HashMap.empty[Expression, Int] normalizedGroupingExpr.zipWithIndex.foreach { case (expr, ordinal) => - To 
unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-44089][SQL][TESTS] Remove the `@ignore` identifier from `AlterTableRenamePartitionSuite`
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new d05091eb8f0 [SPARK-44089][SQL][TESTS] Remove the `@ignore` identifier from `AlterTableRenamePartitionSuite` d05091eb8f0 is described below commit d05091eb8f0f6ee1398fae90fd7b593ac3314e44 Author: yangjie01 AuthorDate: Sun Jun 18 17:24:36 2023 +0300 [SPARK-44089][SQL][TESTS] Remove the `@ignore` identifier from `AlterTableRenamePartitionSuite` ### What changes were proposed in this pull request? https://github.com/apache/spark/pull/41533 ignored `AlterTableRenamePartitionSuite` to try to restore the stability of the `sql-others` test task, but it seems that it was not the root cause affecting stability, so this PR removes the previously added `ignore` identifier to restore testing. ### Why are the changes needed? Resume testing of `AlterTableRenamePartitionSuite` ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Should monitor CI Closes #41647 from LuciferYang/SPARK-44089.
Authored-by: yangjie01 Signed-off-by: Max Gekk --- .../sql/execution/command/v2/AlterTableRenamePartitionSuite.scala | 3 --- 1 file changed, 3 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AlterTableRenamePartitionSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AlterTableRenamePartitionSuite.scala index 764596685b5..bb06818da48 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AlterTableRenamePartitionSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/v2/AlterTableRenamePartitionSuite.scala @@ -17,8 +17,6 @@ package org.apache.spark.sql.execution.command.v2 -import org.scalatest.Ignore - import org.apache.spark.sql.Row import org.apache.spark.sql.execution.command @@ -26,7 +24,6 @@ import org.apache.spark.sql.execution.command * The class contains tests for the `ALTER TABLE .. RENAME PARTITION` command * to check V2 table catalogs. */ -@Ignore class AlterTableRenamePartitionSuite extends command.AlterTableRenamePartitionSuiteBase with CommandSuiteBase { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-44093][SQL][TESTS] Make `catalyst` module pass in Java 21
This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new cbfc920c2d7 [SPARK-44093][SQL][TESTS] Make `catalyst` module passes in Java 21 cbfc920c2d7 is described below commit cbfc920c2d75451e898ff5e00622a2af4eed3709 Author: Dongjoon Hyun AuthorDate: Sun Jun 18 17:34:08 2023 +0300 [SPARK-44093][SQL][TESTS] Make `catalyst` module passes in Java 21 ### What changes were proposed in this pull request? This PR aims to make `catalyst` module passes in Java 21. ### Why are the changes needed? https://bugs.openjdk.org/browse/JDK-8267125 changes the error message at Java 18. **JAVA** ``` $ java -version openjdk version "21-ea" 2023-09-19 OpenJDK Runtime Environment (build 21-ea+27-2343) OpenJDK 64-Bit Server VM (build 21-ea+27-2343, mixed mode, sharing) ``` **BEFORE** ``` $ build/sbt "catalyst/test" ... [info] *** 1 TEST FAILED *** [error] Failed: Total 7122, Failed 1, Errors 0, Passed 7121, Ignored 5, Canceled 1 [error] Failed tests: [error] org.apache.spark.sql.catalyst.expressions.ExpressionImplUtilsSuite [error] (catalyst / Test / test) sbt.TestsFailedException: Tests unsuccessful [error] Total time: 212 s (03:32), completed Jun 18, 2023, 1:11:17 AM ``` **AFTER** ``` $ build/sbt "catalyst/test" ... [info] All tests passed. [info] Passed: Total 7122, Failed 0, Errors 0, Passed 7122, Ignored 5, Canceled 1 [success] Total time: 213 s (03:33), completed Jun 18, 2023, 1:15:37 AM ``` ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Pass the CIs and manual test on Java 21. Closes #41649 from dongjoon-hyun/SPARK-44093. 
Authored-by: Dongjoon Hyun Signed-off-by: Max Gekk --- .../sql/catalyst/expressions/ExpressionImplUtilsSuite.scala | 12 ++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/test/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtilsSuite.scala b/sql/catalyst/src/test/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtilsSuite.scala index 3b0dd82c173..4b33f9bc527 100644 --- a/sql/catalyst/src/test/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtilsSuite.scala +++ b/sql/catalyst/src/test/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtilsSuite.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.catalyst.expressions +import org.apache.commons.lang3.{JavaVersion, SystemUtils} + import org.apache.spark.{SparkFunSuite, SparkRuntimeException} import org.apache.spark.unsafe.types.UTF8String @@ -285,6 +287,12 @@ class ExpressionImplUtilsSuite extends SparkFunSuite { } } + // JDK-8267125 changes tag error message at Java 18 + val msgTagMismatch = if (SystemUtils.isJavaVersionAtMost(JavaVersion.JAVA_17)) { +"Tag mismatch!" + } else { +"Tag mismatch" + } val corruptedCiphertexts = Seq( // This is truncated TestCase( @@ -310,7 +318,7 @@ class ExpressionImplUtilsSuite extends SparkFunSuite { errorParamsMap = Map( "parameter" -> "`expr`, `key`", "functionName" -> "`aes_encrypt`/`aes_decrypt`", -"detailMessage" -> "Tag mismatch!" +"detailMessage" -> msgTagMismatch ) ), // Valid ciphertext, wrong AAD @@ -324,7 +332,7 @@ class ExpressionImplUtilsSuite extends SparkFunSuite { errorParamsMap = Map( "parameter" -> "`expr`, `key`", "functionName" -> "`aes_encrypt`/`aes_decrypt`", -"detailMessage" -> "Tag mismatch!" +"detailMessage" -> msgTagMismatch ) ) ) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-44094][BUILD] Upgrade Apache Arrow to 12.0.1
This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 9054e46b12c [SPARK-44094][BUILD] Upgrade Apache Arrow to 12.0.1 9054e46b12c is described below commit 9054e46b12cb0798f5b50727968257e0cf4356da Author: Dongjoon Hyun AuthorDate: Sun Jun 18 13:27:39 2023 -0700 [SPARK-44094][BUILD] Upgrade Apache Arrow to 12.0.1 ### What changes were proposed in this pull request? This PR aims to upgrade Apache Arrow to 12.0.1. ### Why are the changes needed? This brings the following bug fixes. - https://arrow.apache.org/release/12.0.1.html ### Does this PR introduce _any_ user-facing change? No. SPARK-43446 introduced Apache Arrow 12.0.0 for Apache Spark 3.5.0. ### How was this patch tested? Pass the CIs. Closes #41650 from dongjoon-hyun/SPARK-44094. Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun --- dev/deps/spark-deps-hadoop-3-hive-2.3 | 8 pom.xml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dev/deps/spark-deps-hadoop-3-hive-2.3 b/dev/deps/spark-deps-hadoop-3-hive-2.3 index 2b97a343de6..e0bbc724990 100644 --- a/dev/deps/spark-deps-hadoop-3-hive-2.3 +++ b/dev/deps/spark-deps-hadoop-3-hive-2.3 @@ -16,10 +16,10 @@ antlr4-runtime/4.9.3//antlr4-runtime-4.9.3.jar aopalliance-repackaged/2.6.1//aopalliance-repackaged-2.6.1.jar arpack/3.0.3//arpack-3.0.3.jar arpack_combined_all/0.1//arpack_combined_all-0.1.jar -arrow-format/12.0.0//arrow-format-12.0.0.jar -arrow-memory-core/12.0.0//arrow-memory-core-12.0.0.jar -arrow-memory-netty/12.0.0//arrow-memory-netty-12.0.0.jar -arrow-vector/12.0.0//arrow-vector-12.0.0.jar +arrow-format/12.0.1//arrow-format-12.0.1.jar +arrow-memory-core/12.0.1//arrow-memory-core-12.0.1.jar +arrow-memory-netty/12.0.1//arrow-memory-netty-12.0.1.jar +arrow-vector/12.0.1//arrow-vector-12.0.1.jar 
audience-annotations/0.5.0//audience-annotations-0.5.0.jar avro-ipc/1.11.1//avro-ipc-1.11.1.jar avro-mapred/1.11.1//avro-mapred-1.11.1.jar diff --git a/pom.xml b/pom.xml index f84e1094e7d..a5f6d5f9981 100644 --- a/pom.xml +++ b/pom.xml @@ -217,7 +217,7 @@ If you are changing Arrow version specification, please check ./python/pyspark/sql/pandas/utils.py, and ./python/setup.py too. --> -12.0.0 +12.0.1 2.5.9 - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-43979][SQL][FOLLOW-UP] Simplify metrics plan should replace nodes by new attributes
This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new b676a2d9a67 [SPARK-43979][SQL][FOLLOW-UP] Simplify metrics plan should replace nodes by new attributes b676a2d9a67 is described below commit b676a2d9a67c121c563979f93605d1215473ae32 Author: Rui Wang AuthorDate: Sun Jun 18 13:45:46 2023 -0700 [SPARK-43979][SQL][FOLLOW-UP] Simplify metrics plan should replace nodes by new attributes ### What changes were proposed in this pull request? https://github.com/apache/spark/pull/41475 introduces a fix that we remove extra alias-only project which might cause same metrics mismatch over the query plan. However, to make it more robust, we need to replace the attributes if we need to drop the extra Project. ### Why are the changes needed? Enhance the fix to cover more test case. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? UT Closes #41620 from amaliujia/fix_json_followup. Authored-by: Rui Wang Signed-off-by: Dongjoon Hyun --- .../spark/sql/catalyst/analysis/CheckAnalysis.scala | 6 +++--- .../test/scala/org/apache/spark/sql/SQLQuerySuite.scala | 16 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index d3dc9a75dd5..e47966f1e27 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -1081,7 +1081,7 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog with QueryErrorsB * duplicates metric definition. 
*/ private def simplifyPlanForCollectedMetrics(plan: LogicalPlan): LogicalPlan = { -plan.resolveOperatorsDown { +plan.resolveOperatorsUpWithNewOutput { case p: Project if p.projectList.size == p.child.output.size => val assignExprIdOnly = p.projectList.zip(p.child.output).forall { case (left: Alias, right: Attribute) => @@ -1089,9 +1089,9 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog with QueryErrorsB case _ => false } if (assignExprIdOnly) { - p.child + (p.child, p.output.zip(p.child.output)) } else { - p + (p, Nil) } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala index 7b4a4a52a85..381c7714402 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala @@ -4682,6 +4682,22 @@ class SQLQuerySuite extends QueryTest with SharedSparkSession with AdaptiveSpark """ ).observe("my_event", count("*")) df1.crossJoin(df1) + +val df2 = spark.sql( + """ + WITH t1 AS ( + SELECT customer_id, age, row_number() OVER(PARTITION BY customer_id ORDER BY age ASC) rn + FROM tmp_view) + SELECT customer_id, age FROM t1 WHERE rn = 1 + """.stripMargin +).observe("my_event2", count("*")).as("df2") + +val df3 = spark.range(1, 5).toDF("id").withColumn("zaak_id", lit(1)) + .withColumn("targetid", lit(2)).as("df3") +val df4 = spark.range(1, 5).toDF("id").withColumn("zaak_id", lit(2)).as("df4") +val df5 = df4.join(df2, col("df4.id") === col("df2.customer_id"), "inner") +val df6 = df3.join(df2, col("df3.zaak_id") === col("df2.customer_id"), "outer") +df5.crossJoin(df6) } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-44034][TESTS] Add a new test group for sql module
This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 1379680be8d [SPARK-44034][TESTS] Add a new test group for sql module 1379680be8d is described below commit 1379680be8d54be466c5b80b0fc578126206e77e Author: yangjie01 AuthorDate: Sun Jun 18 13:50:36 2023 -0700 [SPARK-44034][TESTS] Add a new test group for sql module ### What changes were proposed in this pull request? The purpose of this pr is to add a new test tag `SlowSQLTest` to the sql module, and identified some Suites with test cases more than 3 seconds, and apply it to GA testing task to reduce the testing pressure of the `sql others` group. For branches-3.3 and branches-3.4, a tag that will not appear in the sql module was assigned to the new test group to avoid `java.lang.ClassNotFoundException` and make this group build only without running any tests. ### Why are the changes needed? For a long time, the sql module UTs has only two groups: `slow` and `others`. The test cases in group `slow` are fixed, while the number of test cases in group `others` continues to increase, which has had a certain impact on the testing duration and stability of group `others`. So this PR proposes to add a new testing group to share the testing pressure of `sql others` group, which has made the testing time of the three groups more average, and hope it can improve the stability of the GA task. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Should monitor GA Closes #41638 from LuciferYang/SPARK-44034-2. 
Authored-by: yangjie01 Signed-off-by: Dongjoon Hyun --- .github/workflows/build_and_test.yml| 17 - .../java/org/apache/spark/tags/SlowSQLTest.java | 21 ++--- .../spark/sql/ApproximatePercentileQuerySuite.scala | 2 ++ .../org/apache/spark/sql/CachedTableSuite.scala | 2 ++ .../apache/spark/sql/DataFrameAsOfJoinSuite.scala | 2 ++ .../scala/org/apache/spark/sql/DataFrameSuite.scala | 2 ++ .../spark/sql/DataFrameWindowFunctionsSuite.scala | 2 ++ .../org/apache/spark/sql/DatasetCacheSuite.scala| 3 ++- .../test/scala/org/apache/spark/sql/JoinSuite.scala | 2 ++ .../WriteDistributionAndOrderingSuite.scala | 2 ++ .../sql/execution/BroadcastExchangeSuite.scala | 1 + .../execution/OptimizeMetadataOnlyQuerySuite.scala | 2 ++ .../spark/sql/execution/QueryExecutionSuite.scala | 3 ++- .../execution/adaptive/AdaptiveQueryExecSuite.scala | 2 ++ .../FileSourceAggregatePushDownSuite.scala | 5 + .../execution/datasources/V1WriteCommandSuite.scala | 2 ++ .../parquet/ParquetRebaseDatetimeSuite.scala| 3 +++ .../datasources/parquet/ParquetRowIndexSuite.scala | 2 ++ .../execution/streaming/state/RocksDBSuite.scala| 2 ++ .../sql/execution/ui/AllExecutionsPageSuite.scala | 2 ++ .../spark/sql/expressions/ExpressionInfoSuite.scala | 2 ++ .../spark/sql/sources/BucketedReadSuite.scala | 2 ++ .../spark/sql/sources/BucketedWriteSuite.scala | 2 ++ .../DisableUnnecessaryBucketedScanSuite.scala | 3 +++ .../streaming/AcceptsLatestSeenOffsetSuite.scala| 2 ++ .../DeprecatedStreamingAggregationSuite.scala | 2 ++ .../sql/streaming/EventTimeWatermarkSuite.scala | 2 ++ .../spark/sql/streaming/FileStreamSinkSuite.scala | 3 +++ .../spark/sql/streaming/FileStreamSourceSuite.scala | 5 - .../spark/sql/streaming/FileStreamStressSuite.scala | 2 ++ ...apGroupsInPandasWithStateDistributionSuite.scala | 2 ++ .../FlatMapGroupsInPandasWithStateSuite.scala | 2 ++ .../FlatMapGroupsWithStateDistributionSuite.scala | 2 ++ .../sql/streaming/FlatMapGroupsWithStateSuite.scala | 3 +++ 
...latMapGroupsWithStateWithInitialStateSuite.scala | 2 ++ .../sql/streaming/MemorySourceStressSuite.scala | 2 ++ .../sql/streaming/MultiStatefulOperatorsSuite.scala | 2 ++ .../spark/sql/streaming/RocksDBStateStoreTest.scala | 2 ++ .../apache/spark/sql/streaming/StreamSuite.scala| 2 ++ .../sql/streaming/StreamingAggregationSuite.scala | 3 +++ .../sql/streaming/StreamingDeduplicationSuite.scala | 3 +++ ...StreamingDeduplicationWithinWatermarkSuite.scala | 2 ++ .../spark/sql/streaming/StreamingJoinSuite.scala| 5 + .../sql/streaming/StreamingQueryListenerSuite.scala | 2 ++ .../sql/streaming/StreamingQueryManagerSuite.scala | 2 ++ .../spark/sql/streaming/StreamingQuerySuite.scala | 2 ++ .../StreamingSessionWindowDistributionSuite.scala | 2 ++ .../sql/streaming/StreamingSessionWindowSuite.scala | 2 ++ ...treamingStateStoreFormatCompatibilitySuite.scala | 2 ++ .../sql/streaming/TriggerAvailableNowSuite.scala| 2 ++ .../ContinuousQueryStatu
[spark] branch master updated: [SPARK-43009][PYTHON][FOLLOWUP] Parameterized `sql_formatter.sql()` with Any constants
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 53dae3d0440 [SPARK-43009][PYTHON][FOLLOWUP] Parameterized `sql_formatter.sql()` with Any constants 53dae3d0440 is described below commit 53dae3d0440f5acad1fd30b17fe27ed208860960 Author: Max Gekk AuthorDate: Mon Jun 19 09:31:50 2023 +0900 [SPARK-43009][PYTHON][FOLLOWUP] Parameterized `sql_formatter.sql()` with Any constants ### What changes were proposed in this pull request? In the PR, I propose to change the API of parameterized SQL, and replace the type of argument values from `string` to `Any` in `sql_formatter`. The language API can accept `Any` objects from which it is possible to construct literal expressions. ### Why are the changes needed? To align the API to PySpark's `sql()`. Also, the current implementation of the parameterized `sql()` requires arguments as string values parsed to SQL literal expressions, which causes the following issues: 1. SQL comments are skipped while parsing, so some fragments of input might be skipped. For example, `'Europe -- Amsterdam'`. In this case, `-- Amsterdam` is excluded from the input. 2. Special chars in string values must be escaped, for instance `'E\'Twaun Moore'` ### Does this PR introduce _any_ user-facing change? Yes. ### How was this patch tested? By running the affected test suite: ``` $ python/run-tests --parallelism=1 --testnames 'pyspark.pandas.sql_formatter' ``` Closes #41644 from MaxGekk/fix-pandas-sql_formatter.
Authored-by: Max Gekk Signed-off-by: Hyukjin Kwon --- python/pyspark/pandas/sql_formatter.py | 16 ++-- 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/python/pyspark/pandas/sql_formatter.py b/python/pyspark/pandas/sql_formatter.py index f87dd3ff29f..4387a1e0909 100644 --- a/python/pyspark/pandas/sql_formatter.py +++ b/python/pyspark/pandas/sql_formatter.py @@ -43,7 +43,7 @@ _CAPTURE_SCOPES = 3 def sql( query: str, index_col: Optional[Union[str, List[str]]] = None, -args: Dict[str, str] = {}, +args: Optional[Dict[str, Any]] = None, **kwargs: Any, ) -> DataFrame: """ @@ -103,10 +103,14 @@ def sql( Also note that the index name(s) should be matched to the existing name. args : dict -A dictionary of parameter names to string values that are parsed as SQL literal -expressions. For example, dict keys: "rank", "name", "birthdate"; dict values: -"1", "'Steven'", "DATE'2023-03-21'". The fragments of string values belonged to SQL -comments are skipped while parsing. +A dictionary of parameter names to Python objects that can be converted to +SQL literal expressions. See +https://spark.apache.org/docs/latest/sql-ref-datatypes.html";> +Supported Data Types for supported value types in Python. +For example, dictionary keys: "rank", "name", "birthdate"; +dictionary values: 1, "Steven", datetime.date(2023, 4, 2). +Dict value can be also a `Column` of literal expression, in that case it is taken as is. + .. versionadded:: 3.4.0 @@ -166,7 +170,7 @@ def sql( And substitude named parameters with the `:` prefix by SQL literals. ->>> ps.sql("SELECT * FROM range(10) WHERE id > :bound1", args={"bound1":"7"}) +>>> ps.sql("SELECT * FROM range(10) WHERE id > :bound1", args={"bound1":7}) id 0 8 1 9 - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [MINOR][PYTHON][PS][TESTS] Rename `k_res` to `ps_res` (drop Koalas reference)
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 7dde27e387e [MINOR][PYTHON][PS][TESTS] Rename `k_res` to `ps_res` (drop Koalas reference) 7dde27e387e is described below commit 7dde27e387eba1bd58cb83b9496c06b4ed2b1f52 Author: Deepyaman Datta AuthorDate: Mon Jun 19 09:33:31 2023 +0900 [MINOR][PYTHON][PS][TESTS] Rename `k_res` to `ps_res` (drop Koalas reference) ### What changes were proposed in this pull request? Rename `k_res` to `ps_res` and `ps` to `pser` in `test_combine.py`. There is no functional change; it's purely stylistic/for consistency. ### Why are the changes needed? As a reader, the variable names are confusing and inconsistent. I only thought that the `k_` prefix meant Koalas because I've contributed to Koalas in the past. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Existing tests should continue to pass; no new tests necessary Closes #41634 from deepyaman/patch-1. 
Authored-by: Deepyaman Datta Signed-off-by: Hyukjin Kwon --- .../pandas/tests/computation/test_combine.py | 28 -- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/python/pyspark/pandas/tests/computation/test_combine.py b/python/pyspark/pandas/tests/computation/test_combine.py index 4c06b63b268..dd55c0fd686 100644 --- a/python/pyspark/pandas/tests/computation/test_combine.py +++ b/python/pyspark/pandas/tests/computation/test_combine.py @@ -148,21 +148,21 @@ class FrameCombineMixin: }, columns=["rkey", "value", "y"], ) -right_ps = pd.Series(list("defghi"), name="x", index=[5, 6, 7, 8, 9, 10]) +right_pser = pd.Series(list("defghi"), name="x", index=[5, 6, 7, 8, 9, 10]) left_psdf = ps.from_pandas(left_pdf) right_psdf = ps.from_pandas(right_pdf) -right_psser = ps.from_pandas(right_ps) +right_psser = ps.from_pandas(right_pser) def check(op, right_psdf=right_psdf, right_pdf=right_pdf): -k_res = op(left_psdf, right_psdf) -k_res = k_res._to_pandas() -k_res = k_res.sort_values(by=list(k_res.columns)) -k_res = k_res.reset_index(drop=True) +ps_res = op(left_psdf, right_psdf) +ps_res = ps_res._to_pandas() +ps_res = ps_res.sort_values(by=list(ps_res.columns)) +ps_res = ps_res.reset_index(drop=True) p_res = op(left_pdf, right_pdf) p_res = p_res.sort_values(by=list(p_res.columns)) p_res = p_res.reset_index(drop=True) -self.assert_eq(k_res, p_res) +self.assert_eq(ps_res, p_res) check(lambda left, right: left.merge(right)) check(lambda left, right: left.merge(right, on="value")) @@ -218,23 +218,25 @@ class FrameCombineMixin: ) # Test Series on the right -check(lambda left, right: left.merge(right), right_psser, right_ps) +check(lambda left, right: left.merge(right), right_psser, right_pser) check( -lambda left, right: left.merge(right, left_on="x", right_on="x"), right_psser, right_ps +lambda left, right: left.merge(right, left_on="x", right_on="x"), +right_psser, +right_pser, ) check( lambda left, right: left.set_index("x").merge(right, left_index=True, right_on="x"), 
right_psser, -right_ps, +right_pser, ) # Test join types with Series for how in ["inner", "left", "right", "outer"]: -check(lambda left, right: left.merge(right, how=how), right_psser, right_ps) +check(lambda left, right: left.merge(right, how=how), right_psser, right_pser) check( lambda left, right: left.merge(right, left_on="x", right_on="x", how=how), right_psser, -right_ps, +right_pser, ) # suffix with Series @@ -247,7 +249,7 @@ class FrameCombineMixin: right_index=True, ), right_psser, -right_ps, +right_pser, ) # multi-index columns - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
[spark] branch master updated: [SPARK-44095][SQL][TESTS] Make HiveExternalCatalogVersionsSuite skip old Spark versions on Java 21
This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new d1c53208b67 [SPARK-44095][SQL][TESTS] Make HiveExternalCatalogVersionsSuite skip old Spark versions on Java 21 d1c53208b67 is described below commit d1c53208b67b982f5a38cfdb3b1dcde91d9cd029 Author: Dongjoon Hyun AuthorDate: Sun Jun 18 17:50:11 2023 -0700 [SPARK-44095][SQL][TESTS] Make HiveExternalCatalogVersionsSuite skip old Spark versions on Java 21 ### What changes were proposed in this pull request? This PR aims to make `HiveExternalCatalogVersionsSuite` skip old Spark versions when Java 21 is used for testing. ### Why are the changes needed? Old Apache Spark releases are unable to support Java 21. So, it causes a test failure at runtime. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Pass the CIs and manual testing on Java 21. **BEFORE** ``` $ build/sbt "hive/testOnly *.HiveExternalCatalogVersionsSuite" -Phive ... [info] 2023-06-18 16:43:22.448 - stderr> Caused by: java.lang.IllegalStateException: java.lang.NoSuchMethodException: java.nio.DirectByteBuffer.(long,int) ... [info] *** 1 SUITE ABORTED *** [error] Error during tests: [error] org.apache.spark.sql.hive.HiveExternalCatalogVersionsSuite [error] (hive / Test / testOnly) sbt.TestsFailedException: Tests unsuccessful [error] Total time: 33 s, completed Jun 18, 2023, 4:43:23 PM ``` **AFTER** ``` $ build/sbt "hive/testOnly *.HiveExternalCatalogVersionsSuite" -Phive ... [info] HiveExternalCatalogVersionsSuite: [info] - backward compatibility (8 milliseconds) [info] Run completed in 1 second, 26 milliseconds. [info] Total number of tests run: 1 [info] Suites: completed 1, aborted 0 [info] Tests: succeeded 1, failed 0, canceled 0, ignored 0, pending 0 [info] All tests passed. 
[success] Total time: 14 s, completed Jun 18, 2023, 4:42:24 PM ``` Closes #41652 from dongjoon-hyun/SPARK-44095. Authored-by: Dongjoon Hyun Signed-off-by: Dongjoon Hyun --- .../spark/sql/hive/HiveExternalCatalogVersionsSuite.scala | 11 --- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala index c0abb93ce0c..bfa6c8c3838 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/HiveExternalCatalogVersionsSuite.scala @@ -203,7 +203,11 @@ class HiveExternalCatalogVersionsSuite extends SparkSubmitTestUtils { if (PROCESS_TABLES.testingVersions.isEmpty) { if (PROCESS_TABLES.isPythonVersionAvailable) { -logError("Fail to get the latest Spark versions to test.") +if (SystemUtils.isJavaVersionAtMost(JavaVersion.JAVA_17)) { + logError("Fail to get the latest Spark versions to test.") +} else { + logInfo("Skip tests because old Spark versions don't support Java 21.") +} } else { logError(s"Python version < ${TestUtils.minimumPythonSupportedVersion}, " + "the running environment is unavailable.") @@ -259,8 +263,9 @@ object PROCESS_TABLES extends QueryTest with SQLTestUtils { val isPythonVersionAvailable = TestUtils.isPythonVersionAvailable val releaseMirror = sys.env.getOrElse("SPARK_RELEASE_MIRROR", "https://dist.apache.org/repos/dist/release";) - // Tests the latest version of every release line. - val testingVersions: Seq[String] = if (isPythonVersionAvailable) { + // Tests the latest version of every release line if Java version is at most 17. 
+ val testingVersions: Seq[String] = if (isPythonVersionAvailable && + SystemUtils.isJavaVersionAtMost(JavaVersion.JAVA_17)) { import scala.io.Source try Utils.tryWithResource( Source.fromURL(s"$releaseMirror/spark")) { source => - To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org For additional commands, e-mail: commits-help@spark.apache.org
[spark] branch master updated: [SPARK-44092][CORE] Add `Utils.isJavaVersionAtLeast21` and make `core` module pass with Java 21
This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 3a9185964a0 [SPARK-44092][CORE] Add `Utils.isJavaVersionAtLeast21` and make `core` module pass with Java 21 3a9185964a0 is described below commit 3a9185964a0de3c720a6b77d38a446258b73468e Author: Dongjoon Hyun AuthorDate: Mon Jun 19 09:57:14 2023 +0900 [SPARK-44092][CORE] Add `Utils.isJavaVersionAtLeast21` and make `core` module pass with Java 21 ### What changes were proposed in this pull request? This PR aims to make `core` module tests succeed in Java 21. To do that, this PR - Adds a utility variable `Utils.isJavaVersionAtLeast21` because Apache Commons Lang3 `3.12.0` doesn't have a constant for Java 21 yet. - Fix `UtilsSuite` according to the Java behavior change of `Files.createDirectories` API. ### Why are the changes needed? Java 20+ changes the behavior. - https://github.com/openjdk/jdk/commit/169a5d48afbc6627f36a768c17c2a5e56219d9c7 ``` 8294193: Files.createDirectories throws FileAlreadyExistsException for a symbolic link whose target is an existing directory ``` ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Manual tests in Java 21. **JAVA** ``` $ java -version openjdk version "21-ea" 2023-09-19 OpenJDK Runtime Environment (build 21-ea+27-2343) OpenJDK 64-Bit Server VM (build 21-ea+27-2343, mixed mode, sharing) ``` **BEFORE** ``` $ $ build/sbt "core/test" -Dtest.exclude.tags=org.apache.spark.tags.ExtendedLevelDBTest ... 
[info] *** 1 TEST FAILED *** [error] Failed: Total 3451, Failed 1, Errors 0, Passed 3450, Ignored 10, Canceled 5 [error] Failed tests: [error] org.apache.spark.util.UtilsSuite [error] (core / Test / test) sbt.TestsFailedException: Tests unsuccessful [error] Total time: 1040 s (17:20), completed Jun 18, 2023, 12:27:59 AM ``` **AFTER** ``` $ build/sbt "core/testOnly org.apache.spark.util.UtilsSuite" ... [info] All tests passed. [success] Total time: 29 s, completed Jun 17, 2023, 11:16:23 PM ``` Closes #41648 from dongjoon-hyun/SPARK-44092. Authored-by: Dongjoon Hyun Signed-off-by: Hyukjin Kwon --- core/src/main/scala/org/apache/spark/util/Utils.scala | 6 ++ core/src/test/scala/org/apache/spark/util/UtilsSuite.scala | 6 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index c785c135a45..6e8f2c496e8 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -1972,6 +1972,12 @@ private[spark] object Utils extends Logging with SparkClassUtils { */ val isMac = SystemUtils.IS_OS_MAC_OSX + /** + * Whether the underlying Java version is at least 21. + */ + val isJavaVersionAtLeast21 = +System.getProperty("java.version").split("[+.\\-]+", 3)(0).toInt >= 21 + /** * Whether the underlying operating system is Mac OS X and processor is Apple Silicon. */ diff --git a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala index a2990f087b7..7923e81949d 100644 --- a/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala +++ b/core/src/test/scala/org/apache/spark/util/UtilsSuite.scala @@ -527,7 +527,11 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties { // 6. 
Symbolic link val scenario6 = java.nio.file.Files.createSymbolicLink(new File(testDir, "scenario6") .toPath, scenario1.toPath).toFile -assert(!Utils.createDirectory(scenario6)) +if (Utils.isJavaVersionAtLeast21) { + assert(Utils.createDirectory(scenario6)) +} else { + assert(!Utils.createDirectory(scenario6)) +} assert(scenario6.exists()) // 7. Directory exists - To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org For additional commands, e-mail: commits-help@spark.apache.org
[spark] branch master updated: [SPARK-43929][SQL][PYTHON][CONNECT] Add date time functions to Scala, Python and Connect API - part 1
This is an automated email from the ASF dual-hosted git repository. ruifengz pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git The following commit(s) were added to refs/heads/master by this push: new 25a14c313bf [SPARK-43929][SQL][PYTHON][CONNECT] Add date time functions to Scala, Python and Connect API - part 1 25a14c313bf is described below commit 25a14c313bfd9e18a0ea06a8a521ee28878c2045 Author: Jiaan Geng AuthorDate: Mon Jun 19 13:28:40 2023 +0800 [SPARK-43929][SQL][PYTHON][CONNECT] Add date time functions to Scala, Python and Connect API - part 1 ### What changes were proposed in this pull request? This PR want add date time functions to Scala, Python and Connect API. These functions show below. - dateadd - date_diff - date_from_unix_date - day The origin plan also contains the two function `date_part` and `datepart`. You can see this PR exclude them, since we can't get the data type for unresolved expressions. Please refer https://github.com/apache/spark/blob/b97ce8b9a99c570fc57dec967e7e9db3d115c1db/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala#L2835 and https://github.com/apache/spark/blob/b97ce8b9a99c570fc57dec967e7e9db3d115c1db/sql/catalyst/src/main/scala/org/apache/spark [...] ### Why are the changes needed? Add date time functions to Scala, Python and Connect API. ### Does this PR introduce _any_ user-facing change? 'No'. New feature. ### How was this patch tested? New test cases. Closes #41636 from beliefer/SPARK-43929. 
Authored-by: Jiaan Geng Signed-off-by: Ruifeng Zheng --- .../scala/org/apache/spark/sql/functions.scala | 55 ++ .../apache/spark/sql/PlanGenerationTestSuite.scala | 16 +++ .../explain-results/function_date_diff.explain | 2 + .../function_date_from_unix_date.explain | 2 + .../explain-results/function_dateadd.explain | 2 + .../explain-results/function_day.explain | 2 + .../query-tests/queries/function_date_diff.json| 42 .../queries/function_date_diff.proto.bin | Bin 0 -> 158 bytes .../queries/function_date_from_unix_date.json | 25 + .../queries/function_date_from_unix_date.proto.bin | Bin 0 -> 132 bytes .../query-tests/queries/function_dateadd.json | 29 + .../query-tests/queries/function_dateadd.proto.bin | Bin 0 -> 127 bytes .../query-tests/queries/function_day.json | 25 + .../query-tests/queries/function_day.proto.bin | Bin 0 -> 117 bytes .../source/reference/pyspark.sql/functions.rst | 4 + python/pyspark/sql/connect/functions.py| 29 + python/pyspark/sql/functions.py| 119 + .../scala/org/apache/spark/sql/functions.scala | 48 + .../apache/spark/sql/DataFrameFunctionsSuite.scala | 2 +- .../org/apache/spark/sql/DateFunctionsSuite.scala | 37 ++- 20 files changed, 435 insertions(+), 4 deletions(-) diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala index 9c2a5b96182..206b7df2091 100644 --- a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala @@ -4107,6 +4107,21 @@ object functions { */ def date_add(start: Column, days: Column): Column = Column.fn("date_add", start, days) + /** + * Returns the date that is `days` days after `start` + * + * @param start + * A date, timestamp or string. 
If a string, the data must be in a format that can be cast to + * a date, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS` + * @param days + * A column of the number of days to add to `start`, can be negative to subtract days + * @return + * A date, or null if `start` was a string that could not be cast to a date + * @group datetime_funcs + * @since 3.5.0 + */ + def dateadd(start: Column, days: Column): Column = Column.fn("dateadd", start, days) + /** * Returns the date that is `days` days before `start` * @@ -4161,6 +4176,37 @@ object functions { */ def datediff(end: Column, start: Column): Column = Column.fn("datediff", end, start) + /** + * Returns the number of days from `start` to `end`. + * + * Only considers the date part of the input. For example: + * {{{ + * dateddiff("2018-01-10 00:00:00", "2018-01-09 23:59:59") + * // returns 1 + * }}} + * + * @param end + * A date, timestamp or string. If a string, the data must be in a format that can be cast to + * a date, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS` + * @param start + * A date, timestamp or string. If a string, the data must be in a format that can b